mirror of https://github.com/xenia-project/xenia.git
synced 2025-12-06 07:12:03 +01:00

Merge pull request #46 from chrisps/canary_experimental

Some guest function calls can now be resolved and embedded directly in…

Commit 9a72d6ab05
@@ -43,7 +43,10 @@ DEFINE_bool(ignore_undefined_externs, true,
 DEFINE_bool(emit_source_annotations, false,
             "Add extra movs and nops to make disassembly easier to read.",
             "CPU");
+DEFINE_bool(resolve_rel32_guest_calls, false,
+            "Experimental optimization, directly call already resolved "
+            "functions via x86 rel32 call/jmp",
+            "CPU");
 namespace xe {
 namespace cpu {
 namespace backend {
@@ -99,7 +102,28 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
   TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
   TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);

 #undef TEST_EMIT_FEATURE
+
+  if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
+    bool is_zennish = cpu_.displayFamily >= 0x17;
+
+    if (is_zennish) {
+      feature_flags_ |= kX64FastJrcx;
+
+      if (cpu_.displayFamily > 0x17) {
+        feature_flags_ |= kX64FastLoop;
+      } else if (cpu_.displayFamily == 0x17 && cpu_.displayModel >= 0x31) {
+        feature_flags_ |= kX64FastLoop;
+      }  // TODO: figure out at which model Zen+ became Zen 2; this is just
+         // the model for my CPU, which is ripper90.
+    }
+  }
 }

 X64Emitter::~X64Emitter() = default;
@@ -149,6 +173,26 @@ void* X64Emitter::Emplace(const EmitFunctionInfo& func_info,
   if (function) {
     code_cache_->PlaceGuestCode(function->address(), top_, func_info, function,
                                 new_execute_address, new_write_address);
+    if (cvars::resolve_rel32_guest_calls) {
+      for (auto&& callsite : call_sites_) {
+#pragma pack(push, 1)
+        struct RGCEmitted {
+          uint8_t ff_;
+          uint32_t rgcid_;
+        };
+#pragma pack(pop)
+        RGCEmitted* hunter = (RGCEmitted*)new_execute_address;
+        while (hunter->ff_ != 0xFF || hunter->rgcid_ != callsite.offset_) {
+          hunter = reinterpret_cast<RGCEmitted*>(
+              reinterpret_cast<char*>(hunter) + 1);
+        }
+
+        hunter->ff_ = callsite.is_jump_ ? 0xE9 : 0xE8;
+        hunter->rgcid_ =
+            static_cast<uint32_t>(static_cast<intptr_t>(callsite.destination_) -
+                                  reinterpret_cast<intptr_t>(hunter + 1));
+      }
+    }
   } else {
     code_cache_->PlaceHostCode(0, top_, func_info, new_execute_address,
                                new_write_address);
@@ -157,6 +201,7 @@ void* X64Emitter::Emplace(const EmitFunctionInfo& func_info,
   ready();
   top_ = old_address;
   reset();
+  call_sites_.clear();
   return new_execute_address;
 }
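Note on the scheme above: Call() (later in this diff) emits a five-byte placeholder, db(0xFF) followed by a unique 32-bit id, wherever a direct guest-to-guest transfer is wanted. Once PlaceGuestCode() has fixed the final execute address, the loop above scans for each placeholder and rewrites it in place into a real rel32 call (E8) or jmp (E9). A minimal sketch of that rewrite in plain C++ (function and variable names here are illustrative, not from the commit):

    #include <cstdint>
    #include <cstring>

    // Patch a 5-byte placeholder at `p` (0xFF marker + 32-bit id) into a
    // rel32 call/jmp. The displacement is measured from the end of the
    // instruction, i.e. p + 5, which is what `hunter + 1` computes above.
    void patch_rel32(uint8_t* p, const void* target, bool is_jump) {
      p[0] = is_jump ? 0xE9 : 0xE8;  // jmp rel32 : call rel32
      int32_t disp = static_cast<int32_t>(reinterpret_cast<intptr_t>(target) -
                                          (reinterpret_cast<intptr_t>(p) + 5));
      std::memcpy(p + 1, &disp, sizeof(disp));
    }

A rel32 transfer only reaches targets within about 2 GiB; the assert_zero on fn->machine_code() in Call() below suggests guest code stays in the low 4 GiB, which keeps the displacement in range.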
@@ -287,11 +332,8 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
   code_offsets.tail = getSize();

   if (cvars::emit_source_annotations) {
-    nop();
-    nop();
-    nop();
-    nop();
-    nop();
+    nop(5);
   }

   assert_zero(code_offsets.prolog);
@@ -313,11 +355,9 @@ void X64Emitter::MarkSourceOffset(const Instr* i) {
   entry->code_offset = static_cast<uint32_t>(getSize());

   if (cvars::emit_source_annotations) {
-    nop();
-    nop();
+    nop(2);
     mov(eax, entry->guest_address);
-    nop();
-    nop();
+    nop(2);
   }

   if (debug_info_flags_ & DebugInfoFlags::kDebugInfoTraceFunctionCoverage) {
@@ -414,10 +454,44 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) {
   assert_not_null(function);
   auto fn = static_cast<X64Function*>(function);
   // Resolve address to the function to call and store in rax.
+
+  if (cvars::resolve_rel32_guest_calls && fn->machine_code()) {
+    ResolvableGuestCall rgc;
+    rgc.destination_ = uint32_t(uint64_t(fn->machine_code()));
+    rgc.offset_ = current_rgc_id_;
+    current_rgc_id_++;
+
+    if (!(instr->flags & hir::CALL_TAIL)) {
+      mov(rcx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]);
+
+      db(0xFF);
+      rgc.is_jump_ = false;
+      dd(rgc.offset_);
+    } else {
+      // Tail call.
+      EmitTraceUserCallReturn();
+
+      rgc.is_jump_ = true;
+      // Pass the caller's return address over.
+      mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]);
+
+      add(rsp, static_cast<uint32_t>(stack_size()));
+      db(0xFF);
+      dd(rgc.offset_);
+    }
+    call_sites_.push_back(rgc);
+    return;
+  }
+
   if (fn->machine_code()) {
     // TODO(benvanik): is it worth it to do this? It removes the need for
     // a ResolveFunction call, but makes the table less useful.
     assert_zero(uint64_t(fn->machine_code()) & 0xFFFFFFFF00000000);
+    // TODO: this should be changed so that we can actually do a call to
+    // fn->machine_code(); the code will be emitted near us, so a 32-bit
+    // rel jmp should be possible.
     mov(eax, uint32_t(uint64_t(fn->machine_code())));
   } else if (code_cache_->has_indirection_table()) {
     // Load the pointer to the indirection table maintained in X64CodeCache.
@@ -600,6 +674,30 @@ void X64Emitter::ReloadContext() {
 void X64Emitter::ReloadMembase() {
   mov(GetMembaseReg(), qword[GetContextReg() + 8]);  // membase
 }
+
+#define __NH_CONCAT(x, y) x##y
+#define _MH_CONCAT(cb, ...) cb(__VA_ARGS__)
+
+#define mh_concat2_m(x, y) __NH_CONCAT(x, y)
+
+#define DECLNOP(n, ...) \
+  static constexpr unsigned char mh_concat2_m(nop_, n)[] = {__VA_ARGS__}
+
+DECLNOP(1, 0x90);
+DECLNOP(2, 0x66, 0x90);
+DECLNOP(3, 0x0F, 0x1F, 0x00);
+DECLNOP(4, 0x0F, 0x1F, 0x40, 0x00);
+DECLNOP(5, 0x0F, 0x1F, 0x44, 0x00, 0x00);
+DECLNOP(6, 0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00);
+DECLNOP(7, 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00);
+DECLNOP(8, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00);
+DECLNOP(9, 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00);
+
+static constexpr const unsigned char* const g_noptable[] = {
+    &nop_1[0], &nop_1[0], &nop_2[0], &nop_3[0], &nop_4[0],
+    &nop_5[0], &nop_6[0], &nop_7[0], &nop_8[0], &nop_9[0]};
+
+static constexpr unsigned LENGTHOF_NOPTABLE =
+    sizeof(g_noptable) / sizeof(g_noptable[0]);
+
 // Len  Assembly                                     Byte Sequence
 // ============================================================================
@@ -613,9 +711,17 @@ void X64Emitter::ReloadMembase() {
 // 8b NOP DWORD ptr [EAX + EAX*1 + 00000000H]     0F 1F 84 00 00 00 00 00H
 // 9b 66 NOP DWORD ptr [EAX + EAX*1 + 00000000H]  66 0F 1F 84 00 00 00 00 00H
 void X64Emitter::nop(size_t length) {
-  // TODO(benvanik): fat nop
-  for (size_t i = 0; i < length; ++i) {
-    db(0x90);
+  while (length != 0) {
+    unsigned patchsize = length % LENGTHOF_NOPTABLE;
+
+    for (unsigned i = 0; i < patchsize; ++i) {
+      db(g_noptable[patchsize][i]);
+    }
+
+    length -= patchsize;
   }
 }
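nop(length) now emits the canonical multi-byte NOP encodings from g_noptable instead of a run of single-byte 0x90s, so padding decodes as a handful of instructions rather than dozens. One caveat: patchsize is length % LENGTHOF_NOPTABLE, which is 0 whenever length is a multiple of 10, and the loop then stops making progress. The call sites introduced by this commit only pass 2 and 5, so the case stays latent; a guarded variant might look like this (a sketch, not code from the commit):

    void X64Emitter::nop(size_t length) {
      while (length != 0) {
        // The largest table entry is 9 bytes; never let the step reach 0.
        size_t patchsize = length % LENGTHOF_NOPTABLE;
        if (patchsize == 0) {
          patchsize = LENGTHOF_NOPTABLE - 1;  // emit the 9-byte NOP
        }
        for (size_t i = 0; i < patchsize; ++i) {
          db(g_noptable[patchsize][i]);
        }
        length -= patchsize;
      }
    }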
@@ -649,6 +755,35 @@ void X64Emitter::MovMem64(const Xbyak::RegExp& addr, uint64_t v) {
     mov(dword[addr + 4], static_cast<uint32_t>(v >> 32));
   }
 }
+
+static inline vec128_t v128_setr_bytes(unsigned char v0, unsigned char v1,
+                                       unsigned char v2, unsigned char v3,
+                                       unsigned char v4, unsigned char v5,
+                                       unsigned char v6, unsigned char v7,
+                                       unsigned char v8, unsigned char v9,
+                                       unsigned char v10, unsigned char v11,
+                                       unsigned char v12, unsigned char v13,
+                                       unsigned char v14, unsigned char v15) {
+  vec128_t result;
+  result.u8[0] = v0;
+  result.u8[1] = v1;
+  result.u8[2] = v2;
+  result.u8[3] = v3;
+  result.u8[4] = v4;
+  result.u8[5] = v5;
+  result.u8[6] = v6;
+  result.u8[7] = v7;
+  result.u8[8] = v8;
+  result.u8[9] = v9;
+  result.u8[10] = v10;
+  result.u8[11] = v11;
+  result.u8[12] = v12;
+  result.u8[13] = v13;
+  result.u8[14] = v14;
+  result.u8[15] = v15;
+  return result;
+}
+
 static const vec128_t xmm_consts[] = {
     /* XMMZero */ vec128f(0.0f),
@@ -761,8 +896,60 @@ static const vec128_t xmm_consts[] = {
     /* XMMQNaN */ vec128i(0x7FC00000u),
     /* XMMInt127 */ vec128i(0x7Fu),
     /* XMM2To32 */ vec128f(0x1.0p32f),
+    /* XMMFloatInf */ vec128i(0x7F800000),
+    /* XMMIntsToBytes */
+    v128_setr_bytes(0, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+                    0x80, 0x80, 0x80, 0x80, 0x80),
+    /* XMMShortsToBytes */
+    v128_setr_bytes(0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80,
+                    0x80, 0x80, 0x80),
 };
+
+void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
+  for (auto& vec : xmm_consts) {
+    for (auto& u8 : vec.u8) {
+      if (u8 == bytevalue) {
+        return reinterpret_cast<void*>(backend_->emitter_data() +
+                                       (&u8 - &xmm_consts[0].u8[0]));
+      }
+    }
+  }
+  return nullptr;
+}
+
+void* X64Emitter::FindWordConstantOffset(unsigned wordvalue) {
+  for (auto& vec : xmm_consts) {
+    for (auto& u16 : vec.u16) {
+      if (u16 == wordvalue) {
+        return reinterpret_cast<void*>(backend_->emitter_data() +
+                                       ((&u16 - &xmm_consts[0].u16[0]) * 2));
+      }
+    }
+  }
+  return nullptr;
+}
+
+void* X64Emitter::FindDwordConstantOffset(unsigned dwordvalue) {
+  for (auto& vec : xmm_consts) {
+    for (auto& u32 : vec.u32) {
+      if (u32 == dwordvalue) {
+        return reinterpret_cast<void*>(backend_->emitter_data() +
+                                       ((&u32 - &xmm_consts[0].u32[0]) * 4));
+      }
+    }
+  }
+  return nullptr;
+}
+
+void* X64Emitter::FindQwordConstantOffset(uint64_t qwordvalue) {
+  for (auto& vec : xmm_consts) {
+    for (auto& u64 : vec.u64) {
+      if (u64 == qwordvalue) {
+        return reinterpret_cast<void*>(backend_->emitter_data() +
+                                       ((&u64 - &xmm_consts[0].u64[0]) * 8));
+      }
+    }
+  }
+  return nullptr;
+}
+
 // First location to try and place constants.
 static const uintptr_t kConstDataLocation = 0x20000000;
 static const uintptr_t kConstDataSize = sizeof(xmm_consts);
@@ -806,7 +993,6 @@ Xbyak::Address X64Emitter::GetXmmConstPtr(XmmConst id) {
   return ptr[reinterpret_cast<void*>(backend_->emitter_data() +
                                      sizeof(vec128_t) * id)];
 }
-
 // Implies possible StashXmm(0, ...)!
 void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) {
   // https://www.agner.org/optimize/optimizing_assembly.pdf
@@ -818,12 +1004,115 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) {
     // 1111...
     vpcmpeqb(dest, dest);
   } else {
     for (size_t i = 0; i < (kConstDataSize / sizeof(vec128_t)); ++i) {
       if (xmm_consts[i] == v) {
         vmovapd(dest, GetXmmConstPtr((XmmConst)i));
         return;
       }
     }
+
+    if (IsFeatureEnabled(kX64EmitAVX2)) {
+      bool all_equal_bytes = true;
+      unsigned firstbyte = v.u8[0];
+      for (unsigned i = 1; i < 16; ++i) {
+        if (v.u8[i] != firstbyte) {
+          all_equal_bytes = false;
+          break;
+        }
+      }
+
+      if (all_equal_bytes) {
+        void* bval = FindByteConstantOffset(firstbyte);
+        if (bval) {
+          vpbroadcastb(dest, byte[bval]);
+          return;
+        }
+        // Didn't find existing memory with the value.
+        mov(byte[rsp + kStashOffset], firstbyte);
+        vpbroadcastb(dest, byte[rsp + kStashOffset]);
+        return;
+      }
+
+      bool all_equal_words = true;
+      unsigned firstword = v.u16[0];
+      for (unsigned i = 1; i < 8; ++i) {
+        if (v.u16[i] != firstword) {
+          all_equal_words = false;
+          break;
+        }
+      }
+      if (all_equal_words) {
+        void* wval = FindWordConstantOffset(firstword);
+        if (wval) {
+          vpbroadcastw(dest, word[wval]);
+          return;
+        }
+        // Didn't find existing memory with the value.
+        mov(word[rsp + kStashOffset], firstword);
+        vpbroadcastw(dest, word[rsp + kStashOffset]);
+        return;
+      }
+
+      bool all_equal_dwords = true;
+      unsigned firstdword = v.u32[0];
+      for (unsigned i = 1; i < 4; ++i) {
+        if (v.u32[i] != firstdword) {
+          all_equal_dwords = false;
+          break;
+        }
+      }
+      if (all_equal_dwords) {
+        void* dwval = FindDwordConstantOffset(firstdword);
+        if (dwval) {
+          vpbroadcastd(dest, dword[dwval]);
+          return;
+        }
+        mov(dword[rsp + kStashOffset], firstdword);
+        vpbroadcastd(dest, dword[rsp + kStashOffset]);
+        return;
+      }
+
+      bool all_equal_qwords = v.low == v.high;
+      if (all_equal_qwords) {
+        void* qwval = FindQwordConstantOffset(v.low);
+        if (qwval) {
+          vpbroadcastq(dest, qword[qwval]);
+          return;
+        }
+        MovMem64(rsp + kStashOffset, v.low);
+        vpbroadcastq(dest, qword[rsp + kStashOffset]);
+        return;
+      }
+    }
+
+    for (auto& vec : xmm_consts) {
+      if (vec.low == v.low && vec.high == v.high) {
+        vmovdqa(dest,
+                ptr[reinterpret_cast<void*>(backend_->emitter_data() +
+                                            ((&vec - &xmm_consts[0]) * 16))]);
+        return;
+      }
+    }
+
+    if (v.high == 0 && v.low == ~0ULL) {
+      vpcmpeqb(dest, dest);
+      movq(dest, dest);
+      return;
+    }
+    if (v.high == 0) {
+      if ((v.low & 0xFFFFFFFF) == v.low) {
+        mov(dword[rsp + kStashOffset], static_cast<unsigned>(v.low));
+        movd(dest, dword[rsp + kStashOffset]);
+        return;
+      }
+      MovMem64(rsp + kStashOffset, v.low);
+      movq(dest, qword[rsp + kStashOffset]);
+      return;
+    }
+
     // TODO(benvanik): see what other common values are.
     // TODO(benvanik): build constant table - 99% are reused.
     MovMem64(rsp + kStashOffset, v.low);
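The splat detection above works down through granularities: if all 16 bytes of the constant match, a single vpbroadcastb suffices; failing that, all 8 words, all 4 dwords, and finally the two qwords are tried. The Find*ConstantOffset() helpers make the broadcast nearly free in data terms, since the scalar can be read from any byte-granular offset inside the existing xmm_consts table rather than adding a new 16-byte entry. A usage sketch for the byte case, mirroring the code above:

    // Splat 0x80 across dest. If 0x80 occurs anywhere in the constant
    // table, broadcast straight from that byte; otherwise spill the byte
    // to the stash slot first.
    if (void* p = FindByteConstantOffset(0x80)) {
      vpbroadcastb(dest, byte[p]);
    } else {
      mov(byte[rsp + kStashOffset], 0x80);
      vpbroadcastb(dest, byte[rsp + kStashOffset]);
    }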
@@ -116,6 +116,9 @@ enum XmmConst {
   XMMQNaN,
   XMMInt127,
   XMM2To32,
+  XMMFloatInf,
+  XMMIntsToBytes,
+  XMMShortsToBytes
 };

 // Unfortunately due to the design of xbyak we have to pass this to the ctor.
@@ -141,7 +144,16 @@ enum X64EmitterFeatureFlags {
   kX64EmitAVX512DQ = 1 << 11,

   kX64EmitAVX512Ortho = kX64EmitAVX512F | kX64EmitAVX512VL,
-  kX64EmitAVX512Ortho64 = kX64EmitAVX512Ortho | kX64EmitAVX512DQ
+  kX64EmitAVX512Ortho64 = kX64EmitAVX512Ortho | kX64EmitAVX512DQ,
+  kX64FastJrcx = 1 << 12,  // jrcxz is as fast as any other jump (>= Zen 1)
+  kX64FastLoop = 1 << 13,  // loop/loope/loopne is as fast as any other jump
+                           // (>= Zen 2)
+};
+
+class ResolvableGuestCall {
+ public:
+  bool is_jump_;
+  uintptr_t destination_;
+  // rgcid
+  unsigned offset_;
 };

 class X64Emitter : public Xbyak::CodeGenerator {
@@ -230,7 +242,10 @@ class X64Emitter : public Xbyak::CodeGenerator {
   Xbyak::Address StashConstantXmm(int index, float v);
   Xbyak::Address StashConstantXmm(int index, double v);
   Xbyak::Address StashConstantXmm(int index, const vec128_t& v);
+  void* FindByteConstantOffset(unsigned bytevalue);
+  void* FindWordConstantOffset(unsigned wordvalue);
+  void* FindDwordConstantOffset(unsigned dwordvalue);
+  void* FindQwordConstantOffset(uint64_t qwordvalue);
   bool IsFeatureEnabled(uint32_t feature_flag) const {
     return (feature_flags_ & feature_flag) == feature_flag;
   }
@@ -267,6 +282,8 @@ class X64Emitter : public Xbyak::CodeGenerator {
   static const uint32_t gpr_reg_map_[GPR_COUNT];
   static const uint32_t xmm_reg_map_[XMM_COUNT];
+  uint32_t current_rgc_id_ = 0xEEDDF00F;
+  std::vector<ResolvableGuestCall> call_sites_;
 };

 }  // namespace x64
@@ -109,23 +109,40 @@ struct DEBUG_BREAK_TRUE_I32
     : Sequence<DEBUG_BREAK_TRUE_I32,
                I<OPCODE_DEBUG_BREAK_TRUE, VoidOp, I32Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
+    if (e.IsFeatureEnabled(kX64FastJrcx)) {
+      e.mov(e.ecx, i.src1);
+      Xbyak::Label skip;
+      e.jrcxz(skip);
+      e.DebugBreak();
+      e.L(skip);
+    } else {
       e.test(i.src1, i.src1);
       Xbyak::Label skip;
       e.jz(skip);
       e.DebugBreak();
       e.L(skip);
     }
+  }
 };
 struct DEBUG_BREAK_TRUE_I64
     : Sequence<DEBUG_BREAK_TRUE_I64,
                I<OPCODE_DEBUG_BREAK_TRUE, VoidOp, I64Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
+    if (e.IsFeatureEnabled(kX64FastJrcx)) {
+      e.mov(e.rcx, i.src1);
+      Xbyak::Label skip;
+      e.jrcxz(skip);
+      e.DebugBreak();
+      e.L(skip);
+    } else {
       e.test(i.src1, i.src1);
       Xbyak::Label skip;
       e.jz(skip);
       e.DebugBreak();
       e.L(skip);
     }
+  }
 };
 struct DEBUG_BREAK_TRUE_F32
     : Sequence<DEBUG_BREAK_TRUE_F32,
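The rewrite pattern in these sequences is uniform: when kX64FastJrcx is set, the value is copied into rcx and a single jrcxz replaces the test+jz pair. Roughly what the two arms emit (a sketch in comment form; src1 is assumed to already be in a general-purpose register):

    // generic:                    kX64FastJrcx (>= Zen 1):
    //   test  src1, src1            mov   ecx, src1
    //   jz    skip                  jrcxz skip
    //   <DebugBreak / Trap>         <DebugBreak / Trap>
    // skip:                       skip:

jrcxz tests the full 64-bit rcx, so the I32 variants lean on the implicit zero-extension of the 32-bit mov, while the I64 variants copy the whole register. Note also that jrcxz only has a short (rel8) form, which is fine for these few-instruction bodies.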
@@ -190,22 +207,38 @@ struct TRAP_TRUE_I16
 struct TRAP_TRUE_I32
     : Sequence<TRAP_TRUE_I32, I<OPCODE_TRAP_TRUE, VoidOp, I32Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
+    if (e.IsFeatureEnabled(kX64FastJrcx)) {
+      e.mov(e.ecx, i.src1);
+      Xbyak::Label skip;
+      e.jrcxz(skip);
+      e.Trap(i.instr->flags);
+      e.L(skip);
+    } else {
       e.test(i.src1, i.src1);
       Xbyak::Label skip;
       e.jz(skip);
       e.Trap(i.instr->flags);
       e.L(skip);
     }
+  }
 };
 struct TRAP_TRUE_I64
     : Sequence<TRAP_TRUE_I64, I<OPCODE_TRAP_TRUE, VoidOp, I64Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
+    if (e.IsFeatureEnabled(kX64FastJrcx)) {
+      e.mov(e.rcx, i.src1);
+      Xbyak::Label skip;
+      e.jrcxz(skip);
+      e.Trap(i.instr->flags);
+      e.L(skip);
+    } else {
       e.test(i.src1, i.src1);
       Xbyak::Label skip;
       e.jz(skip);
       e.Trap(i.instr->flags);
       e.L(skip);
     }
+  }
 };
 struct TRAP_TRUE_F32
     : Sequence<TRAP_TRUE_F32, I<OPCODE_TRAP_TRUE, VoidOp, F32Op>> {
@@ -355,23 +388,40 @@ struct CALL_INDIRECT_TRUE_I32
     : Sequence<CALL_INDIRECT_TRUE_I32,
                I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, I32Op, I64Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
+    if (e.IsFeatureEnabled(kX64FastJrcx)) {
+      e.mov(e.ecx, i.src1);
+      Xbyak::Label skip;
+      e.jrcxz(skip);
+      e.CallIndirect(i.instr, i.src2);
+      e.L(skip);
+    } else {
       e.test(i.src1, i.src1);
       Xbyak::Label skip;
       e.jz(skip, CodeGenerator::T_NEAR);
       e.CallIndirect(i.instr, i.src2);
       e.L(skip);
     }
+  }
 };
 struct CALL_INDIRECT_TRUE_I64
     : Sequence<CALL_INDIRECT_TRUE_I64,
                I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, I64Op, I64Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
+    if (e.IsFeatureEnabled(kX64FastJrcx)) {
+      e.mov(e.rcx, i.src1);
+      Xbyak::Label skip;
+      e.jrcxz(skip);
+      e.CallIndirect(i.instr, i.src2);
+      e.L(skip);
+    } else {
       e.test(i.src1, i.src1);
       Xbyak::Label skip;
       e.jz(skip, CodeGenerator::T_NEAR);
       e.CallIndirect(i.instr, i.src2);
       e.L(skip);
     }
+  }
 };
 struct CALL_INDIRECT_TRUE_F32
     : Sequence<CALL_INDIRECT_TRUE_F32,
@@ -15,6 +15,13 @@
 #include "xenia/base/memory.h"
 #include "xenia/cpu/backend/x64/x64_op.h"
 #include "xenia/cpu/backend/x64/x64_tracers.h"
+#include "xenia/cpu/ppc/ppc_context.h"
+#include "xenia/base/cvar.h"
+
+DEFINE_bool(
+    elide_e0_check, false,
+    "Eliminate e0 check on some memory accesses, like to r13 (TLS) or r1 (SP)",
+    "CPU");

 namespace xe {
 namespace cpu {
@@ -27,7 +34,30 @@ volatile int anchor_memory = 0;
 RegExp ComputeContextAddress(X64Emitter& e, const OffsetOp& offset) {
   return e.GetContextReg() + offset.value;
 }
+
+static bool is_eo_def(const hir::Value* v) {
+  if (v->def) {
+    auto df = v->def;
+    if (df->opcode == &OPCODE_LOAD_CONTEXT_info) {
+      size_t offs = df->src1.offset;
+      if (offs == offsetof(ppc::PPCContext_s, r[1]) ||
+          offs == offsetof(ppc::PPCContext_s, r[13])) {
+        return true;
+      }
+    } else if (df->opcode == &OPCODE_ASSIGN_info) {
+      return is_eo_def(df->src1.value);
+    }
+  }
+  return false;
+}
+
+template <typename T>
+static bool is_definitely_not_eo(const T& v) {
+  if (!cvars::elide_e0_check) {
+    return false;
+  }
+  return is_eo_def(v.value);
+}
+
 template <typename T>
 RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest,
                                   const T& offset) {
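The two helpers above implement the elide_e0_check fast path: a guest address that provably comes from r1 (the stack pointer) or r13 (TLS), including through OPCODE_ASSIGN copies, is assumed never to point into the 0xE0000000 window, so the per-access guard below can be dropped. Conceptually, the guard being elided computes something like this (a sketch of the semantics, not of the emitted instructions):

    #include <cstdint>

    // When the host allocation granularity is larger than 4 KiB, the
    // 0xE0000000+ window cannot be handled by memory mapping, so every
    // guarded access pays for this check:
    uint64_t host_offset(uint32_t guest_addr) {
      // Accesses at 0xE0000000 and above are shifted up one 4 KiB page.
      return uint64_t(guest_addr) +
             ((guest_addr >= 0xE0000000u) ? 0x1000u : 0u);
    }

r1- and r13-relative addresses never land in that window during normal execution, which is exactly the property is_definitely_not_eo() certifies before the check is skipped.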
@@ -49,7 +79,8 @@ RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest,
       return e.GetMembaseReg() + e.rax;
     }
   } else {
-    if (xe::memory::allocation_granularity() > 0x1000) {
+    if (xe::memory::allocation_granularity() > 0x1000 &&
+        !is_definitely_not_eo(guest)) {
       // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
       // it via memory mapping.
       e.xor_(e.eax, e.eax);
@@ -60,12 +91,12 @@ RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest,
     } else {
       // Clear the top 32 bits, as they are likely garbage.
       // TODO(benvanik): find a way to avoid doing this.

       e.mov(e.eax, guest.reg().cvt32());
     }
     return e.GetMembaseReg() + e.rax + offset_const;
   }
 }

 // Note: most *should* be aligned, but needs to be checked!
 template <typename T>
 RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) {
@@ -86,7 +117,8 @@ RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) {
       return e.GetMembaseReg() + e.rax;
     }
   } else {
-    if (xe::memory::allocation_granularity() > 0x1000) {
+    if (xe::memory::allocation_granularity() > 0x1000 &&
+        !is_definitely_not_eo(guest)) {
       // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
       // it via memory mapping.
       e.xor_(e.eax, e.eax);
@@ -728,28 +728,103 @@ struct VECTOR_SHL_V128
     }
   }

   static void EmitInt8(X64Emitter& e, const EmitArgType& i) {
     // TODO(benvanik): native version (with shift magic).
-    if (i.src2.is_constant) {
-      if (e.IsFeatureEnabled(kX64EmitGFNI)) {
-        const auto& shamt = i.src2.constant();
+    if (e.IsFeatureEnabled(kX64EmitAVX2)) {
+      if (!i.src2.is_constant) {
+        // Get the high 8 bytes of each operand.
+        e.vpunpckhqdq(e.xmm1, i.src1, i.src1);
+        e.vpunpckhqdq(e.xmm3, i.src2, i.src2);
+
+        e.vpmovzxbd(e.ymm0, i.src1);
+        e.vpmovzxbd(e.ymm1, e.xmm1);
+
+        e.vpmovzxbd(e.ymm2, i.src2);
+        e.vpmovzxbd(e.ymm3, e.xmm3);
+
+        e.vpsllvd(e.ymm0, e.ymm0, e.ymm2);
+        e.vpsllvd(e.ymm1, e.ymm1, e.ymm3);
+        e.vextracti128(e.xmm2, e.ymm0, 1);
+        e.vextracti128(e.xmm3, e.ymm1, 1);
+        e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMIntsToBytes));
+        e.vpshufb(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMIntsToBytes));
+        e.vpshufb(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMIntsToBytes));
+        e.vpshufb(e.xmm3, e.xmm3, e.GetXmmConstPtr(XMMIntsToBytes));
+
+        e.vpunpckldq(e.xmm0, e.xmm0, e.xmm1);
+        e.vpunpckldq(e.xmm2, e.xmm2, e.xmm3);
+        e.vpunpcklqdq(i.dest, e.xmm0, e.xmm2);
+        return;
+      } else {
+        vec128_t constmask = i.src2.constant();
+        for (unsigned n = 0; n < 16; ++n) {
+          constmask.u8[n] &= 7;
+        }
+
+        unsigned seenvalue = constmask.u8[0];
         bool all_same = true;
-        for (size_t n = 0; n < 16 - n; ++n) {
-          if (shamt.u8[n] != shamt.u8[n + 1]) {
+        for (unsigned n = 1; n < 16; ++n) {
+          if (constmask.u8[n] != seenvalue) {
             all_same = false;
             break;
           }
         }
         if (all_same) {
-          // Every count is the same, so we can use gf2p8affineqb.
-          const uint8_t shift_amount = shamt.u8[0] & 0b111;
-          const uint64_t shift_matrix =
-              UINT64_C(0x0102040810204080) >> (shift_amount * 8);
-          e.vgf2p8affineqb(i.dest, i.src1,
-                           e.StashConstantXmm(0, vec128q(shift_matrix)), 0);
+          // All lanes use the same count. (Disabled alternative: shift by
+          // repeated vpaddb doubling for counts 1-3.)
+          /*
+          if (seenvalue == 1) {
+            e.vpaddb(i.dest, i.src1, i.src1);
+          } else if (seenvalue == 2) {
+            e.vpaddb(i.dest, i.src1, i.src1);
+            e.vpaddb(i.dest, i.dest, i.dest);
+          } else if (seenvalue == 3) {
+            // mul by 8
+            e.vpaddb(i.dest, i.src1, i.src1);
+            e.vpaddb(i.dest, i.dest, i.dest);
+            e.vpaddb(i.dest, i.dest, i.dest);
+          } else
+          */
+          {
+            e.vpmovzxbw(e.ymm0, i.src1);
+            e.vpsllw(e.ymm0, e.ymm0, seenvalue);
+            e.vextracti128(e.xmm1, e.ymm0, 1);
+
+            e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMShortsToBytes));
+            e.vpshufb(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMShortsToBytes));
+            e.vpunpcklqdq(i.dest, e.xmm0, e.xmm1);
+            return;
+          }
+        } else {
+          e.LoadConstantXmm(e.xmm2, constmask);
+
+          e.vpunpckhqdq(e.xmm1, i.src1, i.src1);
+          e.vpunpckhqdq(e.xmm3, e.xmm2, e.xmm2);
+
+          e.vpmovzxbd(e.ymm0, i.src1);
+          e.vpmovzxbd(e.ymm1, e.xmm1);
+
+          e.vpmovzxbd(e.ymm2, e.xmm2);
+          e.vpmovzxbd(e.ymm3, e.xmm3);
+
+          e.vpsllvd(e.ymm0, e.ymm0, e.ymm2);
+          e.vpsllvd(e.ymm1, e.ymm1, e.ymm3);
+          e.vextracti128(e.xmm2, e.ymm0, 1);
+          e.vextracti128(e.xmm3, e.ymm1, 1);
+          e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMIntsToBytes));
+          e.vpshufb(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMIntsToBytes));
+          e.vpshufb(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMIntsToBytes));
+          e.vpshufb(e.xmm3, e.xmm3, e.GetXmmConstPtr(XMMIntsToBytes));
+
+          e.vpunpckldq(e.xmm0, e.xmm0, e.xmm1);
+          e.vpunpckldq(e.xmm2, e.xmm2, e.xmm3);
+          e.vpunpcklqdq(i.dest, e.xmm0, e.xmm2);
           return;
         }
       }
+    }
+    if (i.src2.is_constant) {
       e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
     } else {
       e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
@@ -758,7 +833,6 @@ struct VECTOR_SHL_V128
       e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShl<uint8_t>));
       e.vmovaps(i.dest, e.xmm0);
   }

   static void EmitInt16(X64Emitter& e, const EmitArgType& i) {
     Xmm src1;
     if (i.src1.is_constant) {
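All three AVX2 paths in EmitInt8 compute the same per-lane operation: widen each byte, shift it by its own count, and pack the low bytes back into a single register via the new XMMIntsToBytes/XMMShortsToBytes shuffle masks. A scalar reference for the intended semantics (a sketch; the & 7 mirrors the constant-path masking above, matching AltiVec vslb, which uses only the low three bits of each count):

    #include <cstdint>

    // Reference semantics for OPCODE_VECTOR_SHL on int8 lanes.
    void vector_shl_u8_ref(uint8_t dst[16], const uint8_t src[16],
                           const uint8_t count[16]) {
      for (int n = 0; n < 16; ++n) {
        dst[n] = static_cast<uint8_t>(src[n] << (count[n] & 7));
      }
    }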
@@ -38,6 +38,10 @@
 #include "xenia/cpu/hir/hir_builder.h"
 #include "xenia/cpu/processor.h"

+DEFINE_bool(use_fast_dot_product, false,
+            "Experimental optimization: a much shorter (four-instruction) "
+            "sequence for dot products, treating inf as overflow instead of "
+            "checking MXCSR",
+            "CPU");
 namespace xe {
 namespace cpu {
 namespace backend {
@@ -886,6 +890,9 @@ struct COMPARE_EQ_I8
           e.cmp(src1, src2);
         },
         [](X64Emitter& e, const Reg8& src1, int32_t constant) {
+          if (constant == 0) {
+            e.test(src1, src1);
+          } else
             e.cmp(src1, constant);
         });
     e.sete(i.dest);
@@ -900,6 +907,9 @@ struct COMPARE_EQ_I16
           e.cmp(src1, src2);
         },
         [](X64Emitter& e, const Reg16& src1, int32_t constant) {
+          if (constant == 0) {
+            e.test(src1, src1);
+          } else
             e.cmp(src1, constant);
         });
     e.sete(i.dest);
@@ -914,6 +924,9 @@ struct COMPARE_EQ_I32
           e.cmp(src1, src2);
         },
         [](X64Emitter& e, const Reg32& src1, int32_t constant) {
+          if (constant == 0) {
+            e.test(src1, src1);
+          } else
             e.cmp(src1, constant);
         });
     e.sete(i.dest);
@@ -928,6 +941,9 @@ struct COMPARE_EQ_I64
           e.cmp(src1, src2);
         },
         [](X64Emitter& e, const Reg64& src1, int32_t constant) {
+          if (constant == 0) {
+            e.test(src1, src1);
+          } else
             e.cmp(src1, constant);
         });
     e.sete(i.dest);
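Each of the four compare sequences gains the same micro-optimization: a compare against the constant 0 now emits test reg, reg instead of cmp reg, 0, dropping the immediate byte from the encoding while producing the same ZF, which is all sete reads. The equivalence as a small self-checking example:

    #include <cassert>
    #include <cstdint>

    // ZF after `test x, x` is ((x & x) == 0); after `cmp x, 0` it is
    // ((x - 0) == 0). Both reduce to (x == 0), so sete sees the same flag.
    int main() {
      for (uint32_t x : {0u, 1u, 0x80000000u, 0xFFFFFFFFu}) {
        assert(((x & x) == 0) == ((x - 0) == 0));
      }
      return 0;
    }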
@@ -1980,6 +1996,8 @@ struct DIV_V128 : Sequence<DIV_V128, I<OPCODE_DIV, V128Op, V128Op, V128Op>> {
     assert_true(!i.instr->flags);
     EmitAssociativeBinaryXmmOp(e, i,
                                [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
+                                 // e.vrcpps(e.xmm0, src2);
+                                 // e.vmulps(dest, src1, e.xmm0);
                                  e.vdivps(dest, src1, src2);
                                });
   }
@@ -2591,6 +2609,13 @@ EMITTER_OPCODE_TABLE(OPCODE_LOG2, LOG2_F32, LOG2_F64, LOG2_V128);

 struct DOT_PRODUCT_V128 {
   static void Emit(X64Emitter& e, Xmm dest, Xmm src1, Xmm src2, uint8_t imm) {
+    if (cvars::use_fast_dot_product) {
+      e.vdpps(dest, src1, src2, imm);
+      e.vandps(e.xmm0, dest, e.GetXmmConstPtr(XMMAbsMaskPS));
+      e.vcmpgeps(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMFloatInf));
+      e.vblendvps(dest, dest, e.GetXmmConstPtr(XMMQNaN), e.xmm0);
+    } else {
       // TODO(benvanik): apparently this is very slow
       // - find alternative?
       Xbyak::Label end;
@@ -2629,6 +2654,7 @@ struct DOT_PRODUCT_V128 {
       e.L(end);
       e.outLocalLabel();
     }
+  }
 };

 // ============================================================================
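With use_fast_dot_product set, the whole operation is vdpps plus a three-instruction fixup: vandps takes the absolute value of each result lane, vcmpgeps compares it against XMMFloatInf (the new 0x7F800000 constant), and vblendvps swaps any lane that reached infinity for QNaN. That treats inf as the overflow signal instead of consulting MXCSR, as the cvar description says. Per lane the fixup is equivalent to this scalar sketch:

    #include <cmath>
    #include <limits>

    float fixup_lane(float dot) {
      // |dot| >= +inf only when the lane overflowed to +/-inf.
      return std::fabs(dot) >= std::numeric_limits<float>::infinity()
                 ? std::numeric_limits<float>::quiet_NaN()
                 : dot;
    }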