From 7fe0a10d43d11f947089f30781a3efaa23e7e0d9 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 26 Jan 2014 11:16:55 -0800 Subject: [PATCH 001/184] Adding back --break_on_instruction. --- src/alloy/alloy-private.h | 3 +++ src/alloy/alloy.cc | 6 ++++++ src/alloy/frontend/ppc/ppc_hir_builder.cc | 5 +++-- src/xenia/cpu/cpu-private.h | 3 --- src/xenia/cpu/cpu.cc | 7 ------- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/alloy/alloy-private.h b/src/alloy/alloy-private.h index 76b59e381..213b4bfad 100644 --- a/src/alloy/alloy-private.h +++ b/src/alloy/alloy-private.h @@ -18,6 +18,9 @@ DECLARE_bool(debug); DECLARE_bool(always_disasm); +DECLARE_uint64(break_on_instruction); +DECLARE_uint64(break_on_memory); + namespace alloy { diff --git a/src/alloy/alloy.cc b/src/alloy/alloy.cc index 1fd5261c0..714036f71 100644 --- a/src/alloy/alloy.cc +++ b/src/alloy/alloy.cc @@ -24,3 +24,9 @@ DEFINE_bool(debug, DEFAULT_DEBUG_FLAG, DEFINE_bool(always_disasm, false, "Always add debug info to functions, even when no debugger is attached."); + +// Breakpoints: +DEFINE_uint64(break_on_instruction, 0, + "int3 before the given guest address is executed."); +DEFINE_uint64(break_on_memory, 0, + "int3 on read/write to the given memory address."); diff --git a/src/alloy/frontend/ppc/ppc_hir_builder.cc b/src/alloy/frontend/ppc/ppc_hir_builder.cc index 501fc3f15..2fc49396a 100644 --- a/src/alloy/frontend/ppc/ppc_hir_builder.cc +++ b/src/alloy/frontend/ppc/ppc_hir_builder.cc @@ -9,6 +9,7 @@ #include +#include #include #include #include @@ -125,10 +126,10 @@ int PPCHIRBuilder::Emit(FunctionInfo* symbol_info, bool with_debug_info) { typedef int (*InstrEmitter)(PPCHIRBuilder& f, InstrData& i); InstrEmitter emit = (InstrEmitter)i.type->emit; - /*if (i.address == FLAGS_break_on_instruction) { + if (i.address == FLAGS_break_on_instruction) { Comment("--break-on-instruction target"); DebugBreak(); - }*/ + } if (!i.type->emit || emit(*this, i)) { XELOGCPU("Unimplemented instr %.8X %.8X %s", diff --git a/src/xenia/cpu/cpu-private.h b/src/xenia/cpu/cpu-private.h index 1b49862f6..3272d2ece 100644 --- a/src/xenia/cpu/cpu-private.h +++ b/src/xenia/cpu/cpu-private.h @@ -20,9 +20,6 @@ DECLARE_bool(trace_user_calls); DECLARE_bool(trace_kernel_calls); DECLARE_uint64(trace_thread_mask); -DECLARE_uint64(break_on_instruction); -DECLARE_uint64(break_on_memory); - DECLARE_string(load_module_map); DECLARE_string(dump_path); diff --git a/src/xenia/cpu/cpu.cc b/src/xenia/cpu/cpu.cc index 12c8118f6..389aeee18 100644 --- a/src/xenia/cpu/cpu.cc +++ b/src/xenia/cpu/cpu.cc @@ -25,13 +25,6 @@ DEFINE_uint64(trace_thread_mask, -1, "Trace threads with IDs in the mask, or -1 for all."); -// Breakpoints: -DEFINE_uint64(break_on_instruction, 0, - "int3 before the given guest address is executed."); -DEFINE_uint64(break_on_memory, 0, - "int3 on read/write to the given memory address."); - - // Debugging: DEFINE_string(load_module_map, "", "Loads a .map for symbol names and to diff with the generated symbol " From 5eeeee7093ad97316ad7aea5a73d7c968713df3b Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 26 Jan 2014 11:17:12 -0800 Subject: [PATCH 002/184] Fixing jumps. 
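CheckBoolean() leaves ZF = 1 when the tested value is zero (false), so the
conditional sequences had their jump senses inverted. Guarded bodies
(DEBUG_BREAK_TRUE, TRAP_TRUE, CALL_TRUE, CALL_INDIRECT_TRUE) must be skipped
with jz, while taken paths (RETURN_TRUE, BRANCH_TRUE) branch with jnz. A
sketch of the two corrected patterns, assuming CheckBoolean has just run:

    jz  .x          ; predicate false: skip the guarded body
    ; ...guarded body...
  .x:

    jnz target      ; predicate true: take the branch

The epilog jumps also become explicit T_NEAR, since the epilog label can
easily sit outside short-jump range.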
--- .../backend/x64/lowering/lowering_sequences.cc | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index 5b2cac041..9e9bebdfd 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -89,6 +89,7 @@ void IssueCallIndirect(X64Emitter& e, Value* target, uint32_t flags) { } // Sets EFLAGs with zf for the given value. +// ZF = 1 if false, 0 = true (so jz = jump if false) void CheckBoolean(X64Emitter& e, Value* v) { if (v->IsConstant()) { e.mov(e.ah, (v->IsConstantZero() ? 1 : 0) << 6); @@ -558,7 +559,7 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { table->AddSequence(OPCODE_DEBUG_BREAK_TRUE, [](X64Emitter& e, Instr*& i) { e.inLocalLabel(); CheckBoolean(e, i->src1.value); - e.jne(".x", e.T_SHORT); + e.jz(".x", e.T_SHORT); // TODO(benvanik): insert a call to the debug break function to let the // debugger know. e.db(0xCC); @@ -579,7 +580,7 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { table->AddSequence(OPCODE_TRAP_TRUE, [](X64Emitter& e, Instr*& i) { e.inLocalLabel(); CheckBoolean(e, i->src1.value); - e.jne(".x", e.T_SHORT); + e.jz(".x", e.T_SHORT); // TODO(benvanik): insert a call to the trap function to let the // debugger know. e.db(0xCC); @@ -602,7 +603,7 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { table->AddSequence(OPCODE_CALL_TRUE, [](X64Emitter& e, Instr*& i) { e.inLocalLabel(); CheckBoolean(e, i->src1.value); - e.jne(".x", e.T_SHORT); + e.jz(".x", e.T_SHORT); IssueCall(e, i->src2.symbol_info, i->flags); e.L(".x"); e.outLocalLabel(); @@ -619,7 +620,7 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { table->AddSequence(OPCODE_CALL_INDIRECT_TRUE, [](X64Emitter& e, Instr*& i) { e.inLocalLabel(); CheckBoolean(e, i->src1.value); - e.jne(".x", e.T_SHORT); + e.jz(".x", e.T_SHORT); IssueCallIndirect(e, i->src2.value, i->flags); e.L(".x"); e.outLocalLabel(); @@ -631,7 +632,7 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { // If this is the last instruction in the last block, just let us // fall through. 
if (i->next || i->block->next) { - e.jmp("epilog"); + e.jmp("epilog", CodeGenerator::T_NEAR); } i = e.Advance(i); return true; @@ -639,7 +640,7 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { table->AddSequence(OPCODE_RETURN_TRUE, [](X64Emitter& e, Instr*& i) { CheckBoolean(e, i->src1.value); - e.je("epilog"); + e.jnz("epilog", CodeGenerator::T_NEAR); i = e.Advance(i); return true; }); @@ -658,7 +659,7 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { table->AddSequence(OPCODE_BRANCH_TRUE, [](X64Emitter& e, Instr*& i) { CheckBoolean(e, i->src1.value); auto target = i->src2.label; - e.je(target->name, e.T_NEAR); + e.jnz(target->name, e.T_NEAR); i = e.Advance(i); return true; }); @@ -666,7 +667,7 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { table->AddSequence(OPCODE_BRANCH_FALSE, [](X64Emitter& e, Instr*& i) { CheckBoolean(e, i->src1.value); auto target = i->src2.label; - e.jne(target->name, e.T_NEAR); + e.jz(target->name, e.T_NEAR); i = e.Advance(i); return true; }); From ac59b6185133759815e70ecf163f2cd81610a361 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 26 Jan 2014 11:17:53 -0800 Subject: [PATCH 003/184] Fixing register eviction. --- src/alloy/backend/x64/x64_emitter.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index 3d6b3cfa3..6700b8ed4 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -242,6 +242,8 @@ void X64Emitter::FindFreeRegs( if (!free_regs) { // Need to evict something. EvictStaleRegs(); + free_regs = avail_regs & ~reg_state_.live_regs; + XEASSERT(free_regs); } // Find the first available. From aadf92e4eab0bc3376207b68edeb0b2bc2fd70e3 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 26 Jan 2014 11:18:12 -0800 Subject: [PATCH 004/184] 'Fixing' register clobbering by disabling a bunch of nonvolatile regs. --- src/alloy/backend/x64/x64_emitter.cc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index 6700b8ed4..2e17e1543 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -106,7 +106,13 @@ int X64Emitter::Emit(HIRBuilder* builder) { GetRegBit(rbp) | GetRegBit(rsi) | GetRegBit(rdi) | - GetRegBit(xmm0); + GetRegBit(xmm0) | + + // TODO(benvanik): save so that we can use these. + GetRegBit(r8) | + GetRegBit(r9) | + GetRegBit(r10) | + GetRegBit(r11); // Function prolog. // Must be 16b aligned. From bbf3b4bdab4fda7788171d9b9989c12d69c8236d Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 26 Jan 2014 11:45:58 -0800 Subject: [PATCH 005/184] Fixing tail calls. 
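Calls now resolve the target's machine code on the host side and call or
jump to it directly. The convention: rcx holds the PPC context (spilled to
[rsp+0] in the prolog) and rdx caches membase, which moves to offset 8 of
PPCContext so it can be cheaply reloaded after anything that clobbers it.
A condensed sketch of the generic call path as emitted below:

    mov  rdx, symbol_info
    mov  rax, ResolveFunctionSymbol
    call rax                ; rax = target machine code
    mov  rcx, [rsp+0]       ; reload context
    mov  rdx, [rcx+8]       ; reload membase
    call rax                ; tail call: add rsp, 0x40 / jmp rax instead

Extern functions skip resolution entirely since their handler and args are
known at emit time, and X64Function now exposes machine_code()/code_size()
so the resolvers can hand back raw code pointers.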
--- .../x64/lowering/lowering_sequences.cc | 69 +++++++++++++------ src/alloy/backend/x64/x64_emitter.cc | 5 +- src/alloy/backend/x64/x64_function.h | 3 + src/alloy/frontend/ppc/ppc_context.h | 3 +- 4 files changed, 57 insertions(+), 23 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index 9e9bebdfd..a5c41f5a0 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -35,7 +36,7 @@ void Dummy() { // } -void PrintString(void* raw_context, uint8_t* membase, const char* str) { +void PrintString(void* raw_context, const char* str) { // TODO(benvanik): generate this thunk at runtime? or a shim? auto thread_state = *((ThreadState**)raw_context); fprintf(stdout, "XE[t] :%d: %s\n", thread_state->GetThreadID(), str); @@ -43,48 +44,74 @@ void PrintString(void* raw_context, uint8_t* membase, const char* str) { } // TODO(benvanik): fancy stuff. -void CallThunk(void* raw_context, uint8_t* membase, - FunctionInfo* symbol_info) { +void* ResolveFunctionSymbol(void* raw_context, FunctionInfo* symbol_info) { // TODO(benvanik): generate this thunk at runtime? or a shim? auto thread_state = *((ThreadState**)raw_context); Function* fn = NULL; thread_state->runtime()->ResolveFunction(symbol_info->address(), &fn); XEASSERTNOTNULL(fn); - fn->Call(thread_state); + XEASSERT(fn->type() == Function::USER_FUNCTION); + auto x64_fn = (X64Function*)fn; + return x64_fn->machine_code(); +} +void* ResolveFunctionAddress(void* raw_context, uint64_t target_address) { + // TODO(benvanik): generate this thunk at runtime? or a shim? + auto thread_state = *((ThreadState**)raw_context); + + Function* fn = NULL; + thread_state->runtime()->ResolveFunction(target_address, &fn); + XEASSERTNOTNULL(fn); + XEASSERTALWAYS(); + //fn->Call(thread_state); + return 0; } void IssueCall(X64Emitter& e, FunctionInfo* symbol_info, uint32_t flags) { - e.mov(e.r8, (uint64_t)symbol_info); - e.mov(e.rax, (uint64_t)CallThunk); + // If we are an extern function, we can directly insert a call. + auto fn = symbol_info->function(); + if (fn && fn->type() == Function::EXTERN_FUNCTION) { + auto extern_fn = (ExternFunction*)fn; + e.mov(e.rdx, (uint64_t)extern_fn->arg0()); + e.mov(e.r8, (uint64_t)extern_fn->arg1()); + e.mov(e.rax, (uint64_t)extern_fn->handler()); + } else { + // Generic call, resolve address. + // TODO(benvanik): caching/etc. For now this makes debugging easier. + e.mov(e.rdx, (uint64_t)symbol_info); + e.mov(e.rax, (uint64_t)ResolveFunctionSymbol); + e.call(e.rax); + e.mov(e.rcx, e.qword[e.rsp + 0]); + e.mov(e.rdx, e.qword[e.rcx + 8]); // membase + } if (flags & CALL_TAIL) { + // TODO(benvanik): adjust stack? + e.add(e.rsp, 0x40); e.jmp(e.rax); } else { e.call(e.rax); - e.mov(e.rdx, e.qword[e.rsp + 8]); e.mov(e.rcx, e.qword[e.rsp + 0]); + e.mov(e.rdx, e.qword[e.rcx + 8]); // membase } } - -void IndirectCallThunk(void* raw_context, uint8_t* membase, - uint64_t target_address) { - // TODO(benvanik): generate this thunk at runtime? or a shim? 
- auto thread_state = *((ThreadState**)raw_context); - XEASSERTALWAYS(); -} void IssueCallIndirect(X64Emitter& e, Value* target, uint32_t flags) { Reg64 r; e.BeginOp(target, r, 0); - if (r != e.r8) { - e.mov(e.r8, r); + if (r != e.rdx) { + e.mov(e.rdx, r); } e.EndOp(r); - e.mov(e.rax, (uint64_t)IndirectCallThunk); + e.mov(e.rax, (uint64_t)ResolveFunctionAddress); + e.call(e.rax); + e.mov(e.rcx, e.qword[e.rsp + 0]); + e.mov(e.rdx, e.qword[e.rcx + 8]); // membase if (flags & CALL_TAIL) { + // TODO(benvanik): adjust stack? + e.add(e.rsp, 0x40); e.jmp(e.rax); } else { - e.sub(e.rsp, 0x20); e.call(e.rax); - e.add(e.rsp, 0x20); + e.mov(e.rcx, e.qword[e.rsp + 0]); + e.mov(e.rdx, e.qword[e.rcx + 8]); // membase } } @@ -514,11 +541,11 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { // TODO(benvanik): pass through. auto str = (const char*)i->src1.offset; auto str_copy = xestrdupa(str); - e.mov(e.r8, (uint64_t)str_copy); + e.mov(e.rdx, (uint64_t)str_copy); e.mov(e.rax, (uint64_t)PrintString); e.call(e.rax); - e.mov(e.rdx, e.qword[e.rsp + 8]); e.mov(e.rcx, e.qword[e.rsp + 0]); + e.mov(e.rdx, e.qword[e.rcx + 8]); // membase i = e.Advance(i); return true; }); diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index 2e17e1543..9b68e16be 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -128,7 +128,6 @@ int X64Emitter::Emit(HIRBuilder* builder) { const bool emit_prolog = true; const size_t stack_size = 64; if (emit_prolog) { - mov(qword[rsp + 16], rdx); mov(qword[rsp + 8], rcx); sub(rsp, stack_size); mov(qword[rsp + 8 * 0], rbx); @@ -138,6 +137,10 @@ int X64Emitter::Emit(HIRBuilder* builder) { mov(qword[rsp + 8 * 4], r15); } + // membase stays in rdx. If we evict it (like on function calls) we + // must put it back. + mov(rdx, qword[rcx + 8]); + auto lowering_table = backend_->lowering_table(); // Body. diff --git a/src/alloy/backend/x64/x64_function.h b/src/alloy/backend/x64/x64_function.h index cf06bb6c4..e879a72c7 100644 --- a/src/alloy/backend/x64/x64_function.h +++ b/src/alloy/backend/x64/x64_function.h @@ -25,6 +25,9 @@ public: X64Function(runtime::FunctionInfo* symbol_info); virtual ~X64Function(); + void* machine_code() const { return machine_code_; } + size_t code_size() const { return code_size_; } + void Setup(void* machine_code, size_t code_size); protected: diff --git a/src/alloy/frontend/ppc/ppc_context.h b/src/alloy/frontend/ppc/ppc_context.h index 92d6d2877..5bc5f159e 100644 --- a/src/alloy/frontend/ppc/ppc_context.h +++ b/src/alloy/frontend/ppc/ppc_context.h @@ -67,6 +67,8 @@ typedef struct XECACHEALIGN64 PPCContext_s { // Must be stored at 0x0 for now. // TODO(benvanik): find a nice way to describe this to the JIT. runtime::ThreadState* thread_state; + // TODO(benvanik): this is getting nasty. Must be here. + uint8_t* membase; // Most frequently used registers first. uint64_t r[32]; // General purpose registers @@ -196,7 +198,6 @@ typedef struct XECACHEALIGN64 PPCContext_s { // Runtime-specific data pointer. Used on callbacks to get access to the // current runtime and its data. - uint8_t* membase; runtime::Runtime* runtime; volatile int suspend_flag; From 122761835ebc73431a359add2b884e57da4b17d0 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 26 Jan 2014 12:14:29 -0800 Subject: [PATCH 006/184] Adding new instructions and cleaning things up a bit. 
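BinaryOp() now asserts dest.type == src1.type while accepting a narrower i8
src2, the shape the shift and rotate opcodes produce for their shift
amounts, e.g.:

    } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I8)) {
      Reg32 dest, src1;
      Reg8 src2;
      BinaryOpVV(e, i, vv_fn, dest, src1, src2);
    }

With that in place, SHL/SHR/SHA/ROTATE_LEFT lower through the shared helper
(one register lambda and one constant lambda each), replacing the
hand-rolled per-type ROTATE_LEFT matching. ADD_CARRY gains the missing
BeginOp/EndOp bracketing around its operands, and 32->64 sign extension
switches from movsx to movsxd, the correct encoding for a 32-bit source.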
--- .../x64/lowering/lowering_sequences.cc | 144 ++++++++++++------ 1 file changed, 97 insertions(+), 47 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index a5c41f5a0..f6442a0a0 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -410,6 +410,9 @@ void BinaryOpCV(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn, e.EndOp(dest, src2); } void BinaryOp(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn) { + // TODO(benvanik): table lookup. This linear scan is slow. + // Note: we assume DEST.type = SRC1.type, but that SRC2.type may vary. + XEASSERT(i->dest->type == i->src1.value->type); if (i->Match(SIG_TYPE_I8, SIG_TYPE_I8, SIG_TYPE_I8)) { Reg8 dest, src1, src2; BinaryOpVV(e, i, vv_fn, dest, src1, src2); @@ -446,6 +449,40 @@ void BinaryOp(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn) { } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64C, SIG_TYPE_I64)) { Reg64 dest, src2; BinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); + // Start forced src2=i8 + } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I8)) { + Reg16 dest, src1; + Reg8 src2; + BinaryOpVV(e, i, vv_fn, dest, src1, src2); + } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I8C)) { + Reg16 dest, src1; + BinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); + } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16C, SIG_TYPE_I8)) { + Reg16 dest; + Reg8 src2; + BinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); + } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I8)) { + Reg32 dest, src1; + Reg8 src2; + BinaryOpVV(e, i, vv_fn, dest, src1, src2); + } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I8C)) { + Reg32 dest, src1; + BinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); + } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32C, SIG_TYPE_I8)) { + Reg32 dest; + Reg8 src2; + BinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); + } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I8)) { + Reg64 dest, src1; + Reg8 src2; + BinaryOpVV(e, i, vv_fn, dest, src1, src2); + } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I8C)) { + Reg64 dest, src1; + BinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); + } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64C, SIG_TYPE_I8)) { + Reg64 dest; + Reg8 src2; + BinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); } else { ASSERT_INVALID_TYPE(); } @@ -811,7 +848,7 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { Reg32 src; e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src, 0); - e.movsx(dest, src.cvt32()); + e.movsxd(dest, src.cvt32()); e.EndOp(dest, src); } else { UNIMPLEMENTED_SEQ(); @@ -1407,35 +1444,55 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8, SIG_TYPE_I8, SIG_TYPE_I8)) { Reg8 dest, src1, src2; Reg8 ca; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0, + i->src2.value, src2, 0, + i->src3.value, ca, 0); TernaryOpVVV(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src2, const Operand& src3) { e.mov(e.ah, src3); e.sahf(); e.adc(dest_src, src2); }, dest, src1, src2, ca); + e.EndOp(dest, src1, src2, ca); } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I8)) { Reg16 dest, src1, src2; Reg8 ca; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0, + i->src2.value, src2, 0, + i->src3.value, ca, 
0); TernaryOpVVV(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src2, const Operand& src3) { e.mov(e.ah, src3); e.sahf(); e.adc(dest_src, src2); }, dest, src1, src2, ca); + e.EndOp(dest, src1, src2, ca); } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I8)) { Reg32 dest, src1, src2; Reg8 ca; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0, + i->src2.value, src2, 0, + i->src3.value, ca, 0); TernaryOpVVV(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src2, const Operand& src3) { e.mov(e.ah, src3); e.sahf(); e.adc(dest_src, src2); }, dest, src1, src2, ca); + e.EndOp(dest, src1, src2, ca); } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I8)) { Reg64 dest, src1, src2; Reg8 ca; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0, + i->src2.value, src2, 0, + i->src3.value, ca, 0); TernaryOpVVV(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src2, const Operand& src3) { e.mov(e.ah, src3); e.sahf(); e.adc(dest_src, src2); }, dest, src1, src2, ca); + e.EndOp(dest, src1, src2, ca); } else { UNIMPLEMENTED_SEQ(); } @@ -1584,7 +1641,16 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { }); table->AddSequence(OPCODE_SHL, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + // TODO(benvanik): use shlx if available. + BinaryOp( + e, i, + [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { + Reg8 shamt(src.getIdx()); + e.shl(dest_src, shamt); + }, + [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { + e.shl(dest_src, src); + }); i = e.Advance(i); return true; }); @@ -1596,7 +1662,16 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { }); table->AddSequence(OPCODE_SHR, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + // TODO(benvanik): use shrx if available. + BinaryOp( + e, i, + [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { + Reg8 shamt(src.getIdx()); + e.shr(dest_src, shamt); + }, + [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { + e.shr(dest_src, src); + }); i = e.Advance(i); return true; }); @@ -1608,7 +1683,16 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { }); table->AddSequence(OPCODE_SHA, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + // TODO(benvanik): use sarx if available. 
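+      // SHA is the arithmetic (sign-propagating) right shift; SHR above is
+      // the logical one. Plain x86 shifts only take a variable count in CL,
+      // a restriction BMI2's sarx/shrx/shlx would lift.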
+ BinaryOp( + e, i, + [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { + Reg8 shamt(src.getIdx()); + e.sar(dest_src, shamt); + }, + [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { + e.sar(dest_src, src); + }); i = e.Advance(i); return true; }); @@ -1620,49 +1704,15 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { }); table->AddSequence(OPCODE_ROTATE_LEFT, [](X64Emitter& e, Instr*& i) { - if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8, SIG_TYPE_I8C)) { - Reg8 dest; - Reg8 src1; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0); - if (dest != src1) { - e.mov(dest, src1); - } - e.rol(dest, i->src2.value->constant.i8); - e.EndOp(dest, src1); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16, SIG_TYPE_I8C)) { - Reg8 dest; - Reg16 src1; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0); - if (dest != src1) { - e.mov(dest, src1); - } - e.rol(dest, i->src2.value->constant.i8); - e.EndOp(dest, src1); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32, SIG_TYPE_I8C)) { - Reg8 dest; - Reg32 src1; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0); - if (dest != src1) { - e.mov(dest, src1); - } - e.rol(dest, i->src2.value->constant.i8); - e.EndOp(dest, src1); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64, SIG_TYPE_I8C)) { - Reg8 dest; - Reg64 src1; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0); - if (dest != src1) { - e.mov(dest, src1); - } - e.rol(dest, i->src2.value->constant.i8); - e.EndOp(dest, src1); - } else { - UNIMPLEMENTED_SEQ(); - } + BinaryOp( + e, i, + [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { + Reg8 shamt(src.getIdx()); + e.rol(dest_src, shamt); + }, + [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { + e.rol(dest_src, src); + }); i = e.Advance(i); return true; }); From e52a7bc3afc919c3710877a066bce17199a83856 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 26 Jan 2014 15:10:39 -0800 Subject: [PATCH 007/184] Fixing things, breaking others. --- .../x64/lowering/lowering_sequences.cc | 224 +++++++++++++----- 1 file changed, 159 insertions(+), 65 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index f6442a0a0..459c2bbdf 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -373,7 +373,7 @@ template void BinaryOpCV(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn, TD& dest, Value* src1, TS2& src2) { e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src2, 0); + i->src2.value, src2, 0); if (dest.getBit() <= 32) { // 32-bit. if (dest == src2) { @@ -563,7 +563,111 @@ void TernaryOpVVC(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vvc_fn vvc_fn, vvv_fn(e, *i, dest, src2, e.rax); } } - e.EndOp(dest, src1); + e.EndOp(dest, src1, src2); +} +template +void TernaryOpVCV(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vcv_fn vcv_fn, + TD& dest, TS1& src1, Value* src2, TS3& src3) { + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0, + i->src3.value, src3, 0); + if (dest.getBit() <= 32) { + // 32-bit. + if (dest == src1) { + vcv_fn(e, *i, dest, (uint32_t)src2->get_constant(CT()), src3); + } else if (dest == src3) { + if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { + vcv_fn(e, *i, dest, (uint32_t)src2->get_constant(CT()), src1); + } else { + // Eww. 
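+        // dest aliases src3 and the op is not commutative: park src3 in
+        // rax, overwrite dest with src1, then feed rax back in as the
+        // third operand.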
+ e.mov(e.rax, src3); + e.mov(dest, src1); + vcv_fn(e, *i, dest, (uint32_t)src2->get_constant(CT()), e.rax); + } + } else { + e.mov(dest, src1); + vcv_fn(e, *i, dest, (uint32_t)src2->get_constant(CT()), src3); + } + } else { + // 64-bit. + if (dest == src1) { + e.mov(e.rax, src2->constant.i64); + vvv_fn(e, *i, dest, e.rax, src3); + } else if (dest == src3) { + if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { + e.mov(e.rax, src2->constant.i64); + vvv_fn(e, *i, dest, src1, e.rax); + } else { + // Eww. + e.mov(e.rax, src1); + e.mov(src1, src3); + e.mov(dest, e.rax); + e.mov(e.rax, src2->constant.i64); + vvv_fn(e, *i, dest, e.rax, src1); + } + } else { + e.mov(e.rax, src2->constant.i64); + e.mov(dest, src1); + vvv_fn(e, *i, dest, e.rax, src3); + } + } + e.EndOp(dest, src1, src3); +} +void TernaryOp(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vvc_fn vvc_fn, vcv_fn vcv_fn) { + // TODO(benvanik): table lookup. This linear scan is slow. + // Note: we assume DEST.type = SRC1.type = SRC2.type, but that SRC3.type may vary. + XEASSERT(i->dest->type == i->src1.value->type && + i->dest->type == i->src2.value->type); + // TODO(benvanik): table lookup. + if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8, SIG_TYPE_I8, SIG_TYPE_I8)) { + Reg8 dest, src1, src2; + Reg8 src3; + TernaryOpVVV(e, i, vvv_fn, dest, src1, src2, src3); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8, SIG_TYPE_I8, SIG_TYPE_I8C)) { + Reg8 dest, src1, src2; + TernaryOpVVC(e, i, vvv_fn, vvc_fn, dest, src1, src2, i->src3.value); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I8)) { + Reg16 dest, src1, src2; + Reg8 src3; + TernaryOpVVV(e, i, vvv_fn, dest, src1, src2, src3); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I8C)) { + Reg16 dest, src1, src2; + TernaryOpVVC(e, i, vvv_fn, vvc_fn, dest, src1, src2, i->src3.value); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I8)) { + Reg32 dest, src1, src2; + Reg8 src3; + TernaryOpVVV(e, i,vvv_fn, dest, src1, src2, src3); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I8C)) { + Reg32 dest, src1, src2; + TernaryOpVVC(e, i, vvv_fn, vvc_fn, dest, src1, src2, i->src3.value); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I8)) { + Reg64 dest, src1, src2; + Reg8 src3; + TernaryOpVVV(e, i, vvv_fn, dest, src1, src2, src3); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I8C)) { + Reg64 dest, src1, src2; + TernaryOpVVC(e, i, vvv_fn, vvc_fn, dest, src1, src2, i->src3.value); + // + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8, SIG_TYPE_I8C, SIG_TYPE_I8)) { + Reg8 dest, src1, src3; + TernaryOpVCV(e, i, vvv_fn, vcv_fn, dest, src1, i->src2.value, src3); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16, SIG_TYPE_I16C, SIG_TYPE_I8)) { + Reg16 dest, src1, src3; + TernaryOpVCV(e, i, vvv_fn, vcv_fn, dest, src1, i->src2.value, src3); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32, SIG_TYPE_I32C, SIG_TYPE_I8)) { + Reg32 dest, src1, src3; + TernaryOpVCV(e, i, vvv_fn, vcv_fn, dest, src1, i->src2.value, src3); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64, SIG_TYPE_I64C, SIG_TYPE_I8)) { + Reg64 dest, src1, src3; + TernaryOpVCV(e, i, vvv_fn, vcv_fn, dest, src1, i->src2.value, src3); + } else { + ASSERT_INVALID_TYPE(); + } + if (i->flags & ARITHMETIC_SET_CARRY) { + // EFLAGS should have CA set? 
+ // (so long as we don't fuck with it) + // UNIMPLEMENTED_SEQ(); + } } } // namespace @@ -1441,61 +1545,36 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { table->AddSequence(OPCODE_ADD_CARRY, [](X64Emitter& e, Instr*& i) { // dest = src1 + src2 + src3.i8 - if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8, SIG_TYPE_I8, SIG_TYPE_I8)) { - Reg8 dest, src1, src2; - Reg8 ca; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src2.value, src2, 0, - i->src3.value, ca, 0); - TernaryOpVVV(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src2, const Operand& src3) { - e.mov(e.ah, src3); - e.sahf(); - e.adc(dest_src, src2); - }, dest, src1, src2, ca); - e.EndOp(dest, src1, src2, ca); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I8)) { - Reg16 dest, src1, src2; - Reg8 ca; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src2.value, src2, 0, - i->src3.value, ca, 0); - TernaryOpVVV(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src2, const Operand& src3) { - e.mov(e.ah, src3); - e.sahf(); - e.adc(dest_src, src2); - }, dest, src1, src2, ca); - e.EndOp(dest, src1, src2, ca); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I8)) { - Reg32 dest, src1, src2; - Reg8 ca; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src2.value, src2, 0, - i->src3.value, ca, 0); - TernaryOpVVV(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src2, const Operand& src3) { - e.mov(e.ah, src3); - e.sahf(); - e.adc(dest_src, src2); - }, dest, src1, src2, ca); - e.EndOp(dest, src1, src2, ca); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I8)) { - Reg64 dest, src1, src2; - Reg8 ca; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src2.value, src2, 0, - i->src3.value, ca, 0); - TernaryOpVVV(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src2, const Operand& src3) { - e.mov(e.ah, src3); - e.sahf(); - e.adc(dest_src, src2); - }, dest, src1, src2, ca); - e.EndOp(dest, src1, src2, ca); - } else { - UNIMPLEMENTED_SEQ(); - } + TernaryOp( + e, i, + [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src2, const Operand& src3) { + Reg8 src3_8(src3.getIdx()); + if (src3.getIdx() <= 4) { + e.mov(e.ah, src3_8); + } else { + e.mov(e.al, src3_8); + e.mov(e.ah, e.al); + } + e.sahf(); + e.adc(dest_src, src2); + }, + [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src2, uint32_t src3) { + e.mov(e.eax, src3); + e.mov(e.ah, e.al); + e.sahf(); + e.adc(dest_src, src2); + }, + [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src2, const Operand& src3) { + Reg8 src3_8(src3.getIdx()); + if (src3.getIdx() <= 4) { + e.mov(e.ah, src3_8); + } else { + e.mov(e.al, src3_8); + e.mov(e.ah, e.al); + } + e.sahf(); + e.adc(dest_src, src2); + }); i = e.Advance(i); return true; }); @@ -1645,8 +1724,17 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { BinaryOp( e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { + // Can only shl by cl. Eww x86. Reg8 shamt(src.getIdx()); - e.shl(dest_src, shamt); + e.mov(e.rax, e.rcx); + e.mov(e.cl, shamt); + e.shl(dest_src, e.cl); + e.mov(e.rcx, e.rax); + // BeaEngine can't disasm this, boo. 
+ /*Reg32e dest_src_e(dest_src.getIdx(), MAX(dest_src.getBit(), 32)); + Reg32e src_e(src.getIdx(), MAX(dest_src.getBit(), 32)); + e.and(src_e, 0x3F); + e.shlx(dest_src_e, dest_src_e, src_e);*/ }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { e.shl(dest_src, src); @@ -1720,14 +1808,20 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { table->AddSequence(OPCODE_BYTE_SWAP, [](X64Emitter& e, Instr*& i) { if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16)) { Reg16 d, s1; - e.BeginOp(i->dest, d, REG_DEST | REG_ABCD, + // TODO(benvanik): fix register allocator to put the value in ABCD + //e.BeginOp(i->dest, d, REG_DEST | REG_ABCD, + // i->src1.value, s1, 0); + //if (d != s1) { + // e.mov(d, s1); + // e.xchg(d.cvt8(), Reg8(d.getIdx() + 4)); + //} else { + // e.xchg(d.cvt8(), Reg8(d.getIdx() + 4)); + //} + e.BeginOp(i->dest, d, REG_DEST, i->src1.value, s1, 0); - if (d != s1) { - e.mov(d, s1); - e.xchg(d.cvt8(), Reg8(d.getIdx() + 4)); - } else { - e.xchg(d.cvt8(), Reg8(d.getIdx() + 4)); - } + e.mov(e.ax, s1); + e.xchg(e.ah, e.al); + e.mov(d, e.ax); e.EndOp(d, s1); } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32)) { Reg32 d, s1; From ebaf8aa16da298da6c5635177168519a66467c40 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 26 Jan 2014 15:46:10 -0800 Subject: [PATCH 008/184] Fixing extends. Xbyak doesn't have good validation, so expecting more :( --- .../x64/lowering/lowering_sequences.cc | 40 ++++++++++++------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index 459c2bbdf..a68506b86 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -867,35 +867,35 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { Reg8 src; e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src, 0); - e.movzx(dest, src.cvt8()); + e.movzx(dest, src); e.EndOp(dest, src); } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I8)) { Reg32 dest; Reg8 src; e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src, 0); - e.movzx(dest, src.cvt8()); + e.movzx(dest, src); e.EndOp(dest, src); } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I16)) { Reg32 dest; Reg16 src; e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src, 0); - e.movzx(dest, src.cvt8()); + e.movzx(dest, src); e.EndOp(dest, src); } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I8)) { Reg64 dest; Reg8 src; e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src, 0); - e.movzx(dest, src.cvt16()); + e.movzx(dest, src); e.EndOp(dest, src); } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I16)) { Reg64 dest; Reg16 src; e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src, 0); - e.movzx(dest, src.cvt16()); + e.movzx(dest, src); e.EndOp(dest, src); } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I32)) { Reg64 dest; @@ -917,42 +917,42 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { Reg8 src; e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src, 0); - e.movsx(dest, src.cvt8()); + e.movsx(dest, src); e.EndOp(dest, src); } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I8)) { Reg32 dest; Reg8 src; e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src, 0); - e.movsx(dest, src.cvt8()); + e.movsx(dest, src); e.EndOp(dest, src); } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I16)) { Reg32 dest; Reg16 src; e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src, 0); - e.movsx(dest, src.cvt8()); + e.movsx(dest, src); e.EndOp(dest, src); } 
else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I8)) { Reg64 dest; Reg8 src; e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src, 0); - e.movsx(dest, src.cvt16()); + e.movsx(dest, src); e.EndOp(dest, src); } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I16)) { Reg64 dest; Reg16 src; e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src, 0); - e.movsx(dest, src.cvt16()); + e.movsx(dest, src); e.EndOp(dest, src); } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I32)) { Reg64 dest; Reg32 src; e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src, 0); - e.movsxd(dest, src.cvt32()); + e.movsxd(dest, src); e.EndOp(dest, src); } else { UNIMPLEMENTED_SEQ(); @@ -1754,8 +1754,12 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { BinaryOp( e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { + // Can only sar by cl. Eww x86. Reg8 shamt(src.getIdx()); - e.shr(dest_src, shamt); + e.mov(e.rax, e.rcx); + e.mov(e.cl, shamt); + e.shr(dest_src, e.cl); + e.mov(e.rcx, e.rax); }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { e.shr(dest_src, src); @@ -1775,8 +1779,12 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { BinaryOp( e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { + // Can only sar by cl. Eww x86. Reg8 shamt(src.getIdx()); - e.sar(dest_src, shamt); + e.mov(e.rax, e.rcx); + e.mov(e.cl, shamt); + e.sar(dest_src, e.cl); + e.mov(e.rcx, e.rax); }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { e.sar(dest_src, src); @@ -1795,8 +1803,12 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { BinaryOp( e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { + // Can only rol by cl. Eww x86. Reg8 shamt(src.getIdx()); - e.rol(dest_src, shamt); + e.mov(e.rax, e.rcx); + e.mov(e.cl, shamt); + e.rol(dest_src, e.cl); + e.mov(e.rcx, e.rax); }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { e.rol(dest_src, src); From f55fb17e1b54b0396414f96e485f918a91bf7fac Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 26 Jan 2014 16:09:25 -0800 Subject: [PATCH 009/184] MUL/MUL_HI/DIV --- .../x64/lowering/lowering_sequences.cc | 96 ++++++++++++++++++- 1 file changed, 93 insertions(+), 3 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index a68506b86..f29854bdc 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -1592,20 +1592,110 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { return true; }); +#define LIKE_REG(dest, like) Operand(dest.getIdx(), dest.getKind(), like.getBit(), false) + table->AddSequence(OPCODE_MUL, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + BinaryOp( + e, i, + [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { + // RAX = value, RDX = clobbered + // TODO(benvanik): make the register allocator put dest_src in RAX? + e.db(0xCC); + auto Nax = LIKE_REG(e.rax, dest_src); + e.mov(Nax, dest_src); + if (i.flags & ARITHMETIC_UNSIGNED) { + e.mul(src); + } else { + e.imul(src); + } + e.mov(dest_src, Nax); + }, + [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { + // RAX = value, RDX = clobbered + // TODO(benvanik): make the register allocator put dest_src in RAX? 
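+        // One-operand mul/imul computes RDX:RAX = RAX * operand; MUL wants
+        // the low half (RAX), while MUL_HI below reads the high half (RDX).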
+ auto Nax = LIKE_REG(e.rax, dest_src); + auto Ndx = LIKE_REG(e.rdx, dest_src); + e.db(0xCC); + e.mov(Nax, dest_src); + e.mov(Ndx, src); + if (i.flags & ARITHMETIC_UNSIGNED) { + e.mul(Ndx); + } else { + e.imul(Ndx); + } + e.mov(dest_src, Nax); + }); i = e.Advance(i); return true; }); table->AddSequence(OPCODE_MUL_HI, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + BinaryOp( + e, i, + [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { + // RAX = value, RDX = clobbered + // TODO(benvanik): make the register allocator put dest_src in RAX? + e.db(0xCC); + auto Nax = LIKE_REG(e.rax, dest_src); + auto Ndx = LIKE_REG(e.rdx, dest_src); + e.mov(Nax, dest_src); + if (i.flags & ARITHMETIC_UNSIGNED) { + e.mul(src); + } else { + e.imul(src); + } + e.mov(dest_src, Ndx); + }, + [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { + // RAX = value, RDX = clobbered + // TODO(benvanik): make the register allocator put dest_src in RAX? + auto Nax = LIKE_REG(e.rax, dest_src); + auto Ndx = LIKE_REG(e.rdx, dest_src); + e.db(0xCC); + e.mov(Nax, dest_src); + e.mov(Ndx, src); + if (i.flags & ARITHMETIC_UNSIGNED) { + e.mul(Ndx); + } else { + e.imul(Ndx); + } + e.mov(dest_src, Ndx); + }); i = e.Advance(i); return true; }); table->AddSequence(OPCODE_DIV, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + BinaryOp( + e, i, + [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { + // RAX = value, RDX = clobbered + // TODO(benvanik): make the register allocator put dest_src in RAX? + e.db(0xCC); + auto Nax = LIKE_REG(e.rax, dest_src); + e.mov(Nax, dest_src); + if (i.flags & ARITHMETIC_UNSIGNED) { + e.div(src); + } else { + e.idiv(src); + } + e.mov(dest_src, Nax); + }, + [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { + // RAX = value, RDX = clobbered + // TODO(benvanik): make the register allocator put dest_src in RAX? + auto Nax = LIKE_REG(e.rax, dest_src); + auto Ndx = LIKE_REG(e.rdx, dest_src); + e.db(0xCC); + e.mov(Nax, dest_src); + e.mov(Ndx, src); + if (i.flags & ARITHMETIC_UNSIGNED) { + e.div(Ndx); + } else { + e.idiv(Ndx); + } + e.mov(dest_src, Nax); + }); i = e.Advance(i); return true; }); From 638d9631afcd5ce344b7a4c049b780c956446b4d Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 26 Jan 2014 17:38:40 -0800 Subject: [PATCH 010/184] Fixing register allocation bug. --- .../x64/lowering/lowering_sequences.cc | 6 --- src/alloy/backend/x64/x64_emitter.cc | 37 +++++++++++++++++-- src/alloy/backend/x64/x64_emitter.h | 3 +- 3 files changed, 35 insertions(+), 11 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index f29854bdc..fe86ae7ed 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -1600,7 +1600,6 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { // RAX = value, RDX = clobbered // TODO(benvanik): make the register allocator put dest_src in RAX? - e.db(0xCC); auto Nax = LIKE_REG(e.rax, dest_src); e.mov(Nax, dest_src); if (i.flags & ARITHMETIC_UNSIGNED) { @@ -1615,7 +1614,6 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { // TODO(benvanik): make the register allocator put dest_src in RAX? 
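+        // The e.db(0xCC) int3 breadcrumbs below were temporary traps for
+        // catching the eviction bug; with the allocator fixed they can go.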
auto Nax = LIKE_REG(e.rax, dest_src); auto Ndx = LIKE_REG(e.rdx, dest_src); - e.db(0xCC); e.mov(Nax, dest_src); e.mov(Ndx, src); if (i.flags & ARITHMETIC_UNSIGNED) { @@ -1635,7 +1633,6 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { // RAX = value, RDX = clobbered // TODO(benvanik): make the register allocator put dest_src in RAX? - e.db(0xCC); auto Nax = LIKE_REG(e.rax, dest_src); auto Ndx = LIKE_REG(e.rdx, dest_src); e.mov(Nax, dest_src); @@ -1651,7 +1648,6 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { // TODO(benvanik): make the register allocator put dest_src in RAX? auto Nax = LIKE_REG(e.rax, dest_src); auto Ndx = LIKE_REG(e.rdx, dest_src); - e.db(0xCC); e.mov(Nax, dest_src); e.mov(Ndx, src); if (i.flags & ARITHMETIC_UNSIGNED) { @@ -1671,7 +1667,6 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { // RAX = value, RDX = clobbered // TODO(benvanik): make the register allocator put dest_src in RAX? - e.db(0xCC); auto Nax = LIKE_REG(e.rax, dest_src); e.mov(Nax, dest_src); if (i.flags & ARITHMETIC_UNSIGNED) { @@ -1686,7 +1681,6 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { // TODO(benvanik): make the register allocator put dest_src in RAX? auto Nax = LIKE_REG(e.rax, dest_src); auto Ndx = LIKE_REG(e.rdx, dest_src); - e.db(0xCC); e.mov(Nax, dest_src); e.mov(Ndx, src); if (i.flags & ARITHMETIC_UNSIGNED) { diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index 9b68e16be..7e0ee8a6a 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -143,6 +143,8 @@ int X64Emitter::Emit(HIRBuilder* builder) { auto lowering_table = backend_->lowering_table(); + reg_state_.active_regs = reg_state_.live_regs = reserved_regs; + // Body. auto block = builder->first_block(); while (block) { @@ -156,7 +158,7 @@ int X64Emitter::Emit(HIRBuilder* builder) { // Reset reg allocation state. // If we start keeping regs across blocks this needs to change. // We mark a few active so that the allocator doesn't use them. - reg_state_.active_regs = reg_state_.live_regs = reserved_regs; + ResetRegisters(reserved_regs); // Add instructions. // The table will process sequences of instructions to (try to) @@ -192,11 +194,27 @@ int X64Emitter::Emit(HIRBuilder* builder) { return 0; } -void X64Emitter::EvictStaleRegs() { +void X64Emitter::ResetRegisters(uint32_t reserved_regs) { + // Just need to reset the register for each live value. + uint32_t live_regs = reg_state_.live_regs; + for (size_t n = 0; n < 32; n++, live_regs >>= 1) { + if (live_regs & 0x1) { + auto v = reg_state_.reg_values[n]; + if (v) { + v->reg = -1; + } + } + reg_state_.reg_values[n] = 0; + } + reg_state_.active_regs = reg_state_.live_regs = reserved_regs; +} + +void X64Emitter::EvictStaleRegisters() { // NOTE: if we are getting called it's because we *need* a register. // We must get rid of something. - uint32_t current_ordinal = current_instr_->ordinal; + uint32_t current_ordinal = current_instr_ ? + current_instr_->ordinal : 0xFFFFFFFF; // Remove any register with no more uses. 
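+  // A value is stale once its last_use ordinal falls before the current
+  // instruction: stale slots are cleared (v->reg = -1) and released, while
+  // everything still in use is carried into new_live_regs.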
uint32_t new_live_regs = 0; @@ -216,7 +234,12 @@ void X64Emitter::EvictStaleRegs() { auto v = reg_state_.reg_values[n]; if (v->last_use->ordinal < current_ordinal) { reg_state_.reg_values[n] = NULL; + v->reg = -1; + continue; } + + // Register still in use. + new_live_regs |= bit; } // Hrm. We have spilled. @@ -225,6 +248,9 @@ void X64Emitter::EvictStaleRegs() { } reg_state_.live_regs = new_live_regs; + + // Assert that live is a superset of active. + XEASSERTZERO((reg_state_.live_regs ^ reg_state_.active_regs) & reg_state_.active_regs); } void X64Emitter::FindFreeRegs( @@ -234,6 +260,9 @@ void X64Emitter::FindFreeRegs( // Already in a register. Mark active and return. v0_idx = v0->reg; reg_state_.active_regs |= 1 << v0_idx; + + // Assert that live is a superset of active. + XEASSERTZERO((reg_state_.live_regs ^ reg_state_.active_regs) & reg_state_.active_regs); return; } @@ -250,7 +279,7 @@ void X64Emitter::FindFreeRegs( uint32_t free_regs = avail_regs & ~reg_state_.live_regs; if (!free_regs) { // Need to evict something. - EvictStaleRegs(); + EvictStaleRegisters(); free_regs = avail_regs & ~reg_state_.live_regs; XEASSERT(free_regs); } diff --git a/src/alloy/backend/x64/x64_emitter.h b/src/alloy/backend/x64/x64_emitter.h index 3125d0c07..702eafa71 100644 --- a/src/alloy/backend/x64/x64_emitter.h +++ b/src/alloy/backend/x64/x64_emitter.h @@ -112,7 +112,8 @@ public: GetRegBit(r0) | GetRegBit(r1) | GetRegBit(r2) | GetRegBit(r3)); } - void EvictStaleRegs(); + void ResetRegisters(uint32_t reserved_regs); + void EvictStaleRegisters(); void FindFreeRegs(hir::Value* v0, uint32_t& v0_idx, uint32_t v0_flags); void FindFreeRegs(hir::Value* v0, uint32_t& v0_idx, uint32_t v0_flags, From 0123b63aa2391990547cb1bf30aba129d023c168 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 26 Jan 2014 17:42:03 -0800 Subject: [PATCH 011/184] Adding (somewhat) ivm compatible ctx load/store tracing. --- .../x64/lowering/lowering_sequences.cc | 87 ++++++++++++++++++- 1 file changed, 83 insertions(+), 4 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index fe86ae7ed..cb1e88163 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -30,6 +30,9 @@ namespace { #define UNIMPLEMENTED_SEQ() __debugbreak() #define ASSERT_INVALID_TYPE() XEASSERTALWAYS() +#define ITRACE 1 +#define DTRACE 0 + // TODO(benvanik): emit traces/printfs/etc void Dummy() { @@ -43,6 +46,22 @@ void PrintString(void* raw_context, const char* str) { fflush(stdout); } +void TraceContextLoad(void* raw_context, uint64_t offset, uint64_t value) { + fprintf(stdout, "%lld (%.llX) = ctx i64 +%lld\n", (int64_t)value, value, offset); + fflush(stdout); +} +void TraceContextStore(void* raw_context, uint64_t offset, uint64_t value) { + fprintf(stdout, "ctx i64 +%lld = %lld (%.llX)\n", offset, (int64_t)value, value); + fflush(stdout); +} + +void CallNative(X64Emitter& e, void* target) { + e.mov(e.rax, (uint64_t)target); + e.call(e.rax); + e.mov(e.rcx, e.qword[e.rsp + 0]); + e.mov(e.rdx, e.qword[e.rcx + 8]); // membase +} + // TODO(benvanik): fancy stuff. void* ResolveFunctionSymbol(void* raw_context, FunctionInfo* symbol_info) { // TODO(benvanik): generate this thunk at runtime? or a shim? 
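+  // The ThreadState* sits in slot 0 of the raw context and membase at +8
+  // (see ppc_context.h), which is also why CallNative below reloads rdx
+  // from [rcx + 8] after every helper call.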
@@ -679,14 +698,14 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { // -------------------------------------------------------------------------- table->AddSequence(OPCODE_COMMENT, [](X64Emitter& e, Instr*& i) { +#if ITRACE // TODO(benvanik): pass through. + // TODO(benvanik): don't just leak this memory. auto str = (const char*)i->src1.offset; auto str_copy = xestrdupa(str); e.mov(e.rdx, (uint64_t)str_copy); - e.mov(e.rax, (uint64_t)PrintString); - e.call(e.rax); - e.mov(e.rcx, e.qword[e.rsp + 0]); - e.mov(e.rdx, e.qword[e.rcx + 8]); // membase + CallNative(e, PrintString); +#endif // ITRACE i = e.Advance(i); return true; }); @@ -1069,21 +1088,41 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { e.BeginOp(i->dest, dest, REG_DEST); e.mov(dest, e.byte[e.rcx + i->src1.offset]); e.EndOp(dest); +#if DTRACE + e.mov(e.rdx, i->src1.offset); + e.mov(e.r8b, dest); + CallNative(e, TraceContextLoad); +#endif // DTRACE } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_IGNORE)) { Reg16 dest; e.BeginOp(i->dest, dest, REG_DEST); e.mov(dest, e.word[e.rcx + i->src1.offset]); e.EndOp(dest); +#if DTRACE + e.mov(e.rdx, i->src1.offset); + e.mov(e.r8w, dest); + CallNative(e, TraceContextLoad); +#endif // DTRACE } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_IGNORE)) { Reg32 dest; e.BeginOp(i->dest, dest, REG_DEST); e.mov(dest, e.dword[e.rcx + i->src1.offset]); e.EndOp(dest); +#if DTRACE + e.mov(e.rdx, i->src1.offset); + e.mov(e.r8d, dest); + CallNative(e, TraceContextLoad); +#endif // DTRACE } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_IGNORE)) { Reg64 dest; e.BeginOp(i->dest, dest, REG_DEST); e.mov(dest, e.qword[e.rcx + i->src1.offset]); e.EndOp(dest); +#if DTRACE + e.mov(e.rdx, i->src1.offset); + e.mov(e.r8, dest); + CallNative(e, TraceContextLoad); +#endif // DTRACE } else if (i->Match(SIG_TYPE_F32, SIG_TYPE_IGNORE)) { Xmm dest; e.BeginOp(i->dest, dest, REG_DEST); @@ -1113,29 +1152,69 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { e.BeginOp(i->src2.value, src, 0); e.mov(e.byte[e.rcx + i->src1.offset], src); e.EndOp(src); +#if DTRACE + e.mov(e.rdx, i->src1.offset); + e.mov(e.r8b, src); + CallNative(e, TraceContextStore); +#endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I8C)) { e.mov(e.byte[e.rcx + i->src1.offset], i->src2.value->constant.i8); +#if DTRACE + e.mov(e.rdx, i->src1.offset); + e.mov(e.r8b, i->src2.value->constant.i8); + CallNative(e, TraceContextStore); +#endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16)) { Reg16 src; e.BeginOp(i->src2.value, src, 0); e.mov(e.word[e.rcx + i->src1.offset], src); e.EndOp(src); +#if DTRACE + e.mov(e.rdx, i->src1.offset); + e.mov(e.r8w, src); + CallNative(e, TraceContextStore); +#endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16C)) { e.mov(e.word[e.rcx + i->src1.offset], i->src2.value->constant.i16); +#if DTRACE + e.mov(e.rdx, i->src1.offset); + e.mov(e.r8w, i->src2.value->constant.i16); + CallNative(e, TraceContextStore); +#endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32)) { Reg32 src; e.BeginOp(i->src2.value, src, 0); e.mov(e.dword[e.rcx + i->src1.offset], src); e.EndOp(src); +#if DTRACE + e.mov(e.rdx, i->src1.offset); + e.mov(e.r8d, src); + CallNative(e, TraceContextStore); +#endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32C)) { e.mov(e.dword[e.rcx + i->src1.offset], i->src2.value->constant.i32); +#if DTRACE + e.mov(e.rdx, i->src1.offset); + 
e.mov(e.r8d, i->src2.value->constant.i32); + CallNative(e, TraceContextStore); +#endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64)) { Reg64 src; e.BeginOp(i->src2.value, src, 0); e.mov(e.qword[e.rcx + i->src1.offset], src); e.EndOp(src); +#if DTRACE + e.mov(e.rdx, i->src1.offset); + e.mov(e.r8, src); + CallNative(e, TraceContextStore); +#endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64C)) { e.mov(e.qword[e.rcx + i->src1.offset], i->src2.value->constant.i64); +#if DTRACE + e.mov(e.rdx, i->src1.offset); + e.mov(e.r8, i->src2.value->constant.i64); + CallNative(e, TraceContextStore); +#endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32)) { Xmm src; e.BeginOp(i->src2.value, src, 0); From d323e6d3ec2debfb5f34a505b1409bbcaa64d947 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 26 Jan 2014 17:52:32 -0800 Subject: [PATCH 012/184] Indirect calls and fixing v128 const loads. --- src/alloy/backend/x64/lowering/lowering_sequences.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index cb1e88163..083081bae 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -81,9 +81,9 @@ void* ResolveFunctionAddress(void* raw_context, uint64_t target_address) { Function* fn = NULL; thread_state->runtime()->ResolveFunction(target_address, &fn); XEASSERTNOTNULL(fn); - XEASSERTALWAYS(); - //fn->Call(thread_state); - return 0; + XEASSERT(fn->type() == Function::USER_FUNCTION); + auto x64_fn = (X64Function*)fn; + return x64_fn->machine_code(); } void IssueCall(X64Emitter& e, FunctionInfo* symbol_info, uint32_t flags) { // If we are an extern function, we can directly insert a call. @@ -1236,8 +1236,8 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { e.movaps(e.ptr[e.rcx + i->src1.offset], src); e.EndOp(src); } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128C)) { - e.mov(e.ptr[e.rcx + i->src1.offset], i->src2.value->constant.v128.low); - e.mov(e.ptr[e.rcx + i->src1.offset + 8], i->src2.value->constant.v128.high); + e.mov(e.qword[e.rcx + i->src1.offset], i->src2.value->constant.v128.low); + e.mov(e.qword[e.rcx + i->src1.offset + 8], i->src2.value->constant.v128.high); } else { ASSERT_INVALID_TYPE(); } From e785e31a6fb761577e471562576651e77a183db1 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 26 Jan 2014 17:52:45 -0800 Subject: [PATCH 013/184] Disabling context prop pass until register spilling is implemented. --- src/alloy/frontend/ppc/ppc_translator.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/alloy/frontend/ppc/ppc_translator.cc b/src/alloy/frontend/ppc/ppc_translator.cc index 9f82c9827..0cc601889 100644 --- a/src/alloy/frontend/ppc/ppc_translator.cc +++ b/src/alloy/frontend/ppc/ppc_translator.cc @@ -40,7 +40,7 @@ PPCTranslator::PPCTranslator(PPCFrontend* frontend) : // Passes are executed in the order they are added. Multiple of the same // pass type may be used. - compiler_->AddPass(new passes::ContextPromotionPass()); + //compiler_->AddPass(new passes::ContextPromotionPass()); compiler_->AddPass(new passes::SimplificationPass()); // TODO(benvanik): run repeatedly? 
compiler_->AddPass(new passes::ConstantPropagationPass()); From 672a4fd504ba6dc66b5348c5167dbc6f3e97f128 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 26 Jan 2014 18:20:59 -0800 Subject: [PATCH 014/184] LOAD_CLOCK. --- .../backend/x64/lowering/lowering_sequences.cc | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index 083081bae..c576f6764 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -55,6 +55,15 @@ void TraceContextStore(void* raw_context, uint64_t offset, uint64_t value) { fflush(stdout); } +uint64_t LoadClock(void* raw_context) { + LARGE_INTEGER counter; + uint64_t time = 0; + if (QueryPerformanceCounter(&counter)) { + time = counter.QuadPart; + } + return time; +} + void CallNative(X64Emitter& e, void* target) { e.mov(e.rax, (uint64_t)target); e.call(e.rax); @@ -1073,7 +1082,12 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { }); table->AddSequence(OPCODE_LOAD_CLOCK, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + // It'd be cool to call QueryPerformanceCounter directly, but w/e. + CallNative(e, LoadClock); + Reg64 dest; + e.BeginOp(i->dest, dest, REG_DEST); + e.mov(dest, e.rax); + e.EndOp(dest); i = e.Advance(i); return true; }); From 638afa686c32721426031144432edc0f65d7cfa8 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 26 Jan 2014 18:21:12 -0800 Subject: [PATCH 015/184] Skeleton for PACK/UNPACK. --- .../x64/lowering/lowering_sequences.cc | 69 ++++++++++++++++++- 1 file changed, 67 insertions(+), 2 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index c576f6764..b9e3cb646 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -2125,13 +2125,78 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { }); table->AddSequence(OPCODE_PACK, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + if (i->flags == PACK_TYPE_D3DCOLOR) { + UNIMPLEMENTED_SEQ(); + } else if (i->flags == PACK_TYPE_FLOAT16_2) { + UNIMPLEMENTED_SEQ(); + } else if (i->flags == PACK_TYPE_FLOAT16_4) { + UNIMPLEMENTED_SEQ(); + } else if (i->flags == PACK_TYPE_SHORT_2) { + UNIMPLEMENTED_SEQ(); + } else if (i->flags == PACK_TYPE_S8_IN_16_LO) { + UNIMPLEMENTED_SEQ(); + } else if (i->flags == PACK_TYPE_S8_IN_16_HI) { + UNIMPLEMENTED_SEQ(); + } else if (i->flags == PACK_TYPE_S16_IN_32_LO) { + UNIMPLEMENTED_SEQ(); + } else if (i->flags == PACK_TYPE_S16_IN_32_HI) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } i = e.Advance(i); return true; }); table->AddSequence(OPCODE_UNPACK, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + if (i->flags == PACK_TYPE_D3DCOLOR) { + UNIMPLEMENTED_SEQ(); + } else if (i->flags == PACK_TYPE_FLOAT16_2) { + // 1 bit sign, 5 bit exponent, 10 bit mantissa + // D3D10 half float format + // TODO(benvanik): http://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx + // Use _mm_cvtph_ps -- requires very modern processors (SSE5+) + // Unpacking half floats: http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/ + // Packing half floats: https://gist.github.com/rygorous/2156668 + // Load source, move from tight pack of X16Y16.... to X16...Y16... + // Also zero out the high end. 
+ // TODO(benvanik): special case constant unpacks that just get 0/1/etc. + UnaryOp( + e, i, + [](X64Emitter& e, Instr& i, const Reg& dest_src) { + // sx = src.iw >> 16; + // sy = src.iw & 0xFFFF; + // dest = { 3.0 + (sx / float(1 << 22)), + // 3.0 + (sy / float(1 << 22)), + // 0.0, + // 1.0); --- or 3.0? + // So: + // xmm = {0,0,0,packed} + // xmm <<= 1w {0,0,packed,0} + // xmm = VCVTPH2PS(xmm) {sx,sy,0,0} + // xmm /= + }); + UNIMPLEMENTED_SEQ(); + } else if (i->flags == PACK_TYPE_FLOAT16_4) { + // Could be shared with FLOAT16_2. + UNIMPLEMENTED_SEQ(); + } else if (i->flags == PACK_TYPE_SHORT_2) { + // (VD.x) = 3.0 + (VB.x)*2^-22 + // (VD.y) = 3.0 + (VB.y)*2^-22 + // (VD.z) = 0.0 + // (VD.w) = 3.0 + UNIMPLEMENTED_SEQ(); + } else if (i->flags == PACK_TYPE_S8_IN_16_LO) { + UNIMPLEMENTED_SEQ(); + } else if (i->flags == PACK_TYPE_S8_IN_16_HI) { + UNIMPLEMENTED_SEQ(); + } else if (i->flags == PACK_TYPE_S16_IN_32_LO) { + UNIMPLEMENTED_SEQ(); + } else if (i->flags == PACK_TYPE_S16_IN_32_HI) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } i = e.Advance(i); return true; }); From 15185236514edb30de55a4ae4eeb1edf163f6d81 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 26 Jan 2014 18:51:56 -0800 Subject: [PATCH 016/184] Untested code for static register load/stores. --- .../x64/lowering/lowering_sequences.cc | 66 +++++++++++++++++++ src/alloy/backend/x64/x64_emitter.cc | 1 + src/alloy/backend/x64/x64_emitter.h | 4 ++ 3 files changed, 71 insertions(+) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index b9e3cb646..acef061ee 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -1264,6 +1264,32 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { // -------------------------------------------------------------------------- table->AddSequence(OPCODE_LOAD, [](X64Emitter& e, Instr*& i) { + // If this is a constant address load, check to see if it's in a register + // range. We'll also probably want a dynamic check for unverified loads. + // So far, most games use constants. + if (i->src1.value->IsConstant()) { + uint64_t address = i->src1.value->AsUint64(); + auto cbs = e.runtime()->access_callbacks(); + while (cbs) { + if (cbs->handles(cbs->context, address)) { + // Eh, hacking lambdas. + i->src3.offset = (uint64_t)cbs; + UnaryOp( + e, i, + [](X64Emitter& e, Instr& i, const Reg& dest_src) { + auto cbs = (RegisterAccessCallbacks*)i.src3.offset; + e.mov(e.rcx, (uint64_t)cbs->context); + e.mov(e.rdx, i.src1.value->AsUint64()); + CallNative(e, cbs->read); + e.mov(dest_src, e.rax); + }); + i = e.Advance(i); + return true; + } + cbs = cbs->next; + } + } + // TODO(benvanik): dynamic register access check. // mov reg, [membase + address.32] Reg64 addr_off; @@ -1324,6 +1350,46 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { }); table->AddSequence(OPCODE_STORE, [](X64Emitter& e, Instr*& i) { + // If this is a constant address store, check to see if it's in a + // register range. We'll also probably want a dynamic check for + // unverified stores. So far, most games use constants. 
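+    // The access callback chain acts like a simple MMIO map: each entry's
+    // handles() claims an address, and the matching entry's read/write hook
+    // runs on the host instead of a direct [membase + addr] access.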
+ if (i->src1.value->IsConstant()) { + uint64_t address = i->src1.value->AsUint64(); + auto cbs = e.runtime()->access_callbacks(); + while (cbs) { + if (cbs->handles(cbs->context, address)) { + e.mov(e.rcx, (uint64_t)cbs->context); + e.mov(e.rdx, address); + if (i->src2.value->IsConstant()) { + e.mov(e.r8, i->src2.value->AsUint64()); + } else { + Reg64 src2; + e.BeginOp(i->src2.value, src2, 0); + switch (i->src2.value->type) { + case INT8_TYPE: + e.movzx(e.r8d, src2.cvt8()); + break; + case INT16_TYPE: + e.movzx(e.r8d, src2.cvt16()); + break; + case INT32_TYPE: + e.movzx(e.r8, src2.cvt32()); + break; + case INT64_TYPE: + e.mov(e.r8, src2); + break; + default: ASSERT_INVALID_TYPE(); break; + } + e.EndOp(src2); + } + // eh? + e.bswap(e.r8); + CallNative(e, cbs->write); + } + cbs = cbs->next; + } + } + // TODO(benvanik): dynamic register access check // mov [membase + address.32], reg Reg64 addr_off; diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index 7e0ee8a6a..5e5a9eecd 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -36,6 +36,7 @@ static const size_t MAX_CODE_SIZE = 1 * 1024 * 1024; X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator) : + runtime_(backend->runtime()), backend_(backend), code_cache_(backend->code_cache()), allocator_(allocator), diff --git a/src/alloy/backend/x64/x64_emitter.h b/src/alloy/backend/x64/x64_emitter.h index 702eafa71..fe458b8cb 100644 --- a/src/alloy/backend/x64/x64_emitter.h +++ b/src/alloy/backend/x64/x64_emitter.h @@ -19,6 +19,7 @@ XEDECLARECLASS2(alloy, hir, HIRBuilder); XEDECLARECLASS2(alloy, hir, Instr); XEDECLARECLASS2(alloy, runtime, DebugInfo); +XEDECLARECLASS2(alloy, runtime, Runtime); namespace alloy { namespace backend { @@ -43,6 +44,8 @@ public: X64Emitter(X64Backend* backend, XbyakAllocator* allocator); virtual ~X64Emitter(); + runtime::Runtime* runtime() const { return runtime_; } + int Initialize(); int Emit(hir::HIRBuilder* builder, @@ -146,6 +149,7 @@ private: int Emit(hir::HIRBuilder* builder); private: + runtime::Runtime* runtime_; X64Backend* backend_; X64CodeCache* code_cache_; XbyakAllocator* allocator_; From 1988edfd554a2da5c0e7621e895445d8d3a2cae7 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 26 Jan 2014 21:04:30 -0800 Subject: [PATCH 017/184] Cleaning up a bit in prep for float/vec. --- .../x64/lowering/lowering_sequences.cc | 2579 +++++++++-------- src/alloy/hir/value.h | 8 +- 2 files changed, 1344 insertions(+), 1243 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index acef061ee..d3baff80a 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -24,7 +24,6 @@ using namespace alloy::runtime; using namespace Xbyak; - namespace { #define UNIMPLEMENTED_SEQ() __debugbreak() @@ -33,6 +32,18 @@ namespace { #define ITRACE 1 #define DTRACE 0 +#define SHUFPS_SWAP_DWORDS 0x1B + +// A note about vectors: +// Alloy represents vectors as xyzw pairs, with indices 0123. +// XMM registers are xyzw pairs with indices 3210, making them more like wzyx. +// This makes things somewhat confusing. It'd be nice to just shuffle the +// registers around on load/store, however certain operations require that +// data be in the right offset. +// Basically, this identity must hold: +// shuffle(vec, b00011011) -> {x,y,z,w} => {x,y,z,w} +// All indices and operations must respect that. 
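+// For example, SHUFPS_SWAP_DWORDS above (0x1B = 0b00011011) selects source
+// lanes 3,2,1,0, reversing the four dwords and converting between the two
+// orderings.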
+ // TODO(benvanik): emit traces/printfs/etc void Dummy() { @@ -702,994 +713,997 @@ void TernaryOp(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vvc_fn vvc_fn, vcv_fn vc void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { - // -------------------------------------------------------------------------- - // General - // -------------------------------------------------------------------------- +// -------------------------------------------------------------------------- +// General +// -------------------------------------------------------------------------- - table->AddSequence(OPCODE_COMMENT, [](X64Emitter& e, Instr*& i) { +table->AddSequence(OPCODE_COMMENT, [](X64Emitter& e, Instr*& i) { #if ITRACE - // TODO(benvanik): pass through. - // TODO(benvanik): don't just leak this memory. - auto str = (const char*)i->src1.offset; - auto str_copy = xestrdupa(str); - e.mov(e.rdx, (uint64_t)str_copy); - CallNative(e, PrintString); + // TODO(benvanik): pass through. + // TODO(benvanik): don't just leak this memory. + auto str = (const char*)i->src1.offset; + auto str_copy = xestrdupa(str); + e.mov(e.rdx, (uint64_t)str_copy); + CallNative(e, PrintString); #endif // ITRACE - i = e.Advance(i); - return true; - }); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_NOP, [](X64Emitter& e, Instr*& i) { - // If we got this, chances are we want it. - e.nop(); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_NOP, [](X64Emitter& e, Instr*& i) { + // If we got this, chances are we want it. + e.nop(); + i = e.Advance(i); + return true; +}); - // -------------------------------------------------------------------------- - // Debugging - // -------------------------------------------------------------------------- +// -------------------------------------------------------------------------- +// Debugging +// -------------------------------------------------------------------------- - table->AddSequence(OPCODE_SOURCE_OFFSET, [](X64Emitter& e, Instr*& i) { +table->AddSequence(OPCODE_SOURCE_OFFSET, [](X64Emitter& e, Instr*& i) { #if XE_DEBUG - e.nop(); - e.nop(); - e.mov(e.eax, (uint32_t)i->src1.offset); - e.nop(); - e.nop(); + e.nop(); + e.nop(); + e.mov(e.eax, (uint32_t)i->src1.offset); + e.nop(); + e.nop(); #endif // XE_DEBUG - e.MarkSourceOffset(i); - i = e.Advance(i); - return true; - }); + e.MarkSourceOffset(i); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_DEBUG_BREAK, [](X64Emitter& e, Instr*& i) { - // TODO(benvanik): insert a call to the debug break function to let the - // debugger know. - e.db(0xCC); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_DEBUG_BREAK, [](X64Emitter& e, Instr*& i) { + // TODO(benvanik): insert a call to the debug break function to let the + // debugger know. + e.db(0xCC); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_DEBUG_BREAK_TRUE, [](X64Emitter& e, Instr*& i) { - e.inLocalLabel(); - CheckBoolean(e, i->src1.value); - e.jz(".x", e.T_SHORT); - // TODO(benvanik): insert a call to the debug break function to let the - // debugger know. - e.db(0xCC); - e.L(".x"); - e.outLocalLabel(); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_DEBUG_BREAK_TRUE, [](X64Emitter& e, Instr*& i) { + e.inLocalLabel(); + CheckBoolean(e, i->src1.value); + e.jz(".x", e.T_SHORT); + // TODO(benvanik): insert a call to the debug break function to let the + // debugger know. 
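+  // (0xCC is the one-byte int3 opcode, so execution stops exactly here.)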
+ e.db(0xCC); + e.L(".x"); + e.outLocalLabel(); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_TRAP, [](X64Emitter& e, Instr*& i) { - // TODO(benvanik): insert a call to the trap function to let the - // debugger know. - e.db(0xCC); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_TRAP, [](X64Emitter& e, Instr*& i) { + // TODO(benvanik): insert a call to the trap function to let the + // debugger know. + e.db(0xCC); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_TRAP_TRUE, [](X64Emitter& e, Instr*& i) { - e.inLocalLabel(); - CheckBoolean(e, i->src1.value); - e.jz(".x", e.T_SHORT); - // TODO(benvanik): insert a call to the trap function to let the - // debugger know. - e.db(0xCC); - e.L(".x"); - e.outLocalLabel(); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_TRAP_TRUE, [](X64Emitter& e, Instr*& i) { + e.inLocalLabel(); + CheckBoolean(e, i->src1.value); + e.jz(".x", e.T_SHORT); + // TODO(benvanik): insert a call to the trap function to let the + // debugger know. + e.db(0xCC); + e.L(".x"); + e.outLocalLabel(); + i = e.Advance(i); + return true; +}); - // -------------------------------------------------------------------------- - // Calls - // -------------------------------------------------------------------------- +// -------------------------------------------------------------------------- +// Calls +// -------------------------------------------------------------------------- - table->AddSequence(OPCODE_CALL, [](X64Emitter& e, Instr*& i) { - IssueCall(e, i->src1.symbol_info, i->flags); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_CALL, [](X64Emitter& e, Instr*& i) { + IssueCall(e, i->src1.symbol_info, i->flags); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_CALL_TRUE, [](X64Emitter& e, Instr*& i) { - e.inLocalLabel(); - CheckBoolean(e, i->src1.value); - e.jz(".x", e.T_SHORT); - IssueCall(e, i->src2.symbol_info, i->flags); - e.L(".x"); - e.outLocalLabel(); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_CALL_TRUE, [](X64Emitter& e, Instr*& i) { + e.inLocalLabel(); + CheckBoolean(e, i->src1.value); + e.jz(".x", e.T_SHORT); + IssueCall(e, i->src2.symbol_info, i->flags); + e.L(".x"); + e.outLocalLabel(); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_CALL_INDIRECT, [](X64Emitter& e, Instr*& i) { - IssueCallIndirect(e, i->src1.value, i->flags); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_CALL_INDIRECT, [](X64Emitter& e, Instr*& i) { + IssueCallIndirect(e, i->src1.value, i->flags); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_CALL_INDIRECT_TRUE, [](X64Emitter& e, Instr*& i) { - e.inLocalLabel(); - CheckBoolean(e, i->src1.value); - e.jz(".x", e.T_SHORT); - IssueCallIndirect(e, i->src2.value, i->flags); - e.L(".x"); - e.outLocalLabel(); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_CALL_INDIRECT_TRUE, [](X64Emitter& e, Instr*& i) { + e.inLocalLabel(); + CheckBoolean(e, i->src1.value); + e.jz(".x", e.T_SHORT); + IssueCallIndirect(e, i->src2.value, i->flags); + e.L(".x"); + e.outLocalLabel(); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_RETURN, [](X64Emitter& e, Instr*& i) { - // If this is the last instruction in the last block, just let us - // fall through. 
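+  // (The "epilog" label is emitted once at the end of the function; the
+  // final block simply falls through into it.)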
- if (i->next || i->block->next) { - e.jmp("epilog", CodeGenerator::T_NEAR); - } - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_RETURN, [](X64Emitter& e, Instr*& i) { + // If this is the last instruction in the last block, just let us + // fall through. + if (i->next || i->block->next) { + e.jmp("epilog", CodeGenerator::T_NEAR); + } + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_RETURN_TRUE, [](X64Emitter& e, Instr*& i) { - CheckBoolean(e, i->src1.value); - e.jnz("epilog", CodeGenerator::T_NEAR); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_RETURN_TRUE, [](X64Emitter& e, Instr*& i) { + CheckBoolean(e, i->src1.value); + e.jnz("epilog", CodeGenerator::T_NEAR); + i = e.Advance(i); + return true; +}); - // -------------------------------------------------------------------------- - // Branches - // -------------------------------------------------------------------------- +// -------------------------------------------------------------------------- +// Branches +// -------------------------------------------------------------------------- - table->AddSequence(OPCODE_BRANCH, [](X64Emitter& e, Instr*& i) { - auto target = i->src1.label; - e.jmp(target->name, e.T_NEAR); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_BRANCH, [](X64Emitter& e, Instr*& i) { + auto target = i->src1.label; + e.jmp(target->name, e.T_NEAR); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_BRANCH_TRUE, [](X64Emitter& e, Instr*& i) { - CheckBoolean(e, i->src1.value); - auto target = i->src2.label; - e.jnz(target->name, e.T_NEAR); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_BRANCH_TRUE, [](X64Emitter& e, Instr*& i) { + CheckBoolean(e, i->src1.value); + auto target = i->src2.label; + e.jnz(target->name, e.T_NEAR); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_BRANCH_FALSE, [](X64Emitter& e, Instr*& i) { - CheckBoolean(e, i->src1.value); - auto target = i->src2.label; - e.jz(target->name, e.T_NEAR); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_BRANCH_FALSE, [](X64Emitter& e, Instr*& i) { + CheckBoolean(e, i->src1.value); + auto target = i->src2.label; + e.jz(target->name, e.T_NEAR); + i = e.Advance(i); + return true; +}); - // -------------------------------------------------------------------------- - // Types - // -------------------------------------------------------------------------- +// -------------------------------------------------------------------------- +// Types +// -------------------------------------------------------------------------- - table->AddSequence(OPCODE_ASSIGN, [](X64Emitter& e, Instr*& i) { - UnaryOp( - e, i, - [](X64Emitter& e, Instr& i, const Reg& dest_src) { - // nop - the mov will have happened. - }); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_ASSIGN, [](X64Emitter& e, Instr*& i) { + UnaryOp( + e, i, + [](X64Emitter& e, Instr& i, const Reg& dest_src) { + // nop - the mov will have happened. + }); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_CAST, [](X64Emitter& e, Instr*& i) { - // Need a matrix. +table->AddSequence(OPCODE_CAST, [](X64Emitter& e, Instr*& i) { + // Need a matrix. 
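+  // (Presumably a src-type x dest-type dispatch: same-width bit
+  // reinterpretation, e.g. movd/movq between GPR and XMM for int<->float.)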
+ UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); + +table->AddSequence(OPCODE_ZERO_EXTEND, [](X64Emitter& e, Instr*& i) { + if (i->Match(SIG_TYPE_I16, SIG_TYPE_I8)) { + Reg16 dest; + Reg8 src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + e.movzx(dest, src); + e.EndOp(dest, src); + } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I8)) { + Reg32 dest; + Reg8 src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + e.movzx(dest, src); + e.EndOp(dest, src); + } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I16)) { + Reg32 dest; + Reg16 src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + e.movzx(dest, src); + e.EndOp(dest, src); + } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I8)) { + Reg64 dest; + Reg8 src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + e.movzx(dest, src); + e.EndOp(dest, src); + } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I16)) { + Reg64 dest; + Reg16 src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + e.movzx(dest, src); + e.EndOp(dest, src); + } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I32)) { + Reg64 dest; + Reg32 src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + e.mov(dest.cvt32(), src.cvt32()); + e.EndOp(dest, src); + } else { UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); + } + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_ZERO_EXTEND, [](X64Emitter& e, Instr*& i) { - if (i->Match(SIG_TYPE_I16, SIG_TYPE_I8)) { - Reg16 dest; - Reg8 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movzx(dest, src); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I8)) { - Reg32 dest; - Reg8 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movzx(dest, src); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I16)) { - Reg32 dest; - Reg16 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movzx(dest, src); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I8)) { - Reg64 dest; - Reg8 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movzx(dest, src); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I16)) { - Reg64 dest; - Reg16 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movzx(dest, src); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I32)) { - Reg64 dest; - Reg32 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.mov(dest.cvt32(), src.cvt32()); - e.EndOp(dest, src); - } else { - UNIMPLEMENTED_SEQ(); - } - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_SIGN_EXTEND, [](X64Emitter& e, Instr*& i) { - if (i->Match(SIG_TYPE_I16, SIG_TYPE_I8)) { - Reg16 dest; - Reg8 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movsx(dest, src); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I8)) { - Reg32 dest; - Reg8 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movsx(dest, src); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I16)) { - Reg32 dest; - Reg16 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movsx(dest, src); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I8)) { - Reg64 dest; - Reg8 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movsx(dest, src); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I16)) { - Reg64 dest; - Reg16 src; - 
e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movsx(dest, src); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I32)) { - Reg64 dest; - Reg32 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movsxd(dest, src); - e.EndOp(dest, src); - } else { - UNIMPLEMENTED_SEQ(); - } - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_TRUNCATE, [](X64Emitter& e, Instr*& i) { - if (i->Match(SIG_TYPE_I8, SIG_TYPE_I16)) { - Reg8 dest; - Reg16 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.mov(dest, src.cvt8()); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I8, SIG_TYPE_I32)) { - Reg8 dest; - Reg16 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.mov(dest, src.cvt8()); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I8, SIG_TYPE_I64)) { - Reg8 dest; - Reg64 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.mov(dest, src.cvt8()); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I32)) { - Reg16 dest; - Reg32 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.mov(dest, src.cvt16()); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I64)) { - Reg16 dest; - Reg64 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.mov(dest, src.cvt16()); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I64)) { - Reg32 dest; - Reg64 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.mov(dest, src.cvt32()); - e.EndOp(dest, src); - } else { - UNIMPLEMENTED_SEQ(); - } - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_CONVERT, [](X64Emitter& e, Instr*& i) { +table->AddSequence(OPCODE_SIGN_EXTEND, [](X64Emitter& e, Instr*& i) { + if (i->Match(SIG_TYPE_I16, SIG_TYPE_I8)) { + Reg16 dest; + Reg8 src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + e.movsx(dest, src); + e.EndOp(dest, src); + } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I8)) { + Reg32 dest; + Reg8 src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + e.movsx(dest, src); + e.EndOp(dest, src); + } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I16)) { + Reg32 dest; + Reg16 src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + e.movsx(dest, src); + e.EndOp(dest, src); + } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I8)) { + Reg64 dest; + Reg8 src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + e.movsx(dest, src); + e.EndOp(dest, src); + } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I16)) { + Reg64 dest; + Reg16 src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + e.movsx(dest, src); + e.EndOp(dest, src); + } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I32)) { + Reg64 dest; + Reg32 src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + e.movsxd(dest, src); + e.EndOp(dest, src); + } else { UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); + } + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_ROUND, [](X64Emitter& e, Instr*& i) { +table->AddSequence(OPCODE_TRUNCATE, [](X64Emitter& e, Instr*& i) { + if (i->Match(SIG_TYPE_I8, SIG_TYPE_I16)) { + Reg8 dest; + Reg16 src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + e.mov(dest, src.cvt8()); + e.EndOp(dest, src); + } else if (i->Match(SIG_TYPE_I8, SIG_TYPE_I32)) { + Reg8 dest; + Reg16 src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + e.mov(dest, src.cvt8()); + e.EndOp(dest, src); + } else if 
(i->Match(SIG_TYPE_I8, SIG_TYPE_I64)) { + Reg8 dest; + Reg64 src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + e.mov(dest, src.cvt8()); + e.EndOp(dest, src); + } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I32)) { + Reg16 dest; + Reg32 src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + e.mov(dest, src.cvt16()); + e.EndOp(dest, src); + } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I64)) { + Reg16 dest; + Reg64 src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + e.mov(dest, src.cvt16()); + e.EndOp(dest, src); + } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I64)) { + Reg32 dest; + Reg64 src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + e.mov(dest, src.cvt32()); + e.EndOp(dest, src); + } else { UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); + } + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_VECTOR_CONVERT_I2F, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_CONVERT, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_VECTOR_CONVERT_F2I, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_ROUND, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); - // -------------------------------------------------------------------------- - // Constants - // -------------------------------------------------------------------------- +table->AddSequence(OPCODE_VECTOR_CONVERT_I2F, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); - // specials for zeroing/etc (xor/etc) +table->AddSequence(OPCODE_VECTOR_CONVERT_F2I, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_LOAD_VECTOR_SHL, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); +// -------------------------------------------------------------------------- +// Constants +// -------------------------------------------------------------------------- - table->AddSequence(OPCODE_LOAD_VECTOR_SHR, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); +// specials for zeroing/etc (xor/etc) - table->AddSequence(OPCODE_LOAD_CLOCK, [](X64Emitter& e, Instr*& i) { - // It'd be cool to call QueryPerformanceCounter directly, but w/e. - CallNative(e, LoadClock); +table->AddSequence(OPCODE_LOAD_VECTOR_SHL, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); + +table->AddSequence(OPCODE_LOAD_VECTOR_SHR, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); + +table->AddSequence(OPCODE_LOAD_CLOCK, [](X64Emitter& e, Instr*& i) { + // It'd be cool to call QueryPerformanceCounter directly, but w/e. 
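+  // The context pointer already sits in rcx, the first Win64 argument
+  // register, so LoadClock(raw_context) needs no argument setup; its
+  // result comes back in rax and is copied out below.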
+ CallNative(e, LoadClock); + Reg64 dest; + e.BeginOp(i->dest, dest, REG_DEST); + e.mov(dest, e.rax); + e.EndOp(dest); + i = e.Advance(i); + return true; +}); + +// -------------------------------------------------------------------------- +// Context +// -------------------------------------------------------------------------- + +table->AddSequence(OPCODE_LOAD_CONTEXT, [](X64Emitter& e, Instr*& i) { + if (i->Match(SIG_TYPE_I8, SIG_TYPE_IGNORE)) { + Reg8 dest; + e.BeginOp(i->dest, dest, REG_DEST); + e.mov(dest, e.byte[e.rcx + i->src1.offset]); + e.EndOp(dest); +#if DTRACE + e.mov(e.rdx, i->src1.offset); + e.mov(e.r8b, dest); + CallNative(e, TraceContextLoad); +#endif // DTRACE + } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_IGNORE)) { + Reg16 dest; + e.BeginOp(i->dest, dest, REG_DEST); + e.mov(dest, e.word[e.rcx + i->src1.offset]); + e.EndOp(dest); +#if DTRACE + e.mov(e.rdx, i->src1.offset); + e.mov(e.r8w, dest); + CallNative(e, TraceContextLoad); +#endif // DTRACE + } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_IGNORE)) { + Reg32 dest; + e.BeginOp(i->dest, dest, REG_DEST); + e.mov(dest, e.dword[e.rcx + i->src1.offset]); + e.EndOp(dest); +#if DTRACE + e.mov(e.rdx, i->src1.offset); + e.mov(e.r8d, dest); + CallNative(e, TraceContextLoad); +#endif // DTRACE + } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_IGNORE)) { Reg64 dest; e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.rax); + e.mov(dest, e.qword[e.rcx + i->src1.offset]); e.EndOp(dest); - i = e.Advance(i); - return true; - }); +#if DTRACE + e.mov(e.rdx, i->src1.offset); + e.mov(e.r8, dest); + CallNative(e, TraceContextLoad); +#endif // DTRACE + } else if (i->Match(SIG_TYPE_F32, SIG_TYPE_IGNORE)) { + Xmm dest; + e.BeginOp(i->dest, dest, REG_DEST); + e.movss(dest, e.dword[e.rcx + i->src1.offset]); + e.EndOp(dest); + } else if (i->Match(SIG_TYPE_F64, SIG_TYPE_IGNORE)) { + Xmm dest; + e.BeginOp(i->dest, dest, REG_DEST); + e.movsd(dest, e.qword[e.rcx + i->src1.offset]); + e.EndOp(dest); + } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_IGNORE)) { + Xmm dest; + e.BeginOp(i->dest, dest, REG_DEST); + // NOTE: we always know we are aligned. 
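+    // (Context fields are laid out 16-byte aligned, so the aligned movaps
+    // form is safe, unlike guest memory accesses, which use movups.)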
+ e.movaps(dest, e.ptr[e.rcx + i->src1.offset]); + e.EndOp(dest); + } else { + ASSERT_INVALID_TYPE(); + } + i = e.Advance(i); + return true; +}); - // -------------------------------------------------------------------------- - // Context - // -------------------------------------------------------------------------- +table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) { + if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I8)) { + Reg8 src; + e.BeginOp(i->src2.value, src, 0); + e.mov(e.byte[e.rcx + i->src1.offset], src); + e.EndOp(src); +#if DTRACE + e.mov(e.rdx, i->src1.offset); + e.mov(e.r8b, src); + CallNative(e, TraceContextStore); +#endif // DTRACE + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I8C)) { + e.mov(e.byte[e.rcx + i->src1.offset], i->src2.value->constant.i8); +#if DTRACE + e.mov(e.rdx, i->src1.offset); + e.mov(e.r8b, i->src2.value->constant.i8); + CallNative(e, TraceContextStore); +#endif // DTRACE + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16)) { + Reg16 src; + e.BeginOp(i->src2.value, src, 0); + e.mov(e.word[e.rcx + i->src1.offset], src); + e.EndOp(src); +#if DTRACE + e.mov(e.rdx, i->src1.offset); + e.mov(e.r8w, src); + CallNative(e, TraceContextStore); +#endif // DTRACE + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16C)) { + e.mov(e.word[e.rcx + i->src1.offset], i->src2.value->constant.i16); +#if DTRACE + e.mov(e.rdx, i->src1.offset); + e.mov(e.r8w, i->src2.value->constant.i16); + CallNative(e, TraceContextStore); +#endif // DTRACE + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32)) { + Reg32 src; + e.BeginOp(i->src2.value, src, 0); + e.mov(e.dword[e.rcx + i->src1.offset], src); + e.EndOp(src); +#if DTRACE + e.mov(e.rdx, i->src1.offset); + e.mov(e.r8d, src); + CallNative(e, TraceContextStore); +#endif // DTRACE + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32C)) { + e.mov(e.dword[e.rcx + i->src1.offset], i->src2.value->constant.i32); +#if DTRACE + e.mov(e.rdx, i->src1.offset); + e.mov(e.r8d, i->src2.value->constant.i32); + CallNative(e, TraceContextStore); +#endif // DTRACE + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64)) { + Reg64 src; + e.BeginOp(i->src2.value, src, 0); + e.mov(e.qword[e.rcx + i->src1.offset], src); + e.EndOp(src); +#if DTRACE + e.mov(e.rdx, i->src1.offset); + e.mov(e.r8, src); + CallNative(e, TraceContextStore); +#endif // DTRACE + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64C)) { + e.mov(e.qword[e.rcx + i->src1.offset], i->src2.value->constant.i64); +#if DTRACE + e.mov(e.rdx, i->src1.offset); + e.mov(e.r8, i->src2.value->constant.i64); + CallNative(e, TraceContextStore); +#endif // DTRACE + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32)) { + Xmm src; + e.BeginOp(i->src2.value, src, 0); + e.movss(e.dword[e.rcx + i->src1.offset], src); + e.EndOp(src); + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32C)) { + e.mov(e.dword[e.rcx + i->src1.offset], i->src2.value->constant.i32); + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64)) { + Xmm src; + e.BeginOp(i->src2.value, src, 0); + e.movsd(e.qword[e.rcx + i->src1.offset], src); + e.EndOp(src); + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64C)) { + e.mov(e.qword[e.rcx + i->src1.offset], i->src2.value->constant.i64); + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128)) { + Xmm src; + e.BeginOp(i->src2.value, src, 0); + // NOTE: we always know we are aligned. 
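+    // (The store mirrors the aligned load above; the V128C case below
+    // writes the constant as separate low/high 64-bit halves.)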
+ e.movaps(e.ptr[e.rcx + i->src1.offset], src); + e.EndOp(src); + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128C)) { + e.mov(e.qword[e.rcx + i->src1.offset], i->src2.value->constant.v128.low); + e.mov(e.qword[e.rcx + i->src1.offset + 8], i->src2.value->constant.v128.high); + } else { + ASSERT_INVALID_TYPE(); + } + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_LOAD_CONTEXT, [](X64Emitter& e, Instr*& i) { - if (i->Match(SIG_TYPE_I8, SIG_TYPE_IGNORE)) { - Reg8 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.byte[e.rcx + i->src1.offset]); - e.EndOp(dest); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.mov(e.r8b, dest); - CallNative(e, TraceContextLoad); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_IGNORE)) { - Reg16 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.word[e.rcx + i->src1.offset]); - e.EndOp(dest); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.mov(e.r8w, dest); - CallNative(e, TraceContextLoad); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_IGNORE)) { - Reg32 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.dword[e.rcx + i->src1.offset]); - e.EndOp(dest); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.mov(e.r8d, dest); - CallNative(e, TraceContextLoad); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_IGNORE)) { - Reg64 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.qword[e.rcx + i->src1.offset]); - e.EndOp(dest); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.mov(e.r8, dest); - CallNative(e, TraceContextLoad); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_F32, SIG_TYPE_IGNORE)) { - Xmm dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.movss(dest, e.dword[e.rcx + i->src1.offset]); - e.EndOp(dest); - } else if (i->Match(SIG_TYPE_F64, SIG_TYPE_IGNORE)) { - Xmm dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.movsd(dest, e.qword[e.rcx + i->src1.offset]); - e.EndOp(dest); - } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_IGNORE)) { - Xmm dest; - e.BeginOp(i->dest, dest, REG_DEST); - // TODO(benvanik): we should try to stick to movaps if possible. 
- e.movups(dest, e.ptr[e.rcx + i->src1.offset]); - e.EndOp(dest); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; - }); +// -------------------------------------------------------------------------- +// Memory +// -------------------------------------------------------------------------- - table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) { - if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I8)) { - Reg8 src; - e.BeginOp(i->src2.value, src, 0); - e.mov(e.byte[e.rcx + i->src1.offset], src); - e.EndOp(src); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.mov(e.r8b, src); - CallNative(e, TraceContextStore); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I8C)) { - e.mov(e.byte[e.rcx + i->src1.offset], i->src2.value->constant.i8); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.mov(e.r8b, i->src2.value->constant.i8); - CallNative(e, TraceContextStore); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16)) { - Reg16 src; - e.BeginOp(i->src2.value, src, 0); - e.mov(e.word[e.rcx + i->src1.offset], src); - e.EndOp(src); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.mov(e.r8w, src); - CallNative(e, TraceContextStore); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16C)) { - e.mov(e.word[e.rcx + i->src1.offset], i->src2.value->constant.i16); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.mov(e.r8w, i->src2.value->constant.i16); - CallNative(e, TraceContextStore); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32)) { - Reg32 src; - e.BeginOp(i->src2.value, src, 0); - e.mov(e.dword[e.rcx + i->src1.offset], src); - e.EndOp(src); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.mov(e.r8d, src); - CallNative(e, TraceContextStore); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32C)) { - e.mov(e.dword[e.rcx + i->src1.offset], i->src2.value->constant.i32); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.mov(e.r8d, i->src2.value->constant.i32); - CallNative(e, TraceContextStore); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64)) { - Reg64 src; - e.BeginOp(i->src2.value, src, 0); - e.mov(e.qword[e.rcx + i->src1.offset], src); - e.EndOp(src); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.mov(e.r8, src); - CallNative(e, TraceContextStore); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64C)) { - e.mov(e.qword[e.rcx + i->src1.offset], i->src2.value->constant.i64); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.mov(e.r8, i->src2.value->constant.i64); - CallNative(e, TraceContextStore); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32)) { - Xmm src; - e.BeginOp(i->src2.value, src, 0); - e.movss(e.dword[e.rcx + i->src1.offset], src); - e.EndOp(src); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32C)) { - e.mov(e.dword[e.rcx + i->src1.offset], i->src2.value->constant.i32); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64)) { - Xmm src; - e.BeginOp(i->src2.value, src, 0); - e.movsd(e.qword[e.rcx + i->src1.offset], src); - e.EndOp(src); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64C)) { - e.mov(e.qword[e.rcx + i->src1.offset], i->src2.value->constant.i64); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128)) { - Xmm src; - e.BeginOp(i->src2.value, src, 0); - // NOTE: we always know we are aligned. 
- e.movaps(e.ptr[e.rcx + i->src1.offset], src); - e.EndOp(src); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128C)) { - e.mov(e.qword[e.rcx + i->src1.offset], i->src2.value->constant.v128.low); - e.mov(e.qword[e.rcx + i->src1.offset + 8], i->src2.value->constant.v128.high); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; - }); - - // -------------------------------------------------------------------------- - // Memory - // -------------------------------------------------------------------------- - - table->AddSequence(OPCODE_LOAD, [](X64Emitter& e, Instr*& i) { - // If this is a constant address load, check to see if it's in a register - // range. We'll also probably want a dynamic check for unverified loads. - // So far, most games use constants. - if (i->src1.value->IsConstant()) { - uint64_t address = i->src1.value->AsUint64(); - auto cbs = e.runtime()->access_callbacks(); - while (cbs) { - if (cbs->handles(cbs->context, address)) { - // Eh, hacking lambdas. - i->src3.offset = (uint64_t)cbs; - UnaryOp( - e, i, - [](X64Emitter& e, Instr& i, const Reg& dest_src) { - auto cbs = (RegisterAccessCallbacks*)i.src3.offset; - e.mov(e.rcx, (uint64_t)cbs->context); - e.mov(e.rdx, i.src1.value->AsUint64()); - CallNative(e, cbs->read); - e.mov(dest_src, e.rax); - }); - i = e.Advance(i); - return true; - } - cbs = cbs->next; +table->AddSequence(OPCODE_LOAD, [](X64Emitter& e, Instr*& i) { + // If this is a constant address load, check to see if it's in a register + // range. We'll also probably want a dynamic check for unverified loads. + // So far, most games use constants. + if (i->src1.value->IsConstant()) { + uint64_t address = i->src1.value->AsUint64(); + auto cbs = e.runtime()->access_callbacks(); + while (cbs) { + if (cbs->handles(cbs->context, address)) { + // Eh, hacking lambdas. + i->src3.offset = (uint64_t)cbs; + UnaryOp( + e, i, + [](X64Emitter& e, Instr& i, const Reg& dest_src) { + auto cbs = (RegisterAccessCallbacks*)i.src3.offset; + e.mov(e.rcx, (uint64_t)cbs->context); + e.mov(e.rdx, i.src1.value->AsUint64()); + CallNative(e, cbs->read); + e.mov(dest_src, e.rax); + }); + i = e.Advance(i); + return true; } + cbs = cbs->next; } + } - // TODO(benvanik): dynamic register access check. - // mov reg, [membase + address.32] - Reg64 addr_off; - RegExp addr; - if (i->src1.value->IsConstant()) { - // TODO(benvanik): a way to do this without using a register. 
- e.mov(e.eax, i->src1.value->AsUint32()); - addr = e.rdx + e.rax; - } else { - e.BeginOp(i->src1.value, addr_off, 0); - e.mov(addr_off.cvt32(), addr_off.cvt32()); // trunc to 32bits - addr = e.rdx + addr_off; - } - if (i->Match(SIG_TYPE_I8, SIG_TYPE_IGNORE)) { - Reg8 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.byte[addr]); - e.EndOp(dest); - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_IGNORE)) { - Reg16 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.word[addr]); - e.EndOp(dest); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_IGNORE)) { - Reg32 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.dword[addr]); - e.EndOp(dest); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_IGNORE)) { - Reg64 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.qword[addr]); - e.EndOp(dest); - } else if (i->Match(SIG_TYPE_F32, SIG_TYPE_IGNORE)) { - Xmm dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.movss(dest, e.dword[addr]); - e.EndOp(dest); - } else if (i->Match(SIG_TYPE_F64, SIG_TYPE_IGNORE)) { - Xmm dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.movsd(dest, e.qword[addr]); - e.EndOp(dest); - } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_IGNORE)) { - Xmm dest; - e.BeginOp(i->dest, dest, REG_DEST); - // TODO(benvanik): we should try to stick to movaps if possible. - e.movups(dest, e.ptr[addr]); - e.EndOp(dest); - } else { - ASSERT_INVALID_TYPE(); - } - if (!i->src1.value->IsConstant()) { - e.EndOp(addr_off); - } - i = e.Advance(i); - return true; - }); + // TODO(benvanik): dynamic register access check. + // mov reg, [membase + address.32] + Reg64 addr_off; + RegExp addr; + if (i->src1.value->IsConstant()) { + // TODO(benvanik): a way to do this without using a register. + e.mov(e.eax, i->src1.value->AsUint32()); + addr = e.rdx + e.rax; + } else { + e.BeginOp(i->src1.value, addr_off, 0); + e.mov(addr_off.cvt32(), addr_off.cvt32()); // trunc to 32bits + addr = e.rdx + addr_off; + } + if (i->Match(SIG_TYPE_I8, SIG_TYPE_IGNORE)) { + Reg8 dest; + e.BeginOp(i->dest, dest, REG_DEST); + e.mov(dest, e.byte[addr]); + e.EndOp(dest); + } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_IGNORE)) { + Reg16 dest; + e.BeginOp(i->dest, dest, REG_DEST); + e.mov(dest, e.word[addr]); + e.EndOp(dest); + } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_IGNORE)) { + Reg32 dest; + e.BeginOp(i->dest, dest, REG_DEST); + e.mov(dest, e.dword[addr]); + e.EndOp(dest); + } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_IGNORE)) { + Reg64 dest; + e.BeginOp(i->dest, dest, REG_DEST); + e.mov(dest, e.qword[addr]); + e.EndOp(dest); + } else if (i->Match(SIG_TYPE_F32, SIG_TYPE_IGNORE)) { + Xmm dest; + e.BeginOp(i->dest, dest, REG_DEST); + e.movss(dest, e.dword[addr]); + e.EndOp(dest); + } else if (i->Match(SIG_TYPE_F64, SIG_TYPE_IGNORE)) { + Xmm dest; + e.BeginOp(i->dest, dest, REG_DEST); + e.movsd(dest, e.qword[addr]); + e.EndOp(dest); + } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_IGNORE)) { + Xmm dest; + e.BeginOp(i->dest, dest, REG_DEST); + // TODO(benvanik): we should try to stick to movaps if possible. + e.movups(dest, e.ptr[addr]); + e.EndOp(dest); + e.db(0xCC); + } else { + ASSERT_INVALID_TYPE(); + } + if (!i->src1.value->IsConstant()) { + e.EndOp(addr_off); + } + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_STORE, [](X64Emitter& e, Instr*& i) { - // If this is a constant address store, check to see if it's in a - // register range. We'll also probably want a dynamic check for - // unverified stores. So far, most games use constants. 
- if (i->src1.value->IsConstant()) { - uint64_t address = i->src1.value->AsUint64(); - auto cbs = e.runtime()->access_callbacks(); - while (cbs) { - if (cbs->handles(cbs->context, address)) { - e.mov(e.rcx, (uint64_t)cbs->context); - e.mov(e.rdx, address); - if (i->src2.value->IsConstant()) { - e.mov(e.r8, i->src2.value->AsUint64()); - } else { - Reg64 src2; - e.BeginOp(i->src2.value, src2, 0); - switch (i->src2.value->type) { - case INT8_TYPE: - e.movzx(e.r8d, src2.cvt8()); - break; - case INT16_TYPE: - e.movzx(e.r8d, src2.cvt16()); - break; - case INT32_TYPE: - e.movzx(e.r8, src2.cvt32()); - break; - case INT64_TYPE: - e.mov(e.r8, src2); - break; - default: ASSERT_INVALID_TYPE(); break; - } - e.EndOp(src2); +table->AddSequence(OPCODE_STORE, [](X64Emitter& e, Instr*& i) { + // If this is a constant address store, check to see if it's in a + // register range. We'll also probably want a dynamic check for + // unverified stores. So far, most games use constants. + if (i->src1.value->IsConstant()) { + uint64_t address = i->src1.value->AsUint64(); + auto cbs = e.runtime()->access_callbacks(); + while (cbs) { + if (cbs->handles(cbs->context, address)) { + e.mov(e.rcx, (uint64_t)cbs->context); + e.mov(e.rdx, address); + if (i->src2.value->IsConstant()) { + e.mov(e.r8, i->src2.value->AsUint64()); + } else { + Reg64 src2; + e.BeginOp(i->src2.value, src2, 0); + switch (i->src2.value->type) { + case INT8_TYPE: + e.movzx(e.r8d, src2.cvt8()); + break; + case INT16_TYPE: + e.movzx(e.r8d, src2.cvt16()); + break; + case INT32_TYPE: + e.movzx(e.r8, src2.cvt32()); + break; + case INT64_TYPE: + e.mov(e.r8, src2); + break; + default: ASSERT_INVALID_TYPE(); break; } - // eh? - e.bswap(e.r8); - CallNative(e, cbs->write); + e.EndOp(src2); } - cbs = cbs->next; + // eh? + e.bswap(e.r8); + CallNative(e, cbs->write); } + cbs = cbs->next; } + } - // TODO(benvanik): dynamic register access check - // mov [membase + address.32], reg - Reg64 addr_off; - RegExp addr; - if (i->src1.value->IsConstant()) { - e.mov(e.eax, i->src1.value->AsUint32()); - addr = e.rdx + e.rax; + // TODO(benvanik): dynamic register access check + // mov [membase + address.32], reg + Reg64 addr_off; + RegExp addr; + if (i->src1.value->IsConstant()) { + e.mov(e.eax, i->src1.value->AsUint32()); + addr = e.rdx + e.rax; + } else { + e.BeginOp(i->src1.value, addr_off, 0); + e.mov(addr_off.cvt32(), addr_off.cvt32()); // trunc to 32bits + addr = e.rdx + addr_off; + } + if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I8)) { + Reg8 src; + e.BeginOp(i->src2.value, src, 0); + e.mov(e.byte[addr], src); + e.EndOp(src); + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I8C)) { + e.mov(e.byte[addr], i->src2.value->constant.i8); + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16)) { + Reg16 src; + e.BeginOp(i->src2.value, src, 0); + e.mov(e.word[addr], src); + e.EndOp(src); + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16C)) { + e.mov(e.word[addr], i->src2.value->constant.i16); + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32)) { + Reg32 src; + e.BeginOp(i->src2.value, src, 0); + e.mov(e.dword[addr], src); + e.EndOp(src); + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32C)) { + e.mov(e.dword[addr], i->src2.value->constant.i32); + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64)) { + Reg64 src; + e.BeginOp(i->src2.value, src, 0); + e.mov(e.qword[addr], src); + e.EndOp(src); + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64C)) { + e.mov(e.qword[addr], 
i->src2.value->constant.i64); + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32)) { + Xmm src; + e.BeginOp(i->src2.value, src, 0); + e.movss(e.dword[addr], src); + e.EndOp(src); + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32C)) { + e.mov(e.dword[addr], i->src2.value->constant.i32); + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64)) { + Xmm src; + e.BeginOp(i->src2.value, src, 0); + e.movsd(e.qword[addr], src); + e.EndOp(src); + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64C)) { + e.mov(e.qword[addr], i->src2.value->constant.i64); + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128)) { + Xmm src; + e.BeginOp(i->src2.value, src, 0); + // TODO(benvanik): we should try to stick to movaps if possible. + e.movups(e.ptr[addr], src); + e.EndOp(src); + e.db(0xCC); + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128C)) { + e.mov(e.ptr[addr], i->src2.value->constant.v128.low); + e.mov(e.ptr[addr + 8], i->src2.value->constant.v128.high); + } else { + ASSERT_INVALID_TYPE(); + } + if (!i->src1.value->IsConstant()) { + e.EndOp(addr_off); + } + i = e.Advance(i); + return true; +}); + +table->AddSequence(OPCODE_PREFETCH, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); + +// -------------------------------------------------------------------------- +// Comparisons +// -------------------------------------------------------------------------- + +table->AddSequence(OPCODE_MAX, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); + +table->AddSequence(OPCODE_MIN, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); + +table->AddSequence(OPCODE_SELECT, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); + +table->AddSequence(OPCODE_IS_TRUE, [](X64Emitter& e, Instr*& i) { + CheckBoolean(e, i->src1.value); + Reg8 dest; + e.BeginOp(i->dest, dest, REG_DEST); + e.setnz(dest); + e.EndOp(dest); + i = e.Advance(i); + return true; +}); + +table->AddSequence(OPCODE_IS_FALSE, [](X64Emitter& e, Instr*& i) { + CheckBoolean(e, i->src1.value); + Reg8 dest; + e.BeginOp(i->dest, dest, REG_DEST); + e.setz(dest); + e.EndOp(dest); + i = e.Advance(i); + return true; +}); + +table->AddSequence(OPCODE_COMPARE_EQ, [](X64Emitter& e, Instr*& i) { + CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { + if (!invert) { + e.sete(dest); } else { - e.BeginOp(i->src1.value, addr_off, 0); - e.mov(addr_off.cvt32(), addr_off.cvt32()); // trunc to 32bits - addr = e.rdx + addr_off; + e.setne(dest); } - if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I8)) { - Reg8 src; - e.BeginOp(i->src2.value, src, 0); - e.mov(e.byte[addr], src); - e.EndOp(src); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I8C)) { - e.mov(e.byte[addr], i->src2.value->constant.i8); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16)) { - Reg16 src; - e.BeginOp(i->src2.value, src, 0); - e.mov(e.word[addr], src); - e.EndOp(src); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16C)) { - e.mov(e.word[addr], i->src2.value->constant.i16); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32)) { - Reg32 src; - e.BeginOp(i->src2.value, src, 0); - e.mov(e.dword[addr], src); - e.EndOp(src); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32C)) { - e.mov(e.dword[addr], i->src2.value->constant.i32); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64)) 
{ - Reg64 src; - e.BeginOp(i->src2.value, src, 0); - e.mov(e.qword[addr], src); - e.EndOp(src); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64C)) { - e.mov(e.qword[addr], i->src2.value->constant.i64); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32)) { - Xmm src; - e.BeginOp(i->src2.value, src, 0); - e.movss(e.dword[addr], src); - e.EndOp(src); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32C)) { - e.mov(e.dword[addr], i->src2.value->constant.i32); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64)) { - Xmm src; - e.BeginOp(i->src2.value, src, 0); - e.movsd(e.qword[addr], src); - e.EndOp(src); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64C)) { - e.mov(e.qword[addr], i->src2.value->constant.i64); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128)) { - Xmm src; - e.BeginOp(i->src2.value, src, 0); - // TODO(benvanik): we should try to stick to movaps if possible. - e.movups(e.ptr[addr], src); - e.EndOp(src); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128C)) { - e.mov(e.ptr[addr], i->src2.value->constant.v128.low); - e.mov(e.ptr[addr + 8], i->src2.value->constant.v128.high); + }); + i = e.Advance(i); + return true; +}); + +table->AddSequence(OPCODE_COMPARE_NE, [](X64Emitter& e, Instr*& i) { + CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { + if (!invert) { + e.setne(dest); } else { - ASSERT_INVALID_TYPE(); + e.sete(dest); } - if (!i->src1.value->IsConstant()) { - e.EndOp(addr_off); + }); + i = e.Advance(i); + return true; +}); + +table->AddSequence(OPCODE_COMPARE_SLT, [](X64Emitter& e, Instr*& i) { + CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { + if (!invert) { + e.setl(dest); + } else { + e.setge(dest); } - i = e.Advance(i); - return true; }); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_PREFETCH, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; +table->AddSequence(OPCODE_COMPARE_SLE, [](X64Emitter& e, Instr*& i) { + CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { + if (!invert) { + e.setle(dest); + } else { + e.setg(dest); + } }); + i = e.Advance(i); + return true; +}); - // -------------------------------------------------------------------------- - // Comparisons - // -------------------------------------------------------------------------- - - table->AddSequence(OPCODE_MAX, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; +table->AddSequence(OPCODE_COMPARE_SGT, [](X64Emitter& e, Instr*& i) { + CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { + if (!invert) { + e.setg(dest); + } else { + e.setle(dest); + } }); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_MIN, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; +table->AddSequence(OPCODE_COMPARE_SGE, [](X64Emitter& e, Instr*& i) { + CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { + if (!invert) { + e.setge(dest); + } else { + e.setl(dest); + } }); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_SELECT, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; +table->AddSequence(OPCODE_COMPARE_ULT, [](X64Emitter& e, Instr*& i) { + CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { + if (!invert) { + e.setb(dest); + } else { + e.setae(dest); + } }); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_IS_TRUE, [](X64Emitter& e, 
Instr*& i) { - CheckBoolean(e, i->src1.value); - Reg8 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.setnz(dest); - e.EndOp(dest); - i = e.Advance(i); - return true; +table->AddSequence(OPCODE_COMPARE_ULE, [](X64Emitter& e, Instr*& i) { + CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { + if (!invert) { + e.setbe(dest); + } else { + e.seta(dest); + } }); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_IS_FALSE, [](X64Emitter& e, Instr*& i) { - CheckBoolean(e, i->src1.value); - Reg8 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.setz(dest); - e.EndOp(dest); - i = e.Advance(i); - return true; +table->AddSequence(OPCODE_COMPARE_UGT, [](X64Emitter& e, Instr*& i) { + CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { + if (!invert) { + e.seta(dest); + } else { + e.setbe(dest); + } }); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_COMPARE_EQ, [](X64Emitter& e, Instr*& i) { - CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { - if (!invert) { - e.sete(dest); - } else { - e.setne(dest); - } - }); - i = e.Advance(i); - return true; +table->AddSequence(OPCODE_COMPARE_UGE, [](X64Emitter& e, Instr*& i) { + CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { + if (!invert) { + e.setae(dest); + } else { + e.setb(dest); + } }); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_COMPARE_NE, [](X64Emitter& e, Instr*& i) { - CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { - if (!invert) { - e.setne(dest); - } else { - e.sete(dest); - } - }); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_DID_CARRY, [](X64Emitter& e, Instr*& i) { + Reg8 dest; + e.BeginOp(i->dest, dest, REG_DEST); + e.setc(dest); + e.EndOp(dest); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_COMPARE_SLT, [](X64Emitter& e, Instr*& i) { - CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { - if (!invert) { - e.setl(dest); - } else { - e.setge(dest); - } - }); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_DID_OVERFLOW, [](X64Emitter& e, Instr*& i) { + Reg8 dest; + e.BeginOp(i->dest, dest, REG_DEST); + e.seto(dest); + e.EndOp(dest); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_COMPARE_SLE, [](X64Emitter& e, Instr*& i) { - CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { - if (!invert) { - e.setle(dest); - } else { - e.setg(dest); - } - }); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_DID_SATURATE, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_COMPARE_SGT, [](X64Emitter& e, Instr*& i) { - CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { - if (!invert) { - e.setg(dest); - } else { - e.setle(dest); - } - }); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_VECTOR_COMPARE_EQ, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_COMPARE_SGE, [](X64Emitter& e, Instr*& i) { - CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { - if (!invert) { - e.setge(dest); - } else { - e.setl(dest); - } - }); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_VECTOR_COMPARE_SGT, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_COMPARE_ULT, [](X64Emitter& e, Instr*& i) { - CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { - if (!invert) { - 
e.setb(dest); - } else { - e.setae(dest); - } - }); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_VECTOR_COMPARE_SGE, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_COMPARE_ULE, [](X64Emitter& e, Instr*& i) { - CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { - if (!invert) { - e.setbe(dest); - } else { - e.seta(dest); - } - }); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_VECTOR_COMPARE_UGT, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_COMPARE_UGT, [](X64Emitter& e, Instr*& i) { - CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { - if (!invert) { - e.seta(dest); - } else { - e.setbe(dest); - } - }); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_VECTOR_COMPARE_UGE, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_COMPARE_UGE, [](X64Emitter& e, Instr*& i) { - CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { - if (!invert) { - e.setae(dest); - } else { - e.setb(dest); - } - }); - i = e.Advance(i); - return true; - }); +// -------------------------------------------------------------------------- +// Math +// -------------------------------------------------------------------------- - table->AddSequence(OPCODE_DID_CARRY, [](X64Emitter& e, Instr*& i) { - Reg8 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.setc(dest); - e.EndOp(dest); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_DID_OVERFLOW, [](X64Emitter& e, Instr*& i) { - Reg8 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.seto(dest); - e.EndOp(dest); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_DID_SATURATE, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_VECTOR_COMPARE_EQ, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_VECTOR_COMPARE_SGT, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_VECTOR_COMPARE_SGE, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_VECTOR_COMPARE_UGT, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_VECTOR_COMPARE_UGE, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - // -------------------------------------------------------------------------- - // Math - // -------------------------------------------------------------------------- - - table->AddSequence(OPCODE_ADD, [](X64Emitter& e, Instr*& i) { +table->AddSequence(OPCODE_ADD, [](X64Emitter& e, Instr*& i) { + if (IsIntType(i->dest->type)) { BinaryOp( e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { @@ -1698,11 +1712,15 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { e.add(dest_src, src); }); - i = e.Advance(i); - return true; - }); + } else { + UNIMPLEMENTED_SEQ(); + } + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_ADD_CARRY, [](X64Emitter& e, Instr*& i) { +table->AddSequence(OPCODE_ADD_CARRY, [](X64Emitter& e, Instr*& i) { 
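+  // x64 has no add-with-explicit-carry-in, so the emitters below copy the
+  // src3 carry byte into AH and SAHF it into EFLAGS (CF is bit 0 of the AH
+  // flag image) before the ADC; the AL bounce covers byte registers that a
+  // REX-encoded mov cannot place into AH directly.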
+ if (IsIntType(i->dest->type)) { // dest = src1 + src2 + src3.i8 TernaryOp( e, i, @@ -1734,11 +1752,15 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { e.sahf(); e.adc(dest_src, src2); }); - i = e.Advance(i); - return true; - }); + } else { + UNIMPLEMENTED_SEQ(); + } + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_SUB, [](X64Emitter& e, Instr*& i) { +table->AddSequence(OPCODE_SUB, [](X64Emitter& e, Instr*& i) { + if (IsIntType(i->dest->type)) { BinaryOp( e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { @@ -1747,13 +1769,17 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { e.sub(dest_src, src); }); - i = e.Advance(i); - return true; - }); + } else { + UNIMPLEMENTED_SEQ(); + } + i = e.Advance(i); + return true; +}); #define LIKE_REG(dest, like) Operand(dest.getIdx(), dest.getKind(), like.getBit(), false) - table->AddSequence(OPCODE_MUL, [](X64Emitter& e, Instr*& i) { +table->AddSequence(OPCODE_MUL, [](X64Emitter& e, Instr*& i) { + if (IsIntType(i->dest->type)) { BinaryOp( e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { @@ -1782,11 +1808,15 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { } e.mov(dest_src, Nax); }); - i = e.Advance(i); - return true; - }); + } else { + UNIMPLEMENTED_SEQ(); + } + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_MUL_HI, [](X64Emitter& e, Instr*& i) { +table->AddSequence(OPCODE_MUL_HI, [](X64Emitter& e, Instr*& i) { + if (IsIntType(i->dest->type)) { BinaryOp( e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { @@ -1816,11 +1846,15 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { } e.mov(dest_src, Ndx); }); - i = e.Advance(i); - return true; - }); + } else { + UNIMPLEMENTED_SEQ(); + } + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_DIV, [](X64Emitter& e, Instr*& i) { +table->AddSequence(OPCODE_DIV, [](X64Emitter& e, Instr*& i) { + if (IsIntType(i->dest->type)) { BinaryOp( e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { @@ -1849,71 +1883,75 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { } e.mov(dest_src, Nax); }); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_MUL_ADD, [](X64Emitter& e, Instr*& i) { + } else { UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); + } + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_MUL_SUB, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_MUL_ADD, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_NEG, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_MUL_SUB, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_ABS, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_NEG, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_SQRT, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_ABS, [](X64Emitter& e, 
Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_RSQRT, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_SQRT, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_POW2, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_RSQRT, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_LOG2, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_POW2, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_DOT_PRODUCT_3, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_LOG2, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_DOT_PRODUCT_4, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); +table->AddSequence(OPCODE_DOT_PRODUCT_3, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_AND, [](X64Emitter& e, Instr*& i) { +table->AddSequence(OPCODE_DOT_PRODUCT_4, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); + +table->AddSequence(OPCODE_AND, [](X64Emitter& e, Instr*& i) { + if (IsIntType(i->dest->type)) { BinaryOp( e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { @@ -1922,11 +1960,15 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { e.and(dest_src, src); }); - i = e.Advance(i); - return true; - }); + } else { + UNIMPLEMENTED_SEQ(); + } + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_OR, [](X64Emitter& e, Instr*& i) { +table->AddSequence(OPCODE_OR, [](X64Emitter& e, Instr*& i) { + if (IsIntType(i->dest->type)) { BinaryOp( e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { @@ -1935,11 +1977,15 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { e.or(dest_src, src); }); - i = e.Advance(i); - return true; - }); + } else { + UNIMPLEMENTED_SEQ(); + } + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_XOR, [](X64Emitter& e, Instr*& i) { +table->AddSequence(OPCODE_XOR, [](X64Emitter& e, Instr*& i) { + if (IsIntType(i->dest->type)) { BinaryOp( e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { @@ -1948,21 +1994,29 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { e.xor(dest_src, src); }); - i = e.Advance(i); - return true; - }); + } else { + UNIMPLEMENTED_SEQ(); + } + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_NOT, [](X64Emitter& e, Instr*& i) { +table->AddSequence(OPCODE_NOT, [](X64Emitter& e, Instr*& i) { + if (IsIntType(i->dest->type)) { UnaryOp( e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src) { e.not(dest_src); }); - i = e.Advance(i); - return true; - }); + } else { + UNIMPLEMENTED_SEQ(); + } + i = e.Advance(i); + 
return true; +}); - table->AddSequence(OPCODE_SHL, [](X64Emitter& e, Instr*& i) { +table->AddSequence(OPCODE_SHL, [](X64Emitter& e, Instr*& i) { + if (IsIntType(i->dest->type)) { // TODO(benvanik): use shlx if available. BinaryOp( e, i, @@ -1982,17 +2036,21 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { e.shl(dest_src, src); }); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_VECTOR_SHL, [](X64Emitter& e, Instr*& i) { + } else { UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); + } + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_SHR, [](X64Emitter& e, Instr*& i) { +table->AddSequence(OPCODE_VECTOR_SHL, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); + +table->AddSequence(OPCODE_SHR, [](X64Emitter& e, Instr*& i) { + if (IsIntType(i->dest->type)) { // TODO(benvanik): use shrx if available. BinaryOp( e, i, @@ -2007,17 +2065,21 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { e.shr(dest_src, src); }); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_VECTOR_SHR, [](X64Emitter& e, Instr*& i) { + } else { UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); + } + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_SHA, [](X64Emitter& e, Instr*& i) { +table->AddSequence(OPCODE_VECTOR_SHR, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); + +table->AddSequence(OPCODE_SHA, [](X64Emitter& e, Instr*& i) { + if (IsIntType(i->dest->type)) { // TODO(benvanik): use sarx if available. BinaryOp( e, i, @@ -2032,17 +2094,21 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { e.sar(dest_src, src); }); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_VECTOR_SHA, [](X64Emitter& e, Instr*& i) { + } else { UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); + } + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_ROTATE_LEFT, [](X64Emitter& e, Instr*& i) { +table->AddSequence(OPCODE_VECTOR_SHA, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); + +table->AddSequence(OPCODE_ROTATE_LEFT, [](X64Emitter& e, Instr*& i) { + if (IsIntType(i->dest->type)) { BinaryOp( e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { @@ -2056,242 +2122,271 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { e.rol(dest_src, src); }); - i = e.Advance(i); - return true; - }); + } else { + UNIMPLEMENTED_SEQ(); + } + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_BYTE_SWAP, [](X64Emitter& e, Instr*& i) { - if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16)) { - Reg16 d, s1; - // TODO(benvanik): fix register allocator to put the value in ABCD - //e.BeginOp(i->dest, d, REG_DEST | REG_ABCD, - // i->src1.value, s1, 0); - //if (d != s1) { - // e.mov(d, s1); - // e.xchg(d.cvt8(), Reg8(d.getIdx() + 4)); - //} else { - // e.xchg(d.cvt8(), Reg8(d.getIdx() + 4)); - //} - e.BeginOp(i->dest, d, REG_DEST, - i->src1.value, s1, 0); - e.mov(e.ax, s1); - e.xchg(e.ah, e.al); - e.mov(d, e.ax); - e.EndOp(d, s1); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32)) { - Reg32 d, 
s1; - e.BeginOp(i->dest, d, REG_DEST, - i->src1.value, s1, 0); - if (d != s1) { - e.mov(d, s1); - e.bswap(d); - } else { - e.bswap(d); - } - e.EndOp(d, s1); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64)) { - Reg64 d, s1; - e.BeginOp(i->dest, d, REG_DEST, - i->src1.value, s1, 0); - if (d != s1) { - e.mov(d, s1); - e.bswap(d); - } else { - e.bswap(d); - } - e.EndOp(d, s1); +table->AddSequence(OPCODE_BYTE_SWAP, [](X64Emitter& e, Instr*& i) { + if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16)) { + Reg16 d, s1; + // TODO(benvanik): fix register allocator to put the value in ABCD + //e.BeginOp(i->dest, d, REG_DEST | REG_ABCD, + // i->src1.value, s1, 0); + //if (d != s1) { + // e.mov(d, s1); + // e.xchg(d.cvt8(), Reg8(d.getIdx() + 4)); + //} else { + // e.xchg(d.cvt8(), Reg8(d.getIdx() + 4)); + //} + e.BeginOp(i->dest, d, REG_DEST, + i->src1.value, s1, 0); + e.mov(e.ax, s1); + e.xchg(e.ah, e.al); + e.mov(d, e.ax); + e.EndOp(d, s1); + } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32)) { + Reg32 d, s1; + e.BeginOp(i->dest, d, REG_DEST, + i->src1.value, s1, 0); + if (d != s1) { + e.mov(d, s1); + e.bswap(d); } else { - ASSERT_INVALID_TYPE(); + e.bswap(d); } - i = e.Advance(i); - return true; - }); + e.EndOp(d, s1); + } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64)) { + Reg64 d, s1; + e.BeginOp(i->dest, d, REG_DEST, + i->src1.value, s1, 0); + if (d != s1) { + e.mov(d, s1); + e.bswap(d); + } else { + e.bswap(d); + } + e.EndOp(d, s1); + } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_V128)) { + Xmm d, s1; + e.db(0xCC); + e.BeginOp(i->dest, d, REG_DEST, + i->src1.value, s1, 0); + if (d != s1) { + e.shufps(d, s1, SHUFPS_SWAP_DWORDS); + } else { + e.shufps(d, d, SHUFPS_SWAP_DWORDS); + } + e.EndOp(d, s1); + } else { + ASSERT_INVALID_TYPE(); + } + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_CNTLZ, [](X64Emitter& e, Instr*& i) { - if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8)) { - Reg8 dest; - Reg8 src; +table->AddSequence(OPCODE_CNTLZ, [](X64Emitter& e, Instr*& i) { + if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8)) { + Reg8 dest; + Reg8 src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + e.bsr(dest.cvt16(), src.cvt16()); + // ZF = 1 if zero + e.mov(e.eax, 16); + e.cmovz(dest.cvt32(), e.eax); + e.sub(dest, 8); + e.xor(dest, 0x7); + e.EndOp(dest, src); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16)) { + Reg8 dest; + Reg16 src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + e.bsr(dest.cvt16(), src); + // ZF = 1 if zero + e.mov(e.eax, 16); + e.cmovz(dest.cvt32(), e.eax); + e.xor(dest, 0xF); + e.EndOp(dest, src); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32)) { + Reg8 dest; + Reg32 src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + e.bsr(dest.cvt32(), src); + // ZF = 1 if zero + e.mov(e.eax, 32); + e.cmovz(dest.cvt32(), e.eax); + e.xor(dest, 0x1F); + e.EndOp(dest, src); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64)) { + Reg8 dest; + Reg64 src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + e.bsr(dest, src); + // ZF = 1 if zero + e.mov(e.eax, 64); + e.cmovz(dest.cvt32(), e.eax); + e.xor(dest, 0x3F); + e.EndOp(dest, src); + } else { + UNIMPLEMENTED_SEQ(); + } + i = e.Advance(i); + return true; +}); + +table->AddSequence(OPCODE_INSERT, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); + +table->AddSequence(OPCODE_EXTRACT, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); + +table->AddSequence(OPCODE_SPLAT, [](X64Emitter& e, Instr*& 
i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); + +table->AddSequence(OPCODE_PERMUTE, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); + +table->AddSequence(OPCODE_SWIZZLE, [](X64Emitter& e, Instr*& i) { + if (IsVecType(i->dest->type)) { + // Defined by SWIZZLE_MASK() + if (i->flags == INT32_TYPE || i->flags == FLOAT32_TYPE) { + uint8_t swizzle_mask = (uint8_t)i->src2.offset; + e.db(0xCC); + Xmm dest, src1; e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.bsr(dest.cvt16(), src.cvt16()); - // ZF = 1 if zero - e.mov(e.eax, 16); - e.cmovz(dest.cvt32(), e.eax); - e.sub(dest, 8); - e.xor(dest, 0x7); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16)) { - Reg8 dest; - Reg16 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.bsr(dest.cvt16(), src); - // ZF = 1 if zero - e.mov(e.eax, 16); - e.cmovz(dest.cvt32(), e.eax); - e.xor(dest, 0xF); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32)) { - Reg8 dest; - Reg32 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.bsr(dest.cvt32(), src); - // ZF = 1 if zero - e.mov(e.eax, 32); - e.cmovz(dest.cvt32(), e.eax); - e.xor(dest, 0x1F); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64)) { - Reg8 dest; - Reg64 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.bsr(dest, src); - // ZF = 1 if zero - e.mov(e.eax, 64); - e.cmovz(dest.cvt32(), e.eax); - e.xor(dest, 0x3F); - e.EndOp(dest, src); + i->src1.value, src1, 0); + e.pshufd(dest, src1, swizzle_mask); + e.EndOp(dest, src1); } else { UNIMPLEMENTED_SEQ(); } - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_INSERT, [](X64Emitter& e, Instr*& i) { + } else { UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); + } + i = e.Advance(i); + return true; +}); - table->AddSequence(OPCODE_EXTRACT, [](X64Emitter& e, Instr*& i) { +table->AddSequence(OPCODE_PACK, [](X64Emitter& e, Instr*& i) { + if (i->flags == PACK_TYPE_D3DCOLOR) { UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_SPLAT, [](X64Emitter& e, Instr*& i) { + } else if (i->flags == PACK_TYPE_FLOAT16_2) { UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_PERMUTE, [](X64Emitter& e, Instr*& i) { + } else if (i->flags == PACK_TYPE_FLOAT16_4) { UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_SWIZZLE, [](X64Emitter& e, Instr*& i) { + } else if (i->flags == PACK_TYPE_SHORT_2) { UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_PACK, [](X64Emitter& e, Instr*& i) { - if (i->flags == PACK_TYPE_D3DCOLOR) { - UNIMPLEMENTED_SEQ(); - } else if (i->flags == PACK_TYPE_FLOAT16_2) { - UNIMPLEMENTED_SEQ(); - } else if (i->flags == PACK_TYPE_FLOAT16_4) { - UNIMPLEMENTED_SEQ(); - } else if (i->flags == PACK_TYPE_SHORT_2) { - UNIMPLEMENTED_SEQ(); - } else if (i->flags == PACK_TYPE_S8_IN_16_LO) { - UNIMPLEMENTED_SEQ(); - } else if (i->flags == PACK_TYPE_S8_IN_16_HI) { - UNIMPLEMENTED_SEQ(); - } else if (i->flags == PACK_TYPE_S16_IN_32_LO) { - UNIMPLEMENTED_SEQ(); - } else if (i->flags == PACK_TYPE_S16_IN_32_HI) { - UNIMPLEMENTED_SEQ(); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_UNPACK, [](X64Emitter& e, Instr*& i) { - if (i->flags == PACK_TYPE_D3DCOLOR) { - UNIMPLEMENTED_SEQ(); - } else if (i->flags == 
PACK_TYPE_FLOAT16_2) { - // 1 bit sign, 5 bit exponent, 10 bit mantissa - // D3D10 half float format - // TODO(benvanik): http://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx - // Use _mm_cvtph_ps -- requires very modern processors (SSE5+) - // Unpacking half floats: http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/ - // Packing half floats: https://gist.github.com/rygorous/2156668 - // Load source, move from tight pack of X16Y16.... to X16...Y16... - // Also zero out the high end. - // TODO(benvanik): special case constant unpacks that just get 0/1/etc. - UnaryOp( - e, i, - [](X64Emitter& e, Instr& i, const Reg& dest_src) { - // sx = src.iw >> 16; - // sy = src.iw & 0xFFFF; - // dest = { 3.0 + (sx / float(1 << 22)), - // 3.0 + (sy / float(1 << 22)), - // 0.0, - // 1.0); --- or 3.0? - // So: - // xmm = {0,0,0,packed} - // xmm <<= 1w {0,0,packed,0} - // xmm = VCVTPH2PS(xmm) {sx,sy,0,0} - // xmm /= - }); - UNIMPLEMENTED_SEQ(); - } else if (i->flags == PACK_TYPE_FLOAT16_4) { - // Could be shared with FLOAT16_2. - UNIMPLEMENTED_SEQ(); - } else if (i->flags == PACK_TYPE_SHORT_2) { - // (VD.x) = 3.0 + (VB.x)*2^-22 - // (VD.y) = 3.0 + (VB.y)*2^-22 - // (VD.z) = 0.0 - // (VD.w) = 3.0 - UNIMPLEMENTED_SEQ(); - } else if (i->flags == PACK_TYPE_S8_IN_16_LO) { - UNIMPLEMENTED_SEQ(); - } else if (i->flags == PACK_TYPE_S8_IN_16_HI) { - UNIMPLEMENTED_SEQ(); - } else if (i->flags == PACK_TYPE_S16_IN_32_LO) { - UNIMPLEMENTED_SEQ(); - } else if (i->flags == PACK_TYPE_S16_IN_32_HI) { - UNIMPLEMENTED_SEQ(); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; - }); - - // -------------------------------------------------------------------------- - // Atomic - // -------------------------------------------------------------------------- - - table->AddSequence(OPCODE_COMPARE_EXCHANGE, [](X64Emitter& e, Instr*& i) { + } else if (i->flags == PACK_TYPE_S8_IN_16_LO) { UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_ATOMIC_EXCHANGE, [](X64Emitter& e, Instr*& i) { + } else if (i->flags == PACK_TYPE_S8_IN_16_HI) { UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_ATOMIC_ADD, [](X64Emitter& e, Instr*& i) { + } else if (i->flags == PACK_TYPE_S16_IN_32_LO) { UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_ATOMIC_SUB, [](X64Emitter& e, Instr*& i) { + } else if (i->flags == PACK_TYPE_S16_IN_32_HI) { UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); + } else { + ASSERT_INVALID_TYPE(); + } + i = e.Advance(i); + return true; +}); + +table->AddSequence(OPCODE_UNPACK, [](X64Emitter& e, Instr*& i) { + if (i->flags == PACK_TYPE_D3DCOLOR) { + UNIMPLEMENTED_SEQ(); + } else if (i->flags == PACK_TYPE_FLOAT16_2) { + // 1 bit sign, 5 bit exponent, 10 bit mantissa + // D3D10 half float format + // TODO(benvanik): http://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx + // Use _mm_cvtph_ps -- requires very modern processors (SSE5+) + // Unpacking half floats: http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/ + // Packing half floats: https://gist.github.com/rygorous/2156668 + // Load source, move from tight pack of X16Y16.... to X16...Y16... + // Also zero out the high end. + // TODO(benvanik): special case constant unpacks that just get 0/1/etc. 
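+      // Scalar reference for one half -> float, as a sketch ('h' is a raw
+      // 16-bit half; normals only -- denormals/Inf/NaN need the fixups from
+      // the links above):
+      //   uint32_t sign    = (h & 0x8000u) << 16;
+      //   uint32_t expmant = (h & 0x7FFFu) << 13;  // realign exp+mantissa
+      //   uint32_t bits    = sign | (expmant + ((127u - 15u) << 23));  // rebias 15 -> 127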
+ UnaryOp( + e, i, + [](X64Emitter& e, Instr& i, const Reg& dest_src) { + // sx = src.iw >> 16; + // sy = src.iw & 0xFFFF; + // dest = { 3.0 + (sx / float(1 << 22)), + // 3.0 + (sy / float(1 << 22)), + // 0.0, + // 1.0); --- or 3.0? + // So: + // xmm = {0,0,0,packed} + // xmm <<= 1w {0,0,packed,0} + // xmm = VCVTPH2PS(xmm) {sx,sy,0,0} + // xmm /= + }); + UNIMPLEMENTED_SEQ(); + } else if (i->flags == PACK_TYPE_FLOAT16_4) { + // Could be shared with FLOAT16_2. + UNIMPLEMENTED_SEQ(); + } else if (i->flags == PACK_TYPE_SHORT_2) { + // (VD.x) = 3.0 + (VB.x)*2^-22 + // (VD.y) = 3.0 + (VB.y)*2^-22 + // (VD.z) = 0.0 + // (VD.w) = 3.0 + UNIMPLEMENTED_SEQ(); + } else if (i->flags == PACK_TYPE_S8_IN_16_LO) { + UNIMPLEMENTED_SEQ(); + } else if (i->flags == PACK_TYPE_S8_IN_16_HI) { + UNIMPLEMENTED_SEQ(); + } else if (i->flags == PACK_TYPE_S16_IN_32_LO) { + UNIMPLEMENTED_SEQ(); + } else if (i->flags == PACK_TYPE_S16_IN_32_HI) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } + i = e.Advance(i); + return true; +}); + +// -------------------------------------------------------------------------- +// Atomic +// -------------------------------------------------------------------------- + +table->AddSequence(OPCODE_COMPARE_EXCHANGE, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); + +table->AddSequence(OPCODE_ATOMIC_EXCHANGE, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); + +table->AddSequence(OPCODE_ATOMIC_ADD, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); + +table->AddSequence(OPCODE_ATOMIC_SUB, [](X64Emitter& e, Instr*& i) { + UNIMPLEMENTED_SEQ(); + i = e.Advance(i); + return true; +}); } diff --git a/src/alloy/hir/value.h b/src/alloy/hir/value.h index f1d4b1d37..6db5cc079 100644 --- a/src/alloy/hir/value.h +++ b/src/alloy/hir/value.h @@ -34,7 +34,13 @@ enum TypeName { }; static bool IsIntType(TypeName type_name) { - return type_name < 4; + return type_name <= INT64_TYPE; +} +static bool IsFloatType(TypeName type_name) { + return type_name == FLOAT32_TYPE || type_name == FLOAT64_TYPE; +} +static bool IsVecType(TypeName type_name) { + return type_name == VEC128_TYPE; } enum ValueFlags { From ed4efccc308e2d8932c6a584dca66b8d4f855455 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 26 Jan 2014 21:27:45 -0800 Subject: [PATCH 018/184] Fleshing out skeletons for most sequences. --- .../x64/lowering/lowering_sequences.cc | 372 +++++++++++++++--- 1 file changed, 319 insertions(+), 53 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index d3baff80a..ed83d71bc 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -884,18 +884,33 @@ table->AddSequence(OPCODE_BRANCH_FALSE, [](X64Emitter& e, Instr*& i) { // -------------------------------------------------------------------------- table->AddSequence(OPCODE_ASSIGN, [](X64Emitter& e, Instr*& i) { - UnaryOp( - e, i, - [](X64Emitter& e, Instr& i, const Reg& dest_src) { - // nop - the mov will have happened. - }); + if (IsIntType(i->dest->type)) { + UnaryOp( + e, i, + [](X64Emitter& e, Instr& i, const Reg& dest_src) { + // nop - the mov will have happened. 
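+          // (The op helper binds dest/src1 and performs the move itself
+          // when the registers differ, which is the entire effect of an
+          // integer ASSIGN.)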
+ }); + } else if (IsFloatType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else if (IsVecType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } i = e.Advance(i); return true; }); table->AddSequence(OPCODE_CAST, [](X64Emitter& e, Instr*& i) { - // Need a matrix. - UNIMPLEMENTED_SEQ(); + if (IsIntType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else if (IsFloatType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else if (IsVecType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } i = e.Advance(i); return true; }); @@ -1057,18 +1072,27 @@ table->AddSequence(OPCODE_CONVERT, [](X64Emitter& e, Instr*& i) { }); table->AddSequence(OPCODE_ROUND, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + // flags = ROUND_TO_* + if (IsFloatType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else if (IsVecType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } i = e.Advance(i); return true; }); table->AddSequence(OPCODE_VECTOR_CONVERT_I2F, [](X64Emitter& e, Instr*& i) { + // flags = ARITHMETIC_SATURATE | ARITHMETIC_UNSIGNED UNIMPLEMENTED_SEQ(); i = e.Advance(i); return true; }); table->AddSequence(OPCODE_VECTOR_CONVERT_F2I, [](X64Emitter& e, Instr*& i) { + // flags = ARITHMETIC_SATURATE | ARITHMETIC_UNSIGNED UNIMPLEMENTED_SEQ(); i = e.Advance(i); return true; @@ -1081,12 +1105,14 @@ table->AddSequence(OPCODE_VECTOR_CONVERT_F2I, [](X64Emitter& e, Instr*& i) { // specials for zeroing/etc (xor/etc) table->AddSequence(OPCODE_LOAD_VECTOR_SHL, [](X64Emitter& e, Instr*& i) { + XEASSERT(i->dest->type == VEC128_TYPE); UNIMPLEMENTED_SEQ(); i = e.Advance(i); return true; }); table->AddSequence(OPCODE_LOAD_VECTOR_SHR, [](X64Emitter& e, Instr*& i) { + XEASSERT(i->dest->type == VEC128_TYPE); UNIMPLEMENTED_SEQ(); i = e.Advance(i); return true; @@ -1487,19 +1513,43 @@ table->AddSequence(OPCODE_PREFETCH, [](X64Emitter& e, Instr*& i) { // -------------------------------------------------------------------------- table->AddSequence(OPCODE_MAX, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + if (IsIntType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else if (IsFloatType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else if (IsVecType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } i = e.Advance(i); return true; }); table->AddSequence(OPCODE_MIN, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + if (IsIntType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else if (IsFloatType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else if (IsVecType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } i = e.Advance(i); return true; }); table->AddSequence(OPCODE_SELECT, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + if (IsIntType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else if (IsFloatType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else if (IsVecType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } i = e.Advance(i); return true; }); @@ -1669,31 +1719,51 @@ table->AddSequence(OPCODE_DID_SATURATE, [](X64Emitter& e, Instr*& i) { }); table->AddSequence(OPCODE_VECTOR_COMPARE_EQ, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + if (IsVecType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } i = e.Advance(i); return true; }); table->AddSequence(OPCODE_VECTOR_COMPARE_SGT, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + if (IsVecType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else { + 
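+    // A non-vector destination is malformed IR for a vector compare.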
ASSERT_INVALID_TYPE(); + } i = e.Advance(i); return true; }); table->AddSequence(OPCODE_VECTOR_COMPARE_SGE, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + if (IsVecType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } i = e.Advance(i); return true; }); table->AddSequence(OPCODE_VECTOR_COMPARE_UGT, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + if (IsVecType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } i = e.Advance(i); return true; }); table->AddSequence(OPCODE_VECTOR_COMPARE_UGE, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + if (IsVecType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } i = e.Advance(i); return true; }); @@ -1712,8 +1782,12 @@ table->AddSequence(OPCODE_ADD, [](X64Emitter& e, Instr*& i) { [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { e.add(dest_src, src); }); - } else { + } else if (IsFloatType(i->dest->type)) { UNIMPLEMENTED_SEQ(); + } else if (IsVecType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); } i = e.Advance(i); return true; @@ -1759,6 +1833,26 @@ table->AddSequence(OPCODE_ADD_CARRY, [](X64Emitter& e, Instr*& i) { return true; }); +table->AddSequence(OPCODE_VECTOR_ADD, [](X64Emitter& e, Instr*& i) { + if (IsVecType(i->dest->type)) { + if (i->flags == INT8_TYPE) { + UNIMPLEMENTED_SEQ(); + } else if (i->flags == INT16_TYPE) { + UNIMPLEMENTED_SEQ(); + } else if (i->flags == INT32_TYPE) { + UNIMPLEMENTED_SEQ(); + } else if (i->flags == FLOAT32_TYPE) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } + } else { + ASSERT_INVALID_TYPE(); + } + i = e.Advance(i); + return true; +}); + table->AddSequence(OPCODE_SUB, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { BinaryOp( @@ -1769,8 +1863,12 @@ table->AddSequence(OPCODE_SUB, [](X64Emitter& e, Instr*& i) { [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { e.sub(dest_src, src); }); - } else { + } else if (IsFloatType(i->dest->type)) { UNIMPLEMENTED_SEQ(); + } else if (IsVecType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); } i = e.Advance(i); return true; @@ -1808,8 +1906,12 @@ table->AddSequence(OPCODE_MUL, [](X64Emitter& e, Instr*& i) { } e.mov(dest_src, Nax); }); - } else { + } else if (IsFloatType(i->dest->type)) { UNIMPLEMENTED_SEQ(); + } else if (IsVecType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); } i = e.Advance(i); return true; @@ -1883,69 +1985,137 @@ table->AddSequence(OPCODE_DIV, [](X64Emitter& e, Instr*& i) { } e.mov(dest_src, Nax); }); - } else { + } else if (IsFloatType(i->dest->type)) { UNIMPLEMENTED_SEQ(); + } else if (IsVecType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); } i = e.Advance(i); return true; }); table->AddSequence(OPCODE_MUL_ADD, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + if (IsIntType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else if (IsFloatType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else if (IsVecType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } i = e.Advance(i); return true; }); table->AddSequence(OPCODE_MUL_SUB, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + if (IsIntType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else if (IsFloatType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else if (IsVecType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } i = e.Advance(i); return true; 
}); table->AddSequence(OPCODE_NEG, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + if (IsIntType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else if (IsFloatType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else if (IsVecType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } i = e.Advance(i); return true; }); table->AddSequence(OPCODE_ABS, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + if (IsIntType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else if (IsFloatType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else if (IsVecType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } i = e.Advance(i); return true; }); table->AddSequence(OPCODE_SQRT, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + if (IsFloatType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else if (IsVecType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } i = e.Advance(i); return true; }); table->AddSequence(OPCODE_RSQRT, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + if (IsFloatType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else if (IsVecType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } i = e.Advance(i); return true; }); table->AddSequence(OPCODE_POW2, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + if (IsFloatType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else if (IsVecType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } i = e.Advance(i); return true; }); table->AddSequence(OPCODE_LOG2, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + if (IsFloatType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else if (IsVecType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } i = e.Advance(i); return true; }); table->AddSequence(OPCODE_DOT_PRODUCT_3, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + if (IsVecType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } i = e.Advance(i); return true; }); table->AddSequence(OPCODE_DOT_PRODUCT_4, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + if (IsVecType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } i = e.Advance(i); return true; }); @@ -1960,8 +2130,10 @@ table->AddSequence(OPCODE_AND, [](X64Emitter& e, Instr*& i) { [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { e.and(dest_src, src); }); - } else { + } else if (IsVecType(i->dest->type)) { UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); } i = e.Advance(i); return true; @@ -1977,8 +2149,10 @@ table->AddSequence(OPCODE_OR, [](X64Emitter& e, Instr*& i) { [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { e.or(dest_src, src); }); - } else { + } else if (IsVecType(i->dest->type)) { UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); } i = e.Advance(i); return true; @@ -1994,8 +2168,10 @@ table->AddSequence(OPCODE_XOR, [](X64Emitter& e, Instr*& i) { [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { e.xor(dest_src, src); }); - } else { + } else if (IsVecType(i->dest->type)) { UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); } i = e.Advance(i); return true; @@ -2008,8 +2184,10 @@ table->AddSequence(OPCODE_NOT, [](X64Emitter& e, Instr*& i) { [](X64Emitter& e, Instr& i, const Reg& dest_src) { e.not(dest_src); }); - } else { + } else if (IsVecType(i->dest->type)) { UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); } i = e.Advance(i); return true; @@ -2043,12 
+2221,6 @@ table->AddSequence(OPCODE_SHL, [](X64Emitter& e, Instr*& i) { return true; }); -table->AddSequence(OPCODE_VECTOR_SHL, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; -}); - table->AddSequence(OPCODE_SHR, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { // TODO(benvanik): use shrx if available. @@ -2072,12 +2244,6 @@ table->AddSequence(OPCODE_SHR, [](X64Emitter& e, Instr*& i) { return true; }); -table->AddSequence(OPCODE_VECTOR_SHR, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; -}); - table->AddSequence(OPCODE_SHA, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { // TODO(benvanik): use sarx if available. @@ -2101,8 +2267,56 @@ table->AddSequence(OPCODE_SHA, [](X64Emitter& e, Instr*& i) { return true; }); +table->AddSequence(OPCODE_VECTOR_SHL, [](X64Emitter& e, Instr*& i) { + if (IsVecType(i->dest->type)) { + if (i->flags == INT8_TYPE) { + UNIMPLEMENTED_SEQ(); + } else if (i->flags == INT16_TYPE) { + UNIMPLEMENTED_SEQ(); + } else if (i->flags == INT32_TYPE) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } + } else { + ASSERT_INVALID_TYPE(); + } + i = e.Advance(i); + return true; +}); + +table->AddSequence(OPCODE_VECTOR_SHR, [](X64Emitter& e, Instr*& i) { + if (IsVecType(i->dest->type)) { + if (i->flags == INT8_TYPE) { + UNIMPLEMENTED_SEQ(); + } else if (i->flags == INT16_TYPE) { + UNIMPLEMENTED_SEQ(); + } else if (i->flags == INT32_TYPE) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } + } else { + ASSERT_INVALID_TYPE(); + } + i = e.Advance(i); + return true; +}); + table->AddSequence(OPCODE_VECTOR_SHA, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + if (IsVecType(i->dest->type)) { + if (i->flags == INT8_TYPE) { + UNIMPLEMENTED_SEQ(); + } else if (i->flags == INT16_TYPE) { + UNIMPLEMENTED_SEQ(); + } else if (i->flags == INT32_TYPE) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } + } else { + ASSERT_INVALID_TYPE(); + } i = e.Advance(i); return true; }); @@ -2241,25 +2455,73 @@ table->AddSequence(OPCODE_CNTLZ, [](X64Emitter& e, Instr*& i) { }); table->AddSequence(OPCODE_INSERT, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + if (IsVecType(i->dest->type)) { + if (i->src3.value->type == INT8_TYPE) { + UNIMPLEMENTED_SEQ(); + } else if (i->src3.value->type == INT16_TYPE) { + UNIMPLEMENTED_SEQ(); + } else if (i->src3.value->type == INT32_TYPE) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } + } else { + ASSERT_INVALID_TYPE(); + } i = e.Advance(i); return true; }); table->AddSequence(OPCODE_EXTRACT, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + if (IsVecType(i->src1.value->type)) { + if (i->dest->type == INT8_TYPE) { + UNIMPLEMENTED_SEQ(); + } else if (i->dest->type == INT16_TYPE) { + UNIMPLEMENTED_SEQ(); + } else if (i->dest->type == INT32_TYPE) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } + } else { + ASSERT_INVALID_TYPE(); + } i = e.Advance(i); return true; }); table->AddSequence(OPCODE_SPLAT, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + if (IsVecType(i->dest->type)) { + if (i->src1.value->type == INT8_TYPE) { + UNIMPLEMENTED_SEQ(); + } else if (i->src1.value->type == INT16_TYPE) { + UNIMPLEMENTED_SEQ(); + } else if (i->src1.value->type == INT32_TYPE) { + UNIMPLEMENTED_SEQ(); + } else if (i->src1.value->type == FLOAT32_TYPE) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } + } else { + ASSERT_INVALID_TYPE(); + } i = e.Advance(i); return 
true; }); table->AddSequence(OPCODE_PERMUTE, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + if (IsVecType(i->dest->type)) { + if (i->src1.value->type == INT32_TYPE) { + UNIMPLEMENTED_SEQ(); + } else if (i->src1.value->type == VEC128_TYPE) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } + } else { + ASSERT_INVALID_TYPE(); + } i = e.Advance(i); return true; }); @@ -2279,7 +2541,7 @@ table->AddSequence(OPCODE_SWIZZLE, [](X64Emitter& e, Instr*& i) { UNIMPLEMENTED_SEQ(); } } else { - UNIMPLEMENTED_SEQ(); + ASSERT_INVALID_TYPE(); } i = e.Advance(i); return true; @@ -2373,7 +2635,11 @@ table->AddSequence(OPCODE_COMPARE_EXCHANGE, [](X64Emitter& e, Instr*& i) { }); table->AddSequence(OPCODE_ATOMIC_EXCHANGE, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + if (IsIntType(i->dest->type)) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } i = e.Advance(i); return true; }); From 6e35b6efa32e588ca21f1741f98b2d8b6ac7718d Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 26 Jan 2014 21:30:34 -0800 Subject: [PATCH 019/184] Renaming op utils. --- .../x64/lowering/lowering_sequences.cc | 138 +++++++++--------- 1 file changed, 69 insertions(+), 69 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index ed83d71bc..2aa9bb948 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -302,7 +302,7 @@ void CompareXX(X64Emitter& e, Instr*& i, void(set_fn)(X64Emitter& e, Reg8& dest, typedef void(v_fn)(X64Emitter& e, Instr& i, const Reg& dest_src); template -void UnaryOpV(X64Emitter& e, Instr*& i, v_fn v_fn, +void IntUnaryOpV(X64Emitter& e, Instr*& i, v_fn v_fn, T& dest, T& src1) { e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src1, 0); @@ -315,38 +315,38 @@ void UnaryOpV(X64Emitter& e, Instr*& i, v_fn v_fn, e.EndOp(dest, src1); } template -void UnaryOpC(X64Emitter& e, Instr*& i, v_fn v_fn, +void IntUnaryOpC(X64Emitter& e, Instr*& i, v_fn v_fn, T& dest, Value* src1) { e.BeginOp(i->dest, dest, REG_DEST); e.mov(dest, (uint64_t)src1->get_constant(CT())); v_fn(e, *i, dest); e.EndOp(dest); } -void UnaryOp(X64Emitter& e, Instr*& i, v_fn v_fn) { +void IntUnaryOp(X64Emitter& e, Instr*& i, v_fn v_fn) { if (i->Match(SIG_TYPE_I8, SIG_TYPE_I8)) { Reg8 dest, src1; - UnaryOpV(e, i, v_fn, dest, src1); + IntUnaryOpV(e, i, v_fn, dest, src1); } else if (i->Match(SIG_TYPE_I8, SIG_TYPE_I8C)) { Reg8 dest; - UnaryOpC(e, i, v_fn, dest, i->src1.value); + IntUnaryOpC(e, i, v_fn, dest, i->src1.value); } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16)) { Reg16 dest, src1; - UnaryOpV(e, i, v_fn, dest, src1); + IntUnaryOpV(e, i, v_fn, dest, src1); } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16C)) { Reg16 dest; - UnaryOpC(e, i, v_fn, dest, i->src1.value); + IntUnaryOpC(e, i, v_fn, dest, i->src1.value); } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32)) { Reg32 dest, src1; - UnaryOpV(e, i, v_fn, dest, src1); + IntUnaryOpV(e, i, v_fn, dest, src1); } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32C)) { Reg32 dest; - UnaryOpC(e, i, v_fn, dest, i->src1.value); + IntUnaryOpC(e, i, v_fn, dest, i->src1.value); } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64)) { Reg64 dest, src1; - UnaryOpV(e, i, v_fn, dest, src1); + IntUnaryOpV(e, i, v_fn, dest, src1); } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64C)) { Reg64 dest; - UnaryOpC(e, i, v_fn, dest, i->src1.value); + IntUnaryOpC(e, i, v_fn, dest, i->src1.value); } else { ASSERT_INVALID_TYPE(); } @@ -360,7 +360,7 @@ 
void UnaryOp(X64Emitter& e, Instr*& i, v_fn v_fn) { typedef void(vv_fn)(X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src); typedef void(vc_fn)(X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src); template -void BinaryOpVV(X64Emitter& e, Instr*& i, vv_fn vv_fn, +void IntBinaryOpVV(X64Emitter& e, Instr*& i, vv_fn vv_fn, TD& dest, TS1& src1, TS2& src2) { e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src1, 0, @@ -383,7 +383,7 @@ void BinaryOpVV(X64Emitter& e, Instr*& i, vv_fn vv_fn, e.EndOp(dest, src1, src2); } template -void BinaryOpVC(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn, +void IntBinaryOpVC(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn, TD& dest, TS1& src1, Value* src2) { e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src1, 0); @@ -409,7 +409,7 @@ void BinaryOpVC(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn, e.EndOp(dest, src1); } template -void BinaryOpCV(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn, +void IntBinaryOpCV(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn, TD& dest, Value* src1, TS2& src2) { e.BeginOp(i->dest, dest, REG_DEST, i->src2.value, src2, 0); @@ -448,80 +448,80 @@ void BinaryOpCV(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn, } e.EndOp(dest, src2); } -void BinaryOp(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn) { +void IntBinaryOp(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn) { // TODO(benvanik): table lookup. This linear scan is slow. // Note: we assume DEST.type = SRC1.type, but that SRC2.type may vary. XEASSERT(i->dest->type == i->src1.value->type); if (i->Match(SIG_TYPE_I8, SIG_TYPE_I8, SIG_TYPE_I8)) { Reg8 dest, src1, src2; - BinaryOpVV(e, i, vv_fn, dest, src1, src2); + IntBinaryOpVV(e, i, vv_fn, dest, src1, src2); } else if (i->Match(SIG_TYPE_I8, SIG_TYPE_I8, SIG_TYPE_I8C)) { Reg8 dest, src1; - BinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); + IntBinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); } else if (i->Match(SIG_TYPE_I8, SIG_TYPE_I8C, SIG_TYPE_I8)) { Reg8 dest, src2; - BinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); + IntBinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I16)) { Reg16 dest, src1, src2; - BinaryOpVV(e, i, vv_fn, dest, src1, src2); + IntBinaryOpVV(e, i, vv_fn, dest, src1, src2); } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I16C)) { Reg16 dest, src1; - BinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); + IntBinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16C, SIG_TYPE_I16)) { Reg16 dest, src2; - BinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); + IntBinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I32)) { Reg32 dest, src1, src2; - BinaryOpVV(e, i, vv_fn, dest, src1, src2); + IntBinaryOpVV(e, i, vv_fn, dest, src1, src2); } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I32C)) { Reg32 dest, src1; - BinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); + IntBinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32C, SIG_TYPE_I32)) { Reg32 dest, src2; - BinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); + IntBinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I64)) { Reg64 dest, src1, src2; - BinaryOpVV(e, i, vv_fn, dest, src1, src2); + IntBinaryOpVV(e, i, vv_fn, dest, src1, src2); } else if 
(i->Match(SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I64C)) { Reg64 dest, src1; - BinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); + IntBinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64C, SIG_TYPE_I64)) { Reg64 dest, src2; - BinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); + IntBinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); // Start forced src2=i8 } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I8)) { Reg16 dest, src1; Reg8 src2; - BinaryOpVV(e, i, vv_fn, dest, src1, src2); + IntBinaryOpVV(e, i, vv_fn, dest, src1, src2); } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I8C)) { Reg16 dest, src1; - BinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); + IntBinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16C, SIG_TYPE_I8)) { Reg16 dest; Reg8 src2; - BinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); + IntBinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I8)) { Reg32 dest, src1; Reg8 src2; - BinaryOpVV(e, i, vv_fn, dest, src1, src2); + IntBinaryOpVV(e, i, vv_fn, dest, src1, src2); } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I8C)) { Reg32 dest, src1; - BinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); + IntBinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32C, SIG_TYPE_I8)) { Reg32 dest; Reg8 src2; - BinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); + IntBinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I8)) { Reg64 dest, src1; Reg8 src2; - BinaryOpVV(e, i, vv_fn, dest, src1, src2); + IntBinaryOpVV(e, i, vv_fn, dest, src1, src2); } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I8C)) { Reg64 dest, src1; - BinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); + IntBinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64C, SIG_TYPE_I8)) { Reg64 dest; Reg8 src2; - BinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); + IntBinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); } else { ASSERT_INVALID_TYPE(); } @@ -536,7 +536,7 @@ typedef void(vvv_fn)(X64Emitter& e, Instr& i, const Reg& dest_src1, const Operan typedef void(vvc_fn)(X64Emitter& e, Instr& i, const Reg& dest_src1, const Operand& src2, uint32_t src3); typedef void(vcv_fn)(X64Emitter& e, Instr& i, const Reg& dest_src1, uint32_t src2, const Operand& src3); template -void TernaryOpVVV(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, +void IntTernaryOpVVV(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, TD& dest, TS1& src1, TS2& src2, TS3& src3) { e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src1, 0, @@ -557,7 +557,7 @@ void TernaryOpVVV(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, e.EndOp(dest, src1, src2, src3); } template -void TernaryOpVVC(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vvc_fn vvc_fn, +void IntTernaryOpVVC(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vvc_fn vvc_fn, TD& dest, TS1& src1, TS2& src2, Value* src3) { e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src1, 0, @@ -605,7 +605,7 @@ void TernaryOpVVC(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vvc_fn vvc_fn, e.EndOp(dest, src1, src2); } template -void TernaryOpVCV(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vcv_fn vcv_fn, +void IntTernaryOpVCV(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vcv_fn vcv_fn, TD& dest, TS1& src1, Value* src2, TS3& src3) { 
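   // VCV shape: src1/src3 live in registers while src2 arrives as a constant;
   // the body is expected to fold small immediates through vcv_fn and
   // otherwise materialize src2 and reuse the all-register vvv_fn path.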
e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src1, 0, @@ -652,7 +652,7 @@ void TernaryOpVCV(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vcv_fn vcv_fn, } e.EndOp(dest, src1, src3); } -void TernaryOp(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vvc_fn vvc_fn, vcv_fn vcv_fn) { +void IntTernaryOp(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vvc_fn vvc_fn, vcv_fn vcv_fn) { // TODO(benvanik): table lookup. This linear scan is slow. // Note: we assume DEST.type = SRC1.type = SRC2.type, but that SRC3.type may vary. XEASSERT(i->dest->type == i->src1.value->type && @@ -661,44 +661,44 @@ void TernaryOp(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vvc_fn vvc_fn, vcv_fn vc if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8, SIG_TYPE_I8, SIG_TYPE_I8)) { Reg8 dest, src1, src2; Reg8 src3; - TernaryOpVVV(e, i, vvv_fn, dest, src1, src2, src3); + IntTernaryOpVVV(e, i, vvv_fn, dest, src1, src2, src3); } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8, SIG_TYPE_I8, SIG_TYPE_I8C)) { Reg8 dest, src1, src2; - TernaryOpVVC(e, i, vvv_fn, vvc_fn, dest, src1, src2, i->src3.value); + IntTernaryOpVVC(e, i, vvv_fn, vvc_fn, dest, src1, src2, i->src3.value); } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I8)) { Reg16 dest, src1, src2; Reg8 src3; - TernaryOpVVV(e, i, vvv_fn, dest, src1, src2, src3); + IntTernaryOpVVV(e, i, vvv_fn, dest, src1, src2, src3); } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I8C)) { Reg16 dest, src1, src2; - TernaryOpVVC(e, i, vvv_fn, vvc_fn, dest, src1, src2, i->src3.value); + IntTernaryOpVVC(e, i, vvv_fn, vvc_fn, dest, src1, src2, i->src3.value); } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I8)) { Reg32 dest, src1, src2; Reg8 src3; - TernaryOpVVV(e, i,vvv_fn, dest, src1, src2, src3); + IntTernaryOpVVV(e, i,vvv_fn, dest, src1, src2, src3); } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I8C)) { Reg32 dest, src1, src2; - TernaryOpVVC(e, i, vvv_fn, vvc_fn, dest, src1, src2, i->src3.value); + IntTernaryOpVVC(e, i, vvv_fn, vvc_fn, dest, src1, src2, i->src3.value); } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I8)) { Reg64 dest, src1, src2; Reg8 src3; - TernaryOpVVV(e, i, vvv_fn, dest, src1, src2, src3); + IntTernaryOpVVV(e, i, vvv_fn, dest, src1, src2, src3); } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I8C)) { Reg64 dest, src1, src2; - TernaryOpVVC(e, i, vvv_fn, vvc_fn, dest, src1, src2, i->src3.value); + IntTernaryOpVVC(e, i, vvv_fn, vvc_fn, dest, src1, src2, i->src3.value); // } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8, SIG_TYPE_I8C, SIG_TYPE_I8)) { Reg8 dest, src1, src3; - TernaryOpVCV(e, i, vvv_fn, vcv_fn, dest, src1, i->src2.value, src3); + IntTernaryOpVCV(e, i, vvv_fn, vcv_fn, dest, src1, i->src2.value, src3); } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16, SIG_TYPE_I16C, SIG_TYPE_I8)) { Reg16 dest, src1, src3; - TernaryOpVCV(e, i, vvv_fn, vcv_fn, dest, src1, i->src2.value, src3); + IntTernaryOpVCV(e, i, vvv_fn, vcv_fn, dest, src1, i->src2.value, src3); } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32, SIG_TYPE_I32C, SIG_TYPE_I8)) { Reg32 dest, src1, src3; - TernaryOpVCV(e, i, vvv_fn, vcv_fn, dest, src1, i->src2.value, src3); + IntTernaryOpVCV(e, i, vvv_fn, vcv_fn, dest, src1, i->src2.value, src3); } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64, SIG_TYPE_I64C, SIG_TYPE_I8)) { Reg64 dest, src1, src3; - TernaryOpVCV(e, i, vvv_fn, vcv_fn, dest, src1, i->src2.value, src3); + IntTernaryOpVCV(e, i, vvv_fn, vcv_fn, dest, src1, i->src2.value, 
src3); } else { ASSERT_INVALID_TYPE(); } @@ -885,7 +885,7 @@ table->AddSequence(OPCODE_BRANCH_FALSE, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_ASSIGN, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { - UnaryOp( + IntUnaryOp( e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src) { // nop - the mov will have happened. @@ -1311,7 +1311,7 @@ table->AddSequence(OPCODE_LOAD, [](X64Emitter& e, Instr*& i) { if (cbs->handles(cbs->context, address)) { // Eh, hacking lambdas. i->src3.offset = (uint64_t)cbs; - UnaryOp( + IntUnaryOp( e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src) { auto cbs = (RegisterAccessCallbacks*)i.src3.offset; @@ -1774,7 +1774,7 @@ table->AddSequence(OPCODE_VECTOR_COMPARE_UGE, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_ADD, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { - BinaryOp( + IntBinaryOp( e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { e.add(dest_src, src); @@ -1796,7 +1796,7 @@ table->AddSequence(OPCODE_ADD, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_ADD_CARRY, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { // dest = src1 + src2 + src3.i8 - TernaryOp( + IntTernaryOp( e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src2, const Operand& src3) { Reg8 src3_8(src3.getIdx()); @@ -1855,7 +1855,7 @@ table->AddSequence(OPCODE_VECTOR_ADD, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_SUB, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { - BinaryOp( + IntBinaryOp( e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { e.sub(dest_src, src); @@ -1878,7 +1878,7 @@ table->AddSequence(OPCODE_SUB, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_MUL, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { - BinaryOp( + IntBinaryOp( e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { // RAX = value, RDX = clobbered @@ -1919,7 +1919,7 @@ table->AddSequence(OPCODE_MUL, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_MUL_HI, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { - BinaryOp( + IntBinaryOp( e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { // RAX = value, RDX = clobbered @@ -1957,7 +1957,7 @@ table->AddSequence(OPCODE_MUL_HI, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_DIV, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { - BinaryOp( + IntBinaryOp( e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { // RAX = value, RDX = clobbered @@ -2122,7 +2122,7 @@ table->AddSequence(OPCODE_DOT_PRODUCT_4, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_AND, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { - BinaryOp( + IntBinaryOp( e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { e.and(dest_src, src); @@ -2141,7 +2141,7 @@ table->AddSequence(OPCODE_AND, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_OR, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { - BinaryOp( + IntBinaryOp( e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { e.or(dest_src, src); @@ -2160,7 +2160,7 @@ table->AddSequence(OPCODE_OR, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_XOR, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { - BinaryOp( + IntBinaryOp( e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { e.xor(dest_src, src); @@ -2179,7 +2179,7 
@@ table->AddSequence(OPCODE_XOR, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_NOT, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { - UnaryOp( + IntUnaryOp( e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src) { e.not(dest_src); @@ -2196,7 +2196,7 @@ table->AddSequence(OPCODE_NOT, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_SHL, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { // TODO(benvanik): use shlx if available. - BinaryOp( + IntBinaryOp( e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { // Can only shl by cl. Eww x86. @@ -2224,7 +2224,7 @@ table->AddSequence(OPCODE_SHL, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_SHR, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { // TODO(benvanik): use shrx if available. - BinaryOp( + IntBinaryOp( e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { // Can only sar by cl. Eww x86. @@ -2247,7 +2247,7 @@ table->AddSequence(OPCODE_SHR, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_SHA, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { // TODO(benvanik): use sarx if available. - BinaryOp( + IntBinaryOp( e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { // Can only sar by cl. Eww x86. @@ -2323,7 +2323,7 @@ table->AddSequence(OPCODE_VECTOR_SHA, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_ROTATE_LEFT, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { - BinaryOp( + IntBinaryOp( e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { // Can only rol by cl. Eww x86. @@ -2584,7 +2584,7 @@ table->AddSequence(OPCODE_UNPACK, [](X64Emitter& e, Instr*& i) { // Load source, move from tight pack of X16Y16.... to X16...Y16... // Also zero out the high end. // TODO(benvanik): special case constant unpacks that just get 0/1/etc. - UnaryOp( + IntUnaryOp( e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src) { // sx = src.iw >> 16; From ae6c903173a0f52ecbed6aae5b791fdb54191db8 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 26 Jan 2014 21:34:46 -0800 Subject: [PATCH 020/184] Reformating lambdas to make vs happier. --- .../x64/lowering/lowering_sequences.cc | 412 ++++++++---------- 1 file changed, 183 insertions(+), 229 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index 2aa9bb948..564f66bf3 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -1311,15 +1311,13 @@ table->AddSequence(OPCODE_LOAD, [](X64Emitter& e, Instr*& i) { if (cbs->handles(cbs->context, address)) { // Eh, hacking lambdas. 
i->src3.offset = (uint64_t)cbs; - IntUnaryOp( - e, i, - [](X64Emitter& e, Instr& i, const Reg& dest_src) { - auto cbs = (RegisterAccessCallbacks*)i.src3.offset; - e.mov(e.rcx, (uint64_t)cbs->context); - e.mov(e.rdx, i.src1.value->AsUint64()); - CallNative(e, cbs->read); - e.mov(dest_src, e.rax); - }); + IntUnaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src) { + auto cbs = (RegisterAccessCallbacks*)i.src3.offset; + e.mov(e.rcx, (uint64_t)cbs->context); + e.mov(e.rdx, i.src1.value->AsUint64()); + CallNative(e, cbs->read); + e.mov(dest_src, e.rax); + }); i = e.Advance(i); return true; } @@ -1774,14 +1772,11 @@ table->AddSequence(OPCODE_VECTOR_COMPARE_UGE, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_ADD, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { - IntBinaryOp( - e, i, - [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { - e.add(dest_src, src); - }, - [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { - e.add(dest_src, src); - }); + IntBinaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { + e.add(dest_src, src); + }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { + e.add(dest_src, src); + }); } else if (IsFloatType(i->dest->type)) { UNIMPLEMENTED_SEQ(); } else if (IsVecType(i->dest->type)) { @@ -1796,36 +1791,32 @@ table->AddSequence(OPCODE_ADD, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_ADD_CARRY, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { // dest = src1 + src2 + src3.i8 - IntTernaryOp( - e, i, - [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src2, const Operand& src3) { - Reg8 src3_8(src3.getIdx()); - if (src3.getIdx() <= 4) { - e.mov(e.ah, src3_8); - } else { - e.mov(e.al, src3_8); - e.mov(e.ah, e.al); - } - e.sahf(); - e.adc(dest_src, src2); - }, - [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src2, uint32_t src3) { - e.mov(e.eax, src3); - e.mov(e.ah, e.al); - e.sahf(); - e.adc(dest_src, src2); - }, - [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src2, const Operand& src3) { - Reg8 src3_8(src3.getIdx()); - if (src3.getIdx() <= 4) { - e.mov(e.ah, src3_8); - } else { - e.mov(e.al, src3_8); - e.mov(e.ah, e.al); - } - e.sahf(); - e.adc(dest_src, src2); - }); + IntTernaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src2, const Operand& src3) { + Reg8 src3_8(src3.getIdx()); + if (src3.getIdx() <= 4) { + e.mov(e.ah, src3_8); + } else { + e.mov(e.al, src3_8); + e.mov(e.ah, e.al); + } + e.sahf(); + e.adc(dest_src, src2); + }, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src2, uint32_t src3) { + e.mov(e.eax, src3); + e.mov(e.ah, e.al); + e.sahf(); + e.adc(dest_src, src2); + }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src2, const Operand& src3) { + Reg8 src3_8(src3.getIdx()); + if (src3.getIdx() <= 4) { + e.mov(e.ah, src3_8); + } else { + e.mov(e.al, src3_8); + e.mov(e.ah, e.al); + } + e.sahf(); + e.adc(dest_src, src2); + }); } else { UNIMPLEMENTED_SEQ(); } @@ -1855,14 +1846,11 @@ table->AddSequence(OPCODE_VECTOR_ADD, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_SUB, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { - IntBinaryOp( - e, i, - [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { - e.sub(dest_src, src); - }, - [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { - e.sub(dest_src, src); - }); + IntBinaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const 
Operand& src) { + e.sub(dest_src, src); + }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { + e.sub(dest_src, src); + }); } else if (IsFloatType(i->dest->type)) { UNIMPLEMENTED_SEQ(); } else if (IsVecType(i->dest->type)) { @@ -1878,34 +1866,31 @@ table->AddSequence(OPCODE_SUB, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_MUL, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { - IntBinaryOp( - e, i, - [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { - // RAX = value, RDX = clobbered - // TODO(benvanik): make the register allocator put dest_src in RAX? - auto Nax = LIKE_REG(e.rax, dest_src); - e.mov(Nax, dest_src); - if (i.flags & ARITHMETIC_UNSIGNED) { - e.mul(src); - } else { - e.imul(src); - } - e.mov(dest_src, Nax); - }, - [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { - // RAX = value, RDX = clobbered - // TODO(benvanik): make the register allocator put dest_src in RAX? - auto Nax = LIKE_REG(e.rax, dest_src); - auto Ndx = LIKE_REG(e.rdx, dest_src); - e.mov(Nax, dest_src); - e.mov(Ndx, src); - if (i.flags & ARITHMETIC_UNSIGNED) { - e.mul(Ndx); - } else { - e.imul(Ndx); - } - e.mov(dest_src, Nax); - }); + IntBinaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { + // RAX = value, RDX = clobbered + // TODO(benvanik): make the register allocator put dest_src in RAX? + auto Nax = LIKE_REG(e.rax, dest_src); + e.mov(Nax, dest_src); + if (i.flags & ARITHMETIC_UNSIGNED) { + e.mul(src); + } else { + e.imul(src); + } + e.mov(dest_src, Nax); + }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { + // RAX = value, RDX = clobbered + // TODO(benvanik): make the register allocator put dest_src in RAX? + auto Nax = LIKE_REG(e.rax, dest_src); + auto Ndx = LIKE_REG(e.rdx, dest_src); + e.mov(Nax, dest_src); + e.mov(Ndx, src); + if (i.flags & ARITHMETIC_UNSIGNED) { + e.mul(Ndx); + } else { + e.imul(Ndx); + } + e.mov(dest_src, Nax); + }); } else if (IsFloatType(i->dest->type)) { UNIMPLEMENTED_SEQ(); } else if (IsVecType(i->dest->type)) { @@ -1919,35 +1904,32 @@ table->AddSequence(OPCODE_MUL, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_MUL_HI, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { - IntBinaryOp( - e, i, - [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { - // RAX = value, RDX = clobbered - // TODO(benvanik): make the register allocator put dest_src in RAX? - auto Nax = LIKE_REG(e.rax, dest_src); - auto Ndx = LIKE_REG(e.rdx, dest_src); - e.mov(Nax, dest_src); - if (i.flags & ARITHMETIC_UNSIGNED) { - e.mul(src); - } else { - e.imul(src); - } - e.mov(dest_src, Ndx); - }, - [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { - // RAX = value, RDX = clobbered - // TODO(benvanik): make the register allocator put dest_src in RAX? - auto Nax = LIKE_REG(e.rax, dest_src); - auto Ndx = LIKE_REG(e.rdx, dest_src); - e.mov(Nax, dest_src); - e.mov(Ndx, src); - if (i.flags & ARITHMETIC_UNSIGNED) { - e.mul(Ndx); - } else { - e.imul(Ndx); - } - e.mov(dest_src, Ndx); - }); + IntBinaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { + // RAX = value, RDX = clobbered + // TODO(benvanik): make the register allocator put dest_src in RAX? 
+ auto Nax = LIKE_REG(e.rax, dest_src); + auto Ndx = LIKE_REG(e.rdx, dest_src); + e.mov(Nax, dest_src); + if (i.flags & ARITHMETIC_UNSIGNED) { + e.mul(src); + } else { + e.imul(src); + } + e.mov(dest_src, Ndx); + }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { + // RAX = value, RDX = clobbered + // TODO(benvanik): make the register allocator put dest_src in RAX? + auto Nax = LIKE_REG(e.rax, dest_src); + auto Ndx = LIKE_REG(e.rdx, dest_src); + e.mov(Nax, dest_src); + e.mov(Ndx, src); + if (i.flags & ARITHMETIC_UNSIGNED) { + e.mul(Ndx); + } else { + e.imul(Ndx); + } + e.mov(dest_src, Ndx); + }); } else { UNIMPLEMENTED_SEQ(); } @@ -1957,34 +1939,31 @@ table->AddSequence(OPCODE_MUL_HI, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_DIV, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { - IntBinaryOp( - e, i, - [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { - // RAX = value, RDX = clobbered - // TODO(benvanik): make the register allocator put dest_src in RAX? - auto Nax = LIKE_REG(e.rax, dest_src); - e.mov(Nax, dest_src); - if (i.flags & ARITHMETIC_UNSIGNED) { - e.div(src); - } else { - e.idiv(src); - } - e.mov(dest_src, Nax); - }, - [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { - // RAX = value, RDX = clobbered - // TODO(benvanik): make the register allocator put dest_src in RAX? - auto Nax = LIKE_REG(e.rax, dest_src); - auto Ndx = LIKE_REG(e.rdx, dest_src); - e.mov(Nax, dest_src); - e.mov(Ndx, src); - if (i.flags & ARITHMETIC_UNSIGNED) { - e.div(Ndx); - } else { - e.idiv(Ndx); - } - e.mov(dest_src, Nax); - }); + IntBinaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { + // RAX = value, RDX = clobbered + // TODO(benvanik): make the register allocator put dest_src in RAX? + auto Nax = LIKE_REG(e.rax, dest_src); + e.mov(Nax, dest_src); + if (i.flags & ARITHMETIC_UNSIGNED) { + e.div(src); + } else { + e.idiv(src); + } + e.mov(dest_src, Nax); + }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { + // RAX = value, RDX = clobbered + // TODO(benvanik): make the register allocator put dest_src in RAX? 
+ auto Nax = LIKE_REG(e.rax, dest_src); + auto Ndx = LIKE_REG(e.rdx, dest_src); + e.mov(Nax, dest_src); + e.mov(Ndx, src); + if (i.flags & ARITHMETIC_UNSIGNED) { + e.div(Ndx); + } else { + e.idiv(Ndx); + } + e.mov(dest_src, Nax); + }); } else if (IsFloatType(i->dest->type)) { UNIMPLEMENTED_SEQ(); } else if (IsVecType(i->dest->type)) { @@ -2122,14 +2101,11 @@ table->AddSequence(OPCODE_DOT_PRODUCT_4, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_AND, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { - IntBinaryOp( - e, i, - [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { - e.and(dest_src, src); - }, - [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { - e.and(dest_src, src); - }); + IntBinaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { + e.and(dest_src, src); + }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { + e.and(dest_src, src); + }); } else if (IsVecType(i->dest->type)) { UNIMPLEMENTED_SEQ(); } else { @@ -2141,14 +2117,11 @@ table->AddSequence(OPCODE_AND, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_OR, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { - IntBinaryOp( - e, i, - [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { - e.or(dest_src, src); - }, - [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { - e.or(dest_src, src); - }); + IntBinaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { + e.or(dest_src, src); + }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { + e.or(dest_src, src); + }); } else if (IsVecType(i->dest->type)) { UNIMPLEMENTED_SEQ(); } else { @@ -2160,14 +2133,11 @@ table->AddSequence(OPCODE_OR, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_XOR, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { - IntBinaryOp( - e, i, - [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { - e.xor(dest_src, src); - }, - [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { - e.xor(dest_src, src); - }); + IntBinaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { + e.xor(dest_src, src); + }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { + e.xor(dest_src, src); + }); } else if (IsVecType(i->dest->type)) { UNIMPLEMENTED_SEQ(); } else { @@ -2179,11 +2149,9 @@ table->AddSequence(OPCODE_XOR, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_NOT, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { - IntUnaryOp( - e, i, - [](X64Emitter& e, Instr& i, const Reg& dest_src) { - e.not(dest_src); - }); + IntUnaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src) { + e.not(dest_src); + }); } else if (IsVecType(i->dest->type)) { UNIMPLEMENTED_SEQ(); } else { @@ -2196,24 +2164,21 @@ table->AddSequence(OPCODE_NOT, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_SHL, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { // TODO(benvanik): use shlx if available. - IntBinaryOp( - e, i, - [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { - // Can only shl by cl. Eww x86. - Reg8 shamt(src.getIdx()); - e.mov(e.rax, e.rcx); - e.mov(e.cl, shamt); - e.shl(dest_src, e.cl); - e.mov(e.rcx, e.rax); - // BeaEngine can't disasm this, boo. 
- /*Reg32e dest_src_e(dest_src.getIdx(), MAX(dest_src.getBit(), 32)); - Reg32e src_e(src.getIdx(), MAX(dest_src.getBit(), 32)); - e.and(src_e, 0x3F); - e.shlx(dest_src_e, dest_src_e, src_e);*/ - }, - [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { - e.shl(dest_src, src); - }); + IntBinaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { + // Can only shl by cl. Eww x86. + Reg8 shamt(src.getIdx()); + e.mov(e.rax, e.rcx); + e.mov(e.cl, shamt); + e.shl(dest_src, e.cl); + e.mov(e.rcx, e.rax); + // BeaEngine can't disasm this, boo. + /*Reg32e dest_src_e(dest_src.getIdx(), MAX(dest_src.getBit(), 32)); + Reg32e src_e(src.getIdx(), MAX(dest_src.getBit(), 32)); + e.and(src_e, 0x3F); + e.shlx(dest_src_e, dest_src_e, src_e);*/ + }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { + e.shl(dest_src, src); + }); } else { UNIMPLEMENTED_SEQ(); } @@ -2224,19 +2189,16 @@ table->AddSequence(OPCODE_SHL, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_SHR, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { // TODO(benvanik): use shrx if available. - IntBinaryOp( - e, i, - [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { - // Can only sar by cl. Eww x86. - Reg8 shamt(src.getIdx()); - e.mov(e.rax, e.rcx); - e.mov(e.cl, shamt); - e.shr(dest_src, e.cl); - e.mov(e.rcx, e.rax); - }, - [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { - e.shr(dest_src, src); - }); + IntBinaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { + // Can only sar by cl. Eww x86. + Reg8 shamt(src.getIdx()); + e.mov(e.rax, e.rcx); + e.mov(e.cl, shamt); + e.shr(dest_src, e.cl); + e.mov(e.rcx, e.rax); + }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { + e.shr(dest_src, src); + }); } else { UNIMPLEMENTED_SEQ(); } @@ -2247,19 +2209,16 @@ table->AddSequence(OPCODE_SHR, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_SHA, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { // TODO(benvanik): use sarx if available. - IntBinaryOp( - e, i, - [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { - // Can only sar by cl. Eww x86. - Reg8 shamt(src.getIdx()); - e.mov(e.rax, e.rcx); - e.mov(e.cl, shamt); - e.sar(dest_src, e.cl); - e.mov(e.rcx, e.rax); - }, - [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { - e.sar(dest_src, src); - }); + IntBinaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { + // Can only sar by cl. Eww x86. + Reg8 shamt(src.getIdx()); + e.mov(e.rax, e.rcx); + e.mov(e.cl, shamt); + e.sar(dest_src, e.cl); + e.mov(e.rcx, e.rax); + }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { + e.sar(dest_src, src); + }); } else { UNIMPLEMENTED_SEQ(); } @@ -2323,19 +2282,16 @@ table->AddSequence(OPCODE_VECTOR_SHA, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_ROTATE_LEFT, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { - IntBinaryOp( - e, i, - [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { - // Can only rol by cl. Eww x86. - Reg8 shamt(src.getIdx()); - e.mov(e.rax, e.rcx); - e.mov(e.cl, shamt); - e.rol(dest_src, e.cl); - e.mov(e.rcx, e.rax); - }, - [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { - e.rol(dest_src, src); - }); + IntBinaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { + // Can only rol by cl. Eww x86. 
+ Reg8 shamt(src.getIdx()); + e.mov(e.rax, e.rcx); + e.mov(e.cl, shamt); + e.rol(dest_src, e.cl); + e.mov(e.rcx, e.rax); + }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { + e.rol(dest_src, src); + }); } else { UNIMPLEMENTED_SEQ(); } @@ -2584,9 +2540,7 @@ table->AddSequence(OPCODE_UNPACK, [](X64Emitter& e, Instr*& i) { // Load source, move from tight pack of X16Y16.... to X16...Y16... // Also zero out the high end. // TODO(benvanik): special case constant unpacks that just get 0/1/etc. - IntUnaryOp( - e, i, - [](X64Emitter& e, Instr& i, const Reg& dest_src) { + IntUnaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src) { // sx = src.iw >> 16; // sy = src.iw & 0xFFFF; // dest = { 3.0 + (sx / float(1 << 22)), From 6c6f10ad485b0b4529422f4bb7883446627db34a Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 26 Jan 2014 21:40:25 -0800 Subject: [PATCH 021/184] Moving op templates to op_utils.inl. --- .../x64/lowering/lowering_sequences.cc | 559 +---------------- src/alloy/backend/x64/lowering/op_utils.inl | 574 ++++++++++++++++++ src/alloy/backend/x64/lowering/sources.gypi | 1 + 3 files changed, 579 insertions(+), 555 deletions(-) create mode 100644 src/alloy/backend/x64/lowering/op_utils.inl diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index 564f66bf3..c2969be3f 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -154,564 +154,13 @@ void IssueCallIndirect(X64Emitter& e, Value* target, uint32_t flags) { } } -// Sets EFLAGs with zf for the given value. -// ZF = 1 if false, 0 = true (so jz = jump if false) -void CheckBoolean(X64Emitter& e, Value* v) { - if (v->IsConstant()) { - e.mov(e.ah, (v->IsConstantZero() ? 
1 : 0) << 6); - e.sahf(); - } else if (v->type == INT8_TYPE) { - Reg8 src; - e.BeginOp(v, src, 0); - e.test(src, src); - e.EndOp(src); - } else if (v->type == INT16_TYPE) { - Reg16 src; - e.BeginOp(v, src, 0); - e.test(src, src); - e.EndOp(src); - } else if (v->type == INT32_TYPE) { - Reg32 src; - e.BeginOp(v, src, 0); - e.test(src, src); - e.EndOp(src); - } else if (v->type == INT64_TYPE) { - Reg64 src; - e.BeginOp(v, src, 0); - e.test(src, src); - e.EndOp(src); - } else if (v->type == FLOAT32_TYPE) { - UNIMPLEMENTED_SEQ(); - } else if (v->type == FLOAT64_TYPE) { - UNIMPLEMENTED_SEQ(); - } else if (v->type == VEC128_TYPE) { - UNIMPLEMENTED_SEQ(); - } else { - ASSERT_INVALID_TYPE(); - } -} - -void CompareXX(X64Emitter& e, Instr*& i, void(set_fn)(X64Emitter& e, Reg8& dest, bool invert)) { - if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8, SIG_TYPE_I8)) { - Reg8 dest; - Reg8 src1, src2; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src2.value, src2, 0); - e.cmp(src1, src2); - set_fn(e, dest, false); - e.EndOp(dest, src1, src2); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8, SIG_TYPE_I8C)) { - Reg8 dest; - Reg8 src1; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0); - e.cmp(src1, i->src2.value->constant.i8); - set_fn(e, dest, false); - e.EndOp(dest, src1); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8C, SIG_TYPE_I8)) { - Reg8 dest; - Reg8 src2; - e.BeginOp(i->dest, dest, REG_DEST, - i->src2.value, src2, 0); - e.cmp(src2, i->src1.value->constant.i8); - set_fn(e, dest, true); - e.EndOp(dest, src2); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16, SIG_TYPE_I16)) { - Reg8 dest; - Reg16 src1, src2; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src2.value, src2, 0); - e.cmp(src1, src2); - set_fn(e, dest, false); - e.EndOp(dest, src1, src2); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16, SIG_TYPE_I16C)) { - Reg8 dest; - Reg16 src1; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0); - e.cmp(src1, i->src2.value->constant.i16); - set_fn(e, dest, false); - e.EndOp(dest, src1); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16C, SIG_TYPE_I16)) { - Reg8 dest; - Reg16 src2; - e.BeginOp(i->dest, dest, REG_DEST, - i->src2.value, src2, 0); - e.cmp(src2, i->src1.value->constant.i16); - e.sete(dest); - set_fn(e, dest, true); - e.EndOp(dest, src2); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32, SIG_TYPE_I32)) { - Reg8 dest; - Reg32 src1, src2; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src2.value, src2, 0); - e.cmp(src1, src2); - set_fn(e, dest, false); - e.EndOp(dest, src1, src2); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32, SIG_TYPE_I32C)) { - Reg8 dest; - Reg32 src1; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0); - e.cmp(src1, i->src2.value->constant.i32); - set_fn(e, dest, false); - e.EndOp(dest, src1); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32C, SIG_TYPE_I32)) { - Reg8 dest; - Reg32 src2; - e.BeginOp(i->dest, dest, REG_DEST, - i->src2.value, src2, 0); - e.cmp(src2, i->src1.value->constant.i32); - set_fn(e, dest, true); - e.EndOp(dest, src2); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64, SIG_TYPE_I64)) { - Reg8 dest; - Reg64 src1, src2; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src2.value, src2, 0); - e.cmp(src1, src2); - set_fn(e, dest, false); - e.EndOp(dest, src1, src2); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64, SIG_TYPE_I64C)) { - Reg8 dest; - Reg64 src1; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, 
src1, 0); - e.mov(e.rax, i->src2.value->constant.i64); - e.cmp(src1, e.rax); - set_fn(e, dest, false); - e.EndOp(dest, src1); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64C, SIG_TYPE_I64)) { - Reg8 dest; - Reg64 src2; - e.BeginOp(i->dest, dest, REG_DEST, - i->src2.value, src2, 0); - e.mov(e.rax, i->src1.value->constant.i64); - e.cmp(src2, e.rax); - set_fn(e, dest, true); - e.EndOp(dest, src2); - } else { - UNIMPLEMENTED_SEQ(); - } -}; - -typedef void(v_fn)(X64Emitter& e, Instr& i, const Reg& dest_src); -template -void IntUnaryOpV(X64Emitter& e, Instr*& i, v_fn v_fn, - T& dest, T& src1) { - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0); - if (dest == src1) { - v_fn(e, *i, dest); - } else { - e.mov(dest, src1); - v_fn(e, *i, dest); - } - e.EndOp(dest, src1); -} -template -void IntUnaryOpC(X64Emitter& e, Instr*& i, v_fn v_fn, - T& dest, Value* src1) { - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, (uint64_t)src1->get_constant(CT())); - v_fn(e, *i, dest); - e.EndOp(dest); -} -void IntUnaryOp(X64Emitter& e, Instr*& i, v_fn v_fn) { - if (i->Match(SIG_TYPE_I8, SIG_TYPE_I8)) { - Reg8 dest, src1; - IntUnaryOpV(e, i, v_fn, dest, src1); - } else if (i->Match(SIG_TYPE_I8, SIG_TYPE_I8C)) { - Reg8 dest; - IntUnaryOpC(e, i, v_fn, dest, i->src1.value); - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16)) { - Reg16 dest, src1; - IntUnaryOpV(e, i, v_fn, dest, src1); - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16C)) { - Reg16 dest; - IntUnaryOpC(e, i, v_fn, dest, i->src1.value); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32)) { - Reg32 dest, src1; - IntUnaryOpV(e, i, v_fn, dest, src1); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32C)) { - Reg32 dest; - IntUnaryOpC(e, i, v_fn, dest, i->src1.value); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64)) { - Reg64 dest, src1; - IntUnaryOpV(e, i, v_fn, dest, src1); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64C)) { - Reg64 dest; - IntUnaryOpC(e, i, v_fn, dest, i->src1.value); - } else { - ASSERT_INVALID_TYPE(); - } - if (i->flags & ARITHMETIC_SET_CARRY) { - // EFLAGS should have CA set? - // (so long as we don't fuck with it) - // UNIMPLEMENTED_SEQ(); - } -}; - -typedef void(vv_fn)(X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src); -typedef void(vc_fn)(X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src); -template -void IntBinaryOpVV(X64Emitter& e, Instr*& i, vv_fn vv_fn, - TD& dest, TS1& src1, TS2& src2) { - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src2.value, src2, 0); - if (dest == src1) { - vv_fn(e, *i, dest, src2); - } else if (dest == src2) { - if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { - vv_fn(e, *i, dest, src1); - } else { - // Eww. - e.mov(e.rax, src1); - vv_fn(e, *i, e.rax, src2); - e.mov(dest, e.rax); - } - } else { - e.mov(dest, src1); - vv_fn(e, *i, dest, src2); - } - e.EndOp(dest, src1, src2); -} -template -void IntBinaryOpVC(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn, - TD& dest, TS1& src1, Value* src2) { - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0); - if (dest.getBit() <= 32) { - // 32-bit. - if (dest == src1) { - vc_fn(e, *i, dest, (uint32_t)src2->get_constant(CT())); - } else { - e.mov(dest, src1); - vc_fn(e, *i, dest, (uint32_t)src2->get_constant(CT())); - } - } else { - // 64-bit. 
- if (dest == src1) { - e.mov(e.rax, src2->constant.i64); - vv_fn(e, *i, dest, e.rax); - } else { - e.mov(e.rax, src2->constant.i64); - e.mov(dest, src1); - vv_fn(e, *i, dest, e.rax); - } - } - e.EndOp(dest, src1); -} -template -void IntBinaryOpCV(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn, - TD& dest, Value* src1, TS2& src2) { - e.BeginOp(i->dest, dest, REG_DEST, - i->src2.value, src2, 0); - if (dest.getBit() <= 32) { - // 32-bit. - if (dest == src2) { - if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { - vc_fn(e, *i, dest, (uint32_t)src1->get_constant(CT())); - } else { - // Eww. - e.mov(e.rax, src2); - e.mov(dest, (uint32_t)src1->get_constant(CT())); - vv_fn(e, *i, dest, e.rax); - } - } else { - e.mov(dest, src2); - vc_fn(e, *i, dest, (uint32_t)src1->get_constant(CT())); - } - } else { - // 64-bit. - if (dest == src2) { - if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { - e.mov(e.rax, src1->constant.i64); - vv_fn(e, *i, dest, e.rax); - } else { - // Eww. - e.mov(e.rax, src1->constant.i64); - vv_fn(e, *i, e.rax, src2); - e.mov(dest, e.rax); - } - } else { - e.mov(e.rax, src2); - e.mov(dest, src1->constant.i64); - vv_fn(e, *i, dest, e.rax); - } - } - e.EndOp(dest, src2); -} -void IntBinaryOp(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn) { - // TODO(benvanik): table lookup. This linear scan is slow. - // Note: we assume DEST.type = SRC1.type, but that SRC2.type may vary. - XEASSERT(i->dest->type == i->src1.value->type); - if (i->Match(SIG_TYPE_I8, SIG_TYPE_I8, SIG_TYPE_I8)) { - Reg8 dest, src1, src2; - IntBinaryOpVV(e, i, vv_fn, dest, src1, src2); - } else if (i->Match(SIG_TYPE_I8, SIG_TYPE_I8, SIG_TYPE_I8C)) { - Reg8 dest, src1; - IntBinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); - } else if (i->Match(SIG_TYPE_I8, SIG_TYPE_I8C, SIG_TYPE_I8)) { - Reg8 dest, src2; - IntBinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I16)) { - Reg16 dest, src1, src2; - IntBinaryOpVV(e, i, vv_fn, dest, src1, src2); - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I16C)) { - Reg16 dest, src1; - IntBinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16C, SIG_TYPE_I16)) { - Reg16 dest, src2; - IntBinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I32)) { - Reg32 dest, src1, src2; - IntBinaryOpVV(e, i, vv_fn, dest, src1, src2); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I32C)) { - Reg32 dest, src1; - IntBinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32C, SIG_TYPE_I32)) { - Reg32 dest, src2; - IntBinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I64)) { - Reg64 dest, src1, src2; - IntBinaryOpVV(e, i, vv_fn, dest, src1, src2); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I64C)) { - Reg64 dest, src1; - IntBinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64C, SIG_TYPE_I64)) { - Reg64 dest, src2; - IntBinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); - // Start forced src2=i8 - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I8)) { - Reg16 dest, src1; - Reg8 src2; - IntBinaryOpVV(e, i, vv_fn, dest, src1, src2); - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I8C)) { - Reg16 dest, src1; - IntBinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); - } else 
if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16C, SIG_TYPE_I8)) { - Reg16 dest; - Reg8 src2; - IntBinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I8)) { - Reg32 dest, src1; - Reg8 src2; - IntBinaryOpVV(e, i, vv_fn, dest, src1, src2); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I8C)) { - Reg32 dest, src1; - IntBinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32C, SIG_TYPE_I8)) { - Reg32 dest; - Reg8 src2; - IntBinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I8)) { - Reg64 dest, src1; - Reg8 src2; - IntBinaryOpVV(e, i, vv_fn, dest, src1, src2); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I8C)) { - Reg64 dest, src1; - IntBinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64C, SIG_TYPE_I8)) { - Reg64 dest; - Reg8 src2; - IntBinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); - } else { - ASSERT_INVALID_TYPE(); - } - if (i->flags & ARITHMETIC_SET_CARRY) { - // EFLAGS should have CA set? - // (so long as we don't fuck with it) - // UNIMPLEMENTED_SEQ(); - } -}; - -typedef void(vvv_fn)(X64Emitter& e, Instr& i, const Reg& dest_src1, const Operand& src2, const Operand& src3); -typedef void(vvc_fn)(X64Emitter& e, Instr& i, const Reg& dest_src1, const Operand& src2, uint32_t src3); -typedef void(vcv_fn)(X64Emitter& e, Instr& i, const Reg& dest_src1, uint32_t src2, const Operand& src3); -template -void IntTernaryOpVVV(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, - TD& dest, TS1& src1, TS2& src2, TS3& src3) { - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src2.value, src2, 0, - i->src3.value, src3, 0); - if (dest == src1) { - vvv_fn(e, *i, dest, src2, src3); - } else if (dest == src2) { - if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { - vvv_fn(e, *i, dest, src1, src3); - } else { - UNIMPLEMENTED_SEQ(); - } - } else { - e.mov(dest, src1); - vvv_fn(e, *i, dest, src2, src3); - } - e.EndOp(dest, src1, src2, src3); -} -template -void IntTernaryOpVVC(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vvc_fn vvc_fn, - TD& dest, TS1& src1, TS2& src2, Value* src3) { - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src2.value, src2, 0); - if (dest.getBit() <= 32) { - // 32-bit. - if (dest == src1) { - vvc_fn(e, *i, dest, src2, (uint32_t)src3->get_constant(CT())); - } else if (dest == src2) { - if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { - vvc_fn(e, *i, dest, src1, (uint32_t)src3->get_constant(CT())); - } else { - // Eww. - e.mov(e.rax, src2); - e.mov(dest, src1); - vvc_fn(e, *i, dest, e.rax, (uint32_t)src3->get_constant(CT())); - } - } else { - e.mov(dest, src1); - vvc_fn(e, *i, dest, src2, (uint32_t)src3->get_constant(CT())); - } - } else { - // 64-bit. - if (dest == src1) { - e.mov(e.rax, src3->constant.i64); - vvv_fn(e, *i, dest, src2, e.rax); - } else if (dest == src2) { - if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { - e.mov(e.rax, src3->constant.i64); - vvv_fn(e, *i, dest, src1, e.rax); - } else { - // Eww. 
- e.mov(e.rax, src1); - e.mov(src1, src2); - e.mov(dest, e.rax); - e.mov(e.rax, src3->constant.i64); - vvv_fn(e, *i, dest, src1, e.rax); - } - } else { - e.mov(e.rax, src3->constant.i64); - e.mov(dest, src1); - vvv_fn(e, *i, dest, src2, e.rax); - } - } - e.EndOp(dest, src1, src2); -} -template -void IntTernaryOpVCV(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vcv_fn vcv_fn, - TD& dest, TS1& src1, Value* src2, TS3& src3) { - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src3.value, src3, 0); - if (dest.getBit() <= 32) { - // 32-bit. - if (dest == src1) { - vcv_fn(e, *i, dest, (uint32_t)src2->get_constant(CT()), src3); - } else if (dest == src3) { - if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { - vcv_fn(e, *i, dest, (uint32_t)src2->get_constant(CT()), src1); - } else { - // Eww. - e.mov(e.rax, src3); - e.mov(dest, src1); - vcv_fn(e, *i, dest, (uint32_t)src2->get_constant(CT()), e.rax); - } - } else { - e.mov(dest, src1); - vcv_fn(e, *i, dest, (uint32_t)src2->get_constant(CT()), src3); - } - } else { - // 64-bit. - if (dest == src1) { - e.mov(e.rax, src2->constant.i64); - vvv_fn(e, *i, dest, e.rax, src3); - } else if (dest == src3) { - if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { - e.mov(e.rax, src2->constant.i64); - vvv_fn(e, *i, dest, src1, e.rax); - } else { - // Eww. - e.mov(e.rax, src1); - e.mov(src1, src3); - e.mov(dest, e.rax); - e.mov(e.rax, src2->constant.i64); - vvv_fn(e, *i, dest, e.rax, src1); - } - } else { - e.mov(e.rax, src2->constant.i64); - e.mov(dest, src1); - vvv_fn(e, *i, dest, e.rax, src3); - } - } - e.EndOp(dest, src1, src3); -} -void IntTernaryOp(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vvc_fn vvc_fn, vcv_fn vcv_fn) { - // TODO(benvanik): table lookup. This linear scan is slow. - // Note: we assume DEST.type = SRC1.type = SRC2.type, but that SRC3.type may vary. - XEASSERT(i->dest->type == i->src1.value->type && - i->dest->type == i->src2.value->type); - // TODO(benvanik): table lookup. 
- if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8, SIG_TYPE_I8, SIG_TYPE_I8)) { - Reg8 dest, src1, src2; - Reg8 src3; - IntTernaryOpVVV(e, i, vvv_fn, dest, src1, src2, src3); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8, SIG_TYPE_I8, SIG_TYPE_I8C)) { - Reg8 dest, src1, src2; - IntTernaryOpVVC(e, i, vvv_fn, vvc_fn, dest, src1, src2, i->src3.value); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I8)) { - Reg16 dest, src1, src2; - Reg8 src3; - IntTernaryOpVVV(e, i, vvv_fn, dest, src1, src2, src3); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I8C)) { - Reg16 dest, src1, src2; - IntTernaryOpVVC(e, i, vvv_fn, vvc_fn, dest, src1, src2, i->src3.value); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I8)) { - Reg32 dest, src1, src2; - Reg8 src3; - IntTernaryOpVVV(e, i,vvv_fn, dest, src1, src2, src3); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I8C)) { - Reg32 dest, src1, src2; - IntTernaryOpVVC(e, i, vvv_fn, vvc_fn, dest, src1, src2, i->src3.value); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I8)) { - Reg64 dest, src1, src2; - Reg8 src3; - IntTernaryOpVVV(e, i, vvv_fn, dest, src1, src2, src3); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I8C)) { - Reg64 dest, src1, src2; - IntTernaryOpVVC(e, i, vvv_fn, vvc_fn, dest, src1, src2, i->src3.value); - // - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8, SIG_TYPE_I8C, SIG_TYPE_I8)) { - Reg8 dest, src1, src3; - IntTernaryOpVCV(e, i, vvv_fn, vcv_fn, dest, src1, i->src2.value, src3); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16, SIG_TYPE_I16C, SIG_TYPE_I8)) { - Reg16 dest, src1, src3; - IntTernaryOpVCV(e, i, vvv_fn, vcv_fn, dest, src1, i->src2.value, src3); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32, SIG_TYPE_I32C, SIG_TYPE_I8)) { - Reg32 dest, src1, src3; - IntTernaryOpVCV(e, i, vvv_fn, vcv_fn, dest, src1, i->src2.value, src3); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64, SIG_TYPE_I64C, SIG_TYPE_I8)) { - Reg64 dest, src1, src3; - IntTernaryOpVCV(e, i, vvv_fn, vcv_fn, dest, src1, i->src2.value, src3); - } else { - ASSERT_INVALID_TYPE(); - } - if (i->flags & ARITHMETIC_SET_CARRY) { - // EFLAGS should have CA set? - // (so long as we don't fuck with it) - // UNIMPLEMENTED_SEQ(); - } -} - } // namespace +// Major templating foo lives in here. +#include + + void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { // -------------------------------------------------------------------------- // General diff --git a/src/alloy/backend/x64/lowering/op_utils.inl b/src/alloy/backend/x64/lowering/op_utils.inl new file mode 100644 index 000000000..8e502bd63 --- /dev/null +++ b/src/alloy/backend/x64/lowering/op_utils.inl @@ -0,0 +1,574 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +// NOTE: this file is only designed to be included by lowering_sequencies.cc! + +#ifndef ALLOY_BACKEND_X64_X64_LOWERING_OP_UTILS_INL_ +#define ALLOY_BACKEND_X64_X64_LOWERING_OP_UTILS_INL_ + +namespace { + +// Sets EFLAGs with zf for the given value. 
+// ZF = 1 if false, 0 = true (so jz = jump if false) +void CheckBoolean(X64Emitter& e, Value* v) { + if (v->IsConstant()) { + e.mov(e.ah, (v->IsConstantZero() ? 1 : 0) << 6); + e.sahf(); + } else if (v->type == INT8_TYPE) { + Reg8 src; + e.BeginOp(v, src, 0); + e.test(src, src); + e.EndOp(src); + } else if (v->type == INT16_TYPE) { + Reg16 src; + e.BeginOp(v, src, 0); + e.test(src, src); + e.EndOp(src); + } else if (v->type == INT32_TYPE) { + Reg32 src; + e.BeginOp(v, src, 0); + e.test(src, src); + e.EndOp(src); + } else if (v->type == INT64_TYPE) { + Reg64 src; + e.BeginOp(v, src, 0); + e.test(src, src); + e.EndOp(src); + } else if (v->type == FLOAT32_TYPE) { + UNIMPLEMENTED_SEQ(); + } else if (v->type == FLOAT64_TYPE) { + UNIMPLEMENTED_SEQ(); + } else if (v->type == VEC128_TYPE) { + UNIMPLEMENTED_SEQ(); + } else { + ASSERT_INVALID_TYPE(); + } +} + +void CompareXX(X64Emitter& e, Instr*& i, void(set_fn)(X64Emitter& e, Reg8& dest, bool invert)) { + if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8, SIG_TYPE_I8)) { + Reg8 dest; + Reg8 src1, src2; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0, + i->src2.value, src2, 0); + e.cmp(src1, src2); + set_fn(e, dest, false); + e.EndOp(dest, src1, src2); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8, SIG_TYPE_I8C)) { + Reg8 dest; + Reg8 src1; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0); + e.cmp(src1, i->src2.value->constant.i8); + set_fn(e, dest, false); + e.EndOp(dest, src1); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8C, SIG_TYPE_I8)) { + Reg8 dest; + Reg8 src2; + e.BeginOp(i->dest, dest, REG_DEST, + i->src2.value, src2, 0); + e.cmp(src2, i->src1.value->constant.i8); + set_fn(e, dest, true); + e.EndOp(dest, src2); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16, SIG_TYPE_I16)) { + Reg8 dest; + Reg16 src1, src2; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0, + i->src2.value, src2, 0); + e.cmp(src1, src2); + set_fn(e, dest, false); + e.EndOp(dest, src1, src2); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16, SIG_TYPE_I16C)) { + Reg8 dest; + Reg16 src1; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0); + e.cmp(src1, i->src2.value->constant.i16); + set_fn(e, dest, false); + e.EndOp(dest, src1); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16C, SIG_TYPE_I16)) { + Reg8 dest; + Reg16 src2; + e.BeginOp(i->dest, dest, REG_DEST, + i->src2.value, src2, 0); + e.cmp(src2, i->src1.value->constant.i16); + e.sete(dest); + set_fn(e, dest, true); + e.EndOp(dest, src2); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32, SIG_TYPE_I32)) { + Reg8 dest; + Reg32 src1, src2; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0, + i->src2.value, src2, 0); + e.cmp(src1, src2); + set_fn(e, dest, false); + e.EndOp(dest, src1, src2); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32, SIG_TYPE_I32C)) { + Reg8 dest; + Reg32 src1; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0); + e.cmp(src1, i->src2.value->constant.i32); + set_fn(e, dest, false); + e.EndOp(dest, src1); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32C, SIG_TYPE_I32)) { + Reg8 dest; + Reg32 src2; + e.BeginOp(i->dest, dest, REG_DEST, + i->src2.value, src2, 0); + e.cmp(src2, i->src1.value->constant.i32); + set_fn(e, dest, true); + e.EndOp(dest, src2); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64, SIG_TYPE_I64)) { + Reg8 dest; + Reg64 src1, src2; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0, + i->src2.value, src2, 0); + e.cmp(src1, src2); + set_fn(e, dest, false); + e.EndOp(dest, src1, 
src2); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64, SIG_TYPE_I64C)) { + Reg8 dest; + Reg64 src1; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0); + e.mov(e.rax, i->src2.value->constant.i64); + e.cmp(src1, e.rax); + set_fn(e, dest, false); + e.EndOp(dest, src1); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64C, SIG_TYPE_I64)) { + Reg8 dest; + Reg64 src2; + e.BeginOp(i->dest, dest, REG_DEST, + i->src2.value, src2, 0); + e.mov(e.rax, i->src1.value->constant.i64); + e.cmp(src2, e.rax); + set_fn(e, dest, true); + e.EndOp(dest, src2); + } else { + UNIMPLEMENTED_SEQ(); + } +}; + +typedef void(v_fn)(X64Emitter& e, Instr& i, const Reg& dest_src); +template +void IntUnaryOpV(X64Emitter& e, Instr*& i, v_fn v_fn, + T& dest, T& src1) { + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0); + if (dest == src1) { + v_fn(e, *i, dest); + } else { + e.mov(dest, src1); + v_fn(e, *i, dest); + } + e.EndOp(dest, src1); +} +template +void IntUnaryOpC(X64Emitter& e, Instr*& i, v_fn v_fn, + T& dest, Value* src1) { + e.BeginOp(i->dest, dest, REG_DEST); + e.mov(dest, (uint64_t)src1->get_constant(CT())); + v_fn(e, *i, dest); + e.EndOp(dest); +} +void IntUnaryOp(X64Emitter& e, Instr*& i, v_fn v_fn) { + if (i->Match(SIG_TYPE_I8, SIG_TYPE_I8)) { + Reg8 dest, src1; + IntUnaryOpV(e, i, v_fn, dest, src1); + } else if (i->Match(SIG_TYPE_I8, SIG_TYPE_I8C)) { + Reg8 dest; + IntUnaryOpC(e, i, v_fn, dest, i->src1.value); + } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16)) { + Reg16 dest, src1; + IntUnaryOpV(e, i, v_fn, dest, src1); + } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16C)) { + Reg16 dest; + IntUnaryOpC(e, i, v_fn, dest, i->src1.value); + } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32)) { + Reg32 dest, src1; + IntUnaryOpV(e, i, v_fn, dest, src1); + } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32C)) { + Reg32 dest; + IntUnaryOpC(e, i, v_fn, dest, i->src1.value); + } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64)) { + Reg64 dest, src1; + IntUnaryOpV(e, i, v_fn, dest, src1); + } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64C)) { + Reg64 dest; + IntUnaryOpC(e, i, v_fn, dest, i->src1.value); + } else { + ASSERT_INVALID_TYPE(); + } + if (i->flags & ARITHMETIC_SET_CARRY) { + // EFLAGS should have CA set? + // (so long as we don't fuck with it) + // UNIMPLEMENTED_SEQ(); + } +}; + +typedef void(vv_fn)(X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src); +typedef void(vc_fn)(X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src); +template +void IntBinaryOpVV(X64Emitter& e, Instr*& i, vv_fn vv_fn, + TD& dest, TS1& src1, TS2& src2) { + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0, + i->src2.value, src2, 0); + if (dest == src1) { + vv_fn(e, *i, dest, src2); + } else if (dest == src2) { + if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { + vv_fn(e, *i, dest, src1); + } else { + // Eww. + e.mov(e.rax, src1); + vv_fn(e, *i, e.rax, src2); + e.mov(dest, e.rax); + } + } else { + e.mov(dest, src1); + vv_fn(e, *i, dest, src2); + } + e.EndOp(dest, src1, src2); +} +template +void IntBinaryOpVC(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn, + TD& dest, TS1& src1, Value* src2) { + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0); + if (dest.getBit() <= 32) { + // 32-bit. + if (dest == src1) { + vc_fn(e, *i, dest, (uint32_t)src2->get_constant(CT())); + } else { + e.mov(dest, src1); + vc_fn(e, *i, dest, (uint32_t)src2->get_constant(CT())); + } + } else { + // 64-bit. 
+ if (dest == src1) { + e.mov(e.rax, src2->constant.i64); + vv_fn(e, *i, dest, e.rax); + } else { + e.mov(e.rax, src2->constant.i64); + e.mov(dest, src1); + vv_fn(e, *i, dest, e.rax); + } + } + e.EndOp(dest, src1); +} +template +void IntBinaryOpCV(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn, + TD& dest, Value* src1, TS2& src2) { + e.BeginOp(i->dest, dest, REG_DEST, + i->src2.value, src2, 0); + if (dest.getBit() <= 32) { + // 32-bit. + if (dest == src2) { + if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { + vc_fn(e, *i, dest, (uint32_t)src1->get_constant(CT())); + } else { + // Eww. + e.mov(e.rax, src2); + e.mov(dest, (uint32_t)src1->get_constant(CT())); + vv_fn(e, *i, dest, e.rax); + } + } else { + e.mov(dest, src2); + vc_fn(e, *i, dest, (uint32_t)src1->get_constant(CT())); + } + } else { + // 64-bit. + if (dest == src2) { + if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { + e.mov(e.rax, src1->constant.i64); + vv_fn(e, *i, dest, e.rax); + } else { + // Eww. + e.mov(e.rax, src1->constant.i64); + vv_fn(e, *i, e.rax, src2); + e.mov(dest, e.rax); + } + } else { + e.mov(e.rax, src2); + e.mov(dest, src1->constant.i64); + vv_fn(e, *i, dest, e.rax); + } + } + e.EndOp(dest, src2); +} +void IntBinaryOp(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn) { + // TODO(benvanik): table lookup. This linear scan is slow. + // Note: we assume DEST.type = SRC1.type, but that SRC2.type may vary. + XEASSERT(i->dest->type == i->src1.value->type); + if (i->Match(SIG_TYPE_I8, SIG_TYPE_I8, SIG_TYPE_I8)) { + Reg8 dest, src1, src2; + IntBinaryOpVV(e, i, vv_fn, dest, src1, src2); + } else if (i->Match(SIG_TYPE_I8, SIG_TYPE_I8, SIG_TYPE_I8C)) { + Reg8 dest, src1; + IntBinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); + } else if (i->Match(SIG_TYPE_I8, SIG_TYPE_I8C, SIG_TYPE_I8)) { + Reg8 dest, src2; + IntBinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); + } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I16)) { + Reg16 dest, src1, src2; + IntBinaryOpVV(e, i, vv_fn, dest, src1, src2); + } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I16C)) { + Reg16 dest, src1; + IntBinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); + } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16C, SIG_TYPE_I16)) { + Reg16 dest, src2; + IntBinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); + } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I32)) { + Reg32 dest, src1, src2; + IntBinaryOpVV(e, i, vv_fn, dest, src1, src2); + } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I32C)) { + Reg32 dest, src1; + IntBinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); + } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32C, SIG_TYPE_I32)) { + Reg32 dest, src2; + IntBinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); + } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I64)) { + Reg64 dest, src1, src2; + IntBinaryOpVV(e, i, vv_fn, dest, src1, src2); + } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I64C)) { + Reg64 dest, src1; + IntBinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); + } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64C, SIG_TYPE_I64)) { + Reg64 dest, src2; + IntBinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); + // Start forced src2=i8 + } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I8)) { + Reg16 dest, src1; + Reg8 src2; + IntBinaryOpVV(e, i, vv_fn, dest, src1, src2); + } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I8C)) { + Reg16 dest, src1; + IntBinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); + } else 
if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16C, SIG_TYPE_I8)) { + Reg16 dest; + Reg8 src2; + IntBinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); + } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I8)) { + Reg32 dest, src1; + Reg8 src2; + IntBinaryOpVV(e, i, vv_fn, dest, src1, src2); + } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I8C)) { + Reg32 dest, src1; + IntBinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); + } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32C, SIG_TYPE_I8)) { + Reg32 dest; + Reg8 src2; + IntBinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); + } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I8)) { + Reg64 dest, src1; + Reg8 src2; + IntBinaryOpVV(e, i, vv_fn, dest, src1, src2); + } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I8C)) { + Reg64 dest, src1; + IntBinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); + } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64C, SIG_TYPE_I8)) { + Reg64 dest; + Reg8 src2; + IntBinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); + } else { + ASSERT_INVALID_TYPE(); + } + if (i->flags & ARITHMETIC_SET_CARRY) { + // EFLAGS should have CA set? + // (so long as we don't fuck with it) + // UNIMPLEMENTED_SEQ(); + } +}; + +typedef void(vvv_fn)(X64Emitter& e, Instr& i, const Reg& dest_src1, const Operand& src2, const Operand& src3); +typedef void(vvc_fn)(X64Emitter& e, Instr& i, const Reg& dest_src1, const Operand& src2, uint32_t src3); +typedef void(vcv_fn)(X64Emitter& e, Instr& i, const Reg& dest_src1, uint32_t src2, const Operand& src3); +template +void IntTernaryOpVVV(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, + TD& dest, TS1& src1, TS2& src2, TS3& src3) { + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0, + i->src2.value, src2, 0, + i->src3.value, src3, 0); + if (dest == src1) { + vvv_fn(e, *i, dest, src2, src3); + } else if (dest == src2) { + if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { + vvv_fn(e, *i, dest, src1, src3); + } else { + UNIMPLEMENTED_SEQ(); + } + } else { + e.mov(dest, src1); + vvv_fn(e, *i, dest, src2, src3); + } + e.EndOp(dest, src1, src2, src3); +} +template +void IntTernaryOpVVC(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vvc_fn vvc_fn, + TD& dest, TS1& src1, TS2& src2, Value* src3) { + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0, + i->src2.value, src2, 0); + if (dest.getBit() <= 32) { + // 32-bit. + if (dest == src1) { + vvc_fn(e, *i, dest, src2, (uint32_t)src3->get_constant(CT())); + } else if (dest == src2) { + if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { + vvc_fn(e, *i, dest, src1, (uint32_t)src3->get_constant(CT())); + } else { + // Eww. + e.mov(e.rax, src2); + e.mov(dest, src1); + vvc_fn(e, *i, dest, e.rax, (uint32_t)src3->get_constant(CT())); + } + } else { + e.mov(dest, src1); + vvc_fn(e, *i, dest, src2, (uint32_t)src3->get_constant(CT())); + } + } else { + // 64-bit. + if (dest == src1) { + e.mov(e.rax, src3->constant.i64); + vvv_fn(e, *i, dest, src2, e.rax); + } else if (dest == src2) { + if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { + e.mov(e.rax, src3->constant.i64); + vvv_fn(e, *i, dest, src1, e.rax); + } else { + // Eww. 
+ e.mov(e.rax, src1); + e.mov(src1, src2); + e.mov(dest, e.rax); + e.mov(e.rax, src3->constant.i64); + vvv_fn(e, *i, dest, src1, e.rax); + } + } else { + e.mov(e.rax, src3->constant.i64); + e.mov(dest, src1); + vvv_fn(e, *i, dest, src2, e.rax); + } + } + e.EndOp(dest, src1, src2); +} +template +void IntTernaryOpVCV(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vcv_fn vcv_fn, + TD& dest, TS1& src1, Value* src2, TS3& src3) { + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0, + i->src3.value, src3, 0); + if (dest.getBit() <= 32) { + // 32-bit. + if (dest == src1) { + vcv_fn(e, *i, dest, (uint32_t)src2->get_constant(CT()), src3); + } else if (dest == src3) { + if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { + vcv_fn(e, *i, dest, (uint32_t)src2->get_constant(CT()), src1); + } else { + // Eww. + e.mov(e.rax, src3); + e.mov(dest, src1); + vcv_fn(e, *i, dest, (uint32_t)src2->get_constant(CT()), e.rax); + } + } else { + e.mov(dest, src1); + vcv_fn(e, *i, dest, (uint32_t)src2->get_constant(CT()), src3); + } + } else { + // 64-bit. + if (dest == src1) { + e.mov(e.rax, src2->constant.i64); + vvv_fn(e, *i, dest, e.rax, src3); + } else if (dest == src3) { + if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { + e.mov(e.rax, src2->constant.i64); + vvv_fn(e, *i, dest, src1, e.rax); + } else { + // Eww. + e.mov(e.rax, src1); + e.mov(src1, src3); + e.mov(dest, e.rax); + e.mov(e.rax, src2->constant.i64); + vvv_fn(e, *i, dest, e.rax, src1); + } + } else { + e.mov(e.rax, src2->constant.i64); + e.mov(dest, src1); + vvv_fn(e, *i, dest, e.rax, src3); + } + } + e.EndOp(dest, src1, src3); +} +void IntTernaryOp(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vvc_fn vvc_fn, vcv_fn vcv_fn) { + // TODO(benvanik): table lookup. This linear scan is slow. + // Note: we assume DEST.type = SRC1.type = SRC2.type, but that SRC3.type may vary. + XEASSERT(i->dest->type == i->src1.value->type && + i->dest->type == i->src2.value->type); + // TODO(benvanik): table lookup. 
+ if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8, SIG_TYPE_I8, SIG_TYPE_I8)) { + Reg8 dest, src1, src2; + Reg8 src3; + IntTernaryOpVVV(e, i, vvv_fn, dest, src1, src2, src3); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8, SIG_TYPE_I8, SIG_TYPE_I8C)) { + Reg8 dest, src1, src2; + IntTernaryOpVVC(e, i, vvv_fn, vvc_fn, dest, src1, src2, i->src3.value); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I8)) { + Reg16 dest, src1, src2; + Reg8 src3; + IntTernaryOpVVV(e, i, vvv_fn, dest, src1, src2, src3); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I8C)) { + Reg16 dest, src1, src2; + IntTernaryOpVVC(e, i, vvv_fn, vvc_fn, dest, src1, src2, i->src3.value); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I8)) { + Reg32 dest, src1, src2; + Reg8 src3; + IntTernaryOpVVV(e, i,vvv_fn, dest, src1, src2, src3); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I8C)) { + Reg32 dest, src1, src2; + IntTernaryOpVVC(e, i, vvv_fn, vvc_fn, dest, src1, src2, i->src3.value); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I8)) { + Reg64 dest, src1, src2; + Reg8 src3; + IntTernaryOpVVV(e, i, vvv_fn, dest, src1, src2, src3); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I8C)) { + Reg64 dest, src1, src2; + IntTernaryOpVVC(e, i, vvv_fn, vvc_fn, dest, src1, src2, i->src3.value); + // + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8, SIG_TYPE_I8C, SIG_TYPE_I8)) { + Reg8 dest, src1, src3; + IntTernaryOpVCV(e, i, vvv_fn, vcv_fn, dest, src1, i->src2.value, src3); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16, SIG_TYPE_I16C, SIG_TYPE_I8)) { + Reg16 dest, src1, src3; + IntTernaryOpVCV(e, i, vvv_fn, vcv_fn, dest, src1, i->src2.value, src3); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32, SIG_TYPE_I32C, SIG_TYPE_I8)) { + Reg32 dest, src1, src3; + IntTernaryOpVCV(e, i, vvv_fn, vcv_fn, dest, src1, i->src2.value, src3); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64, SIG_TYPE_I64C, SIG_TYPE_I8)) { + Reg64 dest, src1, src3; + IntTernaryOpVCV(e, i, vvv_fn, vcv_fn, dest, src1, i->src2.value, src3); + } else { + ASSERT_INVALID_TYPE(); + } + if (i->flags & ARITHMETIC_SET_CARRY) { + // EFLAGS should have CA set? + // (so long as we don't fuck with it) + // UNIMPLEMENTED_SEQ(); + } +} + +} // namespace + +#endif // ALLOY_BACKEND_X64_X64_LOWERING_OP_UTILS_INL_ diff --git a/src/alloy/backend/x64/lowering/sources.gypi b/src/alloy/backend/x64/lowering/sources.gypi index 5c710cfcc..93a754180 100644 --- a/src/alloy/backend/x64/lowering/sources.gypi +++ b/src/alloy/backend/x64/lowering/sources.gypi @@ -5,5 +5,6 @@ 'lowering_sequences.h', 'lowering_table.cc', 'lowering_table.h', + 'op_utils.inl', ], } From 234aa4f543cbb7b84a9bdcf15fb885e70588aa98 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 26 Jan 2014 22:25:28 -0800 Subject: [PATCH 022/184] Some parts of vector comparison. 
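
Note on the unsigned paths: SSE2 only provides signed greater-than compares
(pcmpgtb/w/d), so the unsigned VECTOR_CMP_GT/GE branches below are left as
UNIMPLEMENTED_SEQ. The usual way to fill them in later is to bias both
operands by the lane sign bit so that a signed compare yields the unsigned
ordering. A minimal sketch for the INT8 case, in the style of the new
VectorCompareXX helper; this assumes e.xmm1 is free as a second scratch
register (the allocator does not guarantee that yet) and that dest already
holds src1, as arranged in the function body:

    // a >u b  ==  (a ^ 0x80) >s (b ^ 0x80), per byte lane.
    e.mov(e.rax, 0x8080808080808080ULL);
    e.movq(e.xmm1, e.rax);
    e.punpcklqdq(e.xmm1, e.xmm1);  // splat the sign-bit mask to all 16 bytes
    e.pxor(dest, e.xmm1);          // flip src1's sign bits (dest holds src1)
    e.movaps(e.xmm0, real_src);
    e.pxor(e.xmm0, e.xmm1);        // flip src2's sign bits in a scratch copy
    e.pcmpgtb(dest, e.xmm0);       // signed compare now gives the unsigned result

Unsigned GE could then be derived as NOT(b >u a), or for the byte case via
pmaxub + pcmpeqb (max(a, b) == a implies a >=u b).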
--- .../x64/lowering/lowering_sequences.cc | 10 +- src/alloy/backend/x64/lowering/op_utils.inl | 128 ++++++++++++++++++ 2 files changed, 133 insertions(+), 5 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index c2969be3f..688c0aba9 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -1167,7 +1167,7 @@ table->AddSequence(OPCODE_DID_SATURATE, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_VECTOR_COMPARE_EQ, [](X64Emitter& e, Instr*& i) { if (IsVecType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + VectorCompareXX(e, i, VECTOR_CMP_EQ, true); } else { ASSERT_INVALID_TYPE(); } @@ -1177,7 +1177,7 @@ table->AddSequence(OPCODE_VECTOR_COMPARE_EQ, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_VECTOR_COMPARE_SGT, [](X64Emitter& e, Instr*& i) { if (IsVecType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + VectorCompareXX(e, i, VECTOR_CMP_GT, true); } else { ASSERT_INVALID_TYPE(); } @@ -1187,7 +1187,7 @@ table->AddSequence(OPCODE_VECTOR_COMPARE_SGT, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_VECTOR_COMPARE_SGE, [](X64Emitter& e, Instr*& i) { if (IsVecType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + VectorCompareXX(e, i, VECTOR_CMP_GE, true); } else { ASSERT_INVALID_TYPE(); } @@ -1197,7 +1197,7 @@ table->AddSequence(OPCODE_VECTOR_COMPARE_SGE, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_VECTOR_COMPARE_UGT, [](X64Emitter& e, Instr*& i) { if (IsVecType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + VectorCompareXX(e, i, VECTOR_CMP_GT, false); } else { ASSERT_INVALID_TYPE(); } @@ -1207,7 +1207,7 @@ table->AddSequence(OPCODE_VECTOR_COMPARE_UGT, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_VECTOR_COMPARE_UGE, [](X64Emitter& e, Instr*& i) { if (IsVecType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + VectorCompareXX(e, i, VECTOR_CMP_GE, false); } else { ASSERT_INVALID_TYPE(); } diff --git a/src/alloy/backend/x64/lowering/op_utils.inl b/src/alloy/backend/x64/lowering/op_utils.inl index 8e502bd63..736c432f0 100644 --- a/src/alloy/backend/x64/lowering/op_utils.inl +++ b/src/alloy/backend/x64/lowering/op_utils.inl @@ -51,6 +51,7 @@ void CheckBoolean(X64Emitter& e, Value* v) { } } +// Compares src1 and src2 and calls the given fn to set a byte based on EFLAGS. void CompareXX(X64Emitter& e, Instr*& i, void(set_fn)(X64Emitter& e, Reg8& dest, bool invert)) { if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8, SIG_TYPE_I8)) { Reg8 dest; @@ -160,6 +161,133 @@ void CompareXX(X64Emitter& e, Instr*& i, void(set_fn)(X64Emitter& e, Reg8& dest, } }; +enum VectoreCompareOp { + VECTOR_CMP_EQ, + VECTOR_CMP_GT, + VECTOR_CMP_GE, +}; +// Compares src1 to src2 with the given op and sets the dest. +// Dest will have each part set to all ones if the compare passes. +void VectorCompareXX(X64Emitter& e, Instr*& i, VectoreCompareOp op, bool as_signed) { + Xmm dest, src1, src2; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0, + i->src2.value, src2, 0); + if (op == VECTOR_CMP_EQ) { + // Commutative, so simple. 
+ Xmm real_src; + if (dest == src1) { + real_src = src2; + } else if (dest == src2) { + real_src = src1; + } else { + e.movaps(dest, src1); + real_src = src2; + } + if (i->flags == INT8_TYPE) { + e.pcmpeqb(dest, real_src); + } else if (i->flags == INT16_TYPE) { + e.pcmpeqw(dest, real_src); + } else if (i->flags == INT32_TYPE) { + e.pcmpeqd(dest, real_src); + } else if (i->flags == FLOAT32_TYPE) { + e.cmpeqps(dest, real_src); + } else { + ASSERT_INVALID_TYPE(); + } + } else if (i->flags == FLOAT32_TYPE) { + // Float GT/GE must be emulated. + if (op == VECTOR_CMP_GT) { + // Have to swap: src2 < src1. + if (dest == src2) { + e.cmpltps(dest, src1); + } else if (dest == src1) { + e.movaps(e.xmm0, src1); + e.movaps(dest, src2); + e.cmpltps(dest, e.xmm0); + } else { + e.movaps(dest, src2); + e.cmpltps(dest, src1); + } + } else if (op == VECTOR_CMP_GE) { + // Have to swap: src2 <= src1. + if (dest == src2) { + e.cmpleps(dest, src1); + } else if (dest == src1) { + e.movaps(e.xmm0, src1); + e.movaps(dest, src2); + e.cmpleps(dest, e.xmm0); + } else { + e.movaps(dest, src2); + e.cmpleps(dest, src1); + } + } else { + ASSERT_INVALID_TYPE(); + } + } else { + // Integer types are easier. + Xmm real_src; + if (dest == src1) { + real_src = src2; + } else if (dest == src2) { + e.movaps(e.xmm0, src2); + e.movaps(dest, src1); + real_src = e.xmm0; + } else { + e.movaps(dest, src1); + real_src = src2; + } + if (op == VECTOR_CMP_GT) { + if (i->flags == INT8_TYPE) { + if (as_signed) { + e.pcmpgtb(dest, real_src); + } else { + UNIMPLEMENTED_SEQ(); + } + } else if (i->flags == INT16_TYPE) { + if (as_signed) { + e.pcmpgtw(dest, real_src); + } else { + UNIMPLEMENTED_SEQ(); + } + } else if (i->flags == INT32_TYPE) { + if (as_signed) { + e.pcmpgtd(dest, real_src); + } else { + UNIMPLEMENTED_SEQ(); + } + } else { + ASSERT_INVALID_TYPE(); + } + } else if (op == VECTOR_CMP_GE) { + if (i->flags == INT8_TYPE) { + if (as_signed) { + UNIMPLEMENTED_SEQ(); + } else { + UNIMPLEMENTED_SEQ(); + } + } else if (i->flags == INT16_TYPE) { + if (as_signed) { + UNIMPLEMENTED_SEQ(); + } else { + UNIMPLEMENTED_SEQ(); + } + } else if (i->flags == INT32_TYPE) { + if (as_signed) { + UNIMPLEMENTED_SEQ(); + } else { + UNIMPLEMENTED_SEQ(); + } + } else { + ASSERT_INVALID_TYPE(); + } + } else { + ASSERT_INVALID_TYPE(); + } + } + e.EndOp(dest, src1, src2); +}; + typedef void(v_fn)(X64Emitter& e, Instr& i, const Reg& dest_src); template void IntUnaryOpV(X64Emitter& e, Instr*& i, v_fn v_fn, From c828e5416e498f6ea9490a4ecfc6941d83842f4d Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 26 Jan 2014 22:47:15 -0800 Subject: [PATCH 023/184] Starting on some unary xmm opcodes. 
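
Carried over from the previous patch: the unsigned and integer-GE lanes in
VectorCompareXX are still UNIMPLEMENTED_SEQ. The standard SSE2 fallback, when
those get filled in, is to bias both operands by the sign bit so the signed
compare reproduces the unsigned ordering. A hedged sketch of that trick, an
assumption about a future implementation rather than code from this series:

    #include <emmintrin.h>

    // Unsigned a > b per u32 lane: flipping the sign bit of both sides maps
    // unsigned order onto signed order, so pcmpgtd yields the right mask.
    __m128i vec_cmp_ugt_u32(__m128i a, __m128i b) {
      const __m128i bias = _mm_set1_epi32((int)0x80000000u);
      return _mm_cmpgt_epi32(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias));
    }
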
--- .../x64/lowering/lowering_sequences.cc | 60 +++++++++++++++---- src/alloy/backend/x64/lowering/op_utils.inl | 57 +++++++++++++++--- 2 files changed, 99 insertions(+), 18 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index 688c0aba9..ca65a8e7d 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -1454,11 +1454,21 @@ table->AddSequence(OPCODE_MUL_SUB, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_NEG, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + IntUnaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src) { + e.neg(dest_src); + }); } else if (IsFloatType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src) { + if (i.src1.value->type == FLOAT32_TYPE) { + UNIMPLEMENTED_SEQ(); + } else { + UNIMPLEMENTED_SEQ(); + } + }); } else if (IsVecType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src) { + UNIMPLEMENTED_SEQ(); + }); } else { ASSERT_INVALID_TYPE(); } @@ -1470,9 +1480,17 @@ table->AddSequence(OPCODE_ABS, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { UNIMPLEMENTED_SEQ(); } else if (IsFloatType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src) { + if (i.src1.value->type == FLOAT32_TYPE) { + UNIMPLEMENTED_SEQ(); + } else { + UNIMPLEMENTED_SEQ(); + } + }); } else if (IsVecType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src) { + UNIMPLEMENTED_SEQ(); + }); } else { ASSERT_INVALID_TYPE(); } @@ -1482,9 +1500,17 @@ table->AddSequence(OPCODE_ABS, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_SQRT, [](X64Emitter& e, Instr*& i) { if (IsFloatType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src) { + if (i.dest->type == FLOAT32_TYPE) { + e.sqrtss(dest_src, dest_src); + } else { + e.sqrtsd(dest_src, dest_src); + } + }); } else if (IsVecType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src) { + e.sqrtps(dest_src, dest_src); + }); } else { ASSERT_INVALID_TYPE(); } @@ -1494,9 +1520,19 @@ table->AddSequence(OPCODE_SQRT, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_RSQRT, [](X64Emitter& e, Instr*& i) { if (IsFloatType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src) { + if (i.dest->type == FLOAT32_TYPE) { + e.rsqrtss(dest_src, dest_src); + } else { + e.cvtsd2ss(dest_src, dest_src); + e.rsqrtss(dest_src, dest_src); + e.cvtss2sd(dest_src, dest_src); + } + }); } else if (IsVecType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src) { + e.rsqrtps(dest_src, dest_src); + }); } else { ASSERT_INVALID_TYPE(); } @@ -1602,7 +1638,11 @@ table->AddSequence(OPCODE_NOT, [](X64Emitter& e, Instr*& i) { e.not(dest_src); }); } else if (IsVecType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src) { + // dest_src ^= 0xFFFF... 
+ e.cmpeqps(e.xmm0, e.xmm0); + e.pxor(dest_src, e.xmm0); + }); } else { ASSERT_INVALID_TYPE(); } diff --git a/src/alloy/backend/x64/lowering/op_utils.inl b/src/alloy/backend/x64/lowering/op_utils.inl index 736c432f0..689cd501b 100644 --- a/src/alloy/backend/x64/lowering/op_utils.inl +++ b/src/alloy/backend/x64/lowering/op_utils.inl @@ -291,7 +291,7 @@ void VectorCompareXX(X64Emitter& e, Instr*& i, VectoreCompareOp op, bool as_sign typedef void(v_fn)(X64Emitter& e, Instr& i, const Reg& dest_src); template void IntUnaryOpV(X64Emitter& e, Instr*& i, v_fn v_fn, - T& dest, T& src1) { + T& dest, T& src1) { e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src1, 0); if (dest == src1) { @@ -304,7 +304,7 @@ void IntUnaryOpV(X64Emitter& e, Instr*& i, v_fn v_fn, } template void IntUnaryOpC(X64Emitter& e, Instr*& i, v_fn v_fn, - T& dest, Value* src1) { + T& dest, Value* src1) { e.BeginOp(i->dest, dest, REG_DEST); e.mov(dest, (uint64_t)src1->get_constant(CT())); v_fn(e, *i, dest); @@ -349,7 +349,7 @@ typedef void(vv_fn)(X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& typedef void(vc_fn)(X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src); template void IntBinaryOpVV(X64Emitter& e, Instr*& i, vv_fn vv_fn, - TD& dest, TS1& src1, TS2& src2) { + TD& dest, TS1& src1, TS2& src2) { e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src1, 0, i->src2.value, src2, 0); @@ -372,7 +372,7 @@ void IntBinaryOpVV(X64Emitter& e, Instr*& i, vv_fn vv_fn, } template void IntBinaryOpVC(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn, - TD& dest, TS1& src1, Value* src2) { + TD& dest, TS1& src1, Value* src2) { e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src1, 0); if (dest.getBit() <= 32) { @@ -398,7 +398,7 @@ void IntBinaryOpVC(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn, } template void IntBinaryOpCV(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn, - TD& dest, Value* src1, TS2& src2) { + TD& dest, Value* src1, TS2& src2) { e.BeginOp(i->dest, dest, REG_DEST, i->src2.value, src2, 0); if (dest.getBit() <= 32) { @@ -525,7 +525,7 @@ typedef void(vvc_fn)(X64Emitter& e, Instr& i, const Reg& dest_src1, const Operan typedef void(vcv_fn)(X64Emitter& e, Instr& i, const Reg& dest_src1, uint32_t src2, const Operand& src3); template void IntTernaryOpVVV(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, - TD& dest, TS1& src1, TS2& src2, TS3& src3) { + TD& dest, TS1& src1, TS2& src2, TS3& src3) { e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src1, 0, i->src2.value, src2, 0, @@ -546,7 +546,7 @@ void IntTernaryOpVVV(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, } template void IntTernaryOpVVC(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vvc_fn vvc_fn, - TD& dest, TS1& src1, TS2& src2, Value* src3) { + TD& dest, TS1& src1, TS2& src2, Value* src3) { e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src1, 0, i->src2.value, src2, 0); @@ -594,7 +594,7 @@ void IntTernaryOpVVC(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vvc_fn vvc_fn, } template void IntTernaryOpVCV(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vcv_fn vcv_fn, - TD& dest, TS1& src1, Value* src2, TS3& src3) { + TD& dest, TS1& src1, Value* src2, TS3& src3) { e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src1, 0, i->src3.value, src3, 0); @@ -697,6 +697,47 @@ void IntTernaryOp(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vvc_fn vvc_fn, vcv_fn } } +typedef void(xmm_v_fn)(X64Emitter& e, Instr& i, const Xmm& dest_src); +template +void XmmUnaryOpV(X64Emitter& e, Instr*& i, xmm_v_fn v_fn, + T& dest, T& src1) { + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0); + 
if (dest == src1) { + v_fn(e, *i, dest); + } else { + e.movaps(dest, src1); + v_fn(e, *i, dest); + } + e.EndOp(dest, src1); +} +template +void XmmUnaryOpC(X64Emitter& e, Instr*& i, xmm_v_fn v_fn, + T& dest, Value* src1) { + e.BeginOp(i->dest, dest, REG_DEST); + //e.mov(dest, (uint64_t)src1->get_constant(CT())); + v_fn(e, *i, dest); + e.EndOp(dest); +} +void XmmUnaryOp(X64Emitter& e, Instr*& i, uint32_t flags, xmm_v_fn v_fn) { + if (IsFloatType(i->src1.value->type)) { + // + } else if (IsVecType(i->src1.value->type)) { + // + } else { + ASSERT_INVALID_TYPE(); + } + if (i->Match(SIG_TYPE_I8, SIG_TYPE_I8)) { + Xmm dest, src1; + XmmUnaryOpV(e, i, v_fn, dest, src1); + } else if (i->Match(SIG_TYPE_I8, SIG_TYPE_I8C)) { + Xmm dest, src1; + XmmUnaryOpC(e, i, v_fn, dest, i->src1.value); + } else { + ASSERT_INVALID_TYPE(); + } +}; + } // namespace #endif // ALLOY_BACKEND_X64_X64_LOWERING_OP_UTILS_INL_ From 6c7e392088dd6322e55a4575487748c53b4160fc Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 26 Jan 2014 22:57:39 -0800 Subject: [PATCH 024/184] Fixing xmm unary ops. --- .../x64/lowering/lowering_sequences.cc | 39 ++++++------ src/alloy/backend/x64/lowering/op_utils.inl | 60 ++++++++++++------- 2 files changed, 60 insertions(+), 39 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index ca65a8e7d..379e438c2 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -1458,7 +1458,7 @@ table->AddSequence(OPCODE_NEG, [](X64Emitter& e, Instr*& i) { e.neg(dest_src); }); } else if (IsFloatType(i->dest->type)) { - XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src) { + XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) { if (i.src1.value->type == FLOAT32_TYPE) { UNIMPLEMENTED_SEQ(); } else { @@ -1466,7 +1466,7 @@ table->AddSequence(OPCODE_NEG, [](X64Emitter& e, Instr*& i) { } }); } else if (IsVecType(i->dest->type)) { - XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src) { + XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) { UNIMPLEMENTED_SEQ(); }); } else { @@ -1480,7 +1480,7 @@ table->AddSequence(OPCODE_ABS, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { UNIMPLEMENTED_SEQ(); } else if (IsFloatType(i->dest->type)) { - XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src) { + XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) { if (i.src1.value->type == FLOAT32_TYPE) { UNIMPLEMENTED_SEQ(); } else { @@ -1488,7 +1488,7 @@ table->AddSequence(OPCODE_ABS, [](X64Emitter& e, Instr*& i) { } }); } else if (IsVecType(i->dest->type)) { - XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src) { + XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) { UNIMPLEMENTED_SEQ(); }); } else { @@ -1500,16 +1500,16 @@ table->AddSequence(OPCODE_ABS, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_SQRT, [](X64Emitter& e, Instr*& i) { if (IsFloatType(i->dest->type)) { - XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src) { + XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) { if (i.dest->type == FLOAT32_TYPE) { - e.sqrtss(dest_src, dest_src); + e.sqrtss(dest, src); } else { - e.sqrtsd(dest_src, dest_src); + e.sqrtsd(dest, src); } }); } else if 
(IsVecType(i->dest->type)) { - XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src) { - e.sqrtps(dest_src, dest_src); + XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) { + e.sqrtps(dest, src); }); } else { ASSERT_INVALID_TYPE(); @@ -1520,18 +1520,18 @@ table->AddSequence(OPCODE_SQRT, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_RSQRT, [](X64Emitter& e, Instr*& i) { if (IsFloatType(i->dest->type)) { - XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src) { + XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) { if (i.dest->type == FLOAT32_TYPE) { - e.rsqrtss(dest_src, dest_src); + e.rsqrtss(dest, src); } else { - e.cvtsd2ss(dest_src, dest_src); - e.rsqrtss(dest_src, dest_src); - e.cvtss2sd(dest_src, dest_src); + e.cvtsd2ss(dest, src); + e.rsqrtss(dest, dest); + e.cvtss2sd(dest, dest); } }); } else if (IsVecType(i->dest->type)) { - XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src) { - e.rsqrtps(dest_src, dest_src); + XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) { + e.rsqrtps(dest, src); }); } else { ASSERT_INVALID_TYPE(); @@ -1638,10 +1638,13 @@ table->AddSequence(OPCODE_NOT, [](X64Emitter& e, Instr*& i) { e.not(dest_src); }); } else if (IsVecType(i->dest->type)) { - XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src) { + XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) { // dest_src ^= 0xFFFF... e.cmpeqps(e.xmm0, e.xmm0); - e.pxor(dest_src, e.xmm0); + if (dest != src) { + e.movaps(dest, src); + } + e.pxor(dest, e.xmm0); }); } else { ASSERT_INVALID_TYPE(); diff --git a/src/alloy/backend/x64/lowering/op_utils.inl b/src/alloy/backend/x64/lowering/op_utils.inl index 689cd501b..424e8eeb2 100644 --- a/src/alloy/backend/x64/lowering/op_utils.inl +++ b/src/alloy/backend/x64/lowering/op_utils.inl @@ -697,42 +697,60 @@ void IntTernaryOp(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vvc_fn vvc_fn, vcv_fn } } -typedef void(xmm_v_fn)(X64Emitter& e, Instr& i, const Xmm& dest_src); +// Since alot of SSE ops can take dest + src, just do that. +// Worst case the callee can dedupe. 
+typedef void(xmm_v_fn)(X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src); template void XmmUnaryOpV(X64Emitter& e, Instr*& i, xmm_v_fn v_fn, T& dest, T& src1) { e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src1, 0); - if (dest == src1) { - v_fn(e, *i, dest); - } else { - e.movaps(dest, src1); - v_fn(e, *i, dest); - } + v_fn(e, *i, dest, src1); e.EndOp(dest, src1); } -template +template void XmmUnaryOpC(X64Emitter& e, Instr*& i, xmm_v_fn v_fn, T& dest, Value* src1) { e.BeginOp(i->dest, dest, REG_DEST); - //e.mov(dest, (uint64_t)src1->get_constant(CT())); - v_fn(e, *i, dest); + if (src1->type == FLOAT32_TYPE) { + e.mov(e.eax, (uint32_t)src1->constant.i32); + e.movd(dest, e.eax); + } else if (src1->type == FLOAT64_TYPE) { + e.mov(e.rax, (uint64_t)src1->constant.i64); + e.movq(dest, e.rax); + } else { + UNIMPLEMENTED_SEQ(); + } + v_fn(e, *i, dest, dest); e.EndOp(dest); } void XmmUnaryOp(X64Emitter& e, Instr*& i, uint32_t flags, xmm_v_fn v_fn) { if (IsFloatType(i->src1.value->type)) { - // + if (i->Match(SIG_TYPE_F32, SIG_TYPE_F32)) { + Xmm dest, src1; + XmmUnaryOpV(e, i, v_fn, dest, src1); + } else if (i->Match(SIG_TYPE_F32, SIG_TYPE_F32C)) { + Xmm dest; + XmmUnaryOpC(e, i, v_fn, dest, i->src1.value); + } else if (i->Match(SIG_TYPE_F64, SIG_TYPE_F64)) { + Xmm dest, src1; + XmmUnaryOpV(e, i, v_fn, dest, src1); + } else if (i->Match(SIG_TYPE_F64, SIG_TYPE_F64C)) { + Xmm dest; + XmmUnaryOpC(e, i, v_fn, dest, i->src1.value); + } else { + ASSERT_INVALID_TYPE(); + } } else if (IsVecType(i->src1.value->type)) { - // - } else { - ASSERT_INVALID_TYPE(); - } - if (i->Match(SIG_TYPE_I8, SIG_TYPE_I8)) { - Xmm dest, src1; - XmmUnaryOpV(e, i, v_fn, dest, src1); - } else if (i->Match(SIG_TYPE_I8, SIG_TYPE_I8C)) { - Xmm dest, src1; - XmmUnaryOpC(e, i, v_fn, dest, i->src1.value); + if (i->Match(SIG_TYPE_V128, SIG_TYPE_V128)) { + Xmm dest, src1; + XmmUnaryOpV(e, i, v_fn, dest, src1); + } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_V128C)) { + Xmm dest; + XmmUnaryOpC(e, i, v_fn, dest, i->src1.value); + } else { + ASSERT_INVALID_TYPE(); + } } else { ASSERT_INVALID_TYPE(); } From 58c0ea9ac7080a847fe2dd97ff35c9548d3daa65 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Mon, 27 Jan 2014 09:25:48 -0800 Subject: [PATCH 025/184] Fixing bad vector compare. --- src/alloy/backend/ivm/ivm_intcode.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/alloy/backend/ivm/ivm_intcode.cc b/src/alloy/backend/ivm/ivm_intcode.cc index 7100deaa8..cef1e7930 100644 --- a/src/alloy/backend/ivm/ivm_intcode.cc +++ b/src/alloy/backend/ivm/ivm_intcode.cc @@ -2098,7 +2098,7 @@ int Translate_DID_SATURATE(TranslationContext& ctx, Instr* i) { const vec128_t& src2 = ics.rf[i->src2_reg].v128; \ vec128_t& dest = ics.rf[i->dest_reg].v128; \ for (int n = 0; n < count; n++) { \ - dest.value[n] = (type)src1.value[n] op (type)src2.value[n]; \ + dest.value[n] = ((type)src1.value[n] op (type)src2.value[n]) ? (type)0xFFFFFFFF : 0; \ } \ return IA_NEXT; From 9b45e6f2dc34ce336b2e6916831323f69fe7730e Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Mon, 27 Jan 2014 20:50:45 -0800 Subject: [PATCH 026/184] Tracers in x64, almost matching ivm. 
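
The tracer helpers added in this patch print both a float's numeric value and
its raw IEEE-754 bit pattern; they recover the bits through a small punning
union rather than a pointer cast. The pattern in isolation, same shape as the
functions in tracers.cc below:

    #include <cstdint>
    #include <cstdio>

    // Print an f32 the way the tracers do: value plus raw bits.
    void PrintF32WithBits(float value) {
      union {
        float f;
        uint32_t u;
      } x;
      x.f = value;
      printf("%e (%X)\n", x.f, x.u);
    }
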
--- src/alloy/backend/ivm/ivm_intcode.cc | 24 +- .../x64/lowering/lowering_sequences.cc | 197 ++++++++++++++--- src/alloy/backend/x64/lowering/sources.gypi | 2 + src/alloy/backend/x64/lowering/tracers.cc | 208 ++++++++++++++++++ src/alloy/backend/x64/lowering/tracers.h | 62 ++++++ src/alloy/runtime/thread_state.cc | 1 - 6 files changed, 449 insertions(+), 45 deletions(-) create mode 100644 src/alloy/backend/x64/lowering/tracers.cc create mode 100644 src/alloy/backend/x64/lowering/tracers.h diff --git a/src/alloy/backend/ivm/ivm_intcode.cc b/src/alloy/backend/ivm/ivm_intcode.cc index cef1e7930..3bc84a771 100644 --- a/src/alloy/backend/ivm/ivm_intcode.cc +++ b/src/alloy/backend/ivm/ivm_intcode.cc @@ -1337,32 +1337,32 @@ int Translate_LOAD_CLOCK(TranslationContext& ctx, Instr* i) { uint32_t IntCode_LOAD_CONTEXT_I8(IntCodeState& ics, const IntCode* i) { ics.rf[i->dest_reg].i8 = *((int8_t*)(ics.context + ics.rf[i->src1_reg].u64)); - DPRINT("%d (%.X) = ctx i8 +%d\n", ics.rf[i->dest_reg].i8, ics.rf[i->dest_reg].u8, ics.rf[i->src1_reg].u64); + DPRINT("%d (%X) = ctx i8 +%d\n", ics.rf[i->dest_reg].i8, ics.rf[i->dest_reg].u8, ics.rf[i->src1_reg].u64); return IA_NEXT; } uint32_t IntCode_LOAD_CONTEXT_I16(IntCodeState& ics, const IntCode* i) { ics.rf[i->dest_reg].i16 = *((int16_t*)(ics.context + ics.rf[i->src1_reg].u64)); - DPRINT("%d (%.X) = ctx i16 +%d\n", ics.rf[i->dest_reg].i16, ics.rf[i->dest_reg].u16, ics.rf[i->src1_reg].u64); + DPRINT("%d (%X) = ctx i16 +%d\n", ics.rf[i->dest_reg].i16, ics.rf[i->dest_reg].u16, ics.rf[i->src1_reg].u64); return IA_NEXT; } uint32_t IntCode_LOAD_CONTEXT_I32(IntCodeState& ics, const IntCode* i) { ics.rf[i->dest_reg].i32 = *((int32_t*)(ics.context + ics.rf[i->src1_reg].u64)); - DPRINT("%d (%.X) = ctx i32 +%d\n", ics.rf[i->dest_reg].i32, ics.rf[i->dest_reg].u32, ics.rf[i->src1_reg].u64); + DPRINT("%d (%X) = ctx i32 +%d\n", ics.rf[i->dest_reg].i32, ics.rf[i->dest_reg].u32, ics.rf[i->src1_reg].u64); return IA_NEXT; } uint32_t IntCode_LOAD_CONTEXT_I64(IntCodeState& ics, const IntCode* i) { ics.rf[i->dest_reg].i64 = *((int64_t*)(ics.context + ics.rf[i->src1_reg].u64)); - DPRINT("%lld (%.llX) = ctx i64 +%d\n", ics.rf[i->dest_reg].i64, ics.rf[i->dest_reg].u64, ics.rf[i->src1_reg].u64); + DPRINT("%lld (%llX) = ctx i64 +%d\n", ics.rf[i->dest_reg].i64, ics.rf[i->dest_reg].u64, ics.rf[i->src1_reg].u64); return IA_NEXT; } uint32_t IntCode_LOAD_CONTEXT_F32(IntCodeState& ics, const IntCode* i) { ics.rf[i->dest_reg].f32 = *((float*)(ics.context + ics.rf[i->src1_reg].u64)); - DPRINT("%e (%.X) = ctx f32 +%d\n", ics.rf[i->dest_reg].f32, ics.rf[i->dest_reg].u32, ics.rf[i->src1_reg].u64); + DPRINT("%e (%X) = ctx f32 +%d\n", ics.rf[i->dest_reg].f32, ics.rf[i->dest_reg].u32, ics.rf[i->src1_reg].u64); return IA_NEXT; } uint32_t IntCode_LOAD_CONTEXT_F64(IntCodeState& ics, const IntCode* i) { ics.rf[i->dest_reg].f64 = *((double*)(ics.context + ics.rf[i->src1_reg].u64)); - DPRINT("%lle (%.llX) = ctx f64 +%d\n", ics.rf[i->dest_reg].f64, ics.rf[i->dest_reg].u64, ics.rf[i->src1_reg].u64); + DPRINT("%lle (%llX) = ctx f64 +%d\n", ics.rf[i->dest_reg].f64, ics.rf[i->dest_reg].u64, ics.rf[i->src1_reg].u64); return IA_NEXT; } uint32_t IntCode_LOAD_CONTEXT_V128(IntCodeState& ics, const IntCode* i) { @@ -1388,32 +1388,32 @@ int Translate_LOAD_CONTEXT(TranslationContext& ctx, Instr* i) { uint32_t IntCode_STORE_CONTEXT_I8(IntCodeState& ics, const IntCode* i) { *((int8_t*)(ics.context + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].i8; - DPRINT("ctx i8 +%d = %d (%.X)\n", ics.rf[i->src1_reg].u64, 
ics.rf[i->src2_reg].i8, ics.rf[i->src2_reg].u8); + DPRINT("ctx i8 +%d = %d (%X)\n", ics.rf[i->src1_reg].u64, ics.rf[i->src2_reg].i8, ics.rf[i->src2_reg].u8); return IA_NEXT; } uint32_t IntCode_STORE_CONTEXT_I16(IntCodeState& ics, const IntCode* i) { *((int16_t*)(ics.context + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].i16; - DPRINT("ctx i16 +%d = %d (%.X)\n", ics.rf[i->src1_reg].u64, ics.rf[i->src2_reg].i16, ics.rf[i->src2_reg].u16); + DPRINT("ctx i16 +%d = %d (%X)\n", ics.rf[i->src1_reg].u64, ics.rf[i->src2_reg].i16, ics.rf[i->src2_reg].u16); return IA_NEXT; } uint32_t IntCode_STORE_CONTEXT_I32(IntCodeState& ics, const IntCode* i) { *((int32_t*)(ics.context + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].i32; - DPRINT("ctx i32 +%d = %d (%.X)\n", ics.rf[i->src1_reg].u64, ics.rf[i->src2_reg].i32, ics.rf[i->src2_reg].u32); + DPRINT("ctx i32 +%d = %d (%X)\n", ics.rf[i->src1_reg].u64, ics.rf[i->src2_reg].i32, ics.rf[i->src2_reg].u32); return IA_NEXT; } uint32_t IntCode_STORE_CONTEXT_I64(IntCodeState& ics, const IntCode* i) { *((int64_t*)(ics.context + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].i64; - DPRINT("ctx i64 +%d = %lld (%.llX)\n", ics.rf[i->src1_reg].u64, ics.rf[i->src2_reg].i64, ics.rf[i->src2_reg].u64); + DPRINT("ctx i64 +%d = %lld (%llX)\n", ics.rf[i->src1_reg].u64, ics.rf[i->src2_reg].i64, ics.rf[i->src2_reg].u64); return IA_NEXT; } uint32_t IntCode_STORE_CONTEXT_F32(IntCodeState& ics, const IntCode* i) { *((float*)(ics.context + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].f32; - DPRINT("ctx f32 +%d = %e (%.X)\n", ics.rf[i->src1_reg].u64, ics.rf[i->src2_reg].f32, ics.rf[i->src2_reg].u32); + DPRINT("ctx f32 +%d = %e (%X)\n", ics.rf[i->src1_reg].u64, ics.rf[i->src2_reg].f32, ics.rf[i->src2_reg].u32); return IA_NEXT; } uint32_t IntCode_STORE_CONTEXT_F64(IntCodeState& ics, const IntCode* i) { *((double*)(ics.context + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].f64; - DPRINT("ctx f64 +%d = %lle (%.llX)\n", ics.rf[i->src1_reg].u64, ics.rf[i->src2_reg].f64, ics.rf[i->src2_reg].u64); + DPRINT("ctx f64 +%d = %lle (%llX)\n", ics.rf[i->src1_reg].u64, ics.rf[i->src2_reg].f64, ics.rf[i->src2_reg].u64); return IA_NEXT; } uint32_t IntCode_STORE_CONTEXT_V128(IntCodeState& ics, const IntCode* i) { diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index 379e438c2..15ca6d19f 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -30,7 +31,7 @@ namespace { #define ASSERT_INVALID_TYPE() XEASSERTALWAYS() #define ITRACE 1 -#define DTRACE 0 +#define DTRACE 1 #define SHUFPS_SWAP_DWORDS 0x1B @@ -44,28 +45,10 @@ namespace { // shuffle(vec, b00011011) -> {x,y,z,w} => {x,y,z,w} // All indices and operations must respect that. -// TODO(benvanik): emit traces/printfs/etc - void Dummy() { // } -void PrintString(void* raw_context, const char* str) { - // TODO(benvanik): generate this thunk at runtime? or a shim? 
- auto thread_state = *((ThreadState**)raw_context); - fprintf(stdout, "XE[t] :%d: %s\n", thread_state->GetThreadID(), str); - fflush(stdout); -} - -void TraceContextLoad(void* raw_context, uint64_t offset, uint64_t value) { - fprintf(stdout, "%lld (%.llX) = ctx i64 +%lld\n", (int64_t)value, value, offset); - fflush(stdout); -} -void TraceContextStore(void* raw_context, uint64_t offset, uint64_t value) { - fprintf(stdout, "ctx i64 +%lld = %lld (%.llX)\n", offset, (int64_t)value, value); - fflush(stdout); -} - uint64_t LoadClock(void* raw_context) { LARGE_INTEGER counter; uint64_t time = 0; @@ -173,7 +156,7 @@ table->AddSequence(OPCODE_COMMENT, [](X64Emitter& e, Instr*& i) { auto str = (const char*)i->src1.offset; auto str_copy = xestrdupa(str); e.mov(e.rdx, (uint64_t)str_copy); - CallNative(e, PrintString); + CallNative(e, TraceString); #endif // ITRACE i = e.Advance(i); return true; @@ -591,7 +574,7 @@ table->AddSequence(OPCODE_LOAD_CONTEXT, [](X64Emitter& e, Instr*& i) { #if DTRACE e.mov(e.rdx, i->src1.offset); e.mov(e.r8b, dest); - CallNative(e, TraceContextLoad); + CallNative(e, TraceContextLoadI8); #endif // DTRACE } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_IGNORE)) { Reg16 dest; @@ -601,7 +584,7 @@ table->AddSequence(OPCODE_LOAD_CONTEXT, [](X64Emitter& e, Instr*& i) { #if DTRACE e.mov(e.rdx, i->src1.offset); e.mov(e.r8w, dest); - CallNative(e, TraceContextLoad); + CallNative(e, TraceContextLoadI16); #endif // DTRACE } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_IGNORE)) { Reg32 dest; @@ -611,7 +594,7 @@ table->AddSequence(OPCODE_LOAD_CONTEXT, [](X64Emitter& e, Instr*& i) { #if DTRACE e.mov(e.rdx, i->src1.offset); e.mov(e.r8d, dest); - CallNative(e, TraceContextLoad); + CallNative(e, TraceContextLoadI32); #endif // DTRACE } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_IGNORE)) { Reg64 dest; @@ -621,24 +604,39 @@ table->AddSequence(OPCODE_LOAD_CONTEXT, [](X64Emitter& e, Instr*& i) { #if DTRACE e.mov(e.rdx, i->src1.offset); e.mov(e.r8, dest); - CallNative(e, TraceContextLoad); + CallNative(e, TraceContextLoadI64); #endif // DTRACE } else if (i->Match(SIG_TYPE_F32, SIG_TYPE_IGNORE)) { Xmm dest; e.BeginOp(i->dest, dest, REG_DEST); e.movss(dest, e.dword[e.rcx + i->src1.offset]); e.EndOp(dest); +#if DTRACE + e.mov(e.rdx, i->src1.offset); + e.movaps(e.xmm0, dest); + CallNative(e, TraceContextLoadF32); +#endif // DTRACE } else if (i->Match(SIG_TYPE_F64, SIG_TYPE_IGNORE)) { Xmm dest; e.BeginOp(i->dest, dest, REG_DEST); e.movsd(dest, e.qword[e.rcx + i->src1.offset]); e.EndOp(dest); +#if DTRACE + e.mov(e.rdx, i->src1.offset); + e.movaps(e.xmm0, dest); + CallNative(e, TraceContextLoadF64); +#endif // DTRACE } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_IGNORE)) { Xmm dest; e.BeginOp(i->dest, dest, REG_DEST); // NOTE: we always know we are aligned. 
e.movaps(dest, e.ptr[e.rcx + i->src1.offset]); e.EndOp(dest); +#if DTRACE + e.mov(e.rdx, i->src1.offset); + e.movaps(e.xmm0, dest); + CallNative(e, TraceContextLoadV128); +#endif // DTRACE } else { ASSERT_INVALID_TYPE(); } @@ -655,14 +653,14 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) { #if DTRACE e.mov(e.rdx, i->src1.offset); e.mov(e.r8b, src); - CallNative(e, TraceContextStore); + CallNative(e, TraceContextStoreI8); #endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I8C)) { e.mov(e.byte[e.rcx + i->src1.offset], i->src2.value->constant.i8); #if DTRACE e.mov(e.rdx, i->src1.offset); e.mov(e.r8b, i->src2.value->constant.i8); - CallNative(e, TraceContextStore); + CallNative(e, TraceContextStoreI8); #endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16)) { Reg16 src; @@ -672,14 +670,14 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) { #if DTRACE e.mov(e.rdx, i->src1.offset); e.mov(e.r8w, src); - CallNative(e, TraceContextStore); + CallNative(e, TraceContextStoreI16); #endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16C)) { e.mov(e.word[e.rcx + i->src1.offset], i->src2.value->constant.i16); #if DTRACE e.mov(e.rdx, i->src1.offset); e.mov(e.r8w, i->src2.value->constant.i16); - CallNative(e, TraceContextStore); + CallNative(e, TraceContextStoreI16); #endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32)) { Reg32 src; @@ -689,14 +687,14 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) { #if DTRACE e.mov(e.rdx, i->src1.offset); e.mov(e.r8d, src); - CallNative(e, TraceContextStore); + CallNative(e, TraceContextStoreI32); #endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32C)) { e.mov(e.dword[e.rcx + i->src1.offset], i->src2.value->constant.i32); #if DTRACE e.mov(e.rdx, i->src1.offset); e.mov(e.r8d, i->src2.value->constant.i32); - CallNative(e, TraceContextStore); + CallNative(e, TraceContextStoreI32); #endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64)) { Reg64 src; @@ -706,38 +704,68 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) { #if DTRACE e.mov(e.rdx, i->src1.offset); e.mov(e.r8, src); - CallNative(e, TraceContextStore); + CallNative(e, TraceContextStoreI64); #endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64C)) { e.mov(e.qword[e.rcx + i->src1.offset], i->src2.value->constant.i64); #if DTRACE e.mov(e.rdx, i->src1.offset); e.mov(e.r8, i->src2.value->constant.i64); - CallNative(e, TraceContextStore); + CallNative(e, TraceContextStoreI64); #endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32)) { Xmm src; e.BeginOp(i->src2.value, src, 0); e.movss(e.dword[e.rcx + i->src1.offset], src); e.EndOp(src); +#if DTRACE + e.mov(e.rdx, i->src1.offset); + e.movss(e.xmm0, src); + CallNative(e, TraceContextStoreF32); +#endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32C)) { e.mov(e.dword[e.rcx + i->src1.offset], i->src2.value->constant.i32); +#if DTRACE + e.mov(e.rdx, i->src1.offset); + e.movss(e.xmm0, e.dword[e.rcx + i->src1.offset]); + CallNative(e, TraceContextStoreF32); +#endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64)) { Xmm src; e.BeginOp(i->src2.value, src, 0); e.movsd(e.qword[e.rcx + i->src1.offset], src); e.EndOp(src); +#if DTRACE + e.mov(e.rdx, i->src1.offset); + e.movsd(e.xmm0, src); + CallNative(e, TraceContextStoreF64); +#endif // 
DTRACE
   } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64C)) {
     e.mov(e.qword[e.rcx + i->src1.offset], i->src2.value->constant.i64);
+#if DTRACE
+    e.mov(e.rdx, i->src1.offset);
+    e.movsd(e.xmm0, e.qword[e.rcx + i->src1.offset]);
+    CallNative(e, TraceContextStoreF64);
+#endif // DTRACE
   } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128)) {
     Xmm src;
     e.BeginOp(i->src2.value, src, 0);
     // NOTE: we always know we are aligned.
     e.movaps(e.ptr[e.rcx + i->src1.offset], src);
     e.EndOp(src);
+#if DTRACE
+    e.mov(e.rdx, i->src1.offset);
+    e.movaps(e.xmm0, src);
+    CallNative(e, TraceContextStoreV128);
+#endif // DTRACE
   } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128C)) {
     e.mov(e.qword[e.rcx + i->src1.offset], i->src2.value->constant.v128.low);
     e.mov(e.qword[e.rcx + i->src1.offset + 8], i->src2.value->constant.v128.high);
+#if DTRACE
+    e.mov(e.rdx, i->src1.offset);
+    e.movups(e.xmm0, e.ptr[e.rcx + i->src1.offset]);
+    CallNative(e, TraceContextStoreV128);
+#endif // DTRACE
   } else {
     ASSERT_INVALID_TYPE();
   }
@@ -792,31 +820,61 @@ table->AddSequence(OPCODE_LOAD, [](X64Emitter& e, Instr*& i) {
       e.BeginOp(i->dest, dest, REG_DEST);
       e.mov(dest, e.byte[addr]);
       e.EndOp(dest);
+#if DTRACE
+      e.lea(e.rdx, e.ptr[addr]);
+      e.mov(e.r8b, dest);
+      CallNative(e, TraceMemoryLoadI8);
+#endif // DTRACE
     } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_IGNORE)) {
       Reg16 dest;
       e.BeginOp(i->dest, dest, REG_DEST);
       e.mov(dest, e.word[addr]);
       e.EndOp(dest);
+#if DTRACE
+      e.lea(e.rdx, e.ptr[addr]);
+      e.mov(e.r8w, dest);
+      CallNative(e, TraceMemoryLoadI16);
+#endif // DTRACE
     } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_IGNORE)) {
       Reg32 dest;
       e.BeginOp(i->dest, dest, REG_DEST);
       e.mov(dest, e.dword[addr]);
       e.EndOp(dest);
+#if DTRACE
+      e.lea(e.rdx, e.ptr[addr]);
+      e.mov(e.r8d, dest);
+      CallNative(e, TraceMemoryLoadI32);
+#endif // DTRACE
     } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_IGNORE)) {
       Reg64 dest;
       e.BeginOp(i->dest, dest, REG_DEST);
       e.mov(dest, e.qword[addr]);
       e.EndOp(dest);
+#if DTRACE
+      e.lea(e.rdx, e.ptr[addr]);
+      e.mov(e.r8, dest);
+      CallNative(e, TraceMemoryLoadI64);
+#endif // DTRACE
     } else if (i->Match(SIG_TYPE_F32, SIG_TYPE_IGNORE)) {
       Xmm dest;
       e.BeginOp(i->dest, dest, REG_DEST);
       e.movss(dest, e.dword[addr]);
       e.EndOp(dest);
+#if DTRACE
+      e.lea(e.rdx, e.ptr[addr]);
+      e.movss(e.xmm0, dest);
+      CallNative(e, TraceMemoryLoadF32);
+#endif // DTRACE
     } else if (i->Match(SIG_TYPE_F64, SIG_TYPE_IGNORE)) {
       Xmm dest;
       e.BeginOp(i->dest, dest, REG_DEST);
       e.movsd(dest, e.qword[addr]);
       e.EndOp(dest);
+#if DTRACE
+      e.lea(e.rdx, e.ptr[addr]);
+      e.movsd(e.xmm0, dest);
+      CallNative(e, TraceMemoryLoadF64);
+#endif // DTRACE
     } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_IGNORE)) {
       Xmm dest;
       e.BeginOp(i->dest, dest, REG_DEST);
@@ -824,6 +882,11 @@ table->AddSequence(OPCODE_LOAD, [](X64Emitter& e, Instr*& i) {
       e.movups(dest, e.ptr[addr]);
       e.EndOp(dest);
       e.db(0xCC);
+#if DTRACE
+      e.lea(e.rdx, e.ptr[addr]);
+      e.movaps(e.xmm0, dest);
+      CallNative(e, TraceMemoryLoadV128);
+#endif // DTRACE
     } else {
       ASSERT_INVALID_TYPE();
     }
@@ -892,43 +955,103 @@ table->AddSequence(OPCODE_STORE, [](X64Emitter& e, Instr*& i) {
       e.BeginOp(i->src2.value, src, 0);
       e.mov(e.byte[addr], src);
       e.EndOp(src);
+#if DTRACE
+      e.lea(e.rdx, e.ptr[addr]);
+      e.mov(e.r8b, src);
+      CallNative(e, TraceMemoryStoreI8);
+#endif // DTRACE
     } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I8C)) {
       e.mov(e.byte[addr], i->src2.value->constant.i8);
+#if DTRACE
+      e.lea(e.rdx, e.ptr[addr]);
+      e.mov(e.r8b, i->src2.value->constant.i8);
+      CallNative(e, TraceMemoryStoreI8);
+#endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16)) { Reg16 src; e.BeginOp(i->src2.value, src, 0); e.mov(e.word[addr], src); e.EndOp(src); +#if DTRACE + e.lea(e.rdx, e.ptr[addr]); + e.mov(e.r8w, src); + CallNative(e, TraceMemoryStoreI16); +#endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16C)) { e.mov(e.word[addr], i->src2.value->constant.i16); +#if DTRACE + e.lea(e.rdx, e.ptr[addr]); + e.mov(e.r8w, i->src2.value->constant.i16); + CallNative(e, TraceMemoryStoreI16); +#endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32)) { Reg32 src; e.BeginOp(i->src2.value, src, 0); e.mov(e.dword[addr], src); e.EndOp(src); +#if DTRACE + e.lea(e.rdx, e.ptr[addr]); + e.mov(e.r8d, src); + CallNative(e, TraceMemoryStoreI32); +#endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32C)) { e.mov(e.dword[addr], i->src2.value->constant.i32); +#if DTRACE + e.lea(e.rdx, e.ptr[addr]); + e.mov(e.r8d, i->src2.value->constant.i32); + CallNative(e, TraceMemoryStoreI32); +#endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64)) { Reg64 src; e.BeginOp(i->src2.value, src, 0); e.mov(e.qword[addr], src); e.EndOp(src); +#if DTRACE + e.lea(e.rdx, e.ptr[addr]); + e.mov(e.r8, src); + CallNative(e, TraceMemoryStoreI64); +#endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64C)) { e.mov(e.qword[addr], i->src2.value->constant.i64); +#if DTRACE + e.lea(e.rdx, e.ptr[addr]); + e.mov(e.r8, i->src2.value->constant.i64); + CallNative(e, TraceMemoryStoreI64); +#endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32)) { Xmm src; e.BeginOp(i->src2.value, src, 0); e.movss(e.dword[addr], src); e.EndOp(src); +#if DTRACE + e.lea(e.rdx, e.ptr[addr]); + e.movss(e.xmm0, src); + CallNative(e, TraceMemoryStoreF32); +#endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32C)) { e.mov(e.dword[addr], i->src2.value->constant.i32); +#if DTRACE + e.lea(e.rdx, e.ptr[addr]); + e.movss(e.xmm0, e.ptr[addr]); + CallNative(e, TraceMemoryStoreF32); +#endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64)) { Xmm src; e.BeginOp(i->src2.value, src, 0); e.movsd(e.qword[addr], src); e.EndOp(src); +#if DTRACE + e.lea(e.rdx, e.ptr[addr]); + e.movsd(e.xmm0, src); + CallNative(e, TraceMemoryStoreF64); +#endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64C)) { e.mov(e.qword[addr], i->src2.value->constant.i64); +#if DTRACE + e.lea(e.rdx, e.ptr[addr]); + e.movsd(e.xmm0, e.ptr[addr]); + CallNative(e, TraceMemoryStoreF64); +#endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128)) { Xmm src; e.BeginOp(i->src2.value, src, 0); @@ -936,9 +1059,19 @@ table->AddSequence(OPCODE_STORE, [](X64Emitter& e, Instr*& i) { e.movups(e.ptr[addr], src); e.EndOp(src); e.db(0xCC); +#if DTRACE + e.lea(e.rdx, e.ptr[addr]); + e.movaps(e.xmm0, src); + CallNative(e, TraceMemoryStoreV128); +#endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128C)) { e.mov(e.ptr[addr], i->src2.value->constant.v128.low); e.mov(e.ptr[addr + 8], i->src2.value->constant.v128.high); +#if DTRACE + e.lea(e.rdx, e.ptr[addr]); + e.movups(e.xmm0, e.ptr[addr]); + CallNative(e, TraceMemoryStoreV128); +#endif // DTRACE } else { ASSERT_INVALID_TYPE(); } diff --git a/src/alloy/backend/x64/lowering/sources.gypi b/src/alloy/backend/x64/lowering/sources.gypi index 93a754180..d6cdeb1bb 100644 --- a/src/alloy/backend/x64/lowering/sources.gypi +++ 
b/src/alloy/backend/x64/lowering/sources.gypi @@ -6,5 +6,7 @@ 'lowering_table.cc', 'lowering_table.h', 'op_utils.inl', + 'tracers.cc', + 'tracers.h', ], } diff --git a/src/alloy/backend/x64/lowering/tracers.cc b/src/alloy/backend/x64/lowering/tracers.cc new file mode 100644 index 000000000..a0a2f212b --- /dev/null +++ b/src/alloy/backend/x64/lowering/tracers.cc @@ -0,0 +1,208 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include +#include +#include + +using namespace alloy; +using namespace alloy::backend::x64; +using namespace alloy::backend::x64::lowering; +using namespace alloy::runtime; + +namespace alloy { +namespace backend { +namespace x64 { +namespace lowering { + + +#define IPRINT +#define IFLUSH() +#define DPRINT +#define DFLUSH() + +#define IPRINT if (thread_state->thread_id() == 1) printf +#define IFLUSH() fflush(stdout) +#define DPRINT if (thread_state->thread_id() == 1) printf +#define DFLUSH() fflush(stdout) + + +void TraceString(void* raw_context, const char* str) { + auto thread_state = *((ThreadState**)raw_context); + IPRINT("XE[t] :%d: %s\n", thread_state->GetThreadID(), str); + IFLUSH(); +} + +void TraceContextLoadI8(void* raw_context, uint64_t offset, uint8_t value) { + auto thread_state = *((ThreadState**)raw_context); + DPRINT("%d (%X) = ctx i8 +%d\n", (int8_t)value, value, offset); +} +void TraceContextLoadI16(void* raw_context, uint64_t offset, uint16_t value) { + auto thread_state = *((ThreadState**)raw_context); + DPRINT("%d (%X) = ctx i16 +%d\n", (int16_t)value, value, offset); +} +void TraceContextLoadI32(void* raw_context, uint64_t offset, uint32_t value) { + auto thread_state = *((ThreadState**)raw_context); + DPRINT("%d (%X) = ctx i32 +%d\n", (int32_t)value, value, offset); +} +void TraceContextLoadI64(void* raw_context, uint64_t offset, uint64_t value) { + auto thread_state = *((ThreadState**)raw_context); + DPRINT("%lld (%llX) = ctx i64 +%d\n", (int64_t)value, value, offset); +} +void TraceContextLoadF32(void* raw_context, uint64_t offset, float value) { + auto thread_state = *((ThreadState**)raw_context); + union { + float f; + uint32_t u; + } x; + x.f = value; + DPRINT("%e (%X) = ctx f32 +%d\n", x.f, x.u, offset); +} +void TraceContextLoadF64(void* raw_context, uint64_t offset, double value) { + auto thread_state = *((ThreadState**)raw_context); + union { + double f; + uint64_t u; + } x; + x.f = value; + DPRINT("%lle (%llX) = ctx f64 +%d\n", x.f, x.u, offset); +} +void TraceContextLoadV128(void* raw_context, uint64_t offset, __m128 value) { + auto thread_state = *((ThreadState**)raw_context); + //DPRINT("%d (%.X) = ctx i8 +%d\n", (int8_t)value, value, offset); +} + +void TraceContextStoreI8(void* raw_context, uint64_t offset, uint8_t value) { + auto thread_state = *((ThreadState**)raw_context); + DPRINT("ctx i8 +%d = %d (%X)\n", offset, (int8_t)value, value); +} +void TraceContextStoreI16(void* raw_context, uint64_t offset, uint16_t value) { + auto thread_state = *((ThreadState**)raw_context); + DPRINT("ctx i16 +%d = %d (%X)\n", offset, (int16_t)value, value); +} +void TraceContextStoreI32(void* raw_context, uint64_t offset, uint32_t value) { + auto 
thread_state = *((ThreadState**)raw_context);
+  DPRINT("ctx i32 +%d = %d (%X)\n", offset, (int32_t)value, value);
+}
+void TraceContextStoreI64(void* raw_context, uint64_t offset, uint64_t value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  DPRINT("ctx i64 +%d = %lld (%llX)\n", offset, (int64_t)value, value);
+}
+void TraceContextStoreF32(void* raw_context, uint64_t offset, float value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  union {
+    float f;
+    uint32_t u;
+  } x;
+  x.f = value;
+  DPRINT("ctx f32 +%d = %e (%X)\n", offset, x.f, x.u);
+}
+void TraceContextStoreF64(void* raw_context, uint64_t offset, double value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  union {
+    double f;
+    uint64_t u;
+  } x;
+  x.f = value;
+  DPRINT("ctx f64 +%d = %lle (%llX)\n", offset, x.f, x.u);
+}
+void TraceContextStoreV128(void* raw_context, uint64_t offset, __m128 value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  /*DPRINT("ctx v128 +%d = [%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X]\n", ics.rf[i->src1_reg].u64,
+         VECF4(ics.rf[i->src2_reg].v128,0), VECF4(ics.rf[i->src2_reg].v128,1), VECF4(ics.rf[i->src2_reg].v128,2), VECF4(ics.rf[i->src2_reg].v128,3),
+         VECI4(ics.rf[i->src2_reg].v128,0), VECI4(ics.rf[i->src2_reg].v128,1), VECI4(ics.rf[i->src2_reg].v128,2), VECF4(ics.rf[i->src2_reg].v128,3));*/
+}
+
+void TraceMemoryLoadI8(void* raw_context, uint64_t address, uint8_t value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  DPRINT("%d (%X) = load.i8 %.8X\n", (int8_t)value, value, address);
+}
+void TraceMemoryLoadI16(void* raw_context, uint64_t address, uint16_t value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  DPRINT("%d (%X) = load.i16 %.8X\n", (int16_t)value, value, address);
+}
+void TraceMemoryLoadI32(void* raw_context, uint64_t address, uint32_t value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  DPRINT("%d (%X) = load.i32 %.8X\n", (int32_t)value, value, address);
+}
+void TraceMemoryLoadI64(void* raw_context, uint64_t address, uint64_t value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  DPRINT("%lld (%llX) = load.i64 %.8X\n", (int64_t)value, value, address);
+}
+void TraceMemoryLoadF32(void* raw_context, uint64_t address, float value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  union {
+    float f;
+    uint32_t u;
+  } x;
+  x.f = value;
+  DPRINT("%e (%X) = load.f32 %.8X\n", x.f, x.u, address);
+}
+void TraceMemoryLoadF64(void* raw_context, uint64_t address, double value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  union {
+    double f;
+    uint64_t u;
+  } x;
+  x.f = value;
+  DPRINT("%lle (%llX) = load.f64 %.8X\n", x.f, x.u, address);
+}
+void TraceMemoryLoadV128(void* raw_context, uint64_t address, __m128 value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  //DPRINT("%d (%.X) = load.v128 +%d\n", (int8_t)value, value, offset);
+}
+
+void TraceMemoryStoreI8(void* raw_context, uint64_t address, uint8_t value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  DPRINT("store.i8 %.8X = %d (%X)\n", address, (int8_t)value, value);
+}
+void TraceMemoryStoreI16(void* raw_context, uint64_t address, uint16_t value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  DPRINT("store.i16 %.8X = %d (%X)\n", address, (int16_t)value, value);
+}
+void TraceMemoryStoreI32(void* raw_context, uint64_t address, uint32_t value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  DPRINT("store.i32 %.8X = %d (%X)\n", address, (int32_t)value, value);
+}
+void TraceMemoryStoreI64(void*
raw_context, uint64_t address, uint64_t value) { + auto thread_state = *((ThreadState**)raw_context); + DPRINT("store.i64 %.8X = %lld (%llX)\n", address, (int64_t)value, value); +} +void TraceMemoryStoreF32(void* raw_context, uint64_t address, float value) { + auto thread_state = *((ThreadState**)raw_context); + union { + float f; + uint32_t u; + } x; + x.f = value; + DPRINT("store.f32 %.8X = %e (%X)\n", address, x.f, x.u); +} +void TraceMemoryStoreF64(void* raw_context, uint64_t address, double value) { + auto thread_state = *((ThreadState**)raw_context); + union { + double f; + uint64_t u; + } x; + x.f = value; + DPRINT("store.f64 %.8X = %lle (%llX)\n", address, x.f, x.u); +} +void TraceMemoryStoreV128(void* raw_context, uint64_t address, __m128 value) { + auto thread_state = *((ThreadState**)raw_context); + /*DPRINT("ctx v128 +%d = [%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X]\n", ics.rf[i->src1_reg].u64, + VECF4(ics.rf[i->src2_reg].v128,0), VECF4(ics.rf[i->src2_reg].v128,1), VECF4(ics.rf[i->src2_reg].v128,2), VECF4(ics.rf[i->src2_reg].v128,3), + VECI4(ics.rf[i->src2_reg].v128,0), VECI4(ics.rf[i->src2_reg].v128,1), VECI4(ics.rf[i->src2_reg].v128,2), VECF4(ics.rf[i->src2_reg].v128,3));*/ +} + + +} // namespace lowering +} // namespace x64 +} // namespace backend +} // namespace alloy diff --git a/src/alloy/backend/x64/lowering/tracers.h b/src/alloy/backend/x64/lowering/tracers.h new file mode 100644 index 000000000..eccc87de9 --- /dev/null +++ b/src/alloy/backend/x64/lowering/tracers.h @@ -0,0 +1,62 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef ALLOY_BACKEND_X64_X64_LOWERING_TRACERS_H_ +#define ALLOY_BACKEND_X64_X64_LOWERING_TRACERS_H_ + +#include + + +namespace alloy { +namespace backend { +namespace x64 { +class X64Emitter; +namespace lowering { + +void TraceString(void* raw_context, const char* str); + +void TraceContextLoadI8(void* raw_context, uint64_t offset, uint8_t value); +void TraceContextLoadI16(void* raw_context, uint64_t offset, uint16_t value); +void TraceContextLoadI32(void* raw_context, uint64_t offset, uint32_t value); +void TraceContextLoadI64(void* raw_context, uint64_t offset, uint64_t value); +void TraceContextLoadF32(void* raw_context, uint64_t offset, float value); +void TraceContextLoadF64(void* raw_context, uint64_t offset, double value); +void TraceContextLoadV128(void* raw_context, uint64_t offset, __m128 value); + +void TraceContextStoreI8(void* raw_context, uint64_t offset, uint8_t value); +void TraceContextStoreI16(void* raw_context, uint64_t offset, uint16_t value); +void TraceContextStoreI32(void* raw_context, uint64_t offset, uint32_t value); +void TraceContextStoreI64(void* raw_context, uint64_t offset, uint64_t value); +void TraceContextStoreF32(void* raw_context, uint64_t offset, float value); +void TraceContextStoreF64(void* raw_context, uint64_t offset, double value); +void TraceContextStoreV128(void* raw_context, uint64_t offset, __m128 value); + +void TraceMemoryLoadI8(void* raw_context, uint64_t address, uint8_t value); +void TraceMemoryLoadI16(void* raw_context, uint64_t address, uint16_t value); +void TraceMemoryLoadI32(void* raw_context, uint64_t address, uint32_t value); +void TraceMemoryLoadI64(void* raw_context, uint64_t address, uint64_t value); +void TraceMemoryLoadF32(void* raw_context, uint64_t address, float value); +void TraceMemoryLoadF64(void* raw_context, uint64_t address, double value); +void TraceMemoryLoadV128(void* raw_context, uint64_t address, __m128 value); + +void TraceMemoryStoreI8(void* raw_context, uint64_t address, uint8_t value); +void TraceMemoryStoreI16(void* raw_context, uint64_t address, uint16_t value); +void TraceMemoryStoreI32(void* raw_context, uint64_t address, uint32_t value); +void TraceMemoryStoreI64(void* raw_context, uint64_t address, uint64_t value); +void TraceMemoryStoreF32(void* raw_context, uint64_t address, float value); +void TraceMemoryStoreF64(void* raw_context, uint64_t address, double value); +void TraceMemoryStoreV128(void* raw_context, uint64_t address, __m128 value); + +} // namespace lowering +} // namespace x64 +} // namespace backend +} // namespace alloy + + +#endif // ALLOY_BACKEND_X64_X64_LOWERING_TRACERS_H_ diff --git a/src/alloy/runtime/thread_state.cc b/src/alloy/runtime/thread_state.cc index 32edf177e..84add8bce 100644 --- a/src/alloy/runtime/thread_state.cc +++ b/src/alloy/runtime/thread_state.cc @@ -64,6 +64,5 @@ ThreadState* ThreadState::Get() { } uint32_t ThreadState::GetThreadID() { - XEASSERT(thread_state_); return thread_state_->thread_id_; } From 01c1dd6417acc7315ef28d715aa7a31314212ff8 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Mon, 27 Jan 2014 21:03:17 -0800 Subject: [PATCH 027/184] Fixing unpack type. 
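
This reroutes the FLOAT16_2 unpack through XmmUnaryOp. The core of the opcode
is a binary16-to-binary32 conversion, which the VCVTPH2PS mentioned in the
comment performs four lanes at a time. A scalar reference version, useful for
sanity-checking whatever SSE sequence eventually lands (my sketch, assuming
plain IEEE 754 handling of zeros, denormals, and inf/NaN):

    #include <cstdint>
    #include <cstring>

    // Reference half (binary16) -> float (binary32) conversion.
    float half_to_float(uint16_t h) {
      uint32_t sign = (uint32_t)(h & 0x8000) << 16;
      uint32_t exp = (h >> 10) & 0x1F;
      uint32_t man = h & 0x3FF;
      uint32_t bits;
      if (exp == 0x1F) {
        bits = sign | 0x7F800000 | (man << 13);  // inf/NaN
      } else if (exp != 0) {
        bits = sign | ((exp - 15 + 127) << 23) | (man << 13);  // normal
      } else if (man) {
        // Denormal: renormalize into a binary32 normal.
        exp = 127 - 15 + 1;
        while (!(man & 0x400)) {
          man <<= 1;
          --exp;
        }
        bits = sign | (exp << 23) | ((man & 0x3FF) << 13);
      } else {
        bits = sign;  // +/- zero
      }
      float f;
      std::memcpy(&f, &bits, sizeof(f));
      return f;
    }
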
--- src/alloy/backend/x64/lowering/lowering_sequences.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index 15ca6d19f..0c206bd37 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -2165,7 +2165,7 @@ table->AddSequence(OPCODE_UNPACK, [](X64Emitter& e, Instr*& i) { // Load source, move from tight pack of X16Y16.... to X16...Y16... // Also zero out the high end. // TODO(benvanik): special case constant unpacks that just get 0/1/etc. - IntUnaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src) { + XmmUnaryOp(e, i, 0, [](X64Emitter& e, Instr& i, const Xmm&, const Xmm& src) { // sx = src.iw >> 16; // sy = src.iw & 0xFFFF; // dest = { 3.0 + (sx / float(1 << 22)), @@ -2177,8 +2177,8 @@ table->AddSequence(OPCODE_UNPACK, [](X64Emitter& e, Instr*& i) { // xmm <<= 1w {0,0,packed,0} // xmm = VCVTPH2PS(xmm) {sx,sy,0,0} // xmm /= + UNIMPLEMENTED_SEQ(); }); - UNIMPLEMENTED_SEQ(); } else if (i->flags == PACK_TYPE_FLOAT16_4) { // Could be shared with FLOAT16_2. UNIMPLEMENTED_SEQ(); From 8894a0f86e09e1447a63bc3a08ce995ed864d777 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Mon, 27 Jan 2014 21:32:20 -0800 Subject: [PATCH 028/184] Fixing tracer formats. --- src/alloy/backend/ivm/ivm_intcode.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/alloy/backend/ivm/ivm_intcode.cc b/src/alloy/backend/ivm/ivm_intcode.cc index 3bc84a771..c5bfdd4b3 100644 --- a/src/alloy/backend/ivm/ivm_intcode.cc +++ b/src/alloy/backend/ivm/ivm_intcode.cc @@ -1554,7 +1554,7 @@ uint32_t IntCode_STORE_I8(IntCodeState& ics, const IntCode* i) { return IntCode_STORE_REGISTER_I8_DYNAMIC(ics, i); } DPRINT("store.i8 %.8X = %d (%X)\n", - address, ics.rf[i->src2_reg].i8, ics.rf[i->src2_reg].i8); + address, ics.rf[i->src2_reg].i8, ics.rf[i->src2_reg].u8); DFLUSH(); *((int8_t*)(ics.membase + address)) = ics.rf[i->src2_reg].i8; return IA_NEXT; @@ -1565,7 +1565,7 @@ uint32_t IntCode_STORE_I16(IntCodeState& ics, const IntCode* i) { return IntCode_STORE_REGISTER_I16_DYNAMIC(ics, i); } DPRINT("store.i16 %.8X = %d (%X)\n", - address, ics.rf[i->src2_reg].i16, ics.rf[i->src2_reg].i16); + address, ics.rf[i->src2_reg].i16, ics.rf[i->src2_reg].u16); DFLUSH(); *((int16_t*)(ics.membase + address)) = ics.rf[i->src2_reg].i16; return IA_NEXT; @@ -1576,7 +1576,7 @@ uint32_t IntCode_STORE_I32(IntCodeState& ics, const IntCode* i) { return IntCode_STORE_REGISTER_I32_DYNAMIC(ics, i); } DPRINT("store.i32 %.8X = %d (%X)\n", - address, ics.rf[i->src2_reg].i32, ics.rf[i->src2_reg].i32); + address, ics.rf[i->src2_reg].i32, ics.rf[i->src2_reg].u32); DFLUSH(); *((int32_t*)(ics.membase + address)) = ics.rf[i->src2_reg].i32; return IA_NEXT; @@ -1587,7 +1587,7 @@ uint32_t IntCode_STORE_I64(IntCodeState& ics, const IntCode* i) { return IntCode_STORE_REGISTER_I64_DYNAMIC(ics, i); } DPRINT("store.i64 %.8X = %lld (%llX)\n", - address, ics.rf[i->src2_reg].i64, ics.rf[i->src2_reg].i64); + address, ics.rf[i->src2_reg].i64, ics.rf[i->src2_reg].u64); DFLUSH(); *((int64_t*)(ics.membase + address)) = ics.rf[i->src2_reg].i64; return IA_NEXT; @@ -1595,7 +1595,7 @@ uint32_t IntCode_STORE_I64(IntCodeState& ics, const IntCode* i) { uint32_t IntCode_STORE_F32(IntCodeState& ics, const IntCode* i) { uint32_t address = ics.rf[i->src1_reg].u32; DPRINT("store.f32 %.8X = %e (%X)\n", - address, ics.rf[i->src2_reg].f32, ics.rf[i->src2_reg].i32); + 
address, ics.rf[i->src2_reg].f32, ics.rf[i->src2_reg].u32); DFLUSH(); *((float*)(ics.membase + address)) = ics.rf[i->src2_reg].f32; return IA_NEXT; @@ -1603,7 +1603,7 @@ uint32_t IntCode_STORE_F32(IntCodeState& ics, const IntCode* i) { uint32_t IntCode_STORE_F64(IntCodeState& ics, const IntCode* i) { uint32_t address = ics.rf[i->src1_reg].u32; DPRINT("store.f64 %.8X = %lle (%llX)\n", - address, ics.rf[i->src2_reg].f64, ics.rf[i->src2_reg].i64); + address, ics.rf[i->src2_reg].f64, ics.rf[i->src2_reg].u64); DFLUSH(); *((double*)(ics.membase + address)) = ics.rf[i->src2_reg].f64; return IA_NEXT; From da36baba8d0922a3c907736baff5ba78938f5e12 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Mon, 27 Jan 2014 21:32:58 -0800 Subject: [PATCH 029/184] Fixing 64-bit mov encoding. *shakes fist at xbyak for silently coercing* --- .../x64/lowering/lowering_sequences.cc | 20 +++++++++++-------- src/alloy/backend/x64/lowering/op_utils.inl | 15 ++++++++++++++ 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index 0c206bd37..77f84f7cc 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -707,7 +707,7 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) { CallNative(e, TraceContextStoreI64); #endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64C)) { - e.mov(e.qword[e.rcx + i->src1.offset], i->src2.value->constant.i64); + MovMem64(e, e.rcx + i->src1.offset, i->src2.value->constant.i64); #if DTRACE e.mov(e.rdx, i->src1.offset); e.mov(e.r8, i->src2.value->constant.i64); @@ -741,7 +741,7 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) { CallNative(e, TraceContextStoreF64); #endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64C)) { - e.mov(e.qword[e.rcx + i->src1.offset], i->src2.value->constant.i64); + MovMem64(e, e.rcx + i->src1.offset, i->src2.value->constant.i64); #if DTRACE e.mov(e.rdx, i->src1.offset); e.movsd(e.xmm0, e.qword[e.rcx + i->src1.offset]); @@ -759,8 +759,10 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) { CallNative(e, TraceContextStoreF64); #endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128C)) { - e.mov(e.qword[e.rcx + i->src1.offset], i->src2.value->constant.v128.low); - e.mov(e.qword[e.rcx + i->src1.offset + 8], i->src2.value->constant.v128.high); + // TODO(benvanik): check zero + // TODO(benvanik): correct order? 
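// (the two MovMem64 calls below store the constant's 64-bit halves at
//  +0 and +8; whether .low belongs at +0 depends on the guest v128
//  layout, which is what the ordering TODO above is asking)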
+ MovMem64(e, e.rcx + i->src1.offset, i->src2.value->constant.v128.low); + MovMem64(e, e.rcx + i->src1.offset + 8, i->src2.value->constant.v128.high); #if DTRACE e.mov(e.rdx, i->src1.offset); e.movups(e.xmm0, e.ptr[e.rcx + i->src1.offset]); @@ -1012,7 +1014,7 @@ table->AddSequence(OPCODE_STORE, [](X64Emitter& e, Instr*& i) { CallNative(e, TraceMemoryStoreI64); #endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64C)) { - e.mov(e.qword[addr], i->src2.value->constant.i64); + MovMem64(e, addr, i->src2.value->constant.i64); #if DTRACE e.lea(e.rdx, e.ptr[addr]); e.mov(e.r8, i->src2.value->constant.i64); @@ -1046,7 +1048,7 @@ table->AddSequence(OPCODE_STORE, [](X64Emitter& e, Instr*& i) { CallNative(e, TraceMemoryStoreF64); #endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64C)) { - e.mov(e.qword[addr], i->src2.value->constant.i64); + MovMem64(e, addr, i->src2.value->constant.i64); #if DTRACE e.lea(e.rdx, e.ptr[addr]); e.movsd(e.xmm0, e.ptr[addr]); @@ -1065,8 +1067,10 @@ table->AddSequence(OPCODE_STORE, [](X64Emitter& e, Instr*& i) { CallNative(e, TraceMemoryStoreV128); #endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128C)) { - e.mov(e.ptr[addr], i->src2.value->constant.v128.low); - e.mov(e.ptr[addr + 8], i->src2.value->constant.v128.high); + // TODO(benvanik): check zero + // TODO(benvanik): correct order? + MovMem64(e, addr, i->src2.value->constant.v128.low); + MovMem64(e, addr + 8, i->src2.value->constant.v128.high); #if DTRACE e.lea(e.rdx, e.ptr[addr]); e.movups(e.xmm0, e.ptr[addr]); diff --git a/src/alloy/backend/x64/lowering/op_utils.inl b/src/alloy/backend/x64/lowering/op_utils.inl index 424e8eeb2..7369f7eea 100644 --- a/src/alloy/backend/x64/lowering/op_utils.inl +++ b/src/alloy/backend/x64/lowering/op_utils.inl @@ -14,6 +14,21 @@ namespace { +// Moves a 64bit immediate into memory. +void MovMem64(X64Emitter& e, RegExp& addr, uint64_t v) { + if ((v & ~0x7FFFFFFF) == 0) { + // Fits under 31 bits, so just load using normal mov. + e.mov(e.qword[addr], v); + } else if ((v & ~0x7FFFFFFF) == ~0x7FFFFFFF) { + // Negative number that fits in 32bits. + e.mov(e.qword[addr], v); + } else { + // 64bit number that needs double movs. + e.mov(e.rax, v); + e.mov(e.qword[addr], e.rax); + } +} + // Sets EFLAGs with zf for the given value. // ZF = 1 if false, 0 = true (so jz = jump if false) void CheckBoolean(X64Emitter& e, Value* v) { From d67f786af8a95b50ebcdd208d6cfac11745e0253 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Mon, 27 Jan 2014 22:18:44 -0800 Subject: [PATCH 030/184] Some binary xmm ops. 
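The float and vector paths below use the destructive dest_src SSE forms; the
dot products use dpps, whose immediate picks the lanes: the high nibble
selects which products are summed, the low nibble where the sum lands. A
sketch with the matching intrinsics (SSE4.1 assumed; helper names
illustrative):

    #include <smmintrin.h>  // SSE4.1: _mm_dp_ps
    __m128 Dot3(__m128 a, __m128 b) {
      return _mm_dp_ps(a, b, 0x71);  // B01110001: xyz in, sum -> lane 0
    }
    __m128 Dot4(__m128 a, __m128 b) {
      return _mm_dp_ps(a, b, 0xF1);  // B11110001: xyzw in, sum -> lane 0
    }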
--- .../x64/lowering/lowering_sequences.cc | 78 +++++++++-- src/alloy/backend/x64/lowering/op_utils.inl | 125 +++++++++++++++++- 2 files changed, 186 insertions(+), 17 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index 77f84f7cc..da9dc6a97 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -1364,9 +1364,17 @@ table->AddSequence(OPCODE_ADD, [](X64Emitter& e, Instr*& i) { e.add(dest_src, src); }); } else if (IsFloatType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { + if (i.src1.value->type == FLOAT32_TYPE) { + e.addss(dest_src, src); + } else { + e.addsd(dest_src, src); + } + }); } else if (IsVecType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { + e.addps(dest_src, src); + }); } else { ASSERT_INVALID_TYPE(); } @@ -1438,9 +1446,17 @@ table->AddSequence(OPCODE_SUB, [](X64Emitter& e, Instr*& i) { e.sub(dest_src, src); }); } else if (IsFloatType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { + if (i.src1.value->type == FLOAT32_TYPE) { + e.subss(dest_src, src); + } else { + e.subsd(dest_src, src); + } + }); } else if (IsVecType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { + e.subps(dest_src, src); + }); } else { ASSERT_INVALID_TYPE(); } @@ -1478,9 +1494,19 @@ table->AddSequence(OPCODE_MUL, [](X64Emitter& e, Instr*& i) { e.mov(dest_src, Nax); }); } else if (IsFloatType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { + if (i.flags & ARITHMETIC_UNSIGNED) { UNIMPLEMENTED_SEQ(); } + if (i.src1.value->type == FLOAT32_TYPE) { + e.mulss(dest_src, src); + } else { + e.mulsd(dest_src, src); + } + }); } else if (IsVecType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { + if (i.flags & ARITHMETIC_UNSIGNED) { UNIMPLEMENTED_SEQ(); } + e.mulps(dest_src, src); + }); } else { ASSERT_INVALID_TYPE(); } @@ -1551,9 +1577,19 @@ table->AddSequence(OPCODE_DIV, [](X64Emitter& e, Instr*& i) { e.mov(dest_src, Nax); }); } else if (IsFloatType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { + if (i.flags & ARITHMETIC_UNSIGNED) { UNIMPLEMENTED_SEQ(); } + if (i.src1.value->type == FLOAT32_TYPE) { + e.divss(dest_src, src); + } else { + e.divsd(dest_src, src); + } + }); } else if (IsVecType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { + if (i.flags & ARITHMETIC_UNSIGNED) { UNIMPLEMENTED_SEQ(); } + e.divps(dest_src, src); + }); } else { ASSERT_INVALID_TYPE(); } @@ -1703,7 +1739,12 @@ table->AddSequence(OPCODE_LOG2, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_DOT_PRODUCT_3, [](X64Emitter& e, Instr*& i) { if (IsVecType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { + // http://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx + // 
TODO(benvanik): verify ordering + e.db(0xCC); + e.dpps(dest_src, src, B01110001); + }); } else { ASSERT_INVALID_TYPE(); } @@ -1713,7 +1754,12 @@ table->AddSequence(OPCODE_DOT_PRODUCT_3, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_DOT_PRODUCT_4, [](X64Emitter& e, Instr*& i) { if (IsVecType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { + // http://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx + // TODO(benvanik): verify ordering + e.db(0xCC); + e.dpps(dest_src, src, B11110001); + }); } else { ASSERT_INVALID_TYPE(); } @@ -1729,7 +1775,9 @@ table->AddSequence(OPCODE_AND, [](X64Emitter& e, Instr*& i) { e.and(dest_src, src); }); } else if (IsVecType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { + e.pand(dest_src, src); + }); } else { ASSERT_INVALID_TYPE(); } @@ -1745,7 +1793,9 @@ table->AddSequence(OPCODE_OR, [](X64Emitter& e, Instr*& i) { e.or(dest_src, src); }); } else if (IsVecType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { + e.por(dest_src, src); + }); } else { ASSERT_INVALID_TYPE(); } @@ -1761,7 +1811,9 @@ table->AddSequence(OPCODE_XOR, [](X64Emitter& e, Instr*& i) { e.xor(dest_src, src); }); } else if (IsVecType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { + e.pxor(dest_src, src); + }); } else { ASSERT_INVALID_TYPE(); } diff --git a/src/alloy/backend/x64/lowering/op_utils.inl b/src/alloy/backend/x64/lowering/op_utils.inl index 7369f7eea..1d538006f 100644 --- a/src/alloy/backend/x64/lowering/op_utils.inl +++ b/src/alloy/backend/x64/lowering/op_utils.inl @@ -715,17 +715,15 @@ void IntTernaryOp(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vvc_fn vvc_fn, vcv_fn // Since alot of SSE ops can take dest + src, just do that. // Worst case the callee can dedupe. typedef void(xmm_v_fn)(X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src); -template void XmmUnaryOpV(X64Emitter& e, Instr*& i, xmm_v_fn v_fn, - T& dest, T& src1) { + Xmm& dest, Xmm& src1) { e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src1, 0); v_fn(e, *i, dest, src1); e.EndOp(dest, src1); } -template void XmmUnaryOpC(X64Emitter& e, Instr*& i, xmm_v_fn v_fn, - T& dest, Value* src1) { + Xmm& dest, Value* src1) { e.BeginOp(i->dest, dest, REG_DEST); if (src1->type == FLOAT32_TYPE) { e.mov(e.eax, (uint32_t)src1->constant.i32); @@ -771,6 +769,125 @@ void XmmUnaryOp(X64Emitter& e, Instr*& i, uint32_t flags, xmm_v_fn v_fn) { } }; +// TODO(benvanik): allow a vvv form for dest = src1 + src2 that new SSE +// ops support. +typedef void(xmm_vv_fn)(X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src); +void XmmBinaryOpVV(X64Emitter& e, Instr*& i, xmm_vv_fn vv_fn, + Xmm& dest, Xmm& src1, Xmm& src2) { + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0, + i->src2.value, src2, 0); + if (dest == src1) { + vv_fn(e, *i, dest, src2); + } else if (dest == src2) { + if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { + vv_fn(e, *i, dest, src1); + } else { + // Eww. 
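// (non-commutative op with dest aliased to src2: stage src1 in xmm0,
//  run the destructive op there, then move the result into dest)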
+ e.movaps(e.xmm0, src1); + vv_fn(e, *i, e.xmm0, src2); + e.movaps(dest, e.xmm0); + } + } else { + e.movaps(dest, src1); + vv_fn(e, *i, dest, src2); + } + e.EndOp(dest, src1, src2); +} +void XmmBinaryOpVC(X64Emitter& e, Instr*& i, xmm_vv_fn vv_fn, + Xmm& dest, Xmm& src1, Value* src2) { + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0); + if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { + if (src2->type == FLOAT32_TYPE) { + e.mov(e.eax, (uint32_t)src2->constant.i32); + e.movss(dest, e.eax); + } else if (src2->type == FLOAT64_TYPE) { + e.mov(e.rax, (uint64_t)src2->constant.i64); + e.movsd(dest, e.rax); + } else { + UNIMPLEMENTED_SEQ(); + } + vv_fn(e, *i, dest, src1); + } else { + if (dest != src1) { + e.movaps(dest, src1); + } + if (src2->type == FLOAT32_TYPE) { + e.mov(e.eax, (uint32_t)src2->constant.i32); + e.movss(e.xmm0, e.eax); + } else if (src2->type == FLOAT64_TYPE) { + e.mov(e.rax, (uint64_t)src2->constant.i64); + e.movsd(e.xmm0, e.rax); + } else { + UNIMPLEMENTED_SEQ(); + } + vv_fn(e, *i, dest, e.xmm0); + } + e.EndOp(dest, src1); +} +void XmmBinaryOpCV(X64Emitter& e, Instr*& i, xmm_vv_fn vv_fn, + Xmm& dest, Value* src1, Xmm& src2) { + e.BeginOp(i->dest, dest, REG_DEST, + i->src2.value, src2, 0); + if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { + if (src1->type == FLOAT32_TYPE) { + e.mov(e.eax, (uint32_t)src1->constant.i32); + e.movss(dest, e.eax); + } else if (src1->type == FLOAT64_TYPE) { + e.mov(e.rax, (uint64_t)src1->constant.i64); + e.movsd(dest, e.rax); + } else { + UNIMPLEMENTED_SEQ(); + } + vv_fn(e, *i, dest, src2); + } else { + auto real_src2 = src2; + if (dest == src2) { + e.movaps(e.xmm0, src2); + real_src2 = e.xmm0; + } + if (src1->type == FLOAT32_TYPE) { + e.mov(e.eax, (uint32_t)src1->constant.i32); + e.movss(dest, e.eax); + } else if (src1->type == FLOAT64_TYPE) { + e.mov(e.rax, (uint64_t)src1->constant.i64); + e.movsd(dest, e.rax); + } else { + UNIMPLEMENTED_SEQ(); + } + vv_fn(e, *i, dest, real_src2); + } + e.EndOp(dest, src2); +} +void XmmBinaryOp(X64Emitter& e, Instr*& i, uint32_t flags, xmm_vv_fn vv_fn) { + // TODO(benvanik): table lookup. This linear scan is slow. + XEASSERT(i->dest->type == i->src1.value->type); + if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_F32, SIG_TYPE_F32) || + i->Match(SIG_TYPE_IGNORE, SIG_TYPE_F64, SIG_TYPE_F64) || + i->Match(SIG_TYPE_IGNORE, SIG_TYPE_V128, SIG_TYPE_V128)) { + Xmm dest, src1, src2; + XmmBinaryOpVV(e, i, vv_fn, dest, src1, src2); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_F32, SIG_TYPE_F32C) || + i->Match(SIG_TYPE_IGNORE, SIG_TYPE_F64, SIG_TYPE_F64C) || + i->Match(SIG_TYPE_IGNORE, SIG_TYPE_V128, SIG_TYPE_V128C)) { + Xmm dest, src1; + XmmBinaryOpVC(e, i, vv_fn, dest, src1, i->src2.value); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_F32C, SIG_TYPE_F32) || + i->Match(SIG_TYPE_IGNORE, SIG_TYPE_F64C, SIG_TYPE_F64) || + i->Match(SIG_TYPE_IGNORE, SIG_TYPE_V128C, SIG_TYPE_V128)) { + Xmm dest, src2; + XmmBinaryOpCV(e, i, vv_fn, dest, i->src1.value, src2); + } else { + ASSERT_INVALID_TYPE(); + } + if (flags & ARITHMETIC_SET_CARRY) { + // EFLAGS should have CA set? + // (so long as we don't fuck with it) + // UNIMPLEMENTED_SEQ(); + } +}; + } // namespace #endif // ALLOY_BACKEND_X64_X64_LOWERING_OP_UTILS_INL_ From 5b2e44b0e859d19cbe5a4f3b7f740531d6d7cd88 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Mon, 27 Jan 2014 22:47:37 -0800 Subject: [PATCH 031/184] Cleaning up some constant handling. 
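The LIKE_REG/NAX_LIKE macros hoisted into op_utils.inl pick the RAX alias
whose bit width matches the operand, so the staging movs keep one width on
both sides instead of relying on xbyak to coerce. Roughly:

    // operand bits -> scratch register NAX_LIKE resolves to:
    //   8 -> al, 16 -> ax, 32 -> eax, 64 -> rax
    //
    //   auto Nax = NAX_LIKE(src1);  // same width as src1
    //   e.mov(Nax, src1);           // both sides agree; nothing coerced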
--- .../x64/lowering/lowering_sequences.cc | 2 -- src/alloy/backend/x64/lowering/op_utils.inl | 25 ++++++++++++------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index da9dc6a97..e433991e4 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -1464,8 +1464,6 @@ table->AddSequence(OPCODE_SUB, [](X64Emitter& e, Instr*& i) { return true; }); -#define LIKE_REG(dest, like) Operand(dest.getIdx(), dest.getKind(), like.getBit(), false) - table->AddSequence(OPCODE_MUL, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { IntBinaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { diff --git a/src/alloy/backend/x64/lowering/op_utils.inl b/src/alloy/backend/x64/lowering/op_utils.inl index 1d538006f..8f924e59d 100644 --- a/src/alloy/backend/x64/lowering/op_utils.inl +++ b/src/alloy/backend/x64/lowering/op_utils.inl @@ -14,6 +14,9 @@ namespace { +#define LIKE_REG(dest, like) Reg(dest.getIdx(), dest.getKind(), like.getBit(), false) +#define NAX_LIKE(like) Reg(e.rax.getIdx(), e.rax.getKind(), like.getBit(), false) + // Moves a 64bit immediate into memory. void MovMem64(X64Emitter& e, RegExp& addr, uint64_t v) { if ((v & ~0x7FFFFFFF) == 0) { @@ -375,9 +378,10 @@ void IntBinaryOpVV(X64Emitter& e, Instr*& i, vv_fn vv_fn, vv_fn(e, *i, dest, src1); } else { // Eww. - e.mov(e.rax, src1); - vv_fn(e, *i, e.rax, src2); - e.mov(dest, e.rax); + auto Nax = NAX_LIKE(src1); + e.mov(Nax, src1); + vv_fn(e, *i, Nax, src2); + e.mov(dest, Nax); } } else { e.mov(dest, src1); @@ -423,9 +427,10 @@ void IntBinaryOpCV(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn, vc_fn(e, *i, dest, (uint32_t)src1->get_constant(CT())); } else { // Eww. - e.mov(e.rax, src2); + auto Nax = NAX_LIKE(src2); + e.mov(Nax, src2); e.mov(dest, (uint32_t)src1->get_constant(CT())); - vv_fn(e, *i, dest, e.rax); + vv_fn(e, *i, dest, Nax); } } else { e.mov(dest, src2); @@ -574,9 +579,10 @@ void IntTernaryOpVVC(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vvc_fn vvc_fn, vvc_fn(e, *i, dest, src1, (uint32_t)src3->get_constant(CT())); } else { // Eww. - e.mov(e.rax, src2); + auto Nax = NAX_LIKE(src2); + e.mov(Nax, src2); e.mov(dest, src1); - vvc_fn(e, *i, dest, e.rax, (uint32_t)src3->get_constant(CT())); + vvc_fn(e, *i, dest, Nax, (uint32_t)src3->get_constant(CT())); } } else { e.mov(dest, src1); @@ -622,9 +628,10 @@ void IntTernaryOpVCV(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vcv_fn vcv_fn, vcv_fn(e, *i, dest, (uint32_t)src2->get_constant(CT()), src1); } else { // Eww. - e.mov(e.rax, src3); + auto Nax = NAX_LIKE(src3); + e.mov(Nax, src3); e.mov(dest, src1); - vcv_fn(e, *i, dest, (uint32_t)src2->get_constant(CT()), e.rax); + vcv_fn(e, *i, dest, (uint32_t)src2->get_constant(CT()), Nax); } } else { e.mov(dest, src1); From 0908891bb25934a6243138345a1d90966483ac40 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Mon, 27 Jan 2014 22:56:56 -0800 Subject: [PATCH 032/184] v128 tracing. 
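The v128 tracers now print each lane twice, as a float and as its raw bits.
A standalone sketch of the format (plain union here; tracers.cc uses the
MSVC __m128 lane accessors):

    #include <cstdint>
    #include <cstdio>
    union Lanes { float f[4]; uint32_t i[4]; };
    void PrintV128(uint64_t offset, const Lanes& v) {
      std::printf("ctx v128 +%d = [%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X]\n",
                  (int)offset,
                  v.f[0], v.f[1], v.f[2], v.f[3],
                  v.i[0], v.i[1], v.i[2], v.i[3]);
    }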
--- src/alloy/backend/ivm/ivm_intcode.cc | 12 +++++------ .../x64/lowering/lowering_sequences.cc | 2 +- src/alloy/backend/x64/lowering/tracers.cc | 21 ++++++++++++------- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/src/alloy/backend/ivm/ivm_intcode.cc b/src/alloy/backend/ivm/ivm_intcode.cc index c5bfdd4b3..873a083b0 100644 --- a/src/alloy/backend/ivm/ivm_intcode.cc +++ b/src/alloy/backend/ivm/ivm_intcode.cc @@ -40,10 +40,10 @@ namespace ivm { #define DPRINT #define DFLUSH() -//#define IPRINT if (ics.thread_state->thread_id() == 1) printf -//#define IFLUSH() fflush(stdout) -//#define DPRINT if (ics.thread_state->thread_id() == 1) printf -//#define DFLUSH() fflush(stdout) +#define IPRINT if (ics.thread_state->thread_id() == 1) printf +#define IFLUSH() fflush(stdout) +#define DPRINT if (ics.thread_state->thread_id() == 1) printf +#define DFLUSH() fflush(stdout) #if XE_CPU_BIGENDIAN #define VECB16(v,n) (v.b16[n]) @@ -1515,7 +1515,7 @@ uint32_t IntCode_LOAD_V128(IntCodeState& ics, const IntCode* i) { for (int n = 0; n < 4; n++) { VECI4(dest,n) = *((uint32_t*)(ics.membase + address + n * 4)); } - DPRINT("[%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X] = load v128 %.8X\n", + DPRINT("[%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X] = load.v128 %.8X\n", VECF4(dest,0), VECF4(dest,1), VECF4(dest,2), VECF4(dest,3), VECI4(dest,0), VECI4(dest,1), VECI4(dest,2), VECI4(dest,3), address); @@ -1610,7 +1610,7 @@ uint32_t IntCode_STORE_F64(IntCodeState& ics, const IntCode* i) { } uint32_t IntCode_STORE_V128(IntCodeState& ics, const IntCode* i) { uint32_t address = ics.rf[i->src1_reg].u32; - DPRINT("store v128 %.8X = [%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X]\n", + DPRINT("store.v128 %.8X = [%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X]\n", address, VECF4(ics.rf[i->src2_reg].v128,0), VECF4(ics.rf[i->src2_reg].v128,1), VECF4(ics.rf[i->src2_reg].v128,2), VECF4(ics.rf[i->src2_reg].v128,3), VECI4(ics.rf[i->src2_reg].v128,0), VECI4(ics.rf[i->src2_reg].v128,1), VECI4(ics.rf[i->src2_reg].v128,2), VECI4(ics.rf[i->src2_reg].v128,3)); diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index e433991e4..8c37e7f5b 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -756,7 +756,7 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) { #if DTRACE e.mov(e.rdx, i->src1.offset); e.movaps(e.xmm0, src); - CallNative(e, TraceContextStoreF64); + CallNative(e, TraceContextStoreV128); #endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128C)) { // TODO(benvanik): check zero diff --git a/src/alloy/backend/x64/lowering/tracers.cc b/src/alloy/backend/x64/lowering/tracers.cc index a0a2f212b..1115f360d 100644 --- a/src/alloy/backend/x64/lowering/tracers.cc +++ b/src/alloy/backend/x64/lowering/tracers.cc @@ -77,7 +77,9 @@ void TraceContextLoadF64(void* raw_context, uint64_t offset, double value) { } void TraceContextLoadV128(void* raw_context, uint64_t offset, __m128 value) { auto thread_state = *((ThreadState**)raw_context); - //DPRINT("%d (%.X) = ctx i8 +%d\n", (int8_t)value, value, offset); + DPRINT("[%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X] = ctx v128 +%d\n", offset, + value.m128_f32[0], value.m128_f32[1], value.m128_f32[2], value.m128_f32[3], + value.m128_i32[0], value.m128_i32[1], value.m128_i32[2], value.m128_i32[3]); } void TraceContextStoreI8(void* raw_context, uint64_t offset, uint8_t value) { @@ -116,9 +118,9 @@ void 
TraceContextStoreF64(void* raw_context, uint64_t offset, double value) { } void TraceContextStoreV128(void* raw_context, uint64_t offset, __m128 value) { auto thread_state = *((ThreadState**)raw_context); - /*DPRINT("ctx v128 +%d = [%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X]\n", ics.rf[i->src1_reg].u64, - VECF4(ics.rf[i->src2_reg].v128,0), VECF4(ics.rf[i->src2_reg].v128,1), VECF4(ics.rf[i->src2_reg].v128,2), VECF4(ics.rf[i->src2_reg].v128,3), - VECI4(ics.rf[i->src2_reg].v128,0), VECI4(ics.rf[i->src2_reg].v128,1), VECI4(ics.rf[i->src2_reg].v128,2), VECF4(ics.rf[i->src2_reg].v128,3));*/ + DPRINT("ctx v128 +%d = [%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X]\n", offset, + value.m128_f32[0], value.m128_f32[1], value.m128_f32[2], value.m128_f32[3], + value.m128_i32[0], value.m128_i32[1], value.m128_i32[2], value.m128_i32[3]); } void TraceMemoryLoadI8(void* raw_context, uint64_t address, uint8_t value) { @@ -157,7 +159,10 @@ void TraceMemoryLoadF64(void* raw_context, uint64_t address, double value) { } void TraceMemoryLoadV128(void* raw_context, uint64_t address, __m128 value) { auto thread_state = *((ThreadState**)raw_context); - //DPRINT("%d (%.X) = load.v128 +%d\n", (int8_t)value, value, offset); + DPRINT("[%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X] = load.v128 %.8X\n", + value.m128_f32[0], value.m128_f32[1], value.m128_f32[2], value.m128_f32[3], + value.m128_i32[0], value.m128_i32[1], value.m128_i32[2], value.m128_i32[3], + address); } void TraceMemoryStoreI8(void* raw_context, uint64_t address, uint8_t value) { @@ -196,9 +201,9 @@ void TraceMemoryStoreF64(void* raw_context, uint64_t address, double value) { } void TraceMemoryStoreV128(void* raw_context, uint64_t address, __m128 value) { auto thread_state = *((ThreadState**)raw_context); - /*DPRINT("ctx v128 +%d = [%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X]\n", ics.rf[i->src1_reg].u64, - VECF4(ics.rf[i->src2_reg].v128,0), VECF4(ics.rf[i->src2_reg].v128,1), VECF4(ics.rf[i->src2_reg].v128,2), VECF4(ics.rf[i->src2_reg].v128,3), - VECI4(ics.rf[i->src2_reg].v128,0), VECI4(ics.rf[i->src2_reg].v128,1), VECI4(ics.rf[i->src2_reg].v128,2), VECF4(ics.rf[i->src2_reg].v128,3));*/ + DPRINT("store.v128 %.8X = [%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X]\n", address, + value.m128_f32[0], value.m128_f32[1], value.m128_f32[2], value.m128_f32[3], + value.m128_i32[0], value.m128_i32[1], value.m128_i32[2], value.m128_i32[3]); } From 5421108b9ef17f970120f592400b4718173b32b9 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Mon, 27 Jan 2014 23:00:26 -0800 Subject: [PATCH 033/184] Bad dp checks. 
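The dot-product opcodes take v128 operands but produce a scalar f32, so
guarding on the destination type never matched. The shape that made the old
check dead code (per the opcode signatures):

    // v3.f32 = dot_product_3 v1.v128, v2.v128
    // IsVecType(i->dest->type)       -> false (f32): old guard, never taken
    // IsVecType(i->src1.value->type) -> true (v128): fixed guard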
--- src/alloy/backend/x64/lowering/lowering_sequences.cc | 4 ++-- src/alloy/backend/x64/lowering/op_utils.inl | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index 8c37e7f5b..5a2b8b8a3 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -1736,7 +1736,7 @@ table->AddSequence(OPCODE_LOG2, [](X64Emitter& e, Instr*& i) { }); table->AddSequence(OPCODE_DOT_PRODUCT_3, [](X64Emitter& e, Instr*& i) { - if (IsVecType(i->dest->type)) { + if (IsVecType(i->src1.value->type)) { XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { // http://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx // TODO(benvanik): verify ordering @@ -1751,7 +1751,7 @@ table->AddSequence(OPCODE_DOT_PRODUCT_3, [](X64Emitter& e, Instr*& i) { }); table->AddSequence(OPCODE_DOT_PRODUCT_4, [](X64Emitter& e, Instr*& i) { - if (IsVecType(i->dest->type)) { + if (IsVecType(i->src1.value->type)) { XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { // http://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx // TODO(benvanik): verify ordering diff --git a/src/alloy/backend/x64/lowering/op_utils.inl b/src/alloy/backend/x64/lowering/op_utils.inl index 8f924e59d..b5ebc792b 100644 --- a/src/alloy/backend/x64/lowering/op_utils.inl +++ b/src/alloy/backend/x64/lowering/op_utils.inl @@ -869,7 +869,6 @@ void XmmBinaryOpCV(X64Emitter& e, Instr*& i, xmm_vv_fn vv_fn, } void XmmBinaryOp(X64Emitter& e, Instr*& i, uint32_t flags, xmm_vv_fn vv_fn) { // TODO(benvanik): table lookup. This linear scan is slow. - XEASSERT(i->dest->type == i->src1.value->type); if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_F32, SIG_TYPE_F32) || i->Match(SIG_TYPE_IGNORE, SIG_TYPE_F64, SIG_TYPE_F64) || i->Match(SIG_TYPE_IGNORE, SIG_TYPE_V128, SIG_TYPE_V128)) { From 465c3a41ddb1f972a28a290e76a0070efcb22d79 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 28 Jan 2014 00:19:05 -0800 Subject: [PATCH 034/184] Hacking. --- .../x64/lowering/lowering_sequences.cc | 205 ++++++++++++++++-- src/alloy/backend/x64/lowering/op_utils.inl | 18 +- src/alloy/backend/x64/lowering/tracers.cc | 5 +- .../compiler/passes/value_reduction_pass.cc | 18 +- 4 files changed, 213 insertions(+), 33 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index 5a2b8b8a3..a065ecf66 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -44,6 +44,11 @@ namespace { // Basically, this identity must hold: // shuffle(vec, b00011011) -> {x,y,z,w} => {x,y,z,w} // All indices and operations must respect that. +// +// Memory (big endian): +// [00 01 02 03] [04 05 06 07] [08 09 0A 0B] [0C 0D 0E 0F] (x, y, z, w) +// load into xmm register: +// [0F 0E 0D 0C] [0B 0A 09 08] [07 06 05 04] [03 02 01 00] (w, z, y, x) void Dummy() { // @@ -498,7 +503,63 @@ table->AddSequence(OPCODE_TRUNCATE, [](X64Emitter& e, Instr*& i) { }); table->AddSequence(OPCODE_CONVERT, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); + if (i->Match(SIG_TYPE_I32, SIG_TYPE_F32)) { + Reg32 dest; + Xmm src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + // TODO(benvanik): additional checks for saturation/etc? cvtt* (trunc?) 
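// (the cvtss2si below rounds according to the current MXCSR rounding
//  mode; the cvtt* forms named in the TODO truncate toward zero instead)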
+ e.cvtss2si(dest, src); + e.EndOp(dest, src); + } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_F64)) { + Reg32 dest; + Xmm src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + // TODO(benvanik): additional checks for saturation/etc? cvtt* (trunc?) + e.cvtsd2ss(e.xmm0, src); + e.cvtss2si(dest, e.xmm0); + e.EndOp(dest, src); + } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_F64)) { + Reg64 dest; + Xmm src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + // TODO(benvanik): additional checks for saturation/etc? cvtt* (trunc?) + e.cvtsd2si(dest, src); + e.EndOp(dest, src); + } else if (i->Match(SIG_TYPE_F32, SIG_TYPE_I32)) { + Xmm dest; + Reg32 src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + // TODO(benvanik): additional checks for saturation/etc? + e.cvtsi2ss(dest, src); + e.EndOp(dest, src); + } else if (i->Match(SIG_TYPE_F32, SIG_TYPE_F64)) { + Xmm dest, src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + // TODO(benvanik): additional checks for saturation/etc? + e.cvtsd2ss(dest, src); + e.EndOp(dest, src); + } else if (i->Match(SIG_TYPE_F64, SIG_TYPE_I64)) { + Xmm dest; + Reg64 src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + // TODO(benvanik): additional checks for saturation/etc? + e.cvtsi2sd(dest, src); + e.EndOp(dest, src); + } else if (i->Match(SIG_TYPE_F64, SIG_TYPE_F32)) { + Xmm dest, src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + e.cvtss2sd(dest, src); + e.EndOp(dest, src); + } else { + UNIMPLEMENTED_SEQ(); + } i = e.Advance(i); return true; }); @@ -506,9 +567,56 @@ table->AddSequence(OPCODE_CONVERT, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_ROUND, [](X64Emitter& e, Instr*& i) { // flags = ROUND_TO_* if (IsFloatType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmUnaryOp(e, i, 0, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) { + if (i.src1.value->type == FLOAT32_TYPE) { + switch (i.flags) { + case ROUND_TO_ZERO: + e.roundss(dest, src, B00000011); + break; + case ROUND_TO_NEAREST: + e.roundss(dest, src, B00000000); + break; + case ROUND_TO_MINUS_INFINITY: + e.roundss(dest, src, B00000001); + break; + case ROUND_TO_POSITIVE_INFINITY: + e.roundss(dest, src, B00000010); + break; + } + } else { + switch (i.flags) { + case ROUND_TO_ZERO: + e.roundsd(dest, src, B00000011); + break; + case ROUND_TO_NEAREST: + e.roundsd(dest, src, B00000000); + break; + case ROUND_TO_MINUS_INFINITY: + e.roundsd(dest, src, B00000001); + break; + case ROUND_TO_POSITIVE_INFINITY: + e.roundsd(dest, src, B00000010); + break; + } + } + }); } else if (IsVecType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmUnaryOp(e, i, 0, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) { + switch (i.flags) { + case ROUND_TO_ZERO: + e.roundps(dest, src, B00000011); + break; + case ROUND_TO_NEAREST: + e.roundps(dest, src, B00000000); + break; + case ROUND_TO_MINUS_INFINITY: + e.roundps(dest, src, B00000001); + break; + case ROUND_TO_POSITIVE_INFINITY: + e.roundps(dest, src, B00000010); + break; + } + }); } else { ASSERT_INVALID_TYPE(); } @@ -634,7 +742,7 @@ table->AddSequence(OPCODE_LOAD_CONTEXT, [](X64Emitter& e, Instr*& i) { e.EndOp(dest); #if DTRACE e.mov(e.rdx, i->src1.offset); - e.movaps(e.xmm0, dest); + e.lea(e.r8, Stash(e, dest)); CallNative(e, TraceContextLoadV128); #endif // DTRACE } else { @@ -755,7 +863,7 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) { e.EndOp(src); #if DTRACE e.mov(e.rdx, i->src1.offset); - e.movaps(e.xmm0, src); + e.lea(e.r8, 
Stash(e, src)); CallNative(e, TraceContextStoreV128); #endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128C)) { @@ -765,7 +873,7 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) { MovMem64(e, e.rcx + i->src1.offset + 8, i->src2.value->constant.v128.high); #if DTRACE e.mov(e.rdx, i->src1.offset); - e.movups(e.xmm0, e.ptr[e.rcx + i->src1.offset]); + e.lea(e.r8, e.ptr[e.rcx + i->src1.offset]); CallNative(e, TraceContextStoreV128); #endif // DTRACE } else { @@ -886,7 +994,7 @@ table->AddSequence(OPCODE_LOAD, [](X64Emitter& e, Instr*& i) { e.db(0xCC); #if DTRACE e.lea(e.rdx, e.ptr[addr]); - e.movaps(e.xmm0, dest); + e.lea(e.r8, Stash(e, dest)); CallNative(e, TraceMemoryLoadV128); #endif // DTRACE } else { @@ -1063,7 +1171,7 @@ table->AddSequence(OPCODE_STORE, [](X64Emitter& e, Instr*& i) { e.db(0xCC); #if DTRACE e.lea(e.rdx, e.ptr[addr]); - e.movaps(e.xmm0, src); + e.lea(e.r8, Stash(e, src)); CallNative(e, TraceMemoryStoreV128); #endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128C)) { @@ -1073,7 +1181,7 @@ table->AddSequence(OPCODE_STORE, [](X64Emitter& e, Instr*& i) { MovMem64(e, addr + 8, i->src2.value->constant.v128.high); #if DTRACE e.lea(e.rdx, e.ptr[addr]); - e.movups(e.xmm0, e.ptr[addr]); + e.lea(e.r8, e.ptr[addr]); CallNative(e, TraceMemoryStoreV128); #endif // DTRACE } else { @@ -2107,14 +2215,57 @@ table->AddSequence(OPCODE_INSERT, [](X64Emitter& e, Instr*& i) { return true; }); +// TODO(benvanik): sequence extract/splat: +// v0.i32 = extract v0.v128, 0 +// v0.v128 = splat v0.i32 +// This can be a single broadcast. + table->AddSequence(OPCODE_EXTRACT, [](X64Emitter& e, Instr*& i) { if (IsVecType(i->src1.value->type)) { if (i->dest->type == INT8_TYPE) { - UNIMPLEMENTED_SEQ(); + Reg8 dest; + Xmm src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + if (i->src2.value->IsConstant()) { + e.pextrb(dest, src, i->src2.value->constant.i8); + } else { + UNIMPLEMENTED_SEQ(); + } + e.EndOp(dest, src); } else if (i->dest->type == INT16_TYPE) { - UNIMPLEMENTED_SEQ(); + Reg16 dest; + Xmm src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + if (i->src2.value->IsConstant()) { + e.pextrw(dest, src, i->src2.value->constant.i8); + } else { + UNIMPLEMENTED_SEQ(); + } + e.EndOp(dest, src); } else if (i->dest->type == INT32_TYPE) { - UNIMPLEMENTED_SEQ(); + Reg32 dest; + Xmm src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + if (i->src2.value->IsConstant()) { + e.pextrd(dest, src, i->src2.value->constant.i8); + } else { + UNIMPLEMENTED_SEQ(); + } + e.EndOp(dest, src); + } else if (i->dest->type == FLOAT32_TYPE) { + Reg32 dest; + Xmm src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + if (i->src2.value->IsConstant()) { + e.extractps(dest, src, i->src2.value->constant.i8); + } else { + UNIMPLEMENTED_SEQ(); + } + e.EndOp(dest, src); } else { ASSERT_INVALID_TYPE(); } @@ -2128,13 +2279,35 @@ table->AddSequence(OPCODE_EXTRACT, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_SPLAT, [](X64Emitter& e, Instr*& i) { if (IsVecType(i->dest->type)) { if (i->src1.value->type == INT8_TYPE) { - UNIMPLEMENTED_SEQ(); + Xmm dest; + Reg8 src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + e.pinsrb(e.xmm0, src, 0); + e.vpbroadcastb(dest, e.xmm0); + e.EndOp(dest, src); } else if (i->src1.value->type == INT16_TYPE) { - UNIMPLEMENTED_SEQ(); + Xmm dest; + Reg16 src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + e.pinsrw(e.xmm0, src, 0); + 
e.vpbroadcastw(dest, e.xmm0); + e.EndOp(dest, src); } else if (i->src1.value->type == INT32_TYPE) { - UNIMPLEMENTED_SEQ(); + Xmm dest; + Reg32 src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + e.pinsrd(e.xmm0, src, 0); + e.vpbroadcastd(dest, e.xmm0); + e.EndOp(dest, src); } else if (i->src1.value->type == FLOAT32_TYPE) { - UNIMPLEMENTED_SEQ(); + Xmm dest, src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + e.vbroadcastss(dest, src); + e.EndOp(dest, src); } else { ASSERT_INVALID_TYPE(); } diff --git a/src/alloy/backend/x64/lowering/op_utils.inl b/src/alloy/backend/x64/lowering/op_utils.inl index b5ebc792b..3f1c73f90 100644 --- a/src/alloy/backend/x64/lowering/op_utils.inl +++ b/src/alloy/backend/x64/lowering/op_utils.inl @@ -17,6 +17,12 @@ namespace { #define LIKE_REG(dest, like) Reg(dest.getIdx(), dest.getKind(), like.getBit(), false) #define NAX_LIKE(like) Reg(e.rax.getIdx(), e.rax.getKind(), like.getBit(), false) +Address Stash(X64Emitter& e, const Xmm& r) { + auto addr = e.ptr[e.rsp + 40]; + e.movaps(addr, r); + return addr; +} + // Moves a 64bit immediate into memory. void MovMem64(X64Emitter& e, RegExp& addr, uint64_t v) { if ((v & ~0x7FFFFFFF) == 0) { @@ -869,19 +875,13 @@ void XmmBinaryOpCV(X64Emitter& e, Instr*& i, xmm_vv_fn vv_fn, } void XmmBinaryOp(X64Emitter& e, Instr*& i, uint32_t flags, xmm_vv_fn vv_fn) { // TODO(benvanik): table lookup. This linear scan is slow. - if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_F32, SIG_TYPE_F32) || - i->Match(SIG_TYPE_IGNORE, SIG_TYPE_F64, SIG_TYPE_F64) || - i->Match(SIG_TYPE_IGNORE, SIG_TYPE_V128, SIG_TYPE_V128)) { + if (!i->src1.value->IsConstant() && !i->src2.value->IsConstant()) { Xmm dest, src1, src2; XmmBinaryOpVV(e, i, vv_fn, dest, src1, src2); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_F32, SIG_TYPE_F32C) || - i->Match(SIG_TYPE_IGNORE, SIG_TYPE_F64, SIG_TYPE_F64C) || - i->Match(SIG_TYPE_IGNORE, SIG_TYPE_V128, SIG_TYPE_V128C)) { + } else if (!i->src1.value->IsConstant() && i->src2.value->IsConstant()) { Xmm dest, src1; XmmBinaryOpVC(e, i, vv_fn, dest, src1, i->src2.value); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_F32C, SIG_TYPE_F32) || - i->Match(SIG_TYPE_IGNORE, SIG_TYPE_F64C, SIG_TYPE_F64) || - i->Match(SIG_TYPE_IGNORE, SIG_TYPE_V128C, SIG_TYPE_V128)) { + } else if (i->src1.value->IsConstant() && !i->src2.value->IsConstant()) { Xmm dest, src2; XmmBinaryOpCV(e, i, vv_fn, dest, i->src1.value, src2); } else { diff --git a/src/alloy/backend/x64/lowering/tracers.cc b/src/alloy/backend/x64/lowering/tracers.cc index 1115f360d..d718f6def 100644 --- a/src/alloy/backend/x64/lowering/tracers.cc +++ b/src/alloy/backend/x64/lowering/tracers.cc @@ -77,9 +77,10 @@ void TraceContextLoadF64(void* raw_context, uint64_t offset, double value) { } void TraceContextLoadV128(void* raw_context, uint64_t offset, __m128 value) { auto thread_state = *((ThreadState**)raw_context); - DPRINT("[%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X] = ctx v128 +%d\n", offset, + DPRINT("[%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X] = ctx v128 +%d\n", value.m128_f32[0], value.m128_f32[1], value.m128_f32[2], value.m128_f32[3], - value.m128_i32[0], value.m128_i32[1], value.m128_i32[2], value.m128_i32[3]); + value.m128_i32[0], value.m128_i32[1], value.m128_i32[2], value.m128_i32[3], + offset); } void TraceContextStoreI8(void* raw_context, uint64_t offset, uint8_t value) { diff --git a/src/alloy/compiler/passes/value_reduction_pass.cc b/src/alloy/compiler/passes/value_reduction_pass.cc index 78367f35a..42984e891 100644 --- 
a/src/alloy/compiler/passes/value_reduction_pass.cc +++ b/src/alloy/compiler/passes/value_reduction_pass.cc @@ -74,34 +74,40 @@ int ValueReductionPass::Run(HIRBuilder* builder) { OpcodeSignatureType src1_type = GET_OPCODE_SIG_TYPE_SRC1(info->signature); OpcodeSignatureType src2_type = GET_OPCODE_SIG_TYPE_SRC2(info->signature); OpcodeSignatureType src3_type = GET_OPCODE_SIG_TYPE_SRC3(info->signature); - if (src1_type == OPCODE_SIG_TYPE_V && !instr->src1.value->IsConstant()) { + if (src1_type == OPCODE_SIG_TYPE_V) { auto v = instr->src1.value; if (!v->last_use) { ComputeLastUse(v); } if (v->last_use == instr) { // Available. - ordinals.set(v->ordinal, false); + if (!instr->src1.value->IsConstant()) { + ordinals.set(v->ordinal, false); + } } } - if (src2_type == OPCODE_SIG_TYPE_V && !instr->src2.value->IsConstant()) { + if (src2_type == OPCODE_SIG_TYPE_V) { auto v = instr->src2.value; if (!v->last_use) { ComputeLastUse(v); } if (v->last_use == instr) { // Available. - ordinals.set(v->ordinal, false); + if (!instr->src2.value->IsConstant()) { + ordinals.set(v->ordinal, false); + } } } - if (src3_type == OPCODE_SIG_TYPE_V && !instr->src3.value->IsConstant()) { + if (src3_type == OPCODE_SIG_TYPE_V) { auto v = instr->src3.value; if (!v->last_use) { ComputeLastUse(v); } if (v->last_use == instr) { // Available. - ordinals.set(v->ordinal, false); + if (!instr->src3.value->IsConstant()) { + ordinals.set(v->ordinal, false); + } } } if (dest_type == OPCODE_SIG_TYPE_V) { From 0ff1fe93af4f80624e2540d6c82e131110c214e3 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 28 Jan 2014 10:46:40 -0800 Subject: [PATCH 035/184] Disabling logging. --- src/alloy/backend/ivm/ivm_intcode.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/alloy/backend/ivm/ivm_intcode.cc b/src/alloy/backend/ivm/ivm_intcode.cc index 873a083b0..457a0f455 100644 --- a/src/alloy/backend/ivm/ivm_intcode.cc +++ b/src/alloy/backend/ivm/ivm_intcode.cc @@ -40,10 +40,10 @@ namespace ivm { #define DPRINT #define DFLUSH() -#define IPRINT if (ics.thread_state->thread_id() == 1) printf -#define IFLUSH() fflush(stdout) -#define DPRINT if (ics.thread_state->thread_id() == 1) printf -#define DFLUSH() fflush(stdout) +//#define IPRINT if (ics.thread_state->thread_id() == 1) printf +//#define IFLUSH() fflush(stdout) +//#define DPRINT if (ics.thread_state->thread_id() == 1) printf +//#define DFLUSH() fflush(stdout) #if XE_CPU_BIGENDIAN #define VECB16(v,n) (v.b16[n]) From ac4360913fe22f65f24a56e8223357873886c242 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 28 Jan 2014 13:56:30 -0800 Subject: [PATCH 036/184] Fixing typo in logging. 
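The last argument of the store.v128 trace passed the float view of lane 3
(VECF4) to a %X slot, which wants the integer view (VECI4): a float vararg
is promoted to double and misreads the slot. The two views of one lane, for
reference:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    int main() {
      float lane = -1.0f;
      uint32_t bits;
      std::memcpy(&bits, &lane, sizeof(bits));
      std::printf("%e (%.8X)\n", lane, bits);  // -1.000000e+00 (BF800000)
    }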
--- src/alloy/backend/ivm/ivm_intcode.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/alloy/backend/ivm/ivm_intcode.cc b/src/alloy/backend/ivm/ivm_intcode.cc index 457a0f455..756db9203 100644 --- a/src/alloy/backend/ivm/ivm_intcode.cc +++ b/src/alloy/backend/ivm/ivm_intcode.cc @@ -1420,7 +1420,7 @@ uint32_t IntCode_STORE_CONTEXT_V128(IntCodeState& ics, const IntCode* i) { *((vec128_t*)(ics.context + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].v128; DPRINT("ctx v128 +%d = [%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X]\n", ics.rf[i->src1_reg].u64, VECF4(ics.rf[i->src2_reg].v128,0), VECF4(ics.rf[i->src2_reg].v128,1), VECF4(ics.rf[i->src2_reg].v128,2), VECF4(ics.rf[i->src2_reg].v128,3), - VECI4(ics.rf[i->src2_reg].v128,0), VECI4(ics.rf[i->src2_reg].v128,1), VECI4(ics.rf[i->src2_reg].v128,2), VECF4(ics.rf[i->src2_reg].v128,3)); + VECI4(ics.rf[i->src2_reg].v128,0), VECI4(ics.rf[i->src2_reg].v128,1), VECI4(ics.rf[i->src2_reg].v128,2), VECI4(ics.rf[i->src2_reg].v128,3)); return IA_NEXT; } int Translate_STORE_CONTEXT(TranslationContext& ctx, Instr* i) { From e5cf47a0d8384990dfa2a64f61ee381445d40f5e Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 28 Jan 2014 20:33:13 -0800 Subject: [PATCH 037/184] More SSE work. --- .../x64/lowering/lowering_sequences.cc | 418 ++++++++++++++---- src/alloy/backend/x64/lowering/op_utils.inl | 114 ++++- src/alloy/backend/x64/lowering/tracers.cc | 8 +- src/alloy/core.h | 10 + 4 files changed, 463 insertions(+), 87 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index a065ecf66..461ef62d5 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -35,6 +35,39 @@ namespace { #define SHUFPS_SWAP_DWORDS 0x1B +enum XmmConst { + XMMZero = 0, + XMMOne = 1, + XMMNegativeOne = 2, + XMMMaskX16Y16 = 3, + XMMFlipX16Y16 = 4, + XMMFixX16Y16 = 5, + XMMNormalizeX16Y16 = 6, + XMM3301 = 7, + XMMSignMaskPS = 8, + XMMSignMaskPD = 9, + XMMByteSwapMask = 10, +}; +static const vec128_t xmm_consts[] = { + /* XMMZero */ vec128f(0.0f, 0.0f, 0.0f, 0.0f), + /* XMMOne */ vec128f(1.0f, 1.0f, 1.0f, 1.0f), + /* XMMNegativeOne */ vec128f(-1.0f, -1.0f, -1.0f, -1.0f), + /* XMMMaskX16Y16 */ vec128i(0x0000FFFF, 0xFFFF0000, 0x00000000, 0x00000000), + /* XMMFlipX16Y16 */ vec128i(0x00008000, 0x00000000, 0x00000000, 0x00000000), + /* XMMFixX16Y16 */ vec128f(-32768.0f, 0.0f, 0.0f, 0.0f), + /* XMMNormalizeX16Y16 */ vec128f(1.0f / 32767.0f, 1.0f / (32767.0f * 65536.0f), 0.0f, 0.0f), + /* XMM3301 */ vec128f(3.0f, 3.0f, 0.0f, 1.0f), + /* XMMSignMaskPS */ vec128i(0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u), + /* XMMSignMaskPD */ vec128i(0x80000000u, 0x00000000u, 0x80000000u, 0x00000000u), + /* XMMByteSwapMask */ vec128i(0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu), +}; +// Use consts by first loading the base register then accessing memory: +// e.mov(e.rax, XMMCONSTBASE) +// e.andps(reg, XMMCONST(XMM3303)) +// TODO(benvanik): find a way to do this without the base register. +#define XMMCONSTBASE (uint64_t)&xmm_consts[0] +#define XMMCONST(base_reg, name) e.ptr[base_reg + name * 16] + // A note about vectors: // Alloy represents vectors as xyzw pairs, with indices 0123. // XMM registers are xyzw pairs with indices 3210, making them more like wzyx. 
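// (with the two-argument macro above, usage would be, e.g.:
//    e.mov(e.rax, XMMCONSTBASE);
//    e.andps(reg, XMMCONST(e.rax, XMM3301));
//  the one-argument XMMCONST(XMM3303) form in the usage comment looks
//  stale: the macro takes a base register and the table has no XMM3303)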
@@ -339,10 +372,50 @@ table->AddSequence(OPCODE_ASSIGN, [](X64Emitter& e, Instr*& i) { }); table->AddSequence(OPCODE_CAST, [](X64Emitter& e, Instr*& i) { - if (IsIntType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); - } else if (IsFloatType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + if (i->dest->type == INT32_TYPE) { + if (i->src1.value->type == FLOAT32_TYPE) { + Reg32 dest; + Xmm src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + e.pextrd(dest, src, 0); + e.EndOp(dest, src); + } else { + UNIMPLEMENTED_SEQ(); + } + } else if (i->dest->type == INT64_TYPE) { + if (i->src1.value->type == FLOAT64_TYPE) { + Reg64 dest; + Xmm src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + e.pextrq(dest, src, 0); + e.EndOp(dest, src); + } else { + UNIMPLEMENTED_SEQ(); + } + } else if (i->dest->type == FLOAT32_TYPE) { + if (i->src1.value->type == INT32_TYPE) { + Xmm dest; + Reg32 src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + e.pinsrd(dest, src, 0); + e.EndOp(dest, src); + } else { + UNIMPLEMENTED_SEQ(); + } + } else if (i->dest->type == FLOAT64_TYPE) { + if (i->src1.value->type == INT64_TYPE) { + Xmm dest; + Reg64 src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + e.pinsrq(dest, src, 0); + e.EndOp(dest, src); + } else { + UNIMPLEMENTED_SEQ(); + } } else if (IsVecType(i->dest->type)) { UNIMPLEMENTED_SEQ(); } else { @@ -625,15 +698,32 @@ table->AddSequence(OPCODE_ROUND, [](X64Emitter& e, Instr*& i) { }); table->AddSequence(OPCODE_VECTOR_CONVERT_I2F, [](X64Emitter& e, Instr*& i) { - // flags = ARITHMETIC_SATURATE | ARITHMETIC_UNSIGNED - UNIMPLEMENTED_SEQ(); + // flags = ARITHMETIC_UNSIGNED + XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) { + // TODO(benvanik): are these really the same? VC++ thinks so. + if (i.flags & ARITHMETIC_UNSIGNED) { + e.cvtdq2ps(dest, src); + } else { + e.cvtdq2ps(dest, src); + } + }); i = e.Advance(i); return true; }); table->AddSequence(OPCODE_VECTOR_CONVERT_F2I, [](X64Emitter& e, Instr*& i) { // flags = ARITHMETIC_SATURATE | ARITHMETIC_UNSIGNED - UNIMPLEMENTED_SEQ(); + XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) { + // TODO(benvanik): are these really the same? VC++ thinks so. + if (i.flags & ARITHMETIC_UNSIGNED) { + e.cvttps2dq(dest, src); + } else { + e.cvttps2dq(dest, src); + } + if (i.flags & ARITHMETIC_SATURATE) { + UNIMPLEMENTED_SEQ(); + } + }); i = e.Advance(i); return true; }); @@ -991,7 +1081,6 @@ table->AddSequence(OPCODE_LOAD, [](X64Emitter& e, Instr*& i) { // TODO(benvanik): we should try to stick to movaps if possible. e.movups(dest, e.ptr[addr]); e.EndOp(dest); - e.db(0xCC); #if DTRACE e.lea(e.rdx, e.ptr[addr]); e.lea(e.r8, Stash(e, dest)); @@ -1168,7 +1257,6 @@ table->AddSequence(OPCODE_STORE, [](X64Emitter& e, Instr*& i) { // TODO(benvanik): we should try to stick to movaps if possible. 
e.movups(e.ptr[addr], src); e.EndOp(src); - e.db(0xCC); #if DTRACE e.lea(e.rdx, e.ptr[addr]); e.lea(e.r8, Stash(e, src)); @@ -1208,9 +1296,17 @@ table->AddSequence(OPCODE_MAX, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { UNIMPLEMENTED_SEQ(); } else if (IsFloatType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { + if (i.src1.value->type == FLOAT32_TYPE) { + e.maxss(dest_src, src); + } else { + e.maxsd(dest_src, src); + } + }); } else if (IsVecType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { + e.maxps(dest_src, src); + }); } else { ASSERT_INVALID_TYPE(); } @@ -1222,9 +1318,17 @@ table->AddSequence(OPCODE_MIN, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { UNIMPLEMENTED_SEQ(); } else if (IsFloatType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { + if (i.src1.value->type == FLOAT32_TYPE) { + e.minss(dest_src, src); + } else { + e.minsd(dest_src, src); + } + }); } else if (IsVecType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { + e.minps(dest_src, src); + }); } else { ASSERT_INVALID_TYPE(); } @@ -1233,12 +1337,22 @@ table->AddSequence(OPCODE_MIN, [](X64Emitter& e, Instr*& i) { }); table->AddSequence(OPCODE_SELECT, [](X64Emitter& e, Instr*& i) { + CheckBoolean(e, i->src1.value); if (IsIntType(i->dest->type)) { UNIMPLEMENTED_SEQ(); - } else if (IsFloatType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); - } else if (IsVecType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + } else if (IsFloatType(i->dest->type) || IsVecType(i->dest->type)) { + Xmm dest, src2, src3; + e.BeginOp(i->dest, dest, REG_DEST, + i->src2.value, src2, 0, + i->src3.value, src3, 0); + // TODO(benvanik): find a way to do this without branches. 
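// (a branchless form could materialize the condition as a lane mask and
//  blend src2/src3, per the TODO above; this version keeps the short jz
//  over a movaps instead)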
+ e.inLocalLabel(); + e.movaps(dest, src3); + e.jz(".skip"); + e.movaps(dest, src2); + e.L(".skip"); + e.outLocalLabel(); + e.EndOp(dest, src2, src3); } else { ASSERT_INVALID_TYPE(); } @@ -1707,9 +1821,17 @@ table->AddSequence(OPCODE_MUL_ADD, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { UNIMPLEMENTED_SEQ(); } else if (IsFloatType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmTernaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src2, const Xmm& src3) { + if (i.dest->type == FLOAT32_TYPE) { + e.vfmadd132ss(dest_src, src3, src2); + } else { + e.vfmadd132sd(dest_src, src3, src2); + } + }); } else if (IsVecType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmTernaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src2, const Xmm& src3) { + e.vfmadd132ps(dest_src, src3, src2); + }); } else { ASSERT_INVALID_TYPE(); } @@ -1721,9 +1843,17 @@ table->AddSequence(OPCODE_MUL_SUB, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { UNIMPLEMENTED_SEQ(); } else if (IsFloatType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmTernaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src2, const Xmm& src3) { + if (i.dest->type == FLOAT32_TYPE) { + e.vfmsub132ss(dest_src, src3, src2); + } else { + e.vfmsub132sd(dest_src, src3, src2); + } + }); } else if (IsVecType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + XmmTernaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src2, const Xmm& src3) { + e.vfmsub132ps(dest_src, src3, src2); + }); } else { ASSERT_INVALID_TYPE(); } @@ -1739,14 +1869,17 @@ table->AddSequence(OPCODE_NEG, [](X64Emitter& e, Instr*& i) { } else if (IsFloatType(i->dest->type)) { XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) { if (i.src1.value->type == FLOAT32_TYPE) { - UNIMPLEMENTED_SEQ(); + e.mov(e.rax, XMMCONSTBASE); + e.vpxor(dest, src, XMMCONST(e.rax, XMMSignMaskPS)); } else { - UNIMPLEMENTED_SEQ(); + e.mov(e.rax, XMMCONSTBASE); + e.vpxor(dest, src, XMMCONST(e.rax, XMMSignMaskPD)); } }); } else if (IsVecType(i->dest->type)) { XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) { - UNIMPLEMENTED_SEQ(); + e.mov(e.rax, XMMCONSTBASE); + e.vpxor(dest, src, XMMCONST(e.rax, XMMSignMaskPS)); }); } else { ASSERT_INVALID_TYPE(); @@ -1761,14 +1894,20 @@ table->AddSequence(OPCODE_ABS, [](X64Emitter& e, Instr*& i) { } else if (IsFloatType(i->dest->type)) { XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) { if (i.src1.value->type == FLOAT32_TYPE) { - UNIMPLEMENTED_SEQ(); + e.mov(e.rax, XMMCONSTBASE); + e.movaps(e.xmm0, XMMCONST(e.rax, XMMSignMaskPS)); + e.vpandn(dest, e.xmm0, src); } else { - UNIMPLEMENTED_SEQ(); + e.mov(e.rax, XMMCONSTBASE); + e.movaps(e.xmm0, XMMCONST(e.rax, XMMSignMaskPD));; + e.vpandn(dest, e.xmm0, src); } }); } else if (IsVecType(i->dest->type)) { XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) { - UNIMPLEMENTED_SEQ(); + e.mov(e.rax, XMMCONSTBASE); + e.movaps(e.xmm0, XMMCONST(e.rax, XMMSignMaskPS));; + e.vpandn(dest, e.xmm0, src); }); } else { ASSERT_INVALID_TYPE(); @@ -1848,7 +1987,6 @@ table->AddSequence(OPCODE_DOT_PRODUCT_3, [](X64Emitter& e, Instr*& i) { XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { // http://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx // TODO(benvanik): verify ordering - e.db(0xCC); e.dpps(dest_src, src, 
B01110001); }); } else { @@ -1863,7 +2001,6 @@ table->AddSequence(OPCODE_DOT_PRODUCT_4, [](X64Emitter& e, Instr*& i) { XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { // http://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx // TODO(benvanik): verify ordering - e.db(0xCC); e.dpps(dest_src, src, B11110001); }); } else { @@ -2020,7 +2157,16 @@ table->AddSequence(OPCODE_VECTOR_SHL, [](X64Emitter& e, Instr*& i) { } else if (i->flags == INT16_TYPE) { UNIMPLEMENTED_SEQ(); } else if (i->flags == INT32_TYPE) { - UNIMPLEMENTED_SEQ(); + XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { + // src shift mask may have values >31, and x86 sets to zero when + // that happens so we mask. + e.db(0xCC); + e.mov(e.eax, 0x1F); + e.vmovd(e.xmm0, e.eax); + e.vpbroadcastd(e.xmm0, e.xmm0); + e.vandps(e.xmm0, src, e.xmm0); + e.vpsllvd(dest_src, dest_src, e.xmm0); + }); } else { ASSERT_INVALID_TYPE(); } @@ -2038,7 +2184,15 @@ table->AddSequence(OPCODE_VECTOR_SHR, [](X64Emitter& e, Instr*& i) { } else if (i->flags == INT16_TYPE) { UNIMPLEMENTED_SEQ(); } else if (i->flags == INT32_TYPE) { - UNIMPLEMENTED_SEQ(); + XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { + // src shift mask may have values >31, and x86 sets to zero when + // that happens so we mask. + e.mov(e.eax, 0x1F); + e.vmovd(e.xmm0, e.eax); + e.vpbroadcastd(e.xmm0, e.xmm0); + e.vandps(e.xmm0, src, e.xmm0); + e.vpsrlvd(dest_src, dest_src, src); + }); } else { ASSERT_INVALID_TYPE(); } @@ -2056,7 +2210,15 @@ table->AddSequence(OPCODE_VECTOR_SHA, [](X64Emitter& e, Instr*& i) { } else if (i->flags == INT16_TYPE) { UNIMPLEMENTED_SEQ(); } else if (i->flags == INT32_TYPE) { - UNIMPLEMENTED_SEQ(); + XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { + // src shift mask may have values >31, and x86 sets to zero when + // that happens so we mask. 
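// (guest semantics use only the low 5 bits of each per-lane shift
//  amount, while x86's variable vector shifts treat counts above 31 as
//  shifting everything out, hence the broadcast 0x1F mask below)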
+ e.mov(e.eax, 0x1F); + e.vmovd(e.xmm0, e.eax); + e.vpbroadcastd(e.xmm0, e.xmm0); + e.vandps(e.xmm0, src, e.xmm0); + e.vpsravd(dest_src, dest_src, src); + }); } else { ASSERT_INVALID_TYPE(); } @@ -2088,7 +2250,7 @@ table->AddSequence(OPCODE_ROTATE_LEFT, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_BYTE_SWAP, [](X64Emitter& e, Instr*& i) { if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16)) { - Reg16 d, s1; + Reg16 dest, src1; // TODO(benvanik): fix register allocator to put the value in ABCD //e.BeginOp(i->dest, d, REG_DEST | REG_ABCD, // i->src1.value, s1, 0); @@ -2098,45 +2260,42 @@ table->AddSequence(OPCODE_BYTE_SWAP, [](X64Emitter& e, Instr*& i) { //} else { // e.xchg(d.cvt8(), Reg8(d.getIdx() + 4)); //} - e.BeginOp(i->dest, d, REG_DEST, - i->src1.value, s1, 0); - e.mov(e.ax, s1); + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0); + e.mov(e.ax, src1); e.xchg(e.ah, e.al); - e.mov(d, e.ax); - e.EndOp(d, s1); + e.mov(dest, e.ax); + e.EndOp(dest, src1); } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32)) { - Reg32 d, s1; - e.BeginOp(i->dest, d, REG_DEST, - i->src1.value, s1, 0); - if (d != s1) { - e.mov(d, s1); - e.bswap(d); + Reg32 dest, src1; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0); + if (dest != src1) { + e.mov(dest, src1); + e.bswap(dest); } else { - e.bswap(d); + e.bswap(dest); } - e.EndOp(d, s1); + e.EndOp(dest, src1); } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64)) { - Reg64 d, s1; - e.BeginOp(i->dest, d, REG_DEST, - i->src1.value, s1, 0); - if (d != s1) { - e.mov(d, s1); - e.bswap(d); + Reg64 dest, src1; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0); + if (dest != src1) { + e.mov(dest, src1); + e.bswap(dest); } else { - e.bswap(d); + e.bswap(dest); } - e.EndOp(d, s1); + e.EndOp(dest, src1); } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_V128)) { - Xmm d, s1; - e.db(0xCC); - e.BeginOp(i->dest, d, REG_DEST, - i->src1.value, s1, 0); - if (d != s1) { - e.shufps(d, s1, SHUFPS_SWAP_DWORDS); - } else { - e.shufps(d, d, SHUFPS_SWAP_DWORDS); - } - e.EndOp(d, s1); + Xmm dest, src1; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0); + // TODO(benvanik): find a way to do this without the memory load. + e.mov(e.rax, XMMCONSTBASE); + e.vpshufb(dest, src1, XMMCONST(e.rax, XMMByteSwapMask)); + e.EndOp(dest, src1); } else { ASSERT_INVALID_TYPE(); } @@ -2278,36 +2437,67 @@ table->AddSequence(OPCODE_EXTRACT, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_SPLAT, [](X64Emitter& e, Instr*& i) { if (IsVecType(i->dest->type)) { - if (i->src1.value->type == INT8_TYPE) { + if (i->Match(SIG_TYPE_V128, SIG_TYPE_I8)) { Xmm dest; Reg8 src; e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src, 0); - e.pinsrb(e.xmm0, src, 0); + e.vmovd(e.xmm0, src.cvt32()); e.vpbroadcastb(dest, e.xmm0); e.EndOp(dest, src); - } else if (i->src1.value->type == INT16_TYPE) { + } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_I8C)) { + Xmm dest; + e.BeginOp(i->dest, dest, REG_DEST); + // TODO(benvanik): faster constant splats. 
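// (the "faster constant splats" TODO presumably means loading a
//  prebuilt 16-byte constant, rather than mov + vmovd + vpbroadcastb)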
+ e.mov(e.eax, i->src1.value->constant.i8); + e.vmovd(e.xmm0, e.eax); + e.vpbroadcastb(dest, e.xmm0); + e.EndOp(dest); + } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_I16)) { Xmm dest; Reg16 src; e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src, 0); - e.pinsrw(e.xmm0, src, 0); + e.vmovd(e.xmm0, src.cvt32()); e.vpbroadcastw(dest, e.xmm0); e.EndOp(dest, src); - } else if (i->src1.value->type == INT32_TYPE) { + } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_I16C)) { + Xmm dest; + e.BeginOp(i->dest, dest, REG_DEST); + // TODO(benvanik): faster constant splats. + e.mov(e.eax, i->src1.value->constant.i16); + e.vmovd(e.xmm0, e.eax); + e.vpbroadcastw(dest, e.xmm0); + e.EndOp(dest); + } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_I32)) { Xmm dest; Reg32 src; e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src, 0); - e.pinsrd(e.xmm0, src, 0); + e.vmovd(e.xmm0, src); e.vpbroadcastd(dest, e.xmm0); e.EndOp(dest, src); - } else if (i->src1.value->type == FLOAT32_TYPE) { + } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_I32C)) { + Xmm dest; + e.BeginOp(i->dest, dest, REG_DEST); + // TODO(benvanik): faster constant splats. + e.mov(e.eax, i->src1.value->constant.i32); + e.vmovd(e.xmm0, e.eax); + e.vpbroadcastd(dest, e.xmm0); + e.EndOp(dest); + } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_F32)) { Xmm dest, src; e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src, 0); e.vbroadcastss(dest, src); e.EndOp(dest, src); + } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_F32C)) { + Xmm dest; + e.BeginOp(i->dest, dest, REG_DEST); + e.mov(e.eax, i->src1.value->constant.i32); + e.vmovd(e.xmm0, e.eax); + e.vbroadcastss(dest, e.xmm0); + e.EndOp(dest); } else { ASSERT_INVALID_TYPE(); } @@ -2321,9 +2511,57 @@ table->AddSequence(OPCODE_SPLAT, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_PERMUTE, [](X64Emitter& e, Instr*& i) { if (IsVecType(i->dest->type)) { if (i->src1.value->type == INT32_TYPE) { - UNIMPLEMENTED_SEQ(); + // Permute words between src2 and src3. + // TODO(benvanik): check src3 for zero. if 0, we can use pshufb. + if (i->src1.value->IsConstant()) { + uint32_t control = i->src1.value->AsUint32(); + Xmm dest, src2, src3; + e.BeginOp(i->dest, dest, REG_DEST, + i->src2.value, src2, 0, + i->src3.value, src3, 0); + // Shuffle things into the right places in dest & xmm0, + // then we blend them together. + uint32_t src_control = + (((control >> 24) & 0x3) << 0) | + (((control >> 16) & 0x3) << 2) | + (((control >> 8) & 0x3) << 4) | + (((control >> 0) & 0x3) << 6); + uint32_t blend_control = + (((control >> 26) & 0x1) << 0) | + (((control >> 18) & 0x1) << 1) | + (((control >> 10) & 0x1) << 2) | + (((control >> 2) & 0x1) << 3); + if (dest != src3) { + e.pshufd(dest, src2, src_control); + e.pshufd(e.xmm0, src3, src_control); + e.blendps(dest, e.xmm0, blend_control); + } else { + e.movaps(e.xmm0, src3); + e.pshufd(dest, src2, src_control); + e.pshufd(e.xmm0, e.xmm0, src_control); + e.blendps(dest, e.xmm0, blend_control); + } + e.EndOp(dest, src2, src3); + } else { + Reg32 control; + Xmm dest, src2, src3; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, control, 0, + i->src2.value, src2, 0, + i->src3.value, src3, 0); + UNIMPLEMENTED_SEQ(); + e.EndOp(dest, control, src2, src3); + } } else if (i->src1.value->type == VEC128_TYPE) { + // Permute bytes between src2 and src3. + // TODO(benvanik): check src3 for zero. if 0, we can use pshufb. 
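+      // One plausible lowering (sketch only): vperm picks bytes 0-15 from
+      // the first source and 16-31 from the second, so with the control
+      // masked to 0x1F each result byte could come from two vpshufb ops
+      // (control & 0x0F) blended with vpblendvb keyed off bit 4 of the
+      // control (modulo the guest's big-endian byte numbering).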
+ Xmm dest, control, src2, src3; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, control, 0, + i->src2.value, src2, 0, + i->src3.value, src3, 0); UNIMPLEMENTED_SEQ(); + e.EndOp(dest, control, src2, src3); } else { ASSERT_INVALID_TYPE(); } @@ -2339,7 +2577,11 @@ table->AddSequence(OPCODE_SWIZZLE, [](X64Emitter& e, Instr*& i) { // Defined by SWIZZLE_MASK() if (i->flags == INT32_TYPE || i->flags == FLOAT32_TYPE) { uint8_t swizzle_mask = (uint8_t)i->src2.offset; - e.db(0xCC); + swizzle_mask = + (((swizzle_mask >> 6) & 0x3) << 0) | + (((swizzle_mask >> 4) & 0x3) << 2) | + (((swizzle_mask >> 2) & 0x3) << 4) | + (((swizzle_mask >> 0) & 0x3) << 6); Xmm dest, src1; e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src1, 0); @@ -2392,7 +2634,7 @@ table->AddSequence(OPCODE_UNPACK, [](X64Emitter& e, Instr*& i) { // Load source, move from tight pack of X16Y16.... to X16...Y16... // Also zero out the high end. // TODO(benvanik): special case constant unpacks that just get 0/1/etc. - XmmUnaryOp(e, i, 0, [](X64Emitter& e, Instr& i, const Xmm&, const Xmm& src) { + XmmUnaryOp(e, i, 0, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) { // sx = src.iw >> 16; // sy = src.iw & 0xFFFF; // dest = { 3.0 + (sx / float(1 << 22)), @@ -2410,11 +2652,31 @@ table->AddSequence(OPCODE_UNPACK, [](X64Emitter& e, Instr*& i) { // Could be shared with FLOAT16_2. UNIMPLEMENTED_SEQ(); } else if (i->flags == PACK_TYPE_SHORT_2) { - // (VD.x) = 3.0 + (VB.x)*2^-22 - // (VD.y) = 3.0 + (VB.y)*2^-22 + // (VD.x) = 3.0 + (VB.x>>16)*2^-22 + // (VD.y) = 3.0 + (VB.x)*2^-22 // (VD.z) = 0.0 - // (VD.w) = 3.0 - UNIMPLEMENTED_SEQ(); + // (VD.w) = 1.0 + XmmUnaryOp(e, i, 0, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) { + // XMLoadShortN2 plus 3,3,0,3 (for some reason) + // src is (xx,xx,xx,VALUE) + e.mov(e.rax, XMMCONSTBASE); + // (VALUE,VALUE,VALUE,VALUE) + e.vbroadcastss(dest, src); + // (VALUE&0xFFFF,VALUE&0xFFFF0000,0,0) + e.andps(dest, XMMCONST(e.rax, XMMMaskX16Y16)); + // Sign extend. + e.xorps(dest, XMMCONST(e.rax, XMMFlipX16Y16)); + // Convert int->float. + e.cvtpi2ps(dest, Stash(e, dest)); + // 0x8000 to undo sign. + e.addps(dest, XMMCONST(e.rax, XMMFixX16Y16)); + // Normalize. + e.mulps(dest, XMMCONST(e.rax, XMMNormalizeX16Y16)); + // Clamp. + e.maxps(dest, XMMCONST(e.rax, XMMNegativeOne)); + // Add 3,3,0,1. + e.addps(dest, XMMCONST(e.rax, XMM3301)); + }); } else if (i->flags == PACK_TYPE_S8_IN_16_LO) { UNIMPLEMENTED_SEQ(); } else if (i->flags == PACK_TYPE_S8_IN_16_HI) { diff --git a/src/alloy/backend/x64/lowering/op_utils.inl b/src/alloy/backend/x64/lowering/op_utils.inl index 3f1c73f90..17c947ef8 100644 --- a/src/alloy/backend/x64/lowering/op_utils.inl +++ b/src/alloy/backend/x64/lowering/op_utils.inl @@ -18,8 +18,9 @@ namespace { #define NAX_LIKE(like) Reg(e.rax.getIdx(), e.rax.getKind(), like.getBit(), false) Address Stash(X64Emitter& e, const Xmm& r) { - auto addr = e.ptr[e.rsp + 40]; - e.movaps(addr, r); + // TODO(benvanik): ensure aligned. + auto addr = e.ptr[e.rsp + 48]; + e.movups(addr, r); return addr; } @@ -65,11 +66,22 @@ void CheckBoolean(X64Emitter& e, Value* v) { e.test(src, src); e.EndOp(src); } else if (v->type == FLOAT32_TYPE) { - UNIMPLEMENTED_SEQ(); + // TODO(benvanik): mask? + Xmm src; + e.BeginOp(v, src, 0); + e.ptest(src, src); + e.EndOp(src); } else if (v->type == FLOAT64_TYPE) { - UNIMPLEMENTED_SEQ(); + // TODO(benvanik): mask? 
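+    // (ptest is a bitwise test - ZF is set only when every bit is zero -
+    // so a negative zero, with its sign bit set, still reads as "true"
+    // here; masking the sign bit off first would be needed for strict
+    // IEEE zero semantics, which is presumably what the TODO is about.)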
+ Xmm src; + e.BeginOp(v, src, 0); + e.ptest(src, src); + e.EndOp(src); } else if (v->type == VEC128_TYPE) { - UNIMPLEMENTED_SEQ(); + Xmm src; + e.BeginOp(v, src, 0); + e.ptest(src, src); + e.EndOp(src); } else { ASSERT_INVALID_TYPE(); } @@ -180,6 +192,52 @@ void CompareXX(X64Emitter& e, Instr*& i, void(set_fn)(X64Emitter& e, Reg8& dest, e.cmp(src2, e.rax); set_fn(e, dest, true); e.EndOp(dest, src2); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_F32, SIG_TYPE_F32)) { + Reg8 dest; + Xmm src1, src2; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0, + i->src2.value, src2, 0); + e.comiss(src1, src2); + set_fn(e, dest, false); + e.EndOp(dest, src1, src2); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_F32, SIG_TYPE_F32C)) { + Reg8 dest; + Xmm src1; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0); + if (i->src2.value->IsConstantZero()) { + e.pxor(e.xmm0, e.xmm0); + } else { + e.mov(e.eax, (uint32_t)i->src2.value->constant.i32); + e.pinsrd(e.xmm0, e.eax, 0); + } + e.comiss(src1, e.xmm0); + set_fn(e, dest, false); + e.EndOp(dest, src1); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_F64, SIG_TYPE_F64)) { + Reg8 dest; + Xmm src1, src2; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0, + i->src2.value, src2, 0); + e.comisd(src1, src2); + set_fn(e, dest, false); + e.EndOp(dest, src1, src2); + } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_F64, SIG_TYPE_F64C)) { + Reg8 dest; + Xmm src1; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0); + if (i->src2.value->IsConstantZero()) { + e.pxor(e.xmm0, e.xmm0); + } else { + e.mov(e.rax, (uint64_t)i->src2.value->constant.i64); + e.pinsrq(e.xmm0, e.rax, 0); + } + e.comisd(src1, e.xmm0); + set_fn(e, dest, false); + e.EndOp(dest, src1); } else { UNIMPLEMENTED_SEQ(); } @@ -894,6 +952,52 @@ void XmmBinaryOp(X64Emitter& e, Instr*& i, uint32_t flags, xmm_vv_fn vv_fn) { } }; +typedef void(xmm_vvv_fn)(X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src2, const Xmm& src3); +void XmmTernaryOpVVV(X64Emitter& e, Instr*& i, xmm_vvv_fn vvv_fn, + Xmm& dest, Xmm& src1, Xmm& src2, Xmm& src3) { + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0, + i->src2.value, src2, 0, + i->src3.value, src3, 0); + if (dest == src1) { + vvv_fn(e, *i, dest, src2, src3); + } else if (dest == src2) { + if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { + vvv_fn(e, *i, dest, src1, src3); + } else { + // Eww. + e.movaps(e.xmm0, src1); + vvv_fn(e, *i, e.xmm0, src2, src3); + e.movaps(dest, e.xmm0); + } + } else if (dest == src3) { + if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { + vvv_fn(e, *i, dest, src1, src2); + } else { + UNIMPLEMENTED_SEQ(); + } + } else { + e.movaps(dest, src1); + vvv_fn(e, *i, dest, src2, src3); + } + e.EndOp(dest, src1, src2, src3); +} +void XmmTernaryOp(X64Emitter& e, Instr*& i, uint32_t flags, xmm_vvv_fn vvv_fn) { + // TODO(benvanik): table lookup. This linear scan is slow. + if (!i->src1.value->IsConstant() && !i->src2.value->IsConstant() && + !i->src3.value->IsConstant()) { + Xmm dest, src1, src2, src3; + XmmTernaryOpVVV(e, i, vvv_fn, dest, src1, src2, src3); + } else { + ASSERT_INVALID_TYPE(); + } + if (flags & ARITHMETIC_SET_CARRY) { + // EFLAGS should have CA set? 
+ // (so long as we don't fuck with it) + // UNIMPLEMENTED_SEQ(); + } +}; + } // namespace #endif // ALLOY_BACKEND_X64_X64_LOWERING_OP_UTILS_INL_ diff --git a/src/alloy/backend/x64/lowering/tracers.cc b/src/alloy/backend/x64/lowering/tracers.cc index d718f6def..f53d70dc1 100644 --- a/src/alloy/backend/x64/lowering/tracers.cc +++ b/src/alloy/backend/x64/lowering/tracers.cc @@ -24,15 +24,15 @@ namespace x64 { namespace lowering { -#define IPRINT #define IFLUSH() -#define DPRINT +#define IPRINT #define DFLUSH() +#define DPRINT -#define IPRINT if (thread_state->thread_id() == 1) printf #define IFLUSH() fflush(stdout) -#define DPRINT if (thread_state->thread_id() == 1) printf +#define IPRINT if (thread_state->thread_id() == 1) printf #define DFLUSH() fflush(stdout) +#define DPRINT DFLUSH(); if (thread_state->thread_id() == 1) printf void TraceString(void* raw_context, const char* str) { diff --git a/src/alloy/core.h b/src/alloy/core.h index d61b3dd2c..cd7a32204 100644 --- a/src/alloy/core.h +++ b/src/alloy/core.h @@ -45,6 +45,16 @@ typedef struct XECACHEALIGN vec128_s { }; }; } vec128_t; +XEFORCEINLINE vec128_t vec128i(uint32_t x, uint32_t y, uint32_t z, uint32_t w) { + vec128_t v; + v.i4[0] = x; v.i4[1] = y; v.i4[2] = z; v.i4[3] = w; + return v; +} +XEFORCEINLINE vec128_t vec128f(float x, float y, float z, float w) { + vec128_t v; + v.f4[0] = x; v.f4[1] = y; v.f4[2] = z; v.f4[3] = w; + return v; +} } // namespace alloy From b2e9086932469a0207c037bc1c645ad59f0cf9f7 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 28 Jan 2014 20:39:44 -0800 Subject: [PATCH 038/184] Reserving volatile XMM registers (so long as we are calling tracers). --- src/alloy/backend/x64/x64_emitter.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index 5e5a9eecd..1e20c84f4 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -113,7 +113,10 @@ int X64Emitter::Emit(HIRBuilder* builder) { GetRegBit(r8) | GetRegBit(r9) | GetRegBit(r10) | - GetRegBit(r11); + GetRegBit(r11) | + GetRegBit(xmm1) | + GetRegBit(xmm2) | + GetRegBit(xmm3); // Function prolog. // Must be 16b aligned. From 8cddfcbf19fcb5830a32370f3cc30e6e283cd222 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 28 Jan 2014 22:06:45 -0800 Subject: [PATCH 039/184] More SSE work. --- src/alloy/backend/ivm/ivm_intcode.cc | 44 +++---- .../x64/lowering/lowering_sequences.cc | 118 ++++++++++-------- src/alloy/backend/x64/lowering/tracers.cc | 84 +++++-------- src/alloy/backend/x64/lowering/tracers.h | 16 +-- src/alloy/backend/x64/x64_emitter.cc | 4 +- 5 files changed, 132 insertions(+), 134 deletions(-) diff --git a/src/alloy/backend/ivm/ivm_intcode.cc b/src/alloy/backend/ivm/ivm_intcode.cc index 756db9203..f7e0cdbde 100644 --- a/src/alloy/backend/ivm/ivm_intcode.cc +++ b/src/alloy/backend/ivm/ivm_intcode.cc @@ -2093,19 +2093,19 @@ int Translate_DID_SATURATE(TranslationContext& ctx, Instr* i) { return DispatchToC(ctx, i, IntCode_DID_SATURATE); } -#define VECTOR_COMPARER(type, value, count, op) \ +#define VECTOR_COMPARER(type, value, dest_value, count, op) \ const vec128_t& src1 = ics.rf[i->src1_reg].v128; \ const vec128_t& src2 = ics.rf[i->src2_reg].v128; \ vec128_t& dest = ics.rf[i->dest_reg].v128; \ for (int n = 0; n < count; n++) { \ - dest.value[n] = ((type)src1.value[n] op (type)src2.value[n]) ? (type)0xFFFFFFFF : 0; \ + dest.dest_value[n] = ((type)src1.value[n] op (type)src2.value[n]) ? 
0xFFFFFFFF : 0; \ } \ return IA_NEXT; -uint32_t IntCode_VECTOR_COMPARE_EQ_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint8_t, b16, 16, ==) }; -uint32_t IntCode_VECTOR_COMPARE_EQ_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint16_t, s8, 8, ==) }; -uint32_t IntCode_VECTOR_COMPARE_EQ_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint32_t, i4, 4, ==) }; -uint32_t IntCode_VECTOR_COMPARE_EQ_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, 4, ==) }; +uint32_t IntCode_VECTOR_COMPARE_EQ_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint8_t, b16, b16, 16, ==) }; +uint32_t IntCode_VECTOR_COMPARE_EQ_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint16_t, s8, s8, 8, ==) }; +uint32_t IntCode_VECTOR_COMPARE_EQ_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint32_t, i4, i4, 4, ==) }; +uint32_t IntCode_VECTOR_COMPARE_EQ_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, i4, 4, ==) }; int Translate_VECTOR_COMPARE_EQ(TranslationContext& ctx, Instr* i) { static IntCodeFn fns[] = { IntCode_VECTOR_COMPARE_EQ_I8, @@ -2119,10 +2119,10 @@ int Translate_VECTOR_COMPARE_EQ(TranslationContext& ctx, Instr* i) { return DispatchToC(ctx, i, fns[i->flags]); } -uint32_t IntCode_VECTOR_COMPARE_SGT_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int8_t, b16, 16, >) }; -uint32_t IntCode_VECTOR_COMPARE_SGT_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int16_t, s8, 8, >) }; -uint32_t IntCode_VECTOR_COMPARE_SGT_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int32_t, i4, 4, >) }; -uint32_t IntCode_VECTOR_COMPARE_SGT_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, 4, >) }; +uint32_t IntCode_VECTOR_COMPARE_SGT_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int8_t, b16, b16, 16, >) }; +uint32_t IntCode_VECTOR_COMPARE_SGT_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int16_t, s8, s8, 8, >) }; +uint32_t IntCode_VECTOR_COMPARE_SGT_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int32_t, i4, i4, 4, >) }; +uint32_t IntCode_VECTOR_COMPARE_SGT_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, i4, 4, >) }; int Translate_VECTOR_COMPARE_SGT(TranslationContext& ctx, Instr* i) { static IntCodeFn fns[] = { IntCode_VECTOR_COMPARE_SGT_I8, @@ -2136,10 +2136,10 @@ int Translate_VECTOR_COMPARE_SGT(TranslationContext& ctx, Instr* i) { return DispatchToC(ctx, i, fns[i->flags]); } -uint32_t IntCode_VECTOR_COMPARE_SGE_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int8_t, b16, 16, >=) }; -uint32_t IntCode_VECTOR_COMPARE_SGE_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int16_t, s8, 8, >=) }; -uint32_t IntCode_VECTOR_COMPARE_SGE_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int32_t, i4, 4, >=) }; -uint32_t IntCode_VECTOR_COMPARE_SGE_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, 4, >=) }; +uint32_t IntCode_VECTOR_COMPARE_SGE_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int8_t, b16, b16, 16, >=) }; +uint32_t IntCode_VECTOR_COMPARE_SGE_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int16_t, s8, s8, 8, >=) }; +uint32_t IntCode_VECTOR_COMPARE_SGE_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int32_t, i4, i4, 4, >=) }; +uint32_t IntCode_VECTOR_COMPARE_SGE_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, i4, 4, >=) }; int Translate_VECTOR_COMPARE_SGE(TranslationContext& ctx, Instr* i) { static IntCodeFn fns[] = { 
IntCode_VECTOR_COMPARE_SGE_I8, @@ -2153,10 +2153,10 @@ int Translate_VECTOR_COMPARE_SGE(TranslationContext& ctx, Instr* i) { return DispatchToC(ctx, i, fns[i->flags]); } -uint32_t IntCode_VECTOR_COMPARE_UGT_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint8_t, b16, 16, >) }; -uint32_t IntCode_VECTOR_COMPARE_UGT_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint16_t, s8, 8, >) }; -uint32_t IntCode_VECTOR_COMPARE_UGT_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint32_t, i4, 4, >) }; -uint32_t IntCode_VECTOR_COMPARE_UGT_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, 4, >) }; +uint32_t IntCode_VECTOR_COMPARE_UGT_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint8_t, b16, b16, 16, >) }; +uint32_t IntCode_VECTOR_COMPARE_UGT_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint16_t, s8, s8, 8, >) }; +uint32_t IntCode_VECTOR_COMPARE_UGT_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint32_t, i4, i4, 4, >) }; +uint32_t IntCode_VECTOR_COMPARE_UGT_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, i4, 4, >) }; int Translate_VECTOR_COMPARE_UGT(TranslationContext& ctx, Instr* i) { static IntCodeFn fns[] = { IntCode_VECTOR_COMPARE_UGT_I8, @@ -2170,10 +2170,10 @@ int Translate_VECTOR_COMPARE_UGT(TranslationContext& ctx, Instr* i) { return DispatchToC(ctx, i, fns[i->flags]); } -uint32_t IntCode_VECTOR_COMPARE_UGE_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint8_t, b16, 16, >=) }; -uint32_t IntCode_VECTOR_COMPARE_UGE_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint16_t, s8, 8, >=) }; -uint32_t IntCode_VECTOR_COMPARE_UGE_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint32_t, i4, 4, >=) }; -uint32_t IntCode_VECTOR_COMPARE_UGE_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, 4, >=) }; +uint32_t IntCode_VECTOR_COMPARE_UGE_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint8_t, b16, b16, 16, >=) }; +uint32_t IntCode_VECTOR_COMPARE_UGE_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint16_t, s8, s8, 8, >=) }; +uint32_t IntCode_VECTOR_COMPARE_UGE_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint32_t, i4, i4, 4, >=) }; +uint32_t IntCode_VECTOR_COMPARE_UGE_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, i4, 4, >=) }; int Translate_VECTOR_COMPARE_UGE(TranslationContext& ctx, Instr* i) { static IntCodeFn fns[] = { IntCode_VECTOR_COMPARE_UGE_I8, diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index 461ef62d5..f22d34b7d 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -17,6 +17,9 @@ #include #include +// TODO(benvanik): reimplement packing functions +#include + using namespace alloy; using namespace alloy::backend::x64; using namespace alloy::backend::x64::lowering; @@ -87,6 +90,14 @@ void Dummy() { // } +void Unpack_FLOAT16_2(void* raw_context, __m128& v) { + uint32_t src = v.m128_i32[3]; + v.m128_f32[0] = DirectX::PackedVector::XMConvertHalfToFloat((uint16_t)src); + v.m128_f32[1] = DirectX::PackedVector::XMConvertHalfToFloat((uint16_t)(src >> 16)); + v.m128_f32[2] = 0.0f; + v.m128_f32[3] = 1.0f; +} + uint64_t LoadClock(void* raw_context) { LARGE_INTEGER counter; uint64_t time = 0; @@ -378,7 +389,7 @@ table->AddSequence(OPCODE_CAST, [](X64Emitter& e, Instr*& i) { Xmm src; e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src, 0); - 
e.pextrd(dest, src, 0); + e.vmovd(dest, src); e.EndOp(dest, src); } else { UNIMPLEMENTED_SEQ(); @@ -389,7 +400,7 @@ table->AddSequence(OPCODE_CAST, [](X64Emitter& e, Instr*& i) { Xmm src; e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src, 0); - e.pextrq(dest, src, 0); + e.vmovq(dest, src); e.EndOp(dest, src); } else { UNIMPLEMENTED_SEQ(); @@ -400,7 +411,7 @@ table->AddSequence(OPCODE_CAST, [](X64Emitter& e, Instr*& i) { Reg32 src; e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src, 0); - e.pinsrd(dest, src, 0); + e.vmovd(dest, src); e.EndOp(dest, src); } else { UNIMPLEMENTED_SEQ(); @@ -411,7 +422,7 @@ table->AddSequence(OPCODE_CAST, [](X64Emitter& e, Instr*& i) { Reg64 src; e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src, 0); - e.pinsrq(dest, src, 0); + e.vmovq(dest, src); e.EndOp(dest, src); } else { UNIMPLEMENTED_SEQ(); @@ -582,7 +593,7 @@ table->AddSequence(OPCODE_CONVERT, [](X64Emitter& e, Instr*& i) { e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src, 0); // TODO(benvanik): additional checks for saturation/etc? cvtt* (trunc?) - e.cvtss2si(dest, src); + e.cvttss2si(dest, src); e.EndOp(dest, src); } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_F64)) { Reg32 dest; @@ -591,7 +602,7 @@ table->AddSequence(OPCODE_CONVERT, [](X64Emitter& e, Instr*& i) { i->src1.value, src, 0); // TODO(benvanik): additional checks for saturation/etc? cvtt* (trunc?) e.cvtsd2ss(e.xmm0, src); - e.cvtss2si(dest, e.xmm0); + e.cvttss2si(dest, e.xmm0); e.EndOp(dest, src); } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_F64)) { Reg64 dest; @@ -599,7 +610,7 @@ table->AddSequence(OPCODE_CONVERT, [](X64Emitter& e, Instr*& i) { e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src, 0); // TODO(benvanik): additional checks for saturation/etc? cvtt* (trunc?) - e.cvtsd2si(dest, src); + e.cvttsd2si(dest, src); e.EndOp(dest, src); } else if (i->Match(SIG_TYPE_F32, SIG_TYPE_I32)) { Xmm dest; @@ -764,10 +775,11 @@ table->AddSequence(OPCODE_LOAD_CLOCK, [](X64Emitter& e, Instr*& i) { // -------------------------------------------------------------------------- table->AddSequence(OPCODE_LOAD_CONTEXT, [](X64Emitter& e, Instr*& i) { + auto addr = e.rcx + i->src1.offset; if (i->Match(SIG_TYPE_I8, SIG_TYPE_IGNORE)) { Reg8 dest; e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.byte[e.rcx + i->src1.offset]); + e.mov(dest, e.byte[addr]); e.EndOp(dest); #if DTRACE e.mov(e.rdx, i->src1.offset); @@ -777,7 +789,7 @@ table->AddSequence(OPCODE_LOAD_CONTEXT, [](X64Emitter& e, Instr*& i) { } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_IGNORE)) { Reg16 dest; e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.word[e.rcx + i->src1.offset]); + e.mov(dest, e.word[addr]); e.EndOp(dest); #if DTRACE e.mov(e.rdx, i->src1.offset); @@ -787,7 +799,7 @@ table->AddSequence(OPCODE_LOAD_CONTEXT, [](X64Emitter& e, Instr*& i) { } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_IGNORE)) { Reg32 dest; e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.dword[e.rcx + i->src1.offset]); + e.mov(dest, e.dword[addr]); e.EndOp(dest); #if DTRACE e.mov(e.rdx, i->src1.offset); @@ -797,7 +809,7 @@ table->AddSequence(OPCODE_LOAD_CONTEXT, [](X64Emitter& e, Instr*& i) { } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_IGNORE)) { Reg64 dest; e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.qword[e.rcx + i->src1.offset]); + e.mov(dest, e.qword[addr]); e.EndOp(dest); #if DTRACE e.mov(e.rdx, i->src1.offset); @@ -807,28 +819,28 @@ table->AddSequence(OPCODE_LOAD_CONTEXT, [](X64Emitter& e, Instr*& i) { } else if (i->Match(SIG_TYPE_F32, SIG_TYPE_IGNORE)) { Xmm dest; e.BeginOp(i->dest, 
dest, REG_DEST); - e.movss(dest, e.dword[e.rcx + i->src1.offset]); + e.movss(dest, e.dword[addr]); e.EndOp(dest); #if DTRACE e.mov(e.rdx, i->src1.offset); - e.movaps(e.xmm0, dest); + e.lea(e.r8, Stash(e, dest)); CallNative(e, TraceContextLoadF32); #endif // DTRACE } else if (i->Match(SIG_TYPE_F64, SIG_TYPE_IGNORE)) { Xmm dest; e.BeginOp(i->dest, dest, REG_DEST); - e.movsd(dest, e.qword[e.rcx + i->src1.offset]); + e.movsd(dest, e.qword[addr]); e.EndOp(dest); #if DTRACE e.mov(e.rdx, i->src1.offset); - e.movaps(e.xmm0, dest); + e.lea(e.r8, Stash(e, dest)); CallNative(e, TraceContextLoadF64); #endif // DTRACE } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_IGNORE)) { Xmm dest; e.BeginOp(i->dest, dest, REG_DEST); // NOTE: we always know we are aligned. - e.movaps(dest, e.ptr[e.rcx + i->src1.offset]); + e.movaps(dest, e.ptr[addr]); e.EndOp(dest); #if DTRACE e.mov(e.rdx, i->src1.offset); @@ -843,10 +855,11 @@ table->AddSequence(OPCODE_LOAD_CONTEXT, [](X64Emitter& e, Instr*& i) { }); table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) { + auto addr = e.rcx + i->src1.offset; if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I8)) { Reg8 src; e.BeginOp(i->src2.value, src, 0); - e.mov(e.byte[e.rcx + i->src1.offset], src); + e.mov(e.byte[addr], src); e.EndOp(src); #if DTRACE e.mov(e.rdx, i->src1.offset); @@ -854,7 +867,7 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) { CallNative(e, TraceContextStoreI8); #endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I8C)) { - e.mov(e.byte[e.rcx + i->src1.offset], i->src2.value->constant.i8); + e.mov(e.byte[addr], i->src2.value->constant.i8); #if DTRACE e.mov(e.rdx, i->src1.offset); e.mov(e.r8b, i->src2.value->constant.i8); @@ -863,7 +876,7 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) { } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16)) { Reg16 src; e.BeginOp(i->src2.value, src, 0); - e.mov(e.word[e.rcx + i->src1.offset], src); + e.mov(e.word[addr], src); e.EndOp(src); #if DTRACE e.mov(e.rdx, i->src1.offset); @@ -871,7 +884,7 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) { CallNative(e, TraceContextStoreI16); #endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16C)) { - e.mov(e.word[e.rcx + i->src1.offset], i->src2.value->constant.i16); + e.mov(e.word[addr], i->src2.value->constant.i16); #if DTRACE e.mov(e.rdx, i->src1.offset); e.mov(e.r8w, i->src2.value->constant.i16); @@ -880,7 +893,7 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) { } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32)) { Reg32 src; e.BeginOp(i->src2.value, src, 0); - e.mov(e.dword[e.rcx + i->src1.offset], src); + e.mov(e.dword[addr], src); e.EndOp(src); #if DTRACE e.mov(e.rdx, i->src1.offset); @@ -888,7 +901,7 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) { CallNative(e, TraceContextStoreI32); #endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32C)) { - e.mov(e.dword[e.rcx + i->src1.offset], i->src2.value->constant.i32); + e.mov(e.dword[addr], i->src2.value->constant.i32); #if DTRACE e.mov(e.rdx, i->src1.offset); e.mov(e.r8d, i->src2.value->constant.i32); @@ -897,7 +910,7 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) { } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64)) { Reg64 src; e.BeginOp(i->src2.value, src, 0); - e.mov(e.qword[e.rcx + i->src1.offset], src); + e.mov(e.qword[addr], src); e.EndOp(src); #if DTRACE 
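 // (Tracer calls follow the Win64 convention used throughout: rcx already
 // holds the raw context, so the offset rides in rdx and the value in
 // r8/r8b before calling out to the C tracer.)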
e.mov(e.rdx, i->src1.offset); @@ -905,7 +918,7 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) { CallNative(e, TraceContextStoreI64); #endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64C)) { - MovMem64(e, e.rcx + i->src1.offset, i->src2.value->constant.i64); + MovMem64(e, addr, i->src2.value->constant.i64); #if DTRACE e.mov(e.rdx, i->src1.offset); e.mov(e.r8, i->src2.value->constant.i64); @@ -914,42 +927,46 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) { } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32)) { Xmm src; e.BeginOp(i->src2.value, src, 0); - e.movss(e.dword[e.rcx + i->src1.offset], src); + e.movss(e.dword[addr], src); e.EndOp(src); #if DTRACE e.mov(e.rdx, i->src1.offset); - e.movss(e.xmm0, src); + e.lea(e.r8, Stash(e, src)); CallNative(e, TraceContextStoreF32); #endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32C)) { - e.mov(e.dword[e.rcx + i->src1.offset], i->src2.value->constant.i32); + e.mov(e.dword[addr], i->src2.value->constant.i32); #if DTRACE e.mov(e.rdx, i->src1.offset); - e.movss(e.xmm0, e.dword[e.rcx + i->src1.offset]); + e.mov(e.eax, i->src2.value->constant.i32); + e.vmovd(e.xmm0, e.eax); + e.lea(e.r8, Stash(e, e.xmm0)); CallNative(e, TraceContextStoreF32); #endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64)) { Xmm src; e.BeginOp(i->src2.value, src, 0); - e.movsd(e.qword[e.rcx + i->src1.offset], src); + e.movsd(e.qword[addr], src); e.EndOp(src); #if DTRACE e.mov(e.rdx, i->src1.offset); - e.movsd(e.xmm0, src); + e.lea(e.r8, Stash(e, src)); CallNative(e, TraceContextStoreF64); #endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64C)) { - MovMem64(e, e.rcx + i->src1.offset, i->src2.value->constant.i64); + MovMem64(e, addr, i->src2.value->constant.i64); #if DTRACE e.mov(e.rdx, i->src1.offset); - e.movsd(e.xmm0, e.qword[e.rcx + i->src1.offset]); + e.mov(e.rax, i->src2.value->constant.i64); + e.vmovq(e.xmm0, e.rax); + e.lea(e.r8, Stash(e, e.xmm0)); CallNative(e, TraceContextStoreF64); #endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128)) { Xmm src; e.BeginOp(i->src2.value, src, 0); // NOTE: we always know we are aligned. - e.movaps(e.ptr[e.rcx + i->src1.offset], src); + e.movaps(e.ptr[addr], src); e.EndOp(src); #if DTRACE e.mov(e.rdx, i->src1.offset); @@ -959,11 +976,11 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) { } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128C)) { // TODO(benvanik): check zero // TODO(benvanik): correct order? 
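 // (Assuming vec128_t keeps .low at +0 and .high at +8 in memory; if the
 // guest expects the opposite ordering, this is where it would show up.)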
- MovMem64(e, e.rcx + i->src1.offset, i->src2.value->constant.v128.low); - MovMem64(e, e.rcx + i->src1.offset + 8, i->src2.value->constant.v128.high); + MovMem64(e, addr, i->src2.value->constant.v128.low); + MovMem64(e, addr + 8, i->src2.value->constant.v128.high); #if DTRACE e.mov(e.rdx, i->src1.offset); - e.lea(e.r8, e.ptr[e.rcx + i->src1.offset]); + e.lea(e.r8, e.ptr[addr]); CallNative(e, TraceContextStoreV128); #endif // DTRACE } else { @@ -1062,7 +1079,7 @@ table->AddSequence(OPCODE_LOAD, [](X64Emitter& e, Instr*& i) { e.EndOp(dest); #if DTRACE e.lea(e.rdx, e.ptr[addr]); - e.movss(e.xmm0, dest); + e.lea(e.r8, Stash(e, dest)); CallNative(e, TraceMemoryLoadF32); #endif // DTRACE } else if (i->Match(SIG_TYPE_F64, SIG_TYPE_IGNORE)) { @@ -1072,7 +1089,7 @@ table->AddSequence(OPCODE_LOAD, [](X64Emitter& e, Instr*& i) { e.EndOp(dest); #if DTRACE e.lea(e.rdx, e.ptr[addr]); - e.movsd(e.xmm0, dest); + e.lea(e.r8, Stash(e, dest)); CallNative(e, TraceMemoryLoadF64); #endif // DTRACE } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_IGNORE)) { @@ -1224,14 +1241,16 @@ table->AddSequence(OPCODE_STORE, [](X64Emitter& e, Instr*& i) { e.EndOp(src); #if DTRACE e.lea(e.rdx, e.ptr[addr]); - e.movss(e.xmm0, src); + e.lea(e.r8, Stash(e, src)); CallNative(e, TraceMemoryStoreF32); #endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32C)) { e.mov(e.dword[addr], i->src2.value->constant.i32); #if DTRACE e.lea(e.rdx, e.ptr[addr]); - e.movss(e.xmm0, e.ptr[addr]); + e.mov(e.eax, i->src2.value->constant.i32); + e.vmovd(e.xmm0, e.eax); + e.lea(e.r8, Stash(e, e.xmm0)); CallNative(e, TraceMemoryStoreF32); #endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64)) { @@ -1241,7 +1260,7 @@ table->AddSequence(OPCODE_STORE, [](X64Emitter& e, Instr*& i) { e.EndOp(src); #if DTRACE e.lea(e.rdx, e.ptr[addr]); - e.movsd(e.xmm0, src); + e.lea(e.r8, Stash(e, src)); CallNative(e, TraceMemoryStoreF64); #endif // DTRACE } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64C)) { @@ -2160,7 +2179,6 @@ table->AddSequence(OPCODE_VECTOR_SHL, [](X64Emitter& e, Instr*& i) { XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { // src shift mask may have values >31, and x86 sets to zero when // that happens so we mask. - e.db(0xCC); e.mov(e.eax, 0x1F); e.vmovd(e.xmm0, e.eax); e.vpbroadcastd(e.xmm0, e.xmm0); @@ -2637,16 +2655,14 @@ table->AddSequence(OPCODE_UNPACK, [](X64Emitter& e, Instr*& i) { XmmUnaryOp(e, i, 0, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) { // sx = src.iw >> 16; // sy = src.iw & 0xFFFF; - // dest = { 3.0 + (sx / float(1 << 22)), - // 3.0 + (sy / float(1 << 22)), + // dest = { XMConvertHalfToFloat(sx), + // XMConvertHalfToFloat(sy), // 0.0, - // 1.0); --- or 3.0? - // So: - // xmm = {0,0,0,packed} - // xmm <<= 1w {0,0,packed,0} - // xmm = VCVTPH2PS(xmm) {sx,sy,0,0} - // xmm /= - UNIMPLEMENTED_SEQ(); + // 1.0 }; + auto addr = Stash(e, src); + e.lea(e.rdx, addr); + CallNative(e, Unpack_FLOAT16_2); + e.movaps(dest, addr); }); } else if (i->flags == PACK_TYPE_FLOAT16_4) { // Could be shared with FLOAT16_2. 
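For reference, the DirectXMath XMConvertHalfToFloat that Unpack_FLOAT16_2 leans on expands an IEEE 754 half to a float roughly as in the sketch below. This is a simplified illustration only: the helper name is made up, and denormal half inputs are flushed to zero here, which the real routine does handle.

#include <cstdint>
#include <cstring>

// Sketch: IEEE half (1s/5e/10m) -> float (1s/8e/23m), denormals flushed.
float HalfToFloatSketch(uint16_t h) {
  uint32_t sign = (uint32_t)(h >> 15) << 31;
  uint32_t exponent = (h >> 10) & 0x1F;
  uint32_t mantissa = h & 0x3FF;
  uint32_t bits;
  if (exponent == 0) {
    bits = sign;  // zero (denormal inputs flushed for brevity)
  } else if (exponent == 31) {
    bits = sign | 0x7F800000 | (mantissa << 13);  // infinity or NaN
  } else {
    // Rebias the exponent (15 -> 127) and widen the mantissa 10 -> 23 bits.
    bits = sign | ((exponent - 15 + 127) << 23) | (mantissa << 13);
  }
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}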
diff --git a/src/alloy/backend/x64/lowering/tracers.cc b/src/alloy/backend/x64/lowering/tracers.cc
index f53d70dc1..0d7975847 100644
--- a/src/alloy/backend/x64/lowering/tracers.cc
+++ b/src/alloy/backend/x64/lowering/tracers.cc
@@ -57,23 +57,18 @@ void TraceContextLoadI64(void* raw_context, uint64_t offset, uint64_t value) {
   auto thread_state = *((ThreadState**)raw_context);
   DPRINT("%lld (%llX) = ctx i64 +%d\n", (int64_t)value, value, offset);
 }
-void TraceContextLoadF32(void* raw_context, uint64_t offset, float value) {
+void TraceContextLoadF32(void* raw_context, uint64_t offset, __m128 value) {
   auto thread_state = *((ThreadState**)raw_context);
-  union {
-    float f;
-    uint32_t u;
-  } x;
-  x.f = value;
-  DPRINT("%e (%X) = ctx f32 +%d\n", x.f, x.u, offset);
+  DPRINT("%e (%X) = ctx f32 +%d\n", value.m128_f32[0], value.m128_i32[0], offset);
 }
-void TraceContextLoadF64(void* raw_context, uint64_t offset, double value) {
+void TraceContextLoadF64(void* raw_context, uint64_t offset, __m128 value) {
   auto thread_state = *((ThreadState**)raw_context);
   union {
-    double f;
-    uint64_t u;
-  } x;
-  x.f = value;
-  DPRINT("%lle (%llX) = ctx f64 +%d\n", x.f, x.u, offset);
+    double d;
+    uint64_t x;
+  } f;
+  f.x = value.m128_i64[0];
+  DPRINT("%lle (%llX) = ctx f64 +%d\n", f.d, value.m128_i64[0], offset);
 }
 void TraceContextLoadV128(void* raw_context, uint64_t offset, __m128 value) {
   auto thread_state = *((ThreadState**)raw_context);
@@ -99,23 +94,18 @@ void TraceContextStoreI64(void* raw_context, uint64_t offset, uint64_t value) {
   auto thread_state = *((ThreadState**)raw_context);
   DPRINT("ctx i64 +%d = %lld (%llX)\n", offset, (int64_t)value, value);
 }
-void TraceContextStoreF32(void* raw_context, uint64_t offset, float value) {
+void TraceContextStoreF32(void* raw_context, uint64_t offset, __m128 value) {
   auto thread_state = *((ThreadState**)raw_context);
-  union {
-    float f;
-    uint32_t u;
-  } x;
-  x.f = value;
-  DPRINT("ctx f32 +%d = %e (%.X)\n", offset, x.f, x.u);
+  DPRINT("ctx f32 +%d = %e (%X)\n", offset, value.m128_f32[0], value.m128_i32[0]);
 }
-void TraceContextStoreF64(void* raw_context, uint64_t offset, double value) {
+void TraceContextStoreF64(void* raw_context, uint64_t offset, __m128 value) {
   auto thread_state = *((ThreadState**)raw_context);
   union {
-    double f;
-    uint64_t u;
-  } x;
-  x.f = value;
-  DPRINT("ctx f64 +%d = %lle (%.llX)\n", offset, x.f, x.u);
+    double d;
+    uint64_t x;
+  } f;
+  f.x = value.m128_i64[0];
+  DPRINT("ctx f64 +%d = %lle (%llX)\n", offset, f.d, value.m128_i64[0]);
 }
 void TraceContextStoreV128(void* raw_context, uint64_t offset, __m128 value) {
   auto thread_state = *((ThreadState**)raw_context);
@@ -140,23 +130,18 @@ void TraceMemoryLoadI64(void* raw_context, uint64_t address, uint64_t value) {
   auto thread_state = *((ThreadState**)raw_context);
   DPRINT("%lld (%llX) = load.i64 %.8X\n", (int64_t)value, value, address);
 }
-void TraceMemoryLoadF32(void* raw_context, uint64_t address, float value) {
+void TraceMemoryLoadF32(void* raw_context, uint64_t address, __m128 value) {
   auto thread_state = *((ThreadState**)raw_context);
-  union {
-    float f;
-    uint32_t u;
-  } x;
-  x.f = value;
-  DPRINT("%e (%X) = load.f32 %.8X\n", x.f, x.u, address);
+  DPRINT("%e (%X) = load.f32 %.8X\n", value.m128_f32[0], value.m128_i32[0], address);
 }
-void TraceMemoryLoadF64(void* raw_context, uint64_t address, double value) {
+void TraceMemoryLoadF64(void* raw_context, uint64_t address, __m128 value) {
   auto thread_state = *((ThreadState**)raw_context);
   union {
-    double f;
-    uint64_t u;
-  } x;
-  x.f = value;
-  DPRINT("%lle (%llX) = load.f64 %.8X\n", x.f, x.u, address);
+    double d;
+    uint64_t x;
+  } f;
+  f.x = value.m128_i64[0];
+  DPRINT("%lle (%llX) = load.f64 %.8X\n", f.d, value.m128_i64[0], address);
 }
 void TraceMemoryLoadV128(void* raw_context, uint64_t address, __m128 value) {
   auto thread_state = *((ThreadState**)raw_context);
@@ -182,23 +167,18 @@ void TraceMemoryStoreI64(void* raw_context, uint64_t address, uint64_t value) {
   auto thread_state = *((ThreadState**)raw_context);
   DPRINT("store.i64 %.8X = %lld (%llX)\n", address, (int64_t)value, value);
 }
-void TraceMemoryStoreF32(void* raw_context, uint64_t address, float value) {
+void TraceMemoryStoreF32(void* raw_context, uint64_t address, __m128 value) {
   auto thread_state = *((ThreadState**)raw_context);
-  union {
-    float f;
-    uint32_t u;
-  } x;
-  x.f = value;
-  DPRINT("store.f32 %.8X = %e (%X)\n", address, x.f, x.u);
+  DPRINT("store.f32 %.8X = %e (%X)\n", address, value.m128_f32[0], value.m128_i32[0]);
 }
-void TraceMemoryStoreF64(void* raw_context, uint64_t address, double value) {
+void TraceMemoryStoreF64(void* raw_context, uint64_t address, __m128 value) {
   auto thread_state = *((ThreadState**)raw_context);
   union {
-    double f;
-    uint64_t u;
-  } x;
-  x.f = value;
-  DPRINT("store.f64 %.8X = %lle (%llX)\n", address, x.f, x.u);
+    double d;
+    uint64_t x;
+  } f;
+  f.x = value.m128_i64[0];
+  DPRINT("store.f64 %.8X = %lle (%llX)\n", address, f.d, value.m128_i64[0]);
 }
 void TraceMemoryStoreV128(void* raw_context, uint64_t address, __m128 value) {
   auto thread_state = *((ThreadState**)raw_context);
diff --git a/src/alloy/backend/x64/lowering/tracers.h b/src/alloy/backend/x64/lowering/tracers.h
index eccc87de9..9afd58448 100644
--- a/src/alloy/backend/x64/lowering/tracers.h
+++ b/src/alloy/backend/x64/lowering/tracers.h
@@ -25,32 +25,32 @@
 void TraceContextLoadI8(void* raw_context, uint64_t offset, uint8_t value);
 void TraceContextLoadI16(void* raw_context, uint64_t offset, uint16_t value);
 void TraceContextLoadI32(void* raw_context, uint64_t offset, uint32_t value);
 void TraceContextLoadI64(void* raw_context, uint64_t offset, uint64_t value);
-void TraceContextLoadF32(void* raw_context, uint64_t offset, float value);
-void TraceContextLoadF64(void* raw_context, uint64_t offset, double value);
+void TraceContextLoadF32(void* raw_context, uint64_t offset, __m128 value);
+void TraceContextLoadF64(void* raw_context, uint64_t offset, __m128 value);
 void TraceContextLoadV128(void* raw_context, uint64_t offset, __m128 value);
 
 void TraceContextStoreI8(void* raw_context, uint64_t offset, uint8_t value);
 void TraceContextStoreI16(void* raw_context, uint64_t offset, uint16_t value);
 void TraceContextStoreI32(void* raw_context, uint64_t offset, uint32_t value);
 void TraceContextStoreI64(void* raw_context, uint64_t offset, uint64_t value);
-void TraceContextStoreF32(void* raw_context, uint64_t offset, float value);
-void TraceContextStoreF64(void* raw_context, uint64_t offset, double value);
+void TraceContextStoreF32(void* raw_context, uint64_t offset, __m128 value);
+void TraceContextStoreF64(void* raw_context, uint64_t offset, __m128 value);
 void TraceContextStoreV128(void* raw_context, uint64_t offset, __m128 value);
 
 void TraceMemoryLoadI8(void* raw_context, uint64_t address, uint8_t value);
 void TraceMemoryLoadI16(void* raw_context, uint64_t address, uint16_t value);
 void TraceMemoryLoadI32(void* raw_context, uint64_t address, uint32_t value);
 void TraceMemoryLoadI64(void* raw_context, uint64_t address, uint64_t value);
-void
TraceMemoryLoadF32(void* raw_context, uint64_t address, float value); -void TraceMemoryLoadF64(void* raw_context, uint64_t address, double value); +void TraceMemoryLoadF32(void* raw_context, uint64_t address, __m128 value); +void TraceMemoryLoadF64(void* raw_context, uint64_t address, __m128 value); void TraceMemoryLoadV128(void* raw_context, uint64_t address, __m128 value); void TraceMemoryStoreI8(void* raw_context, uint64_t address, uint8_t value); void TraceMemoryStoreI16(void* raw_context, uint64_t address, uint16_t value); void TraceMemoryStoreI32(void* raw_context, uint64_t address, uint32_t value); void TraceMemoryStoreI64(void* raw_context, uint64_t address, uint64_t value); -void TraceMemoryStoreF32(void* raw_context, uint64_t address, float value); -void TraceMemoryStoreF64(void* raw_context, uint64_t address, double value); +void TraceMemoryStoreF32(void* raw_context, uint64_t address, __m128 value); +void TraceMemoryStoreF64(void* raw_context, uint64_t address, __m128 value); void TraceMemoryStoreV128(void* raw_context, uint64_t address, __m128 value); } // namespace lowering diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index 1e20c84f4..59fcfb36a 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -116,7 +116,9 @@ int X64Emitter::Emit(HIRBuilder* builder) { GetRegBit(r11) | GetRegBit(xmm1) | GetRegBit(xmm2) | - GetRegBit(xmm3); + GetRegBit(xmm3) | + GetRegBit(xmm4) | + GetRegBit(xmm5); // Function prolog. // Must be 16b aligned. From 62ced2742e1f764d1b8a40b4892603ee06a08592 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 28 Jan 2014 22:11:33 -0800 Subject: [PATCH 040/184] Handling unimplemented externs. --- .../backend/x64/lowering/lowering_sequences.cc | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index f22d34b7d..943a28a5a 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -90,6 +90,12 @@ void Dummy() { // } +void UnimplementedExtern(void* raw_context, ExternFunction* extern_fn) { + // TODO(benvanik): generate this thunk at runtime? or a shim? + auto thread_state = *((ThreadState**)raw_context); + extern_fn->Call(thread_state); +} + void Unpack_FLOAT16_2(void* raw_context, __m128& v) { uint32_t src = v.m128_i32[3]; v.m128_f32[0] = DirectX::PackedVector::XMConvertHalfToFloat((uint16_t)src); @@ -142,9 +148,15 @@ void IssueCall(X64Emitter& e, FunctionInfo* symbol_info, uint32_t flags) { auto fn = symbol_info->function(); if (fn && fn->type() == Function::EXTERN_FUNCTION) { auto extern_fn = (ExternFunction*)fn; - e.mov(e.rdx, (uint64_t)extern_fn->arg0()); - e.mov(e.r8, (uint64_t)extern_fn->arg1()); - e.mov(e.rax, (uint64_t)extern_fn->handler()); + if (extern_fn->handler()) { + e.mov(e.rdx, (uint64_t)extern_fn->arg0()); + e.mov(e.r8, (uint64_t)extern_fn->arg1()); + e.mov(e.rax, (uint64_t)extern_fn->handler()); + } else { + // Unimplemented - call dummy. + e.mov(e.rdx, (uint64_t)extern_fn); + e.mov(e.rax, (uint64_t)UnimplementedExtern); + } } else { // Generic call, resolve address. // TODO(benvanik): caching/etc. For now this makes debugging easier. From 53d4cbf2c5f440b9c76fecd9cc7bfd6b2a3a9225 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 28 Jan 2014 22:49:51 -0800 Subject: [PATCH 041/184] Messing with flags. subficx sometimes still wrong. 
--- .../x64/lowering/lowering_sequences.cc | 11 ++-- src/alloy/backend/x64/lowering/op_utils.inl | 53 +++++++++++++------ 2 files changed, 42 insertions(+), 22 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index 943a28a5a..d02bcd18d 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -113,13 +113,6 @@ uint64_t LoadClock(void* raw_context) { return time; } -void CallNative(X64Emitter& e, void* target) { - e.mov(e.rax, (uint64_t)target); - e.call(e.rax); - e.mov(e.rcx, e.qword[e.rsp + 0]); - e.mov(e.rdx, e.qword[e.rcx + 8]); // membase -} - // TODO(benvanik): fancy stuff. void* ResolveFunctionSymbol(void* raw_context, FunctionInfo* symbol_info) { // TODO(benvanik): generate this thunk at runtime? or a shim? @@ -1161,6 +1154,8 @@ table->AddSequence(OPCODE_STORE, [](X64Emitter& e, Instr*& i) { // eh? e.bswap(e.r8); CallNative(e, cbs->write); + i = e.Advance(i); + return true; } cbs = cbs->next; } @@ -1534,6 +1529,7 @@ table->AddSequence(OPCODE_COMPARE_UGE, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_DID_CARRY, [](X64Emitter& e, Instr*& i) { Reg8 dest; e.BeginOp(i->dest, dest, REG_DEST); + LoadEflags(e); e.setc(dest); e.EndOp(dest); i = e.Advance(i); @@ -1543,6 +1539,7 @@ table->AddSequence(OPCODE_DID_CARRY, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_DID_OVERFLOW, [](X64Emitter& e, Instr*& i) { Reg8 dest; e.BeginOp(i->dest, dest, REG_DEST); + LoadEflags(e); e.seto(dest); e.EndOp(dest); i = e.Advance(i); diff --git a/src/alloy/backend/x64/lowering/op_utils.inl b/src/alloy/backend/x64/lowering/op_utils.inl index 17c947ef8..d519b634c 100644 --- a/src/alloy/backend/x64/lowering/op_utils.inl +++ b/src/alloy/backend/x64/lowering/op_utils.inl @@ -17,6 +17,30 @@ namespace { #define LIKE_REG(dest, like) Reg(dest.getIdx(), dest.getKind(), like.getBit(), false) #define NAX_LIKE(like) Reg(e.rax.getIdx(), e.rax.getKind(), like.getBit(), false) +// If we are running with tracing on we have to store the EFLAGS in the stack, +// otherwise our calls out to C to print will clear it before DID_CARRY/etc +// can get the value. +#define STORE_EFLAGS 1 + +void LoadEflags(X64Emitter& e) { +#if STORE_EFLAGS + e.mov(e.eax, e.dword[e.rsp + 40]); + e.push(e.ax); + e.popf(); +#else + // EFLAGS already present. +#endif // STORE_EFLAGS +} +void StoreEflags(X64Emitter& e) { +#if STORE_EFLAGS + e.pushf(); + e.pop(e.word[e.rsp + 40]); +#else + // EFLAGS should have CA set? + // (so long as we don't fuck with it) +#endif // STORE_EFLAGS +} + Address Stash(X64Emitter& e, const Xmm& r) { // TODO(benvanik): ensure aligned. auto addr = e.ptr[e.rsp + 48]; @@ -39,6 +63,15 @@ void MovMem64(X64Emitter& e, RegExp& addr, uint64_t v) { } } +void CallNative(X64Emitter& e, void* target) { + e.sub(e.rsp, 0x18); + e.mov(e.rax, (uint64_t)target); + e.call(e.rax); + e.mov(e.rcx, e.qword[e.rsp + 0]); + e.mov(e.rdx, e.qword[e.rcx + 8]); // membase + e.add(e.rsp, 0x18); +} + // Sets EFLAGs with zf for the given value. // ZF = 1 if false, 0 = true (so jz = jump if false) void CheckBoolean(X64Emitter& e, Value* v) { @@ -421,9 +454,7 @@ void IntUnaryOp(X64Emitter& e, Instr*& i, v_fn v_fn) { ASSERT_INVALID_TYPE(); } if (i->flags & ARITHMETIC_SET_CARRY) { - // EFLAGS should have CA set? 
- // (so long as we don't fuck with it) - // UNIMPLEMENTED_SEQ(); + StoreEflags(e); } }; @@ -598,9 +629,7 @@ void IntBinaryOp(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn) { ASSERT_INVALID_TYPE(); } if (i->flags & ARITHMETIC_SET_CARRY) { - // EFLAGS should have CA set? - // (so long as we don't fuck with it) - // UNIMPLEMENTED_SEQ(); + StoreEflags(e); } }; @@ -777,9 +806,7 @@ void IntTernaryOp(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vvc_fn vvc_fn, vcv_fn ASSERT_INVALID_TYPE(); } if (i->flags & ARITHMETIC_SET_CARRY) { - // EFLAGS should have CA set? - // (so long as we don't fuck with it) - // UNIMPLEMENTED_SEQ(); + StoreEflags(e); } } @@ -946,9 +973,7 @@ void XmmBinaryOp(X64Emitter& e, Instr*& i, uint32_t flags, xmm_vv_fn vv_fn) { ASSERT_INVALID_TYPE(); } if (flags & ARITHMETIC_SET_CARRY) { - // EFLAGS should have CA set? - // (so long as we don't fuck with it) - // UNIMPLEMENTED_SEQ(); + StoreEflags(e); } }; @@ -992,9 +1017,7 @@ void XmmTernaryOp(X64Emitter& e, Instr*& i, uint32_t flags, xmm_vvv_fn vvv_fn) { ASSERT_INVALID_TYPE(); } if (flags & ARITHMETIC_SET_CARRY) { - // EFLAGS should have CA set? - // (so long as we don't fuck with it) - // UNIMPLEMENTED_SEQ(); + StoreEflags(e); } }; From 949b1a222072624af2936e6d1722cdde585483d4 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 28 Jan 2014 23:21:55 -0800 Subject: [PATCH 042/184] Adding dynamic access checks. Eww. --- .../x64/lowering/lowering_sequences.cc | 116 ++++++++++++++++++ 1 file changed, 116 insertions(+) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index d02bcd18d..9393c2da9 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -30,6 +30,12 @@ using namespace Xbyak; namespace { +// Make loads/stores to ints check to see if they are doing a register value. +// This is slow, and with proper constant propagation we may be able to always +// avoid it. +// TODO(benvanik): make a compile time flag? 
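+// (These "register values" are MMIO-style addresses: the emitted check
+// below compares the high byte of the address against 0x7F and, on a
+// match, routes the access through the runtime's access-callback list
+// instead of plain guest memory.)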
+#define DYNAMIC_REGISTER_ACCESS_CHECK 1
+
 #define UNIMPLEMENTED_SEQ() __debugbreak()
 #define ASSERT_INVALID_TYPE() XEASSERTALWAYS()
@@ -96,6 +102,30 @@ void Dummy() {
 //
 }
 
+uint64_t DynamicRegisterLoad(void* raw_context, uint32_t address) {
+  auto thread_state = *((ThreadState**)raw_context);
+  auto cbs = thread_state->runtime()->access_callbacks();
+  while (cbs) {
+    if (cbs->handles(cbs->context, address)) {
+      return cbs->read(cbs->context, address);
+    }
+    cbs = cbs->next;
+  }
+  return 0;
+}
+
+void DynamicRegisterStore(void* raw_context, uint32_t address, uint64_t value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  auto cbs = thread_state->runtime()->access_callbacks();
+  while (cbs) {
+    if (cbs->handles(cbs->context, address)) {
+      cbs->write(cbs->context, address, value);
+      return;
+    }
+    cbs = cbs->next;
+  }
+}
+
 void Unpack_FLOAT16_2(void* raw_context, __m128& v) {
   uint32_t src = v.m128_i32[3];
   v.m128_f32[0] = DirectX::PackedVector::XMConvertHalfToFloat((uint16_t)src);
@@ -1037,6 +1065,44 @@ table->AddSequence(OPCODE_LOAD, [](X64Emitter& e, Instr*& i) {
     e.mov(addr_off.cvt32(), addr_off.cvt32()); // trunc to 32bits
     addr = e.rdx + addr_off;
   }
+
+#if DYNAMIC_REGISTER_ACCESS_CHECK
+  e.inLocalLabel();
+  // if ((address & 0xFF000000) == 0x7F000000) do check;
+  e.lea(e.r8d, e.ptr[addr]);
+  e.and(e.r8d, 0xFF000000);
+  e.cmp(e.r8d, 0x7F000000);
+  e.jne(".normal_addr");
+  if (IsIntType(i->dest->type)) {
+    e.mov(e.rdx, addr_off);
+    CallNative(e, DynamicRegisterLoad);
+    Reg64 dyn_dest;
+    e.BeginOp(i->dest, dyn_dest, REG_DEST);
+    switch (i->dest->type) {
+    case INT8_TYPE:
+      e.movzx(dyn_dest, e.al);
+      break;
+    case INT16_TYPE:
+      e.movzx(dyn_dest, e.ax);
+      break;
+    case INT32_TYPE:
+      e.mov(dyn_dest.cvt32(), e.eax);
+      break;
+    case INT64_TYPE:
+      e.mov(dyn_dest, e.rax);
+      break;
+    default:
+      e.db(0xCC);
+      break;
+    }
+    e.EndOp(dyn_dest);
+  } else {
+    e.db(0xCC);
+  }
+  e.jmp(".skip_access");
+  e.L(".normal_addr");
+#endif // DYNAMIC_REGISTER_ACCESS_CHECK
+
   if (i->Match(SIG_TYPE_I8, SIG_TYPE_IGNORE)) {
     Reg8 dest;
     e.BeginOp(i->dest, dest, REG_DEST);
@@ -1114,6 +1180,12 @@ table->AddSequence(OPCODE_LOAD, [](X64Emitter& e, Instr*& i) {
   if (!i->src1.value->IsConstant()) {
     e.EndOp(addr_off);
   }
+
+#if DYNAMIC_REGISTER_ACCESS_CHECK
+  e.L(".skip_access");
+  e.outLocalLabel();
+#endif // DYNAMIC_REGISTER_ACCESS_CHECK
+
   i = e.Advance(i);
   return true;
 });
@@ -1173,6 +1245,44 @@ table->AddSequence(OPCODE_STORE, [](X64Emitter& e, Instr*& i) {
     e.mov(addr_off.cvt32(), addr_off.cvt32()); // trunc to 32bits
     addr = e.rdx + addr_off;
   }
+
+#if DYNAMIC_REGISTER_ACCESS_CHECK
+  // if ((address & 0xFF000000) == 0x7F000000) do check;
+  e.lea(e.r8d, e.ptr[addr]);
+  e.and(e.r8d, 0xFF000000);
+  e.cmp(e.r8d, 0x7F000000);
+  e.inLocalLabel();
+  e.jne(".normal_addr");
+  if (IsIntType(i->src2.value->type)) {
+    Reg64 dyn_src;
+    e.BeginOp(i->src2.value, dyn_src, 0);
+    switch (i->src2.value->type) {
+    case INT8_TYPE:
+      e.movzx(e.r8, dyn_src.cvt8());
+      break;
+    case INT16_TYPE:
+      e.movzx(e.r8, dyn_src.cvt16());
+      break;
+    case INT32_TYPE:
+      e.mov(e.r8d, dyn_src.cvt32());
+      break;
+    case INT64_TYPE:
+      e.mov(e.r8, dyn_src);
+      break;
+    default:
+      e.db(0xCC);
+      break;
+    }
+    e.EndOp(dyn_src);
+    e.mov(e.rdx, addr_off);
+    CallNative(e, DynamicRegisterStore);
+  } else {
+    e.db(0xCC);
+  }
+  e.jmp(".skip_access");
+  e.L(".normal_addr");
+#endif // DYNAMIC_REGISTER_ACCESS_CHECK
+
   if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I8)) {
     Reg8 src;
     e.BeginOp(i->src2.value, src, 0);
@@ -1304,6
+1414,12 @@ table->AddSequence(OPCODE_STORE, [](X64Emitter& e, Instr*& i) { if (!i->src1.value->IsConstant()) { e.EndOp(addr_off); } + +#if DYNAMIC_REGISTER_ACCESS_CHECK + e.L(".skip_access"); + e.outLocalLabel(); +#endif // DYNAMIC_REGISTER_ACCESS_CHECK + i = e.Advance(i); return true; }); From 0ec8e32861a6dd3d04296168323d433e9712827f Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 28 Jan 2014 23:22:10 -0800 Subject: [PATCH 043/184] Removing stack padding. Still broken even with it. --- src/alloy/backend/x64/lowering/op_utils.inl | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/alloy/backend/x64/lowering/op_utils.inl b/src/alloy/backend/x64/lowering/op_utils.inl index d519b634c..35b707780 100644 --- a/src/alloy/backend/x64/lowering/op_utils.inl +++ b/src/alloy/backend/x64/lowering/op_utils.inl @@ -64,12 +64,10 @@ void MovMem64(X64Emitter& e, RegExp& addr, uint64_t v) { } void CallNative(X64Emitter& e, void* target) { - e.sub(e.rsp, 0x18); e.mov(e.rax, (uint64_t)target); e.call(e.rax); e.mov(e.rcx, e.qword[e.rsp + 0]); e.mov(e.rdx, e.qword[e.rcx + 8]); // membase - e.add(e.rsp, 0x18); } // Sets EFLAGs with zf for the given value. From bdee924494068ebe10630c268755b77db549198c Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 28 Jan 2014 23:51:40 -0800 Subject: [PATCH 044/184] Various fixes. --- .../backend/x64/lowering/lowering_sequences.cc | 16 ++++++++++++---- src/alloy/backend/x64/lowering/op_utils.inl | 8 +++++--- src/alloy/backend/x64/x64_code_cache.cc | 2 +- src/alloy/backend/x64/x64_emitter.cc | 5 +++-- src/alloy/frontend/ppc/ppc_emit_memory.cc | 3 ++- 5 files changed, 23 insertions(+), 11 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index 9393c2da9..ec5be7d6f 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -189,7 +189,7 @@ void IssueCall(X64Emitter& e, FunctionInfo* symbol_info, uint32_t flags) { } if (flags & CALL_TAIL) { // TODO(benvanik): adjust stack? - e.add(e.rsp, 0x40); + e.add(e.rsp, 72); e.jmp(e.rax); } else { e.call(e.rax); @@ -210,7 +210,7 @@ void IssueCallIndirect(X64Emitter& e, Value* target, uint32_t flags) { e.mov(e.rdx, e.qword[e.rcx + 8]); // membase if (flags & CALL_TAIL) { // TODO(benvanik): adjust stack? 
- e.add(e.rsp, 0x40); + e.add(e.rsp, 72); e.jmp(e.rax); } else { e.call(e.rax); @@ -2844,8 +2844,16 @@ table->AddSequence(OPCODE_COMPARE_EXCHANGE, [](X64Emitter& e, Instr*& i) { }); table->AddSequence(OPCODE_ATOMIC_EXCHANGE, [](X64Emitter& e, Instr*& i) { - if (IsIntType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); + if (i->dest->type == INT32_TYPE) { + // dest = old_value = InterlockedExchange(src1 = address, src2 = new_value); + Reg32 dest, src2; + Reg64 src1; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0, + i->src2.value, src2, 0); + e.mov(dest, src2); + e.xchg(e.dword[src1], dest); + e.EndOp(dest, src1, src2); } else { ASSERT_INVALID_TYPE(); } diff --git a/src/alloy/backend/x64/lowering/op_utils.inl b/src/alloy/backend/x64/lowering/op_utils.inl index 35b707780..52bf39d7d 100644 --- a/src/alloy/backend/x64/lowering/op_utils.inl +++ b/src/alloy/backend/x64/lowering/op_utils.inl @@ -17,6 +17,8 @@ namespace { #define LIKE_REG(dest, like) Reg(dest.getIdx(), dest.getKind(), like.getBit(), false) #define NAX_LIKE(like) Reg(e.rax.getIdx(), e.rax.getKind(), like.getBit(), false) +#define STASH_OFFSET 48 + // If we are running with tracing on we have to store the EFLAGS in the stack, // otherwise our calls out to C to print will clear it before DID_CARRY/etc // can get the value. @@ -24,7 +26,7 @@ namespace { void LoadEflags(X64Emitter& e) { #if STORE_EFLAGS - e.mov(e.eax, e.dword[e.rsp + 40]); + e.mov(e.eax, e.dword[e.rsp + STASH_OFFSET]); e.push(e.ax); e.popf(); #else @@ -34,7 +36,7 @@ void LoadEflags(X64Emitter& e) { void StoreEflags(X64Emitter& e) { #if STORE_EFLAGS e.pushf(); - e.pop(e.word[e.rsp + 40]); + e.pop(e.word[e.rsp + STASH_OFFSET]); #else // EFLAGS should have CA set? // (so long as we don't fuck with it) @@ -43,7 +45,7 @@ void StoreEflags(X64Emitter& e) { Address Stash(X64Emitter& e, const Xmm& r) { // TODO(benvanik): ensure aligned. - auto addr = e.ptr[e.rsp + 48]; + auto addr = e.ptr[e.rsp + STASH_OFFSET]; e.movups(addr, r); return addr; } diff --git a/src/alloy/backend/x64/x64_code_cache.cc b/src/alloy/backend/x64/x64_code_cache.cc index 7bbf91f2a..c7a456830 100644 --- a/src/alloy/backend/x64/x64_code_cache.cc +++ b/src/alloy/backend/x64/x64_code_cache.cc @@ -216,7 +216,7 @@ void X64CodeChunk::AddTableEntry(uint8_t* code, size_t code_size) { // TODO(benvanik): take as parameters? bool has_prolog = true; uint8_t prolog_size = 4; - uint8_t stack_bytes = 64; + uint8_t stack_bytes = 72; // http://msdn.microsoft.com/en-us/library/ddssxxy8.aspx UNWIND_INFO* unwind_info = (UNWIND_INFO*)(buffer + unwind_info_offset); diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index 59fcfb36a..fc5dce840 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -132,7 +132,7 @@ int X64Emitter::Emit(HIRBuilder* builder) { // X64CodeCache, which dynamically generates exception information. // Adding or changing anything here must be matched! const bool emit_prolog = true; - const size_t stack_size = 64; + const size_t stack_size = 72; if (emit_prolog) { mov(qword[rsp + 8], rcx); sub(rsp, stack_size); @@ -238,7 +238,8 @@ void X64Emitter::EvictStaleRegisters() { // Register is live, not active. Check and see if we get rid of it. 
auto v = reg_state_.reg_values[n]; - if (v->last_use->ordinal < current_ordinal) { + if (!v->last_use || + v->last_use->ordinal < current_ordinal) { reg_state_.reg_values[n] = NULL; v->reg = -1; continue; diff --git a/src/alloy/frontend/ppc/ppc_emit_memory.cc b/src/alloy/frontend/ppc/ppc_emit_memory.cc index ab810f6b2..738090abf 100644 --- a/src/alloy/frontend/ppc/ppc_emit_memory.cc +++ b/src/alloy/frontend/ppc/ppc_emit_memory.cc @@ -891,7 +891,8 @@ XEEMITTER(stfiwx, 0x7C0007AE, X )(PPCHIRBuilder& f, InstrData& i) { // EA <- b + (RB) // MEM(EA, 4) <- (FRS)[32:63] Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB); - f.Store(ea, f.ByteSwap(f.Cast(f.LoadFPR(i.X.RT), INT32_TYPE))); + f.Store(ea, f.ByteSwap( + f.Truncate(f.Cast(f.LoadFPR(i.X.RT), INT64_TYPE), INT32_TYPE))); return 0; } From f85b83709e26eefa98423d1dd6a44b3d0db199ef Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Thu, 30 Jan 2014 00:22:55 -0800 Subject: [PATCH 045/184] Refactoring of function types. Also fixes some library import bugs. --- src/alloy/backend/ivm/ivm_function.cc | 2 +- src/alloy/backend/ivm/ivm_function.h | 2 +- src/alloy/backend/ivm/ivm_intcode.cc | 8 +++ .../x64/lowering/lowering_sequences.cc | 55 ++++++++--------- src/alloy/backend/x64/x64_function.cc | 2 +- src/alloy/backend/x64/x64_function.h | 2 +- src/alloy/frontend/ppc/ppc_emit_control.cc | 4 +- src/alloy/hir/hir_builder.cc | 7 +++ src/alloy/hir/hir_builder.h | 1 + src/alloy/hir/opcodes.h | 1 + src/alloy/hir/opcodes.inl | 6 ++ src/alloy/runtime/function.cc | 59 +++++++------------ src/alloy/runtime/function.h | 49 +-------------- src/alloy/runtime/symbol_info.cc | 8 +++ src/alloy/runtime/symbol_info.h | 12 ++++ src/xenia/cpu/processor.cc | 1 - src/xenia/cpu/xex_module.cc | 56 ++++++++++-------- src/xenia/export_resolver.cc | 22 +++++++ src/xenia/export_resolver.h | 4 ++ src/xenia/kernel/objects/xuser_module.cc | 2 - src/xenia/kernel/util/xex2.cc | 55 +++++++++++++---- 21 files changed, 198 insertions(+), 160 deletions(-) diff --git a/src/alloy/backend/ivm/ivm_function.cc b/src/alloy/backend/ivm/ivm_function.cc index c4c0d97f9..8cd6835d6 100644 --- a/src/alloy/backend/ivm/ivm_function.cc +++ b/src/alloy/backend/ivm/ivm_function.cc @@ -23,7 +23,7 @@ using namespace alloy::runtime; IVMFunction::IVMFunction(FunctionInfo* symbol_info) : register_count_(0), intcode_count_(0), intcodes_(0), source_map_count_(0), source_map_(0), - GuestFunction(symbol_info) { + Function(symbol_info) { } IVMFunction::~IVMFunction() { diff --git a/src/alloy/backend/ivm/ivm_function.h b/src/alloy/backend/ivm/ivm_function.h index 7fee49db0..c7da76f89 100644 --- a/src/alloy/backend/ivm/ivm_function.h +++ b/src/alloy/backend/ivm/ivm_function.h @@ -21,7 +21,7 @@ namespace backend { namespace ivm { -class IVMFunction : public runtime::GuestFunction { +class IVMFunction : public runtime::Function { public: IVMFunction(runtime::FunctionInfo* symbol_info); virtual ~IVMFunction(); diff --git a/src/alloy/backend/ivm/ivm_intcode.cc b/src/alloy/backend/ivm/ivm_intcode.cc index f7e0cdbde..20544028c 100644 --- a/src/alloy/backend/ivm/ivm_intcode.cc +++ b/src/alloy/backend/ivm/ivm_intcode.cc @@ -712,6 +712,13 @@ int Translate_CALL_INDIRECT_TRUE(TranslationContext& ctx, Instr* i) { return DispatchToC(ctx, i, fns[i->src1.value->type]); } +uint32_t IntCode_CALL_EXTERN(IntCodeState& ics, const IntCode* i) { + return IntCode_CALL_XX(ics, i, i->src1_reg); +} +int Translate_CALL_EXTERN(TranslationContext& ctx, Instr* i) { + return DispatchToC(ctx, i, IntCode_CALL_EXTERN); +} + uint32_t 
IntCode_RETURN(IntCodeState& ics, const IntCode* i) { return IA_RETURN; } @@ -4009,6 +4016,7 @@ static const TranslateFn dispatch_table[] = { Translate_CALL_TRUE, Translate_CALL_INDIRECT, Translate_CALL_INDIRECT_TRUE, + Translate_CALL_EXTERN, Translate_RETURN, Translate_RETURN_TRUE, diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index ec5be7d6f..f161424a8 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -96,12 +96,6 @@ void Dummy() { // } -void UnimplementedExtern(void* raw_context, ExternFunction* extern_fn) { - // TODO(benvanik): generate this thunk at runtime? or a shim? - auto thread_state = *((ThreadState**)raw_context); - extern_fn->Call(thread_state); -} - uint64_t DynamicRegisterLoad(void* raw_context, uint32_t address) { auto thread_state = *((ThreadState**)raw_context); auto cbs = thread_state->runtime()->access_callbacks(); @@ -149,44 +143,30 @@ void* ResolveFunctionSymbol(void* raw_context, FunctionInfo* symbol_info) { Function* fn = NULL; thread_state->runtime()->ResolveFunction(symbol_info->address(), &fn); XEASSERTNOTNULL(fn); - XEASSERT(fn->type() == Function::USER_FUNCTION); auto x64_fn = (X64Function*)fn; return x64_fn->machine_code(); } -void* ResolveFunctionAddress(void* raw_context, uint64_t target_address) { +void* ResolveFunctionAddress(void* raw_context, uint32_t target_address) { // TODO(benvanik): generate this thunk at runtime? or a shim? auto thread_state = *((ThreadState**)raw_context); Function* fn = NULL; thread_state->runtime()->ResolveFunction(target_address, &fn); XEASSERTNOTNULL(fn); - XEASSERT(fn->type() == Function::USER_FUNCTION); auto x64_fn = (X64Function*)fn; return x64_fn->machine_code(); } void IssueCall(X64Emitter& e, FunctionInfo* symbol_info, uint32_t flags) { - // If we are an extern function, we can directly insert a call. auto fn = symbol_info->function(); - if (fn && fn->type() == Function::EXTERN_FUNCTION) { - auto extern_fn = (ExternFunction*)fn; - if (extern_fn->handler()) { - e.mov(e.rdx, (uint64_t)extern_fn->arg0()); - e.mov(e.r8, (uint64_t)extern_fn->arg1()); - e.mov(e.rax, (uint64_t)extern_fn->handler()); - } else { - // Unimplemented - call dummy. - e.mov(e.rdx, (uint64_t)extern_fn); - e.mov(e.rax, (uint64_t)UnimplementedExtern); - } - } else { - // Generic call, resolve address. - // TODO(benvanik): caching/etc. For now this makes debugging easier. - e.mov(e.rdx, (uint64_t)symbol_info); - e.mov(e.rax, (uint64_t)ResolveFunctionSymbol); - e.call(e.rax); - e.mov(e.rcx, e.qword[e.rsp + 0]); - e.mov(e.rdx, e.qword[e.rcx + 8]); // membase - } + // Resolve address to the function to call and store in rax. + // TODO(benvanik): caching/etc. For now this makes debugging easier. + e.mov(e.rdx, (uint64_t)symbol_info); + e.mov(e.rax, (uint64_t)ResolveFunctionSymbol); + e.call(e.rax); + e.mov(e.rcx, e.qword[e.rsp + 0]); + e.mov(e.rdx, e.qword[e.rcx + 8]); // membase + + // Actually jump/call to rax. if (flags & CALL_TAIL) { // TODO(benvanik): adjust stack? e.add(e.rsp, 72); @@ -198,6 +178,8 @@ void IssueCall(X64Emitter& e, FunctionInfo* symbol_info, uint32_t flags) { } } void IssueCallIndirect(X64Emitter& e, Value* target, uint32_t flags) { + // Resolve address to the function to call and store in rax. + // TODO(benvanik): caching/etc. For now this makes debugging easier. 
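// The resolver is an ordinary host function, so the Win64 ABI applies:
// rcx already carries the context (first argument) and the guest target
// address has to land in rdx (second argument) before the call; that is
// why the target register gets shuffled into rdx below when the register
// allocator picked some other home for it.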
Reg64 r; e.BeginOp(target, r, 0); if (r != e.rdx) { @@ -208,6 +190,8 @@ void IssueCallIndirect(X64Emitter& e, Value* target, uint32_t flags) { e.call(e.rax); e.mov(e.rcx, e.qword[e.rsp + 0]); e.mov(e.rdx, e.qword[e.rcx + 8]); // membase + + // Actually jump/call to rax. if (flags & CALL_TAIL) { // TODO(benvanik): adjust stack? e.add(e.rsp, 72); @@ -349,6 +333,17 @@ table->AddSequence(OPCODE_CALL_INDIRECT_TRUE, [](X64Emitter& e, Instr*& i) { return true; }); +table->AddSequence(OPCODE_CALL_EXTERN, [](X64Emitter& e, Instr*& i) { + auto symbol_info = i->src1.symbol_info; + XEASSERT(symbol_info->behavior() == FunctionInfo::BEHAVIOR_EXTERN); + XEASSERTNOTNULL(symbol_info->extern_handler()); + e.mov(e.rdx, (uint64_t)symbol_info->extern_arg0()); + e.mov(e.r8, (uint64_t)symbol_info->extern_arg1()); + CallNative(e, symbol_info->extern_handler()); + i = e.Advance(i); + return true; +}); + table->AddSequence(OPCODE_RETURN, [](X64Emitter& e, Instr*& i) { // If this is the last instruction in the last block, just let us // fall through. diff --git a/src/alloy/backend/x64/x64_function.cc b/src/alloy/backend/x64/x64_function.cc index c668c14f1..b8172247e 100644 --- a/src/alloy/backend/x64/x64_function.cc +++ b/src/alloy/backend/x64/x64_function.cc @@ -21,7 +21,7 @@ using namespace alloy::runtime; X64Function::X64Function(FunctionInfo* symbol_info) : machine_code_(NULL), code_size_(0), - GuestFunction(symbol_info) { + Function(symbol_info) { } X64Function::~X64Function() { diff --git a/src/alloy/backend/x64/x64_function.h b/src/alloy/backend/x64/x64_function.h index e879a72c7..5166fd879 100644 --- a/src/alloy/backend/x64/x64_function.h +++ b/src/alloy/backend/x64/x64_function.h @@ -20,7 +20,7 @@ namespace backend { namespace x64 { -class X64Function : public runtime::GuestFunction { +class X64Function : public runtime::Function { public: X64Function(runtime::FunctionInfo* symbol_info); virtual ~X64Function(); diff --git a/src/alloy/frontend/ppc/ppc_emit_control.cc b/src/alloy/frontend/ppc/ppc_emit_control.cc index 83a50a2c4..9815c4649 100644 --- a/src/alloy/frontend/ppc/ppc_emit_control.cc +++ b/src/alloy/frontend/ppc/ppc_emit_control.cc @@ -380,8 +380,8 @@ XEEMITTER(mcrf, 0x4C000000, XL )(PPCHIRBuilder& f, InstrData& i) { // System linkage (A-24) XEEMITTER(sc, 0x44000002, SC )(PPCHIRBuilder& f, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + f.CallExtern(f.symbol_info()); + return 0; } diff --git a/src/alloy/hir/hir_builder.cc b/src/alloy/hir/hir_builder.cc index 99d5649d1..69cabc286 100644 --- a/src/alloy/hir/hir_builder.cc +++ b/src/alloy/hir/hir_builder.cc @@ -557,6 +557,13 @@ void HIRBuilder::CallIndirectTrue( EndBlock(); } +void HIRBuilder::CallExtern(FunctionInfo* symbol_info) { + Instr* i = AppendInstr(OPCODE_CALL_EXTERN_info, 0); + i->src1.symbol_info = symbol_info; + i->src2.value = i->src3.value = NULL; + EndBlock(); +} + void HIRBuilder::Return() { Instr* i = AppendInstr(OPCODE_RETURN_info, 0); i->src1.value = i->src2.value = i->src3.value = NULL; diff --git a/src/alloy/hir/hir_builder.h b/src/alloy/hir/hir_builder.h index 05fa632de..e5a0aef07 100644 --- a/src/alloy/hir/hir_builder.h +++ b/src/alloy/hir/hir_builder.h @@ -74,6 +74,7 @@ public: uint32_t call_flags = 0); void CallIndirect(Value* value, uint32_t call_flags = 0); void CallIndirectTrue(Value* cond, Value* value, uint32_t call_flags = 0); + void CallExtern(runtime::FunctionInfo* symbol_info); void Return(); void ReturnTrue(Value* cond); diff --git a/src/alloy/hir/opcodes.h b/src/alloy/hir/opcodes.h index 9fdcd311e..2b8649afe 
100644 --- a/src/alloy/hir/opcodes.h +++ b/src/alloy/hir/opcodes.h @@ -94,6 +94,7 @@ enum Opcode { OPCODE_CALL_TRUE, OPCODE_CALL_INDIRECT, OPCODE_CALL_INDIRECT_TRUE, + OPCODE_CALL_EXTERN, OPCODE_RETURN, OPCODE_RETURN_TRUE, diff --git a/src/alloy/hir/opcodes.inl b/src/alloy/hir/opcodes.inl index abdea12db..485fa529b 100644 --- a/src/alloy/hir/opcodes.inl +++ b/src/alloy/hir/opcodes.inl @@ -74,6 +74,12 @@ DEFINE_OPCODE( OPCODE_SIG_X_V_V, OPCODE_FLAG_BRANCH); +DEFINE_OPCODE( + OPCODE_CALL_EXTERN, + "call_extern", + OPCODE_SIG_X_S, + OPCODE_FLAG_BRANCH); + DEFINE_OPCODE( OPCODE_RETURN, "return", diff --git a/src/alloy/runtime/function.cc b/src/alloy/runtime/function.cc index c09d3b929..f8b74f48f 100644 --- a/src/alloy/runtime/function.cc +++ b/src/alloy/runtime/function.cc @@ -17,8 +17,9 @@ using namespace alloy; using namespace alloy::runtime; -Function::Function(Type type, uint64_t address) : - type_(type), address_(address), debug_info_(0) { +Function::Function(FunctionInfo* symbol_info) : + address_(symbol_info->address()), + symbol_info_(symbol_info), debug_info_(0) { // TODO(benvanik): create on demand? lock_ = AllocMutex(); } @@ -77,43 +78,27 @@ int Function::Call(ThreadState* thread_state) { if (original_thread_state != thread_state) { ThreadState::Bind(thread_state); } - int result = CallImpl(thread_state); + + int result = 0; + + if (symbol_info_->behavior() == FunctionInfo::BEHAVIOR_EXTERN) { + auto handler = symbol_info_->extern_handler(); + if (handler) { + handler(thread_state->raw_context(), + symbol_info_->extern_arg0(), + symbol_info_->extern_arg1()); + } else { + XELOGW("undefined extern call to %.8X %s", + symbol_info_->address(), + symbol_info_->name()); + result = 1; + } + } else { + result = CallImpl(thread_state); + } + if (original_thread_state != thread_state) { ThreadState::Bind(original_thread_state); } return result; } - -ExternFunction::ExternFunction( - uint64_t address, Handler handler, void* arg0, void* arg1) : - name_(0), - handler_(handler), arg0_(arg0), arg1_(arg1), - Function(Function::EXTERN_FUNCTION, address) { -} - -ExternFunction::~ExternFunction() { - if (name_) { - xe_free(name_); - } -} - -void ExternFunction::set_name(const char* name) { - name_ = xestrdupa(name); -} - -int ExternFunction::CallImpl(ThreadState* thread_state) { - if (!handler_) { - XELOGW("undefined extern call to %.8X %s", address(), name()); - return 0; - } - handler_(thread_state->raw_context(), arg0_, arg1_); - return 0; -} - -GuestFunction::GuestFunction(FunctionInfo* symbol_info) : - symbol_info_(symbol_info), - Function(Function::USER_FUNCTION, symbol_info->address()) { -} - -GuestFunction::~GuestFunction() { -} diff --git a/src/alloy/runtime/function.h b/src/alloy/runtime/function.h index d150f91a6..629276c0b 100644 --- a/src/alloy/runtime/function.h +++ b/src/alloy/runtime/function.h @@ -24,17 +24,11 @@ class ThreadState; class Function { public: - enum Type { - UNKNOWN_FUNCTION = 0, - EXTERN_FUNCTION, - USER_FUNCTION, - }; -public: - Function(Type type, uint64_t address); + Function(FunctionInfo* symbol_info); virtual ~Function(); - Type type() const { return type_; } uint64_t address() const { return address_; } + FunctionInfo* symbol_info() const { return symbol_info_; } DebugInfo* debug_info() const { return debug_info_; } void set_debug_info(DebugInfo* debug_info) { debug_info_ = debug_info; } @@ -51,8 +45,8 @@ protected: virtual int CallImpl(ThreadState* thread_state) = 0; protected: - Type type_; uint64_t address_; + FunctionInfo* symbol_info_; DebugInfo* debug_info_; // 

TODO(benvanik): move elsewhere? DebugData? @@ -61,43 +55,6 @@ protected: }; -class ExternFunction : public Function { -public: - typedef void(*Handler)(void* context, void* arg0, void* arg1); -public: - ExternFunction(uint64_t address, Handler handler, void* arg0, void* arg1); - virtual ~ExternFunction(); - - const char* name() const { return name_; } - void set_name(const char* name); - - Handler handler() const { return handler_; } - void* arg0() const { return arg0_; } - void* arg1() const { return arg1_; } - -protected: - virtual int CallImpl(ThreadState* thread_state); - -protected: - char* name_; - Handler handler_; - void* arg0_; - void* arg1_; -}; - - -class GuestFunction : public Function { -public: - GuestFunction(FunctionInfo* symbol_info); - virtual ~GuestFunction(); - - FunctionInfo* symbol_info() const { return symbol_info_; } - -protected: - FunctionInfo* symbol_info_; -}; - - } // namespace runtime } // namespace alloy diff --git a/src/alloy/runtime/symbol_info.cc b/src/alloy/runtime/symbol_info.cc index 3a486840d..e87727b3a 100644 --- a/src/alloy/runtime/symbol_info.cc +++ b/src/alloy/runtime/symbol_info.cc @@ -34,11 +34,19 @@ void SymbolInfo::set_name(const char* name) { FunctionInfo::FunctionInfo(Module* module, uint64_t address) : end_address_(0), behavior_(BEHAVIOR_DEFAULT), function_(0), SymbolInfo(SymbolInfo::TYPE_FUNCTION, module, address) { + xe_zero_struct(&extern_info_, sizeof(extern_info_)); } FunctionInfo::~FunctionInfo() { } +void FunctionInfo::SetupExtern(ExternHandler handler, void* arg0, void* arg1) { + behavior_ = BEHAVIOR_EXTERN; + extern_info_.handler = handler; + extern_info_.arg0 = arg0; + extern_info_.arg1 = arg1; +} + VariableInfo::VariableInfo(Module* module, uint64_t address) : SymbolInfo(SymbolInfo::TYPE_VARIABLE, module, address) { } diff --git a/src/alloy/runtime/symbol_info.h b/src/alloy/runtime/symbol_info.h index c91fda40a..8d2a964e7 100644 --- a/src/alloy/runtime/symbol_info.h +++ b/src/alloy/runtime/symbol_info.h @@ -63,6 +63,7 @@ public: BEHAVIOR_PROLOG, BEHAVIOR_EPILOG, BEHAVIOR_EPILOG_RETURN, + BEHAVIOR_EXTERN, }; public: @@ -79,10 +80,21 @@ public: Function* function() const { return function_; } void set_function(Function* value) { function_ = value; } + typedef void(*ExternHandler)(void* context, void* arg0, void* arg1); + void SetupExtern(ExternHandler handler, void* arg0, void* arg1); + ExternHandler extern_handler() const { return extern_info_.handler; } + void* extern_arg0() const { return extern_info_.arg0; } + void* extern_arg1() const { return extern_info_.arg1; } + private: uint64_t end_address_; Behavior behavior_; Function* function_; + struct { + ExternHandler handler; + void* arg0; + void* arg1; + } extern_info_; }; class VariableInfo : public SymbolInfo { diff --git a/src/xenia/cpu/processor.cc b/src/xenia/cpu/processor.cc index e03890a92..6fd7347dd 100644 --- a/src/xenia/cpu/processor.cc +++ b/src/xenia/cpu/processor.cc @@ -648,7 +648,6 @@ json_t* Processor::DumpModule(XexModule* module, bool& succeeded) { json_object_set_new(import_library_json, "imports", imports_json); json_array_append_new(library_imports_json, import_library_json); - xe_free(import_infos); } json_object_set_new(module_json, "libraryImports", library_imports_json); diff --git a/src/xenia/cpu/xex_module.cc b/src/xenia/cpu/xex_module.cc index 9291724d2..2ff90c16a 100644 --- a/src/xenia/cpu/xex_module.cc +++ b/src/xenia/cpu/xex_module.cc @@ -135,14 +135,12 @@ int XexModule::SetupLibraryImports(const xe_xex2_import_library_t* library) { if 
(kernel_export->type == KernelExport::Function) { // Not exactly sure what this should be... if (info->thunk_address) { - // slot = XESWAP32BE(info->thunk_address); - // Setting this breaks other emu code that relies on it not being - // modified. Not sure what to do. + *slot = XESWAP32BE(info->thunk_address); } else { // TODO(benvanik): find out what import variables are. XELOGW("kernel import variable not defined %.8X %s", info->value_address, kernel_export->name); - //*slot = XESWAP32BE(0xF00DF00D); + *slot = XESWAP32BE(0xF00DF00D); } } else { if (kernel_export->is_implemented) { @@ -165,39 +163,45 @@ int XexModule::SetupLibraryImports(const xe_xex2_import_library_t* library) { info->ordinal); } - FunctionInfo* fn_info; - DeclareFunction(info->thunk_address, &fn_info); - fn_info->set_end_address(info->thunk_address + 16 - 4); - //fn->type = FunctionSymbol::Kernel; - //fn->kernel_export = kernel_export; - fn_info->set_name(name); - fn_info->set_status(SymbolInfo::STATUS_DECLARED); + // On load we have something like this in memory: + // li r3, 0 + // li r4, 0x1F5 + // mtspr CTR, r11 + // bctr + // Real consoles rewrite this with some code that sets r11. + // If we did that we'd still have to put a thunk somewhere and do the + // dynamic lookup. Instead, we rewrite it to use syscalls, as they + // aren't used on the 360. Alloy backends can either take the syscall + // or do something smarter. + // sc + // blr + // nop + // nop + uint8_t* p = memory()->Translate(info->thunk_address); + XESETUINT32BE(p + 0x0, 0x44000002); + XESETUINT32BE(p + 0x4, 0x4E800020); + XESETUINT32BE(p + 0x8, 0x60000000); + XESETUINT32BE(p + 0xC, 0x60000000); - ExternFunction::Handler handler = 0; + FunctionInfo::ExternHandler handler = 0; void* handler_data = 0; if (kernel_export) { - handler = (ExternFunction::Handler)kernel_export->function_data.shim; + handler = (FunctionInfo::ExternHandler)kernel_export->function_data.shim; handler_data = kernel_export->function_data.shim_data; } else { - handler = (ExternFunction::Handler)UndefinedImport; + handler = (FunctionInfo::ExternHandler)UndefinedImport; handler_data = this; } - DefineFunction(fn_info); - auto fn = new ExternFunction( - info->thunk_address, - handler, - handler_data, - NULL); - if (kernel_export) { - fn->set_name(kernel_export->name); - } - fn_info->set_function(fn); - fn_info->set_status(SymbolInfo::STATUS_DEFINED); + FunctionInfo* fn_info; + DeclareFunction(info->thunk_address, &fn_info); + fn_info->set_end_address(info->thunk_address + 16 - 4); + fn_info->set_name(name); + fn_info->SetupExtern(handler, handler_data, NULL); + fn_info->set_status(SymbolInfo::STATUS_DECLARED); } } - xe_free(import_infos); return 0; } diff --git a/src/xenia/export_resolver.cc b/src/xenia/export_resolver.cc index f630a3c48..9d09b63ec 100644 --- a/src/xenia/export_resolver.cc +++ b/src/xenia/export_resolver.cc @@ -36,6 +36,28 @@ void ExportResolver::RegisterTable( } } +uint16_t ExportResolver::GetLibraryOrdinal(const char* library_name) { + uint16_t n = 0; + for (auto it = tables_.begin(); it != tables_.end(); ++it, n++) { + if (!xestrcmpa(library_name, it->name)) { + return n; + } + } + return -1; +} + +KernelExport* ExportResolver::GetExportByOrdinal( + const uint16_t library_ordinal, const uint32_t ordinal) { + auto& table = tables_[library_ordinal]; + // TODO(benvanik): binary search? 
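// A sketch of the binary-search idea (assumes exports are kept sorted by
// ordinal, which nothing here guarantees yet; needs <algorithm>):
//   auto end = table.exports + table.count;
//   auto it = std::lower_bound(table.exports, end, ordinal,
//       [](const KernelExport& e, uint32_t o) { return e.ordinal < o; });
//   return (it != end && it->ordinal == ordinal) ? it : NULL;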
+ for (size_t n = 0; n < table.count; n++) { + if (table.exports[n].ordinal == ordinal) { + return &table.exports[n]; + } + } + return NULL; +} + KernelExport* ExportResolver::GetExportByOrdinal(const char* library_name, const uint32_t ordinal) { for (std::vector::iterator it = tables_.begin(); diff --git a/src/xenia/export_resolver.h b/src/xenia/export_resolver.h index fcc9a6d87..487e8bd9c 100644 --- a/src/xenia/export_resolver.h +++ b/src/xenia/export_resolver.h @@ -68,6 +68,10 @@ public: void RegisterTable(const char* library_name, KernelExport* exports, const size_t count); + uint16_t GetLibraryOrdinal(const char* library_name); + + KernelExport* GetExportByOrdinal(const uint16_t library_ordinal, + const uint32_t ordinal); KernelExport* GetExportByOrdinal(const char* library_name, const uint32_t ordinal); KernelExport* GetExportByName(const char* library_name, const char* name); diff --git a/src/xenia/kernel/objects/xuser_module.cc b/src/xenia/kernel/objects/xuser_module.cc index 86d25847e..3a07b209a 100644 --- a/src/xenia/kernel/objects/xuser_module.cc +++ b/src/xenia/kernel/objects/xuser_module.cc @@ -345,8 +345,6 @@ void XUserModule::Dump() { } } - - xe_free(import_infos); } printf("\n"); diff --git a/src/xenia/kernel/util/xex2.cc b/src/xenia/kernel/util/xex2.cc index 5532200d8..af71eb182 100644 --- a/src/xenia/kernel/util/xex2.cc +++ b/src/xenia/kernel/util/xex2.cc @@ -24,11 +24,16 @@ using namespace alloy; typedef struct xe_xex2 { xe_ref_t ref; - Memory* memory; + Memory* memory; - xe_xex2_header_t header; + xe_xex2_header_t header; std::vector* sections; + + struct { + size_t count; + xe_xex2_import_info_t* infos; + } library_imports[16]; } xe_xex2_t; @@ -39,6 +44,8 @@ int xe_xex2_read_image(xe_xex2_ref xex, const uint8_t *xex_addr, const size_t xex_length, Memory* memory); int xe_xex2_load_pe(xe_xex2_ref xex); +int xe_xex2_find_import_infos(xe_xex2_ref xex, + const xe_xex2_import_library_t* library); xe_xex2_ref xe_xex2_load(Memory* memory, @@ -58,6 +65,11 @@ xe_xex2_ref xe_xex2_load(Memory* memory, XEEXPECTZERO(xe_xex2_load_pe(xex)); + for (size_t n = 0; n < xex->header.import_library_count; n++) { + auto library = &xex->header.import_libraries[n]; + XEEXPECTZERO(xe_xex2_find_import_infos(xex, library)); + } + return xex; XECLEANUP: @@ -894,12 +906,10 @@ const PESection* xe_xex2_get_pe_section(xe_xex2_ref xex, const char* name) { return NULL; } -int xe_xex2_get_import_infos(xe_xex2_ref xex, - const xe_xex2_import_library_t *library, - xe_xex2_import_info_t **out_import_infos, - size_t *out_import_info_count) { - uint8_t *mem = xex->memory->membase(); - const xe_xex2_header_t *header = xe_xex2_get_header(xex); +int xe_xex2_find_import_infos(xe_xex2_ref xex, + const xe_xex2_import_library_t *library) { + uint8_t* mem = xex->memory->membase(); + auto header = xe_xex2_get_header(xex); // Find library index for verification. size_t library_index = -1; @@ -970,13 +980,34 @@ int xe_xex2_get_import_infos(xe_xex2_ref xex, } } - *out_import_info_count = info_count; - *out_import_infos = infos; + xex->library_imports[library_index].count = info_count; + xex->library_imports[library_index].infos = infos; return 0; XECLEANUP: xe_free(infos); - *out_import_info_count = 0; - *out_import_infos = NULL; return 1; } + +int xe_xex2_get_import_infos(xe_xex2_ref xex, + const xe_xex2_import_library_t *library, + xe_xex2_import_info_t **out_import_infos, + size_t *out_import_info_count) { + auto header = xe_xex2_get_header(xex); + + // Find library index for verification. 
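// Note: the lookup below is pointer identity against the header's own
// import_libraries array, not a name comparison; callers are expected to
// hand back one of the entries from xe_xex2_get_header(), and anything
// else fails the lookup.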
+ size_t library_index = -1; + for (size_t n = 0; n < header->import_library_count; n++) { + if (&header->import_libraries[n] == library) { + library_index = n; + break; + } + } + if (library_index == (size_t)-1) { + return 1; + } + + *out_import_info_count = xex->library_imports[library_index].count; + *out_import_infos = xex->library_imports[library_index].infos; + return 0; +} From 009a6d0745f5a756e56e303415a2534576aecb62 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Fri, 31 Jan 2014 22:16:05 -0800 Subject: [PATCH 046/184] Hacking together proper stack saving. Can't get >128b frames working. --- .../x64/lowering/lowering_sequences.cc | 53 ++++--- src/alloy/backend/x64/lowering/op_utils.inl | 4 +- src/alloy/backend/x64/sources.gypi | 2 + src/alloy/backend/x64/x64_assembler.cc | 7 +- src/alloy/backend/x64/x64_assembler.h | 2 + src/alloy/backend/x64/x64_backend.cc | 8 + src/alloy/backend/x64/x64_backend.h | 9 ++ src/alloy/backend/x64/x64_code_cache.cc | 103 ++++++++++--- src/alloy/backend/x64/x64_code_cache.h | 2 +- src/alloy/backend/x64/x64_emitter.cc | 29 ++-- src/alloy/backend/x64/x64_emitter.h | 13 +- src/alloy/backend/x64/x64_function.cc | 9 +- src/alloy/backend/x64/x64_thunk_emitter.cc | 139 ++++++++++++++++++ src/alloy/backend/x64/x64_thunk_emitter.h | 124 ++++++++++++++++ 14 files changed, 424 insertions(+), 80 deletions(-) create mode 100644 src/alloy/backend/x64/x64_thunk_emitter.cc create mode 100644 src/alloy/backend/x64/x64_thunk_emitter.h diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index f161424a8..d2420c740 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -9,8 +9,10 @@ #include +#include #include #include +#include #include #include #include @@ -44,6 +46,11 @@ namespace { #define SHUFPS_SWAP_DWORDS 0x1B + +// Major templating foo lives in here. +#include + + enum XmmConst { XMMZero = 0, XMMOne = 1, @@ -156,25 +163,31 @@ void* ResolveFunctionAddress(void* raw_context, uint32_t target_address) { auto x64_fn = (X64Function*)fn; return x64_fn->machine_code(); } +void TransitionToHost(X64Emitter& e) { + // Expects: + // rcx = context + // rdx = target host function + // r8 = arg0 + // r9 = arg1 + // Returns: + // rax = host return + auto thunk = e.backend()->guest_to_host_thunk(); + e.mov(e.rax, (uint64_t)thunk); + e.call(e.rax); +} void IssueCall(X64Emitter& e, FunctionInfo* symbol_info, uint32_t flags) { auto fn = symbol_info->function(); // Resolve address to the function to call and store in rax. // TODO(benvanik): caching/etc. For now this makes debugging easier. e.mov(e.rdx, (uint64_t)symbol_info); - e.mov(e.rax, (uint64_t)ResolveFunctionSymbol); - e.call(e.rax); - e.mov(e.rcx, e.qword[e.rsp + 0]); - e.mov(e.rdx, e.qword[e.rcx + 8]); // membase + CallNative(e, ResolveFunctionSymbol); // Actually jump/call to rax. if (flags & CALL_TAIL) { - // TODO(benvanik): adjust stack? 
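// The bare 72 from the last change becomes StackLayout::GUEST_STACK_SIZE
// (120 as of this patch) so the prologue, epilogue, unwind info, and this
// tail-call teardown can no longer drift apart; the layout itself lives in
// x64_thunk_emitter.h below.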
- e.add(e.rsp, 72); + e.add(e.rsp, StackLayout::GUEST_STACK_SIZE); e.jmp(e.rax); } else { e.call(e.rax); - e.mov(e.rcx, e.qword[e.rsp + 0]); - e.mov(e.rdx, e.qword[e.rcx + 8]); // membase } } void IssueCallIndirect(X64Emitter& e, Value* target, uint32_t flags) { @@ -186,30 +199,20 @@ void IssueCallIndirect(X64Emitter& e, Value* target, uint32_t flags) { e.mov(e.rdx, r); } e.EndOp(r); - e.mov(e.rax, (uint64_t)ResolveFunctionAddress); - e.call(e.rax); - e.mov(e.rcx, e.qword[e.rsp + 0]); - e.mov(e.rdx, e.qword[e.rcx + 8]); // membase + CallNative(e, ResolveFunctionAddress); // Actually jump/call to rax. if (flags & CALL_TAIL) { - // TODO(benvanik): adjust stack? - e.add(e.rsp, 72); + e.add(e.rsp, StackLayout::GUEST_STACK_SIZE); e.jmp(e.rax); } else { e.call(e.rax); - e.mov(e.rcx, e.qword[e.rsp + 0]); - e.mov(e.rdx, e.qword[e.rcx + 8]); // membase } } } // namespace -// Major templating foo lives in here. -#include - - void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { // -------------------------------------------------------------------------- // General @@ -337,9 +340,13 @@ table->AddSequence(OPCODE_CALL_EXTERN, [](X64Emitter& e, Instr*& i) { auto symbol_info = i->src1.symbol_info; XEASSERT(symbol_info->behavior() == FunctionInfo::BEHAVIOR_EXTERN); XEASSERTNOTNULL(symbol_info->extern_handler()); - e.mov(e.rdx, (uint64_t)symbol_info->extern_arg0()); - e.mov(e.r8, (uint64_t)symbol_info->extern_arg1()); - CallNative(e, symbol_info->extern_handler()); + // rdx = target host function + // r8 = arg0 + // r9 = arg1 + e.mov(e.rdx, (uint64_t)symbol_info->extern_handler()); + e.mov(e.r8, (uint64_t)symbol_info->extern_arg0()); + e.mov(e.r9, (uint64_t)symbol_info->extern_arg1()); + TransitionToHost(e); i = e.Advance(i); return true; }); diff --git a/src/alloy/backend/x64/lowering/op_utils.inl b/src/alloy/backend/x64/lowering/op_utils.inl index 52bf39d7d..7fe1bda6d 100644 --- a/src/alloy/backend/x64/lowering/op_utils.inl +++ b/src/alloy/backend/x64/lowering/op_utils.inl @@ -17,7 +17,7 @@ namespace { #define LIKE_REG(dest, like) Reg(dest.getIdx(), dest.getKind(), like.getBit(), false) #define NAX_LIKE(like) Reg(e.rax.getIdx(), e.rax.getKind(), like.getBit(), false) -#define STASH_OFFSET 48 +#define STASH_OFFSET 0 // If we are running with tracing on we have to store the EFLAGS in the stack, // otherwise our calls out to C to print will clear it before DID_CARRY/etc @@ -68,7 +68,7 @@ void MovMem64(X64Emitter& e, RegExp& addr, uint64_t v) { void CallNative(X64Emitter& e, void* target) { e.mov(e.rax, (uint64_t)target); e.call(e.rax); - e.mov(e.rcx, e.qword[e.rsp + 0]); + e.mov(e.rcx, e.qword[e.rsp + StackLayout::RCX_HOME]); e.mov(e.rdx, e.qword[e.rcx + 8]); // membase } diff --git a/src/alloy/backend/x64/sources.gypi b/src/alloy/backend/x64/sources.gypi index 0a3ead5a9..7ca63e25d 100644 --- a/src/alloy/backend/x64/sources.gypi +++ b/src/alloy/backend/x64/sources.gypi @@ -12,6 +12,8 @@ 'x64_emitter.h', 'x64_function.cc', 'x64_function.h', + 'x64_thunk_emitter.cc', + 'x64_thunk_emitter.h', ], 'includes': [ diff --git a/src/alloy/backend/x64/x64_assembler.cc b/src/alloy/backend/x64/x64_assembler.cc index d4e88e621..3f90b077b 100644 --- a/src/alloy/backend/x64/x64_assembler.cc +++ b/src/alloy/backend/x64/x64_assembler.cc @@ -30,7 +30,7 @@ using namespace alloy::runtime; X64Assembler::X64Assembler(X64Backend* backend) : x64_backend_(backend), - emitter_(0), + emitter_(0), allocator_(0), Assembler(backend) { } @@ -39,6 +39,7 @@ X64Assembler::~X64Assembler() { })); delete emitter_; + 
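// Teardown order matters: the emitter was constructed against this
// allocator and may still reference it, so it is deleted first now that
// the assembler owns both objects.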
delete allocator_; } int X64Assembler::Initialize() { @@ -47,8 +48,8 @@ int X64Assembler::Initialize() { return result; } - emitter_ = new X64Emitter(x64_backend_, - new XbyakAllocator()); + allocator_ = new XbyakAllocator(); + emitter_ = new X64Emitter(x64_backend_, allocator_); alloy::tracing::WriteEvent(EventType::AssemblerInit({ })); diff --git a/src/alloy/backend/x64/x64_assembler.h b/src/alloy/backend/x64/x64_assembler.h index 3d6235254..063e19c63 100644 --- a/src/alloy/backend/x64/x64_assembler.h +++ b/src/alloy/backend/x64/x64_assembler.h @@ -21,6 +21,7 @@ namespace x64 { class X64Backend; class X64Emitter; +class XbyakAllocator; class X64Assembler : public Assembler { @@ -45,6 +46,7 @@ private: private: X64Backend* x64_backend_; X64Emitter* emitter_; + XbyakAllocator* allocator_; StringBuffer string_buffer_; }; diff --git a/src/alloy/backend/x64/x64_backend.cc b/src/alloy/backend/x64/x64_backend.cc index 560328750..031dc6bda 100644 --- a/src/alloy/backend/x64/x64_backend.cc +++ b/src/alloy/backend/x64/x64_backend.cc @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -46,6 +47,13 @@ int X64Backend::Initialize() { return result; } + auto allocator = new XbyakAllocator(); + auto thunk_emitter = new X64ThunkEmitter(this, allocator); + host_to_guest_thunk_ = thunk_emitter->EmitHostToGuestThunk(); + guest_to_host_thunk_ = thunk_emitter->EmitGuestToHostThunk(); + delete thunk_emitter; + delete allocator; + lowering_table_ = new LoweringTable(this); RegisterSequences(lowering_table_); diff --git a/src/alloy/backend/x64/x64_backend.h b/src/alloy/backend/x64/x64_backend.h index b10f7e571..dd12c0347 100644 --- a/src/alloy/backend/x64/x64_backend.h +++ b/src/alloy/backend/x64/x64_backend.h @@ -26,12 +26,18 @@ namespace lowering { class LoweringTable; } #define ALLOY_HAS_X64_BACKEND 1 +typedef void* (*HostToGuestThunk)(void* target, void* arg0, void* arg1); +typedef void* (*GuestToHostThunk)(void* target, void* arg0, void* arg1); + class X64Backend : public Backend { public: X64Backend(runtime::Runtime* runtime); virtual ~X64Backend(); X64CodeCache* code_cache() const { return code_cache_; } + HostToGuestThunk host_to_guest_thunk() const { return host_to_guest_thunk_; } + GuestToHostThunk guest_to_host_thunk() const { return guest_to_host_thunk_; } + lowering::LoweringTable* lowering_table() const { return lowering_table_; } virtual int Initialize(); @@ -40,6 +46,9 @@ public: private: X64CodeCache* code_cache_; + HostToGuestThunk host_to_guest_thunk_; + GuestToHostThunk guest_to_host_thunk_; + lowering::LoweringTable* lowering_table_; }; diff --git a/src/alloy/backend/x64/x64_code_cache.cc b/src/alloy/backend/x64/x64_code_cache.cc index c7a456830..2b2bf322d 100644 --- a/src/alloy/backend/x64/x64_code_cache.cc +++ b/src/alloy/backend/x64/x64_code_cache.cc @@ -34,14 +34,14 @@ public: const static uint32_t ESTIMATED_FN_SIZE = 512; // Size of unwind info per function. // TODO(benvanik): move this to emitter. 
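// Reading the size arithmetic: UNWIND_INFO has a 4-byte fixed header and
// each UNWIND_CODE slot is 2 bytes. The old budget covered a single slot
// (one small alloc); the new one leaves room for the extra slots that a
// UWOP_ALLOC_LARGE consumes, since its size spills into following slots.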
- const static uint32_t UNWIND_INFO_SIZE = 4 + (2 * 1); + const static uint32_t UNWIND_INFO_SIZE = 4 + (2 * 1 + 2 + 2); void* fn_table_handle; RUNTIME_FUNCTION* fn_table; uint32_t fn_table_count; uint32_t fn_table_capacity; - void AddTableEntry(uint8_t* code, size_t code_size); + void AddTableEntry(uint8_t* code, size_t code_size, size_t stack_size); }; @@ -73,7 +73,8 @@ int X64CodeCache::Initialize() { return 0; } -void* X64CodeCache::PlaceCode(void* machine_code, size_t code_size) { +void* X64CodeCache::PlaceCode(void* machine_code, size_t code_size, + size_t stack_size) { // Add unwind info into the allocation size. Keep things 16b aligned. code_size += XEROUNDUP(X64CodeChunk::UNWIND_INFO_SIZE, 16); @@ -101,7 +102,7 @@ void* X64CodeCache::PlaceCode(void* machine_code, size_t code_size) { active_chunk_->offset += code_size; // Add entry to fn table. - active_chunk_->AddTableEntry(final_address, code_size); + active_chunk_->AddTableEntry(final_address, code_size, stack_size); UnlockMutex(lock_); @@ -156,6 +157,27 @@ typedef enum _UNWIND_OP_CODES { UWOP_SAVE_XMM128_FAR, /* info == XMM reg number, offset in next 2 slots */ UWOP_PUSH_MACHFRAME /* info == 0: no error-code, 1: error-code */ } UNWIND_CODE_OPS; +class UNWIND_REGISTER { +public: + enum _ { + RAX = 0, + RCX = 1, + RDX = 2, + RBX = 3, + RSP = 4, + RBP = 5, + RSI = 6, + RDI = 7, + R8 = 8, + R9 = 9, + R10 = 10, + R11 = 11, + R12 = 12, + R13 = 13, + R14 = 14, + R15 = 15, + }; +}; typedef union _UNWIND_CODE { struct { @@ -183,7 +205,8 @@ typedef struct _UNWIND_INFO { } UNWIND_INFO, *PUNWIND_INFO; } // namespace -void X64CodeChunk::AddTableEntry(uint8_t* code, size_t code_size) { +void X64CodeChunk::AddTableEntry(uint8_t* code, size_t code_size, + size_t stack_size) { // NOTE: we assume a chunk lock. if (fn_table_count + 1 > fn_table_capacity) { @@ -213,26 +236,60 @@ void X64CodeChunk::AddTableEntry(uint8_t* code, size_t code_size) { size_t unwind_info_offset = offset; offset += UNWIND_INFO_SIZE; - // TODO(benvanik): take as parameters? - bool has_prolog = true; - uint8_t prolog_size = 4; - uint8_t stack_bytes = 72; + if (!stack_size) { + uint8_t prolog_size = 0; - // http://msdn.microsoft.com/en-us/library/ddssxxy8.aspx - UNWIND_INFO* unwind_info = (UNWIND_INFO*)(buffer + unwind_info_offset); - unwind_info->Version = 1; - unwind_info->Flags = 0; - unwind_info->SizeOfProlog = has_prolog ? prolog_size : 0; - unwind_info->CountOfCodes = has_prolog ? 
1 : 0; - unwind_info->FrameRegister = 0; - unwind_info->FrameOffset = 0; + // http://msdn.microsoft.com/en-us/library/ddssxxy8.aspx + UNWIND_INFO* unwind_info = (UNWIND_INFO*)(buffer + unwind_info_offset); + unwind_info->Version = 1; + unwind_info->Flags = 0; + unwind_info->SizeOfProlog = 0; + unwind_info->CountOfCodes = 0; + unwind_info->FrameRegister = 0; + unwind_info->FrameOffset = 0; + } else if (stack_size <= 128) { + uint8_t prolog_size = 4; - // http://msdn.microsoft.com/en-us/library/ck9asaa9.aspx - auto& code_0 = unwind_info->UnwindCode[0]; - code_0.CodeOffset = 4; // end of instruction + 1 == offset of next instruction - code_0.UnwindOp = UWOP_ALLOC_SMALL; - code_0.OpInfo = stack_bytes / 8 - 1; - XEASSERT(stack_bytes < 128); + // http://msdn.microsoft.com/en-us/library/ddssxxy8.aspx + UNWIND_INFO* unwind_info = (UNWIND_INFO*)(buffer + unwind_info_offset); + unwind_info->Version = 1; + unwind_info->Flags = 0; + unwind_info->SizeOfProlog = prolog_size; + unwind_info->CountOfCodes = 1; + unwind_info->FrameRegister = 0; + unwind_info->FrameOffset = 0; + + // http://msdn.microsoft.com/en-us/library/ck9asaa9.aspx + size_t co = 0; + auto& unwind_code = unwind_info->UnwindCode[co++]; + unwind_code.CodeOffset = 14; // end of instruction + 1 == offset of next instruction + unwind_code.UnwindOp = UWOP_ALLOC_SMALL; + unwind_code.OpInfo = stack_size / 8 - 1; + } else { + // TODO(benvanik): take as parameters? + uint8_t prolog_size = 17; + + // This doesn't work, for some reason. + XEASSERTALWAYS(); + + // http://msdn.microsoft.com/en-us/library/ddssxxy8.aspx + UNWIND_INFO* unwind_info = (UNWIND_INFO*)(buffer + unwind_info_offset); + unwind_info->Version = 1; + unwind_info->Flags = 0; + unwind_info->SizeOfProlog = prolog_size; + unwind_info->CountOfCodes = 3; + unwind_info->FrameRegister = 0; + unwind_info->FrameOffset = 0; + + // http://msdn.microsoft.com/en-us/library/ck9asaa9.aspx + size_t co = 0; + auto& unwind_code = unwind_info->UnwindCode[co++]; + unwind_code.CodeOffset = 17; // end of instruction + 1 == offset of next instruction + unwind_code.UnwindOp = UWOP_ALLOC_LARGE; + unwind_code.OpInfo = 0; + unwind_code = unwind_info->UnwindCode[co++]; + unwind_code.FrameOffset = (USHORT)(stack_size) / 8; + } // Add entry. auto& fn_entry = fn_table[fn_table_count++]; diff --git a/src/alloy/backend/x64/x64_code_cache.h b/src/alloy/backend/x64/x64_code_cache.h index 1d6140430..23ba2e639 100644 --- a/src/alloy/backend/x64/x64_code_cache.h +++ b/src/alloy/backend/x64/x64_code_cache.h @@ -30,7 +30,7 @@ public: // TODO(benvanik): keep track of code blocks // TODO(benvanik): padding/guards/etc - void* PlaceCode(void* machine_code, size_t code_size); + void* PlaceCode(void* machine_code, size_t code_size, size_t stack_size); private: const static size_t DEFAULT_CHUNK_SIZE = 4 * 1024 * 1024; diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index fc5dce840..1e938b084 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -46,7 +47,6 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator) : } X64Emitter::~X64Emitter() { - delete allocator_; } int X64Emitter::Initialize() { @@ -71,7 +71,7 @@ int X64Emitter::Emit( // Copy the final code to the cache and relocate it. out_code_size = getSize(); - out_code_address = Emplace(code_cache_); + out_code_address = Emplace(StackLayout::GUEST_STACK_SIZE); // Stash source map. 
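// (The source map pairs each emitted host offset with the guest address
// that produced it, so a host rip can be translated back to the original
// PPC location later; see MarkSourceOffset further down.)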
if (debug_info_flags & DEBUG_INFO_SOURCE_MAP) { @@ -83,13 +83,13 @@ int X64Emitter::Emit( return 0; } -void* X64Emitter::Emplace(X64CodeCache* code_cache) { +void* X64Emitter::Emplace(size_t stack_size) { // To avoid changing xbyak, we do a switcharoo here. // top_ points to the Xbyak buffer, and since we are in AutoGrow mode // it has pending relocations. We copy the top_ to our buffer, swap the // pointer, relocate, then return the original scratch pointer for use. uint8_t* old_address = top_; - void* new_address = code_cache->PlaceCode(top_, size_); + void* new_address = code_cache_->PlaceCode(top_, size_, stack_size); top_ = (uint8_t*)new_address; ready(); top_ = old_address; @@ -132,21 +132,13 @@ int X64Emitter::Emit(HIRBuilder* builder) { // X64CodeCache, which dynamically generates exception information. // Adding or changing anything here must be matched! const bool emit_prolog = true; - const size_t stack_size = 72; + const size_t stack_size = StackLayout::GUEST_STACK_SIZE; if (emit_prolog) { - mov(qword[rsp + 8], rcx); + mov(qword[rsp + 8 * 2], rdx); + mov(qword[rsp + 8 * 1], rcx); sub(rsp, stack_size); - mov(qword[rsp + 8 * 0], rbx); - mov(qword[rsp + 8 * 1], r12); - mov(qword[rsp + 8 * 2], r13); - mov(qword[rsp + 8 * 3], r14); - mov(qword[rsp + 8 * 4], r15); } - // membase stays in rdx. If we evict it (like on function calls) we - // must put it back. - mov(rdx, qword[rcx + 8]); - auto lowering_table = backend_->lowering_table(); reg_state_.active_regs = reg_state_.live_regs = reserved_regs; @@ -180,12 +172,9 @@ int X64Emitter::Emit(HIRBuilder* builder) { // Function epilog. L("epilog"); if (emit_prolog) { - mov(rbx, qword[rsp + 8 * 0]); - mov(r12, qword[rsp + 8 * 1]); - mov(r13, qword[rsp + 8 * 2]); - mov(r14, qword[rsp + 8 * 3]); - mov(r15, qword[rsp + 8 * 4]); add(rsp, stack_size); + mov(rcx, qword[rsp + 8 * 1]); + mov(rdx, qword[rsp + 8 * 2]); } ret(); diff --git a/src/alloy/backend/x64/x64_emitter.h b/src/alloy/backend/x64/x64_emitter.h index fe458b8cb..c5bc51e05 100644 --- a/src/alloy/backend/x64/x64_emitter.h +++ b/src/alloy/backend/x64/x64_emitter.h @@ -45,6 +45,7 @@ public: virtual ~X64Emitter(); runtime::Runtime* runtime() const { return runtime_; } + X64Backend* backend() const { return backend_; } int Initialize(); @@ -144,15 +145,15 @@ public: void MarkSourceOffset(hir::Instr* i); -private: - void* Emplace(X64CodeCache* code_cache); +protected: + void* Emplace(size_t stack_size); int Emit(hir::HIRBuilder* builder); -private: +protected: runtime::Runtime* runtime_; - X64Backend* backend_; - X64CodeCache* code_cache_; - XbyakAllocator* allocator_; + X64Backend* backend_; + X64CodeCache* code_cache_; + XbyakAllocator* allocator_; struct { // Registers currently active within a begin/end op block. 
These diff --git a/src/alloy/backend/x64/x64_function.cc b/src/alloy/backend/x64/x64_function.cc index b8172247e..3f7f4bc57 100644 --- a/src/alloy/backend/x64/x64_function.cc +++ b/src/alloy/backend/x64/x64_function.cc @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -42,7 +43,11 @@ int X64Function::RemoveBreakpointImpl(Breakpoint* breakpoint) { } int X64Function::CallImpl(ThreadState* thread_state) { - typedef void(*call_t)(void* raw_context, uint8_t* membase); - ((call_t)machine_code_)(thread_state->raw_context(), thread_state->memory()->membase()); + auto backend = (X64Backend*)thread_state->runtime()->backend(); + auto thunk = backend->host_to_guest_thunk(); + thunk( + machine_code_, + thread_state->raw_context(), + thread_state->memory()->membase()); return 0; } diff --git a/src/alloy/backend/x64/x64_thunk_emitter.cc b/src/alloy/backend/x64/x64_thunk_emitter.cc new file mode 100644 index 000000000..0bd7239f6 --- /dev/null +++ b/src/alloy/backend/x64/x64_thunk_emitter.cc @@ -0,0 +1,139 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include + + +using namespace alloy; +using namespace alloy::backend; +using namespace alloy::backend::x64; + +using namespace Xbyak; + + +X64ThunkEmitter::X64ThunkEmitter( + X64Backend* backend, XbyakAllocator* allocator) : + X64Emitter(backend, allocator) { +} + +X64ThunkEmitter::~X64ThunkEmitter() { +} + +HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() { + // rcx = target + // rdx = arg0 + // r8 = arg1 + + const size_t stack_size = StackLayout::THUNK_STACK_SIZE; + // rsp + 0 = return address + mov(qword[rsp + 8 * 2], rdx); + mov(qword[rsp + 8 * 1], rcx); + sub(rsp, stack_size); + + mov(qword[rsp + 56], rbx); + mov(qword[rsp + 64], rbp); + mov(qword[rsp + 72], rsi); + mov(qword[rsp + 80], rdi); + mov(qword[rsp + 88], r12); + mov(qword[rsp + 96], r13); + mov(qword[rsp + 104], r14); + mov(qword[rsp + 112], r15); + + /*movaps(ptr[rsp + 128], xmm6); + movaps(ptr[rsp + 144], xmm7); + movaps(ptr[rsp + 160], xmm8); + movaps(ptr[rsp + 176], xmm9); + movaps(ptr[rsp + 192], xmm10); + movaps(ptr[rsp + 208], xmm11); + movaps(ptr[rsp + 224], xmm12); + movaps(ptr[rsp + 240], xmm13); + movaps(ptr[rsp + 256], xmm14); + movaps(ptr[rsp + 272], xmm15);*/ + + mov(rax, rcx); + mov(rcx, rdx); + mov(rdx, r8); + call(rax); + + /*movaps(xmm6, ptr[rsp + 128]); + movaps(xmm7, ptr[rsp + 144]); + movaps(xmm8, ptr[rsp + 160]); + movaps(xmm9, ptr[rsp + 176]); + movaps(xmm10, ptr[rsp + 192]); + movaps(xmm11, ptr[rsp + 208]); + movaps(xmm12, ptr[rsp + 224]); + movaps(xmm13, ptr[rsp + 240]); + movaps(xmm14, ptr[rsp + 256]); + movaps(xmm15, ptr[rsp + 272]);*/ + + mov(rbx, qword[rsp + 56]); + mov(rbp, qword[rsp + 64]); + mov(rsi, qword[rsp + 72]); + mov(rdi, qword[rsp + 80]); + mov(r12, qword[rsp + 88]); + mov(r13, qword[rsp + 96]); + mov(r14, qword[rsp + 104]); + mov(r15, qword[rsp + 112]); + + add(rsp, stack_size); + mov(rcx, qword[rsp + 8 * 1]); + mov(rdx, qword[rsp + 8 * 2]); + ret(); + + void* fn = Emplace(stack_size); + return (HostToGuestThunk)fn; +} + +GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() { + // rcx = context + // rdx = target function + 
// r8 = arg0 + // r9 = arg1 + + const size_t stack_size = StackLayout::THUNK_STACK_SIZE; + // rsp + 0 = return address + mov(qword[rsp + 8 * 2], rdx); + mov(qword[rsp + 8 * 1], rcx); + sub(rsp, stack_size); + + mov(qword[rsp + 56], rbx); + mov(qword[rsp + 64], rbp); + mov(qword[rsp + 72], rsi); + mov(qword[rsp + 80], rdi); + mov(qword[rsp + 88], r12); + mov(qword[rsp + 96], r13); + mov(qword[rsp + 104], r14); + mov(qword[rsp + 112], r15); + + // TODO(benvanik): save things? XMM0-5? + + mov(rax, rdx); + mov(rdx, r8); + mov(r8, r9); + call(rax); + + mov(rbx, qword[rsp + 56]); + mov(rbp, qword[rsp + 64]); + mov(rsi, qword[rsp + 72]); + mov(rdi, qword[rsp + 80]); + mov(r12, qword[rsp + 88]); + mov(r13, qword[rsp + 96]); + mov(r14, qword[rsp + 104]); + mov(r15, qword[rsp + 112]); + + add(rsp, stack_size); + mov(rcx, qword[rsp + 8 * 1]); + mov(rdx, qword[rsp + 8 * 2]); + ret(); + + void* fn = Emplace(stack_size); + return (GuestToHostThunk)fn; +} diff --git a/src/alloy/backend/x64/x64_thunk_emitter.h b/src/alloy/backend/x64/x64_thunk_emitter.h new file mode 100644 index 000000000..4ce0669a7 --- /dev/null +++ b/src/alloy/backend/x64/x64_thunk_emitter.h @@ -0,0 +1,124 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_CPU_X64_X64_THUNK_EMITTER_H_ +#define XENIA_CPU_X64_X64_THUNK_EMITTER_H_ + +#include +#include +#include + + +namespace alloy { +namespace backend { +namespace x64 { + + +/** + * Stack Layout + * ---------------------------- + * NOTE: stack must always be 16b aligned. + * + * +------------------+ + * | scratch, 56b | rsp + 0 + * | | + * | .... 
| + * | | + * | | + * +------------------+ + * | rbx | rsp + 56 + * +------------------+ + * | rbp | rsp + 64 + * +------------------+ + * | rsi | rsp + 72 + * +------------------+ + * | rdi | rsp + 80 + * +------------------+ + * | r12 | rsp + 88 + * +------------------+ + * | r13 | rsp + 96 + * +------------------+ + * | r14 | rsp + 104 + * +------------------+ + * | r15 | rsp + 112 + * +------------------+ + * | (return address) | rsp + 120 + * +------------------+ + * | (rcx home) | rsp + 128 + * +------------------+ + * | (rdx home) | rsp + 136 + * +------------------+ + * + * + * TODO: + * +------------------+ + * | xmm6 | rsp + 128 + * | | + * +------------------+ + * | xmm7 | rsp + 144 + * | | + * +------------------+ + * | xmm8 | rsp + 160 + * | | + * +------------------+ + * | xmm9 | rsp + 176 + * | | + * +------------------+ + * | xmm10 | rsp + 192 + * | | + * +------------------+ + * | xmm11 | rsp + 208 + * | | + * +------------------+ + * | xmm12 | rsp + 224 + * | | + * +------------------+ + * | xmm13 | rsp + 240 + * | | + * +------------------+ + * | xmm14 | rsp + 256 + * | | + * +------------------+ + * | xmm15 | rsp + 272 + * | | + * +------------------+ + * + */ + +class StackLayout { +public: + const static size_t GUEST_STACK_SIZE = 120; + + const static size_t THUNK_STACK_SIZE = 120; + + const static size_t RETURN_ADDRESS = 120; + const static size_t RCX_HOME = 128; + const static size_t RDX_HOME = 136; +}; + + +class X64ThunkEmitter : public X64Emitter { +public: + X64ThunkEmitter(X64Backend* backend, XbyakAllocator* allocator); + virtual ~X64ThunkEmitter(); + + // Call a generated function, saving all stack parameters. + HostToGuestThunk EmitHostToGuestThunk(); + + // Function that guest code can call to transition into host code. + GuestToHostThunk EmitGuestToHostThunk(); +}; + + +} // namespace x64 +} // namespace backend +} // namespace alloy + + +#endif // XENIA_CPU_X64_X64_THUNK_EMITTER_H_ From 458368c35e21c709a1da54362d752d8c8e7ea1d8 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Fri, 31 Jan 2014 22:51:04 -0800 Subject: [PATCH 047/184] Fixing IVM SUB bug. --- src/alloy/backend/ivm/ivm_intcode.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/alloy/backend/ivm/ivm_intcode.cc b/src/alloy/backend/ivm/ivm_intcode.cc index 20544028c..be1c9c206 100644 --- a/src/alloy/backend/ivm/ivm_intcode.cc +++ b/src/alloy/backend/ivm/ivm_intcode.cc @@ -2473,9 +2473,9 @@ uint32_t IntCode_SUB_I16_I16(IntCodeState& ics, const IntCode* i) { uint32_t IntCode_SUB_I32_I32(IntCodeState& ics, const IntCode* i) { int32_t a = ics.rf[i->src1_reg].i32; int32_t b = ics.rf[i->src2_reg].i32; if (i->flags == ARITHMETIC_SET_CARRY) { - ics.did_carry = a < ~b; + ics.did_carry = SUB_DID_CARRY(a, b); } - ics.did_carry = SUB_DID_CARRY(a, b); + ics.rf[i->dest_reg].i32 = a - b; return IA_NEXT; } uint32_t IntCode_SUB_I64_I64(IntCodeState& ics, const IntCode* i) { From 6ed411a08d7e3bf9a4bf415e044e559196e9eda5 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Fri, 31 Jan 2014 22:51:18 -0800 Subject: [PATCH 048/184] Fixing printing of floats. 
--- src/alloy/backend/x64/lowering/op_utils.inl | 2 +- src/alloy/backend/x64/x64_thunk_emitter.h | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/alloy/backend/x64/lowering/op_utils.inl b/src/alloy/backend/x64/lowering/op_utils.inl index 7fe1bda6d..bfb0e63d9 100644 --- a/src/alloy/backend/x64/lowering/op_utils.inl +++ b/src/alloy/backend/x64/lowering/op_utils.inl @@ -17,7 +17,7 @@ namespace { #define LIKE_REG(dest, like) Reg(dest.getIdx(), dest.getKind(), like.getBit(), false) #define NAX_LIKE(like) Reg(e.rax.getIdx(), e.rax.getKind(), like.getBit(), false) -#define STASH_OFFSET 0 +#define STASH_OFFSET 32 // If we are running with tracing on we have to store the EFLAGS in the stack, // otherwise our calls out to C to print will clear it before DID_CARRY/etc diff --git a/src/alloy/backend/x64/x64_thunk_emitter.h b/src/alloy/backend/x64/x64_thunk_emitter.h index 4ce0669a7..6559ab9a5 100644 --- a/src/alloy/backend/x64/x64_thunk_emitter.h +++ b/src/alloy/backend/x64/x64_thunk_emitter.h @@ -26,10 +26,11 @@ namespace x64 { * NOTE: stack must always be 16b aligned. * * +------------------+ - * | scratch, 56b | rsp + 0 + * | arg temp, 3 * 8 | rsp + 0 * | | - * | .... | * | | + * +------------------+ + * | scratch, 24b | rsp + 32 * | | * +------------------+ * | rbx | rsp + 56 From 7141fbad27ecd6e5eb38f8c71b036970ca865b0f Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Fri, 31 Jan 2014 22:51:40 -0800 Subject: [PATCH 049/184] Ignoring comment on fall-through for now, as all imports cause it. --- src/alloy/hir/hir_builder.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/alloy/hir/hir_builder.cc b/src/alloy/hir/hir_builder.cc index 69cabc286..efc12e723 100644 --- a/src/alloy/hir/hir_builder.cc +++ b/src/alloy/hir/hir_builder.cc @@ -71,7 +71,7 @@ int HIRBuilder::Finalize() { // No following block. // Sometimes VC++ generates functions with bl at the end even if they // will never return. Just add a return to satisfy things. - XELOGW("Fall-through out of the function."); + //XELOGW("Fall-through out of the function."); Trap(); Return(); current_block_ = NULL; From efb2b6f0378cd6fdacb7a9ee3096946e22d43f3e Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Fri, 31 Jan 2014 22:54:50 -0800 Subject: [PATCH 050/184] Fixing disasm of XO. --- src/alloy/frontend/ppc/ppc_disasm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/alloy/frontend/ppc/ppc_disasm.cc b/src/alloy/frontend/ppc/ppc_disasm.cc index 99e325fa6..ee9f21522 100644 --- a/src/alloy/frontend/ppc/ppc_disasm.cc +++ b/src/alloy/frontend/ppc/ppc_disasm.cc @@ -115,7 +115,7 @@ void Disasm_X_RA_RB(InstrData& i, StringBuffer* str) { i.X.RA, i.X.RB); } void Disasm_XO_RT_RA_RB(InstrData& i, StringBuffer* str) { - str->Append("%*s%s%s r%d, r%d", i.XO.Rc ? -7 : -8, i.type->name, + str->Append("%*s%s%s r%d, r%d, r%d", i.XO.Rc ? -7 : -8, i.type->name, i.XO.OE ? "o" : "", i.XO.Rc ? "." : "", i.XO.RT, i.XO.RA, i.XO.RB); } From e63fbcc2cdce602322a2e46e144f1811482eb56e Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Fri, 31 Jan 2014 23:16:38 -0800 Subject: [PATCH 051/184] Fixing sub w/ SET_CARRY. 
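For the record, the trick used here: PowerPC's carry on subtract is the
inverse of x86's borrow (subfc sets CA when no borrow occurs). Computing
a - b as a + ~b + 1 makes x86's CF equal CA directly, hence the
not/stc/adc sequence in place of a plain sub, whose CF would be the
complement of what DID_CARRY expects.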
--- .../x64/lowering/lowering_sequences.cc | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index d2420c740..e17fe0cc1 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -1809,9 +1809,25 @@ table->AddSequence(OPCODE_VECTOR_ADD, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_SUB, [](X64Emitter& e, Instr*& i) { if (IsIntType(i->dest->type)) { IntBinaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { - e.sub(dest_src, src); + if (i.flags & ARITHMETIC_SET_CARRY) { + auto Nax = LIKE_REG(e.rax, src); + e.mov(Nax, src); + e.not(Nax); + e.stc(); + e.adc(dest_src, Nax); + } else { + e.sub(dest_src, src); + } }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { - e.sub(dest_src, src); + if (i.flags & ARITHMETIC_SET_CARRY) { + auto Nax = LIKE_REG(e.rax, dest_src); + e.mov(Nax, src); + e.not(Nax); + e.stc(); + e.adc(dest_src, Nax); + } else { + e.sub(dest_src, src); + } }); } else if (IsFloatType(i->dest->type)) { XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { From 7044b74dc229eb38c551b4ab96e0275374a0a162 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Fri, 31 Jan 2014 23:33:32 -0800 Subject: [PATCH 052/184] Fixing register value overwriting in address calculation. --- .../x64/lowering/lowering_sequences.cc | 31 ++++++------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index e17fe0cc1..b9fd38a77 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -1054,19 +1054,16 @@ table->AddSequence(OPCODE_LOAD, [](X64Emitter& e, Instr*& i) { } } - // TODO(benvanik): dynamic register access check. // mov reg, [membase + address.32] - Reg64 addr_off; - RegExp addr; if (i->src1.value->IsConstant()) { - // TODO(benvanik): a way to do this without using a register. 
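// One possibility (a sketch, not validated here): a constant guest address
// already fits in 32 bits, so it could fold straight into the addressing
// mode as a displacement, e.g.
//   auto addr = e.rdx + i->src1.value->AsUint32();
// at the cost of losing the uniform rdx+rax form that the dynamic access
// check below depends on (it expects the address copy in rax).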
e.mov(e.eax, i->src1.value->AsUint32()); - addr = e.rdx + e.rax; } else { + Reg64 addr_off; e.BeginOp(i->src1.value, addr_off, 0); - e.mov(addr_off.cvt32(), addr_off.cvt32()); // trunc to 32bits - addr = e.rdx + addr_off; + e.mov(e.eax, addr_off.cvt32()); // trunc to 32bits + e.EndOp(addr_off); } + auto addr = e.rdx + e.rax; #if DYNAMIC_REGISTER_ACCESS_CHECK e.inLocalLabel(); @@ -1076,7 +1073,7 @@ table->AddSequence(OPCODE_LOAD, [](X64Emitter& e, Instr*& i) { e.cmp(e.r8d, 0x7F000000); e.jne(".normal_addr"); if (IsIntType(i->dest->type)) { - e.mov(e.rdx, addr_off); + e.mov(e.rdx, e.rax); CallNative(e, DynamicRegisterLoad); Reg64 dyn_dest; e.BeginOp(i->dest, dyn_dest, REG_DEST); @@ -1179,9 +1176,6 @@ table->AddSequence(OPCODE_LOAD, [](X64Emitter& e, Instr*& i) { } else { ASSERT_INVALID_TYPE(); } - if (!i->src1.value->IsConstant()) { - e.EndOp(addr_off); - } #if DYNAMIC_REGISTER_ACCESS_CHECK e.L(".skip_access"); @@ -1235,18 +1229,16 @@ table->AddSequence(OPCODE_STORE, [](X64Emitter& e, Instr*& i) { } } - // TODO(benvanik): dynamic register access check // mov [membase + address.32], reg - Reg64 addr_off; - RegExp addr; if (i->src1.value->IsConstant()) { e.mov(e.eax, i->src1.value->AsUint32()); - addr = e.rdx + e.rax; } else { + Reg64 addr_off; e.BeginOp(i->src1.value, addr_off, 0); - e.mov(addr_off.cvt32(), addr_off.cvt32()); // trunc to 32bits - addr = e.rdx + addr_off; + e.mov(e.eax, addr_off.cvt32()); // trunc to 32bits + e.EndOp(addr_off); } + auto addr = e.rdx + e.rax; #if DYNAMIC_REGISTER_ACCESS_CHECK // if ((address & 0xFF000000) == 0x7F000000) do check; @@ -1276,7 +1268,7 @@ table->AddSequence(OPCODE_STORE, [](X64Emitter& e, Instr*& i) { break; } e.EndOp(dyn_src); - e.mov(e.rdx, addr_off); + e.mov(e.rdx, e.rax); CallNative(e, DynamicRegisterStore); } else { e.db(0xCC); @@ -1413,9 +1405,6 @@ table->AddSequence(OPCODE_STORE, [](X64Emitter& e, Instr*& i) { } else { ASSERT_INVALID_TYPE(); } - if (!i->src1.value->IsConstant()) { - e.EndOp(addr_off); - } #if DYNAMIC_REGISTER_ACCESS_CHECK e.L(".skip_access"); From 10c9537836df513185998148a0245bcb21de950a Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sat, 1 Feb 2014 00:05:21 -0800 Subject: [PATCH 053/184] Swapping register values. --- src/alloy/backend/x64/lowering/lowering_sequences.cc | 2 -- src/xenia/apu/audio_system.cc | 3 ++- src/xenia/gpu/graphics_system.cc | 3 ++- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index b9fd38a77..fd85ac495 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -1219,8 +1219,6 @@ table->AddSequence(OPCODE_STORE, [](X64Emitter& e, Instr*& i) { } e.EndOp(src2); } - // eh? - e.bswap(e.r8); CallNative(e, cbs->write); i = e.Advance(i); return true; diff --git a/src/xenia/apu/audio_system.cc b/src/xenia/apu/audio_system.cc index 1793fc92d..46b0b3924 100644 --- a/src/xenia/apu/audio_system.cc +++ b/src/xenia/apu/audio_system.cc @@ -187,10 +187,11 @@ uint64_t AudioSystem::ReadRegister(uint64_t addr) { XELOGAPU("ReadRegister(%.4X)", r); // 1800h is read on startup and stored -- context? buffers? // 1818h is read during a lock? 
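// The swap moved out of the JIT store path (note the stray bswap deleted
// above) and into the MMIO handlers themselves: callbacks now traffic in
// the guest's big-endian representation, so reads swap on the way out and
// writes swap on entry. XESWAP32BE(0) is a no-op kept for uniformity.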
- return 0; + return XESWAP32BE(0); } void AudioSystem::WriteRegister(uint64_t addr, uint64_t value) { + value = XESWAP32BE((uint32_t)value); uint32_t r = addr & 0xFFFF; XELOGAPU("WriteRegister(%.4X, %.8X)", r, value); // 1804h is written to with 0x02000000 and 0x03000000 around a lock operation diff --git a/src/xenia/gpu/graphics_system.cc b/src/xenia/gpu/graphics_system.cc index fbcb1d744..524ec467b 100644 --- a/src/xenia/gpu/graphics_system.cc +++ b/src/xenia/gpu/graphics_system.cc @@ -152,10 +152,11 @@ uint64_t GraphicsSystem::ReadRegister(uint64_t addr) { } XEASSERT(r >= 0 && r < kXEGpuRegisterCount); - return regs->values[r].u32; + return XESWAP32BE(regs->values[r].u32); } void GraphicsSystem::WriteRegister(uint64_t addr, uint64_t value) { + value = XESWAP32BE((uint32_t)value); uint32_t r = addr & 0xFFFF; XELOGGPU("WriteRegister(%.4X, %.8X)", r, value); From 50cb12634e9d6bdd161e4a4b0f233b5bc17ea9f6 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sat, 1 Feb 2014 00:05:38 -0800 Subject: [PATCH 054/184] Fixed rax clobber on mul/div. --- src/alloy/backend/x64/lowering/op_utils.inl | 87 +++++++++++---------- 1 file changed, 44 insertions(+), 43 deletions(-) diff --git a/src/alloy/backend/x64/lowering/op_utils.inl b/src/alloy/backend/x64/lowering/op_utils.inl index bfb0e63d9..afc075ce9 100644 --- a/src/alloy/backend/x64/lowering/op_utils.inl +++ b/src/alloy/backend/x64/lowering/op_utils.inl @@ -15,7 +15,8 @@ namespace { #define LIKE_REG(dest, like) Reg(dest.getIdx(), dest.getKind(), like.getBit(), false) -#define NAX_LIKE(like) Reg(e.rax.getIdx(), e.rax.getKind(), like.getBit(), false) +#define TEMP_REG e.r8 +#define TEMP_LIKE(like) Reg(TEMP_REG.getIdx(), TEMP_REG.getKind(), like.getBit(), false) #define STASH_OFFSET 32 @@ -473,10 +474,10 @@ void IntBinaryOpVV(X64Emitter& e, Instr*& i, vv_fn vv_fn, vv_fn(e, *i, dest, src1); } else { // Eww. - auto Nax = NAX_LIKE(src1); - e.mov(Nax, src1); - vv_fn(e, *i, Nax, src2); - e.mov(dest, Nax); + auto Ntx = TEMP_LIKE(src1); + e.mov(Ntx, src1); + vv_fn(e, *i, Ntx, src2); + e.mov(dest, Ntx); } } else { e.mov(dest, src1); @@ -500,12 +501,12 @@ void IntBinaryOpVC(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn, } else { // 64-bit. if (dest == src1) { - e.mov(e.rax, src2->constant.i64); - vv_fn(e, *i, dest, e.rax); + e.mov(TEMP_REG, src2->constant.i64); + vv_fn(e, *i, dest, TEMP_REG); } else { - e.mov(e.rax, src2->constant.i64); + e.mov(TEMP_REG, src2->constant.i64); e.mov(dest, src1); - vv_fn(e, *i, dest, e.rax); + vv_fn(e, *i, dest, TEMP_REG); } } e.EndOp(dest, src1); @@ -522,10 +523,10 @@ void IntBinaryOpCV(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn, vc_fn(e, *i, dest, (uint32_t)src1->get_constant(CT())); } else { // Eww. - auto Nax = NAX_LIKE(src2); - e.mov(Nax, src2); + auto Ntx = TEMP_LIKE(src2); + e.mov(Ntx, src2); e.mov(dest, (uint32_t)src1->get_constant(CT())); - vv_fn(e, *i, dest, Nax); + vv_fn(e, *i, dest, Ntx); } } else { e.mov(dest, src2); @@ -535,18 +536,18 @@ void IntBinaryOpCV(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn, // 64-bit. if (dest == src2) { if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { - e.mov(e.rax, src1->constant.i64); - vv_fn(e, *i, dest, e.rax); + e.mov(TEMP_REG, src1->constant.i64); + vv_fn(e, *i, dest, TEMP_REG); } else { // Eww. 
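+        // (rax is a poor staging register here: mul/div and the memory
+        //  sequences clobber it, hence the switch to TEMP_REG (r8) below.)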
- e.mov(e.rax, src1->constant.i64); - vv_fn(e, *i, e.rax, src2); - e.mov(dest, e.rax); + e.mov(TEMP_REG, src1->constant.i64); + vv_fn(e, *i, TEMP_REG, src2); + e.mov(dest, TEMP_REG); } } else { - e.mov(e.rax, src2); + e.mov(TEMP_REG, src2); e.mov(dest, src1->constant.i64); - vv_fn(e, *i, dest, e.rax); + vv_fn(e, *i, dest, TEMP_REG); } } e.EndOp(dest, src2); @@ -672,10 +673,10 @@ void IntTernaryOpVVC(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vvc_fn vvc_fn, vvc_fn(e, *i, dest, src1, (uint32_t)src3->get_constant(CT())); } else { // Eww. - auto Nax = NAX_LIKE(src2); - e.mov(Nax, src2); + auto Ntx = TEMP_LIKE(src2); + e.mov(Ntx, src2); e.mov(dest, src1); - vvc_fn(e, *i, dest, Nax, (uint32_t)src3->get_constant(CT())); + vvc_fn(e, *i, dest, Ntx, (uint32_t)src3->get_constant(CT())); } } else { e.mov(dest, src1); @@ -684,24 +685,24 @@ void IntTernaryOpVVC(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vvc_fn vvc_fn, } else { // 64-bit. if (dest == src1) { - e.mov(e.rax, src3->constant.i64); - vvv_fn(e, *i, dest, src2, e.rax); + e.mov(TEMP_REG, src3->constant.i64); + vvv_fn(e, *i, dest, src2, TEMP_REG); } else if (dest == src2) { if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { - e.mov(e.rax, src3->constant.i64); - vvv_fn(e, *i, dest, src1, e.rax); + e.mov(TEMP_REG, src3->constant.i64); + vvv_fn(e, *i, dest, src1, TEMP_REG); } else { // Eww. - e.mov(e.rax, src1); + e.mov(TEMP_REG, src1); e.mov(src1, src2); - e.mov(dest, e.rax); - e.mov(e.rax, src3->constant.i64); - vvv_fn(e, *i, dest, src1, e.rax); + e.mov(dest, TEMP_REG); + e.mov(TEMP_REG, src3->constant.i64); + vvv_fn(e, *i, dest, src1, TEMP_REG); } } else { - e.mov(e.rax, src3->constant.i64); + e.mov(TEMP_REG, src3->constant.i64); e.mov(dest, src1); - vvv_fn(e, *i, dest, src2, e.rax); + vvv_fn(e, *i, dest, src2, TEMP_REG); } } e.EndOp(dest, src1, src2); @@ -721,10 +722,10 @@ void IntTernaryOpVCV(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vcv_fn vcv_fn, vcv_fn(e, *i, dest, (uint32_t)src2->get_constant(CT()), src1); } else { // Eww. - auto Nax = NAX_LIKE(src3); - e.mov(Nax, src3); + auto Ntx = TEMP_LIKE(src3); + e.mov(Ntx, src3); e.mov(dest, src1); - vcv_fn(e, *i, dest, (uint32_t)src2->get_constant(CT()), Nax); + vcv_fn(e, *i, dest, (uint32_t)src2->get_constant(CT()), Ntx); } } else { e.mov(dest, src1); @@ -733,24 +734,24 @@ void IntTernaryOpVCV(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vcv_fn vcv_fn, } else { // 64-bit. if (dest == src1) { - e.mov(e.rax, src2->constant.i64); - vvv_fn(e, *i, dest, e.rax, src3); + e.mov(TEMP_REG, src2->constant.i64); + vvv_fn(e, *i, dest, TEMP_REG, src3); } else if (dest == src3) { if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { - e.mov(e.rax, src2->constant.i64); - vvv_fn(e, *i, dest, src1, e.rax); + e.mov(TEMP_REG, src2->constant.i64); + vvv_fn(e, *i, dest, src1, TEMP_REG); } else { // Eww. - e.mov(e.rax, src1); + e.mov(TEMP_REG, src1); e.mov(src1, src3); e.mov(dest, e.rax); - e.mov(e.rax, src2->constant.i64); - vvv_fn(e, *i, dest, e.rax, src1); + e.mov(TEMP_REG, src2->constant.i64); + vvv_fn(e, *i, dest, TEMP_REG, src1); } } else { e.mov(e.rax, src2->constant.i64); e.mov(dest, src1); - vvv_fn(e, *i, dest, e.rax, src3); + vvv_fn(e, *i, dest, TEMP_REG, src3); } } e.EndOp(dest, src1, src3); From 6eef76374df893c509cd73ce32df1ef03a2cc74b Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sat, 1 Feb 2014 00:05:53 -0800 Subject: [PATCH 055/184] Fix CNTLZ with 0. 
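bsr sets ZF and leaves its destination undefined when the source is
zero, so the zero fallback has to survive the xor that turns bsr's bit
index into a leading-zero count. The old code loaded the raw width
(16/32/64), which the final xor then mangled: CNTLZ of a zero 32-bit
value came out as 32 ^ 0x1F = 63 instead of 32. Pre-XORing the
fallback constant makes the xor cancel back to the width. The 32-bit
sequence is now, roughly:

    e.bsr(dest.cvt32(), src);      // ZF = 1 if src == 0
    e.mov(e.eax, 32 ^ 0x1F);       // fallback, pre-encoded as 63
    e.cmovz(dest.cvt32(), e.eax);  // taken only when src == 0
    e.xor(dest, 0x1F);             // index -> count; 63 ^ 31 == 32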
--- src/alloy/backend/x64/lowering/lowering_sequences.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index fd85ac495..c2610571e 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -2459,7 +2459,7 @@ table->AddSequence(OPCODE_CNTLZ, [](X64Emitter& e, Instr*& i) { i->src1.value, src, 0); e.bsr(dest.cvt16(), src.cvt16()); // ZF = 1 if zero - e.mov(e.eax, 16); + e.mov(e.eax, 16 ^ 0x7); e.cmovz(dest.cvt32(), e.eax); e.sub(dest, 8); e.xor(dest, 0x7); @@ -2471,7 +2471,7 @@ table->AddSequence(OPCODE_CNTLZ, [](X64Emitter& e, Instr*& i) { i->src1.value, src, 0); e.bsr(dest.cvt16(), src); // ZF = 1 if zero - e.mov(e.eax, 16); + e.mov(e.eax, 16 ^ 0xF); e.cmovz(dest.cvt32(), e.eax); e.xor(dest, 0xF); e.EndOp(dest, src); @@ -2482,7 +2482,7 @@ table->AddSequence(OPCODE_CNTLZ, [](X64Emitter& e, Instr*& i) { i->src1.value, src, 0); e.bsr(dest.cvt32(), src); // ZF = 1 if zero - e.mov(e.eax, 32); + e.mov(e.eax, 32 ^ 0x1F); e.cmovz(dest.cvt32(), e.eax); e.xor(dest, 0x1F); e.EndOp(dest, src); @@ -2493,7 +2493,7 @@ table->AddSequence(OPCODE_CNTLZ, [](X64Emitter& e, Instr*& i) { i->src1.value, src, 0); e.bsr(dest, src); // ZF = 1 if zero - e.mov(e.eax, 64); + e.mov(e.eax, 64 ^ 0x3F); e.cmovz(dest.cvt32(), e.eax); e.xor(dest, 0x3F); e.EndOp(dest, src); From f4d0eb03bc8132090643986ec10e1ebc58d8e09d Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sat, 1 Feb 2014 00:08:26 -0800 Subject: [PATCH 056/184] Fixing 64bit fneg. --- src/alloy/backend/x64/lowering/lowering_sequences.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index c2610571e..00489d38f 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -74,7 +74,7 @@ static const vec128_t xmm_consts[] = { /* XMMNormalizeX16Y16 */ vec128f(1.0f / 32767.0f, 1.0f / (32767.0f * 65536.0f), 0.0f, 0.0f), /* XMM3301 */ vec128f(3.0f, 3.0f, 0.0f, 1.0f), /* XMMSignMaskPS */ vec128i(0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u), - /* XMMSignMaskPD */ vec128i(0x80000000u, 0x00000000u, 0x80000000u, 0x00000000u), + /* XMMSignMaskPD */ vec128i(0x00000000u, 0x80000000u, 0x00000000u, 0x80000000u), /* XMMByteSwapMask */ vec128i(0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu), }; // Use consts by first loading the base register then accessing memory: From 68b5a0979ec2dccbaf8488dd4b8ddcb9c4af9920 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sat, 1 Feb 2014 00:12:36 -0800 Subject: [PATCH 057/184] Logging undefined extern calls. 
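CALL_EXTERN used to assert that an extern handler was registered and
then jump through it unconditionally, so hitting an unimplemented
export meant an assert failure (or a call through null). Unresolved
externs now lower to a plain native call that logs and returns:

    void UndefinedCallExtern(void* raw_context, FunctionInfo* symbol_info) {
      XELOGW("undefined extern call to %.8X %s",
             symbol_info->address(),
             symbol_info->name());
    }

Titles will likely still misbehave past that point, but the log now
names the export that needs implementing.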
--- .../x64/lowering/lowering_sequences.cc | 26 +++++++++++++------ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index 00489d38f..9b41948cc 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -103,6 +103,12 @@ void Dummy() { // } +void UndefinedCallExtern(void* raw_context, FunctionInfo* symbol_info) { + XELOGW("undefined extern call to %.8X %s", + symbol_info->address(), + symbol_info->name()); +} + uint64_t DynamicRegisterLoad(void* raw_context, uint32_t address) { auto thread_state = *((ThreadState**)raw_context); auto cbs = thread_state->runtime()->access_callbacks(); @@ -339,14 +345,18 @@ table->AddSequence(OPCODE_CALL_INDIRECT_TRUE, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_CALL_EXTERN, [](X64Emitter& e, Instr*& i) { auto symbol_info = i->src1.symbol_info; XEASSERT(symbol_info->behavior() == FunctionInfo::BEHAVIOR_EXTERN); - XEASSERTNOTNULL(symbol_info->extern_handler()); - // rdx = target host function - // r8 = arg0 - // r9 = arg1 - e.mov(e.rdx, (uint64_t)symbol_info->extern_handler()); - e.mov(e.r8, (uint64_t)symbol_info->extern_arg0()); - e.mov(e.r9, (uint64_t)symbol_info->extern_arg1()); - TransitionToHost(e); + if (!symbol_info->extern_handler()) { + e.mov(e.rdx, (uint64_t)symbol_info); + CallNative(e, UndefinedCallExtern); + } else { + // rdx = target host function + // r8 = arg0 + // r9 = arg1 + e.mov(e.rdx, (uint64_t)symbol_info->extern_handler()); + e.mov(e.r8, (uint64_t)symbol_info->extern_arg0()); + e.mov(e.r9, (uint64_t)symbol_info->extern_arg1()); + TransitionToHost(e); + } i = e.Advance(i); return true; }); From f524693dbbb1f8927703966750d951365ab0b09a Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sat, 1 Feb 2014 00:20:45 -0800 Subject: [PATCH 058/184] Unswapping IVM register accesses. 
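Now that the audio/graphics register callbacks byte-swap their own
values, the XESWAP16/32/64 wrappers around the MMIO callback reads and
writes in the interpreter were swapping everything a second time. Pass
the callback values straight through; the callbacks own byte order.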
--- src/alloy/backend/ivm/ivm_intcode.cc | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/alloy/backend/ivm/ivm_intcode.cc b/src/alloy/backend/ivm/ivm_intcode.cc index be1c9c206..02c7fb902 100644 --- a/src/alloy/backend/ivm/ivm_intcode.cc +++ b/src/alloy/backend/ivm/ivm_intcode.cc @@ -207,21 +207,21 @@ uint32_t IntCode_LOAD_REGISTER_I16(IntCodeState& ics, const IntCode* i) { uint64_t address = ics.rf[i->src1_reg].u32; RegisterAccessCallbacks* cbs = (RegisterAccessCallbacks*) (i->src2_reg | ((uint64_t)i->src3_reg << 32)); - ics.rf[i->dest_reg].i16 = XESWAP16((int16_t)cbs->read(cbs->context, address)); + ics.rf[i->dest_reg].i16 = (int16_t)cbs->read(cbs->context, address); return IA_NEXT; } uint32_t IntCode_LOAD_REGISTER_I32(IntCodeState& ics, const IntCode* i) { uint64_t address = ics.rf[i->src1_reg].u32; RegisterAccessCallbacks* cbs = (RegisterAccessCallbacks*) (i->src2_reg | ((uint64_t)i->src3_reg << 32)); - ics.rf[i->dest_reg].i32 = XESWAP32((int32_t)cbs->read(cbs->context, address)); + ics.rf[i->dest_reg].i32 = (int32_t)cbs->read(cbs->context, address); return IA_NEXT; } uint32_t IntCode_LOAD_REGISTER_I64(IntCodeState& ics, const IntCode* i) { uint64_t address = ics.rf[i->src1_reg].u32; RegisterAccessCallbacks* cbs = (RegisterAccessCallbacks*) (i->src2_reg | ((uint64_t)i->src3_reg << 32)); - ics.rf[i->dest_reg].i64 = XESWAP64((int64_t)cbs->read(cbs->context, address)); + ics.rf[i->dest_reg].i64 = (int64_t)cbs->read(cbs->context, address); return IA_NEXT; } int DispatchRegisterRead( @@ -267,7 +267,7 @@ uint32_t IntCode_LOAD_REGISTER_I16_DYNAMIC(IntCodeState& ics, const IntCode* i) RegisterAccessCallbacks* cbs = ics.access_callbacks; while (cbs) { if (cbs->handles(cbs->context, address)) { - ics.rf[i->dest_reg].i16 = XESWAP16((int16_t)cbs->read(cbs->context, address)); + ics.rf[i->dest_reg].i16 = (int16_t)cbs->read(cbs->context, address); return IA_NEXT; } cbs = cbs->next; @@ -279,7 +279,7 @@ uint32_t IntCode_LOAD_REGISTER_I32_DYNAMIC(IntCodeState& ics, const IntCode* i) RegisterAccessCallbacks* cbs = ics.access_callbacks; while (cbs) { if (cbs->handles(cbs->context, address)) { - ics.rf[i->dest_reg].i32 = XESWAP32((int32_t)cbs->read(cbs->context, address)); + ics.rf[i->dest_reg].i32 = (int32_t)cbs->read(cbs->context, address); return IA_NEXT; } cbs = cbs->next; @@ -291,7 +291,7 @@ uint32_t IntCode_LOAD_REGISTER_I64_DYNAMIC(IntCodeState& ics, const IntCode* i) RegisterAccessCallbacks* cbs = ics.access_callbacks; while (cbs) { if (cbs->handles(cbs->context, address)) { - ics.rf[i->dest_reg].i64 = XESWAP64((int64_t)cbs->read(cbs->context, address)); + ics.rf[i->dest_reg].i64 = (int64_t)cbs->read(cbs->context, address); return IA_NEXT; } cbs = cbs->next; @@ -310,21 +310,21 @@ uint32_t IntCode_STORE_REGISTER_I16(IntCodeState& ics, const IntCode* i) { uint64_t address = ics.rf[i->src1_reg].u32; RegisterAccessCallbacks* cbs = (RegisterAccessCallbacks*) (i->src3_reg | ((uint64_t)i->dest_reg << 32)); - cbs->write(cbs->context, address, XESWAP16(ics.rf[i->src2_reg].i16)); + cbs->write(cbs->context, address, ics.rf[i->src2_reg].i16); return IA_NEXT; } uint32_t IntCode_STORE_REGISTER_I32(IntCodeState& ics, const IntCode* i) { uint64_t address = ics.rf[i->src1_reg].u32; RegisterAccessCallbacks* cbs = (RegisterAccessCallbacks*) (i->src3_reg | ((uint64_t)i->dest_reg << 32)); - cbs->write(cbs->context, address, XESWAP32(ics.rf[i->src2_reg].i32)); + cbs->write(cbs->context, address, ics.rf[i->src2_reg].i32); return IA_NEXT; } uint32_t 
IntCode_STORE_REGISTER_I64(IntCodeState& ics, const IntCode* i) { uint64_t address = ics.rf[i->src1_reg].u32; RegisterAccessCallbacks* cbs = (RegisterAccessCallbacks*) (i->src3_reg | ((uint64_t)i->dest_reg << 32)); - cbs->write(cbs->context, address, XESWAP64(ics.rf[i->src2_reg].i64)); + cbs->write(cbs->context, address, ics.rf[i->src2_reg].i64); return IA_NEXT; } int DispatchRegisterWrite( @@ -370,7 +370,7 @@ uint32_t IntCode_STORE_REGISTER_I16_DYNAMIC(IntCodeState& ics, const IntCode* i) RegisterAccessCallbacks* cbs = ics.access_callbacks; while (cbs) { if (cbs->handles(cbs->context, address)) { - cbs->write(cbs->context, address, XESWAP16(ics.rf[i->src2_reg].i16)); + cbs->write(cbs->context, address, ics.rf[i->src2_reg].i16); return IA_NEXT; } cbs = cbs->next; @@ -382,7 +382,7 @@ uint32_t IntCode_STORE_REGISTER_I32_DYNAMIC(IntCodeState& ics, const IntCode* i) RegisterAccessCallbacks* cbs = ics.access_callbacks; while (cbs) { if (cbs->handles(cbs->context, address)) { - cbs->write(cbs->context, address, XESWAP32(ics.rf[i->src2_reg].i32)); + cbs->write(cbs->context, address, ics.rf[i->src2_reg].i32); return IA_NEXT; } cbs = cbs->next; @@ -394,7 +394,7 @@ uint32_t IntCode_STORE_REGISTER_I64_DYNAMIC(IntCodeState& ics, const IntCode* i) RegisterAccessCallbacks* cbs = ics.access_callbacks; while (cbs) { if (cbs->handles(cbs->context, address)) { - cbs->write(cbs->context, address, XESWAP64(ics.rf[i->src2_reg].i64)); + cbs->write(cbs->context, address, ics.rf[i->src2_reg].i64); return IA_NEXT; } cbs = cbs->next; From b1ab2fb0a7026361a8e585c5e9e2f0b98ad4b90f Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sat, 1 Feb 2014 00:41:36 -0800 Subject: [PATCH 059/184] That was a terrible idea. Let us never speak of it again. --- src/alloy/backend/ivm/ivm_intcode.cc | 24 +++++++-------- .../x64/lowering/lowering_sequences.cc | 29 +++++++++++++++++-- src/xenia/apu/audio_system.cc | 3 +- src/xenia/gpu/graphics_system.cc | 3 +- 4 files changed, 41 insertions(+), 18 deletions(-) diff --git a/src/alloy/backend/ivm/ivm_intcode.cc b/src/alloy/backend/ivm/ivm_intcode.cc index 02c7fb902..be1c9c206 100644 --- a/src/alloy/backend/ivm/ivm_intcode.cc +++ b/src/alloy/backend/ivm/ivm_intcode.cc @@ -207,21 +207,21 @@ uint32_t IntCode_LOAD_REGISTER_I16(IntCodeState& ics, const IntCode* i) { uint64_t address = ics.rf[i->src1_reg].u32; RegisterAccessCallbacks* cbs = (RegisterAccessCallbacks*) (i->src2_reg | ((uint64_t)i->src3_reg << 32)); - ics.rf[i->dest_reg].i16 = (int16_t)cbs->read(cbs->context, address); + ics.rf[i->dest_reg].i16 = XESWAP16((int16_t)cbs->read(cbs->context, address)); return IA_NEXT; } uint32_t IntCode_LOAD_REGISTER_I32(IntCodeState& ics, const IntCode* i) { uint64_t address = ics.rf[i->src1_reg].u32; RegisterAccessCallbacks* cbs = (RegisterAccessCallbacks*) (i->src2_reg | ((uint64_t)i->src3_reg << 32)); - ics.rf[i->dest_reg].i32 = (int32_t)cbs->read(cbs->context, address); + ics.rf[i->dest_reg].i32 = XESWAP32((int32_t)cbs->read(cbs->context, address)); return IA_NEXT; } uint32_t IntCode_LOAD_REGISTER_I64(IntCodeState& ics, const IntCode* i) { uint64_t address = ics.rf[i->src1_reg].u32; RegisterAccessCallbacks* cbs = (RegisterAccessCallbacks*) (i->src2_reg | ((uint64_t)i->src3_reg << 32)); - ics.rf[i->dest_reg].i64 = (int64_t)cbs->read(cbs->context, address); + ics.rf[i->dest_reg].i64 = XESWAP64((int64_t)cbs->read(cbs->context, address)); return IA_NEXT; } int DispatchRegisterRead( @@ -267,7 +267,7 @@ uint32_t IntCode_LOAD_REGISTER_I16_DYNAMIC(IntCodeState& ics, const IntCode* i) 
RegisterAccessCallbacks* cbs = ics.access_callbacks; while (cbs) { if (cbs->handles(cbs->context, address)) { - ics.rf[i->dest_reg].i16 = (int16_t)cbs->read(cbs->context, address); + ics.rf[i->dest_reg].i16 = XESWAP16((int16_t)cbs->read(cbs->context, address)); return IA_NEXT; } cbs = cbs->next; @@ -279,7 +279,7 @@ uint32_t IntCode_LOAD_REGISTER_I32_DYNAMIC(IntCodeState& ics, const IntCode* i) RegisterAccessCallbacks* cbs = ics.access_callbacks; while (cbs) { if (cbs->handles(cbs->context, address)) { - ics.rf[i->dest_reg].i32 = (int32_t)cbs->read(cbs->context, address); + ics.rf[i->dest_reg].i32 = XESWAP32((int32_t)cbs->read(cbs->context, address)); return IA_NEXT; } cbs = cbs->next; @@ -291,7 +291,7 @@ uint32_t IntCode_LOAD_REGISTER_I64_DYNAMIC(IntCodeState& ics, const IntCode* i) RegisterAccessCallbacks* cbs = ics.access_callbacks; while (cbs) { if (cbs->handles(cbs->context, address)) { - ics.rf[i->dest_reg].i64 = (int64_t)cbs->read(cbs->context, address); + ics.rf[i->dest_reg].i64 = XESWAP64((int64_t)cbs->read(cbs->context, address)); return IA_NEXT; } cbs = cbs->next; @@ -310,21 +310,21 @@ uint32_t IntCode_STORE_REGISTER_I16(IntCodeState& ics, const IntCode* i) { uint64_t address = ics.rf[i->src1_reg].u32; RegisterAccessCallbacks* cbs = (RegisterAccessCallbacks*) (i->src3_reg | ((uint64_t)i->dest_reg << 32)); - cbs->write(cbs->context, address, ics.rf[i->src2_reg].i16); + cbs->write(cbs->context, address, XESWAP16(ics.rf[i->src2_reg].i16)); return IA_NEXT; } uint32_t IntCode_STORE_REGISTER_I32(IntCodeState& ics, const IntCode* i) { uint64_t address = ics.rf[i->src1_reg].u32; RegisterAccessCallbacks* cbs = (RegisterAccessCallbacks*) (i->src3_reg | ((uint64_t)i->dest_reg << 32)); - cbs->write(cbs->context, address, ics.rf[i->src2_reg].i32); + cbs->write(cbs->context, address, XESWAP32(ics.rf[i->src2_reg].i32)); return IA_NEXT; } uint32_t IntCode_STORE_REGISTER_I64(IntCodeState& ics, const IntCode* i) { uint64_t address = ics.rf[i->src1_reg].u32; RegisterAccessCallbacks* cbs = (RegisterAccessCallbacks*) (i->src3_reg | ((uint64_t)i->dest_reg << 32)); - cbs->write(cbs->context, address, ics.rf[i->src2_reg].i64); + cbs->write(cbs->context, address, XESWAP64(ics.rf[i->src2_reg].i64)); return IA_NEXT; } int DispatchRegisterWrite( @@ -370,7 +370,7 @@ uint32_t IntCode_STORE_REGISTER_I16_DYNAMIC(IntCodeState& ics, const IntCode* i) RegisterAccessCallbacks* cbs = ics.access_callbacks; while (cbs) { if (cbs->handles(cbs->context, address)) { - cbs->write(cbs->context, address, ics.rf[i->src2_reg].i16); + cbs->write(cbs->context, address, XESWAP16(ics.rf[i->src2_reg].i16)); return IA_NEXT; } cbs = cbs->next; @@ -382,7 +382,7 @@ uint32_t IntCode_STORE_REGISTER_I32_DYNAMIC(IntCodeState& ics, const IntCode* i) RegisterAccessCallbacks* cbs = ics.access_callbacks; while (cbs) { if (cbs->handles(cbs->context, address)) { - cbs->write(cbs->context, address, ics.rf[i->src2_reg].i32); + cbs->write(cbs->context, address, XESWAP32(ics.rf[i->src2_reg].i32)); return IA_NEXT; } cbs = cbs->next; @@ -394,7 +394,7 @@ uint32_t IntCode_STORE_REGISTER_I64_DYNAMIC(IntCodeState& ics, const IntCode* i) RegisterAccessCallbacks* cbs = ics.access_callbacks; while (cbs) { if (cbs->handles(cbs->context, address)) { - cbs->write(cbs->context, address, ics.rf[i->src2_reg].i64); + cbs->write(cbs->context, address, XESWAP64(ics.rf[i->src2_reg].i64)); return IA_NEXT; } cbs = cbs->next; diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index 
9b41948cc..c6f0c305f 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -1055,6 +1055,20 @@ table->AddSequence(OPCODE_LOAD, [](X64Emitter& e, Instr*& i) { e.mov(e.rcx, (uint64_t)cbs->context); e.mov(e.rdx, i.src1.value->AsUint64()); CallNative(e, cbs->read); + switch (i.dest->type) { + case INT8_TYPE: + break; + case INT16_TYPE: + e.xchg(e.al, e.ah); + break; + case INT32_TYPE: + e.bswap(e.eax); + break; + case INT64_TYPE: + e.bswap(e.rax); + break; + default: ASSERT_INVALID_TYPE(); break; + } e.mov(dest_src, e.rax); }); i = e.Advance(i); @@ -1092,12 +1106,15 @@ table->AddSequence(OPCODE_LOAD, [](X64Emitter& e, Instr*& i) { e.movzx(dyn_dest, e.al); break; case INT16_TYPE: + e.xchg(e.al, e.ah); e.movzx(dyn_dest, e.ax); break; case INT32_TYPE: + e.bswap(e.eax); e.mov(dyn_dest.cvt32(), e.eax); break; case INT64_TYPE: + e.bswap(e.rax); e.mov(dyn_dest, e.rax); break; default: @@ -1217,13 +1234,17 @@ table->AddSequence(OPCODE_STORE, [](X64Emitter& e, Instr*& i) { e.movzx(e.r8d, src2.cvt8()); break; case INT16_TYPE: - e.movzx(e.r8d, src2.cvt16()); + e.movzx(e.rax, src2.cvt16()); + e.xchg(e.al, e.ah); + e.mov(e.r8, e.rax); break; case INT32_TYPE: e.movzx(e.r8, src2.cvt32()); + e.bswap(e.r8d); break; case INT64_TYPE: e.mov(e.r8, src2); + e.bswap(e.r8); break; default: ASSERT_INVALID_TYPE(); break; } @@ -1263,13 +1284,17 @@ table->AddSequence(OPCODE_STORE, [](X64Emitter& e, Instr*& i) { e.movzx(e.r8, dyn_src.cvt8()); break; case INT16_TYPE: - e.movzx(e.r8, dyn_src.cvt16()); + e.movzx(e.rax, dyn_src.cvt16()); + e.xchg(e.al, e.ah); + e.mov(e.r8, e.rax); break; case INT32_TYPE: e.mov(e.r8d, dyn_src.cvt32()); + e.bswap(e.r8d); break; case INT64_TYPE: e.mov(e.r8, dyn_src); + e.bswap(e.r8); break; default: e.db(0xCC); diff --git a/src/xenia/apu/audio_system.cc b/src/xenia/apu/audio_system.cc index 46b0b3924..1793fc92d 100644 --- a/src/xenia/apu/audio_system.cc +++ b/src/xenia/apu/audio_system.cc @@ -187,11 +187,10 @@ uint64_t AudioSystem::ReadRegister(uint64_t addr) { XELOGAPU("ReadRegister(%.4X)", r); // 1800h is read on startup and stored -- context? buffers? // 1818h is read during a lock? - return XESWAP32BE(0); + return 0; } void AudioSystem::WriteRegister(uint64_t addr, uint64_t value) { - value = XESWAP32BE((uint32_t)value); uint32_t r = addr & 0xFFFF; XELOGAPU("WriteRegister(%.4X, %.8X)", r, value); // 1804h is written to with 0x02000000 and 0x03000000 around a lock operation diff --git a/src/xenia/gpu/graphics_system.cc b/src/xenia/gpu/graphics_system.cc index 524ec467b..fbcb1d744 100644 --- a/src/xenia/gpu/graphics_system.cc +++ b/src/xenia/gpu/graphics_system.cc @@ -152,11 +152,10 @@ uint64_t GraphicsSystem::ReadRegister(uint64_t addr) { } XEASSERT(r >= 0 && r < kXEGpuRegisterCount); - return XESWAP32BE(regs->values[r].u32); + return regs->values[r].u32; } void GraphicsSystem::WriteRegister(uint64_t addr, uint64_t value) { - value = XESWAP32BE((uint32_t)value); uint32_t r = addr & 0xFFFF; XELOGGPU("WriteRegister(%.4X, %.8X)", r, value); From 91a43a0ab6cf263661ca4c006cc17171b95b0fb1 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sat, 1 Feb 2014 01:13:05 -0800 Subject: [PATCH 060/184] Fixing broken temp reg. 
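The rax -> TEMP_REG (r8) conversion in op_utils.inl missed two spots
in IntTernaryOpVCV: the non-commutative swap path still read the saved
value back out of e.rax, and the final fallback still staged the
constant there. Both now go through TEMP_REG, so the temporary
survives sequences (mul/div and the memory helpers) that treat rax as
scratch.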
--- src/alloy/backend/x64/lowering/op_utils.inl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/alloy/backend/x64/lowering/op_utils.inl b/src/alloy/backend/x64/lowering/op_utils.inl index afc075ce9..ef0be4ab1 100644 --- a/src/alloy/backend/x64/lowering/op_utils.inl +++ b/src/alloy/backend/x64/lowering/op_utils.inl @@ -744,12 +744,12 @@ void IntTernaryOpVCV(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vcv_fn vcv_fn, // Eww. e.mov(TEMP_REG, src1); e.mov(src1, src3); - e.mov(dest, e.rax); + e.mov(dest, TEMP_REG); e.mov(TEMP_REG, src2->constant.i64); vvv_fn(e, *i, dest, TEMP_REG, src1); } } else { - e.mov(e.rax, src2->constant.i64); + e.mov(TEMP_REG, src2->constant.i64); e.mov(dest, src1); vvv_fn(e, *i, dest, TEMP_REG, src3); } From b29276e1671dfb1814541c201387da01ac74e81c Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sat, 1 Feb 2014 01:13:23 -0800 Subject: [PATCH 061/184] Reserve address cleanup. --- src/alloy/backend/ivm/ivm_function.cc | 1 - src/alloy/backend/ivm/ivm_intcode.h | 1 - src/alloy/memory.cc | 2 +- 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/alloy/backend/ivm/ivm_function.cc b/src/alloy/backend/ivm/ivm_function.cc index 8cd6835d6..335dc7c87 100644 --- a/src/alloy/backend/ivm/ivm_function.cc +++ b/src/alloy/backend/ivm/ivm_function.cc @@ -115,7 +115,6 @@ int IVMFunction::CallImpl(ThreadState* thread_state) { ics.rf = register_file; ics.context = (uint8_t*)thread_state->raw_context(); ics.membase = memory->membase(); - ics.reserve_address = memory->reserve_address(); ics.did_carry = 0; ics.did_saturate = 0; ics.access_callbacks = thread_state->runtime()->access_callbacks(); diff --git a/src/alloy/backend/ivm/ivm_intcode.h b/src/alloy/backend/ivm/ivm_intcode.h index 9f361b2f9..d5ba5cfec 100644 --- a/src/alloy/backend/ivm/ivm_intcode.h +++ b/src/alloy/backend/ivm/ivm_intcode.h @@ -43,7 +43,6 @@ typedef struct { Register* rf; uint8_t* context; uint8_t* membase; - uint32_t* reserve_address; int8_t did_carry; int8_t did_saturate; runtime::RegisterAccessCallbacks* access_callbacks; diff --git a/src/alloy/memory.cc b/src/alloy/memory.cc index 4948d22d6..8b4eaa2e3 100644 --- a/src/alloy/memory.cc +++ b/src/alloy/memory.cc @@ -13,7 +13,7 @@ using namespace alloy; Memory::Memory() : - membase_(0) { + membase_(0), reserve_address_(0) { SYSTEM_INFO si; GetSystemInfo(&si); system_page_size_ = si.dwPageSize; From bca349b302152e798c73c224d966d226b1523a6b Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 2 Feb 2014 00:33:57 -0800 Subject: [PATCH 062/184] Oh my. Basic CFA/DFA, local variable support, misc fixes, etc. 
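Grab bag, but the core is locals support end to end:

- control_flow_analysis_pass builds incoming/outgoing edge lists from
  branch instructions and flags an edge as dominating when it is a
  block's only incoming edge.
- data_flow_analysis_pass numbers blocks (list order for now), then
  walks them in reverse tracking cross-block values with
  llvm::BitVector. Each escaping value gets a stack local: a
  STORE_LOCAL placed right after its def (or before the block's
  trailing branches) and a LOAD_LOCAL at the head of each consuming
  block, with uses renamed to the loaded copy.
- New LOAD_LOCAL/STORE_LOCAL opcodes in both backends: the IVM gives
  each call an alloca'd locals buffer, while x64 uses rsp-relative
  slots aligned to each type's natural size.
- validation_pass (behind the new --validate_hir flag) sanity-checks
  block/label/instr back-links while all this is being brought up.
- dead_code_elimination learns to drop store/load pairs for locals
  that are only ever passed through, and to erase unused locals.
- Pulls in just enough LLVM (BitVector and its support headers) to
  back the analysis; see third_party/llvm.
- x64 ATOMIC_EXCHANGE now emits an explicit lock prefix.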
--- src/alloy/alloy-private.h | 2 + src/alloy/alloy.cc | 4 +- src/alloy/backend/ivm/ivm_assembler.cc | 9 + src/alloy/backend/ivm/ivm_function.cc | 3 + src/alloy/backend/ivm/ivm_function.h | 7 +- src/alloy/backend/ivm/ivm_intcode.cc | 85 +++ src/alloy/backend/ivm/ivm_intcode.h | 2 + .../x64/lowering/lowering_sequences.cc | 112 ++++ src/alloy/backend/x64/x64_emitter.cc | 15 + src/alloy/compiler/compiler_passes.h | 3 + .../passes/control_flow_analysis_pass.cc | 72 ++ .../passes/control_flow_analysis_pass.h | 37 ++ .../passes/data_flow_analysis_pass.cc | 203 ++++++ .../compiler/passes/data_flow_analysis_pass.h | 39 ++ .../passes/dead_code_elimination_pass.cc | 69 +- .../passes/dead_code_elimination_pass.h | 1 + src/alloy/compiler/passes/sources.gypi | 6 + src/alloy/compiler/passes/validation_pass.cc | 99 +++ src/alloy/compiler/passes/validation_pass.h | 39 ++ .../compiler/passes/value_reduction_pass.cc | 17 +- src/alloy/frontend/ppc/ppc_translator.cc | 26 +- src/alloy/hir/block.h | 26 + src/alloy/hir/hir_builder.cc | 83 ++- src/alloy/hir/hir_builder.h | 16 +- src/alloy/hir/instr.cc | 30 + src/alloy/hir/instr.h | 1 + src/alloy/hir/opcodes.h | 4 + src/alloy/hir/opcodes.inl | 18 +- src/alloy/hir/value.h | 20 + third_party/llvm.gypi | 35 + third_party/llvm/LICENSE.txt | 71 ++ third_party/llvm/dummy.cc | 1 + third_party/llvm/include/llvm/ADT/BitVector.h | 602 +++++++++++++++++ .../llvm/include/llvm/Support/Compiler.h | 446 +++++++++++++ .../llvm/include/llvm/Support/MathExtras.h | 626 ++++++++++++++++++ .../llvm/include/llvm/Support/type_traits.h | 244 +++++++ xenia.gyp | 3 + 37 files changed, 3048 insertions(+), 28 deletions(-) create mode 100644 src/alloy/compiler/passes/control_flow_analysis_pass.cc create mode 100644 src/alloy/compiler/passes/control_flow_analysis_pass.h create mode 100644 src/alloy/compiler/passes/data_flow_analysis_pass.cc create mode 100644 src/alloy/compiler/passes/data_flow_analysis_pass.h create mode 100644 src/alloy/compiler/passes/validation_pass.cc create mode 100644 src/alloy/compiler/passes/validation_pass.h create mode 100644 third_party/llvm.gypi create mode 100644 third_party/llvm/LICENSE.txt create mode 100644 third_party/llvm/dummy.cc create mode 100644 third_party/llvm/include/llvm/ADT/BitVector.h create mode 100644 third_party/llvm/include/llvm/Support/Compiler.h create mode 100644 third_party/llvm/include/llvm/Support/MathExtras.h create mode 100644 third_party/llvm/include/llvm/Support/type_traits.h diff --git a/src/alloy/alloy-private.h b/src/alloy/alloy-private.h index 213b4bfad..a22be71c4 100644 --- a/src/alloy/alloy-private.h +++ b/src/alloy/alloy-private.h @@ -18,6 +18,8 @@ DECLARE_bool(debug); DECLARE_bool(always_disasm); +DECLARE_bool(validate_hir); + DECLARE_uint64(break_on_instruction); DECLARE_uint64(break_on_memory); diff --git a/src/alloy/alloy.cc b/src/alloy/alloy.cc index 714036f71..bae955976 100644 --- a/src/alloy/alloy.cc +++ b/src/alloy/alloy.cc @@ -21,10 +21,12 @@ using namespace alloy; DEFINE_bool(debug, DEFAULT_DEBUG_FLAG, "Allow debugging and retain debug information."); - DEFINE_bool(always_disasm, false, "Always add debug info to functions, even when no debugger is attached."); +DEFINE_bool(validate_hir, false, + "Perform validation checks on the HIR during compilation."); + // Breakpoints: DEFINE_uint64(break_on_instruction, 0, "int3 before the given guest address is executed."); diff --git a/src/alloy/backend/ivm/ivm_assembler.cc b/src/alloy/backend/ivm/ivm_assembler.cc index d2b08b964..b869d41ef 100644 --- 
a/src/alloy/backend/ivm/ivm_assembler.cc +++ b/src/alloy/backend/ivm/ivm_assembler.cc @@ -74,6 +74,15 @@ int IVMAssembler::Assemble( builder->ResetLabelTags(); // Function prologue. + size_t stack_size = 0; + auto locals = builder->locals(); + for (auto it = locals.begin(); it != locals.end(); ++it) { + auto slot = *it; + size_t stack_offset = stack_size; + slot->set_constant(stack_offset); + stack_size += GetTypeSize(slot->type); + } + ctx.stack_size = stack_size; auto block = builder->first_block(); while (block) { diff --git a/src/alloy/backend/ivm/ivm_function.cc b/src/alloy/backend/ivm/ivm_function.cc index 335dc7c87..701cbac1c 100644 --- a/src/alloy/backend/ivm/ivm_function.cc +++ b/src/alloy/backend/ivm/ivm_function.cc @@ -33,6 +33,7 @@ IVMFunction::~IVMFunction() { void IVMFunction::Setup(TranslationContext& ctx) { register_count_ = ctx.register_count; + stack_size_ = ctx.stack_size; intcode_count_ = ctx.intcode_count; intcodes_ = (IntCode*)ctx.intcode_arena->CloneContents(); source_map_count_ = ctx.source_map_count; @@ -108,11 +109,13 @@ int IVMFunction::CallImpl(ThreadState* thread_state) { // Setup register file on stack. auto stack = (IVMStack*)thread_state->backend_data(); auto register_file = (Register*)stack->Alloc(register_count_); + auto local_stack = (uint8_t*)alloca(stack_size_); Memory* memory = thread_state->memory(); IntCodeState ics; ics.rf = register_file; + ics.locals = local_stack; ics.context = (uint8_t*)thread_state->raw_context(); ics.membase = memory->membase(); ics.did_carry = 0; diff --git a/src/alloy/backend/ivm/ivm_function.h b/src/alloy/backend/ivm/ivm_function.h index c7da76f89..7ee24cddf 100644 --- a/src/alloy/backend/ivm/ivm_function.h +++ b/src/alloy/backend/ivm/ivm_function.h @@ -38,9 +38,10 @@ private: void OnBreakpointHit(runtime::ThreadState* thread_state, IntCode* i); private: - size_t register_count_; - size_t intcode_count_; - IntCode* intcodes_; + size_t register_count_; + size_t stack_size_; + size_t intcode_count_; + IntCode* intcodes_; size_t source_map_count_; SourceMapEntry* source_map_; }; diff --git a/src/alloy/backend/ivm/ivm_intcode.cc b/src/alloy/backend/ivm/ivm_intcode.cc index be1c9c206..f1460e2c9 100644 --- a/src/alloy/backend/ivm/ivm_intcode.cc +++ b/src/alloy/backend/ivm/ivm_intcode.cc @@ -1342,6 +1342,88 @@ int Translate_LOAD_CLOCK(TranslationContext& ctx, Instr* i) { return DispatchToC(ctx, i, IntCode_LOAD_CLOCK); } +uint32_t IntCode_LOAD_LOCAL_I8(IntCodeState& ics, const IntCode* i) { + ics.rf[i->dest_reg].i8 = *((int8_t*)(ics.locals + ics.rf[i->src1_reg].u64)); + return IA_NEXT; +} +uint32_t IntCode_LOAD_LOCAL_I16(IntCodeState& ics, const IntCode* i) { + ics.rf[i->dest_reg].i16 = *((int16_t*)(ics.locals + ics.rf[i->src1_reg].u64)); + return IA_NEXT; +} +uint32_t IntCode_LOAD_LOCAL_I32(IntCodeState& ics, const IntCode* i) { + ics.rf[i->dest_reg].i32 = *((int32_t*)(ics.locals + ics.rf[i->src1_reg].u64)); + return IA_NEXT; +} +uint32_t IntCode_LOAD_LOCAL_I64(IntCodeState& ics, const IntCode* i) { + ics.rf[i->dest_reg].i64 = *((int64_t*)(ics.locals + ics.rf[i->src1_reg].u64)); + return IA_NEXT; +} +uint32_t IntCode_LOAD_LOCAL_F32(IntCodeState& ics, const IntCode* i) { + ics.rf[i->dest_reg].f32 = *((float*)(ics.locals + ics.rf[i->src1_reg].u64)); + return IA_NEXT; +} +uint32_t IntCode_LOAD_LOCAL_F64(IntCodeState& ics, const IntCode* i) { + ics.rf[i->dest_reg].f64 = *((double*)(ics.locals + ics.rf[i->src1_reg].u64)); + return IA_NEXT; +} +uint32_t IntCode_LOAD_LOCAL_V128(IntCodeState& ics, const IntCode* i) { + 
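+  // (Note: the IVM assembler packs local slots back to back with no
+  //  alignment, so this vec128_t access may be unaligned; that is fine
+  //  for the interpreter, and the x64 backend aligns its own slots.)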
ics.rf[i->dest_reg].v128 = *((vec128_t*)(ics.locals + ics.rf[i->src1_reg].u64)); + return IA_NEXT; +} +int Translate_LOAD_LOCAL(TranslationContext& ctx, Instr* i) { + static IntCodeFn fns[] = { + IntCode_LOAD_LOCAL_I8, + IntCode_LOAD_LOCAL_I16, + IntCode_LOAD_LOCAL_I32, + IntCode_LOAD_LOCAL_I64, + IntCode_LOAD_LOCAL_F32, + IntCode_LOAD_LOCAL_F64, + IntCode_LOAD_LOCAL_V128, + }; + return DispatchToC(ctx, i, fns[i->dest->type]); +} + +uint32_t IntCode_STORE_LOCAL_I8(IntCodeState& ics, const IntCode* i) { + *((int8_t*)(ics.locals + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].i8; + return IA_NEXT; +} +uint32_t IntCode_STORE_LOCAL_I16(IntCodeState& ics, const IntCode* i) { + *((int16_t*)(ics.locals + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].i16; + return IA_NEXT; +} +uint32_t IntCode_STORE_LOCAL_I32(IntCodeState& ics, const IntCode* i) { + *((int32_t*)(ics.locals + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].i32; + return IA_NEXT; +} +uint32_t IntCode_STORE_LOCAL_I64(IntCodeState& ics, const IntCode* i) { + *((int64_t*)(ics.locals + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].i64; + return IA_NEXT; +} +uint32_t IntCode_STORE_LOCAL_F32(IntCodeState& ics, const IntCode* i) { + *((float*)(ics.locals + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].f32; + return IA_NEXT; +} +uint32_t IntCode_STORE_LOCAL_F64(IntCodeState& ics, const IntCode* i) { + *((double*)(ics.locals + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].f64; + return IA_NEXT; +} +uint32_t IntCode_STORE_LOCAL_V128(IntCodeState& ics, const IntCode* i) { + *((vec128_t*)(ics.locals + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].v128; + return IA_NEXT; +} +int Translate_STORE_LOCAL(TranslationContext& ctx, Instr* i) { + static IntCodeFn fns[] = { + IntCode_STORE_LOCAL_I8, + IntCode_STORE_LOCAL_I16, + IntCode_STORE_LOCAL_I32, + IntCode_STORE_LOCAL_I64, + IntCode_STORE_LOCAL_F32, + IntCode_STORE_LOCAL_F64, + IntCode_STORE_LOCAL_V128, + }; + return DispatchToC(ctx, i, fns[i->src2.value->type]); +} + uint32_t IntCode_LOAD_CONTEXT_I8(IntCodeState& ics, const IntCode* i) { ics.rf[i->dest_reg].i8 = *((int8_t*)(ics.context + ics.rf[i->src1_reg].u64)); DPRINT("%d (%X) = ctx i8 +%d\n", ics.rf[i->dest_reg].i8, ics.rf[i->dest_reg].u8, ics.rf[i->src1_reg].u64); @@ -4039,6 +4121,9 @@ static const TranslateFn dispatch_table[] = { Translate_LOAD_CLOCK, + Translate_LOAD_LOCAL, + Translate_STORE_LOCAL, + Translate_LOAD_CONTEXT, Translate_STORE_CONTEXT, diff --git a/src/alloy/backend/ivm/ivm_intcode.h b/src/alloy/backend/ivm/ivm_intcode.h index d5ba5cfec..dcb59c106 100644 --- a/src/alloy/backend/ivm/ivm_intcode.h +++ b/src/alloy/backend/ivm/ivm_intcode.h @@ -41,6 +41,7 @@ typedef union { typedef struct { Register* rf; + uint8_t* locals; uint8_t* context; uint8_t* membase; int8_t did_carry; @@ -103,6 +104,7 @@ typedef struct { Arena* source_map_arena; Arena* scratch_arena; LabelRef* label_ref_head; + size_t stack_size; } TranslationContext; diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index c6f0c305f..d779d731c 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -815,6 +815,117 @@ table->AddSequence(OPCODE_LOAD_CLOCK, [](X64Emitter& e, Instr*& i) { return true; }); +// -------------------------------------------------------------------------- +// Stack Locals +// -------------------------------------------------------------------------- + +table->AddSequence(OPCODE_LOAD_LOCAL, 
[](X64Emitter& e, Instr*& i) { + auto addr = e.rsp + i->src1.value->AsUint32(); + if (i->Match(SIG_TYPE_I8, SIG_TYPE_IGNORE)) { + Reg8 dest; + e.BeginOp(i->dest, dest, REG_DEST); + e.mov(dest, e.byte[addr]); + e.EndOp(dest); + } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_IGNORE)) { + Reg16 dest; + e.BeginOp(i->dest, dest, REG_DEST); + e.mov(dest, e.word[addr]); + e.EndOp(dest); + } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_IGNORE)) { + Reg32 dest; + e.BeginOp(i->dest, dest, REG_DEST); + e.mov(dest, e.dword[addr]); + e.EndOp(dest); + } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_IGNORE)) { + Reg64 dest; + e.BeginOp(i->dest, dest, REG_DEST); + e.mov(dest, e.qword[addr]); + e.EndOp(dest); + } else if (i->Match(SIG_TYPE_F32, SIG_TYPE_IGNORE)) { + Xmm dest; + e.BeginOp(i->dest, dest, REG_DEST); + e.movss(dest, e.dword[addr]); + e.EndOp(dest); + } else if (i->Match(SIG_TYPE_F64, SIG_TYPE_IGNORE)) { + Xmm dest; + e.BeginOp(i->dest, dest, REG_DEST); + e.movsd(dest, e.qword[addr]); + e.EndOp(dest); + } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_IGNORE)) { + Xmm dest; + e.BeginOp(i->dest, dest, REG_DEST); + // NOTE: we always know we are aligned. + e.movaps(dest, e.ptr[addr]); + e.EndOp(dest); + } else { + ASSERT_INVALID_TYPE(); + } + i = e.Advance(i); + return true; +}); + +table->AddSequence(OPCODE_STORE_LOCAL, [](X64Emitter& e, Instr*& i) { + auto addr = e.rsp + i->src1.value->AsUint32(); + if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I8)) { + Reg8 src; + e.BeginOp(i->src2.value, src, 0); + e.mov(e.byte[addr], src); + e.EndOp(src); + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I8C)) { + e.mov(e.byte[addr], i->src2.value->constant.i8); + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16)) { + Reg16 src; + e.BeginOp(i->src2.value, src, 0); + e.mov(e.word[addr], src); + e.EndOp(src); + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16C)) { + e.mov(e.word[addr], i->src2.value->constant.i16); + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32)) { + Reg32 src; + e.BeginOp(i->src2.value, src, 0); + e.mov(e.dword[addr], src); + e.EndOp(src); + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32C)) { + e.mov(e.dword[addr], i->src2.value->constant.i32); + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64)) { + Reg64 src; + e.BeginOp(i->src2.value, src, 0); + e.mov(e.qword[addr], src); + e.EndOp(src); + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64C)) { + MovMem64(e, addr, i->src2.value->constant.i64); + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32)) { + Xmm src; + e.BeginOp(i->src2.value, src, 0); + e.movss(e.dword[addr], src); + e.EndOp(src); + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32C)) { + e.mov(e.dword[addr], i->src2.value->constant.i32); + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64)) { + Xmm src; + e.BeginOp(i->src2.value, src, 0); + e.movsd(e.qword[addr], src); + e.EndOp(src); + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64C)) { + MovMem64(e, addr, i->src2.value->constant.i64); + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128)) { + Xmm src; + e.BeginOp(i->src2.value, src, 0); + // NOTE: we always know we are aligned. + e.movaps(e.ptr[addr], src); + e.EndOp(src); + } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128C)) { + // TODO(benvanik): check zero + // TODO(benvanik): correct order? 
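+    // (Assumed layout: constant.v128.low is the first 8 bytes in memory
+    //  and .high the second; if that is backwards, these two stores swap.)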
+ MovMem64(e, addr, i->src2.value->constant.v128.low); + MovMem64(e, addr + 8, i->src2.value->constant.v128.high); + } else { + ASSERT_INVALID_TYPE(); + } + i = e.Advance(i); + return true; +}); + // -------------------------------------------------------------------------- // Context // -------------------------------------------------------------------------- @@ -2892,6 +3003,7 @@ table->AddSequence(OPCODE_ATOMIC_EXCHANGE, [](X64Emitter& e, Instr*& i) { i->src1.value, src1, 0, i->src2.value, src2, 0); e.mov(dest, src2); + e.lock(); e.xchg(e.dword[src1], dest); e.EndOp(dest, src1, src2); } else { diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index 1e938b084..4d441673f 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -97,6 +97,8 @@ void* X64Emitter::Emplace(size_t stack_size) { return new_address; } +#define XEALIGN(value, align) ((value + align - 1) & ~(align - 1)) + int X64Emitter::Emit(HIRBuilder* builder) { // These are the registers we will not be using. All others are fare game. const uint32_t reserved_regs = @@ -120,6 +122,19 @@ int X64Emitter::Emit(HIRBuilder* builder) { GetRegBit(xmm4) | GetRegBit(xmm5); + // Calculate stack size. We need to align things to their natural sizes. + // This could be much better (sort by type/etc). + auto locals = builder->locals(); + size_t stack_offset = 0; + for (auto it = locals.begin(); it != locals.end(); ++it) { + auto slot = *it; + size_t type_size = GetTypeSize(slot->type); + // Align to natural size. + stack_offset = XEALIGN(stack_offset, type_size); + slot->set_constant(stack_offset); + stack_offset += type_size; + } + // Function prolog. // Must be 16b aligned. // Windows is very strict about the form of this and the epilog: diff --git a/src/alloy/compiler/compiler_passes.h b/src/alloy/compiler/compiler_passes.h index 200159ac2..ca074e221 100644 --- a/src/alloy/compiler/compiler_passes.h +++ b/src/alloy/compiler/compiler_passes.h @@ -11,11 +11,14 @@ #define ALLOY_COMPILER_COMPILER_PASSES_H_ #include +#include #include +#include #include #include //#include #include +#include #include // TODO: diff --git a/src/alloy/compiler/passes/control_flow_analysis_pass.cc b/src/alloy/compiler/passes/control_flow_analysis_pass.cc new file mode 100644 index 000000000..5e73bd502 --- /dev/null +++ b/src/alloy/compiler/passes/control_flow_analysis_pass.cc @@ -0,0 +1,72 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include +#include +#include + +#pragma warning(push) +#pragma warning(disable : 4244) +#pragma warning(disable : 4267) +#include +#pragma warning(pop) + +using namespace alloy; +using namespace alloy::backend; +using namespace alloy::compiler; +using namespace alloy::compiler::passes; +using namespace alloy::frontend; +using namespace alloy::hir; +using namespace alloy::runtime; + + +ControlFlowAnalysisPass::ControlFlowAnalysisPass() : + CompilerPass() { +} + +ControlFlowAnalysisPass::~ControlFlowAnalysisPass() { +} + +int ControlFlowAnalysisPass::Run(HIRBuilder* builder) { + // TODO(benvanik): reset edges for all blocks? Needed to be re-runnable. 
+ + // Add edges. + auto block = builder->first_block(); + while (block) { + auto instr = block->instr_head; + while (instr) { + if (instr->opcode->flags & OPCODE_FLAG_BRANCH) { + if (instr->opcode == &OPCODE_BRANCH_info) { + auto label = instr->src1.label; + builder->AddEdge(block, label->block, Edge::UNCONDITIONAL); + } else if (instr->opcode == &OPCODE_BRANCH_TRUE_info || + instr->opcode == &OPCODE_BRANCH_FALSE_info) { + auto label = instr->src2.label; + builder->AddEdge(block, label->block, 0); + } + } + instr = instr->next; + } + block = block->next; + } + + // Mark dominators. + block = builder->first_block(); + while (block) { + if (block->incoming_edge_head && + !block->incoming_edge_head->incoming_next) { + block->incoming_edge_head->flags |= Edge::DOMINATES; + } + block = block->next; + } + + return 0; +} diff --git a/src/alloy/compiler/passes/control_flow_analysis_pass.h b/src/alloy/compiler/passes/control_flow_analysis_pass.h new file mode 100644 index 000000000..c639db5cb --- /dev/null +++ b/src/alloy/compiler/passes/control_flow_analysis_pass.h @@ -0,0 +1,37 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef ALLOY_COMPILER_PASSES_CONTROL_FLOW_ANALYSIS_PASS_H_ +#define ALLOY_COMPILER_PASSES_CONTROL_FLOW_ANALYSIS_PASS_H_ + +#include + + +namespace alloy { +namespace compiler { +namespace passes { + + +class ControlFlowAnalysisPass : public CompilerPass { +public: + ControlFlowAnalysisPass(); + virtual ~ControlFlowAnalysisPass(); + + virtual int Run(hir::HIRBuilder* builder); + +private: +}; + + +} // namespace passes +} // namespace compiler +} // namespace alloy + + +#endif // ALLOY_COMPILER_PASSES_CONTROL_FLOW_ANALYSIS_PASS_H_ diff --git a/src/alloy/compiler/passes/data_flow_analysis_pass.cc b/src/alloy/compiler/passes/data_flow_analysis_pass.cc new file mode 100644 index 000000000..8501d1675 --- /dev/null +++ b/src/alloy/compiler/passes/data_flow_analysis_pass.cc @@ -0,0 +1,203 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include +#include +#include + +#pragma warning(push) +#pragma warning(disable : 4244) +#pragma warning(disable : 4267) +#include +#pragma warning(pop) + +using namespace alloy; +using namespace alloy::backend; +using namespace alloy::compiler; +using namespace alloy::compiler::passes; +using namespace alloy::frontend; +using namespace alloy::hir; +using namespace alloy::runtime; + + +DataFlowAnalysisPass::DataFlowAnalysisPass() : + CompilerPass() { +} + +DataFlowAnalysisPass::~DataFlowAnalysisPass() { +} + +int DataFlowAnalysisPass::Run(HIRBuilder* builder) { + auto arena = builder->arena(); + + // Linearize blocks so that we can detect cycles and propagate dependencies. 
+ uint32_t block_count = LinearizeBlocks(builder); + + // Analyze value flow and add locals as needed. + AnalyzeFlow(builder, block_count); + + return 0; +} + +uint32_t DataFlowAnalysisPass::LinearizeBlocks(HIRBuilder* builder) { + // TODO(benvanik): actually do this - we cheat now knowing that they are in + // sequential order. + uint32_t block_ordinal = 0; + auto block = builder->first_block(); + while (block) { + block->ordinal = block_ordinal++; + block = block->next; + } + return block_ordinal; +} + +void DataFlowAnalysisPass::AnalyzeFlow(HIRBuilder* builder, + uint32_t block_count) { + uint32_t max_value_estimate = + builder->max_value_ordinal() + 1 + block_count * 4; + + // Stash for value map. We may want to maintain this during building. + auto arena = builder->arena(); + Value** value_map = (Value**)arena->Alloc( + sizeof(Value*) * max_value_estimate); + + // Allocate incoming bitvectors for use by blocks. We don't need outgoing + // because they are only used during the block iteration. + // Mapped by block ordinal. + // TODO(benvanik): cache this list, grow as needed, etc. + auto incoming_bitvectors = (llvm::BitVector**)arena->Alloc( + sizeof(llvm::BitVector*) * block_count); + for (auto n = 0u; n < block_count; n++) { + incoming_bitvectors[n] = new llvm::BitVector(max_value_estimate); + } + + // Walk blocks in reverse and calculate incoming/outgoing values. + auto block = builder->last_block(); + while (block) { + // allocate bitsets based on max value number + block->incoming_values = incoming_bitvectors[block->ordinal]; + auto& incoming_values = *block->incoming_values; + + // Walk instructions and gather up incoming values. + auto instr = block->instr_head; + while (instr) { + uint32_t signature = instr->opcode->signature; +#define SET_INCOMING_VALUE(v) \ + if (v->def && v->def->block != block) { \ + incoming_values.set(v->ordinal); \ + } \ + XEASSERT(v->ordinal < max_value_estimate); \ + value_map[v->ordinal] = v; + if (GET_OPCODE_SIG_TYPE_SRC1(signature) == OPCODE_SIG_TYPE_V) { + SET_INCOMING_VALUE(instr->src1.value); + } + if (GET_OPCODE_SIG_TYPE_SRC2(signature) == OPCODE_SIG_TYPE_V) { + SET_INCOMING_VALUE(instr->src2.value); + } + if (GET_OPCODE_SIG_TYPE_SRC3(signature) == OPCODE_SIG_TYPE_V) { + SET_INCOMING_VALUE(instr->src3.value); + } +#undef SET_INCOMING_VALUE + instr = instr->next; + } + + // Add all successor incoming values to our outgoing, as we need to + // pass them through. + llvm::BitVector outgoing_values(max_value_estimate); + auto outgoing_edge = block->outgoing_edge_head; + while (outgoing_edge) { + if (outgoing_edge->dest->ordinal > block->ordinal) { + outgoing_values |= *outgoing_edge->dest->incoming_values; + } + outgoing_edge = outgoing_edge->outgoing_next; + } + incoming_values |= outgoing_values; + + // Add stores for all outgoing values. + auto outgoing_ordinal = outgoing_values.find_first(); + while (outgoing_ordinal != -1) { + Value* src_value = value_map[outgoing_ordinal]; + XEASSERTNOTNULL(src_value); + if (!src_value->local_slot) { + src_value->local_slot = builder->AllocLocal(src_value->type); + } + builder->StoreLocal(src_value->local_slot, src_value); + + // If we are in the block the value was defined in: + if (src_value->def->block == block) { + // Move the store to right after the def, or as soon after + // as we can (respecting PAIRED flags). 
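+        // (StoreLocal above appended the new store at the builder's
+        //  tail, so last_instr() is that store; MoveBefore relocates
+        //  it next to the def.)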
+ auto def_next = src_value->def->next; + while (def_next && def_next->opcode->flags & OPCODE_FLAG_PAIRED_PREV) { + def_next = def_next->next; + } + XEASSERTNOTNULL(def_next); + builder->last_instr()->MoveBefore(def_next); + + // We don't need it in the incoming list. + incoming_values.reset(outgoing_ordinal); + } else { + // Eh, just throw at the end, before the first branch. + auto tail = block->instr_tail; + while (tail && tail->opcode->flags & OPCODE_FLAG_BRANCH) { + tail = tail->prev; + } + XEASSERTNOTZERO(tail); + builder->last_instr()->MoveBefore(tail->next); + } + + outgoing_ordinal = outgoing_values.find_next(outgoing_ordinal); + } + + // Add loads for all incoming values and rename them in the block. + auto incoming_ordinal = incoming_values.find_first(); + while (incoming_ordinal != -1) { + Value* src_value = value_map[incoming_ordinal]; + XEASSERTNOTNULL(src_value); + if (!src_value->local_slot) { + src_value->local_slot = builder->AllocLocal(src_value->type); + } + Value* local_value = builder->LoadLocal(src_value->local_slot); + builder->last_instr()->MoveBefore(block->instr_head); + + // Swap uses of original value with the local value. + auto instr = block->instr_head; + while (instr) { + uint32_t signature = instr->opcode->signature; + if (GET_OPCODE_SIG_TYPE_SRC1(signature) == OPCODE_SIG_TYPE_V) { + if (instr->src1.value == src_value) { + instr->set_src1(local_value); + } + } + if (GET_OPCODE_SIG_TYPE_SRC2(signature) == OPCODE_SIG_TYPE_V) { + if (instr->src2.value == src_value) { + instr->set_src2(local_value); + } + } + if (GET_OPCODE_SIG_TYPE_SRC3(signature) == OPCODE_SIG_TYPE_V) { + if (instr->src3.value == src_value) { + instr->set_src3(local_value); + } + } + instr = instr->next; + } + + incoming_ordinal = incoming_values.find_next(incoming_ordinal); + } + + block = block->prev; + } + + // Cleanup bitvectors. + for (auto n = 0u; n < block_count; n++) { + delete incoming_bitvectors[n]; + } +} diff --git a/src/alloy/compiler/passes/data_flow_analysis_pass.h b/src/alloy/compiler/passes/data_flow_analysis_pass.h new file mode 100644 index 000000000..d19dc6e1c --- /dev/null +++ b/src/alloy/compiler/passes/data_flow_analysis_pass.h @@ -0,0 +1,39 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef ALLOY_COMPILER_PASSES_DATA_FLOW_ANALYSIS_PASS_H_ +#define ALLOY_COMPILER_PASSES_DATA_FLOW_ANALYSIS_PASS_H_ + +#include + + +namespace alloy { +namespace compiler { +namespace passes { + + +class DataFlowAnalysisPass : public CompilerPass { +public: + DataFlowAnalysisPass(); + virtual ~DataFlowAnalysisPass(); + + virtual int Run(hir::HIRBuilder* builder); + +private: + uint32_t LinearizeBlocks(hir::HIRBuilder* builder); + void AnalyzeFlow(hir::HIRBuilder* builder, uint32_t block_count); +}; + + +} // namespace passes +} // namespace compiler +} // namespace alloy + + +#endif // ALLOY_COMPILER_PASSES_DATA_FLOW_ANALYSIS_PASS_H_ diff --git a/src/alloy/compiler/passes/dead_code_elimination_pass.cc b/src/alloy/compiler/passes/dead_code_elimination_pass.cc index a9b7c7bdb..d295cebec 100644 --- a/src/alloy/compiler/passes/dead_code_elimination_pass.cc +++ b/src/alloy/compiler/passes/dead_code_elimination_pass.cc @@ -59,20 +59,21 @@ int DeadCodeEliminationPass::Run(HIRBuilder* builder) { // all removed ops with NOP and then do a single pass that removes them // all. - bool any_removed = false; + bool any_instr_removed = false; + bool any_locals_removed = false; Block* block = builder->first_block(); while (block) { + // Walk instructions in reverse. Instr* i = block->instr_tail; while (i) { - Instr* prev = i->prev; + auto prev = i->prev; - const OpcodeInfo* opcode = i->opcode; - uint32_t signature = opcode->signature; + auto opcode = i->opcode; if (!(opcode->flags & OPCODE_FLAG_VOLATILE) && i->dest && !i->dest->use_head) { // Has no uses and is not volatile. This instruction can die! MakeNopRecursive(i); - any_removed = true; + any_instr_removed = true; } else if (opcode == &OPCODE_ASSIGN_info) { // Assignment. These are useless, so just try to remove by completely // replacing the value. @@ -82,11 +83,31 @@ int DeadCodeEliminationPass::Run(HIRBuilder* builder) { i = prev; } + // Walk instructions forward. + i = block->instr_head; + while (i) { + auto next = i->next; + + auto opcode = i->opcode; + if (opcode == &OPCODE_STORE_LOCAL_info) { + // Check to see if the store has any interceeding uses after the load. + // If not, it can be removed (as the local is just passing through the + // function). + // We do this after the previous pass so that removed code doesn't keep + // the local alive. + if (!CheckLocalUse(i)) { + any_locals_removed = true; + } + } + + i = next; + } + block = block->next; } // Remove all nops. - if (any_removed) { + if (any_instr_removed) { Block* block = builder->first_block(); while (block) { Instr* i = block->instr_head; @@ -102,6 +123,21 @@ int DeadCodeEliminationPass::Run(HIRBuilder* builder) { } } + // Remove any locals that no longer have uses. + if (any_locals_removed) { + // TODO(benvanik): local removal/dealloc. + auto locals = builder->locals(); + for (auto it = locals.begin(); it != locals.end();) { + auto next = ++it; + auto value = *it; + if (!value->use_head) { + // Unused, can be removed. + locals.erase(it); + } + it = next; + } + } + return 0; } @@ -150,3 +186,24 @@ void DeadCodeEliminationPass::ReplaceAssignment(Instr* i) { i->Remove(); } + +bool DeadCodeEliminationPass::CheckLocalUse(Instr* i) { + auto slot = i->src1.value; + auto src = i->src2.value; + + auto use = src->use_head; + if (use) { + auto use_instr = use->instr; + if (use_instr->opcode != &OPCODE_LOAD_LOCAL_info) { + // A valid use (probably). Keep it. 
+ return true; + } + + // Load/store are paired. They can both be removed. + use_instr->Remove(); + } + + i->Remove(); + + return false; +} diff --git a/src/alloy/compiler/passes/dead_code_elimination_pass.h b/src/alloy/compiler/passes/dead_code_elimination_pass.h index 9a8cfc43a..9c3100f8c 100644 --- a/src/alloy/compiler/passes/dead_code_elimination_pass.h +++ b/src/alloy/compiler/passes/dead_code_elimination_pass.h @@ -28,6 +28,7 @@ public: private: void MakeNopRecursive(hir::Instr* i); void ReplaceAssignment(hir::Instr* i); + bool CheckLocalUse(hir::Instr* i); }; diff --git a/src/alloy/compiler/passes/sources.gypi b/src/alloy/compiler/passes/sources.gypi index 251e6350a..bd5559319 100644 --- a/src/alloy/compiler/passes/sources.gypi +++ b/src/alloy/compiler/passes/sources.gypi @@ -5,6 +5,10 @@ 'constant_propagation_pass.h', 'context_promotion_pass.cc', 'context_promotion_pass.h', + 'control_flow_analysis_pass.cc', + 'control_flow_analysis_pass.h', + 'data_flow_analysis_pass.cc', + 'data_flow_analysis_pass.h', 'dead_code_elimination_pass.cc', 'dead_code_elimination_pass.h', 'finalization_pass.cc', @@ -13,6 +17,8 @@ #'dead_store_elimination_pass.h', 'simplification_pass.cc', 'simplification_pass.h', + 'validation_pass.cc', + 'validation_pass.h', 'value_reduction_pass.cc', 'value_reduction_pass.h', ], diff --git a/src/alloy/compiler/passes/validation_pass.cc b/src/alloy/compiler/passes/validation_pass.cc new file mode 100644 index 000000000..15e89bd67 --- /dev/null +++ b/src/alloy/compiler/passes/validation_pass.cc @@ -0,0 +1,99 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* 
+ ******************************************************************************
+ */
+
+#include
+
+#include
+#include
+#include
+
+using namespace alloy;
+using namespace alloy::backend;
+using namespace alloy::compiler;
+using namespace alloy::compiler::passes;
+using namespace alloy::frontend;
+using namespace alloy::hir;
+using namespace alloy::runtime;
+
+
+ValidationPass::ValidationPass() :
+    CompilerPass() {
+}
+
+ValidationPass::~ValidationPass() {
+}
+
+int ValidationPass::Run(HIRBuilder* builder) {
+  StringBuffer str;
+  builder->Dump(&str);
+  printf("%s", str.GetString());
+  fflush(stdout);
+  str.Reset();
+
+  auto block = builder->first_block();
+  while (block) {
+    auto label = block->label_head;
+    while (label) {
+      XEASSERT(label->block == block);
+      if (label->block != block) {
+        return 1;
+      }
+      label = label->next;
+    }
+
+    auto instr = block->instr_head;
+    while (instr) {
+      if (ValidateInstruction(block, instr)) {
+        return 1;
+      }
+      instr = instr->next;
+    }
+
+    block = block->next;
+  }
+
+  return 0;
+}
+
+int ValidationPass::ValidateInstruction(Block* block, Instr* instr) {
+  XEASSERT(instr->block == block);
+  if (instr->block != block) {
+    return 1;
+  }
+
+  uint32_t signature = instr->opcode->signature;
+  if (GET_OPCODE_SIG_TYPE_SRC1(signature) == OPCODE_SIG_TYPE_V) {
+    if (ValidateValue(block, instr, instr->src1.value)) {
+      return 1;
+    }
+  }
+  if (GET_OPCODE_SIG_TYPE_SRC2(signature) == OPCODE_SIG_TYPE_V) {
+    if (ValidateValue(block, instr, instr->src2.value)) {
+      return 1;
+    }
+  }
+  if (GET_OPCODE_SIG_TYPE_SRC3(signature) == OPCODE_SIG_TYPE_V) {
+    if (ValidateValue(block, instr, instr->src3.value)) {
+      return 1;
+    }
+  }
+
+  return 0;
+}
+
+int ValidationPass::ValidateValue(Block* block, Instr* instr, Value* value) {
+  if (value->def) {
+    /*auto def = value->def;
+    XEASSERT(def->block == block);
+    if (def->block != block) {
+      return 1;
+    }*/
+  }
+  return 0;
+}
diff --git a/src/alloy/compiler/passes/validation_pass.h b/src/alloy/compiler/passes/validation_pass.h
new file mode 100644
index 000000000..a9f0c8f9a
--- /dev/null
+++ b/src/alloy/compiler/passes/validation_pass.h
@@ -0,0 +1,39 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2014 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef ALLOY_COMPILER_PASSES_VALIDATION_PASS_H_ +#define ALLOY_COMPILER_PASSES_VALIDATION_PASS_H_ + +#include + + +namespace alloy { +namespace compiler { +namespace passes { + + +class ValidationPass : public CompilerPass { +public: + ValidationPass(); + virtual ~ValidationPass(); + + virtual int Run(hir::HIRBuilder* builder); + +private: + int ValidateInstruction(hir::Block* block, hir::Instr* instr); + int ValidateValue(hir::Block* block, hir::Instr* instr, hir::Value* value); +}; + + +} // namespace passes +} // namespace compiler +} // namespace alloy + + +#endif // ALLOY_COMPILER_PASSES_VALIDATION_PASS_H_ diff --git a/src/alloy/compiler/passes/value_reduction_pass.cc b/src/alloy/compiler/passes/value_reduction_pass.cc index 42984e891..4eb61a38b 100644 --- a/src/alloy/compiler/passes/value_reduction_pass.cc +++ b/src/alloy/compiler/passes/value_reduction_pass.cc @@ -13,7 +13,11 @@ #include #include -#include +#pragma warning(push) +#pragma warning(disable : 4244) +#pragma warning(disable : 4267) +#include +#pragma warning(pop) using namespace alloy; using namespace alloy::backend; @@ -51,8 +55,7 @@ void ValueReductionPass::ComputeLastUse(Value* value) { int ValueReductionPass::Run(HIRBuilder* builder) { // Walk each block and reuse variable ordinals as much as possible. - // Let's hope this is enough. - std::bitset<1024> ordinals; + llvm::BitVector ordinals(builder->max_value_ordinal()); auto block = builder->first_block(); while (block) { @@ -82,7 +85,7 @@ int ValueReductionPass::Run(HIRBuilder* builder) { if (v->last_use == instr) { // Available. if (!instr->src1.value->IsConstant()) { - ordinals.set(v->ordinal, false); + ordinals.reset(v->ordinal); } } } @@ -94,7 +97,7 @@ int ValueReductionPass::Run(HIRBuilder* builder) { if (v->last_use == instr) { // Available. if (!instr->src2.value->IsConstant()) { - ordinals.set(v->ordinal, false); + ordinals.reset(v->ordinal); } } } @@ -106,7 +109,7 @@ int ValueReductionPass::Run(HIRBuilder* builder) { if (v->last_use == instr) { // Available. if (!instr->src3.value->IsConstant()) { - ordinals.set(v->ordinal, false); + ordinals.reset(v->ordinal); } } } @@ -115,7 +118,7 @@ int ValueReductionPass::Run(HIRBuilder* builder) { // source value ordinal. auto v = instr->dest; // Find a lower ordinal. - for (auto n = 0; n < ordinals.size(); n++) { + for (auto n = 0u; n < ordinals.size(); n++) { if (!ordinals.test(n)) { ordinals.set(n); v->ordinal = n; diff --git a/src/alloy/frontend/ppc/ppc_translator.cc b/src/alloy/frontend/ppc/ppc_translator.cc index 0cc601889..2431f1761 100644 --- a/src/alloy/frontend/ppc/ppc_translator.cc +++ b/src/alloy/frontend/ppc/ppc_translator.cc @@ -38,20 +38,40 @@ PPCTranslator::PPCTranslator(PPCFrontend* frontend) : assembler_ = backend->CreateAssembler(); assembler_->Initialize(); + bool validate = FLAGS_validate_hir; + + // Build the CFG first. + compiler_->AddPass(new passes::ControlFlowAnalysisPass()); + // Passes are executed in the order they are added. Multiple of the same // pass type may be used. + if (validate) compiler_->AddPass(new passes::ValidationPass()); //compiler_->AddPass(new passes::ContextPromotionPass()); + if (validate) compiler_->AddPass(new passes::ValidationPass()); compiler_->AddPass(new passes::SimplificationPass()); - // TODO(benvanik): run repeatedly? 
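+  // A ValidationPass can be interleaved after each pass below so that a
+  // pass that corrupts the HIR is caught right where the damage happens.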
+  if (validate) compiler_->AddPass(new passes::ValidationPass());
   compiler_->AddPass(new passes::ConstantPropagationPass());
-  //compiler_->AddPass(new passes::TypePropagationPass());
-  //compiler_->AddPass(new passes::ByteSwapEliminationPass());
+  if (validate) compiler_->AddPass(new passes::ValidationPass());
   compiler_->AddPass(new passes::SimplificationPass());
+  if (validate) compiler_->AddPass(new passes::ValidationPass());
   //compiler_->AddPass(new passes::DeadStoreEliminationPass());
+  //if (validate) compiler_->AddPass(new passes::ValidationPass());
   compiler_->AddPass(new passes::DeadCodeEliminationPass());
+  if (validate) compiler_->AddPass(new passes::ValidationPass());
+
+  // Adds local load/stores.
+  compiler_->AddPass(new passes::DataFlowAnalysisPass());
+  if (validate) compiler_->AddPass(new passes::ValidationPass());
+  compiler_->AddPass(new passes::SimplificationPass());
+  if (validate) compiler_->AddPass(new passes::ValidationPass());
+
+  // Run DCE one more time to cleanup any local manipulation.
+  compiler_->AddPass(new passes::DeadCodeEliminationPass());
+  if (validate) compiler_->AddPass(new passes::ValidationPass());
 
   // Removes all unneeded variables. Try not to add new ones after this.
   compiler_->AddPass(new passes::ValueReductionPass());
+  if (validate) compiler_->AddPass(new passes::ValidationPass());
 
   // Must come last. The HIR is not really HIR after this.
   compiler_->AddPass(new passes::FinalizationPass());
diff --git a/src/alloy/hir/block.h b/src/alloy/hir/block.h
index 1cb6d6414..1683b333c 100644
--- a/src/alloy/hir/block.h
+++ b/src/alloy/hir/block.h
@@ -12,15 +12,37 @@
 
 #include
 
+XEDECLARECLASS1(llvm, BitVector);
+
 namespace alloy {
 namespace hir {
 
+class Block;
 class HIRBuilder;
 class Instr;
 class Label;
 
+
+class Edge {
+public:
+  enum EdgeFlags {
+    UNCONDITIONAL = (1 << 0),
+    DOMINATES = (1 << 1),
+  };
+public:
+  Edge* outgoing_next;
+  Edge* outgoing_prev;
+  Edge* incoming_next;
+  Edge* incoming_prev;
+
+  Block* src;
+  Block* dest;
+
+  uint32_t flags;
+};
+
+
 class Block {
 public:
   Arena* arena;
@@ -28,6 +50,10 @@ public:
   Block* next;
   Block* prev;
 
+  Edge* incoming_edge_head;
+  Edge* outgoing_edge_head;
+  llvm::BitVector* incoming_values;
+
   Label* label_head;
   Label* label_tail;
 
diff --git a/src/alloy/hir/hir_builder.cc b/src/alloy/hir/hir_builder.cc
index efc12e723..44f1b758c 100644
--- a/src/alloy/hir/hir_builder.cc
+++ b/src/alloy/hir/hir_builder.cc
@@ -41,6 +41,7 @@ void HIRBuilder::Reset() {
   attributes_ = 0;
   next_label_id_ = 0;
   next_value_ordinal_ = 0;
+  locals_.clear();
   block_head_ = block_tail_ = NULL;
   current_block_ = NULL;
 #if XE_DEBUG
@@ -141,6 +142,13 @@ void HIRBuilder::Dump(StringBuffer* str) {
     str->Append("; attributes = %.8X\n", attributes_);
   }
 
+  for (auto it = locals_.begin(); it != locals_.end(); ++it) {
+    auto local = *it;
+    str->Append("  ; local ");
+    DumpValue(str, local);
+    str->Append("\n");
+  }
+
   uint32_t block_ordinal = 0;
   Block* block = block_head_;
   while (block) {
@@ -161,6 +169,39 @@ void HIRBuilder::Dump(StringBuffer* str) {
       label = label->next;
     }
 
+    Edge* incoming_edge = block->incoming_edge_head;
+    while (incoming_edge) {
+      auto src_label = incoming_edge->src->label_head;
+      if (src_label && src_label->name) {
+        str->Append("        ; in: %s", src_label->name);
+      } else if (src_label) {
+        str->Append("        ; in: label%d", src_label->id);
+      } else {
+        str->Append("        ; in: <block%d>",
+                    incoming_edge->src->ordinal);
+      }
+      str->Append(", dom:%d, uncond:%d\n",
+                  (incoming_edge->flags & Edge::DOMINATES) ? 1 : 0,
+                  (incoming_edge->flags & Edge::UNCONDITIONAL) ? 
1 : 0);
+      incoming_edge = incoming_edge->incoming_next;
+    }
+    Edge* outgoing_edge = block->outgoing_edge_head;
+    while (outgoing_edge) {
+      auto dest_label = outgoing_edge->dest->label_head;
+      if (dest_label && dest_label->name) {
+        str->Append("        ; out: %s", dest_label->name);
+      } else if (dest_label) {
+        str->Append("        ; out: label%d", dest_label->id);
+      } else {
+        str->Append("        ; out: <block%d>",
+                    outgoing_edge->dest->ordinal);
+      }
+      str->Append(", dom:%d, uncond:%d\n",
+                  (outgoing_edge->flags & Edge::DOMINATES) ? 1 : 0,
+                  (outgoing_edge->flags & Edge::UNCONDITIONAL) ? 1 : 0);
+      outgoing_edge = outgoing_edge->outgoing_next;
+    }
+
     Instr* i = block->instr_head;
     while (i) {
       if (i->opcode->flags & OPCODE_FLAG_HIDE) {
@@ -303,6 +344,7 @@ void HIRBuilder::InsertLabel(Label* label, Instr* prev_instr) {
     block_tail_ = new_block;
   }
   new_block->label_head = new_block->label_tail = label;
+  new_block->incoming_edge_head = new_block->outgoing_edge_head = NULL;
   label->block = new_block;
 
   label->prev = label->next = NULL;
@@ -319,8 +361,7 @@ void HIRBuilder::InsertLabel(Label* label, Instr* prev_instr) {
     new_block->instr_tail = old_prev_tail;
   }
 
-  for (auto instr = new_block->instr_head; instr != new_block->instr_tail;
-       instr = instr->next) {
+  for (auto instr = new_block->instr_head; instr; instr = instr->next) {
     instr->block = new_block;
   }
 
@@ -342,6 +383,19 @@ void HIRBuilder::ResetLabelTags() {
   }
 }
 
+void HIRBuilder::AddEdge(Block* src, Block* dest, uint32_t flags) {
+  Edge* edge = arena_->Alloc<Edge>();
+  edge->src = src;
+  edge->dest = dest;
+  edge->flags = flags;
+  edge->outgoing_prev = NULL;
+  edge->outgoing_next = src->outgoing_edge_head;
+  src->outgoing_edge_head = edge;
+  edge->incoming_prev = NULL;
+  edge->incoming_next = dest->incoming_edge_head;
+  dest->incoming_edge_head = edge;
+}
+
 Block* HIRBuilder::AppendBlock() {
   Block* block = arena_->Alloc<Block>();
   block->arena = arena_;
@@ -356,6 +410,7 @@ Block* HIRBuilder::AppendBlock() {
   }
   current_block_ = block;
   block->label_head = block->label_tail = NULL;
+  block->incoming_edge_head = block->outgoing_edge_head = NULL;
   block->instr_head = block->instr_tail = NULL;
   return block;
 }
@@ -420,6 +475,7 @@ Value* HIRBuilder::AllocValue(TypeName type) {
   value->def = NULL;
   value->use_head = NULL;
   value->last_use = NULL;
+  value->local_slot = NULL;
   value->tag = NULL;
   value->reg = -1;
   return value;
@@ -434,6 +490,7 @@ Value* HIRBuilder::CloneValue(Value* source) {
   value->def = NULL;
   value->use_head = NULL;
   value->last_use = NULL;
+  value->local_slot = NULL;
   value->tag = NULL;
   value->reg = -1;
   return value;
@@ -877,6 +934,28 @@ Value* HIRBuilder::LoadClock() {
   return i->dest;
 }
 
+Value* HIRBuilder::AllocLocal(TypeName type) {
+  Value* slot = AllocValue(type);
+  locals_.push_back(slot);
+  return slot;
+}
+
+Value* HIRBuilder::LoadLocal(Value* slot) {
+  Instr* i = AppendInstr(
+      OPCODE_LOAD_LOCAL_info, 0,
+      AllocValue(slot->type));
+  i->set_src1(slot);
+  i->src2.value = i->src3.value = NULL;
+  return i->dest;
+}
+
+void HIRBuilder::StoreLocal(Value* slot, Value* value) {
+  Instr* i = AppendInstr(OPCODE_STORE_LOCAL_info, 0);
+  i->set_src1(slot);
+  i->set_src2(value);
+  i->src3.value = NULL;
+}
+
 Value* HIRBuilder::LoadContext(size_t offset, TypeName type) {
   Instr* i = AppendInstr(
       OPCODE_LOAD_CONTEXT_info, 0,
diff --git a/src/alloy/hir/hir_builder.h b/src/alloy/hir/hir_builder.h
index e5a0aef07..542b1e7ae 100644
--- a/src/alloy/hir/hir_builder.h
+++ b/src/alloy/hir/hir_builder.h
@@ -41,7 +41,12 @@ public:
   uint32_t attributes() const { return attributes_; }
   void 
set_attributes(uint32_t value) { attributes_ = value; }
 
+  std::vector<Value*>& locals() { return locals_; }
+
+  uint32_t max_value_ordinal() const { return next_value_ordinal_; }
+
   Block* first_block() const { return block_head_; }
+  Block* last_block() const { return block_tail_; }
   Block* current_block() const;
   Instr* last_instr() const;
 
@@ -50,12 +55,11 @@ public:
   void InsertLabel(Label* label, Instr* prev_instr);
   void ResetLabelTags();
 
+  void AddEdge(Block* src, Block* dest, uint32_t flags);
+
   // static allocations:
   // Value* AllocStatic(size_t length);
 
-  // stack allocations:
-  // Value* AllocLocal(TypeName type);
-
   void Comment(const char* format, ...);
   void Nop();
 
@@ -116,6 +120,10 @@ public:
 
   Value* LoadClock();
 
+  Value* AllocLocal(TypeName type);
+  Value* LoadLocal(Value* slot);
+  void StoreLocal(Value* slot, Value* value);
+
   Value* LoadContext(size_t offset, TypeName type);
   void StoreContext(size_t offset, Value* value);
 
@@ -230,6 +238,8 @@ protected:
   uint32_t next_label_id_;
   uint32_t next_value_ordinal_;
 
+  std::vector<Value*> locals_;
+
   Block* block_head_;
   Block* block_tail_;
   Block* current_block_;
diff --git a/src/alloy/hir/instr.cc b/src/alloy/hir/instr.cc
index 35349f28e..51de2da2c 100644
--- a/src/alloy/hir/instr.cc
+++ b/src/alloy/hir/instr.cc
@@ -61,6 +61,36 @@ bool Instr::Match(SignatureType dest_req,
       ((src3_req == SIG_TYPE_IGNORE) || (src3_req == TO_SIG_TYPE(src3.value)));
 }
 
+void Instr::MoveBefore(Instr* other) {
+  if (next == other) {
+    return;
+  }
+
+  // Remove from current location.
+  if (prev) {
+    prev->next = next;
+  } else {
+    block->instr_head = next;
+  }
+  if (next) {
+    next->prev = prev;
+  } else {
+    block->instr_tail = prev;
+  }
+
+  // Insert into new location.
+  block = other->block;
+  next = other;
+  prev = other->prev;
+  other->prev = this;
+  if (prev) {
+    prev->next = this;
+  }
+  if (other == block->instr_head) {
+    block->instr_head = this;
+  }
+}
+
 void Instr::Replace(const OpcodeInfo* opcode, uint16_t flags) {
   this->opcode = opcode;
   this->flags = flags;
diff --git a/src/alloy/hir/instr.h b/src/alloy/hir/instr.h
index 42b3c36bf..57effa650 100644
--- a/src/alloy/hir/instr.h
+++ b/src/alloy/hir/instr.h
@@ -79,6 +79,7 @@ public:
              SignatureType src2 = SIG_TYPE_X,
              SignatureType src3 = SIG_TYPE_X) const;
 
+  void MoveBefore(Instr* other);
   void Replace(const OpcodeInfo* opcode, uint16_t flags);
   void Remove();
 };
diff --git a/src/alloy/hir/opcodes.h b/src/alloy/hir/opcodes.h
index 2b8649afe..14e3d5d65 100644
--- a/src/alloy/hir/opcodes.h
+++ b/src/alloy/hir/opcodes.h
@@ -117,6 +117,9 @@ enum Opcode {
 
   OPCODE_LOAD_CLOCK,
 
+  OPCODE_LOAD_LOCAL,
+  OPCODE_STORE_LOCAL,
+
   OPCODE_LOAD_CONTEXT,
   OPCODE_STORE_CONTEXT,
 
@@ -202,6 +205,7 @@ enum OpcodeFlags {
   OPCODE_FLAG_VOLATILE    = (1 << 4),
   OPCODE_FLAG_IGNORE      = (1 << 5),
   OPCODE_FLAG_HIDE        = (1 << 6),
+  OPCODE_FLAG_PAIRED_PREV = (1 << 7),
 };
 
 enum OpcodeSignatureType {
diff --git a/src/alloy/hir/opcodes.inl b/src/alloy/hir/opcodes.inl
index 485fa529b..df1427db2 100644
--- a/src/alloy/hir/opcodes.inl
+++ b/src/alloy/hir/opcodes.inl
@@ -182,6 +182,18 @@ DEFINE_OPCODE(
     OPCODE_SIG_V,
     0);
 
+DEFINE_OPCODE(
+    OPCODE_LOAD_LOCAL,
+    "load_local",
+    OPCODE_SIG_V_V,
+    0);
+
+DEFINE_OPCODE(
+    OPCODE_STORE_LOCAL,
+    "store_local",
+    OPCODE_SIG_X_V_V,
+    0);
+
 DEFINE_OPCODE(
     OPCODE_LOAD_CONTEXT,
     "load_context",
@@ -297,17 +309,17 @@ DEFINE_OPCODE(
     OPCODE_DID_CARRY,
     "did_carry",
     OPCODE_SIG_V_V,
-    0);
+    OPCODE_FLAG_PAIRED_PREV);
 
 DEFINE_OPCODE(
     OPCODE_DID_OVERFLOW,
     "did_overflow",
     OPCODE_SIG_V_V,
-    0);
+    OPCODE_FLAG_PAIRED_PREV);
 
 DEFINE_OPCODE(
     OPCODE_DID_SATURATE,
"did_saturate", OPCODE_SIG_V_V, - 0); + OPCODE_FLAG_PAIRED_PREV); DEFINE_OPCODE( OPCODE_VECTOR_COMPARE_EQ, diff --git a/src/alloy/hir/value.h b/src/alloy/hir/value.h index 6db5cc079..37a0a4a5a 100644 --- a/src/alloy/hir/value.h +++ b/src/alloy/hir/value.h @@ -42,6 +42,25 @@ static bool IsFloatType(TypeName type_name) { static bool IsVecType(TypeName type_name) { return type_name == VEC128_TYPE; } +static size_t GetTypeSize(TypeName type_name) { + switch (type_name) { + case INT8_TYPE: + return 1; + case INT16_TYPE: + return 2; + case INT32_TYPE: + return 4; + case INT64_TYPE: + return 8; + case FLOAT32_TYPE: + return 4; + case FLOAT64_TYPE: + return 8; + default: + case VEC128_TYPE: + return 16; + } +} enum ValueFlags { VALUE_IS_CONSTANT = (1 << 1), @@ -78,6 +97,7 @@ public: Use* use_head; // NOTE: for performance reasons this is not maintained during construction. Instr* last_use; + Value* local_slot; // TODO(benvanik): remove to shrink size. void* tag; diff --git a/third_party/llvm.gypi b/third_party/llvm.gypi new file mode 100644 index 000000000..3b8449729 --- /dev/null +++ b/third_party/llvm.gypi @@ -0,0 +1,35 @@ +# Copyright 2014 Ben Vanik. All Rights Reserved. +{ + 'targets': [ + { + 'target_name': 'llvm', + 'type': '<(library)', + + 'direct_dependent_settings': { + 'include_dirs': [ + 'llvm/include/', + ], + + 'defines': [ + ], + }, + + 'msvs_disabled_warnings': [4267], + + 'defines': [ + ], + + 'include_dirs': [ + 'llvm/include/', + ], + + 'sources': [ + 'llvm/dummy.cc', + 'llvm/include/llvm/ADT/BitVector.h', + 'llvm/include/llvm/Support/Compiler.h', + 'llvm/include/llvm/Support/MathExtras.h', + 'llvm/include/llvm/Support/type_traits.h', + ], + } + ] +} diff --git a/third_party/llvm/LICENSE.txt b/third_party/llvm/LICENSE.txt new file mode 100644 index 000000000..37d3c2552 --- /dev/null +++ b/third_party/llvm/LICENSE.txt @@ -0,0 +1,71 @@ +============================================================================== +LLVM Release License +============================================================================== +University of Illinois/NCSA +Open Source License + +Copyright (c) 2003-2014 University of Illinois at Urbana-Champaign. +All rights reserved. + +Developed by: + + LLVM Team + + University of Illinois at Urbana-Champaign + + http://llvm.org + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal with +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimers. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimers in the + documentation and/or other materials provided with the distribution. + + * Neither the names of the LLVM Team, University of Illinois at + Urbana-Champaign, nor the names of its contributors may be used to + endorse or promote products derived from this Software without specific + prior written permission. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE +SOFTWARE. + +============================================================================== +Copyrights and Licenses for Third Party Software Distributed with LLVM: +============================================================================== +The LLVM software contains code written by third parties. Such software will +have its own individual LICENSE.TXT file in the directory in which it appears. +This file will describe the copyrights, license, and restrictions which apply +to that code. + +The disclaimer of warranty in the University of Illinois Open Source License +applies to all code in the LLVM Distribution, and nothing in any of the +other licenses gives permission to use the names of the LLVM Team or the +University of Illinois to endorse or promote products derived from this +Software. + +The following pieces of software have additional or alternate copyrights, +licenses, and/or restrictions: + +Program Directory +------- --------- +Autoconf llvm/autoconf + llvm/projects/ModuleMaker/autoconf + llvm/projects/sample/autoconf +Google Test llvm/utils/unittest/googletest +OpenBSD regex llvm/lib/Support/{reg*, COPYRIGHT.regex} +pyyaml tests llvm/test/YAMLParser/{*.data, LICENSE.TXT} +ARM contributions llvm/lib/Target/ARM/LICENSE.TXT +md5 contributions llvm/lib/Support/MD5.cpp llvm/include/llvm/Support/MD5.h diff --git a/third_party/llvm/dummy.cc b/third_party/llvm/dummy.cc new file mode 100644 index 000000000..ef866db23 --- /dev/null +++ b/third_party/llvm/dummy.cc @@ -0,0 +1 @@ +// here just to keep gyp happy diff --git a/third_party/llvm/include/llvm/ADT/BitVector.h b/third_party/llvm/include/llvm/ADT/BitVector.h new file mode 100644 index 000000000..90e6d3652 --- /dev/null +++ b/third_party/llvm/include/llvm/ADT/BitVector.h @@ -0,0 +1,602 @@ +//===- llvm/ADT/BitVector.h - Bit vectors -----------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the BitVector class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ADT_BITVECTOR_H +#define LLVM_ADT_BITVECTOR_H + +#include "llvm/Support/Compiler.h" +#ifdef LLVM_IGNORE_XENIA +#include "llvm/Support/ErrorHandling.h" +#else +#define llvm_unreachable(msg) assert(false) +#endif // LLVM_IGNORE_XENIA +#include "llvm/Support/MathExtras.h" +#include +#include +#include +#include + +namespace llvm { + +class BitVector { + typedef unsigned long BitWord; + + enum { BITWORD_SIZE = (unsigned)sizeof(BitWord) * CHAR_BIT }; + + BitWord *Bits; // Actual bits. + unsigned Size; // Size of bitvector in bits. + unsigned Capacity; // Size of allocated memory in BitWord. + +public: + // Encapsulation of a single bit. 
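+  // (Bits are packed into BitWords, so operator[] on a non-const BitVector
+  // cannot return a bool&; this proxy forwards reads and writes instead,
+  // e.g.:  bv[3] = true;  bool b = bv[3];)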
+ class reference { + friend class BitVector; + + BitWord *WordRef; + unsigned BitPos; + + reference(); // Undefined + + public: + reference(BitVector &b, unsigned Idx) { + WordRef = &b.Bits[Idx / BITWORD_SIZE]; + BitPos = Idx % BITWORD_SIZE; + } + + ~reference() {} + + reference &operator=(reference t) { + *this = bool(t); + return *this; + } + + reference& operator=(bool t) { + if (t) + *WordRef |= 1L << BitPos; + else + *WordRef &= ~(1L << BitPos); + return *this; + } + + operator bool() const { + return ((*WordRef) & (1L << BitPos)) ? true : false; + } + }; + + + /// BitVector default ctor - Creates an empty bitvector. + BitVector() : Size(0), Capacity(0) { + Bits = 0; + } + + /// BitVector ctor - Creates a bitvector of specified number of bits. All + /// bits are initialized to the specified value. + explicit BitVector(unsigned s, bool t = false) : Size(s) { + Capacity = NumBitWords(s); + Bits = (BitWord *)std::malloc(Capacity * sizeof(BitWord)); + init_words(Bits, Capacity, t); + if (t) + clear_unused_bits(); + } + + /// BitVector copy ctor. + BitVector(const BitVector &RHS) : Size(RHS.size()) { + if (Size == 0) { + Bits = 0; + Capacity = 0; + return; + } + + Capacity = NumBitWords(RHS.size()); + Bits = (BitWord *)std::malloc(Capacity * sizeof(BitWord)); + std::memcpy(Bits, RHS.Bits, Capacity * sizeof(BitWord)); + } + +#if LLVM_HAS_RVALUE_REFERENCES + BitVector(BitVector &&RHS) + : Bits(RHS.Bits), Size(RHS.Size), Capacity(RHS.Capacity) { + RHS.Bits = 0; + } +#endif + + ~BitVector() { + std::free(Bits); + } + + /// empty - Tests whether there are no bits in this bitvector. + bool empty() const { return Size == 0; } + + /// size - Returns the number of bits in this bitvector. + unsigned size() const { return Size; } + + /// count - Returns the number of bits which are set. + unsigned count() const { + unsigned NumBits = 0; + for (unsigned i = 0; i < NumBitWords(size()); ++i) + if (sizeof(BitWord) == 4) + NumBits += CountPopulation_32((uint32_t)Bits[i]); + else if (sizeof(BitWord) == 8) + NumBits += CountPopulation_64(Bits[i]); + else + llvm_unreachable("Unsupported!"); + return NumBits; + } + + /// any - Returns true if any bit is set. + bool any() const { + for (unsigned i = 0; i < NumBitWords(size()); ++i) + if (Bits[i] != 0) + return true; + return false; + } + + /// all - Returns true if all bits are set. + bool all() const { + for (unsigned i = 0; i < Size / BITWORD_SIZE; ++i) + if (Bits[i] != ~0UL) + return false; + + // If bits remain check that they are ones. The unused bits are always zero. + if (unsigned Remainder = Size % BITWORD_SIZE) + return Bits[Size / BITWORD_SIZE] == (1UL << Remainder) - 1; + + return true; + } + + /// none - Returns true if none of the bits are set. + bool none() const { + return !any(); + } + + /// find_first - Returns the index of the first set bit, -1 if none + /// of the bits are set. + int find_first() const { + for (unsigned i = 0; i < NumBitWords(size()); ++i) + if (Bits[i] != 0) { + if (sizeof(BitWord) == 4) + return i * BITWORD_SIZE + countTrailingZeros((uint32_t)Bits[i]); + if (sizeof(BitWord) == 8) + return i * BITWORD_SIZE + countTrailingZeros(Bits[i]); + llvm_unreachable("Unsupported!"); + } + return -1; + } + + /// find_next - Returns the index of the next set bit following the + /// "Prev" bit. Returns -1 if the next set bit is not found. 
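+  /// Typical iteration over every set bit, as the data-flow pass above does:
+  ///   for (int i = bv.find_first(); i != -1; i = bv.find_next(i)) {
+  ///     // i is the index of a set bit
+  ///   }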
+ int find_next(unsigned Prev) const { + ++Prev; + if (Prev >= Size) + return -1; + + unsigned WordPos = Prev / BITWORD_SIZE; + unsigned BitPos = Prev % BITWORD_SIZE; + BitWord Copy = Bits[WordPos]; + // Mask off previous bits. + Copy &= ~0UL << BitPos; + + if (Copy != 0) { + if (sizeof(BitWord) == 4) + return WordPos * BITWORD_SIZE + countTrailingZeros((uint32_t)Copy); + if (sizeof(BitWord) == 8) + return WordPos * BITWORD_SIZE + countTrailingZeros(Copy); + llvm_unreachable("Unsupported!"); + } + + // Check subsequent words. + for (unsigned i = WordPos+1; i < NumBitWords(size()); ++i) + if (Bits[i] != 0) { + if (sizeof(BitWord) == 4) + return i * BITWORD_SIZE + countTrailingZeros((uint32_t)Bits[i]); + if (sizeof(BitWord) == 8) + return i * BITWORD_SIZE + countTrailingZeros(Bits[i]); + llvm_unreachable("Unsupported!"); + } + return -1; + } + + /// clear - Clear all bits. + void clear() { + Size = 0; + } + + /// resize - Grow or shrink the bitvector. + void resize(unsigned N, bool t = false) { + if (N > Capacity * BITWORD_SIZE) { + unsigned OldCapacity = Capacity; + grow(N); + init_words(&Bits[OldCapacity], (Capacity-OldCapacity), t); + } + + // Set any old unused bits that are now included in the BitVector. This + // may set bits that are not included in the new vector, but we will clear + // them back out below. + if (N > Size) + set_unused_bits(t); + + // Update the size, and clear out any bits that are now unused + unsigned OldSize = Size; + Size = N; + if (t || N < OldSize) + clear_unused_bits(); + } + + void reserve(unsigned N) { + if (N > Capacity * BITWORD_SIZE) + grow(N); + } + + // Set, reset, flip + BitVector &set() { + init_words(Bits, Capacity, true); + clear_unused_bits(); + return *this; + } + + BitVector &set(unsigned Idx) { + Bits[Idx / BITWORD_SIZE] |= 1L << (Idx % BITWORD_SIZE); + return *this; + } + + /// set - Efficiently set a range of bits in [I, E) + BitVector &set(unsigned I, unsigned E) { + assert(I <= E && "Attempted to set backwards range!"); + assert(E <= size() && "Attempted to set out-of-bounds range!"); + + if (I == E) return *this; + + if (I / BITWORD_SIZE == E / BITWORD_SIZE) { + BitWord EMask = 1UL << (E % BITWORD_SIZE); + BitWord IMask = 1UL << (I % BITWORD_SIZE); + BitWord Mask = EMask - IMask; + Bits[I / BITWORD_SIZE] |= Mask; + return *this; + } + + BitWord PrefixMask = ~0UL << (I % BITWORD_SIZE); + Bits[I / BITWORD_SIZE] |= PrefixMask; + I = RoundUpToAlignment(I, BITWORD_SIZE); + + for (; I + BITWORD_SIZE <= E; I += BITWORD_SIZE) + Bits[I / BITWORD_SIZE] = ~0UL; + + BitWord PostfixMask = (1UL << (E % BITWORD_SIZE)) - 1; + if (I < E) + Bits[I / BITWORD_SIZE] |= PostfixMask; + + return *this; + } + + BitVector &reset() { + init_words(Bits, Capacity, false); + return *this; + } + + BitVector &reset(unsigned Idx) { + Bits[Idx / BITWORD_SIZE] &= ~(1L << (Idx % BITWORD_SIZE)); + return *this; + } + + /// reset - Efficiently reset a range of bits in [I, E) + BitVector &reset(unsigned I, unsigned E) { + assert(I <= E && "Attempted to reset backwards range!"); + assert(E <= size() && "Attempted to reset out-of-bounds range!"); + + if (I == E) return *this; + + if (I / BITWORD_SIZE == E / BITWORD_SIZE) { + BitWord EMask = 1UL << (E % BITWORD_SIZE); + BitWord IMask = 1UL << (I % BITWORD_SIZE); + BitWord Mask = EMask - IMask; + Bits[I / BITWORD_SIZE] &= ~Mask; + return *this; + } + + BitWord PrefixMask = ~0UL << (I % BITWORD_SIZE); + Bits[I / BITWORD_SIZE] &= ~PrefixMask; + I = RoundUpToAlignment(I, BITWORD_SIZE); + + for (; I + BITWORD_SIZE <= E; I += 
BITWORD_SIZE) + Bits[I / BITWORD_SIZE] = 0UL; + + BitWord PostfixMask = (1UL << (E % BITWORD_SIZE)) - 1; + if (I < E) + Bits[I / BITWORD_SIZE] &= ~PostfixMask; + + return *this; + } + + BitVector &flip() { + for (unsigned i = 0; i < NumBitWords(size()); ++i) + Bits[i] = ~Bits[i]; + clear_unused_bits(); + return *this; + } + + BitVector &flip(unsigned Idx) { + Bits[Idx / BITWORD_SIZE] ^= 1L << (Idx % BITWORD_SIZE); + return *this; + } + + // Indexing. + reference operator[](unsigned Idx) { + assert (Idx < Size && "Out-of-bounds Bit access."); + return reference(*this, Idx); + } + + bool operator[](unsigned Idx) const { + assert (Idx < Size && "Out-of-bounds Bit access."); + BitWord Mask = 1L << (Idx % BITWORD_SIZE); + return (Bits[Idx / BITWORD_SIZE] & Mask) != 0; + } + + bool test(unsigned Idx) const { + return (*this)[Idx]; + } + + /// Test if any common bits are set. + bool anyCommon(const BitVector &RHS) const { + unsigned ThisWords = NumBitWords(size()); + unsigned RHSWords = NumBitWords(RHS.size()); + for (unsigned i = 0, e = std::min(ThisWords, RHSWords); i != e; ++i) + if (Bits[i] & RHS.Bits[i]) + return true; + return false; + } + + // Comparison operators. + bool operator==(const BitVector &RHS) const { + unsigned ThisWords = NumBitWords(size()); + unsigned RHSWords = NumBitWords(RHS.size()); + unsigned i; + for (i = 0; i != std::min(ThisWords, RHSWords); ++i) + if (Bits[i] != RHS.Bits[i]) + return false; + + // Verify that any extra words are all zeros. + if (i != ThisWords) { + for (; i != ThisWords; ++i) + if (Bits[i]) + return false; + } else if (i != RHSWords) { + for (; i != RHSWords; ++i) + if (RHS.Bits[i]) + return false; + } + return true; + } + + bool operator!=(const BitVector &RHS) const { + return !(*this == RHS); + } + + /// Intersection, union, disjoint union. + BitVector &operator&=(const BitVector &RHS) { + unsigned ThisWords = NumBitWords(size()); + unsigned RHSWords = NumBitWords(RHS.size()); + unsigned i; + for (i = 0; i != std::min(ThisWords, RHSWords); ++i) + Bits[i] &= RHS.Bits[i]; + + // Any bits that are just in this bitvector become zero, because they aren't + // in the RHS bit vector. Any words only in RHS are ignored because they + // are already zero in the LHS. + for (; i != ThisWords; ++i) + Bits[i] = 0; + + return *this; + } + + /// reset - Reset bits that are set in RHS. Same as *this &= ~RHS. + BitVector &reset(const BitVector &RHS) { + unsigned ThisWords = NumBitWords(size()); + unsigned RHSWords = NumBitWords(RHS.size()); + unsigned i; + for (i = 0; i != std::min(ThisWords, RHSWords); ++i) + Bits[i] &= ~RHS.Bits[i]; + return *this; + } + + /// test - Check if (This - RHS) is zero. + /// This is the same as reset(RHS) and any(). + bool test(const BitVector &RHS) const { + unsigned ThisWords = NumBitWords(size()); + unsigned RHSWords = NumBitWords(RHS.size()); + unsigned i; + for (i = 0; i != std::min(ThisWords, RHSWords); ++i) + if ((Bits[i] & ~RHS.Bits[i]) != 0) + return true; + + for (; i != ThisWords ; ++i) + if (Bits[i] != 0) + return true; + + return false; + } + + BitVector &operator|=(const BitVector &RHS) { + if (size() < RHS.size()) + resize(RHS.size()); + for (size_t i = 0, e = NumBitWords(RHS.size()); i != e; ++i) + Bits[i] |= RHS.Bits[i]; + return *this; + } + + BitVector &operator^=(const BitVector &RHS) { + if (size() < RHS.size()) + resize(RHS.size()); + for (size_t i = 0, e = NumBitWords(RHS.size()); i != e; ++i) + Bits[i] ^= RHS.Bits[i]; + return *this; + } + + // Assignment operator. 
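+  // (Reuses the existing allocation when it already holds enough words;
+  // otherwise a new word array is malloc'd and the old one freed.)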
+ const BitVector &operator=(const BitVector &RHS) { + if (this == &RHS) return *this; + + Size = RHS.size(); + unsigned RHSWords = NumBitWords(Size); + if (Size <= Capacity * BITWORD_SIZE) { + if (Size) + std::memcpy(Bits, RHS.Bits, RHSWords * sizeof(BitWord)); + clear_unused_bits(); + return *this; + } + + // Grow the bitvector to have enough elements. + Capacity = RHSWords; + BitWord *NewBits = (BitWord *)std::malloc(Capacity * sizeof(BitWord)); + std::memcpy(NewBits, RHS.Bits, Capacity * sizeof(BitWord)); + + // Destroy the old bits. + std::free(Bits); + Bits = NewBits; + + return *this; + } + +#if LLVM_HAS_RVALUE_REFERENCES + const BitVector &operator=(BitVector &&RHS) { + if (this == &RHS) return *this; + + std::free(Bits); + Bits = RHS.Bits; + Size = RHS.Size; + Capacity = RHS.Capacity; + + RHS.Bits = 0; + + return *this; + } +#endif + + void swap(BitVector &RHS) { + std::swap(Bits, RHS.Bits); + std::swap(Size, RHS.Size); + std::swap(Capacity, RHS.Capacity); + } + + //===--------------------------------------------------------------------===// + // Portable bit mask operations. + //===--------------------------------------------------------------------===// + // + // These methods all operate on arrays of uint32_t, each holding 32 bits. The + // fixed word size makes it easier to work with literal bit vector constants + // in portable code. + // + // The LSB in each word is the lowest numbered bit. The size of a portable + // bit mask is always a whole multiple of 32 bits. If no bit mask size is + // given, the bit mask is assumed to cover the entire BitVector. + + /// setBitsInMask - Add '1' bits from Mask to this vector. Don't resize. + /// This computes "*this |= Mask". + void setBitsInMask(const uint32_t *Mask, unsigned MaskWords = ~0u) { + applyMask(Mask, MaskWords); + } + + /// clearBitsInMask - Clear any bits in this vector that are set in Mask. + /// Don't resize. This computes "*this &= ~Mask". + void clearBitsInMask(const uint32_t *Mask, unsigned MaskWords = ~0u) { + applyMask(Mask, MaskWords); + } + + /// setBitsNotInMask - Add a bit to this vector for every '0' bit in Mask. + /// Don't resize. This computes "*this |= ~Mask". + void setBitsNotInMask(const uint32_t *Mask, unsigned MaskWords = ~0u) { + applyMask(Mask, MaskWords); + } + + /// clearBitsNotInMask - Clear a bit in this vector for every '0' bit in Mask. + /// Don't resize. This computes "*this &= Mask". + void clearBitsNotInMask(const uint32_t *Mask, unsigned MaskWords = ~0u) { + applyMask(Mask, MaskWords); + } + +private: + unsigned NumBitWords(unsigned S) const { + return (S + BITWORD_SIZE-1) / BITWORD_SIZE; + } + + // Set the unused bits in the high words. + void set_unused_bits(bool t = true) { + // Set high words first. + unsigned UsedWords = NumBitWords(Size); + if (Capacity > UsedWords) + init_words(&Bits[UsedWords], (Capacity-UsedWords), t); + + // Then set any stray high bits of the last used word. + unsigned ExtraBits = Size % BITWORD_SIZE; + if (ExtraBits) { + BitWord ExtraBitMask = ~0UL << ExtraBits; + if (t) + Bits[UsedWords-1] |= ExtraBitMask; + else + Bits[UsedWords-1] &= ~ExtraBitMask; + } + } + + // Clear the unused bits in the high words. 
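+  // (count(), all() and operator== work on whole words, so the bits past
+  // Size must always be kept zero.)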
+ void clear_unused_bits() { + set_unused_bits(false); + } + + void grow(unsigned NewSize) { + Capacity = std::max(NumBitWords(NewSize), Capacity * 2); + Bits = (BitWord *)std::realloc(Bits, Capacity * sizeof(BitWord)); + + clear_unused_bits(); + } + + void init_words(BitWord *B, unsigned NumWords, bool t) { + memset(B, 0 - (int)t, NumWords*sizeof(BitWord)); + } + + template + void applyMask(const uint32_t *Mask, unsigned MaskWords) { + assert(BITWORD_SIZE % 32 == 0 && "Unsupported BitWord size."); + MaskWords = std::min(MaskWords, (size() + 31) / 32); + const unsigned Scale = BITWORD_SIZE / 32; + unsigned i; + for (i = 0; MaskWords >= Scale; ++i, MaskWords -= Scale) { + BitWord BW = Bits[i]; + // This inner loop should unroll completely when BITWORD_SIZE > 32. + for (unsigned b = 0; b != BITWORD_SIZE; b += 32) { + uint32_t M = *Mask++; + if (InvertMask) M = ~M; + if (AddBits) BW |= BitWord(M) << b; + else BW &= ~(BitWord(M) << b); + } + Bits[i] = BW; + } + for (unsigned b = 0; MaskWords; b += 32, --MaskWords) { + uint32_t M = *Mask++; + if (InvertMask) M = ~M; + if (AddBits) Bits[i] |= BitWord(M) << b; + else Bits[i] &= ~(BitWord(M) << b); + } + if (AddBits) + clear_unused_bits(); + } +}; + +} // End llvm namespace + +namespace std { + /// Implement std::swap in terms of BitVector swap. + inline void + swap(llvm::BitVector &LHS, llvm::BitVector &RHS) { + LHS.swap(RHS); + } +} + +#endif diff --git a/third_party/llvm/include/llvm/Support/Compiler.h b/third_party/llvm/include/llvm/Support/Compiler.h new file mode 100644 index 000000000..806e75917 --- /dev/null +++ b/third_party/llvm/include/llvm/Support/Compiler.h @@ -0,0 +1,446 @@ +//===-- llvm/Support/Compiler.h - Compiler abstraction support --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines several macros, based on the current compiler. This allows +// use of compiler-specific features in a way that remains portable. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_COMPILER_H +#define LLVM_SUPPORT_COMPILER_H + +//#include "llvm/Config/llvm-config.h" + +#ifndef __has_feature +# define __has_feature(x) 0 +#endif + +#ifndef __has_extension +# define __has_extension(x) 0 +#endif + +#ifndef __has_attribute +# define __has_attribute(x) 0 +#endif + +#ifndef __has_builtin +# define __has_builtin(x) 0 +#endif + +/// \macro __GNUC_PREREQ +/// \brief Defines __GNUC_PREREQ if glibc's features.h isn't available. +#ifndef __GNUC_PREREQ +# if defined(__GNUC__) && defined(__GNUC_MINOR__) +# define __GNUC_PREREQ(maj, min) \ + ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min)) +# else +# define __GNUC_PREREQ(maj, min) 0 +# endif +#endif + +/// \macro LLVM_MSC_PREREQ +/// \brief Is the compiler MSVC of at least the specified version? +/// The common \param version values to check for are: +/// * 1600: Microsoft Visual Studio 2010 / 10.0 +/// * 1700: Microsoft Visual Studio 2012 / 11.0 +/// * 1800: Microsoft Visual Studio 2013 / 12.0 +#ifdef _MSC_VER +#define LLVM_MSC_PREREQ(version) (_MSC_VER >= (version)) +#else +#define LLVM_MSC_PREREQ(version) 0 +#endif + +/// \brief Does the compiler support r-value references? +/// This implies that provides the one-argument std::move; it +/// does not imply the existence of any other C++ library features. 
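+/// (BitVector.h above gates its move constructor and move assignment on this
+/// macro, so pre-C++11 builds simply fall back to copies.)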
+#if __has_feature(cxx_rvalue_references) || \ + defined(__GXX_EXPERIMENTAL_CXX0X__) || LLVM_MSC_PREREQ(1600) +#define LLVM_HAS_RVALUE_REFERENCES 1 +#else +#define LLVM_HAS_RVALUE_REFERENCES 0 +#endif + +/// \brief Does the compiler support r-value reference *this? +/// +/// Sadly, this is separate from just r-value reference support because GCC +/// implemented everything but this thus far. No release of GCC yet has support +/// for this feature so it is enabled with Clang only. +/// FIXME: This should change to a version check when GCC grows support for it. +#if __has_feature(cxx_rvalue_references) +#define LLVM_HAS_RVALUE_REFERENCE_THIS 1 +#else +#define LLVM_HAS_RVALUE_REFERENCE_THIS 0 +#endif + +/// \macro LLVM_HAS_CXX11_TYPETRAITS +/// \brief Does the compiler have the C++11 type traits. +/// +/// #include +/// +/// * enable_if +/// * {true,false}_type +/// * is_constructible +/// * etc... +#if defined(__GXX_EXPERIMENTAL_CXX0X__) || LLVM_MSC_PREREQ(1700) +#define LLVM_HAS_CXX11_TYPETRAITS 1 +#else +#define LLVM_HAS_CXX11_TYPETRAITS 0 +#endif + +/// \macro LLVM_HAS_CXX11_STDLIB +/// \brief Does the compiler have the C++11 standard library. +/// +/// Implies LLVM_HAS_RVALUE_REFERENCES, LLVM_HAS_CXX11_TYPETRAITS +#if defined(__GXX_EXPERIMENTAL_CXX0X__) || LLVM_MSC_PREREQ(1700) +#define LLVM_HAS_CXX11_STDLIB 1 +#else +#define LLVM_HAS_CXX11_STDLIB 0 +#endif + +/// \macro LLVM_HAS_VARIADIC_TEMPLATES +/// \brief Does this compiler support variadic templates. +/// +/// Implies LLVM_HAS_RVALUE_REFERENCES and the existence of std::forward. +#if __has_feature(cxx_variadic_templates) || LLVM_MSC_PREREQ(1800) +# define LLVM_HAS_VARIADIC_TEMPLATES 1 +#else +# define LLVM_HAS_VARIADIC_TEMPLATES 0 +#endif + +/// llvm_move - Expands to ::std::move if the compiler supports +/// r-value references; otherwise, expands to the argument. +#if LLVM_HAS_RVALUE_REFERENCES +#define llvm_move(value) (::std::move(value)) +#else +#define llvm_move(value) (value) +#endif + +/// Expands to '&' if r-value references are supported. +/// +/// This can be used to provide l-value/r-value overrides of member functions. +/// The r-value override should be guarded by LLVM_HAS_RVALUE_REFERENCE_THIS +#if LLVM_HAS_RVALUE_REFERENCE_THIS +#define LLVM_LVALUE_FUNCTION & +#else +#define LLVM_LVALUE_FUNCTION +#endif + +/// LLVM_DELETED_FUNCTION - Expands to = delete if the compiler supports it. +/// Use to mark functions as uncallable. Member functions with this should +/// be declared private so that some behavior is kept in C++03 mode. +/// +/// class DontCopy { +/// private: +/// DontCopy(const DontCopy&) LLVM_DELETED_FUNCTION; +/// DontCopy &operator =(const DontCopy&) LLVM_DELETED_FUNCTION; +/// public: +/// ... +/// }; +#if __has_feature(cxx_deleted_functions) || \ + defined(__GXX_EXPERIMENTAL_CXX0X__) || LLVM_MSC_PREREQ(1800) +#define LLVM_DELETED_FUNCTION = delete +#else +#define LLVM_DELETED_FUNCTION +#endif + +/// LLVM_FINAL - Expands to 'final' if the compiler supports it. +/// Use to mark classes or virtual methods as final. +#if __has_feature(cxx_override_control) || \ + defined(__GXX_EXPERIMENTAL_CXX0X__) || LLVM_MSC_PREREQ(1700) +#define LLVM_FINAL final +#else +#define LLVM_FINAL +#endif + +/// LLVM_OVERRIDE - Expands to 'override' if the compiler supports it. +/// Use to mark virtual methods as overriding a base class method. 
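+/// Ex.:
+///   virtual void Run() LLVM_OVERRIDE;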
+#if __has_feature(cxx_override_control) || \ + defined(__GXX_EXPERIMENTAL_CXX0X__) || LLVM_MSC_PREREQ(1700) +#define LLVM_OVERRIDE override +#else +#define LLVM_OVERRIDE +#endif + +#if __has_feature(cxx_constexpr) || defined(__GXX_EXPERIMENTAL_CXX0X__) +# define LLVM_CONSTEXPR constexpr +#else +# define LLVM_CONSTEXPR +#endif + +/// LLVM_LIBRARY_VISIBILITY - If a class marked with this attribute is linked +/// into a shared library, then the class should be private to the library and +/// not accessible from outside it. Can also be used to mark variables and +/// functions, making them private to any shared library they are linked into. +/// On PE/COFF targets, library visibility is the default, so this isn't needed. +#if (__has_attribute(visibility) || __GNUC_PREREQ(4, 0)) && \ + !defined(__MINGW32__) && !defined(__CYGWIN__) && !defined(LLVM_ON_WIN32) +#define LLVM_LIBRARY_VISIBILITY __attribute__ ((visibility("hidden"))) +#else +#define LLVM_LIBRARY_VISIBILITY +#endif + +#if __has_attribute(used) || __GNUC_PREREQ(3, 1) +#define LLVM_ATTRIBUTE_USED __attribute__((__used__)) +#else +#define LLVM_ATTRIBUTE_USED +#endif + +#if __has_attribute(warn_unused_result) || __GNUC_PREREQ(3, 4) +#define LLVM_ATTRIBUTE_UNUSED_RESULT __attribute__((__warn_unused_result__)) +#else +#define LLVM_ATTRIBUTE_UNUSED_RESULT +#endif + +// Some compilers warn about unused functions. When a function is sometimes +// used or not depending on build settings (e.g. a function only called from +// within "assert"), this attribute can be used to suppress such warnings. +// +// However, it shouldn't be used for unused *variables*, as those have a much +// more portable solution: +// (void)unused_var_name; +// Prefer cast-to-void wherever it is sufficient. +#if __has_attribute(unused) || __GNUC_PREREQ(3, 1) +#define LLVM_ATTRIBUTE_UNUSED __attribute__((__unused__)) +#else +#define LLVM_ATTRIBUTE_UNUSED +#endif + +// FIXME: Provide this for PE/COFF targets. +#if (__has_attribute(weak) || __GNUC_PREREQ(4, 0)) && \ + (!defined(__MINGW32__) && !defined(__CYGWIN__) && !defined(LLVM_ON_WIN32)) +#define LLVM_ATTRIBUTE_WEAK __attribute__((__weak__)) +#else +#define LLVM_ATTRIBUTE_WEAK +#endif + +// Prior to clang 3.2, clang did not accept any spelling of +// __has_attribute(const), so assume it is supported. +#if defined(__clang__) || defined(__GNUC__) +// aka 'CONST' but following LLVM Conventions. +#define LLVM_READNONE __attribute__((__const__)) +#else +#define LLVM_READNONE +#endif + +#if __has_attribute(pure) || defined(__GNUC__) +// aka 'PURE' but following LLVM Conventions. +#define LLVM_READONLY __attribute__((__pure__)) +#else +#define LLVM_READONLY +#endif + +#if __has_builtin(__builtin_expect) || __GNUC_PREREQ(4, 0) +#define LLVM_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true) +#define LLVM_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false) +#else +#define LLVM_LIKELY(EXPR) (EXPR) +#define LLVM_UNLIKELY(EXPR) (EXPR) +#endif + +// C++ doesn't support 'extern template' of template specializations. GCC does, +// but requires __extension__ before it. 
In the header, use this: +// EXTERN_TEMPLATE_INSTANTIATION(class foo); +// in the .cpp file, use this: +// TEMPLATE_INSTANTIATION(class foo); +#ifdef __GNUC__ +#define EXTERN_TEMPLATE_INSTANTIATION(X) __extension__ extern template X +#define TEMPLATE_INSTANTIATION(X) template X +#else +#define EXTERN_TEMPLATE_INSTANTIATION(X) +#define TEMPLATE_INSTANTIATION(X) +#endif + +/// LLVM_ATTRIBUTE_NOINLINE - On compilers where we have a directive to do so, +/// mark a method "not for inlining". +#if __has_attribute(noinline) || __GNUC_PREREQ(3, 4) +#define LLVM_ATTRIBUTE_NOINLINE __attribute__((noinline)) +#elif defined(_MSC_VER) +#define LLVM_ATTRIBUTE_NOINLINE __declspec(noinline) +#else +#define LLVM_ATTRIBUTE_NOINLINE +#endif + +/// LLVM_ATTRIBUTE_ALWAYS_INLINE - On compilers where we have a directive to do +/// so, mark a method "always inline" because it is performance sensitive. GCC +/// 3.4 supported this but is buggy in various cases and produces unimplemented +/// errors, just use it in GCC 4.0 and later. +#if __has_attribute(always_inline) || __GNUC_PREREQ(4, 0) +#define LLVM_ATTRIBUTE_ALWAYS_INLINE inline __attribute__((always_inline)) +#elif defined(_MSC_VER) +#define LLVM_ATTRIBUTE_ALWAYS_INLINE __forceinline +#else +#define LLVM_ATTRIBUTE_ALWAYS_INLINE +#endif + +#ifdef __GNUC__ +#define LLVM_ATTRIBUTE_NORETURN __attribute__((noreturn)) +#elif defined(_MSC_VER) +#define LLVM_ATTRIBUTE_NORETURN __declspec(noreturn) +#else +#define LLVM_ATTRIBUTE_NORETURN +#endif + +/// LLVM_EXTENSION - Support compilers where we have a keyword to suppress +/// pedantic diagnostics. +#ifdef __GNUC__ +#define LLVM_EXTENSION __extension__ +#else +#define LLVM_EXTENSION +#endif + +// LLVM_ATTRIBUTE_DEPRECATED(decl, "message") +#if __has_feature(attribute_deprecated_with_message) +# define LLVM_ATTRIBUTE_DEPRECATED(decl, message) \ + decl __attribute__((deprecated(message))) +#elif defined(__GNUC__) +# define LLVM_ATTRIBUTE_DEPRECATED(decl, message) \ + decl __attribute__((deprecated)) +#elif defined(_MSC_VER) +# define LLVM_ATTRIBUTE_DEPRECATED(decl, message) \ + __declspec(deprecated(message)) decl +#else +# define LLVM_ATTRIBUTE_DEPRECATED(decl, message) \ + decl +#endif + +/// LLVM_BUILTIN_UNREACHABLE - On compilers which support it, expands +/// to an expression which states that it is undefined behavior for the +/// compiler to reach this point. Otherwise is not defined. +#if __has_builtin(__builtin_unreachable) || __GNUC_PREREQ(4, 5) +# define LLVM_BUILTIN_UNREACHABLE __builtin_unreachable() +#elif defined(_MSC_VER) +# define LLVM_BUILTIN_UNREACHABLE __assume(false) +#endif + +/// LLVM_BUILTIN_TRAP - On compilers which support it, expands to an expression +/// which causes the program to exit abnormally. +#if __has_builtin(__builtin_trap) || __GNUC_PREREQ(4, 3) +# define LLVM_BUILTIN_TRAP __builtin_trap() +#else +# define LLVM_BUILTIN_TRAP *(volatile int*)0x11 = 0 +#endif + +/// \macro LLVM_ASSUME_ALIGNED +/// \brief Returns a pointer with an assumed alignment. +#if __has_builtin(__builtin_assume_aligned) && __GNUC_PREREQ(4, 7) +# define LLVM_ASSUME_ALIGNED(p, a) __builtin_assume_aligned(p, a) +#elif defined(LLVM_BUILTIN_UNREACHABLE) +// As of today, clang does not support __builtin_assume_aligned. +# define LLVM_ASSUME_ALIGNED(p, a) \ + (((uintptr_t(p) % (a)) == 0) ? (p) : (LLVM_BUILTIN_UNREACHABLE, (p))) +#else +# define LLVM_ASSUME_ALIGNED(p, a) (p) +#endif + +/// \macro LLVM_FUNCTION_NAME +/// \brief Expands to __func__ on compilers which support it. 
Otherwise, +/// expands to a compiler-dependent replacement. +#if defined(_MSC_VER) +# define LLVM_FUNCTION_NAME __FUNCTION__ +#else +# define LLVM_FUNCTION_NAME __func__ +#endif + +#if defined(HAVE_SANITIZER_MSAN_INTERFACE_H) +# include +#else +# define __msan_allocated_memory(p, size) +# define __msan_unpoison(p, size) +#endif + +/// \macro LLVM_MEMORY_SANITIZER_BUILD +/// \brief Whether LLVM itself is built with MemorySanitizer instrumentation. +#if __has_feature(memory_sanitizer) +# define LLVM_MEMORY_SANITIZER_BUILD 1 +#else +# define LLVM_MEMORY_SANITIZER_BUILD 0 +#endif + +/// \macro LLVM_ADDRESS_SANITIZER_BUILD +/// \brief Whether LLVM itself is built with AddressSanitizer instrumentation. +#if __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__) +# define LLVM_ADDRESS_SANITIZER_BUILD 1 +#else +# define LLVM_ADDRESS_SANITIZER_BUILD 0 +#endif + +/// \macro LLVM_IS_UNALIGNED_ACCESS_FAST +/// \brief Is unaligned memory access fast on the host machine. +/// +/// Don't specialize on alignment for platforms where unaligned memory accesses +/// generates the same code as aligned memory accesses for common types. +#if defined(_M_AMD64) || defined(_M_IX86) || defined(__amd64) || \ + defined(__amd64__) || defined(__x86_64) || defined(__x86_64__) || \ + defined(_X86_) || defined(__i386) || defined(__i386__) +# define LLVM_IS_UNALIGNED_ACCESS_FAST 1 +#else +# define LLVM_IS_UNALIGNED_ACCESS_FAST 0 +#endif + +/// \macro LLVM_EXPLICIT +/// \brief Expands to explicit on compilers which support explicit conversion +/// operators. Otherwise expands to nothing. +#if __has_feature(cxx_explicit_conversions) || \ + defined(__GXX_EXPERIMENTAL_CXX0X__) || LLVM_MSC_PREREQ(1800) +#define LLVM_EXPLICIT explicit +#else +#define LLVM_EXPLICIT +#endif + +/// \macro LLVM_STATIC_ASSERT +/// \brief Expands to C/C++'s static_assert on compilers which support it. +#if __has_feature(cxx_static_assert) || \ + defined(__GXX_EXPERIMENTAL_CXX0X__) || LLVM_MSC_PREREQ(1600) +# define LLVM_STATIC_ASSERT(expr, msg) static_assert(expr, msg) +#elif __has_feature(c_static_assert) +# define LLVM_STATIC_ASSERT(expr, msg) _Static_assert(expr, msg) +#elif __has_extension(c_static_assert) +# define LLVM_STATIC_ASSERT(expr, msg) LLVM_EXTENSION _Static_assert(expr, msg) +#else +# define LLVM_STATIC_ASSERT(expr, msg) +#endif + +/// \macro LLVM_ENUM_INT_TYPE +/// \brief Expands to colon followed by the given integral type on compilers +/// which support C++11 strong enums. This can be used to make enums unsigned +/// with MSVC. +#if __has_feature(cxx_strong_enums) || LLVM_MSC_PREREQ(1600) +# define LLVM_ENUM_INT_TYPE(intty) : intty +#else +# define LLVM_ENUM_INT_TYPE(intty) +#endif + +/// \brief Does the compiler support C++11 semantics for strongly typed forward +/// declared enums? +#if __has_feature(cxx_strong_enums) || LLVM_MSC_PREREQ(1700) +#define LLVM_HAS_STRONG_ENUMS 1 +#else +#define LLVM_HAS_STRONG_ENUMS 0 +#endif + +/// \brief Does the compiler support generalized initializers (using braced +/// lists and std::initializer_list). While clang may claim it supports general +/// initializers, if we're using MSVC's headers, we might not have a usable +/// std::initializer list type from the STL. Disable this for now. +#if __has_feature(cxx_generalized_initializers) && !defined(_MSC_VER) +#define LLVM_HAS_INITIALIZER_LISTS 1 +#else +#define LLVM_HAS_INITIALIZER_LISTS 0 +#endif + +/// \brief Mark debug helper function definitions like dump() that should not be +/// stripped from debug builds. 
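+/// (LLVM_ATTRIBUTE_USED keeps the otherwise-unreferenced definition alive in
+/// the binary so it remains callable from a debugger.)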
+// FIXME: Move this to a private config.h as it's not usable in public headers. +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +#define LLVM_DUMP_METHOD LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED +#else +#define LLVM_DUMP_METHOD LLVM_ATTRIBUTE_NOINLINE +#endif + +#endif diff --git a/third_party/llvm/include/llvm/Support/MathExtras.h b/third_party/llvm/include/llvm/Support/MathExtras.h new file mode 100644 index 000000000..4d2ff0989 --- /dev/null +++ b/third_party/llvm/include/llvm/Support/MathExtras.h @@ -0,0 +1,626 @@ +//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains some functions that are useful for math stuff. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_MATHEXTRAS_H +#define LLVM_SUPPORT_MATHEXTRAS_H + +#include "llvm/Support/Compiler.h" +#ifdef IGNORED_LLVM_XENIA +#include "llvm/Support/SwapByteOrder.h" +#endif // IGNORED_LLVM_XENIA +#include "llvm/Support/type_traits.h" +#include + +#ifdef _MSC_VER +#include +#include +#endif + +namespace llvm { +/// \brief The behavior an operation has on an input of 0. +enum ZeroBehavior { + /// \brief The returned value is undefined. + ZB_Undefined, + /// \brief The returned value is numeric_limits::max() + ZB_Max, + /// \brief The returned value is numeric_limits::digits + ZB_Width +}; + +/// \brief Count number of 0's from the least significant bit to the most +/// stopping at the first 1. +/// +/// Only unsigned integral types are allowed. +/// +/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are +/// valid arguments. +template +typename enable_if_c::is_integer && + !std::numeric_limits::is_signed, std::size_t>::type +countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) { + (void)ZB; + + if (!Val) + return std::numeric_limits::digits; + if (Val & 0x1) + return 0; + + // Bisection method. + std::size_t ZeroBits = 0; + T Shift = std::numeric_limits::digits >> 1; + T Mask = std::numeric_limits::max() >> Shift; + while (Shift) { + if ((Val & Mask) == 0) { + Val >>= Shift; + ZeroBits |= Shift; + } + Shift >>= 1; + Mask >>= Shift; + } + return ZeroBits; +} + +// Disable signed. +template +typename enable_if_c::is_integer && + std::numeric_limits::is_signed, std::size_t>::type +countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) LLVM_DELETED_FUNCTION; + +#if __GNUC__ >= 4 || _MSC_VER +template <> +inline std::size_t countTrailingZeros(uint32_t Val, ZeroBehavior ZB) { + if (ZB != ZB_Undefined && Val == 0) + return 32; + +#if __has_builtin(__builtin_ctz) || __GNUC_PREREQ(4, 0) + return __builtin_ctz(Val); +#elif _MSC_VER + unsigned long Index; + _BitScanForward(&Index, Val); + return Index; +#endif +} + +#if !defined(_MSC_VER) || defined(_M_X64) +template <> +inline std::size_t countTrailingZeros(uint64_t Val, ZeroBehavior ZB) { + if (ZB != ZB_Undefined && Val == 0) + return 64; + +#if __has_builtin(__builtin_ctzll) || __GNUC_PREREQ(4, 0) + return __builtin_ctzll(Val); +#elif _MSC_VER + unsigned long Index; + _BitScanForward64(&Index, Val); + return Index; +#endif +} +#endif +#endif + +/// \brief Count number of 0's from the most significant bit to the least +/// stopping at the first 1. +/// +/// Only unsigned integral types are allowed. 
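+/// Ex. countLeadingZeros((uint32_t)0x00FFFFFF) == 8.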
+/// +/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are +/// valid arguments. +template +typename enable_if_c::is_integer && + !std::numeric_limits::is_signed, std::size_t>::type +countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) { + (void)ZB; + + if (!Val) + return std::numeric_limits::digits; + + // Bisection method. + std::size_t ZeroBits = 0; + for (T Shift = std::numeric_limits::digits >> 1; Shift; Shift >>= 1) { + T Tmp = Val >> Shift; + if (Tmp) + Val = Tmp; + else + ZeroBits |= Shift; + } + return ZeroBits; +} + +// Disable signed. +template +typename enable_if_c::is_integer && + std::numeric_limits::is_signed, std::size_t>::type +countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) LLVM_DELETED_FUNCTION; + +#if __GNUC__ >= 4 || _MSC_VER +template <> +inline std::size_t countLeadingZeros(uint32_t Val, ZeroBehavior ZB) { + if (ZB != ZB_Undefined && Val == 0) + return 32; + +#if __has_builtin(__builtin_clz) || __GNUC_PREREQ(4, 0) + return __builtin_clz(Val); +#elif _MSC_VER + unsigned long Index; + _BitScanReverse(&Index, Val); + return Index ^ 31; +#endif +} + +#if !defined(_MSC_VER) || defined(_M_X64) +template <> +inline std::size_t countLeadingZeros(uint64_t Val, ZeroBehavior ZB) { + if (ZB != ZB_Undefined && Val == 0) + return 64; + +#if __has_builtin(__builtin_clzll) || __GNUC_PREREQ(4, 0) + return __builtin_clzll(Val); +#elif _MSC_VER + unsigned long Index; + _BitScanReverse64(&Index, Val); + return Index ^ 63; +#endif +} +#endif +#endif + +/// \brief Get the index of the first set bit starting from the least +/// significant bit. +/// +/// Only unsigned integral types are allowed. +/// +/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are +/// valid arguments. +template +typename enable_if_c::is_integer && + !std::numeric_limits::is_signed, T>::type +findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) { + if (ZB == ZB_Max && Val == 0) + return std::numeric_limits::max(); + + return countTrailingZeros(Val, ZB_Undefined); +} + +// Disable signed. +template +typename enable_if_c::is_integer && + std::numeric_limits::is_signed, T>::type +findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) LLVM_DELETED_FUNCTION; + +/// \brief Get the index of the last set bit starting from the least +/// significant bit. +/// +/// Only unsigned integral types are allowed. +/// +/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are +/// valid arguments. +template +typename enable_if_c::is_integer && + !std::numeric_limits::is_signed, T>::type +findLastSet(T Val, ZeroBehavior ZB = ZB_Max) { + if (ZB == ZB_Max && Val == 0) + return std::numeric_limits::max(); + + // Use ^ instead of - because both gcc and llvm can remove the associated ^ + // in the __builtin_clz intrinsic on x86. + return countLeadingZeros(Val, ZB_Undefined) ^ + (std::numeric_limits::digits - 1); +} + +// Disable signed. +template +typename enable_if_c::is_integer && + std::numeric_limits::is_signed, T>::type +findLastSet(T Val, ZeroBehavior ZB = ZB_Max) LLVM_DELETED_FUNCTION; + +/// \brief Macro compressed bit reversal table for 256 bits. +/// +/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable +static const unsigned char BitReverseTable256[256] = { +#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64 +#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16) +#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4) + R6(0), R6(2), R6(1), R6(3) +}; + +/// \brief Reverse the bits in \p Val. 
+template +T reverseBits(T Val) { + unsigned char in[sizeof(Val)]; + unsigned char out[sizeof(Val)]; + std::memcpy(in, &Val, sizeof(Val)); + for (unsigned i = 0; i < sizeof(Val); ++i) + out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]]; + std::memcpy(&Val, out, sizeof(Val)); + return Val; +} + +// NOTE: The following support functions use the _32/_64 extensions instead of +// type overloading so that signed and unsigned integers can be used without +// ambiguity. + +/// Hi_32 - This function returns the high 32 bits of a 64 bit value. +inline uint32_t Hi_32(uint64_t Value) { + return static_cast(Value >> 32); +} + +/// Lo_32 - This function returns the low 32 bits of a 64 bit value. +inline uint32_t Lo_32(uint64_t Value) { + return static_cast(Value); +} + +/// isInt - Checks if an integer fits into the given bit width. +template +inline bool isInt(int64_t x) { + return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1))); +} +// Template specializations to get better code for common cases. +template<> +inline bool isInt<8>(int64_t x) { + return static_cast(x) == x; +} +template<> +inline bool isInt<16>(int64_t x) { + return static_cast(x) == x; +} +template<> +inline bool isInt<32>(int64_t x) { + return static_cast(x) == x; +} + +/// isShiftedInt - Checks if a signed integer is an N bit number shifted +/// left by S. +template +inline bool isShiftedInt(int64_t x) { + return isInt(x) && (x % (1< +inline bool isUInt(uint64_t x) { + return N >= 64 || x < (UINT64_C(1)<<(N)); +} +// Template specializations to get better code for common cases. +template<> +inline bool isUInt<8>(uint64_t x) { + return static_cast(x) == x; +} +template<> +inline bool isUInt<16>(uint64_t x) { + return static_cast(x) == x; +} +template<> +inline bool isUInt<32>(uint64_t x) { + return static_cast(x) == x; +} + +/// isShiftedUInt - Checks if a unsigned integer is an N bit number shifted +/// left by S. +template +inline bool isShiftedUInt(uint64_t x) { + return isUInt(x) && (x % (1<> (64 - N))); +} + +/// isIntN - Checks if an signed integer fits into the given (dynamic) +/// bit width. +inline bool isIntN(unsigned N, int64_t x) { + return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1))); +} + +/// isMask_32 - This function returns true if the argument is a sequence of ones +/// starting at the least significant bit with the remainder zero (32 bit +/// version). Ex. isMask_32(0x0000FFFFU) == true. +inline bool isMask_32(uint32_t Value) { + return Value && ((Value + 1) & Value) == 0; +} + +/// isMask_64 - This function returns true if the argument is a sequence of ones +/// starting at the least significant bit with the remainder zero (64 bit +/// version). +inline bool isMask_64(uint64_t Value) { + return Value && ((Value + 1) & Value) == 0; +} + +/// isShiftedMask_32 - This function returns true if the argument contains a +/// sequence of ones with the remainder zero (32 bit version.) +/// Ex. isShiftedMask_32(0x0000FF00U) == true. +inline bool isShiftedMask_32(uint32_t Value) { + return isMask_32((Value - 1) | Value); +} + +/// isShiftedMask_64 - This function returns true if the argument contains a +/// sequence of ones with the remainder zero (64 bit version.) +inline bool isShiftedMask_64(uint64_t Value) { + return isMask_64((Value - 1) | Value); +} + +/// isPowerOf2_32 - This function returns true if the argument is a power of +/// two > 0. Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.) 
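+/// (The check relies on a power of two having exactly one set bit:
+/// Value & (Value - 1) clears the lowest set bit, so it is zero only for
+/// powers of two and for zero itself, which the leading Value && excludes.)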
+inline bool isPowerOf2_32(uint32_t Value) { + return Value && !(Value & (Value - 1)); +} + +/// isPowerOf2_64 - This function returns true if the argument is a power of two +/// > 0 (64 bit edition.) +inline bool isPowerOf2_64(uint64_t Value) { + return Value && !(Value & (Value - int64_t(1L))); +} + +#ifdef IGNORED_LLVM_XENIA +/// ByteSwap_16 - This function returns a byte-swapped representation of the +/// 16-bit argument, Value. +inline uint16_t ByteSwap_16(uint16_t Value) { + return sys::SwapByteOrder_16(Value); +} + +/// ByteSwap_32 - This function returns a byte-swapped representation of the +/// 32-bit argument, Value. +inline uint32_t ByteSwap_32(uint32_t Value) { + return sys::SwapByteOrder_32(Value); +} + +/// ByteSwap_64 - This function returns a byte-swapped representation of the +/// 64-bit argument, Value. +inline uint64_t ByteSwap_64(uint64_t Value) { + return sys::SwapByteOrder_64(Value); +} +#endif // IGNORED_LLVM_XENIA + +/// CountLeadingOnes_32 - this function performs the operation of +/// counting the number of ones from the most significant bit to the first zero +/// bit. Ex. CountLeadingOnes_32(0xFF0FFF00) == 8. +/// Returns 32 if the word is all ones. +inline unsigned CountLeadingOnes_32(uint32_t Value) { + return countLeadingZeros(~Value); +} + +/// CountLeadingOnes_64 - This function performs the operation +/// of counting the number of ones from the most significant bit to the first +/// zero bit (64 bit edition.) +/// Returns 64 if the word is all ones. +inline unsigned CountLeadingOnes_64(uint64_t Value) { + return countLeadingZeros(~Value); +} + +/// CountTrailingOnes_32 - this function performs the operation of +/// counting the number of ones from the least significant bit to the first zero +/// bit. Ex. CountTrailingOnes_32(0x00FF00FF) == 8. +/// Returns 32 if the word is all ones. +inline unsigned CountTrailingOnes_32(uint32_t Value) { + return countTrailingZeros(~Value); +} + +/// CountTrailingOnes_64 - This function performs the operation +/// of counting the number of ones from the least significant bit to the first +/// zero bit (64 bit edition.) +/// Returns 64 if the word is all ones. +inline unsigned CountTrailingOnes_64(uint64_t Value) { + return countTrailingZeros(~Value); +} + +/// CountPopulation_32 - this function counts the number of set bits in a value. +/// Ex. CountPopulation(0xF000F000) = 8 +/// Returns 0 if the word is zero. +inline unsigned CountPopulation_32(uint32_t Value) { +#if __GNUC__ >= 4 + return __builtin_popcount(Value); +#else + uint32_t v = Value - ((Value >> 1) & 0x55555555); + v = (v & 0x33333333) + ((v >> 2) & 0x33333333); + return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; +#endif +} + +/// CountPopulation_64 - this function counts the number of set bits in a value, +/// (64 bit edition.) +inline unsigned CountPopulation_64(uint64_t Value) { +#if __GNUC__ >= 4 + return __builtin_popcountll(Value); +#else + uint64_t v = Value - ((Value >> 1) & 0x5555555555555555ULL); + v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL); + v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL; + return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56); +#endif +} + +/// Log2_32 - This function returns the floor log base 2 of the specified value, +/// -1 if the value is zero. (32 bit edition.) +/// Ex. 
Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2 +inline unsigned Log2_32(uint32_t Value) { + return 31 - countLeadingZeros(Value); +} + +/// Log2_64 - This function returns the floor log base 2 of the specified value, +/// -1 if the value is zero. (64 bit edition.) +inline unsigned Log2_64(uint64_t Value) { + return 63 - countLeadingZeros(Value); +} + +/// Log2_32_Ceil - This function returns the ceil log base 2 of the specified +/// value, 32 if the value is zero. (32 bit edition). +/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3 +inline unsigned Log2_32_Ceil(uint32_t Value) { + return 32 - countLeadingZeros(Value - 1); +} + +/// Log2_64_Ceil - This function returns the ceil log base 2 of the specified +/// value, 64 if the value is zero. (64 bit edition.) +inline unsigned Log2_64_Ceil(uint64_t Value) { + return 64 - countLeadingZeros(Value - 1); +} + +/// GreatestCommonDivisor64 - Return the greatest common divisor of the two +/// values using Euclid's algorithm. +inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) { + while (B) { + uint64_t T = B; + B = A % B; + A = T; + } + return A; +} + +/// BitsToDouble - This function takes a 64-bit integer and returns the bit +/// equivalent double. +inline double BitsToDouble(uint64_t Bits) { + union { + uint64_t L; + double D; + } T; + T.L = Bits; + return T.D; +} + +/// BitsToFloat - This function takes a 32-bit integer and returns the bit +/// equivalent float. +inline float BitsToFloat(uint32_t Bits) { + union { + uint32_t I; + float F; + } T; + T.I = Bits; + return T.F; +} + +/// DoubleToBits - This function takes a double and returns the bit +/// equivalent 64-bit integer. Note that copying doubles around +/// changes the bits of NaNs on some hosts, notably x86, so this +/// routine cannot be used if these bits are needed. +inline uint64_t DoubleToBits(double Double) { + union { + uint64_t L; + double D; + } T; + T.D = Double; + return T.L; +} + +/// FloatToBits - This function takes a float and returns the bit +/// equivalent 32-bit integer. Note that copying floats around +/// changes the bits of NaNs on some hosts, notably x86, so this +/// routine cannot be used if these bits are needed. +inline uint32_t FloatToBits(float Float) { + union { + uint32_t I; + float F; + } T; + T.F = Float; + return T.I; +} + +/// Platform-independent wrappers for the C99 isnan() function. +int IsNAN(float f); +int IsNAN(double d); + +/// Platform-independent wrappers for the C99 isinf() function. +int IsInf(float f); +int IsInf(double d); + +/// MinAlign - A and B are either alignments or offsets. Return the minimum +/// alignment that may be assumed after adding the two together. +inline uint64_t MinAlign(uint64_t A, uint64_t B) { + // The largest power of 2 that divides both A and B. + // + // Replace "-Value" by "1+~Value" in the following commented code to avoid + // MSVC warning C4146 + // return (A | B) & -(A | B); + return (A | B) & (1 + ~(A | B)); +} + +/// NextPowerOf2 - Returns the next power of two (in 64-bits) +/// that is strictly greater than A. Returns zero on overflow. +inline uint64_t NextPowerOf2(uint64_t A) { + A |= (A >> 1); + A |= (A >> 2); + A |= (A >> 4); + A |= (A >> 8); + A |= (A >> 16); + A |= (A >> 32); + return A + 1; +} + +/// Returns the power of two which is less than or equal to the given value. +/// Essentially, it is a floor operation across the domain of powers of two. 
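+/// (Sketch: PowerOf2Floor(18) == 16, PowerOf2Floor(16) == 16, and the
+/// explicit zero check gives PowerOf2Floor(0) == 0.)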
+inline uint64_t PowerOf2Floor(uint64_t A) {
+  if (!A) return 0;
+  return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
+}
+
+/// Returns the next integer (mod 2**64) that is greater than or equal to
+/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
+///
+/// Examples:
+/// \code
+///   RoundUpToAlignment(5, 8) = 8
+///   RoundUpToAlignment(17, 8) = 24
+///   RoundUpToAlignment(~0LL, 8) = 0
+/// \endcode
+inline uint64_t RoundUpToAlignment(uint64_t Value, uint64_t Align) {
+  return ((Value + Align - 1) / Align) * Align;
+}
+
+/// Returns the offset to the next integer (mod 2**64) that is greater than
+/// or equal to \p Value and is a multiple of \p Align. \p Align must be
+/// non-zero.
+inline uint64_t OffsetToAlignment(uint64_t Value, uint64_t Align) {
+  return RoundUpToAlignment(Value, Align) - Value;
+}
+
+/// abs64 - absolute value of a 64-bit int. Not all environments support
+/// "abs" on whatever their name for the 64-bit int type is. The absolute
+/// value of the largest negative number is undefined, as with "abs".
+inline int64_t abs64(int64_t x) {
+  return (x < 0) ? -x : x;
+}
+
+/// SignExtend32 - Sign extend B-bit number x to 32-bit int.
+/// Usage int32_t r = SignExtend32<5>(x);
+template <unsigned B> inline int32_t SignExtend32(uint32_t x) {
+  return int32_t(x << (32 - B)) >> (32 - B);
+}
+
+/// \brief Sign extend number in the bottom B bits of X to a 32-bit int.
+/// Requires 0 < B <= 32.
+inline int32_t SignExtend32(uint32_t X, unsigned B) {
+  return int32_t(X << (32 - B)) >> (32 - B);
+}
+
+/// SignExtend64 - Sign extend B-bit number x to 64-bit int.
+/// Usage int64_t r = SignExtend64<5>(x);
+template <unsigned B> inline int64_t SignExtend64(uint64_t x) {
+  return int64_t(x << (64 - B)) >> (64 - B);
+}
+
+/// \brief Sign extend number in the bottom B bits of X to a 64-bit int.
+/// Requires 0 < B <= 64.
+inline int64_t SignExtend64(uint64_t X, unsigned B) {
+  return int64_t(X << (64 - B)) >> (64 - B);
+}
+
+#if defined(_MSC_VER)
+  // Visual Studio defines the HUGE_VAL class of macros using purposeful
+  // constant arithmetic overflow, which it then warns on when encountered.
+  const float huge_valf = std::numeric_limits<float>::infinity();
+#else
+  const float huge_valf = HUGE_VALF;
+#endif
+} // End llvm namespace
+
+#endif
diff --git a/third_party/llvm/include/llvm/Support/type_traits.h b/third_party/llvm/include/llvm/Support/type_traits.h
new file mode 100644
index 000000000..ad812de98
--- /dev/null
+++ b/third_party/llvm/include/llvm/Support/type_traits.h
@@ -0,0 +1,244 @@
+//===- llvm/Support/type_traits.h - Simplfied type traits -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides a template class that determines if a type is a class or
+// not. The basic mechanism, based on using the pointer to member function of
+// a zero argument to a function was "boosted" from the boost type_traits
+// library. See http://www.boost.org/ for all the gory details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_TYPE_TRAITS_H
+#define LLVM_SUPPORT_TYPE_TRAITS_H
+
+//#include "llvm/Support/DataTypes.h"
+#include <cstddef>
+#include <utility>
+
+#ifndef __has_feature
+#define LLVM_DEFINED_HAS_FEATURE
+#define __has_feature(x) 0
+#endif
+
+// This is actually the conforming implementation which works with abstract
+// classes. However, enough compilers have trouble with it that most will use
+// the one in boost/type_traits/object_traits.hpp. This implementation actually
+// works with VC7.0, but other interactions seem to fail when we use it.
+
+namespace llvm {
+
+namespace dont_use
+{
+    // These two functions should never be used. They are helpers to
+    // the is_class template below. They cannot be located inside
+    // is_class because doing so causes at least GCC to think that
+    // the value of the "value" enumerator is not constant. Placing
+    // them out here (for some strange reason) allows the sizeof
+    // operator against them to magically be constant. This is
+    // important to make the is_class<T>::value idiom zero cost. it
+    // evaluates to a constant 1 or 0 depending on whether the
+    // parameter T is a class or not (respectively).
+    template<typename T> char is_class_helper(void(T::*)());
+    template<typename T> double is_class_helper(...);
+}
+
+template <typename T>
+struct is_class
+{
+  // is_class<> metafunction due to Paul Mensonides (leavings@attbi.com). For
+  // more details:
+  // http://groups.google.com/groups?hl=en&selm=000001c1cc83%24e154d5e0%247772e50c%40c161550a&rnum=1
+public:
+  static const bool value =
+      sizeof(char) == sizeof(dont_use::is_class_helper<T>(0));
+};
+
+
+/// isPodLike - This is a type trait that is used to determine whether a given
+/// type can be copied around with memcpy instead of running ctors etc.
+template <typename T>
+struct isPodLike {
+#if __has_feature(is_trivially_copyable)
+  // If the compiler supports the is_trivially_copyable trait use it, as it
+  // matches the definition of isPodLike closely.
+  static const bool value = __is_trivially_copyable(T);
+#else
+  // If we don't know anything else, we can (at least) assume that all non-class
+  // types are PODs.
+  static const bool value = !is_class<T>::value;
+#endif
+};
+
+// std::pair's are pod-like if their elements are.
+template<typename T, typename U>
+struct isPodLike<std::pair<T, U> > {
+  static const bool value = isPodLike<T>::value && isPodLike<U>::value;
+};
+
+
+template <class T, T v>
+struct integral_constant {
+  typedef T value_type;
+  static const value_type value = v;
+  typedef integral_constant<T, v> type;
+  operator value_type() { return value; }
+};
+
+typedef integral_constant<bool, true> true_type;
+typedef integral_constant<bool, false> false_type;
+
+/// \brief Metafunction that determines whether the two given types are
+/// equivalent.
+template<typename T, typename U> struct is_same       : public false_type {};
+template<typename T>             struct is_same<T, T> : public true_type {};
+
+/// \brief Metafunction that removes const qualification from a type.
+template <typename T> struct remove_const          { typedef T type; };
+template <typename T> struct remove_const<const T> { typedef T type; };
+
+/// \brief Metafunction that removes volatile qualification from a type.
+template <typename T> struct remove_volatile             { typedef T type; };
+template <typename T> struct remove_volatile<volatile T> { typedef T type; };
+
+/// \brief Metafunction that removes both const and volatile qualification from
+/// a type.
+template <typename T> struct remove_cv {
+  typedef typename remove_const<typename remove_volatile<T>::type>::type type;
+};
+
+/// \brief Helper to implement is_integral metafunction.
+template <typename T> struct is_integral_impl           : false_type {};
+template <> struct is_integral_impl<              bool> : true_type {};
+template <> struct is_integral_impl<              char> : true_type {};
+template <> struct is_integral_impl<       signed char> : true_type {};
+template <> struct is_integral_impl<     unsigned char> : true_type {};
+template <> struct is_integral_impl<           wchar_t> : true_type {};
+template <> struct is_integral_impl<             short> : true_type {};
+template <> struct is_integral_impl<    unsigned short> : true_type {};
+template <> struct is_integral_impl<               int> : true_type {};
+template <> struct is_integral_impl<      unsigned int> : true_type {};
+template <> struct is_integral_impl<              long> : true_type {};
+template <> struct is_integral_impl<     unsigned long> : true_type {};
+template <> struct is_integral_impl<         long long> : true_type {};
+template <> struct is_integral_impl<unsigned long long> : true_type {};
+
+/// \brief Metafunction that determines whether the given type is an integral
+/// type.
+template <typename T>
+struct is_integral : is_integral_impl<T> {};
+
+/// \brief Metafunction to remove reference from a type.
+template <typename T> struct remove_reference     { typedef T type; };
+template <typename T> struct remove_reference<T&> { typedef T type; };
+
+/// \brief Metafunction that determines whether the given type is a pointer
+/// type.
+template <typename T> struct is_pointer                    : false_type {};
+template <typename T> struct is_pointer<T*>                : true_type {};
+template <typename T> struct is_pointer<T* const>          : true_type {};
+template <typename T> struct is_pointer<T* volatile>       : true_type {};
+template <typename T> struct is_pointer<T* const volatile> : true_type {};
+
+/// \brief Metafunction that determines wheather the given type is a reference.
+template <typename T> struct is_reference     : false_type {};
+template <typename T> struct is_reference<T&> : true_type {};
+
+/// \brief Metafunction that determines whether the given type is either an
+/// integral type or an enumeration type.
+///
+/// Note that this accepts potentially more integral types than we whitelist
+/// above for is_integral because it is based on merely being convertible
+/// implicitly to an integral type.
+template <typename T> class is_integral_or_enum {
+  // Provide an overload which can be called with anything implicitly
+  // convertible to an unsigned long long. This should catch integer types and
+  // enumeration types at least. We blacklist classes with conversion operators
+  // below.
+  static double check_int_convertible(unsigned long long);
+  static char check_int_convertible(...);
+
+  typedef typename remove_reference<T>::type UnderlyingT;
+  static UnderlyingT &nonce_instance;
+
+public:
+  static const bool
+    value = (!is_class<UnderlyingT>::value && !is_pointer<UnderlyingT>::value &&
+             !is_same<UnderlyingT, float>::value &&
+             !is_same<UnderlyingT, double>::value &&
+             sizeof(char) != sizeof(check_int_convertible(nonce_instance)));
+};
+
+// enable_if_c - Enable/disable a template based on a metafunction
+template<bool Cond, typename T = void>
+struct enable_if_c {
+  typedef T type;
+};
+
+template<typename T> struct enable_if_c<false, T> { };
+
+// enable_if - Enable/disable a template based on a metafunction
+template<typename Cond, typename T = void>
+struct enable_if : public enable_if_c<Cond::value, T> { };
+
+namespace dont_use {
+  template<typename Base> char base_of_helper(const volatile Base*);
+  template<typename Base> double base_of_helper(...);
+}
+
+/// is_base_of - Metafunction to determine whether one type is a base class of
+/// (or identical to) another type.
+template<typename Base, typename Derived>
+struct is_base_of {
+  static const bool value
+    = is_class<Base>::value && is_class<Derived>::value &&
+      sizeof(char) == sizeof(dont_use::base_of_helper<Base>((Derived*)0));
+};
+
+// remove_pointer - Metafunction to turn Foo* into Foo. Defined in
+// C++0x [meta.trans.ptr].
+template <typename T> struct remove_pointer { typedef T type; };
+template <typename T> struct remove_pointer<T*> { typedef T type; };
+template <typename T> struct remove_pointer<T* const> { typedef T type; };
+template <typename T> struct remove_pointer<T* volatile> { typedef T type; };
+template <typename T> struct remove_pointer<T* const volatile> {
+    typedef T type; };
+
+// If T is a pointer, just return it. If it is not, return T&.
+template<typename T, typename Enable = void>
+struct add_lvalue_reference_if_not_pointer { typedef T &type; };
+
+template <typename T>
+struct add_lvalue_reference_if_not_pointer<
+    T, typename enable_if<is_pointer<T> >::type> {
+  typedef T type;
+};
+
+// If T is a pointer to X, return a pointer to const X. If it is not, return
+// const T.
+template<typename T, typename Enable = void>
+struct add_const_past_pointer { typedef const T type; };
+
+template <typename T>
+struct add_const_past_pointer<T, typename enable_if<is_pointer<T> >::type> {
+  typedef const typename remove_pointer<T>::type *type;
+};
+
+template <bool, typename T, typename F>
+struct conditional { typedef T type; };
+
+template <typename T, typename F>
+struct conditional<false, T, F> { typedef F type; };
+
+}
+
+#ifdef LLVM_DEFINED_HAS_FEATURE
+#undef __has_feature
+#endif
+
+#endif
diff --git a/xenia.gyp b/xenia.gyp
index ddcd45b2a..aea3ec75f 100644
--- a/xenia.gyp
+++ b/xenia.gyp
@@ -5,6 +5,7 @@
     'third_party/beaengine.gypi',
     'third_party/gflags.gypi',
     'third_party/jansson.gypi',
+    'third_party/llvm.gypi',
     'third_party/sparsehash.gypi',
     'third_party/wslay.gypi',
   ],
@@ -187,10 +188,12 @@
       'dependencies': [
         'beaengine',
         'gflags',
+        'llvm',
       ],
       'export_dependent_settings': [
         'beaengine',
         'gflags',
+        'llvm',
       ],
       'direct_dependent_settings': {

From ae02dc7ebac161f4c7edffd13e2706501207e9af Mon Sep 17 00:00:00 2001
From: Ben Vanik
Date: Sun, 2 Feb 2014 01:51:38 -0800
Subject: [PATCH 063/184] Fixing rdx clobber from mul/div.

---
 src/alloy/backend/x64/lowering/lowering_sequences.cc | 6 ++++++
 src/alloy/backend/x64/lowering/op_utils.inl          | 4 ++++
 2 files changed, 10 insertions(+)

diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc
index d779d731c..2607c4495 100644
--- a/src/alloy/backend/x64/lowering/lowering_sequences.cc
+++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc
@@ -1994,6 +1994,7 @@ table->AddSequence(OPCODE_MUL, [](X64Emitter& e, Instr*& i) {
       e.imul(src);
     }
     e.mov(dest_src, Nax);
+    ReloadRDX(e);
   }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) {
     // RAX = value, RDX = clobbered
     // TODO(benvanik): make the register allocator put dest_src in RAX?
@@ -2007,6 +2008,7 @@ table->AddSequence(OPCODE_MUL, [](X64Emitter& e, Instr*& i) {
       e.imul(Ndx);
     }
     e.mov(dest_src, Nax);
+    ReloadRDX(e);
   });
 } else if (IsFloatType(i->dest->type)) {
   XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) {
@@ -2043,6 +2045,7 @@ table->AddSequence(OPCODE_MUL_HI, [](X64Emitter& e, Instr*& i) {
       e.imul(src);
     }
     e.mov(dest_src, Ndx);
+    ReloadRDX(e);
   }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) {
     // RAX = value, RDX = clobbered
     // TODO(benvanik): make the register allocator put dest_src in RAX?
@@ -2056,6 +2059,7 @@ table->AddSequence(OPCODE_MUL_HI, [](X64Emitter& e, Instr*& i) {
       e.imul(Ndx);
     }
     e.mov(dest_src, Ndx);
+    ReloadRDX(e);
   });
 } else {
   UNIMPLEMENTED_SEQ();
@@ -2077,6 +2081,7 @@ table->AddSequence(OPCODE_DIV, [](X64Emitter& e, Instr*& i) {
       e.idiv(src);
     }
     e.mov(dest_src, Nax);
+    ReloadRDX(e);
   }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) {
     // RAX = value, RDX = clobbered
     // TODO(benvanik): make the register allocator put dest_src in RAX?
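+    // (In this ABI rdx doubles as the cached membase pointer; imul/idiv
+    //  clobber rdx, so each sequence above ends with ReloadRDX to restore
+    //  it from the context before the next instruction runs.)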
@@ -2090,6 +2095,7 @@ table->AddSequence(OPCODE_DIV, [](X64Emitter& e, Instr*& i) { e.idiv(Ndx); } e.mov(dest_src, Nax); + ReloadRDX(e); }); } else if (IsFloatType(i->dest->type)) { XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { diff --git a/src/alloy/backend/x64/lowering/op_utils.inl b/src/alloy/backend/x64/lowering/op_utils.inl index ef0be4ab1..da28030c5 100644 --- a/src/alloy/backend/x64/lowering/op_utils.inl +++ b/src/alloy/backend/x64/lowering/op_utils.inl @@ -73,6 +73,10 @@ void CallNative(X64Emitter& e, void* target) { e.mov(e.rdx, e.qword[e.rcx + 8]); // membase } +void ReloadRDX(X64Emitter& e) { + e.mov(e.rdx, e.qword[e.rcx + 8]); // membase +} + // Sets EFLAGs with zf for the given value. // ZF = 1 if false, 0 = true (so jz = jump if false) void CheckBoolean(X64Emitter& e, Value* v) { From 44c29a669113e91ff20f26dad36b6a09d735d57d Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 2 Feb 2014 02:18:59 -0800 Subject: [PATCH 064/184] Possibly working LOAD_VECTOR_SHL/SHR. --- .../x64/lowering/lowering_sequences.cc | 87 ++++++++++++++++++- src/alloy/backend/x64/lowering/op_utils.inl | 6 ++ src/alloy/core.h | 12 +++ src/alloy/hir/value.h | 11 +-- 4 files changed, 109 insertions(+), 7 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index 2607c4495..8f3b2599f 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -63,6 +63,7 @@ enum XmmConst { XMMSignMaskPS = 8, XMMSignMaskPD = 9, XMMByteSwapMask = 10, + XMMPermuteControl15 = 11, }; static const vec128_t xmm_consts[] = { /* XMMZero */ vec128f(0.0f, 0.0f, 0.0f, 0.0f), @@ -76,6 +77,7 @@ static const vec128_t xmm_consts[] = { /* XMMSignMaskPS */ vec128i(0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u), /* XMMSignMaskPD */ vec128i(0x00000000u, 0x80000000u, 0x00000000u, 0x80000000u), /* XMMByteSwapMask */ vec128i(0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu), + /* XMMPermuteControl15 */ vec128b(15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15), }; // Use consts by first loading the base register then accessing memory: // e.mov(e.rax, XMMCONSTBASE) @@ -84,6 +86,45 @@ static const vec128_t xmm_consts[] = { #define XMMCONSTBASE (uint64_t)&xmm_consts[0] #define XMMCONST(base_reg, name) e.ptr[base_reg + name * 16] +static vec128_t lvsl_table[17] = { + vec128b( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), + vec128b( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), + vec128b( 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17), + vec128b( 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18), + vec128b( 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19), + vec128b( 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), + vec128b( 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21), + vec128b( 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22), + vec128b( 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23), + vec128b( 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24), + vec128b(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25), + vec128b(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), + vec128b(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27), + vec128b(13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28), + vec128b(14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29), + vec128b(15, 16, 17, 18, 
19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30), + vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), +}; +static vec128_t lvsr_table[17] = { + vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), + vec128b(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30), + vec128b(14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29), + vec128b(13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28), + vec128b(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27), + vec128b(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), + vec128b(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25), + vec128b( 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24), + vec128b( 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23), + vec128b( 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22), + vec128b( 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21), + vec128b( 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), + vec128b( 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19), + vec128b( 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18), + vec128b( 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17), + vec128b( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), + vec128b( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), +}; + // A note about vectors: // Alloy represents vectors as xyzw pairs, with indices 0123. // XMM registers are xyzw pairs with indices 3210, making them more like wzyx. @@ -792,14 +833,56 @@ table->AddSequence(OPCODE_VECTOR_CONVERT_F2I, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_LOAD_VECTOR_SHL, [](X64Emitter& e, Instr*& i) { XEASSERT(i->dest->type == VEC128_TYPE); - UNIMPLEMENTED_SEQ(); + if (i->src1.value->IsConstant()) { + Xmm dest; + e.BeginOp(i->dest, dest, REG_DEST); + auto sh = MIN(16, i->src1.value->AsUint32()); + e.mov(e.rax, (uintptr_t)&lvsl_table[sh]); + e.movaps(dest, e.ptr[e.rax]); + e.EndOp(dest); + } else { + Xmm dest; + Reg8 src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + // TODO(benvanik): probably a way to do this with addressing. + e.mov(TEMP_REG, 16); + e.movzx(e.rax, src); + e.cmp(src, 16); + e.cmovb(TEMP_REG, e.rax); + e.shl(TEMP_REG, 4); + e.mov(e.rax, (uintptr_t)lvsl_table); + e.movaps(dest, e.ptr[e.rax + TEMP_REG]); + e.EndOp(dest, src); + } i = e.Advance(i); return true; }); table->AddSequence(OPCODE_LOAD_VECTOR_SHR, [](X64Emitter& e, Instr*& i) { XEASSERT(i->dest->type == VEC128_TYPE); - UNIMPLEMENTED_SEQ(); + if (i->src1.value->IsConstant()) { + Xmm dest; + e.BeginOp(i->dest, dest, REG_DEST); + auto sh = MIN(16, i->src1.value->AsUint32()); + e.mov(e.rax, (uintptr_t)&lvsr_table[sh]); + e.movaps(dest, e.ptr[e.rax]); + e.EndOp(dest); + } else { + Xmm dest; + Reg8 src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); + // TODO(benvanik): probably a way to do this with addressing. 
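+    // (The tables model the PPC lvsl/lvsr permute-control vectors. The code
+    //  below clamps the shift byte to 16, since out-of-range amounts map to
+    //  the last table row, then scales by 16 (shl 4) so the row number
+    //  becomes a byte offset into the 16-byte-wide entries.)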
+ e.mov(TEMP_REG, 16); + e.movzx(e.rax, src); + e.cmp(src, 16); + e.cmovb(TEMP_REG, e.rax); + e.shl(TEMP_REG, 4); + e.mov(e.rax, (uintptr_t)lvsr_table); + e.movaps(dest, e.ptr[e.rax + TEMP_REG]); + e.EndOp(dest, src); + } i = e.Advance(i); return true; }); diff --git a/src/alloy/backend/x64/lowering/op_utils.inl b/src/alloy/backend/x64/lowering/op_utils.inl index da28030c5..94aaaef72 100644 --- a/src/alloy/backend/x64/lowering/op_utils.inl +++ b/src/alloy/backend/x64/lowering/op_utils.inl @@ -51,6 +51,12 @@ Address Stash(X64Emitter& e, const Xmm& r) { return addr; } +void LoadXmmConstant(X64Emitter& e, Xmm& dest, const vec128_t& v) { + e.mov(e.qword[e.rsp + STASH_OFFSET], v.low); + e.mov(e.qword[e.rsp + STASH_OFFSET + 8], v.high); + e.movaps(dest, e.ptr[e.rsp + STASH_OFFSET]); +} + // Moves a 64bit immediate into memory. void MovMem64(X64Emitter& e, RegExp& addr, uint64_t v) { if ((v & ~0x7FFFFFFF) == 0) { diff --git a/src/alloy/core.h b/src/alloy/core.h index cd7a32204..aef7e57c2 100644 --- a/src/alloy/core.h +++ b/src/alloy/core.h @@ -55,6 +55,18 @@ XEFORCEINLINE vec128_t vec128f(float x, float y, float z, float w) { v.f4[0] = x; v.f4[1] = y; v.f4[2] = z; v.f4[3] = w; return v; } +XEFORCEINLINE vec128_t vec128b( + uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, + uint8_t y0, uint8_t y1, uint8_t y2, uint8_t y3, + uint8_t z0, uint8_t z1, uint8_t z2, uint8_t z3, + uint8_t w0, uint8_t w1, uint8_t w2, uint8_t w3) { + vec128_t v; + v.b16[0] = x3; v.b16[1] = x2; v.b16[2] = x1; v.b16[3] = x0; + v.b16[4] = y3; v.b16[5] = y2; v.b16[6] = y1; v.b16[7] = y0; + v.b16[8] = z3; v.b16[9] = z2; v.b16[10] = z1; v.b16[11] = z0; + v.b16[12] = w3; v.b16[13] = w2; v.b16[14] = w1; v.b16[15] = w0; + return v; +} } // namespace alloy diff --git a/src/alloy/hir/value.h b/src/alloy/hir/value.h index 37a0a4a5a..4fa957932 100644 --- a/src/alloy/hir/value.h +++ b/src/alloy/hir/value.h @@ -184,25 +184,26 @@ public: } bool IsConstantTrue() const { if (type == VEC128_TYPE) { - return false; + XEASSERTALWAYS(); } return (flags & VALUE_IS_CONSTANT) && !!constant.i64; } bool IsConstantFalse() const { if (type == VEC128_TYPE) { - return false; + XEASSERTALWAYS(); } return (flags & VALUE_IS_CONSTANT) && !constant.i64; } bool IsConstantZero() const { if (type == VEC128_TYPE) { - return false; + return (flags & VALUE_IS_CONSTANT) && + !constant.v128.low && !constant.v128.high; } return (flags & VALUE_IS_CONSTANT) && !constant.i64; } bool IsConstantEQ(Value* other) const { if (type == VEC128_TYPE) { - return false; + XEASSERTALWAYS(); } return (flags & VALUE_IS_CONSTANT) && (other->flags & VALUE_IS_CONSTANT) && @@ -210,7 +211,7 @@ public: } bool IsConstantNE(Value* other) const { if (type == VEC128_TYPE) { - return false; + XEASSERTALWAYS(); } return (flags & VALUE_IS_CONSTANT) && (other->flags & VALUE_IS_CONSTANT) && From 14d6855b6d0b840a984946914ed5941254286068 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 2 Feb 2014 11:23:03 -0800 Subject: [PATCH 065/184] PERMUTE by V128 and fixing some ops. 
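
The vec128 PERMUTE handled below is the Altivec-style byte permute: each
byte of the control vector selects one of 32 source bytes, the low 16 from
src2 and the high 16 from src3. As a rough scalar model (an illustrative
sketch only, with a hypothetical PermuteModel helper, indexing b16 as a
flat byte array and ignoring the xyzw/wzyx swizzle that XMMByteSwapMask
applies in the real lowering):

    vec128_t PermuteModel(const vec128_t& control, const vec128_t& a,
                          const vec128_t& b) {
      vec128_t result;
      for (size_t i = 0; i < 16; ++i) {
        uint8_t index = control.b16[i] & 0x1F;            // selector, 0-31
        result.b16[i] = (index < 16) ? a.b16[index]       // 0-15: src2
                                     : b.b16[index - 16]; // 16-31: src3
      }
      return result;
    }

The vpcmpgtb against XMMPermuteControl15 materializes that index < 16 test
as a byte mask, and vpblendvb then selects between the two pshufb-shuffled
inputs accordingly.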
--- .../x64/lowering/lowering_sequences.cc | 70 ++++++++++++++++--- src/alloy/backend/x64/lowering/op_utils.inl | 25 +++++-- src/alloy/backend/x64/x64_emitter.cc | 26 +++---- 3 files changed, 93 insertions(+), 28 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index 8f3b2599f..45bbaa32e 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -2456,11 +2456,11 @@ table->AddSequence(OPCODE_NOT, [](X64Emitter& e, Instr*& i) { } else if (IsVecType(i->dest->type)) { XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) { // dest_src ^= 0xFFFF... - e.cmpeqps(e.xmm0, e.xmm0); if (dest != src) { e.movaps(dest, src); } - e.pxor(dest, e.xmm0); + e.mov(e.rax, XMMCONSTBASE); + e.pxor(dest, XMMCONST(e.rax, XMMOne)); }); } else { ASSERT_INVALID_TYPE(); @@ -2937,14 +2937,64 @@ table->AddSequence(OPCODE_PERMUTE, [](X64Emitter& e, Instr*& i) { } } else if (i->src1.value->type == VEC128_TYPE) { // Permute bytes between src2 and src3. - // TODO(benvanik): check src3 for zero. if 0, we can use pshufb. - Xmm dest, control, src2, src3; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, control, 0, - i->src2.value, src2, 0, - i->src3.value, src3, 0); - UNIMPLEMENTED_SEQ(); - e.EndOp(dest, control, src2, src3); + if (i->src3.value->IsConstantZero()) { + // Permuting with src2/zero, so just shuffle/mask. + Xmm dest, control, src2; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, control, 0, + i->src2.value, src2, 0); + if (i->src2.value->IsConstantZero()) { + e.vpxor(dest, src2, src2); + } else { + if (i->src2.value->IsConstant()) { + LoadXmmConstant(e, src2, i->src2.value->constant.v128); + } + // Control mask needs to be shuffled. + e.mov(e.rax, XMMCONSTBASE); + e.vpshufb(e.xmm0, control, XMMCONST(e.rax, XMMByteSwapMask)); + e.vpshufb(dest, src2, e.xmm0); + // Build a mask with values in src2 having 0 and values in src3 having 1. + e.vpcmpgtb(e.xmm0, e.xmm0, XMMCONST(e.rax, XMMPermuteControl15)); + e.vpandn(dest, e.xmm0, dest); + } + e.EndOp(dest, control, src2); + } else { + // General permute. + Xmm dest, control, src2, src3; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, control, 0, + i->src2.value, src2, 0, + i->src3.value, src3, 0); + e.mov(e.rax, XMMCONSTBASE); + // Control mask needs to be shuffled. + e.vpshufb(e.xmm1, control, XMMCONST(e.rax, XMMByteSwapMask)); + // Build a mask with values in src2 having 0 and values in src3 having 1. + e.vpcmpgtb(dest, e.xmm1, XMMCONST(e.rax, XMMPermuteControl15)); + Xmm src2_shuf, src3_shuf; + if (i->src2.value->IsConstantZero()) { + e.vpxor(src2, src2); + src2_shuf = src2; + } else { + if (i->src2.value->IsConstant()) { + LoadXmmConstant(e, src2, i->src2.value->constant.v128); + } + src2_shuf = e.xmm0; + e.vpshufb(src2_shuf, src2, e.xmm1); + } + if (i->src3.value->IsConstantZero()) { + e.vpxor(src3, src3); + src3_shuf = src3; + } else { + if (i->src3.value->IsConstant()) { + LoadXmmConstant(e, src3, i->src3.value->constant.v128); + } + // NOTE: reusing xmm1 here. 
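+        // (Reuse is safe: AVX vpshufb is a non-destructive three-operand
+        //  form, so xmm1 is read as the shuffle control by the same
+        //  instruction that overwrites it, and the control mask is not
+        //  needed after this point.)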
+ src3_shuf = e.xmm1; + e.vpshufb(src3_shuf, src3, e.xmm1); + } + e.vpblendvb(dest, src2_shuf, src3_shuf, dest); + e.EndOp(dest, control, src2, src3); + } } else { ASSERT_INVALID_TYPE(); } diff --git a/src/alloy/backend/x64/lowering/op_utils.inl b/src/alloy/backend/x64/lowering/op_utils.inl index 94aaaef72..3e0ed6789 100644 --- a/src/alloy/backend/x64/lowering/op_utils.inl +++ b/src/alloy/backend/x64/lowering/op_utils.inl @@ -52,9 +52,18 @@ Address Stash(X64Emitter& e, const Xmm& r) { } void LoadXmmConstant(X64Emitter& e, Xmm& dest, const vec128_t& v) { - e.mov(e.qword[e.rsp + STASH_OFFSET], v.low); - e.mov(e.qword[e.rsp + STASH_OFFSET + 8], v.high); - e.movaps(dest, e.ptr[e.rsp + STASH_OFFSET]); + if (!v.low && !v.high) { + // zero + e.vpxor(dest, dest); + //} else if (v.low == ~0ull && v.high == ~0ull) { + // one + // TODO(benvanik): XMMCONST? + } else { + // TODO(benvanik): more efficient loading of partial values? + e.mov(e.qword[e.rsp + STASH_OFFSET], v.low); + e.mov(e.qword[e.rsp + STASH_OFFSET + 8], v.high); + e.vmovaps(dest, e.ptr[e.rsp + STASH_OFFSET]); + } } // Moves a 64bit immediate into memory. @@ -539,8 +548,14 @@ void IntBinaryOpCV(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn, vv_fn(e, *i, dest, Ntx); } } else { - e.mov(dest, src2); - vc_fn(e, *i, dest, (uint32_t)src1->get_constant(CT())); + if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { + e.mov(dest, src2); + vc_fn(e, *i, dest, (uint32_t)src1->get_constant(CT())); + } else { + // Need a cv_fn. Or a better way to do all of this. + e.mov(dest, (uint32_t)src1->get_constant(CT())); + vv_fn(e, *i, dest, src2); + } } } else { // 64-bit. diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index 4d441673f..5aee9fb59 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -102,21 +102,21 @@ void* X64Emitter::Emplace(size_t stack_size) { int X64Emitter::Emit(HIRBuilder* builder) { // These are the registers we will not be using. All others are fare game. const uint32_t reserved_regs = - GetRegBit(rax) | - GetRegBit(rcx) | - GetRegBit(rdx) | - GetRegBit(rsp) | - GetRegBit(rbp) | - GetRegBit(rsi) | - GetRegBit(rdi) | - GetRegBit(xmm0) | + GetRegBit(rax) | // scratch + GetRegBit(rcx) | // arg + GetRegBit(rdx) | // arg/clobbered + GetRegBit(rsp) | + GetRegBit(rbp) | + GetRegBit(rsi) | + GetRegBit(rdi) | + GetRegBit(r8) | // arg/clobbered + GetRegBit(xmm0) | // scratch + GetRegBit(xmm1) | // sometimes used for scratch, could be fixed // TODO(benvanik): save so that we can use these. - GetRegBit(r8) | - GetRegBit(r9) | - GetRegBit(r10) | - GetRegBit(r11) | - GetRegBit(xmm1) | + GetRegBit(r9) | + GetRegBit(r10) | + GetRegBit(r11) | GetRegBit(xmm2) | GetRegBit(xmm3) | GetRegBit(xmm4) | From e5e490ca9bcd20e7714bfb40e6be8bde80697353 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 2 Feb 2014 11:25:00 -0800 Subject: [PATCH 066/184] AVX1+ required. I'm sure I'm using AVX2 instructions now, though. --- README.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 63525bece..590d2ee63 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,7 @@ Xenia - Xbox 360 Emulator Research Project ========================================== -Xenia is an experimental emulator for the Xbox 360. It does not run games (yet), -and if you are unable to understand that please leave now. +Xenia is an experimental emulator for the Xbox 360. It does not run games (yet). 
Pull requests are welcome but the code is in a very high churn state and may not be accepted, so ask in IRC before taking on anything big. Contributions are @@ -54,7 +53,7 @@ See [building](docs/building.md) for setup and information about the Have some spare time, know advanced C++, and want to write an emulator? Contribute! There's a ton of work that needs to be done, a lot of which -is wide open greenfield fun. +is wide open greenfield fun. That said, the project is currently undergoing a lot of major foundational development and core pieces are changing rapidly and poorly documented. @@ -69,7 +68,7 @@ that there are some major work areas still untouched: * Start [hacking on audio](https://github.com/benvanik/xenia/issues/62) * Support [loading of PIRS files](https://github.com/benvanik/xenia/issues/63) * Build a [virtual LIVE service](https://github.com/benvanik/xenia/issues/64) - + See more projects [good for contributors](https://github.com/benvanik/xenia/issues?labels=good+for+contributors&page=1&state=open). It's a good idea to ask on IRC/the bugs before beginning work on something. @@ -85,7 +84,7 @@ Come on people. Jeez. ### What kind of machine do I need to run this? -You'll need 64-bit Windows 7 with a processor supporting at least SSE4. +You'll need 64-bit Windows 7 with a processor supporting at least AVX1. It's only tested on Windows 8 and that may become a requirement as several of the APIs exposed there are beneficial to emulation. In general if you have to ask if your machine is good enough to run games at a decent speed the answer is @@ -108,7 +107,7 @@ be required in the future. I get asked this about once a day. Yes, I have heard of them. In fact, I spent a long time trying them out: -[LLVM](https://github.com/benvanik/xenia/tree/85bdbd24d1b5923cfb104f45194a96e7ac57026e/src/xenia/cpu/codegen), +[LLVM](https://github.com/benvanik/xenia/tree/85bdbd24d1b5923cfb104f45194a96e7ac57026e/src/xenia/cpu/codegen), [libjit](https://github.com/benvanik/xenia/tree/eee856be0499a4bc721b6097f5f2b9446929f2cc/src/xenia/cpu/libjit), [asmjit](https://github.com/benvanik/xenia/tree/ca208fa60a0285d396409743064784cc2320c094/src/xenia/cpu/x64). They don't work for this purpose. I understand if you disagree, but please From 544d453691976ed468c7557e00c5f440b9d5b3ab Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 2 Feb 2014 12:28:40 -0800 Subject: [PATCH 067/184] Stack fixes. --- .../x64/lowering/lowering_sequences.cc | 5 ++-- src/alloy/backend/x64/lowering/op_utils.inl | 6 ++-- src/alloy/backend/x64/x64_code_cache.cc | 7 ++--- src/alloy/backend/x64/x64_emitter.cc | 27 ++++++++++------- src/alloy/backend/x64/x64_emitter.h | 6 +++- src/alloy/backend/x64/x64_thunk_emitter.cc | 14 +++++---- src/alloy/backend/x64/x64_thunk_emitter.h | 30 ++++++++++++++----- 7 files changed, 61 insertions(+), 34 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index 45bbaa32e..576ad65a0 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -231,7 +231,7 @@ void IssueCall(X64Emitter& e, FunctionInfo* symbol_info, uint32_t flags) { // Actually jump/call to rax. if (flags & CALL_TAIL) { - e.add(e.rsp, StackLayout::GUEST_STACK_SIZE); + e.add(e.rsp, (uint32_t)e.stack_size()); e.jmp(e.rax); } else { e.call(e.rax); @@ -250,7 +250,7 @@ void IssueCallIndirect(X64Emitter& e, Value* target, uint32_t flags) { // Actually jump/call to rax. 
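+  // (The guest frame size is dynamic now: GUEST_STACK_SIZE plus however
+  //  much the emitter reserved for locals. A tail call has to pop the
+  //  whole frame before jumping, so query the emitter rather than using
+  //  the old fixed constant.)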
if (flags & CALL_TAIL) { - e.add(e.rsp, StackLayout::GUEST_STACK_SIZE); + e.add(e.rsp, (uint32_t)e.stack_size()); e.jmp(e.rax); } else { e.call(e.rax); @@ -397,6 +397,7 @@ table->AddSequence(OPCODE_CALL_EXTERN, [](X64Emitter& e, Instr*& i) { e.mov(e.r8, (uint64_t)symbol_info->extern_arg0()); e.mov(e.r9, (uint64_t)symbol_info->extern_arg1()); TransitionToHost(e); + ReloadRDX(e); } i = e.Advance(i); return true; diff --git a/src/alloy/backend/x64/lowering/op_utils.inl b/src/alloy/backend/x64/lowering/op_utils.inl index 3e0ed6789..5ec86b65d 100644 --- a/src/alloy/backend/x64/lowering/op_utils.inl +++ b/src/alloy/backend/x64/lowering/op_utils.inl @@ -28,7 +28,7 @@ namespace { void LoadEflags(X64Emitter& e) { #if STORE_EFLAGS e.mov(e.eax, e.dword[e.rsp + STASH_OFFSET]); - e.push(e.ax); + e.push(e.rax); e.popf(); #else // EFLAGS already present. @@ -37,7 +37,7 @@ void LoadEflags(X64Emitter& e) { void StoreEflags(X64Emitter& e) { #if STORE_EFLAGS e.pushf(); - e.pop(e.word[e.rsp + STASH_OFFSET]); + e.pop(e.qword[e.rsp + STASH_OFFSET]); #else // EFLAGS should have CA set? // (so long as we don't fuck with it) @@ -84,7 +84,7 @@ void MovMem64(X64Emitter& e, RegExp& addr, uint64_t v) { void CallNative(X64Emitter& e, void* target) { e.mov(e.rax, (uint64_t)target); e.call(e.rax); - e.mov(e.rcx, e.qword[e.rsp + StackLayout::RCX_HOME]); + e.mov(e.rcx, e.qword[e.rsp + StackLayout::GUEST_RCX_HOME]); e.mov(e.rdx, e.qword[e.rcx + 8]); // membase } diff --git a/src/alloy/backend/x64/x64_code_cache.cc b/src/alloy/backend/x64/x64_code_cache.cc index 2b2bf322d..7282c2e23 100644 --- a/src/alloy/backend/x64/x64_code_cache.cc +++ b/src/alloy/backend/x64/x64_code_cache.cc @@ -267,10 +267,7 @@ void X64CodeChunk::AddTableEntry(uint8_t* code, size_t code_size, unwind_code.OpInfo = stack_size / 8 - 1; } else { // TODO(benvanik): take as parameters? - uint8_t prolog_size = 17; - - // This doesn't work, for some reason. - XEASSERTALWAYS(); + uint8_t prolog_size = 7; // http://msdn.microsoft.com/en-us/library/ddssxxy8.aspx UNWIND_INFO* unwind_info = (UNWIND_INFO*)(buffer + unwind_info_offset); @@ -284,7 +281,7 @@ void X64CodeChunk::AddTableEntry(uint8_t* code, size_t code_size, // http://msdn.microsoft.com/en-us/library/ck9asaa9.aspx size_t co = 0; auto& unwind_code = unwind_info->UnwindCode[co++]; - unwind_code.CodeOffset = 17; // end of instruction + 1 == offset of next instruction + unwind_code.CodeOffset = 7; // end of instruction + 1 == offset of next instruction unwind_code.UnwindOp = UWOP_ALLOC_LARGE; unwind_code.OpInfo = 0; unwind_code = unwind_info->UnwindCode[co++]; diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index 5aee9fb59..c8ddaddb4 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -64,14 +64,15 @@ int X64Emitter::Emit( } // Fill the generator with code. - int result = Emit(builder); + size_t stack_size = 0; + int result = Emit(builder, stack_size); if (result) { return result; } // Copy the final code to the cache and relocate it. out_code_size = getSize(); - out_code_address = Emplace(StackLayout::GUEST_STACK_SIZE); + out_code_address = Emplace(stack_size); // Stash source map. if (debug_info_flags & DEBUG_INFO_SOURCE_MAP) { @@ -99,7 +100,7 @@ void* X64Emitter::Emplace(size_t stack_size) { #define XEALIGN(value, align) ((value + align - 1) & ~(align - 1)) -int X64Emitter::Emit(HIRBuilder* builder) { +int X64Emitter::Emit(HIRBuilder* builder, size_t& out_stack_size) { // These are the registers we will not be using. 
All others are fare game. const uint32_t reserved_regs = GetRegBit(rax) | // scratch @@ -125,7 +126,7 @@ int X64Emitter::Emit(HIRBuilder* builder) { // Calculate stack size. We need to align things to their natural sizes. // This could be much better (sort by type/etc). auto locals = builder->locals(); - size_t stack_offset = 0; + size_t stack_offset = StackLayout::GUEST_STACK_SIZE; for (auto it = locals.begin(); it != locals.end(); ++it) { auto slot = *it; size_t type_size = GetTypeSize(slot->type); @@ -134,6 +135,9 @@ int X64Emitter::Emit(HIRBuilder* builder) { slot->set_constant(stack_offset); stack_offset += type_size; } + // Ensure 16b alignment. + stack_offset -= StackLayout::GUEST_STACK_SIZE; + stack_offset = XEALIGN(stack_offset, 16); // Function prolog. // Must be 16b aligned. @@ -147,11 +151,13 @@ int X64Emitter::Emit(HIRBuilder* builder) { // X64CodeCache, which dynamically generates exception information. // Adding or changing anything here must be matched! const bool emit_prolog = true; - const size_t stack_size = StackLayout::GUEST_STACK_SIZE; + const size_t stack_size = StackLayout::GUEST_STACK_SIZE + stack_offset; + XEASSERT((stack_size + 8) % 16 == 0); + out_stack_size = stack_size; + stack_size_ = stack_size; if (emit_prolog) { - mov(qword[rsp + 8 * 2], rdx); - mov(qword[rsp + 8 * 1], rcx); - sub(rsp, stack_size); + sub(rsp, (uint32_t)stack_size); + mov(qword[rsp + StackLayout::GUEST_RCX_HOME], rcx); } auto lowering_table = backend_->lowering_table(); @@ -187,9 +193,8 @@ int X64Emitter::Emit(HIRBuilder* builder) { // Function epilog. L("epilog"); if (emit_prolog) { - add(rsp, stack_size); - mov(rcx, qword[rsp + 8 * 1]); - mov(rdx, qword[rsp + 8 * 2]); + mov(rcx, qword[rsp + StackLayout::GUEST_RCX_HOME]); + add(rsp, (uint32_t)stack_size); } ret(); diff --git a/src/alloy/backend/x64/x64_emitter.h b/src/alloy/backend/x64/x64_emitter.h index c5bc51e05..4962dab14 100644 --- a/src/alloy/backend/x64/x64_emitter.h +++ b/src/alloy/backend/x64/x64_emitter.h @@ -145,9 +145,11 @@ public: void MarkSourceOffset(hir::Instr* i); + size_t stack_size() const { return stack_size_; } + protected: void* Emplace(size_t stack_size); - int Emit(hir::HIRBuilder* builder); + int Emit(hir::HIRBuilder* builder, size_t& out_stack_size); protected: runtime::Runtime* runtime_; @@ -168,6 +170,8 @@ protected: size_t source_map_count_; Arena source_map_arena_; + + size_t stack_size_; }; diff --git a/src/alloy/backend/x64/x64_thunk_emitter.cc b/src/alloy/backend/x64/x64_thunk_emitter.cc index 0bd7239f6..7fc6fab60 100644 --- a/src/alloy/backend/x64/x64_thunk_emitter.cc +++ b/src/alloy/backend/x64/x64_thunk_emitter.cc @@ -38,7 +38,8 @@ HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() { mov(qword[rsp + 8 * 1], rcx); sub(rsp, stack_size); - mov(qword[rsp + 56], rbx); + mov(qword[rsp + 48], rbx); + mov(qword[rsp + 56], rcx); mov(qword[rsp + 64], rbp); mov(qword[rsp + 72], rsi); mov(qword[rsp + 80], rdi); @@ -74,7 +75,8 @@ HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() { movaps(xmm14, ptr[rsp + 256]); movaps(xmm15, ptr[rsp + 272]);*/ - mov(rbx, qword[rsp + 56]); + mov(rbx, qword[rsp + 48]); + mov(rcx, qword[rsp + 56]); mov(rbp, qword[rsp + 64]); mov(rsi, qword[rsp + 72]); mov(rdi, qword[rsp + 80]); @@ -104,7 +106,8 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() { mov(qword[rsp + 8 * 1], rcx); sub(rsp, stack_size); - mov(qword[rsp + 56], rbx); + mov(qword[rsp + 48], rbx); + mov(qword[rsp + 56], rcx); mov(qword[rsp + 64], rbp); mov(qword[rsp + 72], rsi); mov(qword[rsp + 80], rdi); @@ 
-120,7 +123,8 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() { mov(r8, r9); call(rax); - mov(rbx, qword[rsp + 56]); + mov(rbx, qword[rsp + 48]); + mov(rcx, qword[rsp + 56]); mov(rbp, qword[rsp + 64]); mov(rsi, qword[rsp + 72]); mov(rdi, qword[rsp + 80]); @@ -128,7 +132,7 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() { mov(r13, qword[rsp + 96]); mov(r14, qword[rsp + 104]); mov(r15, qword[rsp + 112]); - + add(rsp, stack_size); mov(rcx, qword[rsp + 8 * 1]); mov(rdx, qword[rsp + 8 * 2]); diff --git a/src/alloy/backend/x64/x64_thunk_emitter.h b/src/alloy/backend/x64/x64_thunk_emitter.h index 6559ab9a5..a9f27650b 100644 --- a/src/alloy/backend/x64/x64_thunk_emitter.h +++ b/src/alloy/backend/x64/x64_thunk_emitter.h @@ -25,15 +25,18 @@ namespace x64 { * ---------------------------- * NOTE: stack must always be 16b aligned. * + * Thunk stack: * +------------------+ * | arg temp, 3 * 8 | rsp + 0 * | | * | | * +------------------+ - * | scratch, 24b | rsp + 32 + * | scratch, 16b | rsp + 32 * | | * +------------------+ - * | rbx | rsp + 56 + * | rbx | rsp + 48 + * +------------------+ + * | rcx / context | rsp + 56 * +------------------+ * | rbp | rsp + 64 * +------------------+ @@ -90,17 +93,30 @@ namespace x64 { * | | * +------------------+ * + * Guest stack: + * +------------------+ + * | arg temp, 3 * 8 | rsp + 0 + * | | + * | | + * +------------------+ + * | scratch, 32b | rsp + 32 + * | | + * +------------------+ + * | rcx / context | rsp + 64 + * +------------------+ + * ... locals ... + * +------------------+ + * | (return address) | + * +------------------+ + * */ class StackLayout { public: - const static size_t GUEST_STACK_SIZE = 120; - const static size_t THUNK_STACK_SIZE = 120; - const static size_t RETURN_ADDRESS = 120; - const static size_t RCX_HOME = 128; - const static size_t RDX_HOME = 136; + const static size_t GUEST_STACK_SIZE = 72; + const static size_t GUEST_RCX_HOME = 64; }; From bc54fc1ae8077d45f3338be4a46bc1dbaaba120b Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 2 Feb 2014 13:04:42 -0800 Subject: [PATCH 068/184] UNPACK D3DCOLOR and EXTRACT INT32 variable. 
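
For reference, the D3DCOLOR unpack below computes, in scalar form, the same
math as the XMLoadColor-style comment in the diff (a sketch only, with a
hypothetical UnpackD3DCOLOR helper name):

    // 0xAARRGGBB packed integer -> (r, g, b, a) floats in [0, 1].
    void UnpackD3DCOLOR(uint32_t src, float dest[4]) {
      dest[0] = (float)((src >> 16) & 0xFF) * (1.0f / 255.0f);  // R
      dest[1] = (float)((src >> 8) & 0xFF) * (1.0f / 255.0f);   // G
      dest[2] = (float)(src & 0xFF) * (1.0f / 255.0f);          // B
      dest[3] = (float)((src >> 24) & 0xFF) * (1.0f / 255.0f);  // A
    }

The vpshufb with XMMUnpackD3DCOLOR performs the byte extraction and zero
extension in one step (control bytes with the high bit set, the 0xFF lanes,
produce zero bytes), and a single vmulps by XMMOneOver255 replaces the four
scalar multiplies.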
--- .../x64/lowering/lowering_sequences.cc | 52 ++++++++++++++++--- 1 file changed, 45 insertions(+), 7 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index 576ad65a0..2cc0cfd53 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -64,6 +64,8 @@ enum XmmConst { XMMSignMaskPD = 9, XMMByteSwapMask = 10, XMMPermuteControl15 = 11, + XMMUnpackD3DCOLOR = 12, + XMMOneOver255 = 13, }; static const vec128_t xmm_consts[] = { /* XMMZero */ vec128f(0.0f, 0.0f, 0.0f, 0.0f), @@ -78,6 +80,8 @@ static const vec128_t xmm_consts[] = { /* XMMSignMaskPD */ vec128i(0x00000000u, 0x80000000u, 0x00000000u, 0x80000000u), /* XMMByteSwapMask */ vec128i(0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu), /* XMMPermuteControl15 */ vec128b(15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15), + /* XMMUnpackD3DCOLOR */ vec128i(0xFFFFFF02, 0xFFFFFF01, 0xFFFFFF00, 0xFFFFFF02), + /* XMMOneOver255 */ vec128f(1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f), }; // Use consts by first loading the base register then accessing memory: // e.mov(e.rax, XMMCONSTBASE) @@ -124,6 +128,12 @@ static vec128_t lvsr_table[17] = { vec128b( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), vec128b( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), }; +static vec128_t extract_table_32[4] = { + vec128b( 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + vec128b( 7, 6, 5, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + vec128b(11, 10, 9, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + vec128b(15, 14, 13, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), +}; // A note about vectors: // Alloy represents vectors as xyzw pairs, with indices 0123. @@ -2788,16 +2798,30 @@ table->AddSequence(OPCODE_EXTRACT, [](X64Emitter& e, Instr*& i) { } e.EndOp(dest, src); } else if (i->dest->type == INT32_TYPE) { - Reg32 dest; - Xmm src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); if (i->src2.value->IsConstant()) { + Reg32 dest; + Xmm src; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0); e.pextrd(dest, src, i->src2.value->constant.i8); + e.EndOp(dest, src); } else { - UNIMPLEMENTED_SEQ(); + Reg32 dest; + Xmm src; + Reg8 sel; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src, 0, + i->src2.value, sel, 0); + // Get the desired word in xmm0, then extract that. 
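+        // (sel & 3 selects the dword lane; shl 4 turns that into a byte
+        //  offset into extract_table_32, whose rows are pshufb controls
+        //  that move the chosen dword into bytes 0-3 so pextrd 0 can
+        //  read it out.)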
+ e.mov(TEMP_REG, sel); + e.and(TEMP_REG, 0x03); + e.shl(TEMP_REG, 4); + e.mov(e.rax, (uintptr_t)extract_table_32); + e.movaps(e.xmm0, e.ptr[e.rax + TEMP_REG]); + e.vpshufb(e.xmm0, src, e.xmm0); + e.pextrd(dest, e.xmm0, 0); + e.EndOp(dest, src, sel); } - e.EndOp(dest, src); } else if (i->dest->type == FLOAT32_TYPE) { Reg32 dest; Xmm src; @@ -3057,7 +3081,21 @@ table->AddSequence(OPCODE_PACK, [](X64Emitter& e, Instr*& i) { table->AddSequence(OPCODE_UNPACK, [](X64Emitter& e, Instr*& i) { if (i->flags == PACK_TYPE_D3DCOLOR) { - UNIMPLEMENTED_SEQ(); + // ARGB (WXYZ) -> RGBA (XYZW) + // XMLoadColor + // int32_t src = (int32_t)src1.iw; + // dest.f4[0] = (float)((src >> 16) & 0xFF) * (1.0f / 255.0f); + // dest.f4[1] = (float)((src >> 8) & 0xFF) * (1.0f / 255.0f); + // dest.f4[2] = (float)(src & 0xFF) * (1.0f / 255.0f); + // dest.f4[3] = (float)((src >> 24) & 0xFF) * (1.0f / 255.0f); + XmmUnaryOp(e, i, 0, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) { + // src = ZZYYXXWW + // unpack to 000000ZZ,000000YY,000000XX,000000WW + e.mov(e.rax, XMMCONSTBASE); + e.vpshufb(dest, src, XMMCONST(e.rax, XMMUnpackD3DCOLOR)); + // mult by 1/255 + e.vmulps(dest, XMMCONST(e.rax, XMMOneOver255)); + }); } else if (i->flags == PACK_TYPE_FLOAT16_2) { // 1 bit sign, 5 bit exponent, 10 bit mantissa // D3D10 half float format From 05387b499644205f6b5cebba9c58fcb783acc7ec Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 2 Feb 2014 13:10:24 -0800 Subject: [PATCH 069/184] Fixing type mismatches. --- src/alloy/frontend/ppc/ppc_emit_alu.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/alloy/frontend/ppc/ppc_emit_alu.cc b/src/alloy/frontend/ppc/ppc_emit_alu.cc index 7144e7eb6..9b25e824c 100644 --- a/src/alloy/frontend/ppc/ppc_emit_alu.cc +++ b/src/alloy/frontend/ppc/ppc_emit_alu.cc @@ -1027,7 +1027,8 @@ XEEMITTER(rlwnmx, 0x5C000000, M )(PPCHIRBuilder& f, InstrData& i) { // m <- MASK(MB+32, ME+32) // RA <- r & m Value* v = f.Truncate(f.LoadGPR(i.M.RT), INT32_TYPE); - Value* sh = f.And(f.LoadGPR(i.M.SH), f.LoadConstant(0x1F)); + Value* sh = f.And(f.Truncate(f.LoadGPR(i.M.SH), INT32_TYPE), + f.LoadConstant(0x1F)); v = f.RotateLeft(v, sh); // Compiler sometimes masks with 0xFFFFFFFF (identity) - avoid the work here // as our truncation/zero-extend does it for us. @@ -1197,7 +1198,7 @@ XEEMITTER(srawx, 0x7C000630, X )(PPCHIRBuilder& f, InstrData& i) { Value* v = f.Truncate(f.LoadGPR(i.X.RT), INT32_TYPE); Value* sh = f.And( f.Truncate(f.LoadGPR(i.X.RB), INT32_TYPE), - f.LoadConstant((int8_t)0x7F)); + f.LoadConstant(0x7F)); // CA is set if any bits are shifted out of the right and if the result // is negative. Value* mask = f.Not(f.Shl(f.LoadConstant(-1), sh)); From 2d65bea0ea52237f5ba4096ac9b6cc6e5abce485 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 2 Feb 2014 13:34:03 -0800 Subject: [PATCH 070/184] Fix warning. --- src/alloy/runtime/debug_info.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/alloy/runtime/debug_info.cc b/src/alloy/runtime/debug_info.cc index b7b060ef8..ad6056eec 100644 --- a/src/alloy/runtime/debug_info.cc +++ b/src/alloy/runtime/debug_info.cc @@ -62,7 +62,7 @@ SourceMapEntry* DebugInfo::LookupHIROffset(uint64_t offset) { SourceMapEntry* DebugInfo::LookupCodeOffset(uint64_t offset) { // TODO(benvanik): binary search? We know the list is sorted by code order. 
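   // source_map_count_ is unsigned (presumably a size_t); a 64-bit
   // signed index keeps the n >= 0 termination test valid while avoiding
   // the conversion warning the old int triggered.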
- for (int n = source_map_count_ - 1; n >= 0; n--) { + for (int64_t n = source_map_count_ - 1; n >= 0; n--) { auto entry = &source_map_[n]; if (entry->code_offset <= offset) { return entry; From ef5f59ed0b82eb9934e3f641df4eb6b81ecb5038 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 2 Feb 2014 14:35:16 -0800 Subject: [PATCH 071/184] I seem to relearn blr != return once every month or so. --- src/alloy/backend/ivm/ivm_function.cc | 4 +++- src/alloy/backend/ivm/ivm_function.h | 3 ++- src/alloy/backend/ivm/ivm_intcode.cc | 24 ++++++++++++++++++++-- src/alloy/backend/ivm/ivm_intcode.h | 2 ++ src/alloy/backend/x64/x64_function.cc | 2 +- src/alloy/backend/x64/x64_function.h | 3 ++- src/alloy/frontend/ppc/ppc_emit_control.cc | 12 +++++++++++ src/alloy/hir/hir_builder.cc | 6 ++++++ src/alloy/hir/hir_builder.h | 1 + src/alloy/hir/opcodes.h | 4 +++- src/alloy/hir/opcodes.inl | 6 ++++++ src/alloy/runtime/function.cc | 4 ++-- src/alloy/runtime/function.h | 5 +++-- src/xenia/cpu/processor.cc | 2 +- tools/alloy-sandbox/alloy-sandbox.cc | 2 +- 15 files changed, 67 insertions(+), 13 deletions(-) diff --git a/src/alloy/backend/ivm/ivm_function.cc b/src/alloy/backend/ivm/ivm_function.cc index 701cbac1c..ff60f5994 100644 --- a/src/alloy/backend/ivm/ivm_function.cc +++ b/src/alloy/backend/ivm/ivm_function.cc @@ -105,7 +105,7 @@ void IVMFunction::OnBreakpointHit(ThreadState* thread_state, IntCode* i) { #undef TRACE_SOURCE_OFFSET -int IVMFunction::CallImpl(ThreadState* thread_state) { +int IVMFunction::CallImpl(ThreadState* thread_state, uint64_t return_address) { // Setup register file on stack. auto stack = (IVMStack*)thread_state->backend_data(); auto register_file = (Register*)stack->Alloc(register_count_); @@ -122,6 +122,8 @@ int IVMFunction::CallImpl(ThreadState* thread_state) { ics.did_saturate = 0; ics.access_callbacks = thread_state->runtime()->access_callbacks(); ics.thread_state = thread_state; + ics.return_address = return_address; + ics.call_return_address = 0; volatile int* suspend_flag_address = thread_state->suspend_flag_address(); diff --git a/src/alloy/backend/ivm/ivm_function.h b/src/alloy/backend/ivm/ivm_function.h index 7ee24cddf..0169ee5b1 100644 --- a/src/alloy/backend/ivm/ivm_function.h +++ b/src/alloy/backend/ivm/ivm_function.h @@ -31,7 +31,8 @@ public: protected: virtual int AddBreakpointImpl(runtime::Breakpoint* breakpoint); virtual int RemoveBreakpointImpl(runtime::Breakpoint* breakpoint); - virtual int CallImpl(runtime::ThreadState* thread_state); + virtual int CallImpl(runtime::ThreadState* thread_state, + uint64_t return_address); private: IntCode* GetIntCodeAtSourceOffset(uint64_t offset); diff --git a/src/alloy/backend/ivm/ivm_intcode.cc b/src/alloy/backend/ivm/ivm_intcode.cc index f1460e2c9..6aa87ac8e 100644 --- a/src/alloy/backend/ivm/ivm_intcode.cc +++ b/src/alloy/backend/ivm/ivm_intcode.cc @@ -580,7 +580,9 @@ uint32_t IntCode_CALL_XX(IntCodeState& ics, const IntCode* i, uint32_t reg) { ics.thread_state->runtime()->ResolveFunction(symbol_info->address(), &fn); XEASSERTNOTNULL(fn); // TODO(benvanik): proper tail call support, somehow. - fn->Call(ics.thread_state); + uint64_t return_address = + (i->flags & CALL_TAIL) ? 
ics.return_address : ics.call_return_address; + fn->Call(ics.thread_state, return_address); if (i->flags & CALL_TAIL) { return IA_RETURN; } @@ -645,12 +647,21 @@ int Translate_CALL_TRUE(TranslationContext& ctx, Instr* i) { uint32_t IntCode_CALL_INDIRECT_XX(IntCodeState& ics, const IntCode* i, uint32_t reg) { uint64_t target = ics.rf[reg].u32; + // Check if return address - if so, return. + if (i->flags & CALL_POSSIBLE_RETURN) { + if (target == ics.return_address) { + return IA_RETURN; + } + } + // Real call. Function* fn = NULL; ics.thread_state->runtime()->ResolveFunction(target, &fn); XEASSERTNOTNULL(fn); // TODO(benvanik): proper tail call support, somehow. - fn->Call(ics.thread_state); + uint64_t return_address = + (i->flags & CALL_TAIL) ? ics.return_address : ics.call_return_address; + fn->Call(ics.thread_state, return_address); if (i->flags & CALL_TAIL) { return IA_RETURN; } @@ -775,6 +786,14 @@ int Translate_RETURN_TRUE(TranslationContext& ctx, Instr* i) { return DispatchToC(ctx, i, fns[i->src1.value->type]); } +uint32_t IntCode_SET_RETURN_ADDRESS(IntCodeState& ics, const IntCode* i) { + ics.call_return_address = ics.rf[i->src1_reg].u32; + return IA_NEXT; +} +int Translate_SET_RETURN_ADDRESS(TranslationContext& ctx, Instr* i) { + return DispatchToC(ctx, i, IntCode_SET_RETURN_ADDRESS); +} + uint32_t IntCode_BRANCH_XX(IntCodeState& ics, const IntCode* i, uint32_t reg) { return ics.rf[reg].u32; } @@ -4101,6 +4120,7 @@ static const TranslateFn dispatch_table[] = { Translate_CALL_EXTERN, Translate_RETURN, Translate_RETURN_TRUE, + Translate_SET_RETURN_ADDRESS, Translate_BRANCH, Translate_BRANCH_TRUE, diff --git a/src/alloy/backend/ivm/ivm_intcode.h b/src/alloy/backend/ivm/ivm_intcode.h index dcb59c106..ded43d5e1 100644 --- a/src/alloy/backend/ivm/ivm_intcode.h +++ b/src/alloy/backend/ivm/ivm_intcode.h @@ -48,6 +48,8 @@ typedef struct { int8_t did_saturate; runtime::RegisterAccessCallbacks* access_callbacks; runtime::ThreadState* thread_state; + uint64_t return_address; + uint64_t call_return_address; } IntCodeState; diff --git a/src/alloy/backend/x64/x64_function.cc b/src/alloy/backend/x64/x64_function.cc index 3f7f4bc57..0f6b4d12b 100644 --- a/src/alloy/backend/x64/x64_function.cc +++ b/src/alloy/backend/x64/x64_function.cc @@ -42,7 +42,7 @@ int X64Function::RemoveBreakpointImpl(Breakpoint* breakpoint) { return 0; } -int X64Function::CallImpl(ThreadState* thread_state) { +int X64Function::CallImpl(ThreadState* thread_state, uint64_t return_address) { auto backend = (X64Backend*)thread_state->runtime()->backend(); auto thunk = backend->host_to_guest_thunk(); thunk( diff --git a/src/alloy/backend/x64/x64_function.h b/src/alloy/backend/x64/x64_function.h index 5166fd879..0f9659ca6 100644 --- a/src/alloy/backend/x64/x64_function.h +++ b/src/alloy/backend/x64/x64_function.h @@ -33,7 +33,8 @@ public: protected: virtual int AddBreakpointImpl(runtime::Breakpoint* breakpoint); virtual int RemoveBreakpointImpl(runtime::Breakpoint* breakpoint); - virtual int CallImpl(runtime::ThreadState* thread_state); + virtual int CallImpl(runtime::ThreadState* thread_state, + uint64_t return_address); private: void* machine_code_; diff --git a/src/alloy/frontend/ppc/ppc_emit_control.cc b/src/alloy/frontend/ppc/ppc_emit_control.cc index 9815c4649..0365c849b 100644 --- a/src/alloy/frontend/ppc/ppc_emit_control.cc +++ b/src/alloy/frontend/ppc/ppc_emit_control.cc @@ -35,6 +35,7 @@ int InstrEmit_branch( // be correct for returns. 
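   // SET_RETURN_ADDRESS stashes cia+4 (the value LR will hold) alongside
   // the call, so a callee can recognize an indirect branch back to that
   // address as a plain return instead of resolving a new function.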
if (lk) { Value* return_address = f.LoadConstant(cia + 4); + f.SetReturnAddress(return_address); f.StoreLR(return_address); } @@ -104,6 +105,10 @@ int InstrEmit_branch( // // TODO(benvanik): evaluate hint here. // c.je(e.GetReturnLabel(), kCondHintLikely); //} +#if 0 + // This breaks longjump, as that uses blr with a non-return lr. + // It'd be nice to move SET_RETURN_ADDRESS semantics up into context + // so that we can just use this. if (!lk && nia_is_lr) { // Return (most likely). // TODO(benvanik): test? ReturnCheck()? @@ -116,7 +121,14 @@ int InstrEmit_branch( f.Return(); } } else { +#else + { +#endif // Jump to pointer. + bool likely_return = !lk && nia_is_lr; + if (likely_return) { + call_flags |= CALL_POSSIBLE_RETURN; + } if (cond) { if (!expect_true) { cond = f.IsFalse(cond); diff --git a/src/alloy/hir/hir_builder.cc b/src/alloy/hir/hir_builder.cc index 44f1b758c..5e0be6dad 100644 --- a/src/alloy/hir/hir_builder.cc +++ b/src/alloy/hir/hir_builder.cc @@ -642,6 +642,12 @@ void HIRBuilder::ReturnTrue(Value* cond) { EndBlock(); } +void HIRBuilder::SetReturnAddress(Value* value) { + Instr* i = AppendInstr(OPCODE_SET_RETURN_ADDRESS_info, 0); + i->set_src1(value); + i->src2.value = i->src3.value = NULL; +} + void HIRBuilder::Branch(Label* label, uint32_t branch_flags) { Instr* i = AppendInstr(OPCODE_BRANCH_info, branch_flags); i->src1.label = label; diff --git a/src/alloy/hir/hir_builder.h b/src/alloy/hir/hir_builder.h index 542b1e7ae..1ebdb01a1 100644 --- a/src/alloy/hir/hir_builder.h +++ b/src/alloy/hir/hir_builder.h @@ -81,6 +81,7 @@ public: void CallExtern(runtime::FunctionInfo* symbol_info); void Return(); void ReturnTrue(Value* cond); + void SetReturnAddress(Value* value); void Branch(Label* label, uint32_t branch_flags = 0); void Branch(Block* block, uint32_t branch_flags = 0); diff --git a/src/alloy/hir/opcodes.h b/src/alloy/hir/opcodes.h index 14e3d5d65..b52e7b55d 100644 --- a/src/alloy/hir/opcodes.h +++ b/src/alloy/hir/opcodes.h @@ -18,7 +18,8 @@ namespace hir { enum CallFlags { - CALL_TAIL = (1 << 1), + CALL_TAIL = (1 << 1), + CALL_POSSIBLE_RETURN = (1 << 2), }; enum BranchFlags { BRANCH_LIKELY = (1 << 1), @@ -97,6 +98,7 @@ enum Opcode { OPCODE_CALL_EXTERN, OPCODE_RETURN, OPCODE_RETURN_TRUE, + OPCODE_SET_RETURN_ADDRESS, OPCODE_BRANCH, OPCODE_BRANCH_TRUE, diff --git a/src/alloy/hir/opcodes.inl b/src/alloy/hir/opcodes.inl index df1427db2..4fc7bd9dd 100644 --- a/src/alloy/hir/opcodes.inl +++ b/src/alloy/hir/opcodes.inl @@ -92,6 +92,12 @@ DEFINE_OPCODE( OPCODE_SIG_X_V, OPCODE_FLAG_BRANCH); +DEFINE_OPCODE( + OPCODE_SET_RETURN_ADDRESS, + "set_return_address", + OPCODE_SIG_X_V, + 0); + DEFINE_OPCODE( OPCODE_BRANCH, "branch", diff --git a/src/alloy/runtime/function.cc b/src/alloy/runtime/function.cc index f8b74f48f..853808d53 100644 --- a/src/alloy/runtime/function.cc +++ b/src/alloy/runtime/function.cc @@ -73,7 +73,7 @@ Breakpoint* Function::FindBreakpoint(uint64_t address) { return result; } -int Function::Call(ThreadState* thread_state) { +int Function::Call(ThreadState* thread_state, uint64_t return_address) { ThreadState* original_thread_state = ThreadState::Get(); if (original_thread_state != thread_state) { ThreadState::Bind(thread_state); @@ -94,7 +94,7 @@ int Function::Call(ThreadState* thread_state) { result = 1; } } else { - CallImpl(thread_state); + CallImpl(thread_state, return_address); } if (original_thread_state != thread_state) { diff --git a/src/alloy/runtime/function.h b/src/alloy/runtime/function.h index 629276c0b..22f4df0aa 100644 --- 
a/src/alloy/runtime/function.h +++ b/src/alloy/runtime/function.h @@ -36,13 +36,14 @@ public: int AddBreakpoint(Breakpoint* breakpoint); int RemoveBreakpoint(Breakpoint* breakpoint); - int Call(ThreadState* thread_state); + int Call(ThreadState* thread_state, uint64_t return_address); protected: Breakpoint* FindBreakpoint(uint64_t address); virtual int AddBreakpointImpl(Breakpoint* breakpoint) { return 0; } virtual int RemoveBreakpointImpl(Breakpoint* breakpoint) { return 0; } - virtual int CallImpl(ThreadState* thread_state) = 0; + virtual int CallImpl(ThreadState* thread_state, + uint64_t return_address) = 0; protected: uint64_t address_; diff --git a/src/xenia/cpu/processor.cc b/src/xenia/cpu/processor.cc index 6fd7347dd..db11ef2ab 100644 --- a/src/xenia/cpu/processor.cc +++ b/src/xenia/cpu/processor.cc @@ -165,7 +165,7 @@ int Processor::Execute(XenonThreadState* thread_state, uint64_t address) { context->lr = lr; // Execute the function. - fn->Call(thread_state); + fn->Call(thread_state, lr); return 0; } diff --git a/tools/alloy-sandbox/alloy-sandbox.cc b/tools/alloy-sandbox/alloy-sandbox.cc index da8d1b80e..e7f6bb2d6 100644 --- a/tools/alloy-sandbox/alloy-sandbox.cc +++ b/tools/alloy-sandbox/alloy-sandbox.cc @@ -49,7 +49,7 @@ int alloy_sandbox(int argc, xechar_t** argv) { ctx->lr = 0xBEBEBEBE; ctx->r[5] = 10; ctx->r[25] = 25; - fn->Call(thread_state); + fn->Call(thread_state, ctx->lr); auto result = ctx->r[11]; delete thread_state; From 0d88e83daa1c7443b9d8966d2afc254584ab695d Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 2 Feb 2014 14:41:57 -0800 Subject: [PATCH 072/184] Avoiding function lookup for compiled functions. Still need caching. --- src/alloy/backend/ivm/ivm_intcode.cc | 6 ++++-- src/alloy/backend/x64/lowering/lowering_sequences.cc | 10 +++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/alloy/backend/ivm/ivm_intcode.cc b/src/alloy/backend/ivm/ivm_intcode.cc index 6aa87ac8e..cae092909 100644 --- a/src/alloy/backend/ivm/ivm_intcode.cc +++ b/src/alloy/backend/ivm/ivm_intcode.cc @@ -576,8 +576,10 @@ int Translate_TRAP_TRUE(TranslationContext& ctx, Instr* i) { uint32_t IntCode_CALL_XX(IntCodeState& ics, const IntCode* i, uint32_t reg) { FunctionInfo* symbol_info = (FunctionInfo*)ics.rf[reg].u64; - Function* fn = NULL; - ics.thread_state->runtime()->ResolveFunction(symbol_info->address(), &fn); + Function* fn = symbol_info->function(); + if (!fn) { + ics.thread_state->runtime()->ResolveFunction(symbol_info->address(), &fn); + } XEASSERTNOTNULL(fn); // TODO(benvanik): proper tail call support, somehow. uint64_t return_address = diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index 2cc0cfd53..bb59ca222 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -233,11 +233,15 @@ void TransitionToHost(X64Emitter& e) { e.call(e.rax); } void IssueCall(X64Emitter& e, FunctionInfo* symbol_info, uint32_t flags) { - auto fn = symbol_info->function(); + auto fn = (X64Function*)symbol_info->function(); // Resolve address to the function to call and store in rax. // TODO(benvanik): caching/etc. For now this makes debugging easier. - e.mov(e.rdx, (uint64_t)symbol_info); - CallNative(e, ResolveFunctionSymbol); + if (fn) { + e.mov(e.rax, (uint64_t)fn->machine_code()); + } else { + e.mov(e.rdx, (uint64_t)symbol_info); + CallNative(e, ResolveFunctionSymbol); + } // Actually jump/call to rax. 
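   // CALL_TAIL lowers to a bare jmp instead of a call: control never
   // comes back here, and the callee's ret unwinds on our behalf. (The
   // frame bookkeeping here is still wrong; the next patch, "Fixing tail
   // calls in the jit", repairs it.)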
if (flags & CALL_TAIL) { From 5309356908c823b049fb7fe9c267e99a4bfa5d5d Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Thu, 6 Feb 2014 21:53:31 -0800 Subject: [PATCH 073/184] Fixing tail calls in the jit. --- .../x64/lowering/lowering_sequences.cc | 31 +++++++++++++++++-- src/alloy/backend/x64/lowering/tracers.cc | 6 ++-- src/alloy/backend/x64/x64_emitter.cc | 4 +++ src/alloy/backend/x64/x64_function.cc | 2 +- src/alloy/backend/x64/x64_thunk_emitter.cc | 2 ++ src/alloy/backend/x64/x64_thunk_emitter.h | 8 ++++- 6 files changed, 47 insertions(+), 6 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index bb59ca222..38007aed5 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -245,17 +245,30 @@ void IssueCall(X64Emitter& e, FunctionInfo* symbol_info, uint32_t flags) { // Actually jump/call to rax. if (flags & CALL_TAIL) { + // Pass the callers return address over. + e.mov(e.rdx, e.qword[e.rsp + StackLayout::GUEST_RET_ADDR]); + e.add(e.rsp, (uint32_t)e.stack_size()); e.jmp(e.rax); } else { + // Return address is from the previous SET_RETURN_ADDRESS. + e.mov(e.rdx, e.qword[e.rsp + StackLayout::GUEST_CALL_RET_ADDR]); + e.call(e.rax); } } void IssueCallIndirect(X64Emitter& e, Value* target, uint32_t flags) { - // Resolve address to the function to call and store in rax. - // TODO(benvanik): caching/etc. For now this makes debugging easier. Reg64 r; e.BeginOp(target, r, 0); + + // Check if return. + if (flags & CALL_POSSIBLE_RETURN) { + e.cmp(r.cvt32(), e.dword[e.rsp + StackLayout::GUEST_RET_ADDR]); + e.je("epilog", CodeGenerator::T_NEAR); + } + + // Resolve address to the function to call and store in rax. + // TODO(benvanik): caching/etc. For now this makes debugging easier. if (r != e.rdx) { e.mov(e.rdx, r); } @@ -264,9 +277,15 @@ void IssueCallIndirect(X64Emitter& e, Value* target, uint32_t flags) { // Actually jump/call to rax. if (flags & CALL_TAIL) { + // Pass the callers return address over. + e.mov(e.rdx, e.qword[e.rsp + StackLayout::GUEST_RET_ADDR]); + e.add(e.rsp, (uint32_t)e.stack_size()); e.jmp(e.rax); } else { + // Return address is from the previous SET_RETURN_ADDRESS. 
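+    // rdx carries the guest return address into the callee; the callee's
+    // prolog (added to x64_emitter.cc in this same patch) homes it at
+    // GUEST_RET_ADDR in its own frame so its return checks can compare
+    // against it.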
+ e.mov(e.rdx, e.qword[e.rsp + StackLayout::GUEST_CALL_RET_ADDR]); + e.call(e.rax); } } @@ -434,6 +453,14 @@ table->AddSequence(OPCODE_RETURN_TRUE, [](X64Emitter& e, Instr*& i) { return true; }); +table->AddSequence(OPCODE_SET_RETURN_ADDRESS, [](X64Emitter& e, Instr*& i) { + XEASSERT(i->src1.value->IsConstant()); + e.mov(e.qword[e.rsp + StackLayout::GUEST_CALL_RET_ADDR], + i->src1.value->AsUint64()); + i = e.Advance(i); + return true; +}); + // -------------------------------------------------------------------------- // Branches // -------------------------------------------------------------------------- diff --git a/src/alloy/backend/x64/lowering/tracers.cc b/src/alloy/backend/x64/lowering/tracers.cc index 0d7975847..b4c0ae74c 100644 --- a/src/alloy/backend/x64/lowering/tracers.cc +++ b/src/alloy/backend/x64/lowering/tracers.cc @@ -29,10 +29,12 @@ namespace lowering { #define DFLUSH() #define DPRINT +#define TARGET_THREAD 1 + #define IFLUSH() fflush(stdout) -#define IPRINT if (thread_state->thread_id() == 1) printf +#define IPRINT if (thread_state->thread_id() == TARGET_THREAD) printf #define DFLUSH() fflush(stdout) -#define DPRINT DFLUSH(); if (thread_state->thread_id() == 1) printf +#define DPRINT DFLUSH(); if (thread_state->thread_id() == TARGET_THREAD) printf void TraceString(void* raw_context, const char* str) { diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index c8ddaddb4..02a1aa132 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -158,6 +158,10 @@ int X64Emitter::Emit(HIRBuilder* builder, size_t& out_stack_size) { if (emit_prolog) { sub(rsp, (uint32_t)stack_size); mov(qword[rsp + StackLayout::GUEST_RCX_HOME], rcx); + mov(qword[rsp + StackLayout::GUEST_RET_ADDR], rdx); + mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], 0); + // ReloadRDX: + mov(rdx, qword[rcx + 8]); // membase } auto lowering_table = backend_->lowering_table(); diff --git a/src/alloy/backend/x64/x64_function.cc b/src/alloy/backend/x64/x64_function.cc index 0f6b4d12b..71452ac14 100644 --- a/src/alloy/backend/x64/x64_function.cc +++ b/src/alloy/backend/x64/x64_function.cc @@ -48,6 +48,6 @@ int X64Function::CallImpl(ThreadState* thread_state, uint64_t return_address) { thunk( machine_code_, thread_state->raw_context(), - thread_state->memory()->membase()); + (void*)return_address); return 0; } diff --git a/src/alloy/backend/x64/x64_thunk_emitter.cc b/src/alloy/backend/x64/x64_thunk_emitter.cc index 7fc6fab60..0e1922581 100644 --- a/src/alloy/backend/x64/x64_thunk_emitter.cc +++ b/src/alloy/backend/x64/x64_thunk_emitter.cc @@ -34,6 +34,7 @@ HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() { const size_t stack_size = StackLayout::THUNK_STACK_SIZE; // rsp + 0 = return address + mov(qword[rsp + 8 * 3], r8); mov(qword[rsp + 8 * 2], rdx); mov(qword[rsp + 8 * 1], rcx); sub(rsp, stack_size); @@ -88,6 +89,7 @@ HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() { add(rsp, stack_size); mov(rcx, qword[rsp + 8 * 1]); mov(rdx, qword[rsp + 8 * 2]); + mov(r8, qword[rsp + 8 * 3]); ret(); void* fn = Emplace(stack_size); diff --git a/src/alloy/backend/x64/x64_thunk_emitter.h b/src/alloy/backend/x64/x64_thunk_emitter.h index a9f27650b..ae9c7b967 100644 --- a/src/alloy/backend/x64/x64_thunk_emitter.h +++ b/src/alloy/backend/x64/x64_thunk_emitter.h @@ -104,6 +104,10 @@ namespace x64 { * +------------------+ * | rcx / context | rsp + 64 * +------------------+ + * | guest ret addr | rsp + 72 + * +------------------+ + * | call ret addr 
| rsp + 80 + * +------------------+ * ... locals ... * +------------------+ * | (return address) | @@ -115,8 +119,10 @@ class StackLayout { public: const static size_t THUNK_STACK_SIZE = 120; - const static size_t GUEST_STACK_SIZE = 72; + const static size_t GUEST_STACK_SIZE = 88; const static size_t GUEST_RCX_HOME = 64; + const static size_t GUEST_RET_ADDR = 72; + const static size_t GUEST_CALL_RET_ADDR = 80; }; From c5b70e615f9f7a09a54426d9b2623920d2659b65 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Thu, 6 Feb 2014 22:18:15 -0800 Subject: [PATCH 074/184] Fixing div. --- src/alloy/backend/x64/lowering/lowering_sequences.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index 38007aed5..754fd0d07 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -2199,7 +2199,9 @@ table->AddSequence(OPCODE_DIV, [](X64Emitter& e, Instr*& i) { // RAX = value, RDX = clobbered // TODO(benvanik): make the register allocator put dest_src in RAX? auto Nax = LIKE_REG(e.rax, dest_src); + auto Ndx = LIKE_REG(e.rdx, dest_src); e.mov(Nax, dest_src); + e.xor(Ndx, Ndx); if (i.flags & ARITHMETIC_UNSIGNED) { e.div(src); } else { From 6199e9f7b5cdf590144dd136bc8704a18c9e9b5d Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Thu, 6 Feb 2014 22:18:30 -0800 Subject: [PATCH 075/184] Vector constant support. --- src/alloy/backend/x64/lowering/op_utils.inl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/alloy/backend/x64/lowering/op_utils.inl b/src/alloy/backend/x64/lowering/op_utils.inl index 5ec86b65d..ce8f019db 100644 --- a/src/alloy/backend/x64/lowering/op_utils.inl +++ b/src/alloy/backend/x64/lowering/op_utils.inl @@ -51,7 +51,7 @@ Address Stash(X64Emitter& e, const Xmm& r) { return addr; } -void LoadXmmConstant(X64Emitter& e, Xmm& dest, const vec128_t& v) { +void LoadXmmConstant(X64Emitter& e, const Xmm& dest, const vec128_t& v) { if (!v.low && !v.high) { // zero e.vpxor(dest, dest); @@ -930,7 +930,7 @@ void XmmBinaryOpVC(X64Emitter& e, Instr*& i, xmm_vv_fn vv_fn, e.mov(e.rax, (uint64_t)src2->constant.i64); e.movsd(dest, e.rax); } else { - UNIMPLEMENTED_SEQ(); + LoadXmmConstant(e, dest, src2->constant.v128); } vv_fn(e, *i, dest, src1); } else { @@ -944,7 +944,7 @@ void XmmBinaryOpVC(X64Emitter& e, Instr*& i, xmm_vv_fn vv_fn, e.mov(e.rax, (uint64_t)src2->constant.i64); e.movsd(e.xmm0, e.rax); } else { - UNIMPLEMENTED_SEQ(); + LoadXmmConstant(e, e.xmm0, src2->constant.v128); } vv_fn(e, *i, dest, e.xmm0); } @@ -962,7 +962,7 @@ void XmmBinaryOpCV(X64Emitter& e, Instr*& i, xmm_vv_fn vv_fn, e.mov(e.rax, (uint64_t)src1->constant.i64); e.movsd(dest, e.rax); } else { - UNIMPLEMENTED_SEQ(); + LoadXmmConstant(e, dest, src1->constant.v128); } vv_fn(e, *i, dest, src2); } else { @@ -978,7 +978,7 @@ void XmmBinaryOpCV(X64Emitter& e, Instr*& i, xmm_vv_fn vv_fn, e.mov(e.rax, (uint64_t)src1->constant.i64); e.movsd(dest, e.rax); } else { - UNIMPLEMENTED_SEQ(); + LoadXmmConstant(e, dest, src1->constant.v128); } vv_fn(e, *i, dest, real_src2); } From ee6969648521129f1c28e38de41e7887cca2c0b2 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Thu, 6 Feb 2014 22:18:44 -0800 Subject: [PATCH 076/184] Hiding some D3D11 log spew. 
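The new XETRACED3D macro gates the chatty per-draw/per-state messages
behind --trace_ring_buffer so the common path stays quiet. One caveat:
as defined in the diff it expands to a bare if, which can pair with an
unrelated else at a call site; the conventional dangling-else-safe shape
wraps it (a sketch of that variant, not what this patch does):

    #define XETRACED3D(fmt, ...) \
        do { \
          if (FLAGS_trace_ring_buffer) XELOGGPU(fmt, ##__VA_ARGS__); \
        } while (0)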
--- src/xenia/gpu/d3d11/d3d11_graphics_driver.cc | 31 ++++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc b/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc index ddc1d8d1e..25410bf6f 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc +++ b/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc @@ -20,6 +20,9 @@ using namespace xe::gpu::d3d11; using namespace xe::gpu::xenos; +#define XETRACED3D(fmt, ...) if (FLAGS_trace_ring_buffer) XELOGGPU(fmt, ##__VA_ARGS__) + + D3D11GraphicsDriver::D3D11GraphicsDriver( Memory* memory, IDXGISwapChain* swap_chain, ID3D11Device* device) : GraphicsDriver(memory) { @@ -145,13 +148,13 @@ void D3D11GraphicsDriver::Initialize() { void D3D11GraphicsDriver::InvalidateState( uint32_t mask) { if (mask == XE_GPU_INVALIDATE_MASK_ALL) { - XELOGGPU("D3D11: (invalidate all)"); + XETRACED3D("D3D11: (invalidate all)"); } if (mask & XE_GPU_INVALIDATE_MASK_VERTEX_SHADER) { - XELOGGPU("D3D11: invalidate vertex shader"); + XETRACED3D("D3D11: invalidate vertex shader"); } if (mask & XE_GPU_INVALIDATE_MASK_PIXEL_SHADER) { - XELOGGPU("D3D11: invalidate pixel shader"); + XETRACED3D("D3D11: invalidate pixel shader"); } } @@ -165,13 +168,15 @@ void D3D11GraphicsDriver::SetShader( Shader* shader = shader_cache_->FindOrCreate( type, p, length); - // Disassemble. - const char* source = shader->disasm_src(); - if (!source) { - source = ""; + if (!shader->is_prepared()) { + // Disassemble. + const char* source = shader->disasm_src(); + if (!source) { + source = ""; + } + XETRACED3D("D3D11: set shader %d at %0.8X (%db):\n%s", + type, address, length, source); } - XELOGGPU("D3D11: set shader %d at %0.8X (%db):\n%s", - type, address, length, source); // Stash for later. switch (type) { @@ -293,8 +298,8 @@ void D3D11GraphicsDriver::DrawIndexBuffer( uint32_t index_base, uint32_t index_size, uint32_t endianness) { RegisterFile& rf = register_file_; - XELOGGPU("D3D11: draw indexed %d (%d indicies) from %.8X", - prim_type, index_count, index_base); + XETRACED3D("D3D11: draw indexed %d (%d indicies) from %.8X", + prim_type, index_count, index_base); // Setup shaders/etc. if (SetupDraw(prim_type)) { @@ -318,8 +323,8 @@ void D3D11GraphicsDriver::DrawIndexAuto( uint32_t index_count) { RegisterFile& rf = register_file_; - XELOGGPU("D3D11: draw indexed %d (%d indicies)", - prim_type, index_count); + XETRACED3D("D3D11: draw indexed %d (%d indicies)", + prim_type, index_count); // Setup shaders/etc. 
if (SetupDraw(prim_type)) { From 2403f367b12ef8c7b13e9c80c5c3af422b190817 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sat, 8 Feb 2014 22:00:21 -0800 Subject: [PATCH 077/184] MachineInfo --- src/alloy/backend/backend.cc | 1 + src/alloy/backend/backend.h | 3 +++ src/alloy/backend/ivm/ivm_backend.cc | 14 ++++++++++ src/alloy/backend/machine_info.h | 39 ++++++++++++++++++++++++++++ src/alloy/backend/sources.gypi | 1 + src/alloy/backend/x64/x64_backend.cc | 14 ++++++++++ 6 files changed, 72 insertions(+) create mode 100644 src/alloy/backend/machine_info.h diff --git a/src/alloy/backend/backend.cc b/src/alloy/backend/backend.cc index 2f6531fb5..d49fb713e 100644 --- a/src/alloy/backend/backend.cc +++ b/src/alloy/backend/backend.cc @@ -18,6 +18,7 @@ using namespace alloy::runtime; Backend::Backend(Runtime* runtime) : runtime_(runtime) { + xe_zero_struct(&machine_info_, sizeof(machine_info_)); } Backend::~Backend() { diff --git a/src/alloy/backend/backend.h b/src/alloy/backend/backend.h index 885844d3f..b6c2c431e 100644 --- a/src/alloy/backend/backend.h +++ b/src/alloy/backend/backend.h @@ -11,6 +11,7 @@ #define ALLOY_BACKEND_BACKEND_H_ #include +#include namespace alloy { namespace runtime { class Runtime; } } @@ -27,6 +28,7 @@ public: virtual ~Backend(); runtime::Runtime* runtime() const { return runtime_; } + const MachineInfo* machine_info() const { return &machine_info_; } virtual int Initialize(); @@ -37,6 +39,7 @@ public: protected: runtime::Runtime* runtime_; + MachineInfo machine_info_; }; diff --git a/src/alloy/backend/ivm/ivm_backend.cc b/src/alloy/backend/ivm/ivm_backend.cc index bb2a42f67..6bd51037f 100644 --- a/src/alloy/backend/ivm/ivm_backend.cc +++ b/src/alloy/backend/ivm/ivm_backend.cc @@ -34,6 +34,20 @@ int IVMBackend::Initialize() { return result; } + machine_info_.register_sets[0] = { + 0, + "gpr", + MachineInfo::RegisterSet::INT_TYPES, + 10, + }; + machine_info_.register_sets[1] = { + 1, + "vec", + MachineInfo::RegisterSet::FLOAT_TYPES | + MachineInfo::RegisterSet::VEC_TYPES, + 10, + }; + alloy::tracing::WriteEvent(EventType::Init({ })); diff --git a/src/alloy/backend/machine_info.h b/src/alloy/backend/machine_info.h new file mode 100644 index 000000000..2aa7add22 --- /dev/null +++ b/src/alloy/backend/machine_info.h @@ -0,0 +1,39 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef ALLOY_BACKEND_MACHINE_INFO_H_ +#define ALLOY_BACKEND_MACHINE_INFO_H_ + +#include + + +namespace alloy { +namespace backend { + + +struct MachineInfo { + struct RegisterSet { + enum Types { + INT_TYPES = (1 << 1), + FLOAT_TYPES = (1 << 2), + VEC_TYPES = (1 << 3), + }; + uint8_t id; + char name[4]; + uint32_t types; + uint32_t count; + } register_sets[8]; +}; + + +} // namespace backend +} // namespace alloy + + +#endif // ALLOY_BACKEND_MACHINE_INFO_H_ diff --git a/src/alloy/backend/sources.gypi b/src/alloy/backend/sources.gypi index 154cd75ad..41419ac7a 100644 --- a/src/alloy/backend/sources.gypi +++ b/src/alloy/backend/sources.gypi @@ -5,6 +5,7 @@ 'assembler.h', 'backend.cc', 'backend.h', + 'machine_info.h', 'tracing.h', ], diff --git a/src/alloy/backend/x64/x64_backend.cc b/src/alloy/backend/x64/x64_backend.cc index 031dc6bda..8c1968571 100644 --- a/src/alloy/backend/x64/x64_backend.cc +++ b/src/alloy/backend/x64/x64_backend.cc @@ -41,6 +41,20 @@ int X64Backend::Initialize() { return result; } + machine_info_.register_sets[0] = { + 0, + "gpr", + MachineInfo::RegisterSet::INT_TYPES, + 10, + }; + machine_info_.register_sets[1] = { + 1, + "xmm", + MachineInfo::RegisterSet::FLOAT_TYPES | + MachineInfo::RegisterSet::VEC_TYPES, + 10, + }; + code_cache_ = new X64CodeCache(); result = code_cache_->Initialize(); if (result) { From e36e1acc1a83257190ded1b10c8b32abb42bc4c3 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sat, 8 Feb 2014 22:00:53 -0800 Subject: [PATCH 078/184] TLS access is slow. --- src/alloy/backend/ivm/ivm_intcode.cc | 2 +- src/alloy/backend/x64/lowering/tracers.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/alloy/backend/ivm/ivm_intcode.cc b/src/alloy/backend/ivm/ivm_intcode.cc index cae092909..211f466c7 100644 --- a/src/alloy/backend/ivm/ivm_intcode.cc +++ b/src/alloy/backend/ivm/ivm_intcode.cc @@ -417,7 +417,7 @@ int TranslateInvalid(TranslationContext& ctx, Instr* i) { uint32_t IntCode_COMMENT(IntCodeState& ics, const IntCode* i) { char* value = (char*)(i->src1_reg | ((uint64_t)i->src2_reg << 32)); - IPRINT("XE[t] :%d: %s\n", ics.thread_state->GetThreadID(), value); + IPRINT("XE[t] :%d: %s\n", ics.thread_state->thread_id(), value); IFLUSH(); return IA_NEXT; } diff --git a/src/alloy/backend/x64/lowering/tracers.cc b/src/alloy/backend/x64/lowering/tracers.cc index b4c0ae74c..f1c18f882 100644 --- a/src/alloy/backend/x64/lowering/tracers.cc +++ b/src/alloy/backend/x64/lowering/tracers.cc @@ -39,7 +39,7 @@ namespace lowering { void TraceString(void* raw_context, const char* str) { auto thread_state = *((ThreadState**)raw_context); - IPRINT("XE[t] :%d: %s\n", thread_state->GetThreadID(), str); + IPRINT("XE[t] :%d: %s\n", thread_state->thread_id(), str); IFLUSH(); } From e6f3716d87c6f160e50234bdea4c38c5134736a4 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sat, 8 Feb 2014 22:01:26 -0800 Subject: [PATCH 079/184] Disabling x64 backend for now, as it's rubbish. 
--- src/alloy/backend/sources.gypi | 2 +- src/alloy/runtime/runtime.cc | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/alloy/backend/sources.gypi b/src/alloy/backend/sources.gypi index 41419ac7a..a7e2c0928 100644 --- a/src/alloy/backend/sources.gypi +++ b/src/alloy/backend/sources.gypi @@ -11,6 +11,6 @@ 'includes': [ 'ivm/sources.gypi', - 'x64/sources.gypi', + #'x64/sources.gypi', ], } diff --git a/src/alloy/runtime/runtime.cc b/src/alloy/runtime/runtime.cc index 8a49a1bc4..d39ac4220 100644 --- a/src/alloy/runtime/runtime.cc +++ b/src/alloy/runtime/runtime.cc @@ -58,7 +58,7 @@ Runtime::~Runtime() { // TODO(benvanik): based on compiler support #include -#include +//#include int Runtime::Initialize(Frontend* frontend, Backend* backend) { // Must be initialized by subclass before calling into this. @@ -91,10 +91,10 @@ int Runtime::Initialize(Frontend* frontend, Backend* backend) { #endif // ALLOY_HAS_IVM_BACKEND if (FLAGS_runtime_backend == "any") { #if defined(ALLOY_HAS_X64_BACKEND) && ALLOY_HAS_X64_BACKEND - /*if (!backend) { + if (!backend) { backend = new alloy::backend::x64::X64Backend( this); - }*/ + } #endif // ALLOY_HAS_X64_BACKEND #if defined(ALLOY_HAS_IVM_BACKEND) && ALLOY_HAS_IVM_BACKEND if (!backend) { From 6bd214af0bdbc3f3afe3459150da388f677eaa3c Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sat, 8 Feb 2014 22:01:51 -0800 Subject: [PATCH 080/184] Adding a shared scratch arena for compiler passes. --- src/alloy/compiler/compiler.cc | 5 +++++ src/alloy/compiler/compiler.h | 2 ++ src/alloy/compiler/compiler_pass.cc | 4 ++++ src/alloy/compiler/compiler_pass.h | 3 +++ 4 files changed, 14 insertions(+) diff --git a/src/alloy/compiler/compiler.cc b/src/alloy/compiler/compiler.cc index 07e786ade..a28f6b48b 100644 --- a/src/alloy/compiler/compiler.cc +++ b/src/alloy/compiler/compiler.cc @@ -20,6 +20,8 @@ using namespace alloy::runtime; Compiler::Compiler(Runtime* runtime) : runtime_(runtime) { + scratch_arena_ = new Arena(); + alloy::tracing::WriteEvent(EventType::Init({ })); } @@ -32,6 +34,8 @@ Compiler::~Compiler() { delete pass; } + delete scratch_arena_; + alloy::tracing::WriteEvent(EventType::Deinit({ })); } @@ -49,6 +53,7 @@ int Compiler::Compile(HIRBuilder* builder) { // stop changing things, etc. 
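   // The shared scratch arena is Reset() before each pass runs, so
   // passes can bump-allocate temporaries (intervals, work lists) from
   // scratch_arena() and have them all reclaimed in one shot.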
for (auto it = passes_.begin(); it != passes_.end(); ++it) { CompilerPass* pass = *it; + scratch_arena_->Reset(); if (pass->Run(builder)) { return 1; } diff --git a/src/alloy/compiler/compiler.h b/src/alloy/compiler/compiler.h index ae6b48455..d2874cceb 100644 --- a/src/alloy/compiler/compiler.h +++ b/src/alloy/compiler/compiler.h @@ -28,6 +28,7 @@ public: ~Compiler(); runtime::Runtime* runtime() const { return runtime_; } + Arena* scratch_arena() const { return scratch_arena_; } void AddPass(CompilerPass* pass); @@ -37,6 +38,7 @@ public: private: runtime::Runtime* runtime_; + Arena* scratch_arena_; typedef std::vector PassList; PassList passes_; diff --git a/src/alloy/compiler/compiler_pass.cc b/src/alloy/compiler/compiler_pass.cc index 535bcb490..59f71902c 100644 --- a/src/alloy/compiler/compiler_pass.cc +++ b/src/alloy/compiler/compiler_pass.cc @@ -27,3 +27,7 @@ int CompilerPass::Initialize(Compiler* compiler) { compiler_ = compiler; return 0; } + +Arena* CompilerPass::scratch_arena() const { + return compiler_->scratch_arena(); +} diff --git a/src/alloy/compiler/compiler_pass.h b/src/alloy/compiler/compiler_pass.h index 1ed1b8144..4ba38b6c4 100644 --- a/src/alloy/compiler/compiler_pass.h +++ b/src/alloy/compiler/compiler_pass.h @@ -32,6 +32,9 @@ public: virtual int Run(hir::HIRBuilder* builder) = 0; +protected: + Arena* scratch_arena() const; + protected: runtime::Runtime* runtime_; Compiler* compiler_; From 4a584129d205582ca2ce6ff09dc3410a9f0762b5 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Mon, 10 Feb 2014 21:16:38 -0800 Subject: [PATCH 081/184] A probably-working register allocator. --- src/alloy/backend/ivm/ivm_assembler.cc | 12 +- src/alloy/backend/ivm/ivm_backend.cc | 4 +- src/alloy/backend/x64/x64_emitter.cc | 6 +- src/alloy/compiler/compiler_passes.h | 40 +- .../passes/control_flow_analysis_pass.cc | 6 +- .../passes/register_allocation_pass.cc | 471 ++++++++++++++++++ .../passes/register_allocation_pass.h | 60 +++ src/alloy/compiler/passes/sources.gypi | 2 + src/alloy/frontend/ppc/ppc_translator.cc | 22 +- src/alloy/hir/hir_builder.cc | 10 +- src/alloy/hir/instr.h | 2 +- src/alloy/hir/value.h | 6 +- src/xenia/types.h | 1 + 13 files changed, 613 insertions(+), 29 deletions(-) create mode 100644 src/alloy/compiler/passes/register_allocation_pass.cc create mode 100644 src/alloy/compiler/passes/register_allocation_pass.h diff --git a/src/alloy/backend/ivm/ivm_assembler.cc b/src/alloy/backend/ivm/ivm_assembler.cc index b869d41ef..ff665b8f3 100644 --- a/src/alloy/backend/ivm/ivm_assembler.cc +++ b/src/alloy/backend/ivm/ivm_assembler.cc @@ -74,15 +74,19 @@ int IVMAssembler::Assemble( builder->ResetLabelTags(); // Function prologue. - size_t stack_size = 0; + size_t stack_offset = 0; auto locals = builder->locals(); for (auto it = locals.begin(); it != locals.end(); ++it) { auto slot = *it; - size_t stack_offset = stack_size; + size_t type_size = GetTypeSize(slot->type); + // Align to natural size. + stack_offset = XEALIGN(stack_offset, type_size); slot->set_constant(stack_offset); - stack_size += GetTypeSize(slot->type); + stack_offset += type_size; } - ctx.stack_size = stack_size; + // Ensure 16b alignment. 
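+  // XEALIGN rounds up to the next multiple of a power-of-two alignment,
+  // e.g. XEALIGN(20, 16) == 32. Each local was already aligned to its
+  // natural size above; this pads the whole frame.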
+ stack_offset = XEALIGN(stack_offset, 16); + ctx.stack_size = stack_offset; auto block = builder->first_block(); while (block) { diff --git a/src/alloy/backend/ivm/ivm_backend.cc b/src/alloy/backend/ivm/ivm_backend.cc index 6bd51037f..411d16d30 100644 --- a/src/alloy/backend/ivm/ivm_backend.cc +++ b/src/alloy/backend/ivm/ivm_backend.cc @@ -38,14 +38,14 @@ int IVMBackend::Initialize() { 0, "gpr", MachineInfo::RegisterSet::INT_TYPES, - 10, + 6, }; machine_info_.register_sets[1] = { 1, "vec", MachineInfo::RegisterSet::FLOAT_TYPES | MachineInfo::RegisterSet::VEC_TYPES, - 10, + 6, }; alloy::tracing::WriteEvent(EventType::Init({ diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index 02a1aa132..4a1442ca5 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -54,7 +54,7 @@ int X64Emitter::Initialize() { } int X64Emitter::Emit( - HIRBuilder* builder, + HIRBuilder* builder, uint32_t debug_info_flags, runtime::DebugInfo* debug_info, void*& out_code_address, size_t& out_code_size) { // Reset. @@ -98,8 +98,6 @@ void* X64Emitter::Emplace(size_t stack_size) { return new_address; } -#define XEALIGN(value, align) ((value + align - 1) & ~(align - 1)) - int X64Emitter::Emit(HIRBuilder* builder, size_t& out_stack_size) { // These are the registers we will not be using. All others are fare game. const uint32_t reserved_regs = @@ -220,7 +218,7 @@ void X64Emitter::ResetRegisters(uint32_t reserved_regs) { if (live_regs & 0x1) { auto v = reg_state_.reg_values[n]; if (v) { - v->reg = -1; + v->reg.index = -1; } } reg_state_.reg_values[n] = 0; diff --git a/src/alloy/compiler/compiler_passes.h b/src/alloy/compiler/compiler_passes.h index ca074e221..20ec91c66 100644 --- a/src/alloy/compiler/compiler_passes.h +++ b/src/alloy/compiler/compiler_passes.h @@ -15,8 +15,9 @@ #include #include #include + //#include #include -//#include +#include #include #include #include @@ -137,5 +138,42 @@ // store_context +302, v5 // branch_true v5, ... // +// - X86Canonicalization +// For various opcodes add copies/commute the arguments to match x86 +// operand semantics. This makes code generation easier and if done +// before register allocation can prevent a lot of extra shuffling in +// the emitted code. +// +// Example: +// : +// v0 = ... +// v1 = ... +// v2 = add v0, v1 <-- v1 now unused +// Becomes: +// v0 = ... +// v1 = ... +// v1 = add v1, v0 <-- src1 = dest/src, so reuse for both +// by commuting and setting dest = src1 +// +// - RegisterAllocation +// Given a machine description (register classes, counts) run over values +// and assign them to registers, adding spills as needed. It should be +// possible to directly emit code from this form. +// +// Example: +// : +// v0 = load_context +0 +// v1 = load_context +1 +// v0 = add v0, v1 +// ... +// v2 = mul v0, v1 +// Becomes: +// reg0 = load_context +0 +// reg1 = load_context +1 +// reg2 = add reg0, reg1 +// store_local +123, reg2 <-- spill inserted +// ... +// reg0 = load_local +123 <-- load inserted +// reg0 = mul reg0, reg1 #endif // ALLOY_COMPILER_COMPILER_PASSES_H_ diff --git a/src/alloy/compiler/passes/control_flow_analysis_pass.cc b/src/alloy/compiler/passes/control_flow_analysis_pass.cc index 5e73bd502..89442bcb6 100644 --- a/src/alloy/compiler/passes/control_flow_analysis_pass.cc +++ b/src/alloy/compiler/passes/control_flow_analysis_pass.cc @@ -41,19 +41,21 @@ int ControlFlowAnalysisPass::Run(HIRBuilder* builder) { // Add edges. 
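   // Terminator branches sit at the end of a block, so scanning backward
   // from instr_tail hits them first; the breaks below stop the walk once
   // the edge has been recorded.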
auto block = builder->first_block(); while (block) { - auto instr = block->instr_head; + auto instr = block->instr_tail; while (instr) { if (instr->opcode->flags & OPCODE_FLAG_BRANCH) { if (instr->opcode == &OPCODE_BRANCH_info) { auto label = instr->src1.label; builder->AddEdge(block, label->block, Edge::UNCONDITIONAL); + break; } else if (instr->opcode == &OPCODE_BRANCH_TRUE_info || instr->opcode == &OPCODE_BRANCH_FALSE_info) { auto label = instr->src2.label; builder->AddEdge(block, label->block, 0); + break; } } - instr = instr->next; + instr = instr->prev; } block = block->next; } diff --git a/src/alloy/compiler/passes/register_allocation_pass.cc b/src/alloy/compiler/passes/register_allocation_pass.cc new file mode 100644 index 000000000..20b4b021f --- /dev/null +++ b/src/alloy/compiler/passes/register_allocation_pass.cc @@ -0,0 +1,471 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +using namespace alloy; +using namespace alloy::backend; +using namespace alloy::compiler; +using namespace alloy::compiler::passes; +using namespace alloy::hir; + + +struct RegisterAllocationPass::Interval { + uint32_t start_ordinal; + uint32_t end_ordinal; + Value* value; + RegisterFreeUntilSet* free_until_set; + // TODO(benvanik): reduce to offsets in arena? + struct Interval* next; + struct Interval* prev; + + void AddToList(Interval** list_head) { + auto list_next = *list_head; + this->next = list_next; + if (list_next) { + list_next->prev = this; + } + *list_head = this; + } + + void InsertIntoList(Interval** list_head) { + auto it = *list_head; + while (it) { + if (it->start_ordinal > this->start_ordinal) { + // Went too far. Insert before this interval. + this->prev = it->prev; + this->next = it; + if (it->prev) { + it->prev->next = this; + } else { + *list_head = this; + } + it->prev = this; + return; + } + if (!it->next) { + // None found, add at tail. + it->next = this; + this->prev = it; + return; + } + it = it->next; + } + } + + void RemoveFromList(Interval** list_head) { + if (this->next) { + this->next->prev = this->prev; + } + if (this->prev) { + this->prev->next = this->next; + } else { + *list_head = this->next; + } + this->next = this->prev = NULL; + } +}; + +struct RegisterAllocationPass::Intervals { + Interval* unhandled; + Interval* active; + Interval* handled; +}; + +RegisterAllocationPass::RegisterAllocationPass( + const MachineInfo* machine_info) : + machine_info_(machine_info), + CompilerPass() { + // Initialize register sets. The values of these will be + // cleared before use, so just the structure is required. 
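+  // Walk the backend-provided sets until a zero count terminates the
+  // list, remembering which set serves int, float, and vector types.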
+ auto mi_sets = machine_info->register_sets; + xe_zero_struct(&free_until_sets_, sizeof(free_until_sets_)); + uint32_t n = 0; + while (mi_sets[n].count) { + auto& mi_set = mi_sets[n]; + auto free_until_set = new RegisterFreeUntilSet(); + free_until_sets_.all_sets[n] = free_until_set; + free_until_set->count = mi_set.count; + free_until_set->set = &mi_set; + if (mi_set.types & MachineInfo::RegisterSet::INT_TYPES) { + free_until_sets_.int_set = free_until_set; + } + if (mi_set.types & MachineInfo::RegisterSet::FLOAT_TYPES) { + free_until_sets_.float_set = free_until_set; + } + if (mi_set.types & MachineInfo::RegisterSet::VEC_TYPES) { + free_until_sets_.vec_set = free_until_set; + } + n++; + } +} + +RegisterAllocationPass::~RegisterAllocationPass() { + for (size_t n = 0; n < XECOUNT(free_until_sets_.all_sets); n++) { + if (!free_until_sets_.all_sets[n]) { + break; + } + delete free_until_sets_.all_sets[n]; + } +} + +int RegisterAllocationPass::Run(HIRBuilder* builder) { + // A (probably broken) implementation of a linear scan register allocator + // that operates directly on SSA form: + // http://www.christianwimmer.at/Publications/Wimmer10a/Wimmer10a.pdf + // + // Requirements: + // - SSA form (single definition for variables) + // - block should be in linear order: + // - dominators *should* come before (a->b->c) + // - loop block sequences *should not* have intervening non-loop blocks + + auto arena = scratch_arena(); + + // Renumber everything. + uint32_t block_ordinal = 0; + uint32_t instr_ordinal = 0; + auto block = builder->first_block(); + while (block) { + // Sequential block ordinals. + block->ordinal = block_ordinal++; + auto instr = block->instr_head; + while (instr) { + // Sequential global instruction ordinals. + instr->ordinal = instr_ordinal++; + instr = instr->next; + } + block = block->next; + } + + // Compute all liveness ranges by walking forward through all + // blocks/instructions and checking the last use of each value. This lets + // us know the exact order in (block#,instr#) form, which is then used to + // setup the range. + // TODO(benvanik): ideally we would have a list of all values and not have + // to keep walking instructions over and over. + Interval* prev_interval = NULL; + Interval* head_interval = NULL; + block = builder->first_block(); + while (block) { + auto instr = block->instr_head; + while (instr) { + // Compute last-use for the dest value. + // Since we know all values of importance must be defined, we can avoid + // having to check every value and just look at dest. + const OpcodeInfo* info = instr->opcode; + if (GET_OPCODE_SIG_TYPE_DEST(info->signature) == OPCODE_SIG_TYPE_V) { + auto v = instr->dest; + if (!v->last_use) { + ComputeLastUse(v); + } + + // Add interval. + auto interval = arena->Alloc(); + interval->start_ordinal = instr->ordinal; + interval->end_ordinal = v->last_use ? + v->last_use->ordinal : v->def->ordinal; + interval->value = v; + interval->next = NULL; + interval->prev = prev_interval; + if (prev_interval) { + prev_interval->next = interval; + } else { + head_interval = interval; + } + prev_interval = interval; + + // Grab register set to use. + // We do this now so it's only once per interval, and it makes it easy + // to only compare intervals that overlap their sets. 
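+        // These <= tests lean on TypeName's ordering (INT8..INT64, then
+        // FLOAT32/FLOAT64, then VEC128) to bucket values into the int,
+        // float, and vector register sets.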
+ if (v->type <= INT64_TYPE) { + interval->free_until_set = free_until_sets_.int_set; + } else if (v->type <= FLOAT64_TYPE) { + interval->free_until_set = free_until_sets_.float_set; + } else { + interval->free_until_set = free_until_sets_.vec_set; + } + } + + instr = instr->next; + } + block = block->next; + } + + // Now have a sorted list of intervals, minus their ending ordinals. + Intervals intervals; + intervals.unhandled = head_interval; + intervals.active = intervals.handled = NULL; + while (intervals.unhandled) { + // Get next unhandled interval. + auto current = intervals.unhandled; + intervals.unhandled = intervals.unhandled->next; + current->RemoveFromList(&intervals.unhandled); + + // Check for intervals in active that are handled or inactive. + auto it = intervals.active; + while (it) { + auto next = it->next; + if (it->end_ordinal <= current->start_ordinal) { + // Move from active to handled. + it->RemoveFromList(&intervals.active); + it->AddToList(&intervals.handled); + } + it = next; + } + + // Find a register for current. + if (!TryAllocateFreeReg(current, intervals)) { + // Failed, spill. + AllocateBlockedReg(builder, current, intervals); + } + + if (current->value->reg.index!= -1) { + // Add current to active. + current->AddToList(&intervals.active); + } + } + + return 0; +} + +void RegisterAllocationPass::ComputeLastUse(Value* value) { + // TODO(benvanik): compute during construction? + // Note that this list isn't sorted (unfortunately), so we have to scan + // them all. + uint32_t max_ordinal = 0; + Value::Use* last_use = NULL; + auto use = value->use_head; + while (use) { + if (!last_use || use->instr->ordinal >= max_ordinal) { + last_use = use; + max_ordinal = use->instr->ordinal; + } + use = use->next; + } + value->last_use = last_use ? last_use->instr : NULL; +} + +bool RegisterAllocationPass::TryAllocateFreeReg( + Interval* current, Intervals& intervals) { + // Reset all registers in the set to unused. + auto free_until_set = current->free_until_set; + for (uint32_t n = 0; n < free_until_set->count; n++) { + free_until_set->pos[n] = -1; + } + + // Mark all active registers as used. + // TODO(benvanik): keep some kind of bitvector so that this is instant? + auto it = intervals.active; + while (it) { + if (it->free_until_set == free_until_set) { + free_until_set->pos[it->value->reg.index] = 0; + } + it = it->next; + } + + uint32_t max_pos = 0; + for (uint32_t n = 0; n < free_until_set->count; n++) { + if (max_pos == -1) { + max_pos = n; + } else { + if (free_until_set->pos[n] > free_until_set->pos[max_pos]) { + max_pos = n; + } + } + } + if (!free_until_set->pos[max_pos]) { + // No register available without spilling. + return false; + } + if (current->end_ordinal < free_until_set->pos[max_pos]) { + // Register available for the whole interval. + current->value->reg.set = free_until_set->set; + current->value->reg.index = max_pos; + } else { + // Register available for the first part of the interval. + // Split the interval at where it hits the next one. + //current->value->reg = max_pos; + //SplitRange(current, free_until_set->pos[max_pos]); + // TODO(benvanik): actually split -- for now we just spill. + return false; + } + + return true; +} + +void RegisterAllocationPass::AllocateBlockedReg( + HIRBuilder* builder, Interval* current, Intervals& intervals) { + auto free_until_set = current->free_until_set; + + // TODO(benvanik): smart heuristics. 
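+  // (The textbook heuristic spills whichever interval's next use is
+  // farthest away, so the freed register stays useful longest; this
+  // version just grabs the first eligible active interval.)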
+ // wimmer AllocateBlockedReg has some stuff for deciding whether to + // spill current or some other active interval - which we ignore. + + // Pick a random interval. Maybe the first. Sure. + auto spill_interval = intervals.active; + Value* spill_value = NULL; + Instr* prev_use = NULL; + Instr* next_use = NULL; + while (spill_interval) { + if (spill_interval->free_until_set != free_until_set || + spill_interval->start_ordinal == current->start_ordinal) { + // Only interested in ones of the same register set. + // We also ensure that ones at the same ordinal as us are ignored, + // which can happen with multiple local inserts/etc. + spill_interval = spill_interval->next; + continue; + } + spill_value = spill_interval->value; + + // Find the uses right before/after current. + auto use = spill_value->use_head; + while (use) { + if (use->instr->ordinal != -1) { + if (use->instr->ordinal < current->start_ordinal) { + if (!prev_use || prev_use->ordinal < use->instr->ordinal) { + prev_use = use->instr; + } + } else if (use->instr->ordinal > current->start_ordinal) { + if (!next_use || next_use->ordinal > use->instr->ordinal) { + next_use = use->instr; + } + } + } + use = use->next; + } + if (!prev_use) { + prev_use = spill_value->def; + } + if (prev_use->next == next_use) { + // Uh, this interval is way too short. + spill_interval = spill_interval->next; + continue; + } + XEASSERT(prev_use->ordinal != -1); + XEASSERTNOTNULL(next_use); + break; + } + XEASSERT(spill_interval->free_until_set == free_until_set); + + // Find the real last use -- paired ops may require sequences to stay + // intact. This is a bad design. + auto prev_def_tail = prev_use; + while (prev_def_tail && + prev_def_tail->opcode->flags & OPCODE_FLAG_PAIRED_PREV) { + prev_def_tail = prev_def_tail->prev; + } + + Value* new_value; + uint32_t end_ordinal; + if (spill_value->local_slot) { + // Value is already assigned a slot, so load from that. + // We can then split the interval right after the previous use to + // before the next use. + + // Update the last use of the spilled interval/value. + end_ordinal = spill_interval->end_ordinal; + spill_interval->end_ordinal = current->start_ordinal;//prev_def_tail->ordinal; + XEASSERT(end_ordinal != -1); + XEASSERT(spill_interval->end_ordinal != -1); + + // Insert a load right before the next use. + new_value = builder->LoadLocal(spill_value->local_slot); + builder->last_instr()->MoveBefore(next_use); + + // Update last use info. + new_value->last_use = spill_value->last_use; + spill_value->last_use = prev_use; + } else { + // Allocate a local slot. + spill_value->local_slot = builder->AllocLocal(spill_value->type); + + // Insert a spill right after the def. + builder->StoreLocal(spill_value->local_slot, spill_value); + auto spill_store = builder->last_instr(); + spill_store->MoveBefore(prev_def_tail->next); + + // Update last use of spilled interval/value. + end_ordinal = spill_interval->end_ordinal; + spill_interval->end_ordinal = current->start_ordinal;//prev_def_tail->ordinal; + XEASSERT(end_ordinal != -1); + XEASSERT(spill_interval->end_ordinal != -1); + + // Insert a load right before the next use. + new_value = builder->LoadLocal(spill_value->local_slot); + builder->last_instr()->MoveBefore(next_use); + + // Update last use info. + new_value->last_use = spill_value->last_use; + spill_value->last_use = spill_store; + } + + // Reuse the same local slot. Hooray SSA. + new_value->local_slot = spill_value->local_slot; + + // Rename all future uses to that loaded value. 
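+  // Only uses strictly after the shortened interval are rewritten to the
+  // reloaded new_value; earlier uses (and unordered ones with ordinal -1)
+  // keep reading the original, preserving SSA's single definition.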
+ auto use = spill_value->use_head; + while (use) { + // TODO(benvanik): keep use list sorted so we don't have to do this. + if (use->instr->ordinal <= spill_interval->end_ordinal || + use->instr->ordinal == -1) { + use = use->next; + continue; + } + auto next = use->next; + auto instr = use->instr; + uint32_t signature = instr->opcode->signature; + if (GET_OPCODE_SIG_TYPE_SRC1(signature) == OPCODE_SIG_TYPE_V) { + if (instr->src1.value == spill_value) { + instr->set_src1(new_value); + } + } + if (GET_OPCODE_SIG_TYPE_SRC2(signature) == OPCODE_SIG_TYPE_V) { + if (instr->src2.value == spill_value) { + instr->set_src2(new_value); + } + } + if (GET_OPCODE_SIG_TYPE_SRC3(signature) == OPCODE_SIG_TYPE_V) { + if (instr->src3.value == spill_value) { + instr->set_src3(new_value); + } + } + use = next; + } + + // Create new interval. + auto arena = scratch_arena(); + auto new_interval = arena->Alloc(); + new_interval->start_ordinal = new_value->def->ordinal; + new_interval->end_ordinal = end_ordinal; + new_interval->value = new_value; + new_interval->next = NULL; + new_interval->prev = NULL; + if (new_value->type <= INT64_TYPE) { + new_interval->free_until_set = free_until_sets_.int_set; + } else if (new_value->type <= FLOAT64_TYPE) { + new_interval->free_until_set = free_until_sets_.float_set; + } else { + new_interval->free_until_set = free_until_sets_.vec_set; + } + + // Remove the old interval from the active list, as it's been spilled. + spill_interval->RemoveFromList(&intervals.active); + spill_interval->AddToList(&intervals.handled); + + // Insert interval into the right place in the list. + // We know it's ahead of us. + new_interval->InsertIntoList(&intervals.unhandled); + + // TODO(benvanik): use the register we just freed? + //current->value->reg.set = free_until_set->set; + //current->value->reg.index = spill_interval->value->reg.index; + bool allocated = TryAllocateFreeReg(current, intervals); + XEASSERTTRUE(allocated); +} diff --git a/src/alloy/compiler/passes/register_allocation_pass.h b/src/alloy/compiler/passes/register_allocation_pass.h new file mode 100644 index 000000000..3167000ec --- /dev/null +++ b/src/alloy/compiler/passes/register_allocation_pass.h @@ -0,0 +1,60 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef ALLOY_COMPILER_PASSES_REGISTER_ALLOCATION_PASS_H_ +#define ALLOY_COMPILER_PASSES_REGISTER_ALLOCATION_PASS_H_ + +#include +#include + + +namespace alloy { +namespace compiler { +namespace passes { + + +class RegisterAllocationPass : public CompilerPass { +public: + RegisterAllocationPass(const backend::MachineInfo* machine_info); + virtual ~RegisterAllocationPass(); + + virtual int Run(hir::HIRBuilder* builder); + +private: + struct Interval; + struct Intervals; + void ComputeLastUse(hir::Value* value); + bool TryAllocateFreeReg(Interval* current, Intervals& intervals); + void AllocateBlockedReg(hir::HIRBuilder* builder, + Interval* current, Intervals& intervals); + +private: + const backend::MachineInfo* machine_info_; + + struct RegisterFreeUntilSet { + uint32_t count; + uint32_t pos[32]; + const backend::MachineInfo::RegisterSet* set; + }; + struct RegisterFreeUntilSets { + RegisterFreeUntilSet* int_set; + RegisterFreeUntilSet* float_set; + RegisterFreeUntilSet* vec_set; + RegisterFreeUntilSet* all_sets[3]; + }; + RegisterFreeUntilSets free_until_sets_; +}; + + +} // namespace passes +} // namespace compiler +} // namespace alloy + + +#endif // ALLOY_COMPILER_PASSES_REGISTER_ALLOCATION_PASS_H_ diff --git a/src/alloy/compiler/passes/sources.gypi b/src/alloy/compiler/passes/sources.gypi index bd5559319..ed16920ad 100644 --- a/src/alloy/compiler/passes/sources.gypi +++ b/src/alloy/compiler/passes/sources.gypi @@ -15,6 +15,8 @@ 'finalization_pass.h', #'dead_store_elimination_pass.cc', #'dead_store_elimination_pass.h', + 'register_allocation_pass.cc', + 'register_allocation_pass.h', 'simplification_pass.cc', 'simplification_pass.h', 'validation_pass.cc', diff --git a/src/alloy/frontend/ppc/ppc_translator.cc b/src/alloy/frontend/ppc/ppc_translator.cc index 2431f1761..61617db33 100644 --- a/src/alloy/frontend/ppc/ppc_translator.cc +++ b/src/alloy/frontend/ppc/ppc_translator.cc @@ -46,7 +46,7 @@ PPCTranslator::PPCTranslator(PPCFrontend* frontend) : // Passes are executed in the order they are added. Multiple of the same // pass type may be used. if (validate) compiler_->AddPass(new passes::ValidationPass()); - //compiler_->AddPass(new passes::ContextPromotionPass()); + compiler_->AddPass(new passes::ContextPromotionPass()); if (validate) compiler_->AddPass(new passes::ValidationPass()); compiler_->AddPass(new passes::SimplificationPass()); if (validate) compiler_->AddPass(new passes::ValidationPass()); @@ -59,18 +59,16 @@ PPCTranslator::PPCTranslator(PPCFrontend* frontend) : compiler_->AddPass(new passes::DeadCodeEliminationPass()); if (validate) compiler_->AddPass(new passes::ValidationPass()); - // Adds local load/stores. - compiler_->AddPass(new passes::DataFlowAnalysisPass()); - if (validate) compiler_->AddPass(new passes::ValidationPass()); - compiler_->AddPass(new passes::SimplificationPass()); - if (validate) compiler_->AddPass(new passes::ValidationPass()); + //// Removes all unneeded variables. Try not to add new ones after this. + //compiler_->AddPass(new passes::ValueReductionPass()); + //if (validate) compiler_->AddPass(new passes::ValidationPass()); - // Run DCE one more time to cleanup any local manipulation. - compiler_->AddPass(new passes::DeadCodeEliminationPass()); - if (validate) compiler_->AddPass(new passes::ValidationPass()); - - // Removes all unneeded variables. Try not to add new ones after this. 
- compiler_->AddPass(new passes::ValueReductionPass()); + // Register allocation for the target backend. + // Will modify the HIR to add loads/stores. + // This should be the last pass before finalization, as after this all + // registers are assigned and ready to be emitted. + compiler_->AddPass(new passes::RegisterAllocationPass( + backend->machine_info())); if (validate) compiler_->AddPass(new passes::ValidationPass()); // Must come last. The HIR is not really HIR after this. diff --git a/src/alloy/hir/hir_builder.cc b/src/alloy/hir/hir_builder.cc index 5e0be6dad..cad24c32c 100644 --- a/src/alloy/hir/hir_builder.cc +++ b/src/alloy/hir/hir_builder.cc @@ -108,6 +108,9 @@ void HIRBuilder::DumpValue(StringBuffer* str, Value* value) { }; str->Append("v%d.%s", value->ordinal, type_names[value->type]); } + if (value->reg.index != -1) { + str->Append("<%s%d>", value->reg.set->name, value->reg.index); + } } void HIRBuilder::DumpOp( @@ -453,6 +456,7 @@ Instr* HIRBuilder::AppendInstr( if (!block->instr_head) { block->instr_head = instr; } + instr->ordinal = -1; instr->block = block; instr->opcode = &opcode_info; instr->flags = flags; @@ -477,7 +481,8 @@ Value* HIRBuilder::AllocValue(TypeName type) { value->last_use = NULL; value->local_slot = NULL; value->tag = NULL; - value->reg = -1; + value->reg.set = NULL; + value->reg.index = -1; return value; } @@ -492,7 +497,8 @@ Value* HIRBuilder::CloneValue(Value* source) { value->last_use = NULL; value->local_slot = NULL; value->tag = NULL; - value->reg = -1; + value->reg.set = NULL; + value->reg.index = -1; return value; } diff --git a/src/alloy/hir/instr.h b/src/alloy/hir/instr.h index 57effa650..62983401d 100644 --- a/src/alloy/hir/instr.h +++ b/src/alloy/hir/instr.h @@ -52,7 +52,7 @@ public: const OpcodeInfo* opcode; uint16_t flags; - uint16_t ordinal; + uint32_t ordinal; typedef union { runtime::FunctionInfo* symbol_info; diff --git a/src/alloy/hir/value.h b/src/alloy/hir/value.h index 4fa957932..c2c8ed7ae 100644 --- a/src/alloy/hir/value.h +++ b/src/alloy/hir/value.h @@ -11,6 +11,7 @@ #define ALLOY_HIR_VALUE_H_ #include +#include #include @@ -90,7 +91,10 @@ public: TypeName type; uint32_t flags; - uint32_t reg; + struct { + const backend::MachineInfo::RegisterSet* set; + int32_t index; + } reg; ConstantValue constant; Instr* def; diff --git a/src/xenia/types.h b/src/xenia/types.h index f4356e94a..928c71766 100644 --- a/src/xenia/types.h +++ b/src/xenia/types.h @@ -145,6 +145,7 @@ typedef XECACHEALIGN volatile void xe_aligned_void_t; static inline uint32_t XENEXTPOW2(uint32_t v) { v--; v |= v >> 1; v |= v >> 2; v |= v >> 4; v |= v >> 8; v |= v >> 16; v++; return v; } +#define XEALIGN(value, align) ((value + align - 1) & ~(align - 1)) #define XESUCCEED() goto XECLEANUP #define XEFAIL() goto XECLEANUP From 74c9df669744d240743e82b784744a56763f2411 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Mon, 10 Feb 2014 23:24:46 -0800 Subject: [PATCH 082/184] Re-enabling x64 backend, fixing many bugs. 
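(A note on the XEALIGN macro introduced in the previous patch: it rounds
value up to the next multiple of align, and the mask trick only holds when
align is a power of two. A quick sanity check in plain C++11, with literal
inputs:

    static_assert(((13 + 16 - 1) & ~(16 - 1)) == 16, "13 aligned up to 16");
    static_assert(((32 + 16 - 1) & ~(16 - 1)) == 32, "multiples unchanged");

Also worth knowing when using it: the macro does not parenthesize its
arguments, so expressions passed as value or align should carry their own
parentheses.)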
--- src/alloy/backend/sources.gypi | 2 +- .../x64/lowering/lowering_sequences.cc | 8 +- src/alloy/backend/x64/lowering/op_utils.inl | 81 +++--- src/alloy/backend/x64/x64_backend.cc | 4 +- src/alloy/backend/x64/x64_emitter.cc | 235 +----------------- src/alloy/backend/x64/x64_emitter.h | 103 +++----- .../passes/constant_propagation_pass.cc | 70 ++++++ src/alloy/frontend/ppc/ppc_hir_builder.cc | 2 + src/alloy/hir/opcodes.inl | 2 +- src/alloy/hir/value.h | 144 +++++++++++ src/alloy/runtime/runtime.cc | 2 +- 11 files changed, 320 insertions(+), 333 deletions(-) diff --git a/src/alloy/backend/sources.gypi b/src/alloy/backend/sources.gypi index a7e2c0928..41419ac7a 100644 --- a/src/alloy/backend/sources.gypi +++ b/src/alloy/backend/sources.gypi @@ -11,6 +11,6 @@ 'includes': [ 'ivm/sources.gypi', - #'x64/sources.gypi', + 'x64/sources.gypi', ], } diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index 754fd0d07..0573c928f 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -2500,7 +2500,7 @@ table->AddSequence(OPCODE_NOT, [](X64Emitter& e, Instr*& i) { } else if (IsVecType(i->dest->type)) { XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) { // dest_src ^= 0xFFFF... - if (dest != src) { + if (dest.getIdx() != src.getIdx()) { e.movaps(dest, src); } e.mov(e.rax, XMMCONSTBASE); @@ -2697,7 +2697,7 @@ table->AddSequence(OPCODE_BYTE_SWAP, [](X64Emitter& e, Instr*& i) { Reg32 dest, src1; e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src1, 0); - if (dest != src1) { + if (dest.getIdx() != src1.getIdx()) { e.mov(dest, src1); e.bswap(dest); } else { @@ -2708,7 +2708,7 @@ table->AddSequence(OPCODE_BYTE_SWAP, [](X64Emitter& e, Instr*& i) { Reg64 dest, src1; e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src1, 0); - if (dest != src1) { + if (dest.getIdx() != src1.getIdx()) { e.mov(dest, src1); e.bswap(dest); } else { @@ -2972,7 +2972,7 @@ table->AddSequence(OPCODE_PERMUTE, [](X64Emitter& e, Instr*& i) { (((control >> 18) & 0x1) << 1) | (((control >> 10) & 0x1) << 2) | (((control >> 2) & 0x1) << 3); - if (dest != src3) { + if (dest.getIdx() != src3.getIdx()) { e.pshufd(dest, src2, src_control); e.pshufd(e.xmm0, src3, src_control); e.blendps(dest, e.xmm0, blend_control); diff --git a/src/alloy/backend/x64/lowering/op_utils.inl b/src/alloy/backend/x64/lowering/op_utils.inl index ce8f019db..0daac5d64 100644 --- a/src/alloy/backend/x64/lowering/op_utils.inl +++ b/src/alloy/backend/x64/lowering/op_utils.inl @@ -311,9 +311,9 @@ void VectorCompareXX(X64Emitter& e, Instr*& i, VectoreCompareOp op, bool as_sign if (op == VECTOR_CMP_EQ) { // Commutative, so simple. Xmm real_src; - if (dest == src1) { + if (dest.getIdx() == src1.getIdx()) { real_src = src2; - } else if (dest == src2) { + } else if (dest.getIdx() == src2.getIdx()) { real_src = src1; } else { e.movaps(dest, src1); @@ -334,9 +334,9 @@ void VectorCompareXX(X64Emitter& e, Instr*& i, VectoreCompareOp op, bool as_sign // Float GT/GE must be emulated. if (op == VECTOR_CMP_GT) { // Have to swap: src2 < src1. 
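  // (The dest/src index comparisons throughout this patch implement the
  // standard two-operand x86 lowering of three-address HIR: unless dest
  // already aliases a source, copy first, then operate in place. Using
  // getIdx() compares physical register numbers, which is the identity
  // test these sequences actually need. The basic shape:
  //
  //   if (dest.getIdx() != src1.getIdx()) {
  //     e.movaps(dest, src1);    // materialize dest as a copy of src1
  //   }
  //   op(e, dest, src2);         // dest = dest OP src2, in place
  // )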
- if (dest == src2) { + if (dest.getIdx() == src2.getIdx()) { e.cmpltps(dest, src1); - } else if (dest == src1) { + } else if (dest.getIdx() == src1.getIdx()) { e.movaps(e.xmm0, src1); e.movaps(dest, src2); e.cmpltps(dest, e.xmm0); @@ -346,9 +346,9 @@ void VectorCompareXX(X64Emitter& e, Instr*& i, VectoreCompareOp op, bool as_sign } } else if (op == VECTOR_CMP_GE) { // Have to swap: src2 <= src1. - if (dest == src2) { + if (dest.getIdx() == src2.getIdx()) { e.cmpleps(dest, src1); - } else if (dest == src1) { + } else if (dest.getIdx() == src1.getIdx()) { e.movaps(e.xmm0, src1); e.movaps(dest, src2); e.cmpleps(dest, e.xmm0); @@ -362,9 +362,9 @@ void VectorCompareXX(X64Emitter& e, Instr*& i, VectoreCompareOp op, bool as_sign } else { // Integer types are easier. Xmm real_src; - if (dest == src1) { + if (dest.getIdx() == src1.getIdx()) { real_src = src2; - } else if (dest == src2) { + } else if (dest.getIdx() == src2.getIdx()) { e.movaps(e.xmm0, src2); e.movaps(dest, src1); real_src = e.xmm0; @@ -429,7 +429,7 @@ void IntUnaryOpV(X64Emitter& e, Instr*& i, v_fn v_fn, T& dest, T& src1) { e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src1, 0); - if (dest == src1) { + if (dest.getIdx() == src1.getIdx()) { v_fn(e, *i, dest); } else { e.mov(dest, src1); @@ -486,9 +486,9 @@ void IntBinaryOpVV(X64Emitter& e, Instr*& i, vv_fn vv_fn, e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src1, 0, i->src2.value, src2, 0); - if (dest == src1) { + if (dest.getIdx() == src1.getIdx()) { vv_fn(e, *i, dest, src2); - } else if (dest == src2) { + } else if (dest.getIdx() == src2.getIdx()) { if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { vv_fn(e, *i, dest, src1); } else { @@ -511,7 +511,7 @@ void IntBinaryOpVC(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn, i->src1.value, src1, 0); if (dest.getBit() <= 32) { // 32-bit. - if (dest == src1) { + if (dest.getIdx() == src1.getIdx()) { vc_fn(e, *i, dest, (uint32_t)src2->get_constant(CT())); } else { e.mov(dest, src1); @@ -519,7 +519,7 @@ void IntBinaryOpVC(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn, } } else { // 64-bit. - if (dest == src1) { + if (dest.getIdx() == src1.getIdx()) { e.mov(TEMP_REG, src2->constant.i64); vv_fn(e, *i, dest, TEMP_REG); } else { @@ -537,7 +537,7 @@ void IntBinaryOpCV(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn, i->src2.value, src2, 0); if (dest.getBit() <= 32) { // 32-bit. - if (dest == src2) { + if (dest.getIdx() == src2.getIdx()) { if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { vc_fn(e, *i, dest, (uint32_t)src1->get_constant(CT())); } else { @@ -559,7 +559,7 @@ void IntBinaryOpCV(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn, } } else { // 64-bit. 
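  // (Why the 64-bit constant paths stage through TEMP_REG: x86-64 ALU
  // instructions accept at most a sign-extended 32-bit immediate, and
  // only mov can carry a full 64-bit one. So a 64-bit constant operand
  // becomes:
  //
  //   mov rax, 0x1122334455667788   ; imm64 is mov-only
  //   add rbx, rax                  ; then a plain reg-reg op
  // )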
- if (dest == src2) { + if (dest.getIdx() == src2.getIdx()) { if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { e.mov(TEMP_REG, src1->constant.i64); vv_fn(e, *i, dest, TEMP_REG); @@ -669,14 +669,19 @@ void IntTernaryOpVVV(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, i->src1.value, src1, 0, i->src2.value, src2, 0, i->src3.value, src3, 0); - if (dest == src1) { + if (dest.getIdx() == src1.getIdx()) { vvv_fn(e, *i, dest, src2, src3); - } else if (dest == src2) { + } else if (dest.getIdx() == src2.getIdx()) { if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { vvv_fn(e, *i, dest, src1, src3); } else { UNIMPLEMENTED_SEQ(); } + } else if (dest.getIdx() == src3.getIdx()) { + auto Ntx = TEMP_LIKE(src3); + e.mov(Ntx, src3); + e.mov(dest, src1); + vvv_fn(e, *i, dest, src2, Ntx); } else { e.mov(dest, src1); vvv_fn(e, *i, dest, src2, src3); @@ -691,7 +696,7 @@ void IntTernaryOpVVC(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vvc_fn vvc_fn, i->src2.value, src2, 0); if (dest.getBit() <= 32) { // 32-bit. - if (dest == src1) { + if (dest.getIdx() == src1.getIdx()) { vvc_fn(e, *i, dest, src2, (uint32_t)src3->get_constant(CT())); } else if (dest == src2) { if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { @@ -709,10 +714,10 @@ void IntTernaryOpVVC(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vvc_fn vvc_fn, } } else { // 64-bit. - if (dest == src1) { + if (dest.getIdx() == src1.getIdx()) { e.mov(TEMP_REG, src3->constant.i64); vvv_fn(e, *i, dest, src2, TEMP_REG); - } else if (dest == src2) { + } else if (dest.getIdx() == src2.getIdx()) { if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { e.mov(TEMP_REG, src3->constant.i64); vvv_fn(e, *i, dest, src1, TEMP_REG); @@ -740,9 +745,9 @@ void IntTernaryOpVCV(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vcv_fn vcv_fn, i->src3.value, src3, 0); if (dest.getBit() <= 32) { // 32-bit. - if (dest == src1) { + if (dest.getIdx() == src1.getIdx()) { vcv_fn(e, *i, dest, (uint32_t)src2->get_constant(CT()), src3); - } else if (dest == src3) { + } else if (dest.getIdx() == src3.getIdx()) { if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { vcv_fn(e, *i, dest, (uint32_t)src2->get_constant(CT()), src1); } else { @@ -758,10 +763,10 @@ void IntTernaryOpVCV(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vcv_fn vcv_fn, } } else { // 64-bit. 
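  // (Same clobber hazard as the dest==src3 case patched above in
  // IntTernaryOpVVV: when dest aliases a source that is read last,
  // writing dest first would destroy that operand, so it is staged
  // through a temporary (TEMP_LIKE presumably yields a scratch register
  // of matching width) before dest is overwritten:
  //
  //   mov tmp,  src3   ; src3 aliases dest, save it first
  //   mov dest, src1
  //   op  dest, src2, tmp
  // )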
- if (dest == src1) { + if (dest.getIdx() == src1.getIdx()) { e.mov(TEMP_REG, src2->constant.i64); vvv_fn(e, *i, dest, TEMP_REG, src3); - } else if (dest == src3) { + } else if (dest.getIdx() == src3.getIdx()) { if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { e.mov(TEMP_REG, src2->constant.i64); vvv_fn(e, *i, dest, src1, TEMP_REG); @@ -817,16 +822,20 @@ void IntTernaryOp(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vvc_fn vvc_fn, vcv_fn IntTernaryOpVVC(e, i, vvv_fn, vvc_fn, dest, src1, src2, i->src3.value); // } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8, SIG_TYPE_I8C, SIG_TYPE_I8)) { - Reg8 dest, src1, src3; + Reg8 dest, src1; + Reg8 src3; IntTernaryOpVCV(e, i, vvv_fn, vcv_fn, dest, src1, i->src2.value, src3); } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16, SIG_TYPE_I16C, SIG_TYPE_I8)) { - Reg16 dest, src1, src3; + Reg16 dest, src1; + Reg8 src3; IntTernaryOpVCV(e, i, vvv_fn, vcv_fn, dest, src1, i->src2.value, src3); } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32, SIG_TYPE_I32C, SIG_TYPE_I8)) { - Reg32 dest, src1, src3; + Reg32 dest, src1; + Reg8 src3; IntTernaryOpVCV(e, i, vvv_fn, vcv_fn, dest, src1, i->src2.value, src3); } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64, SIG_TYPE_I64C, SIG_TYPE_I8)) { - Reg64 dest, src1, src3; + Reg64 dest, src1; + Reg8 src3; IntTernaryOpVCV(e, i, vvv_fn, vcv_fn, dest, src1, i->src2.value, src3); } else { ASSERT_INVALID_TYPE(); @@ -856,7 +865,7 @@ void XmmUnaryOpC(X64Emitter& e, Instr*& i, xmm_v_fn v_fn, e.mov(e.rax, (uint64_t)src1->constant.i64); e.movq(dest, e.rax); } else { - UNIMPLEMENTED_SEQ(); + LoadXmmConstant(e, dest, src1->constant.v128); } v_fn(e, *i, dest, dest); e.EndOp(dest); @@ -901,9 +910,9 @@ void XmmBinaryOpVV(X64Emitter& e, Instr*& i, xmm_vv_fn vv_fn, e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src1, 0, i->src2.value, src2, 0); - if (dest == src1) { + if (dest.getIdx() == src1.getIdx()) { vv_fn(e, *i, dest, src2); - } else if (dest == src2) { + } else if (dest.getIdx() == src2.getIdx()) { if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { vv_fn(e, *i, dest, src1); } else { @@ -934,7 +943,7 @@ void XmmBinaryOpVC(X64Emitter& e, Instr*& i, xmm_vv_fn vv_fn, } vv_fn(e, *i, dest, src1); } else { - if (dest != src1) { + if (dest.getIdx() != src1.getIdx()) { e.movaps(dest, src1); } if (src2->type == FLOAT32_TYPE) { @@ -967,7 +976,7 @@ void XmmBinaryOpCV(X64Emitter& e, Instr*& i, xmm_vv_fn vv_fn, vv_fn(e, *i, dest, src2); } else { auto real_src2 = src2; - if (dest == src2) { + if (dest.getIdx() == src2.getIdx()) { e.movaps(e.xmm0, src2); real_src2 = e.xmm0; } @@ -1010,9 +1019,9 @@ void XmmTernaryOpVVV(X64Emitter& e, Instr*& i, xmm_vvv_fn vvv_fn, i->src1.value, src1, 0, i->src2.value, src2, 0, i->src3.value, src3, 0); - if (dest == src1) { + if (dest.getIdx() == src1.getIdx()) { vvv_fn(e, *i, dest, src2, src3); - } else if (dest == src2) { + } else if (dest.getIdx() == src2.getIdx()) { if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { vvv_fn(e, *i, dest, src1, src3); } else { @@ -1021,7 +1030,7 @@ void XmmTernaryOpVVV(X64Emitter& e, Instr*& i, xmm_vvv_fn vvv_fn, vvv_fn(e, *i, e.xmm0, src2, src3); e.movaps(dest, e.xmm0); } - } else if (dest == src3) { + } else if (dest.getIdx() == src3.getIdx()) { if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { vvv_fn(e, *i, dest, src1, src2); } else { diff --git a/src/alloy/backend/x64/x64_backend.cc b/src/alloy/backend/x64/x64_backend.cc index 8c1968571..076ab1cbb 100644 --- a/src/alloy/backend/x64/x64_backend.cc +++ b/src/alloy/backend/x64/x64_backend.cc @@ -45,14 +45,14 @@ int 
X64Backend::Initialize() { 0, "gpr", MachineInfo::RegisterSet::INT_TYPES, - 10, + X64Emitter::GPR_COUNT, }; machine_info_.register_sets[1] = { 1, "xmm", MachineInfo::RegisterSet::FLOAT_TYPES | MachineInfo::RegisterSet::VEC_TYPES, - 10, + X64Emitter::XMM_COUNT, }; code_cache_ = new X64CodeCache(); diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index 4a1442ca5..8ae38d80d 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -36,6 +36,16 @@ static const size_t MAX_CODE_SIZE = 1 * 1024 * 1024; } // namespace alloy +const uint32_t X64Emitter::gpr_reg_map_[X64Emitter::GPR_COUNT] = { + Operand::RBX, + Operand::R12, Operand::R13, Operand::R14, Operand::R15, +}; + +const uint32_t X64Emitter::xmm_reg_map_[X64Emitter::XMM_COUNT] = { + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, +}; + + X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator) : runtime_(backend->runtime()), backend_(backend), @@ -43,7 +53,6 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator) : allocator_(allocator), current_instr_(0), CodeGenerator(MAX_CODE_SIZE, AutoGrow, allocator) { - xe_zero_struct(®_state_, sizeof(reg_state_)); } X64Emitter::~X64Emitter() { @@ -99,28 +108,6 @@ void* X64Emitter::Emplace(size_t stack_size) { } int X64Emitter::Emit(HIRBuilder* builder, size_t& out_stack_size) { - // These are the registers we will not be using. All others are fare game. - const uint32_t reserved_regs = - GetRegBit(rax) | // scratch - GetRegBit(rcx) | // arg - GetRegBit(rdx) | // arg/clobbered - GetRegBit(rsp) | - GetRegBit(rbp) | - GetRegBit(rsi) | - GetRegBit(rdi) | - GetRegBit(r8) | // arg/clobbered - GetRegBit(xmm0) | // scratch - GetRegBit(xmm1) | // sometimes used for scratch, could be fixed - - // TODO(benvanik): save so that we can use these. - GetRegBit(r9) | - GetRegBit(r10) | - GetRegBit(r11) | - GetRegBit(xmm2) | - GetRegBit(xmm3) | - GetRegBit(xmm4) | - GetRegBit(xmm5); - // Calculate stack size. We need to align things to their natural sizes. // This could be much better (sort by type/etc). auto locals = builder->locals(); @@ -164,8 +151,6 @@ int X64Emitter::Emit(HIRBuilder* builder, size_t& out_stack_size) { auto lowering_table = backend_->lowering_table(); - reg_state_.active_regs = reg_state_.live_regs = reserved_regs; - // Body. auto block = builder->first_block(); while (block) { @@ -176,11 +161,6 @@ int X64Emitter::Emit(HIRBuilder* builder, size_t& out_stack_size) { label = label->next; } - // Reset reg allocation state. - // If we start keeping regs across blocks this needs to change. - // We mark a few active so that the allocator doesn't use them. - ResetRegisters(reserved_regs); - // Add instructions. // The table will process sequences of instructions to (try to) // generate optimal code. @@ -211,201 +191,6 @@ int X64Emitter::Emit(HIRBuilder* builder, size_t& out_stack_size) { return 0; } -void X64Emitter::ResetRegisters(uint32_t reserved_regs) { - // Just need to reset the register for each live value. - uint32_t live_regs = reg_state_.live_regs; - for (size_t n = 0; n < 32; n++, live_regs >>= 1) { - if (live_regs & 0x1) { - auto v = reg_state_.reg_values[n]; - if (v) { - v->reg.index = -1; - } - } - reg_state_.reg_values[n] = 0; - } - reg_state_.active_regs = reg_state_.live_regs = reserved_regs; -} - -void X64Emitter::EvictStaleRegisters() { - // NOTE: if we are getting called it's because we *need* a register. - // We must get rid of something. 
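  // (The hand-rolled allocator being deleted through here is replaced by
  // the HIR RegisterAllocationPass: it assigns dense indices in
  // [0, GPR_COUNT) and [0, XMM_COUNT), and the map tables added above
  // translate those to physical registers that the lowering sequences
  // never use as scratch. For the GPR side:
  //
  //   reg.index:  0    1    2    3    4
  //   physical:   rbx  r12  r13  r14  r15
  // )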
- - uint32_t current_ordinal = current_instr_ ? - current_instr_->ordinal : 0xFFFFFFFF; - - // Remove any register with no more uses. - uint32_t new_live_regs = 0; - for (size_t n = 0; n < 32; n++) { - uint32_t bit = 1 << n; - if (bit & reg_state_.active_regs) { - // Register is active and cannot be freed. - new_live_regs |= bit; - continue; - } - if (!(bit & reg_state_.live_regs)) { - // Register is not alive - nothing to do. - continue; - } - - // Register is live, not active. Check and see if we get rid of it. - auto v = reg_state_.reg_values[n]; - if (!v->last_use || - v->last_use->ordinal < current_ordinal) { - reg_state_.reg_values[n] = NULL; - v->reg = -1; - continue; - } - - // Register still in use. - new_live_regs |= bit; - } - - // Hrm. We have spilled. - if (reg_state_.live_regs == new_live_regs) { - XEASSERTALWAYS(); - } - - reg_state_.live_regs = new_live_regs; - - // Assert that live is a superset of active. - XEASSERTZERO((reg_state_.live_regs ^ reg_state_.active_regs) & reg_state_.active_regs); -} - -void X64Emitter::FindFreeRegs( - Value* v0, uint32_t& v0_idx, uint32_t v0_flags) { - // If the value is already in a register, use it. - if (v0->reg != -1) { - // Already in a register. Mark active and return. - v0_idx = v0->reg; - reg_state_.active_regs |= 1 << v0_idx; - - // Assert that live is a superset of active. - XEASSERTZERO((reg_state_.live_regs ^ reg_state_.active_regs) & reg_state_.active_regs); - return; - } - - uint32_t avail_regs = 0; - if (IsIntType(v0->type)) { - if (v0_flags & REG_ABCD) { - avail_regs = B00001111; - } else { - avail_regs = 0xFFFF; - } - } else { - avail_regs = 0xFFFF0000; - } - uint32_t free_regs = avail_regs & ~reg_state_.live_regs; - if (!free_regs) { - // Need to evict something. - EvictStaleRegisters(); - free_regs = avail_regs & ~reg_state_.live_regs; - XEASSERT(free_regs); - } - - // Find the first available. - // We start from the MSB so that we get the non-rNx regs that are often - // in short supply. - _BitScanReverse((DWORD*)&v0_idx, free_regs); - - reg_state_.active_regs |= 1 << v0_idx; - reg_state_.live_regs |= 1 << v0_idx; - v0->reg = v0_idx; - reg_state_.reg_values[v0_idx] = v0; -} - -void X64Emitter::FindFreeRegs( - Value* v0, uint32_t& v0_idx, uint32_t v0_flags, - Value* v1, uint32_t& v1_idx, uint32_t v1_flags) { - // TODO(benvanik): support REG_DEST reuse/etc. - // Grab all already-present registers first. - // This way we won't spill them trying to get new registers. - bool need_v0 = v0->reg == -1; - bool need_v1 = v1->reg == -1; - if (!need_v0) { - FindFreeRegs(v0, v0_idx, v0_flags); - } - if (!need_v1) { - FindFreeRegs(v1, v1_idx, v1_flags); - } - // Grab any registers we still need. These calls may evict. - if (need_v0) { - FindFreeRegs(v0, v0_idx, v0_flags); - } - if (need_v1) { - FindFreeRegs(v1, v1_idx, v1_flags); - } -} - -void X64Emitter::FindFreeRegs( - Value* v0, uint32_t& v0_idx, uint32_t v0_flags, - Value* v1, uint32_t& v1_idx, uint32_t v1_flags, - Value* v2, uint32_t& v2_idx, uint32_t v2_flags) { - // TODO(benvanik): support REG_DEST reuse/etc. - // Grab all already-present registers first. - // This way we won't spill them trying to get new registers. - bool need_v0 = v0->reg == -1; - bool need_v1 = v1->reg == -1; - bool need_v2 = v2->reg == -1; - if (!need_v0) { - FindFreeRegs(v0, v0_idx, v0_flags); - } - if (!need_v1) { - FindFreeRegs(v1, v1_idx, v1_flags); - } - if (!need_v2) { - FindFreeRegs(v2, v2_idx, v2_flags); - } - // Grab any registers we still need. These calls may evict. 
- if (need_v0) { - FindFreeRegs(v0, v0_idx, v0_flags); - } - if (need_v1) { - FindFreeRegs(v1, v1_idx, v1_flags); - } - if (need_v2) { - FindFreeRegs(v2, v2_idx, v2_flags); - } -} - -void X64Emitter::FindFreeRegs( - Value* v0, uint32_t& v0_idx, uint32_t v0_flags, - Value* v1, uint32_t& v1_idx, uint32_t v1_flags, - Value* v2, uint32_t& v2_idx, uint32_t v2_flags, - Value* v3, uint32_t& v3_idx, uint32_t v3_flags) { - // TODO(benvanik): support REG_DEST reuse/etc. - // Grab all already-present registers first. - // This way we won't spill them trying to get new registers. - bool need_v0 = v0->reg == -1; - bool need_v1 = v1->reg == -1; - bool need_v2 = v2->reg == -1; - bool need_v3 = v3->reg == -1; - if (!need_v0) { - FindFreeRegs(v0, v0_idx, v0_flags); - } - if (!need_v1) { - FindFreeRegs(v1, v1_idx, v1_flags); - } - if (!need_v2) { - FindFreeRegs(v2, v2_idx, v2_flags); - } - if (!need_v3) { - FindFreeRegs(v3, v3_idx, v3_flags); - } - // Grab any registers we still need. These calls may evict. - if (need_v0) { - FindFreeRegs(v0, v0_idx, v0_flags); - } - if (need_v1) { - FindFreeRegs(v1, v1_idx, v1_flags); - } - if (need_v2) { - FindFreeRegs(v2, v2_idx, v2_flags); - } - if (need_v3) { - FindFreeRegs(v3, v3_idx, v3_flags); - } -} - Instr* X64Emitter::Advance(Instr* i) { auto next = i->next; current_instr_ = next; diff --git a/src/alloy/backend/x64/x64_emitter.h b/src/alloy/backend/x64/x64_emitter.h index 4962dab14..ca13354a6 100644 --- a/src/alloy/backend/x64/x64_emitter.h +++ b/src/alloy/backend/x64/x64_emitter.h @@ -56,90 +56,73 @@ public: public: template void BeginOp(hir::Value* v0, V0& r0, uint32_t r0_flags) { - uint32_t v0_idx; - FindFreeRegs(v0, v0_idx, r0_flags); - SetupReg(v0_idx, r0); + SetupReg(v0, r0); } template void BeginOp(hir::Value* v0, V0& r0, uint32_t r0_flags, hir::Value* v1, V1& r1, uint32_t r1_flags) { - uint32_t v0_idx, v1_idx; - FindFreeRegs(v0, v0_idx, r0_flags, - v1, v1_idx, r1_flags); - SetupReg(v0_idx, r0); - SetupReg(v1_idx, r1); + SetupReg(v0, r0); + SetupReg(v1, r1); } template void BeginOp(hir::Value* v0, V0& r0, uint32_t r0_flags, hir::Value* v1, V1& r1, uint32_t r1_flags, hir::Value* v2, V2& r2, uint32_t r2_flags) { - uint32_t v0_idx, v1_idx, v2_idx; - FindFreeRegs(v0, v0_idx, r0_flags, - v1, v1_idx, r1_flags, - v2, v2_idx, r2_flags); - SetupReg(v0_idx, r0); - SetupReg(v1_idx, r1); - SetupReg(v2_idx, r2); + SetupReg(v0, r0); + SetupReg(v1, r1); + SetupReg(v2, r2); } template void BeginOp(hir::Value* v0, V0& r0, uint32_t r0_flags, hir::Value* v1, V1& r1, uint32_t r1_flags, hir::Value* v2, V2& r2, uint32_t r2_flags, hir::Value* v3, V3& r3, uint32_t r3_flags) { - uint32_t v0_idx, v1_idx, v2_idx, v3_idx; - FindFreeRegs(v0, v0_idx, r0_flags, - v1, v1_idx, r1_flags, - v2, v2_idx, r2_flags, - v3, v3_idx, r3_flags); - SetupReg(v0_idx, r0); - SetupReg(v1_idx, r1); - SetupReg(v2_idx, r2); - SetupReg(v3_idx, r3); + SetupReg(v0, r0); + SetupReg(v1, r1); + SetupReg(v2, r2); + SetupReg(v3, r3); } template void EndOp(V0& r0) { - reg_state_.active_regs = reg_state_.active_regs ^ GetRegBit(r0); } template void EndOp(V0& r0, V1& r1) { - reg_state_.active_regs = reg_state_.active_regs ^ ( - GetRegBit(r0) | GetRegBit(r1)); } template void EndOp(V0& r0, V1& r1, V2& r2) { - reg_state_.active_regs = reg_state_.active_regs ^ ( - GetRegBit(r0) | GetRegBit(r1) | GetRegBit(r2)); } template void EndOp(V0& r0, V1& r1, V2& r2, V3& r3) { - reg_state_.active_regs = reg_state_.active_regs ^ ( - GetRegBit(r0) | GetRegBit(r1) | GetRegBit(r2) | GetRegBit(r3)); } - void ResetRegisters(uint32_t 
reserved_regs); - void EvictStaleRegisters(); + // Reserved: rsp + // Scratch: rax/rcx/rdx + // xmm0-1 + // Available: rbx, r12-r15 (maybe r8-r11, rbp, rsi, rdi?) + // xmm2-xmm15 + static const int GPR_COUNT = 5; + static const int XMM_COUNT = 14; - void FindFreeRegs(hir::Value* v0, uint32_t& v0_idx, uint32_t v0_flags); - void FindFreeRegs(hir::Value* v0, uint32_t& v0_idx, uint32_t v0_flags, - hir::Value* v1, uint32_t& v1_idx, uint32_t v1_flags); - void FindFreeRegs(hir::Value* v0, uint32_t& v0_idx, uint32_t v0_flags, - hir::Value* v1, uint32_t& v1_idx, uint32_t v1_flags, - hir::Value* v2, uint32_t& v2_idx, uint32_t v2_flags); - void FindFreeRegs(hir::Value* v0, uint32_t& v0_idx, uint32_t v0_flags, - hir::Value* v1, uint32_t& v1_idx, uint32_t v1_flags, - hir::Value* v2, uint32_t& v2_idx, uint32_t v2_flags, - hir::Value* v3, uint32_t& v3_idx, uint32_t v3_flags); - - static void SetupReg(uint32_t idx, Xbyak::Reg8& r) { r = Xbyak::Reg8(idx); } - static void SetupReg(uint32_t idx, Xbyak::Reg16& r) { r = Xbyak::Reg16(idx); } - static void SetupReg(uint32_t idx, Xbyak::Reg32& r) { r = Xbyak::Reg32(idx); } - static void SetupReg(uint32_t idx, Xbyak::Reg64& r) { r = Xbyak::Reg64(idx); } - static void SetupReg(uint32_t idx, Xbyak::Xmm& r) { r = Xbyak::Xmm(idx - 16); } - static uint32_t GetRegBit(const Xbyak::Reg8& r) { return 1 << r.getIdx(); } - static uint32_t GetRegBit(const Xbyak::Reg16& r) { return 1 << r.getIdx(); } - static uint32_t GetRegBit(const Xbyak::Reg32& r) { return 1 << r.getIdx(); } - static uint32_t GetRegBit(const Xbyak::Reg64& r) { return 1 << r.getIdx(); } - static uint32_t GetRegBit(const Xbyak::Xmm& r) { return 1 << (16 + r.getIdx()); } + static void SetupReg(hir::Value* v, Xbyak::Reg8& r) { + auto idx = gpr_reg_map_[v->reg.index]; + r = Xbyak::Reg8(idx); + } + static void SetupReg(hir::Value* v, Xbyak::Reg16& r) { + auto idx = gpr_reg_map_[v->reg.index]; + r = Xbyak::Reg16(idx); + } + static void SetupReg(hir::Value* v, Xbyak::Reg32& r) { + auto idx = gpr_reg_map_[v->reg.index]; + r = Xbyak::Reg32(idx); + } + static void SetupReg(hir::Value* v, Xbyak::Reg64& r) { + auto idx = gpr_reg_map_[v->reg.index]; + r = Xbyak::Reg64(idx); + } + static void SetupReg(hir::Value* v, Xbyak::Xmm& r) { + auto idx = xmm_reg_map_[v->reg.index]; + r = Xbyak::Xmm(idx); + } hir::Instr* Advance(hir::Instr* i); @@ -157,21 +140,15 @@ protected: X64CodeCache* code_cache_; XbyakAllocator* allocator_; - struct { - // Registers currently active within a begin/end op block. These - // cannot be reused. - uint32_t active_regs; - // Registers with values in them. - uint32_t live_regs; - // Current register values. 
- hir::Value* reg_values[32]; - } reg_state_; hir::Instr* current_instr_; size_t source_map_count_; Arena source_map_arena_; size_t stack_size_; + + static const uint32_t gpr_reg_map_[GPR_COUNT]; + static const uint32_t xmm_reg_map_[XMM_COUNT]; }; diff --git a/src/alloy/compiler/passes/constant_propagation_pass.cc b/src/alloy/compiler/passes/constant_propagation_pass.cc index 0bf269334..03a514a94 100644 --- a/src/alloy/compiler/passes/constant_propagation_pass.cc +++ b/src/alloy/compiler/passes/constant_propagation_pass.cc @@ -179,6 +179,76 @@ int ConstantPropagationPass::Run(HIRBuilder* builder) { break; // TODO(benvanik): compares + case OPCODE_COMPARE_EQ: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + bool value = i->src1.value->IsConstantEQ(i->src2.value); + i->dest->set_constant(value); + i->Remove(); + } + break; + case OPCODE_COMPARE_NE: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + bool value = i->src1.value->IsConstantNE(i->src2.value); + i->dest->set_constant(value); + i->Remove(); + } + break; + case OPCODE_COMPARE_SLT: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + bool value = i->src1.value->IsConstantSLT(i->src2.value); + i->dest->set_constant(value); + i->Remove(); + } + break; + case OPCODE_COMPARE_SLE: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + bool value = i->src1.value->IsConstantSLE(i->src2.value); + i->dest->set_constant(value); + i->Remove(); + } + break; + case OPCODE_COMPARE_SGT: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + bool value = i->src1.value->IsConstantSGT(i->src2.value); + i->dest->set_constant(value); + i->Remove(); + } + break; + case OPCODE_COMPARE_SGE: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + bool value = i->src1.value->IsConstantSGE(i->src2.value); + i->dest->set_constant(value); + i->Remove(); + } + break; + case OPCODE_COMPARE_ULT: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + bool value = i->src1.value->IsConstantULT(i->src2.value); + i->dest->set_constant(value); + i->Remove(); + } + break; + case OPCODE_COMPARE_ULE: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + bool value = i->src1.value->IsConstantULE(i->src2.value); + i->dest->set_constant(value); + i->Remove(); + } + break; + case OPCODE_COMPARE_UGT: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + bool value = i->src1.value->IsConstantUGT(i->src2.value); + i->dest->set_constant(value); + i->Remove(); + } + break; + case OPCODE_COMPARE_UGE: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + bool value = i->src1.value->IsConstantUGE(i->src2.value); + i->dest->set_constant(value); + i->Remove(); + } + break; case OPCODE_ADD: if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { diff --git a/src/alloy/frontend/ppc/ppc_hir_builder.cc b/src/alloy/frontend/ppc/ppc_hir_builder.cc index 2fc49396a..dd25c4f8a 100644 --- a/src/alloy/frontend/ppc/ppc_hir_builder.cc +++ b/src/alloy/frontend/ppc/ppc_hir_builder.cc @@ -280,6 +280,7 @@ Value* PPCHIRBuilder::LoadCA() { } void PPCHIRBuilder::StoreCA(Value* value) { + value = Truncate(value, INT8_TYPE); StoreContext(offsetof(PPCContext, xer_ca), value); } @@ -288,6 +289,7 @@ Value* PPCHIRBuilder::LoadSAT() { } void PPCHIRBuilder::StoreSAT(Value* value) { + value = Truncate(value, INT8_TYPE); StoreContext(offsetof(PPCContext, vscr_sat), value); } diff --git a/src/alloy/hir/opcodes.inl b/src/alloy/hir/opcodes.inl 
index 4fc7bd9dd..baf214f25 100644 --- a/src/alloy/hir/opcodes.inl +++ b/src/alloy/hir/opcodes.inl @@ -363,7 +363,7 @@ DEFINE_OPCODE( OPCODE_ADD_CARRY, "add_carry", OPCODE_SIG_V_V_V_V, - OPCODE_FLAG_COMMUNATIVE); + 0); DEFINE_OPCODE( OPCODE_VECTOR_ADD, diff --git a/src/alloy/hir/value.h b/src/alloy/hir/value.h index c2c8ed7ae..4587efb19 100644 --- a/src/alloy/hir/value.h +++ b/src/alloy/hir/value.h @@ -221,6 +221,150 @@ public: (other->flags & VALUE_IS_CONSTANT) && constant.i64 != other->constant.i64; } + bool IsConstantSLT(Value* other) const { + XEASSERT(flags & VALUE_IS_CONSTANT && other->flags & VALUE_IS_CONSTANT); + switch (type) { + case INT8_TYPE: + return constant.i8 < other->constant.i8; + case INT16_TYPE: + return constant.i16 < other->constant.i16; + case INT32_TYPE: + return constant.i32 < other->constant.i32; + case INT64_TYPE: + return constant.i64 < other->constant.i64; + case FLOAT32_TYPE: + return constant.f32 < other->constant.f32; + case FLOAT64_TYPE: + return constant.f64 < other->constant.f64; + default: XEASSERTALWAYS(); return false; + } + } + bool IsConstantSLE(Value* other) const { + XEASSERT(flags & VALUE_IS_CONSTANT && other->flags & VALUE_IS_CONSTANT); + switch (type) { + case INT8_TYPE: + return constant.i8 <= other->constant.i8; + case INT16_TYPE: + return constant.i16 <= other->constant.i16; + case INT32_TYPE: + return constant.i32 <= other->constant.i32; + case INT64_TYPE: + return constant.i64 <= other->constant.i64; + case FLOAT32_TYPE: + return constant.f32 <= other->constant.f32; + case FLOAT64_TYPE: + return constant.f64 <= other->constant.f64; + default: XEASSERTALWAYS(); return false; + } + } + bool IsConstantSGT(Value* other) const { + XEASSERT(flags & VALUE_IS_CONSTANT && other->flags & VALUE_IS_CONSTANT); + switch (type) { + case INT8_TYPE: + return constant.i8 > other->constant.i8; + case INT16_TYPE: + return constant.i16 > other->constant.i16; + case INT32_TYPE: + return constant.i32 > other->constant.i32; + case INT64_TYPE: + return constant.i64 > other->constant.i64; + case FLOAT32_TYPE: + return constant.f32 > other->constant.f32; + case FLOAT64_TYPE: + return constant.f64 > other->constant.f64; + default: XEASSERTALWAYS(); return false; + } + } + bool IsConstantSGE(Value* other) const { + XEASSERT(flags & VALUE_IS_CONSTANT && other->flags & VALUE_IS_CONSTANT); + switch (type) { + case INT8_TYPE: + return constant.i8 >= other->constant.i8; + case INT16_TYPE: + return constant.i16 >= other->constant.i16; + case INT32_TYPE: + return constant.i32 >= other->constant.i32; + case INT64_TYPE: + return constant.i64 >= other->constant.i64; + case FLOAT32_TYPE: + return constant.f32 >= other->constant.f32; + case FLOAT64_TYPE: + return constant.f64 >= other->constant.f64; + default: XEASSERTALWAYS(); return false; + } + } + bool IsConstantULT(Value* other) const { + XEASSERT(flags & VALUE_IS_CONSTANT && other->flags & VALUE_IS_CONSTANT); + switch (type) { + case INT8_TYPE: + return (uint8_t)constant.i8 < (uint8_t)other->constant.i8; + case INT16_TYPE: + return (uint16_t)constant.i16 < (uint16_t)other->constant.i16; + case INT32_TYPE: + return (uint32_t)constant.i32 < (uint32_t)other->constant.i32; + case INT64_TYPE: + return (uint64_t)constant.i64 < (uint64_t)other->constant.i64; + case FLOAT32_TYPE: + return constant.f32 < other->constant.f32; + case FLOAT64_TYPE: + return constant.f64 < other->constant.f64; + default: XEASSERTALWAYS(); return false; + } + } + bool IsConstantULE(Value* other) const { + XEASSERT(flags & VALUE_IS_CONSTANT && other->flags 
& VALUE_IS_CONSTANT); + switch (type) { + case INT8_TYPE: + return (uint8_t)constant.i8 <= (uint8_t)other->constant.i8; + case INT16_TYPE: + return (uint16_t)constant.i16 <= (uint16_t)other->constant.i16; + case INT32_TYPE: + return (uint32_t)constant.i32 <= (uint32_t)other->constant.i32; + case INT64_TYPE: + return (uint64_t)constant.i64 <= (uint64_t)other->constant.i64; + case FLOAT32_TYPE: + return constant.f32 <= other->constant.f32; + case FLOAT64_TYPE: + return constant.f64 <= other->constant.f64; + default: XEASSERTALWAYS(); return false; + } + } + bool IsConstantUGT(Value* other) const { + XEASSERT(flags & VALUE_IS_CONSTANT && other->flags & VALUE_IS_CONSTANT); + switch (type) { + case INT8_TYPE: + return (uint8_t)constant.i8 > (uint8_t)other->constant.i8; + case INT16_TYPE: + return (uint16_t)constant.i16 > (uint16_t)other->constant.i16; + case INT32_TYPE: + return (uint32_t)constant.i32 > (uint32_t)other->constant.i32; + case INT64_TYPE: + return (uint64_t)constant.i64 > (uint64_t)other->constant.i64; + case FLOAT32_TYPE: + return constant.f32 > other->constant.f32; + case FLOAT64_TYPE: + return constant.f64 > other->constant.f64; + default: XEASSERTALWAYS(); return false; + } + } + bool IsConstantUGE(Value* other) const { + XEASSERT(flags & VALUE_IS_CONSTANT && other->flags & VALUE_IS_CONSTANT); + switch (type) { + case INT8_TYPE: + return (uint8_t)constant.i8 >= (uint8_t)other->constant.i8; + case INT16_TYPE: + return (uint16_t)constant.i16 >= (uint16_t)other->constant.i16; + case INT32_TYPE: + return (uint32_t)constant.i32 >= (uint32_t)other->constant.i32; + case INT64_TYPE: + return (uint64_t)constant.i64 >= (uint64_t)other->constant.i64; + case FLOAT32_TYPE: + return constant.f32 >= other->constant.f32; + case FLOAT64_TYPE: + return constant.f64 >= other->constant.f64; + default: XEASSERTALWAYS(); return false; + } + } uint32_t AsUint32(); uint64_t AsUint64(); diff --git a/src/alloy/runtime/runtime.cc b/src/alloy/runtime/runtime.cc index d39ac4220..3fc45a447 100644 --- a/src/alloy/runtime/runtime.cc +++ b/src/alloy/runtime/runtime.cc @@ -58,7 +58,7 @@ Runtime::~Runtime() { // TODO(benvanik): based on compiler support #include -//#include +#include int Runtime::Initialize(Frontend* frontend, Backend* backend) { // Must be initialized by subclass before calling into this. From b2f886be98101afadc8be231d20789f41d541022 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Mon, 10 Feb 2014 23:48:41 -0800 Subject: [PATCH 083/184] Fixing more x64 stuff. 
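(Looking back at the value.h hunk in the previous patch: the
IsConstantSLT/.../IsConstantUGE helpers exist so the constant-propagation
pass can fold compares whose operands are both constant. Conceptually,
with illustrative HIR rather than actual dump output:

    v2 = compare_ult v0, v1    ; v0 = 7, v1 = 9, both constant
    ; becomes, after set_constant + Remove:
    v2 = 1                     ; the compare instruction is gone

One subtlety: the unsigned variants fall back to ordinary float
comparisons for FLOAT32/FLOAT64, so signedness only changes the result
for the integer types.)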
--- .../x64/lowering/lowering_sequences.cc | 25 ++++++++++++++++--- src/alloy/backend/x64/lowering/op_utils.inl | 4 ++- src/alloy/backend/x64/x64_emitter.cc | 2 +- src/alloy/backend/x64/x64_emitter.h | 6 ++--- 4 files changed, 29 insertions(+), 8 deletions(-) diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc index 0573c928f..5ab38f41f 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc @@ -3206,17 +3206,36 @@ table->AddSequence(OPCODE_COMPARE_EXCHANGE, [](X64Emitter& e, Instr*& i) { }); table->AddSequence(OPCODE_ATOMIC_EXCHANGE, [](X64Emitter& e, Instr*& i) { - if (i->dest->type == INT32_TYPE) { - // dest = old_value = InterlockedExchange(src1 = address, src2 = new_value); + // dest = old_value = InterlockedExchange(src1 = address, src2 = new_value); + if (i->Match(SIG_TYPE_I32, SIG_TYPE_I64, SIG_TYPE_I32)) { Reg32 dest, src2; Reg64 src1; e.BeginOp(i->dest, dest, REG_DEST, i->src1.value, src1, 0, i->src2.value, src2, 0); + Reg64 real_src1 = src1; + if (dest.getIdx() == src1.getIdx()) { + e.mov(TEMP_REG, src1); + real_src1 = TEMP_REG; + } e.mov(dest, src2); e.lock(); - e.xchg(e.dword[src1], dest); + e.xchg(e.dword[real_src1], dest); e.EndOp(dest, src1, src2); + } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I64, SIG_TYPE_I32C)) { + Reg32 dest; + Reg64 src1; + e.BeginOp(i->dest, dest, REG_DEST, + i->src1.value, src1, 0); + Reg64 real_src1 = src1; + if (dest.getIdx() == src1.getIdx()) { + e.mov(TEMP_REG, src1); + real_src1 = TEMP_REG; + } + e.mov(dest, i->src2.value->constant.i32); + e.lock(); + e.xchg(e.dword[real_src1], dest); + e.EndOp(dest, src1); } else { ASSERT_INVALID_TYPE(); } diff --git a/src/alloy/backend/x64/lowering/op_utils.inl b/src/alloy/backend/x64/lowering/op_utils.inl index 0daac5d64..749e84901 100644 --- a/src/alloy/backend/x64/lowering/op_utils.inl +++ b/src/alloy/backend/x64/lowering/op_utils.inl @@ -1034,7 +1034,9 @@ void XmmTernaryOpVVV(X64Emitter& e, Instr*& i, xmm_vvv_fn vvv_fn, if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { vvv_fn(e, *i, dest, src1, src2); } else { - UNIMPLEMENTED_SEQ(); + e.movaps(e.xmm0, src3); + e.movaps(dest, src1); + vvv_fn(e, *i, dest, src2, e.xmm0); } } else { e.movaps(dest, src1); diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index 8ae38d80d..3a9e6d142 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -42,7 +42,7 @@ const uint32_t X64Emitter::gpr_reg_map_[X64Emitter::GPR_COUNT] = { }; const uint32_t X64Emitter::xmm_reg_map_[X64Emitter::XMM_COUNT] = { - 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, }; diff --git a/src/alloy/backend/x64/x64_emitter.h b/src/alloy/backend/x64/x64_emitter.h index ca13354a6..e006bf3f9 100644 --- a/src/alloy/backend/x64/x64_emitter.h +++ b/src/alloy/backend/x64/x64_emitter.h @@ -98,10 +98,10 @@ public: // Reserved: rsp // Scratch: rax/rcx/rdx // xmm0-1 - // Available: rbx, r12-r15 (maybe r8-r11, rbp, rsi, rdi?) - // xmm2-xmm15 + // Available: rbx, r12-r15 (save to get r8-r11, rbp, rsi, rdi?) 
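  // (Context for the xmm remap above: under the Win64 calling convention
  // xmm0-xmm5 are volatile while xmm6-xmm15 are callee-save, so handing
  // the allocator xmm6+ lets values survive calls without save/restore
  // code in every sequence.)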
+ // xmm6-xmm15 (save to get xmm2-xmm5) static const int GPR_COUNT = 5; - static const int XMM_COUNT = 14; + static const int XMM_COUNT = 10; static void SetupReg(hir::Value* v, Xbyak::Reg8& r) { auto idx = gpr_reg_map_[v->reg.index]; From 86f66c4ab7111c2b94792e0a3355572f634514de Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sat, 15 Feb 2014 15:49:41 -0800 Subject: [PATCH 084/184] So few (faked) registers seems to break some things. --- src/alloy/backend/ivm/ivm_backend.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/alloy/backend/ivm/ivm_backend.cc b/src/alloy/backend/ivm/ivm_backend.cc index 411d16d30..67703f7d4 100644 --- a/src/alloy/backend/ivm/ivm_backend.cc +++ b/src/alloy/backend/ivm/ivm_backend.cc @@ -38,14 +38,14 @@ int IVMBackend::Initialize() { 0, "gpr", MachineInfo::RegisterSet::INT_TYPES, - 6, + 16, }; machine_info_.register_sets[1] = { 1, "vec", MachineInfo::RegisterSet::FLOAT_TYPES | MachineInfo::RegisterSet::VEC_TYPES, - 6, + 16, }; alloy::tracing::WriteEvent(EventType::Init({ From 24fc5acb07208e3adc187f28aafc46a006f99b0f Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 23 Feb 2014 16:36:17 -0800 Subject: [PATCH 085/184] Partially implemented Windows keyboard HID for testing. --- src/xenia/hid/hid.cc | 10 +- src/xenia/hid/sources.gypi | 1 + src/xenia/hid/winkey/sources.gypi | 10 ++ src/xenia/hid/winkey/winkey_hid-private.h | 31 ++++ src/xenia/hid/winkey/winkey_hid.cc | 44 +++++ src/xenia/hid/winkey/winkey_hid.h | 33 ++++ src/xenia/hid/winkey/winkey_input_driver.cc | 181 ++++++++++++++++++++ src/xenia/hid/winkey/winkey_input_driver.h | 50 ++++++ 8 files changed, 359 insertions(+), 1 deletion(-) create mode 100644 src/xenia/hid/winkey/sources.gypi create mode 100644 src/xenia/hid/winkey/winkey_hid-private.h create mode 100644 src/xenia/hid/winkey/winkey_hid.cc create mode 100644 src/xenia/hid/winkey/winkey_hid.h create mode 100644 src/xenia/hid/winkey/winkey_input_driver.cc create mode 100644 src/xenia/hid/winkey/winkey_input_driver.h diff --git a/src/xenia/hid/hid.cc b/src/xenia/hid/hid.cc index 9aa58c618..fc3a3b9e6 100644 --- a/src/xenia/hid/hid.cc +++ b/src/xenia/hid/hid.cc @@ -17,11 +17,12 @@ using namespace xe::hid; DEFINE_string(hid, "any", - "Input system. Use: [any, nop, xinput]"); + "Input system. Use: [any, nop, winkey, xinput]"); #include #if XE_PLATFORM_WIN32 +#include #include #endif // WIN32 @@ -33,6 +34,8 @@ InputSystem* xe::hid::Create(Emulator* emulator) { if (FLAGS_hid.compare("nop") == 0) { input_system->AddDriver(xe::hid::nop::Create(input_system)); #if XE_PLATFORM_WIN32 + } else if (FLAGS_hid.compare("winkey") == 0) { + input_system->AddDriver(xe::hid::winkey::Create(input_system)); } else if (FLAGS_hid.compare("xinput") == 0) { input_system->AddDriver(xe::hid::xinput::Create(input_system)); #endif // WIN32 @@ -43,6 +46,11 @@ InputSystem* xe::hid::Create(Emulator* emulator) { // NOTE: in any mode we create as many as we can, falling back to nop. 
#if XE_PLATFORM_WIN32 + InputDriver* winkey_driver = xe::hid::winkey::Create(input_system); + if (winkey_driver) { + input_system->AddDriver(winkey_driver); + any_created = true; + } InputDriver* xinput_driver = xe::hid::xinput::Create(input_system); if (xinput_driver) { input_system->AddDriver(xinput_driver); diff --git a/src/xenia/hid/sources.gypi b/src/xenia/hid/sources.gypi index e166ec3f0..079d059ca 100644 --- a/src/xenia/hid/sources.gypi +++ b/src/xenia/hid/sources.gypi @@ -17,6 +17,7 @@ 'conditions': [ ['OS == "win"', { 'includes': [ + 'winkey/sources.gypi', 'xinput/sources.gypi', ], }], diff --git a/src/xenia/hid/winkey/sources.gypi b/src/xenia/hid/winkey/sources.gypi new file mode 100644 index 000000000..792ac571d --- /dev/null +++ b/src/xenia/hid/winkey/sources.gypi @@ -0,0 +1,10 @@ +# Copyright 2013 Ben Vanik. All Rights Reserved. +{ + 'sources': [ + 'winkey_hid-private.h', + 'winkey_hid.cc', + 'winkey_hid.h', + 'winkey_input_driver.cc', + 'winkey_input_driver.h', + ], +} diff --git a/src/xenia/hid/winkey/winkey_hid-private.h b/src/xenia/hid/winkey/winkey_hid-private.h new file mode 100644 index 000000000..2100f9185 --- /dev/null +++ b/src/xenia/hid/winkey/winkey_hid-private.h @@ -0,0 +1,31 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_HID_WINKEY_WINKEY_HID_PRIVATE_H_ +#define XENIA_HID_WINKEY_WINKEY_HID_PRIVATE_H_ + +#include + +#include + + +namespace xe { +namespace hid { +namespace winkey { + + + + + +} // namespace winkey +} // namespace hid +} // namespace xe + + +#endif // XENIA_HID_WINKEY_WINKEY_HID_PRIVATE_H_ diff --git a/src/xenia/hid/winkey/winkey_hid.cc b/src/xenia/hid/winkey/winkey_hid.cc new file mode 100644 index 000000000..43d363271 --- /dev/null +++ b/src/xenia/hid/winkey/winkey_hid.cc @@ -0,0 +1,44 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include + +#include + + +using namespace xe; +using namespace xe::hid; +using namespace xe::hid::winkey; + + +namespace { + void InitializeIfNeeded(); + void CleanupOnShutdown(); + + void InitializeIfNeeded() { + static bool has_initialized = false; + if (has_initialized) { + return; + } + has_initialized = true; + + // + + atexit(CleanupOnShutdown); + } + + void CleanupOnShutdown() { + } +} + + +InputDriver* xe::hid::winkey::Create(InputSystem* input_system) { + InitializeIfNeeded(); + return new WinKeyInputDriver(input_system); +} diff --git a/src/xenia/hid/winkey/winkey_hid.h b/src/xenia/hid/winkey/winkey_hid.h new file mode 100644 index 000000000..a5ed273bc --- /dev/null +++ b/src/xenia/hid/winkey/winkey_hid.h @@ -0,0 +1,33 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_HID_WINKEY_WINKEY_HID_H_ +#define XENIA_HID_WINKEY_WINKEY_HID_H_ + +#include + + +XEDECLARECLASS2(xe, hid, InputDriver); +XEDECLARECLASS2(xe, hid, InputSystem); + + +namespace xe { +namespace hid { +namespace winkey { + + +InputDriver* Create(InputSystem* input_system); + + +} // namespace winkey +} // namespace hid +} // namespace xe + + +#endif // XENIA_HID_WINKEY_WINKEY_HID_H_ diff --git a/src/xenia/hid/winkey/winkey_input_driver.cc b/src/xenia/hid/winkey/winkey_input_driver.cc new file mode 100644 index 000000000..d0e63d64b --- /dev/null +++ b/src/xenia/hid/winkey/winkey_input_driver.cc @@ -0,0 +1,181 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include + + +using namespace xe; +using namespace xe::hid; +using namespace xe::hid::winkey; + + +WinKeyInputDriver::WinKeyInputDriver(InputSystem* input_system) : + packet_number_(1), + InputDriver(input_system) { +} + +WinKeyInputDriver::~WinKeyInputDriver() { +} + +X_STATUS WinKeyInputDriver::Setup() { + return X_STATUS_SUCCESS; +} + +X_RESULT WinKeyInputDriver::GetCapabilities( + uint32_t user_index, uint32_t flags, X_INPUT_CAPABILITIES& out_caps) { + if (user_index != 0) { + return X_ERROR_DEVICE_NOT_CONNECTED; + } + + // TODO(benvanik): confirm with a real XInput controller. 
+ out_caps.type = 0x01; // XINPUT_DEVTYPE_GAMEPAD + out_caps.sub_type = 0x01; // XINPUT_DEVSUBTYPE_GAMEPAD + out_caps.flags = 0; + out_caps.gamepad.buttons = 0xFFFF; + out_caps.gamepad.left_trigger = 0xFF; + out_caps.gamepad.right_trigger = 0xFF; + out_caps.gamepad.thumb_lx = (int16_t)0xFFFF; + out_caps.gamepad.thumb_ly = (int16_t)0xFFFF; + out_caps.gamepad.thumb_rx = (int16_t)0xFFFF; + out_caps.gamepad.thumb_ry = (int16_t)0xFFFF; + out_caps.vibration.left_motor_speed = 0; + out_caps.vibration.right_motor_speed = 0; + return X_ERROR_SUCCESS; +} + +#define IS_KEY_TOGGLED(key) ((GetKeyState(key) & 0x1) == 0x1) +#define IS_KEY_DOWN(key) ((GetAsyncKeyState(key) & 0x8000) == 0x8000) + +X_RESULT WinKeyInputDriver::GetState( + uint32_t user_index, X_INPUT_STATE& out_state) { + if (user_index != 0) { + return X_ERROR_DEVICE_NOT_CONNECTED; + } + + packet_number_++; + + uint16_t buttons = 0; + uint8_t left_trigger = 0; + uint8_t right_trigger = 0; + int16_t thumb_lx = 0; + int16_t thumb_ly = 0; + int16_t thumb_rx = 0; + int16_t thumb_ry = 0; + + if (IS_KEY_TOGGLED(VK_CAPITAL)) { + // dpad toggled + if (IS_KEY_DOWN(0x41)) { + // A + buttons |= 0x0004; // XINPUT_GAMEPAD_DPAD_LEFT + } + if (IS_KEY_DOWN(0x44)) { + // D + buttons |= 0x0008; // XINPUT_GAMEPAD_DPAD_RIGHT + } + if (IS_KEY_DOWN(0x53)) { + // S + buttons |= 0x0002; // XINPUT_GAMEPAD_DPAD_DOWN + } + if (IS_KEY_DOWN(0x57)) { + // W + buttons |= 0x0001; // XINPUT_GAMEPAD_DPAD_UP + } + } else { + // left stick + if (IS_KEY_DOWN(0x41)) { + // A + thumb_lx += SHRT_MIN; + } + if (IS_KEY_DOWN(0x44)) { + // D + thumb_lx += SHRT_MAX; + } + if (IS_KEY_DOWN(0x53)) { + // S + thumb_ly += SHRT_MIN; + } + if (IS_KEY_DOWN(0x57)) { + // W + thumb_ly += SHRT_MAX; + } + } + + if (IS_KEY_DOWN(0x4C)) { + // L + buttons |= 0x4000; // XINPUT_GAMEPAD_X + } + if (IS_KEY_DOWN(VK_OEM_7)) { + // ' + buttons |= 0x2000; // XINPUT_GAMEPAD_B + } + if (IS_KEY_DOWN(VK_OEM_1)) { + // ; + buttons |= 0x1000; // XINPUT_GAMEPAD_A + } + if (IS_KEY_DOWN(0x50)) { + // P + buttons |= 0x8000; // XINPUT_GAMEPAD_Y + } + + if (IS_KEY_DOWN(0x5A)) { + // Z + buttons |= 0x0020; // XINPUT_GAMEPAD_BACK + } + if (IS_KEY_DOWN(0x58)) { + // X + buttons |= 0x0010; // XINPUT_GAMEPAD_START + } + + out_state.packet_number = packet_number_; + out_state.gamepad.buttons = buttons; + out_state.gamepad.left_trigger = left_trigger; + out_state.gamepad.right_trigger = right_trigger; + out_state.gamepad.thumb_lx = thumb_lx; + out_state.gamepad.thumb_ly = thumb_ly; + out_state.gamepad.thumb_rx = thumb_rx; + out_state.gamepad.thumb_ry = thumb_ry; + + return X_ERROR_SUCCESS; +} + +X_RESULT WinKeyInputDriver::SetState( + uint32_t user_index, X_INPUT_VIBRATION& vibration) { + if (user_index != 0) { + return X_ERROR_DEVICE_NOT_CONNECTED; + } + + return X_ERROR_SUCCESS; +} + +X_RESULT WinKeyInputDriver::GetKeystroke( + uint32_t user_index, uint32_t flags, X_INPUT_KEYSTROKE& out_keystroke) { + if (user_index != 0) { + return X_ERROR_DEVICE_NOT_CONNECTED; + } + + X_RESULT result = X_ERROR_EMPTY; + + uint16_t virtual_key = 0; + uint16_t unicode = 0; + uint16_t keystroke_flags = 0; + uint8_t hid_code = 0; + + out_keystroke.virtual_key = virtual_key; + out_keystroke.unicode = unicode; + out_keystroke.flags = keystroke_flags; + out_keystroke.user_index = 0; + out_keystroke.hid_code = hid_code; + + // X_ERROR_EMPTY if no new keys + // X_ERROR_DEVICE_NOT_CONNECTED if no device + // X_ERROR_SUCCESS if key + return result; +} diff --git a/src/xenia/hid/winkey/winkey_input_driver.h b/src/xenia/hid/winkey/winkey_input_driver.h 
new file mode 100644 index 000000000..b1d00fd10 --- /dev/null +++ b/src/xenia/hid/winkey/winkey_input_driver.h @@ -0,0 +1,50 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_HID_WINKEY_WINKEY_DRIVER_H_ +#define XENIA_HID_WINKEY_WINKEY_DRIVER_H_ + +#include + +#include +#include + + +namespace xe { +namespace hid { +namespace winkey { + + +class WinKeyInputDriver : public InputDriver { +public: + WinKeyInputDriver(InputSystem* input_system); + virtual ~WinKeyInputDriver(); + + virtual X_STATUS Setup(); + + virtual X_RESULT GetCapabilities( + uint32_t user_index, uint32_t flags, X_INPUT_CAPABILITIES& out_caps); + virtual X_RESULT GetState( + uint32_t user_index, X_INPUT_STATE& out_state); + virtual X_RESULT SetState( + uint32_t user_index, X_INPUT_VIBRATION& vibration); + virtual X_RESULT GetKeystroke( + uint32_t user_index, uint32_t flags, X_INPUT_KEYSTROKE& out_keystroke); + +protected: + uint32_t packet_number_; +}; + + +} // namespace winkey +} // namespace hid +} // namespace xe + + +#endif // XENIA_HID_WINKEY_WINKEY_DRIVER_H_ From 9a2d99d6521b63f524dcafec61a13833e79ea227 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Fri, 28 Mar 2014 20:39:21 -0700 Subject: [PATCH 086/184] Fixing CFA. --- .../passes/control_flow_analysis_pass.cc | 21 +++++++++---------- .../passes/data_flow_analysis_pass.cc | 2 +- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/src/alloy/compiler/passes/control_flow_analysis_pass.cc b/src/alloy/compiler/passes/control_flow_analysis_pass.cc index 89442bcb6..bff651fe2 100644 --- a/src/alloy/compiler/passes/control_flow_analysis_pass.cc +++ b/src/alloy/compiler/passes/control_flow_analysis_pass.cc @@ -43,17 +43,16 @@ int ControlFlowAnalysisPass::Run(HIRBuilder* builder) { while (block) { auto instr = block->instr_tail; while (instr) { - if (instr->opcode->flags & OPCODE_FLAG_BRANCH) { - if (instr->opcode == &OPCODE_BRANCH_info) { - auto label = instr->src1.label; - builder->AddEdge(block, label->block, Edge::UNCONDITIONAL); - break; - } else if (instr->opcode == &OPCODE_BRANCH_TRUE_info || - instr->opcode == &OPCODE_BRANCH_FALSE_info) { - auto label = instr->src2.label; - builder->AddEdge(block, label->block, 0); - break; - } + if ((instr->opcode->flags & OPCODE_FLAG_BRANCH) == 0) { + break; + } + if (instr->opcode == &OPCODE_BRANCH_info) { + auto label = instr->src1.label; + builder->AddEdge(block, label->block, Edge::UNCONDITIONAL); + } else if (instr->opcode == &OPCODE_BRANCH_TRUE_info || + instr->opcode == &OPCODE_BRANCH_FALSE_info) { + auto label = instr->src2.label; + builder->AddEdge(block, label->block, 0); } instr = instr->prev; } diff --git a/src/alloy/compiler/passes/data_flow_analysis_pass.cc b/src/alloy/compiler/passes/data_flow_analysis_pass.cc index 8501d1675..b4e1ea644 100644 --- a/src/alloy/compiler/passes/data_flow_analysis_pass.cc +++ b/src/alloy/compiler/passes/data_flow_analysis_pass.cc @@ -82,7 +82,7 @@ void DataFlowAnalysisPass::AnalyzeFlow(HIRBuilder* builder, // Walk blocks in reverse and calculate incoming/outgoing values. 
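// (On the control-flow fix above: a block tail can carry several
// consecutive branch instructions, e.g. a branch_true for the taken edge
// followed by an unconditional branch for the fall-through. The old loop
// recorded only the first branch it met and broke out; the rewrite walks
// backwards adding one edge per branch and stops at the first non-branch
// instruction:
//
//   ...                        ; non-branch: scan stops here
//   branch_true v1, label_a    ; AddEdge(block, a.block)
//   branch label_b             ; AddEdge(block, b.block)
// )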
auto block = builder->last_block(); while (block) { - // allocate bitsets based on max value number + // Allocate bitsets based on max value number. block->incoming_values = incoming_bitvectors[block->ordinal]; auto& incoming_values = *block->incoming_values; From 1e196df4b3259747a5c470fbf4505b795dd8947c Mon Sep 17 00:00:00 2001 From: Shawn Hoffman Date: Fri, 28 Mar 2014 21:48:12 -0700 Subject: [PATCH 087/184] Fix spelling of GetLogicalProcessorInformation in GetProcAddress call - spelling actually matters here. --- src/xenia/core/pal_win.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xenia/core/pal_win.cc b/src/xenia/core/pal_win.cc index febf87935..3ef63b141 100644 --- a/src/xenia/core/pal_win.cc +++ b/src/xenia/core/pal_win.cc @@ -77,7 +77,7 @@ int xe_pal_get_system_info(xe_system_info* out_info) { kernel32 = GetModuleHandle(TEXT("kernel32")); XEEXPECTNOTNULL(kernel32); - glpi = (LPFN_GLPI)GetProcAddress(kernel32, "GetLogicalProcessorInfomration"); + glpi = (LPFN_GLPI)GetProcAddress(kernel32, "GetLogicalProcessorInformation"); XEEXPECTNOTNULL(glpi); // Call GLPI once to get the buffer size, allocate it, then call again. From 60a7e79e1aa59f8e82a6d27b0e7bbc4b5479d330 Mon Sep 17 00:00:00 2001 From: Anthony Pesch Date: Tue, 13 May 2014 22:20:42 -0700 Subject: [PATCH 088/184] assume c++11 clang didn't like static members in anonymous structures, gave them names WriteEvent template wouldn't resolve for temporary values without const decl in clang added a few missing headers added -fno-operator-names for xbyak compilation under gcc/clang --- src/alloy/backend/ivm/ivm_assembler.cc | 2 +- src/alloy/backend/ivm/tracing.h | 8 ++++---- src/alloy/backend/x64/lowering/tracers.h | 1 + src/alloy/backend/x64/tracing.h | 8 ++++---- src/alloy/compiler/tracing.h | 4 ++-- src/alloy/delegate.h | 1 + src/alloy/frontend/tracing.h | 4 ++-- src/alloy/runtime/entry_table.h | 2 +- src/alloy/runtime/module.h | 2 +- src/alloy/runtime/tracing.h | 20 ++++++++++---------- src/alloy/tracing/event_type.h | 4 ++-- src/alloy/tracing/tracing.h | 2 +- src/xenia/kernel/fs/filesystem.cc | 4 ++-- src/xenia/kernel/fs/filesystem.h | 2 +- src/xenia/platform_includes.h | 5 ----- src/xenia/types.h | 2 +- xenia.gyp | 18 +++++++++++++++++- 17 files changed, 51 insertions(+), 38 deletions(-) diff --git a/src/alloy/backend/ivm/ivm_assembler.cc b/src/alloy/backend/ivm/ivm_assembler.cc index ff665b8f3..bfd7600a7 100644 --- a/src/alloy/backend/ivm/ivm_assembler.cc +++ b/src/alloy/backend/ivm/ivm_assembler.cc @@ -109,7 +109,7 @@ int IVMAssembler::Assemble( // Fixup label references. 
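
On the fixup loop that follows: the IVM assembler stashes a label's code offset in the label's void* tag field with the high bit set as an is-offset marker, and the patched line casts through intptr_t before truncating so clang stops warning about shrinking a pointer straight to uint32_t. A minimal illustration of that style of encoding, with hypothetical helper names (TagOffset and friends are not in the tree):

#include <cassert>
#include <cstdint>

// Store either a real pointer or a 31-bit offset in one void* field, using
// the high bit as the discriminator. Illustrative only; the actual encoding
// lives in ivm_assembler.cc.
static void* TagOffset(uint32_t offset) {
  return (void*)(intptr_t)(offset | 0x80000000u);
}

static bool IsTaggedOffset(void* tag) {
  return ((uintptr_t)tag & 0x80000000u) != 0;
}

static uint32_t UntagOffset(void* tag) {
  // Cast through intptr_t first (as the patch does) so the compiler sees an
  // integer truncation rather than a pointer-to-uint32_t conversion.
  return (uint32_t)(intptr_t)tag & ~0x80000000u;
}

int main() {
  void* tag = TagOffset(0x1234);
  assert(IsTaggedOffset(tag));
  assert(UntagOffset(tag) == 0x1234);
  return 0;
}
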
LabelRef* label_ref = ctx.label_ref_head; while (label_ref) { - label_ref->instr->src1_reg = (uint32_t)label_ref->label->tag & ~0x80000000; + label_ref->instr->src1_reg = (uint32_t)(intptr_t)label_ref->label->tag & ~0x80000000; label_ref = label_ref->next; } diff --git a/src/alloy/backend/ivm/tracing.h b/src/alloy/backend/ivm/tracing.h index a1fcdf20d..526aa912e 100644 --- a/src/alloy/backend/ivm/tracing.h +++ b/src/alloy/backend/ivm/tracing.h @@ -32,17 +32,17 @@ public: ALLOY_BACKEND_IVM_ASSEMBLER_DEINIT = ALLOY_BACKEND_IVM_ASSEMBLER | (2), }; - typedef struct { + typedef struct Init_s { static const uint32_t event_type = ALLOY_BACKEND_IVM_INIT; } Init; - typedef struct { + typedef struct Deinit_s { static const uint32_t event_type = ALLOY_BACKEND_IVM_DEINIT; } Deinit; - typedef struct { + typedef struct AssemblerInit_s { static const uint32_t event_type = ALLOY_BACKEND_IVM_ASSEMBLER_INIT; } AssemblerInit; - typedef struct { + typedef struct AssemblerDeinit_s { static const uint32_t event_type = ALLOY_BACKEND_IVM_ASSEMBLER_DEINIT; } AssemblerDeinit; }; diff --git a/src/alloy/backend/x64/lowering/tracers.h b/src/alloy/backend/x64/lowering/tracers.h index 9afd58448..e0536c7c5 100644 --- a/src/alloy/backend/x64/lowering/tracers.h +++ b/src/alloy/backend/x64/lowering/tracers.h @@ -11,6 +11,7 @@ #define ALLOY_BACKEND_X64_X64_LOWERING_TRACERS_H_ #include +#include namespace alloy { diff --git a/src/alloy/backend/x64/tracing.h b/src/alloy/backend/x64/tracing.h index 36d814d67..e6689b830 100644 --- a/src/alloy/backend/x64/tracing.h +++ b/src/alloy/backend/x64/tracing.h @@ -32,17 +32,17 @@ public: ALLOY_BACKEND_X64_ASSEMBLER_DEINIT = ALLOY_BACKEND_X64_ASSEMBLER | (2), }; - typedef struct { + typedef struct Init_s { static const uint32_t event_type = ALLOY_BACKEND_X64_INIT; } Init; - typedef struct { + typedef struct Deinit_s { static const uint32_t event_type = ALLOY_BACKEND_X64_DEINIT; } Deinit; - typedef struct { + typedef struct AssemblerInit_s { static const uint32_t event_type = ALLOY_BACKEND_X64_ASSEMBLER_INIT; } AssemblerInit; - typedef struct { + typedef struct AssemblerDeinit_s { static const uint32_t event_type = ALLOY_BACKEND_X64_ASSEMBLER_DEINIT; } AssemblerDeinit; }; diff --git a/src/alloy/compiler/tracing.h b/src/alloy/compiler/tracing.h index 04da6d9ee..85d99992a 100644 --- a/src/alloy/compiler/tracing.h +++ b/src/alloy/compiler/tracing.h @@ -27,10 +27,10 @@ public: ALLOY_COMPILER_DEINIT = ALLOY_COMPILER | (2), }; - typedef struct { + typedef struct Init_s { static const uint32_t event_type = ALLOY_COMPILER_INIT; } Init; - typedef struct { + typedef struct Deinit_s { static const uint32_t event_type = ALLOY_COMPILER_DEINIT; } Deinit; }; diff --git a/src/alloy/delegate.h b/src/alloy/delegate.h index e6ad2fcd1..176ff4b6b 100644 --- a/src/alloy/delegate.h +++ b/src/alloy/delegate.h @@ -11,6 +11,7 @@ #define ALLOY_DELEGATE_H_ #include +#include #include #include diff --git a/src/alloy/frontend/tracing.h b/src/alloy/frontend/tracing.h index 61aadb949..ad9e8dae7 100644 --- a/src/alloy/frontend/tracing.h +++ b/src/alloy/frontend/tracing.h @@ -27,10 +27,10 @@ public: ALLOY_FRONTEND_DEINIT = ALLOY_FRONTEND | (2), }; - typedef struct { + typedef struct Init_s { static const uint32_t event_type = ALLOY_FRONTEND_INIT; } Init; - typedef struct { + typedef struct Deinit_s { static const uint32_t event_type = ALLOY_FRONTEND_DEINIT; } Deinit; }; diff --git a/src/alloy/runtime/entry_table.h b/src/alloy/runtime/entry_table.h index e9f1ca9f2..acbabc26e 100644 --- a/src/alloy/runtime/entry_table.h +++ 
b/src/alloy/runtime/entry_table.h @@ -47,7 +47,7 @@ public: private: // TODO(benvanik): replace with a better data structure. Mutex* lock_; - typedef std::tr1::unordered_map EntryMap; + typedef std::unordered_map EntryMap; EntryMap map_; }; diff --git a/src/alloy/runtime/module.h b/src/alloy/runtime/module.h index 005e325a1..c05e009ca 100644 --- a/src/alloy/runtime/module.h +++ b/src/alloy/runtime/module.h @@ -62,7 +62,7 @@ protected: private: // TODO(benvanik): replace with a better data structure. Mutex* lock_; - typedef std::tr1::unordered_map SymbolMap; + typedef std::unordered_map SymbolMap; SymbolMap map_; typedef std::vector SymbolList; SymbolList list_; diff --git a/src/alloy/runtime/tracing.h b/src/alloy/runtime/tracing.h index 005562d07..262662b90 100644 --- a/src/alloy/runtime/tracing.h +++ b/src/alloy/runtime/tracing.h @@ -40,46 +40,46 @@ public: ALLOY_RUNTIME_MEMORY_HEAP_FREE = ALLOY_RUNTIME_MEMORY | (4), }; - typedef struct { + typedef struct Init_s { static const uint32_t event_type = ALLOY_RUNTIME_INIT; } Init; - typedef struct { + typedef struct Deinit_s { static const uint32_t event_type = ALLOY_RUNTIME_DEINIT; } Deinit; - typedef struct { + typedef struct ThreadInit_s { static const uint32_t event_type = ALLOY_RUNTIME_THREAD_INIT; } ThreadInit; - typedef struct { + typedef struct ThreadDeinit_s { static const uint32_t event_type = ALLOY_RUNTIME_THREAD_DEINIT; } ThreadDeinit; - typedef struct { + typedef struct MemoryInit_s { static const uint32_t event_type = ALLOY_RUNTIME_MEMORY_INIT; // map of memory, etc? } MemoryInit; - typedef struct { + typedef struct MemoryDeinit_s { static const uint32_t event_type = ALLOY_RUNTIME_MEMORY_DEINIT; } MemoryDeinit; - typedef struct { + typedef struct MemoryHeapInit_s { static const uint32_t event_type = ALLOY_RUNTIME_MEMORY_HEAP_INIT; uint32_t heap_id; uint64_t low_address; uint64_t high_address; uint32_t is_physical; } MemoryHeapInit; - typedef struct { + typedef struct MemoryHeapDeinit_s { static const uint32_t event_type = ALLOY_RUNTIME_MEMORY_HEAP_DEINIT; uint32_t heap_id; } MemoryHeapDeinit; - typedef struct { + typedef struct MemoryHeapAlloc_s { static const uint32_t event_type = ALLOY_RUNTIME_MEMORY_HEAP_ALLOC; uint32_t heap_id; uint32_t flags; uint64_t address; size_t size; } MemoryHeapAlloc; - typedef struct { + typedef struct MemoryHeapFree_s { static const uint32_t event_type = ALLOY_RUNTIME_MEMORY_HEAP_FREE; uint32_t heap_id; uint64_t address; diff --git a/src/alloy/tracing/event_type.h b/src/alloy/tracing/event_type.h index e51353708..33e2614fb 100644 --- a/src/alloy/tracing/event_type.h +++ b/src/alloy/tracing/event_type.h @@ -33,10 +33,10 @@ public: USER = (1 << 31), }; - typedef struct { + typedef struct TraceInit_s { static const uint32_t event_type = ALLOY_TRACE_INIT; } TraceInit; - typedef struct { + typedef struct TraceEOF_s { static const uint32_t event_type = ALLOY_TRACE_EOF; } TraceEOF; }; diff --git a/src/alloy/tracing/tracing.h b/src/alloy/tracing/tracing.h index b4eb2c865..ced2081de 100644 --- a/src/alloy/tracing/tracing.h +++ b/src/alloy/tracing/tracing.h @@ -30,7 +30,7 @@ Tracer* GetThreadTracer(); void WriteEvent(uint32_t event_type, size_t size = 0, const void* data = 0); -template void WriteEvent(T& ev) { +template void WriteEvent(const T& ev) { if (sizeof(T) > 1) { alloy::tracing::WriteEvent(T::event_type, sizeof(T), &ev); } else { diff --git a/src/xenia/kernel/fs/filesystem.cc b/src/xenia/kernel/fs/filesystem.cc index 6efa53de6..e83d409c8 100644 --- a/src/xenia/kernel/fs/filesystem.cc +++ 
b/src/xenia/kernel/fs/filesystem.cc @@ -70,7 +70,7 @@ int FileSystem::CreateSymbolicLink(const char* path, const char* target) { } int FileSystem::DeleteSymbolicLink(const char* path) { - std::tr1::unordered_map::iterator it = + std::unordered_map::iterator it = symlinks_.find(std::string(path)); if (it != symlinks_.end()) { symlinks_.erase(it); @@ -93,7 +93,7 @@ Entry* FileSystem::ResolvePath(const char* path) { // drive path -> device mappings with nothing nested. char full_path[XE_MAX_PATH]; XEIGNORE(xestrcpya(full_path, XECOUNT(full_path), path)); - for (std::tr1::unordered_map::iterator it = + for (std::unordered_map::iterator it = symlinks_.begin(); it != symlinks_.end(); ++it) { if (xestrcasestra(path, it->first.c_str()) == path) { // Found symlink, fixup. diff --git a/src/xenia/kernel/fs/filesystem.h b/src/xenia/kernel/fs/filesystem.h index acc6dbda6..94b9d787f 100644 --- a/src/xenia/kernel/fs/filesystem.h +++ b/src/xenia/kernel/fs/filesystem.h @@ -43,7 +43,7 @@ public: private: std::vector devices_; - std::tr1::unordered_map symlinks_; + std::unordered_map symlinks_; }; diff --git a/src/xenia/platform_includes.h b/src/xenia/platform_includes.h index e9d513316..6aaf86811 100644 --- a/src/xenia/platform_includes.h +++ b/src/xenia/platform_includes.h @@ -56,13 +56,8 @@ #include #include -#if XE_COMPILER_MSVC #include #include -#else -#include -#include -#endif // MSVC #endif // XENIA_PLATFORM_INCLUDES_H_ diff --git a/src/xenia/types.h b/src/xenia/types.h index 928c71766..42d6aa658 100644 --- a/src/xenia/types.h +++ b/src/xenia/types.h @@ -16,7 +16,7 @@ namespace xe { // TODO(benvanik): support other compilers/etc using std::auto_ptr; -using std::tr1::shared_ptr; +using std::shared_ptr; } // namespace xe diff --git a/xenia.gyp b/xenia.gyp index aea3ec75f..e59823058 100644 --- a/xenia.gyp +++ b/xenia.gyp @@ -96,7 +96,7 @@ 'SYMROOT': '<(DEPTH)/build/xenia/', 'ALWAYS_SEARCH_USER_PATHS': 'NO', 'ARCHS': ['x86_64'], - #'CLANG_CXX_LANGUAGE_STANDARD': 'c++0x', + 'CLANG_CXX_LANGUAGE_STANDARD': 'c++11', 'COMBINE_HIDPI_IMAGES': 'YES', 'GCC_C_LANGUAGE_STANDARD': 'gnu99', 'GCC_SYMBOLS_PRIVATE_EXTERN': 'YES', @@ -190,6 +190,22 @@ 'gflags', 'llvm', ], + + 'conditions': [ + ['OS == "mac"', { + 'xcode_settings': { + 'OTHER_CFLAGS': [ + '-fno-operator-names', + ], + }, + }], + ['OS == "linux"', { + 'cflags': [ + '-fno-operator-names', + ], + }], + ], + 'export_dependent_settings': [ 'beaengine', 'gflags', From c6cdf1f6726957d4353a66f1bfce021b0f5eebf1 Mon Sep 17 00:00:00 2001 From: Anthony Pesch Date: Tue, 13 May 2014 23:19:56 -0700 Subject: [PATCH 089/184] nest X64Function in its own block to avoid clang errors related to goto usage --- src/alloy/backend/x64/x64_assembler.cc | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/alloy/backend/x64/x64_assembler.cc b/src/alloy/backend/x64/x64_assembler.cc index 3f90b077b..5a7028e11 100644 --- a/src/alloy/backend/x64/x64_assembler.cc +++ b/src/alloy/backend/x64/x64_assembler.cc @@ -83,13 +83,15 @@ int X64Assembler::Assemble( string_buffer_.Reset(); } - X64Function* fn = new X64Function(symbol_info); - fn->set_debug_info(debug_info); - fn->Setup(machine_code, code_size); + { + X64Function* fn = new X64Function(symbol_info); + fn->set_debug_info(debug_info); + fn->Setup(machine_code, code_size); - *out_function = fn; + *out_function = fn; - result = 0; + result = 0; + } XECLEANUP: Reset(); From 0e6c47aac57678c4d9eb43ecc52746285faaf298 Mon Sep 17 00:00:00 2001 From: Anthony Pesch Date: Tue, 13 May 2014 23:46:42 -0700 Subject: 
[PATCH 090/184] use custom __m128 struct on non-win32 platforms to
 provide element-wise access

explicitly cast param for set_constant to correctly resolve overloaded
function
---
 src/alloy/backend/ivm/ivm_assembler.cc   |  2 +-
 src/alloy/backend/x64/lowering/tracers.h | 15 +++++++++++++++
 src/alloy/backend/x64/x64_emitter.cc     |  2 +-
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/src/alloy/backend/ivm/ivm_assembler.cc b/src/alloy/backend/ivm/ivm_assembler.cc
index bfd7600a7..a95237f2e 100644
--- a/src/alloy/backend/ivm/ivm_assembler.cc
+++ b/src/alloy/backend/ivm/ivm_assembler.cc
@@ -81,7 +81,7 @@ int IVMAssembler::Assemble(
     size_t type_size = GetTypeSize(slot->type);
     // Align to natural size.
     stack_offset = XEALIGN(stack_offset, type_size);
-    slot->set_constant(stack_offset);
+    slot->set_constant((uint32_t)stack_offset);
     stack_offset += type_size;
   }
   // Ensure 16b alignment.
diff --git a/src/alloy/backend/x64/lowering/tracers.h b/src/alloy/backend/x64/lowering/tracers.h
index e0536c7c5..7201b4f25 100644
--- a/src/alloy/backend/x64/lowering/tracers.h
+++ b/src/alloy/backend/x64/lowering/tracers.h
@@ -11,7 +11,22 @@
 #define ALLOY_BACKEND_X64_X64_LOWERING_TRACERS_H_
 
 #include
+
+#if XE_LIKE_WIN32
 #include
+#else
+typedef union __attribute__((aligned(16))) __m128 {
+  float m128_f32[4];
+  uint64_t m128_u64[2];
+  int8_t m128_i8[16];
+  int16_t m128_i16[8];
+  int32_t m128_i32[4];
+  int64_t m128_i64[2];
+  uint8_t m128_u8[16];
+  uint16_t m128_u16[8];
+  uint32_t m128_u32[4];
+} __m128;
+#endif
 
 namespace alloy {
 
diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc
index 3a9e6d142..80ed2cbca 100644
--- a/src/alloy/backend/x64/x64_emitter.cc
+++ b/src/alloy/backend/x64/x64_emitter.cc
@@ -117,7 +117,7 @@ int X64Emitter::Emit(HIRBuilder* builder, size_t& out_stack_size) {
     size_t type_size = GetTypeSize(slot->type);
     // Align to natural size.
     stack_offset = XEALIGN(stack_offset, type_size);
-    slot->set_constant(stack_offset);
+    slot->set_constant((uint32_t)stack_offset);
     stack_offset += type_size;
   }
   // Ensure 16b alignment.
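
The (uint32_t) casts in the patch above are an overload-resolution fix: on LP64 platforms size_t is unsigned long, a type distinct from both uint32_t and uint64_t, so a size_t argument converts equally well to either and clang/gcc reject the call as ambiguous, while on MSVC x64 size_t is exactly uint64_t and resolves silently. A minimal reproduction, with a hypothetical overload pair standing in for Value::set_constant (the real overload set is assumed):

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Hypothetical stand-ins for the fixed-width set_constant overloads.
void set_constant(uint32_t v) { std::printf("u32: %u\n", v); }
void set_constant(uint64_t v) {
  std::printf("u64: %llu\n", (unsigned long long)v);
}

int main() {
  std::size_t stack_offset = 48;
  // On LP64 (mac/linux) size_t is unsigned long, distinct from both
  // overload types, so the unqualified call is ambiguous there:
  //   set_constant(stack_offset);  // error under clang/gcc
  // The explicit cast gives an exact match on every platform:
  set_constant((uint32_t)stack_offset);
  return 0;
}
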
From 68e5833647091ea04f31ab7ea99bea3bb989cecd Mon Sep 17 00:00:00 2001
From: Anthony Pesch
Date: Wed, 14 May 2014 00:14:32 -0700
Subject: [PATCH 091/184] use getpagesize on non-win32 platforms

---
 src/alloy/memory.cc | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/alloy/memory.cc b/src/alloy/memory.cc
index 8b4eaa2e3..2933392dd 100644
--- a/src/alloy/memory.cc
+++ b/src/alloy/memory.cc
@@ -9,14 +9,22 @@
 
 #include
 
+#if !XE_LIKE_WIN32
+#include <unistd.h>
+#endif
+
 using namespace alloy;
 
 
 Memory::Memory() :
     membase_(0), reserve_address_(0) {
+#if XE_LIKE_WIN32
   SYSTEM_INFO si;
   GetSystemInfo(&si);
   system_page_size_ = si.dwPageSize;
+#else
+  system_page_size_ = getpagesize();
+#endif
 }
 
 Memory::~Memory() {

From f01609c5aa69cc24519f6b61686fb0bc22bae85a Mon Sep 17 00:00:00 2001
From: Anthony Pesch
Date: Wed, 14 May 2014 00:14:56 -0700
Subject: [PATCH 092/184] replace usage of itoa with snprintf

---
 src/alloy/compiler/passes/finalization_pass.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/alloy/compiler/passes/finalization_pass.cc b/src/alloy/compiler/passes/finalization_pass.cc
index 3fa3fc1b6..9a430ecc1 100644
--- a/src/alloy/compiler/passes/finalization_pass.cc
+++ b/src/alloy/compiler/passes/finalization_pass.cc
@@ -44,9 +44,9 @@ int FinalizationPass::Run(HIRBuilder* builder) {
     auto label = block->label_head;
     while (label) {
       if (!label->name) {
-        char* name = (char*)arena->Alloc(6 + 4 + 1);
-        xestrcpya(name, 6 + 1, "_label");
-        char* part = _itoa(label->id, name + 6, 10);
+        const size_t label_len = 6 + 4 + 1;
+        char* name = (char*)arena->Alloc(label_len);
+        xesnprintf(name, label_len, "_label%d", label->id);
         label->name = name;
       }
       label = label->next;

From 1a4355a36bfc9ef3281a1deedd498fbd5985caef Mon Sep 17 00:00:00 2001
From: Anthony Pesch
Date: Wed, 14 May 2014 00:33:00 -0700
Subject: [PATCH 093/184] implemented xerotl to replace win32 specific _rotl
 usage

---
 src/alloy/frontend/ppc/ppc_emit_altivec.cc | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/alloy/frontend/ppc/ppc_emit_altivec.cc b/src/alloy/frontend/ppc/ppc_emit_altivec.cc
index d5a77c400..1a985d1ae 100644
--- a/src/alloy/frontend/ppc/ppc_emit_altivec.cc
+++ b/src/alloy/frontend/ppc/ppc_emit_altivec.cc
@@ -105,6 +105,10 @@ Value* CalculateEA_0(PPCHIRBuilder& f, uint32_t ra, uint32_t rb);
 
 // }
 
+unsigned int xerotl(unsigned int value, unsigned int shift) {
+  XEASSERT(shift < 32);
+  return shift == 0 ? value : ((value << shift) | (value >> (32 - shift)));
+}
 
 XEEMITTER(dst, 0x7C0002AC, XDSS)(PPCHIRBuilder& f, InstrData& i) {
   XEINSTRNOTIMPLEMENTED();
@@ -1797,7 +1801,7 @@ XEEMITTER(vpkd3d128, VX128_4(6, 1552), VX128_4)(PPCHIRBuilder& f, InstrData
   // http://hlssmod.net/he_code/public/pixelwriter.h
   // control = prev:0123 | new:4567
   uint32_t control = 0x00010203; // original
-  uint32_t src = _rotl(0x04050607, shift * 8);
+  uint32_t src = xerotl(0x04050607, shift * 8);
   uint32_t mask = 0;
   switch (pack) {
     case 1: // VPACK_32

From 1bb0b08a142ea0fc1b1ba487f31a9c48afa23580 Mon Sep 17 00:00:00 2001
From: Ben Vanik
Date: Wed, 21 May 2014 10:38:41 -0700
Subject: [PATCH 094/184] Fixing WIN32 build.
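
A quick sanity check of the rotate helper from patch 093 above. rotl32 is a local copy of the same expression rather than the project's xerotl, so the example stays self-contained:

#include <cassert>
#include <cstdint>

// Same shape as xerotl: rotate a 32-bit value left. shift must be < 32, and
// shift == 0 is special-cased to avoid the undefined full-width right shift.
static uint32_t rotl32(uint32_t value, unsigned shift) {
  return shift == 0 ? value : ((value << shift) | (value >> (32 - shift)));
}

int main() {
  // vpkd3d128 rotates its byte-selector constant by whole bytes (shift * 8):
  assert(rotl32(0x04050607u, 8) == 0x05060704u);
  assert(rotl32(0x04050607u, 0) == 0x04050607u);
  return 0;
}
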
---
 src/alloy/compiler/passes/finalization_pass.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/alloy/compiler/passes/finalization_pass.cc b/src/alloy/compiler/passes/finalization_pass.cc
index 9a430ecc1..7f827da15 100644
--- a/src/alloy/compiler/passes/finalization_pass.cc
+++ b/src/alloy/compiler/passes/finalization_pass.cc
@@ -46,7 +46,7 @@ int FinalizationPass::Run(HIRBuilder* builder) {
       if (!label->name) {
         const size_t label_len = 6 + 4 + 1;
         char* name = (char*)arena->Alloc(label_len);
-        xesnprintf(name, label_len, "_label%d", label->id);
+        xesnprintfa(name, label_len, "_label%d", label->id);
         label->name = name;
       }
       label = label->next;

From a001714fb0ed2afd9980dc98f801563324d1007e Mon Sep 17 00:00:00 2001
From: Ben Vanik
Date: Wed, 21 May 2014 11:24:44 -0700
Subject: [PATCH 095/184] Adding lock to stdout logging. Disable with
 --fast_stdout. Fixes #78.

---
 src/xenia/logging.cc | 33 +++++++++++++++++++++++++++------
 1 file changed, 27 insertions(+), 6 deletions(-)

diff --git a/src/xenia/logging.cc b/src/xenia/logging.cc
index 99f3963e6..14b19aaf2 100644
--- a/src/xenia/logging.cc
+++ b/src/xenia/logging.cc
@@ -10,6 +10,18 @@
 #include
 #include
 
+#include
+
+#include
+
+
+DEFINE_bool(fast_stdout, false,
+    "Don't lock around stdout/stderr. May introduce weirdness.");
+
+
+namespace {
+xe_mutex_t* log_lock = xe_mutex_alloc();
+}  // namespace
 
 
 void xe_format_log_line(
@@ -54,15 +66,18 @@ void xe_log_line(const char* file_path, const uint32_t line_number,
       fmt, args);
   va_end(args);
 
-  fprintf(stderr, buffer);
-  fflush(stderr);
-
+  if (!FLAGS_fast_stdout) {
+    xe_mutex_lock(log_lock);
+  }
 #if 0// defined(OutputDebugString)
   OutputDebugStringA(buffer);
 #else
   XEIGNORE(fprintf(stdout, buffer));
   fflush(stdout);
 #endif // OutputDebugString
+  if (!FLAGS_fast_stdout) {
+    xe_mutex_unlock(log_lock);
+  }
 }
 
 void xe_handle_fatal(
@@ -76,12 +91,18 @@ void xe_handle_fatal(
       fmt, args);
   va_end(args);
 
+  if (!FLAGS_fast_stdout) {
+    xe_mutex_lock(log_lock);
+  }
 #if defined(OutputDebugString)
   OutputDebugStringA(buffer);
-#endif // OutputDebugString
-
-  fprintf(stderr, buffer);
+#else
+  XEIGNORE(fprintf(stderr, "%s", buffer));
   fflush(stderr);
+#endif // OutputDebugString
+  if (!FLAGS_fast_stdout) {
+    xe_mutex_unlock(log_lock);
+  }
 
 #if XE_LIKE_WIN32
   if (!xe_has_console()) {

From 5a85263e5f1089f496eeb86540c21ca106324a47 Mon Sep 17 00:00:00 2001
From: Ben Vanik
Date: Mon, 26 May 2014 20:28:21 -0700
Subject: [PATCH 096/184] Trying out a new style of JIT pattern matching.
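
The locking added in patch 095 above has this shape, shown in portable C++ for reference: std::mutex stands in for xe_mutex_t, a plain bool for FLAGS_fast_stdout, and fputs is used so the already-formatted buffer is never re-interpreted as a format string.

#include <cstdio>
#include <mutex>

static bool fast_stdout = false;  // stands in for FLAGS_fast_stdout
static std::mutex log_mutex;      // stands in for the xe_mutex_t in logging.cc

void log_line(const char* buffer) {
  std::unique_lock<std::mutex> lock(log_mutex, std::defer_lock);
  // Locking is optional: skipping it avoids contention at the cost of
  // letting concurrent writers interleave partial lines.
  if (!fast_stdout) {
    lock.lock();
  }
  std::fputs(buffer, stdout);  // never treats buffer as a format string
  std::fflush(stdout);
}  // unique_lock releases here if it was taken

int main() {
  log_line("hello from a locked writer\n");
  return 0;
}
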
--- src/alloy/backend/ivm/ivm_intcode.cc | 42 +- .../x64/lowering/lowering_sequences.cc | 3257 ------------ .../backend/x64/lowering/lowering_table.cc | 71 - .../backend/x64/lowering/lowering_table.h | 58 - src/alloy/backend/x64/lowering/op_utils.inl | 1063 ---- src/alloy/backend/x64/lowering/sources.gypi | 12 - src/alloy/backend/x64/sources.gypi | 9 +- src/alloy/backend/x64/x64_backend.cc | 12 +- src/alloy/backend/x64/x64_backend.h | 5 - src/alloy/backend/x64/x64_emitter.cc | 351 +- src/alloy/backend/x64/x64_emitter.h | 110 +- src/alloy/backend/x64/x64_sequence.inl | 714 +++ src/alloy/backend/x64/x64_sequences.cc | 4488 +++++++++++++++++ .../lowering_sequences.h => x64_sequences.h} | 20 +- .../{lowering/tracers.cc => x64_tracers.cc} | 22 +- .../x64/{lowering/tracers.h => x64_tracers.h} | 17 +- .../passes/constant_propagation_pass.cc | 7 + .../compiler/passes/context_promotion_pass.cc | 16 +- .../passes/control_flow_analysis_pass.cc | 6 - .../passes/data_flow_analysis_pass.cc | 2 - .../passes/register_allocation_pass.cc | 760 +-- .../passes/register_allocation_pass.h | 67 +- src/alloy/compiler/passes/validation_pass.cc | 14 +- src/alloy/core.h | 4 + src/alloy/frontend/ppc/ppc_emit_alu.cc | 54 +- src/alloy/frontend/ppc/ppc_hir_builder.cc | 18 +- src/alloy/hir/block.cc | 39 + src/alloy/hir/block.h | 2 + src/alloy/hir/hir_builder.cc | 32 +- src/alloy/hir/hir_builder.h | 3 +- src/alloy/hir/instr.cc | 13 - src/alloy/hir/instr.h | 25 - src/alloy/hir/opcodes.inl | 204 +- src/alloy/hir/sources.gypi | 1 + src/alloy/hir/value.cc | 20 + src/alloy/hir/value.h | 10 +- third_party/xbyak | 2 +- xenia.gyp | 13 + 38 files changed, 6403 insertions(+), 5160 deletions(-) delete mode 100644 src/alloy/backend/x64/lowering/lowering_sequences.cc delete mode 100644 src/alloy/backend/x64/lowering/lowering_table.cc delete mode 100644 src/alloy/backend/x64/lowering/lowering_table.h delete mode 100644 src/alloy/backend/x64/lowering/op_utils.inl delete mode 100644 src/alloy/backend/x64/lowering/sources.gypi create mode 100644 src/alloy/backend/x64/x64_sequence.inl create mode 100644 src/alloy/backend/x64/x64_sequences.cc rename src/alloy/backend/x64/{lowering/lowering_sequences.h => x64_sequences.h} (59%) rename src/alloy/backend/x64/{lowering/tracers.cc => x64_tracers.cc} (96%) rename src/alloy/backend/x64/{lowering/tracers.h => x64_tracers.h} (89%) create mode 100644 src/alloy/hir/block.cc diff --git a/src/alloy/backend/ivm/ivm_intcode.cc b/src/alloy/backend/ivm/ivm_intcode.cc index 211f466c7..6001cb15b 100644 --- a/src/alloy/backend/ivm/ivm_intcode.cc +++ b/src/alloy/backend/ivm/ivm_intcode.cc @@ -40,10 +40,10 @@ namespace ivm { #define DPRINT #define DFLUSH() -//#define IPRINT if (ics.thread_state->thread_id() == 1) printf -//#define IFLUSH() fflush(stdout) -//#define DPRINT if (ics.thread_state->thread_id() == 1) printf -//#define DFLUSH() fflush(stdout) +#define IPRINT if (ics.thread_state->thread_id() == 1) printf +#define IFLUSH() fflush(stdout) +#define DPRINT if (ics.thread_state->thread_id() == 1) printf +#define DFLUSH() fflush(stdout) #if XE_CPU_BIGENDIAN #define VECB16(v,n) (v.b16[n]) @@ -1364,31 +1364,31 @@ int Translate_LOAD_CLOCK(TranslationContext& ctx, Instr* i) { } uint32_t IntCode_LOAD_LOCAL_I8(IntCodeState& ics, const IntCode* i) { - ics.rf[i->dest_reg].i8 = *((int8_t*)(ics.locals + ics.rf[i->src1_reg].u64)); + ics.rf[i->dest_reg].i8 = *((int8_t*)(ics.locals + ics.rf[i->src1_reg].u32)); return IA_NEXT; } uint32_t IntCode_LOAD_LOCAL_I16(IntCodeState& ics, const IntCode* i) { - 
ics.rf[i->dest_reg].i16 = *((int16_t*)(ics.locals + ics.rf[i->src1_reg].u64)); + ics.rf[i->dest_reg].i16 = *((int16_t*)(ics.locals + ics.rf[i->src1_reg].u32)); return IA_NEXT; } uint32_t IntCode_LOAD_LOCAL_I32(IntCodeState& ics, const IntCode* i) { - ics.rf[i->dest_reg].i32 = *((int32_t*)(ics.locals + ics.rf[i->src1_reg].u64)); + ics.rf[i->dest_reg].i32 = *((int32_t*)(ics.locals + ics.rf[i->src1_reg].u32)); return IA_NEXT; } uint32_t IntCode_LOAD_LOCAL_I64(IntCodeState& ics, const IntCode* i) { - ics.rf[i->dest_reg].i64 = *((int64_t*)(ics.locals + ics.rf[i->src1_reg].u64)); + ics.rf[i->dest_reg].i64 = *((int64_t*)(ics.locals + ics.rf[i->src1_reg].u32)); return IA_NEXT; } uint32_t IntCode_LOAD_LOCAL_F32(IntCodeState& ics, const IntCode* i) { - ics.rf[i->dest_reg].f32 = *((float*)(ics.locals + ics.rf[i->src1_reg].u64)); + ics.rf[i->dest_reg].f32 = *((float*)(ics.locals + ics.rf[i->src1_reg].u32)); return IA_NEXT; } uint32_t IntCode_LOAD_LOCAL_F64(IntCodeState& ics, const IntCode* i) { - ics.rf[i->dest_reg].f64 = *((double*)(ics.locals + ics.rf[i->src1_reg].u64)); + ics.rf[i->dest_reg].f64 = *((double*)(ics.locals + ics.rf[i->src1_reg].u32)); return IA_NEXT; } uint32_t IntCode_LOAD_LOCAL_V128(IntCodeState& ics, const IntCode* i) { - ics.rf[i->dest_reg].v128 = *((vec128_t*)(ics.locals + ics.rf[i->src1_reg].u64)); + ics.rf[i->dest_reg].v128 = *((vec128_t*)(ics.locals + ics.rf[i->src1_reg].u32)); return IA_NEXT; } int Translate_LOAD_LOCAL(TranslationContext& ctx, Instr* i) { @@ -1405,31 +1405,31 @@ int Translate_LOAD_LOCAL(TranslationContext& ctx, Instr* i) { } uint32_t IntCode_STORE_LOCAL_I8(IntCodeState& ics, const IntCode* i) { - *((int8_t*)(ics.locals + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].i8; + *((int8_t*)(ics.locals + ics.rf[i->src1_reg].u32)) = ics.rf[i->src2_reg].i8; return IA_NEXT; } uint32_t IntCode_STORE_LOCAL_I16(IntCodeState& ics, const IntCode* i) { - *((int16_t*)(ics.locals + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].i16; + *((int16_t*)(ics.locals + ics.rf[i->src1_reg].u32)) = ics.rf[i->src2_reg].i16; return IA_NEXT; } uint32_t IntCode_STORE_LOCAL_I32(IntCodeState& ics, const IntCode* i) { - *((int32_t*)(ics.locals + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].i32; + *((int32_t*)(ics.locals + ics.rf[i->src1_reg].u32)) = ics.rf[i->src2_reg].i32; return IA_NEXT; } uint32_t IntCode_STORE_LOCAL_I64(IntCodeState& ics, const IntCode* i) { - *((int64_t*)(ics.locals + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].i64; + *((int64_t*)(ics.locals + ics.rf[i->src1_reg].u32)) = ics.rf[i->src2_reg].i64; return IA_NEXT; } uint32_t IntCode_STORE_LOCAL_F32(IntCodeState& ics, const IntCode* i) { - *((float*)(ics.locals + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].f32; + *((float*)(ics.locals + ics.rf[i->src1_reg].u32)) = ics.rf[i->src2_reg].f32; return IA_NEXT; } uint32_t IntCode_STORE_LOCAL_F64(IntCodeState& ics, const IntCode* i) { - *((double*)(ics.locals + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].f64; + *((double*)(ics.locals + ics.rf[i->src1_reg].u32)) = ics.rf[i->src2_reg].f64; return IA_NEXT; } uint32_t IntCode_STORE_LOCAL_V128(IntCodeState& ics, const IntCode* i) { - *((vec128_t*)(ics.locals + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].v128; + *((vec128_t*)(ics.locals + ics.rf[i->src1_reg].u32)) = ics.rf[i->src2_reg].v128; return IA_NEXT; } int Translate_STORE_LOCAL(TranslationContext& ctx, Instr* i) { @@ -3715,17 +3715,17 @@ int Translate_CNTLZ(TranslationContext& ctx, Instr* i) { uint32_t IntCode_EXTRACT_INT8_V128(IntCodeState& ics, const IntCode* 
i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; - ics.rf[i->dest_reg].i8 = VECB16(src1,ics.rf[i->src2_reg].i64); + ics.rf[i->dest_reg].i8 = VECB16(src1,ics.rf[i->src2_reg].i8); return IA_NEXT; } uint32_t IntCode_EXTRACT_INT16_V128(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; - ics.rf[i->dest_reg].i16 = VECS8(src1,ics.rf[i->src2_reg].i64); + ics.rf[i->dest_reg].i16 = VECS8(src1,ics.rf[i->src2_reg].i8); return IA_NEXT; } uint32_t IntCode_EXTRACT_INT32_V128(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; - ics.rf[i->dest_reg].i32 = VECI4(src1,ics.rf[i->src2_reg].i64); + ics.rf[i->dest_reg].i32 = VECI4(src1,ics.rf[i->src2_reg].i8); return IA_NEXT; } int Translate_EXTRACT(TranslationContext& ctx, Instr* i) { diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc deleted file mode 100644 index 5ab38f41f..000000000 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ /dev/null @@ -1,3257 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. * - ****************************************************************************** - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// TODO(benvanik): reimplement packing functions -#include - -using namespace alloy; -using namespace alloy::backend::x64; -using namespace alloy::backend::x64::lowering; -using namespace alloy::hir; -using namespace alloy::runtime; - -using namespace Xbyak; - -namespace { - -// Make loads/stores to ints check to see if they are doing a register value. -// This is slow, and with proper constant propagation we may be able to always -// avoid it. -// TODO(benvanik): make a compile time flag? -#define DYNAMIC_REGISTER_ACCESS_CHECK 1 - -#define UNIMPLEMENTED_SEQ() __debugbreak() -#define ASSERT_INVALID_TYPE() XEASSERTALWAYS() - -#define ITRACE 1 -#define DTRACE 1 - -#define SHUFPS_SWAP_DWORDS 0x1B - - -// Major templating foo lives in here. 
-#include - - -enum XmmConst { - XMMZero = 0, - XMMOne = 1, - XMMNegativeOne = 2, - XMMMaskX16Y16 = 3, - XMMFlipX16Y16 = 4, - XMMFixX16Y16 = 5, - XMMNormalizeX16Y16 = 6, - XMM3301 = 7, - XMMSignMaskPS = 8, - XMMSignMaskPD = 9, - XMMByteSwapMask = 10, - XMMPermuteControl15 = 11, - XMMUnpackD3DCOLOR = 12, - XMMOneOver255 = 13, -}; -static const vec128_t xmm_consts[] = { - /* XMMZero */ vec128f(0.0f, 0.0f, 0.0f, 0.0f), - /* XMMOne */ vec128f(1.0f, 1.0f, 1.0f, 1.0f), - /* XMMNegativeOne */ vec128f(-1.0f, -1.0f, -1.0f, -1.0f), - /* XMMMaskX16Y16 */ vec128i(0x0000FFFF, 0xFFFF0000, 0x00000000, 0x00000000), - /* XMMFlipX16Y16 */ vec128i(0x00008000, 0x00000000, 0x00000000, 0x00000000), - /* XMMFixX16Y16 */ vec128f(-32768.0f, 0.0f, 0.0f, 0.0f), - /* XMMNormalizeX16Y16 */ vec128f(1.0f / 32767.0f, 1.0f / (32767.0f * 65536.0f), 0.0f, 0.0f), - /* XMM3301 */ vec128f(3.0f, 3.0f, 0.0f, 1.0f), - /* XMMSignMaskPS */ vec128i(0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u), - /* XMMSignMaskPD */ vec128i(0x00000000u, 0x80000000u, 0x00000000u, 0x80000000u), - /* XMMByteSwapMask */ vec128i(0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu), - /* XMMPermuteControl15 */ vec128b(15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15), - /* XMMUnpackD3DCOLOR */ vec128i(0xFFFFFF02, 0xFFFFFF01, 0xFFFFFF00, 0xFFFFFF02), - /* XMMOneOver255 */ vec128f(1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f), -}; -// Use consts by first loading the base register then accessing memory: -// e.mov(e.rax, XMMCONSTBASE) -// e.andps(reg, XMMCONST(XMM3303)) -// TODO(benvanik): find a way to do this without the base register. -#define XMMCONSTBASE (uint64_t)&xmm_consts[0] -#define XMMCONST(base_reg, name) e.ptr[base_reg + name * 16] - -static vec128_t lvsl_table[17] = { - vec128b( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), - vec128b( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), - vec128b( 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17), - vec128b( 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18), - vec128b( 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19), - vec128b( 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), - vec128b( 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21), - vec128b( 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22), - vec128b( 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23), - vec128b( 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24), - vec128b(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25), - vec128b(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), - vec128b(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27), - vec128b(13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28), - vec128b(14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29), - vec128b(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30), - vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), -}; -static vec128_t lvsr_table[17] = { - vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), - vec128b(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30), - vec128b(14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29), - vec128b(13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28), - vec128b(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27), - vec128b(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), - vec128b(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25), 
- vec128b( 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24), - vec128b( 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23), - vec128b( 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22), - vec128b( 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21), - vec128b( 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), - vec128b( 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19), - vec128b( 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18), - vec128b( 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17), - vec128b( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), - vec128b( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), -}; -static vec128_t extract_table_32[4] = { - vec128b( 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), - vec128b( 7, 6, 5, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), - vec128b(11, 10, 9, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), - vec128b(15, 14, 13, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), -}; - -// A note about vectors: -// Alloy represents vectors as xyzw pairs, with indices 0123. -// XMM registers are xyzw pairs with indices 3210, making them more like wzyx. -// This makes things somewhat confusing. It'd be nice to just shuffle the -// registers around on load/store, however certain operations require that -// data be in the right offset. -// Basically, this identity must hold: -// shuffle(vec, b00011011) -> {x,y,z,w} => {x,y,z,w} -// All indices and operations must respect that. -// -// Memory (big endian): -// [00 01 02 03] [04 05 06 07] [08 09 0A 0B] [0C 0D 0E 0F] (x, y, z, w) -// load into xmm register: -// [0F 0E 0D 0C] [0B 0A 09 08] [07 06 05 04] [03 02 01 00] (w, z, y, x) - -void Dummy() { - // -} - -void UndefinedCallExtern(void* raw_context, FunctionInfo* symbol_info) { - XELOGW("undefined extern call to %.8X %s", - symbol_info->address(), - symbol_info->name()); -} - -uint64_t DynamicRegisterLoad(void* raw_context, uint32_t address) { - auto thread_state = *((ThreadState**)raw_context); - auto cbs = thread_state->runtime()->access_callbacks(); - while (cbs) { - if (cbs->handles(cbs->context, address)) { - return cbs->read(cbs->context, address); - } - } - return 0; -} - -void DynamicRegisterStore(void* raw_context, uint32_t address, uint64_t value) { - auto thread_state = *((ThreadState**)raw_context); - auto cbs = thread_state->runtime()->access_callbacks(); - while (cbs) { - if (cbs->handles(cbs->context, address)) { - cbs->write(cbs->context, address, value); - return; - } - } -} - -void Unpack_FLOAT16_2(void* raw_context, __m128& v) { - uint32_t src = v.m128_i32[3]; - v.m128_f32[0] = DirectX::PackedVector::XMConvertHalfToFloat((uint16_t)src); - v.m128_f32[1] = DirectX::PackedVector::XMConvertHalfToFloat((uint16_t)(src >> 16)); - v.m128_f32[2] = 0.0f; - v.m128_f32[3] = 1.0f; -} - -uint64_t LoadClock(void* raw_context) { - LARGE_INTEGER counter; - uint64_t time = 0; - if (QueryPerformanceCounter(&counter)) { - time = counter.QuadPart; - } - return time; -} - -// TODO(benvanik): fancy stuff. -void* ResolveFunctionSymbol(void* raw_context, FunctionInfo* symbol_info) { - // TODO(benvanik): generate this thunk at runtime? or a shim? 
- auto thread_state = *((ThreadState**)raw_context); - - Function* fn = NULL; - thread_state->runtime()->ResolveFunction(symbol_info->address(), &fn); - XEASSERTNOTNULL(fn); - auto x64_fn = (X64Function*)fn; - return x64_fn->machine_code(); -} -void* ResolveFunctionAddress(void* raw_context, uint32_t target_address) { - // TODO(benvanik): generate this thunk at runtime? or a shim? - auto thread_state = *((ThreadState**)raw_context); - - Function* fn = NULL; - thread_state->runtime()->ResolveFunction(target_address, &fn); - XEASSERTNOTNULL(fn); - auto x64_fn = (X64Function*)fn; - return x64_fn->machine_code(); -} -void TransitionToHost(X64Emitter& e) { - // Expects: - // rcx = context - // rdx = target host function - // r8 = arg0 - // r9 = arg1 - // Returns: - // rax = host return - auto thunk = e.backend()->guest_to_host_thunk(); - e.mov(e.rax, (uint64_t)thunk); - e.call(e.rax); -} -void IssueCall(X64Emitter& e, FunctionInfo* symbol_info, uint32_t flags) { - auto fn = (X64Function*)symbol_info->function(); - // Resolve address to the function to call and store in rax. - // TODO(benvanik): caching/etc. For now this makes debugging easier. - if (fn) { - e.mov(e.rax, (uint64_t)fn->machine_code()); - } else { - e.mov(e.rdx, (uint64_t)symbol_info); - CallNative(e, ResolveFunctionSymbol); - } - - // Actually jump/call to rax. - if (flags & CALL_TAIL) { - // Pass the callers return address over. - e.mov(e.rdx, e.qword[e.rsp + StackLayout::GUEST_RET_ADDR]); - - e.add(e.rsp, (uint32_t)e.stack_size()); - e.jmp(e.rax); - } else { - // Return address is from the previous SET_RETURN_ADDRESS. - e.mov(e.rdx, e.qword[e.rsp + StackLayout::GUEST_CALL_RET_ADDR]); - - e.call(e.rax); - } -} -void IssueCallIndirect(X64Emitter& e, Value* target, uint32_t flags) { - Reg64 r; - e.BeginOp(target, r, 0); - - // Check if return. - if (flags & CALL_POSSIBLE_RETURN) { - e.cmp(r.cvt32(), e.dword[e.rsp + StackLayout::GUEST_RET_ADDR]); - e.je("epilog", CodeGenerator::T_NEAR); - } - - // Resolve address to the function to call and store in rax. - // TODO(benvanik): caching/etc. For now this makes debugging easier. - if (r != e.rdx) { - e.mov(e.rdx, r); - } - e.EndOp(r); - CallNative(e, ResolveFunctionAddress); - - // Actually jump/call to rax. - if (flags & CALL_TAIL) { - // Pass the callers return address over. - e.mov(e.rdx, e.qword[e.rsp + StackLayout::GUEST_RET_ADDR]); - - e.add(e.rsp, (uint32_t)e.stack_size()); - e.jmp(e.rax); - } else { - // Return address is from the previous SET_RETURN_ADDRESS. - e.mov(e.rdx, e.qword[e.rsp + StackLayout::GUEST_CALL_RET_ADDR]); - - e.call(e.rax); - } -} - -} // namespace - - -void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { -// -------------------------------------------------------------------------- -// General -// -------------------------------------------------------------------------- - -table->AddSequence(OPCODE_COMMENT, [](X64Emitter& e, Instr*& i) { -#if ITRACE - // TODO(benvanik): pass through. - // TODO(benvanik): don't just leak this memory. - auto str = (const char*)i->src1.offset; - auto str_copy = xestrdupa(str); - e.mov(e.rdx, (uint64_t)str_copy); - CallNative(e, TraceString); -#endif // ITRACE - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_NOP, [](X64Emitter& e, Instr*& i) { - // If we got this, chances are we want it. 
- e.nop(); - i = e.Advance(i); - return true; -}); - -// -------------------------------------------------------------------------- -// Debugging -// -------------------------------------------------------------------------- - -table->AddSequence(OPCODE_SOURCE_OFFSET, [](X64Emitter& e, Instr*& i) { -#if XE_DEBUG - e.nop(); - e.nop(); - e.mov(e.eax, (uint32_t)i->src1.offset); - e.nop(); - e.nop(); -#endif // XE_DEBUG - - e.MarkSourceOffset(i); - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_DEBUG_BREAK, [](X64Emitter& e, Instr*& i) { - // TODO(benvanik): insert a call to the debug break function to let the - // debugger know. - e.db(0xCC); - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_DEBUG_BREAK_TRUE, [](X64Emitter& e, Instr*& i) { - e.inLocalLabel(); - CheckBoolean(e, i->src1.value); - e.jz(".x", e.T_SHORT); - // TODO(benvanik): insert a call to the debug break function to let the - // debugger know. - e.db(0xCC); - e.L(".x"); - e.outLocalLabel(); - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_TRAP, [](X64Emitter& e, Instr*& i) { - // TODO(benvanik): insert a call to the trap function to let the - // debugger know. - e.db(0xCC); - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_TRAP_TRUE, [](X64Emitter& e, Instr*& i) { - e.inLocalLabel(); - CheckBoolean(e, i->src1.value); - e.jz(".x", e.T_SHORT); - // TODO(benvanik): insert a call to the trap function to let the - // debugger know. - e.db(0xCC); - e.L(".x"); - e.outLocalLabel(); - i = e.Advance(i); - return true; -}); - -// -------------------------------------------------------------------------- -// Calls -// -------------------------------------------------------------------------- - -table->AddSequence(OPCODE_CALL, [](X64Emitter& e, Instr*& i) { - IssueCall(e, i->src1.symbol_info, i->flags); - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_CALL_TRUE, [](X64Emitter& e, Instr*& i) { - e.inLocalLabel(); - CheckBoolean(e, i->src1.value); - e.jz(".x", e.T_SHORT); - IssueCall(e, i->src2.symbol_info, i->flags); - e.L(".x"); - e.outLocalLabel(); - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_CALL_INDIRECT, [](X64Emitter& e, Instr*& i) { - IssueCallIndirect(e, i->src1.value, i->flags); - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_CALL_INDIRECT_TRUE, [](X64Emitter& e, Instr*& i) { - e.inLocalLabel(); - CheckBoolean(e, i->src1.value); - e.jz(".x", e.T_SHORT); - IssueCallIndirect(e, i->src2.value, i->flags); - e.L(".x"); - e.outLocalLabel(); - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_CALL_EXTERN, [](X64Emitter& e, Instr*& i) { - auto symbol_info = i->src1.symbol_info; - XEASSERT(symbol_info->behavior() == FunctionInfo::BEHAVIOR_EXTERN); - if (!symbol_info->extern_handler()) { - e.mov(e.rdx, (uint64_t)symbol_info); - CallNative(e, UndefinedCallExtern); - } else { - // rdx = target host function - // r8 = arg0 - // r9 = arg1 - e.mov(e.rdx, (uint64_t)symbol_info->extern_handler()); - e.mov(e.r8, (uint64_t)symbol_info->extern_arg0()); - e.mov(e.r9, (uint64_t)symbol_info->extern_arg1()); - TransitionToHost(e); - ReloadRDX(e); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_RETURN, [](X64Emitter& e, Instr*& i) { - // If this is the last instruction in the last block, just let us - // fall through. 
- if (i->next || i->block->next) { - e.jmp("epilog", CodeGenerator::T_NEAR); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_RETURN_TRUE, [](X64Emitter& e, Instr*& i) { - CheckBoolean(e, i->src1.value); - e.jnz("epilog", CodeGenerator::T_NEAR); - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_SET_RETURN_ADDRESS, [](X64Emitter& e, Instr*& i) { - XEASSERT(i->src1.value->IsConstant()); - e.mov(e.qword[e.rsp + StackLayout::GUEST_CALL_RET_ADDR], - i->src1.value->AsUint64()); - i = e.Advance(i); - return true; -}); - -// -------------------------------------------------------------------------- -// Branches -// -------------------------------------------------------------------------- - -table->AddSequence(OPCODE_BRANCH, [](X64Emitter& e, Instr*& i) { - auto target = i->src1.label; - e.jmp(target->name, e.T_NEAR); - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_BRANCH_TRUE, [](X64Emitter& e, Instr*& i) { - CheckBoolean(e, i->src1.value); - auto target = i->src2.label; - e.jnz(target->name, e.T_NEAR); - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_BRANCH_FALSE, [](X64Emitter& e, Instr*& i) { - CheckBoolean(e, i->src1.value); - auto target = i->src2.label; - e.jz(target->name, e.T_NEAR); - i = e.Advance(i); - return true; -}); - -// -------------------------------------------------------------------------- -// Types -// -------------------------------------------------------------------------- - -table->AddSequence(OPCODE_ASSIGN, [](X64Emitter& e, Instr*& i) { - if (IsIntType(i->dest->type)) { - IntUnaryOp( - e, i, - [](X64Emitter& e, Instr& i, const Reg& dest_src) { - // nop - the mov will have happened. - }); - } else if (IsFloatType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); - } else if (IsVecType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_CAST, [](X64Emitter& e, Instr*& i) { - if (i->dest->type == INT32_TYPE) { - if (i->src1.value->type == FLOAT32_TYPE) { - Reg32 dest; - Xmm src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.vmovd(dest, src); - e.EndOp(dest, src); - } else { - UNIMPLEMENTED_SEQ(); - } - } else if (i->dest->type == INT64_TYPE) { - if (i->src1.value->type == FLOAT64_TYPE) { - Reg64 dest; - Xmm src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.vmovq(dest, src); - e.EndOp(dest, src); - } else { - UNIMPLEMENTED_SEQ(); - } - } else if (i->dest->type == FLOAT32_TYPE) { - if (i->src1.value->type == INT32_TYPE) { - Xmm dest; - Reg32 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.vmovd(dest, src); - e.EndOp(dest, src); - } else { - UNIMPLEMENTED_SEQ(); - } - } else if (i->dest->type == FLOAT64_TYPE) { - if (i->src1.value->type == INT64_TYPE) { - Xmm dest; - Reg64 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.vmovq(dest, src); - e.EndOp(dest, src); - } else { - UNIMPLEMENTED_SEQ(); - } - } else if (IsVecType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_ZERO_EXTEND, [](X64Emitter& e, Instr*& i) { - if (i->Match(SIG_TYPE_I16, SIG_TYPE_I8)) { - Reg16 dest; - Reg8 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movzx(dest, src); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I8)) { - Reg32 dest; - Reg8 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, 
src, 0); - e.movzx(dest, src); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I16)) { - Reg32 dest; - Reg16 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movzx(dest, src); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I8)) { - Reg64 dest; - Reg8 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movzx(dest, src); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I16)) { - Reg64 dest; - Reg16 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movzx(dest, src); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I32)) { - Reg64 dest; - Reg32 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.mov(dest.cvt32(), src.cvt32()); - e.EndOp(dest, src); - } else { - UNIMPLEMENTED_SEQ(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_SIGN_EXTEND, [](X64Emitter& e, Instr*& i) { - if (i->Match(SIG_TYPE_I16, SIG_TYPE_I8)) { - Reg16 dest; - Reg8 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movsx(dest, src); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I8)) { - Reg32 dest; - Reg8 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movsx(dest, src); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I16)) { - Reg32 dest; - Reg16 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movsx(dest, src); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I8)) { - Reg64 dest; - Reg8 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movsx(dest, src); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I16)) { - Reg64 dest; - Reg16 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movsx(dest, src); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I32)) { - Reg64 dest; - Reg32 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movsxd(dest, src); - e.EndOp(dest, src); - } else { - UNIMPLEMENTED_SEQ(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_TRUNCATE, [](X64Emitter& e, Instr*& i) { - if (i->Match(SIG_TYPE_I8, SIG_TYPE_I16)) { - Reg8 dest; - Reg16 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.mov(dest, src.cvt8()); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I8, SIG_TYPE_I32)) { - Reg8 dest; - Reg16 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.mov(dest, src.cvt8()); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I8, SIG_TYPE_I64)) { - Reg8 dest; - Reg64 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.mov(dest, src.cvt8()); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I32)) { - Reg16 dest; - Reg32 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.mov(dest, src.cvt16()); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I64)) { - Reg16 dest; - Reg64 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.mov(dest, src.cvt16()); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I64)) { - Reg32 dest; - Reg64 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.mov(dest, src.cvt32()); - e.EndOp(dest, src); - } else { - UNIMPLEMENTED_SEQ(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_CONVERT, [](X64Emitter& e, Instr*& i) { - if (i->Match(SIG_TYPE_I32, SIG_TYPE_F32)) 
{ - Reg32 dest; - Xmm src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - // TODO(benvanik): additional checks for saturation/etc? cvtt* (trunc?) - e.cvttss2si(dest, src); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_F64)) { - Reg32 dest; - Xmm src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - // TODO(benvanik): additional checks for saturation/etc? cvtt* (trunc?) - e.cvtsd2ss(e.xmm0, src); - e.cvttss2si(dest, e.xmm0); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_F64)) { - Reg64 dest; - Xmm src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - // TODO(benvanik): additional checks for saturation/etc? cvtt* (trunc?) - e.cvttsd2si(dest, src); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_F32, SIG_TYPE_I32)) { - Xmm dest; - Reg32 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - // TODO(benvanik): additional checks for saturation/etc? - e.cvtsi2ss(dest, src); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_F32, SIG_TYPE_F64)) { - Xmm dest, src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - // TODO(benvanik): additional checks for saturation/etc? - e.cvtsd2ss(dest, src); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_F64, SIG_TYPE_I64)) { - Xmm dest; - Reg64 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - // TODO(benvanik): additional checks for saturation/etc? - e.cvtsi2sd(dest, src); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_F64, SIG_TYPE_F32)) { - Xmm dest, src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.cvtss2sd(dest, src); - e.EndOp(dest, src); - } else { - UNIMPLEMENTED_SEQ(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_ROUND, [](X64Emitter& e, Instr*& i) { - // flags = ROUND_TO_* - if (IsFloatType(i->dest->type)) { - XmmUnaryOp(e, i, 0, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) { - if (i.src1.value->type == FLOAT32_TYPE) { - switch (i.flags) { - case ROUND_TO_ZERO: - e.roundss(dest, src, B00000011); - break; - case ROUND_TO_NEAREST: - e.roundss(dest, src, B00000000); - break; - case ROUND_TO_MINUS_INFINITY: - e.roundss(dest, src, B00000001); - break; - case ROUND_TO_POSITIVE_INFINITY: - e.roundss(dest, src, B00000010); - break; - } - } else { - switch (i.flags) { - case ROUND_TO_ZERO: - e.roundsd(dest, src, B00000011); - break; - case ROUND_TO_NEAREST: - e.roundsd(dest, src, B00000000); - break; - case ROUND_TO_MINUS_INFINITY: - e.roundsd(dest, src, B00000001); - break; - case ROUND_TO_POSITIVE_INFINITY: - e.roundsd(dest, src, B00000010); - break; - } - } - }); - } else if (IsVecType(i->dest->type)) { - XmmUnaryOp(e, i, 0, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) { - switch (i.flags) { - case ROUND_TO_ZERO: - e.roundps(dest, src, B00000011); - break; - case ROUND_TO_NEAREST: - e.roundps(dest, src, B00000000); - break; - case ROUND_TO_MINUS_INFINITY: - e.roundps(dest, src, B00000001); - break; - case ROUND_TO_POSITIVE_INFINITY: - e.roundps(dest, src, B00000010); - break; - } - }); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_VECTOR_CONVERT_I2F, [](X64Emitter& e, Instr*& i) { - // flags = ARITHMETIC_UNSIGNED - XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) { - // TODO(benvanik): are these really the same? VC++ thinks so. 
- if (i.flags & ARITHMETIC_UNSIGNED) { - e.cvtdq2ps(dest, src); - } else { - e.cvtdq2ps(dest, src); - } - }); - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_VECTOR_CONVERT_F2I, [](X64Emitter& e, Instr*& i) { - // flags = ARITHMETIC_SATURATE | ARITHMETIC_UNSIGNED - XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) { - // TODO(benvanik): are these really the same? VC++ thinks so. - if (i.flags & ARITHMETIC_UNSIGNED) { - e.cvttps2dq(dest, src); - } else { - e.cvttps2dq(dest, src); - } - if (i.flags & ARITHMETIC_SATURATE) { - UNIMPLEMENTED_SEQ(); - } - }); - i = e.Advance(i); - return true; -}); - -// -------------------------------------------------------------------------- -// Constants -// -------------------------------------------------------------------------- - -// specials for zeroing/etc (xor/etc) - -table->AddSequence(OPCODE_LOAD_VECTOR_SHL, [](X64Emitter& e, Instr*& i) { - XEASSERT(i->dest->type == VEC128_TYPE); - if (i->src1.value->IsConstant()) { - Xmm dest; - e.BeginOp(i->dest, dest, REG_DEST); - auto sh = MIN(16, i->src1.value->AsUint32()); - e.mov(e.rax, (uintptr_t)&lvsl_table[sh]); - e.movaps(dest, e.ptr[e.rax]); - e.EndOp(dest); - } else { - Xmm dest; - Reg8 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - // TODO(benvanik): probably a way to do this with addressing. - e.mov(TEMP_REG, 16); - e.movzx(e.rax, src); - e.cmp(src, 16); - e.cmovb(TEMP_REG, e.rax); - e.shl(TEMP_REG, 4); - e.mov(e.rax, (uintptr_t)lvsl_table); - e.movaps(dest, e.ptr[e.rax + TEMP_REG]); - e.EndOp(dest, src); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_LOAD_VECTOR_SHR, [](X64Emitter& e, Instr*& i) { - XEASSERT(i->dest->type == VEC128_TYPE); - if (i->src1.value->IsConstant()) { - Xmm dest; - e.BeginOp(i->dest, dest, REG_DEST); - auto sh = MIN(16, i->src1.value->AsUint32()); - e.mov(e.rax, (uintptr_t)&lvsr_table[sh]); - e.movaps(dest, e.ptr[e.rax]); - e.EndOp(dest); - } else { - Xmm dest; - Reg8 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - // TODO(benvanik): probably a way to do this with addressing. - e.mov(TEMP_REG, 16); - e.movzx(e.rax, src); - e.cmp(src, 16); - e.cmovb(TEMP_REG, e.rax); - e.shl(TEMP_REG, 4); - e.mov(e.rax, (uintptr_t)lvsr_table); - e.movaps(dest, e.ptr[e.rax + TEMP_REG]); - e.EndOp(dest, src); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_LOAD_CLOCK, [](X64Emitter& e, Instr*& i) { - // It'd be cool to call QueryPerformanceCounter directly, but w/e. 
- CallNative(e, LoadClock); - Reg64 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.rax); - e.EndOp(dest); - i = e.Advance(i); - return true; -}); - -// -------------------------------------------------------------------------- -// Stack Locals -// -------------------------------------------------------------------------- - -table->AddSequence(OPCODE_LOAD_LOCAL, [](X64Emitter& e, Instr*& i) { - auto addr = e.rsp + i->src1.value->AsUint32(); - if (i->Match(SIG_TYPE_I8, SIG_TYPE_IGNORE)) { - Reg8 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.byte[addr]); - e.EndOp(dest); - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_IGNORE)) { - Reg16 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.word[addr]); - e.EndOp(dest); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_IGNORE)) { - Reg32 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.dword[addr]); - e.EndOp(dest); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_IGNORE)) { - Reg64 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.qword[addr]); - e.EndOp(dest); - } else if (i->Match(SIG_TYPE_F32, SIG_TYPE_IGNORE)) { - Xmm dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.movss(dest, e.dword[addr]); - e.EndOp(dest); - } else if (i->Match(SIG_TYPE_F64, SIG_TYPE_IGNORE)) { - Xmm dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.movsd(dest, e.qword[addr]); - e.EndOp(dest); - } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_IGNORE)) { - Xmm dest; - e.BeginOp(i->dest, dest, REG_DEST); - // NOTE: we always know we are aligned. - e.movaps(dest, e.ptr[addr]); - e.EndOp(dest); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_STORE_LOCAL, [](X64Emitter& e, Instr*& i) { - auto addr = e.rsp + i->src1.value->AsUint32(); - if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I8)) { - Reg8 src; - e.BeginOp(i->src2.value, src, 0); - e.mov(e.byte[addr], src); - e.EndOp(src); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I8C)) { - e.mov(e.byte[addr], i->src2.value->constant.i8); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16)) { - Reg16 src; - e.BeginOp(i->src2.value, src, 0); - e.mov(e.word[addr], src); - e.EndOp(src); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16C)) { - e.mov(e.word[addr], i->src2.value->constant.i16); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32)) { - Reg32 src; - e.BeginOp(i->src2.value, src, 0); - e.mov(e.dword[addr], src); - e.EndOp(src); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32C)) { - e.mov(e.dword[addr], i->src2.value->constant.i32); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64)) { - Reg64 src; - e.BeginOp(i->src2.value, src, 0); - e.mov(e.qword[addr], src); - e.EndOp(src); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64C)) { - MovMem64(e, addr, i->src2.value->constant.i64); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32)) { - Xmm src; - e.BeginOp(i->src2.value, src, 0); - e.movss(e.dword[addr], src); - e.EndOp(src); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32C)) { - e.mov(e.dword[addr], i->src2.value->constant.i32); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64)) { - Xmm src; - e.BeginOp(i->src2.value, src, 0); - e.movsd(e.qword[addr], src); - e.EndOp(src); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64C)) { - MovMem64(e, addr, i->src2.value->constant.i64); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128)) { - Xmm src; - 
e.BeginOp(i->src2.value, src, 0); - // NOTE: we always know we are aligned. - e.movaps(e.ptr[addr], src); - e.EndOp(src); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128C)) { - // TODO(benvanik): check zero - // TODO(benvanik): correct order? - MovMem64(e, addr, i->src2.value->constant.v128.low); - MovMem64(e, addr + 8, i->src2.value->constant.v128.high); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -// -------------------------------------------------------------------------- -// Context -// -------------------------------------------------------------------------- - -table->AddSequence(OPCODE_LOAD_CONTEXT, [](X64Emitter& e, Instr*& i) { - auto addr = e.rcx + i->src1.offset; - if (i->Match(SIG_TYPE_I8, SIG_TYPE_IGNORE)) { - Reg8 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.byte[addr]); - e.EndOp(dest); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.mov(e.r8b, dest); - CallNative(e, TraceContextLoadI8); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_IGNORE)) { - Reg16 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.word[addr]); - e.EndOp(dest); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.mov(e.r8w, dest); - CallNative(e, TraceContextLoadI16); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_IGNORE)) { - Reg32 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.dword[addr]); - e.EndOp(dest); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.mov(e.r8d, dest); - CallNative(e, TraceContextLoadI32); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_IGNORE)) { - Reg64 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.qword[addr]); - e.EndOp(dest); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.mov(e.r8, dest); - CallNative(e, TraceContextLoadI64); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_F32, SIG_TYPE_IGNORE)) { - Xmm dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.movss(dest, e.dword[addr]); - e.EndOp(dest); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.lea(e.r8, Stash(e, dest)); - CallNative(e, TraceContextLoadF32); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_F64, SIG_TYPE_IGNORE)) { - Xmm dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.movsd(dest, e.qword[addr]); - e.EndOp(dest); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.lea(e.r8, Stash(e, dest)); - CallNative(e, TraceContextLoadF64); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_IGNORE)) { - Xmm dest; - e.BeginOp(i->dest, dest, REG_DEST); - // NOTE: we always know we are aligned. 
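// (movaps faults on a misaligned address, so this leans on the context
// layout keeping vec128 members 16-byte aligned. If that invariant were
// ever in doubt, a cheap translate-time guard could be, e.g.:
//   XEASSERT((i->src1.offset & 0xF) == 0);  // hypothetical check
// rather than downgrading to movups.)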
- e.movaps(dest, e.ptr[addr]); - e.EndOp(dest); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.lea(e.r8, Stash(e, dest)); - CallNative(e, TraceContextLoadV128); -#endif // DTRACE - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) { - auto addr = e.rcx + i->src1.offset; - if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I8)) { - Reg8 src; - e.BeginOp(i->src2.value, src, 0); - e.mov(e.byte[addr], src); - e.EndOp(src); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.mov(e.r8b, src); - CallNative(e, TraceContextStoreI8); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I8C)) { - e.mov(e.byte[addr], i->src2.value->constant.i8); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.mov(e.r8b, i->src2.value->constant.i8); - CallNative(e, TraceContextStoreI8); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16)) { - Reg16 src; - e.BeginOp(i->src2.value, src, 0); - e.mov(e.word[addr], src); - e.EndOp(src); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.mov(e.r8w, src); - CallNative(e, TraceContextStoreI16); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16C)) { - e.mov(e.word[addr], i->src2.value->constant.i16); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.mov(e.r8w, i->src2.value->constant.i16); - CallNative(e, TraceContextStoreI16); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32)) { - Reg32 src; - e.BeginOp(i->src2.value, src, 0); - e.mov(e.dword[addr], src); - e.EndOp(src); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.mov(e.r8d, src); - CallNative(e, TraceContextStoreI32); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32C)) { - e.mov(e.dword[addr], i->src2.value->constant.i32); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.mov(e.r8d, i->src2.value->constant.i32); - CallNative(e, TraceContextStoreI32); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64)) { - Reg64 src; - e.BeginOp(i->src2.value, src, 0); - e.mov(e.qword[addr], src); - e.EndOp(src); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.mov(e.r8, src); - CallNative(e, TraceContextStoreI64); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64C)) { - MovMem64(e, addr, i->src2.value->constant.i64); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.mov(e.r8, i->src2.value->constant.i64); - CallNative(e, TraceContextStoreI64); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32)) { - Xmm src; - e.BeginOp(i->src2.value, src, 0); - e.movss(e.dword[addr], src); - e.EndOp(src); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.lea(e.r8, Stash(e, src)); - CallNative(e, TraceContextStoreF32); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32C)) { - e.mov(e.dword[addr], i->src2.value->constant.i32); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.mov(e.eax, i->src2.value->constant.i32); - e.vmovd(e.xmm0, e.eax); - e.lea(e.r8, Stash(e, e.xmm0)); - CallNative(e, TraceContextStoreF32); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64)) { - Xmm src; - e.BeginOp(i->src2.value, src, 0); - e.movsd(e.qword[addr], src); - e.EndOp(src); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.lea(e.r8, Stash(e, src)); - CallNative(e, TraceContextStoreF64); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64C)) { - MovMem64(e, 
addr, i->src2.value->constant.i64); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.mov(e.rax, i->src2.value->constant.i64); - e.vmovq(e.xmm0, e.rax); - e.lea(e.r8, Stash(e, e.xmm0)); - CallNative(e, TraceContextStoreF64); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128)) { - Xmm src; - e.BeginOp(i->src2.value, src, 0); - // NOTE: we always know we are aligned. - e.movaps(e.ptr[addr], src); - e.EndOp(src); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.lea(e.r8, Stash(e, src)); - CallNative(e, TraceContextStoreV128); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128C)) { - // TODO(benvanik): check zero - // TODO(benvanik): correct order? - MovMem64(e, addr, i->src2.value->constant.v128.low); - MovMem64(e, addr + 8, i->src2.value->constant.v128.high); -#if DTRACE - e.mov(e.rdx, i->src1.offset); - e.lea(e.r8, e.ptr[addr]); - CallNative(e, TraceContextStoreV128); -#endif // DTRACE - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -// -------------------------------------------------------------------------- -// Memory -// -------------------------------------------------------------------------- - -table->AddSequence(OPCODE_LOAD, [](X64Emitter& e, Instr*& i) { - // If this is a constant address load, check to see if it's in a register - // range. We'll also probably want a dynamic check for unverified loads. - // So far, most games use constants. - if (i->src1.value->IsConstant()) { - uint64_t address = i->src1.value->AsUint64(); - auto cbs = e.runtime()->access_callbacks(); - while (cbs) { - if (cbs->handles(cbs->context, address)) { - // Eh, hacking lambdas. - i->src3.offset = (uint64_t)cbs; - IntUnaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src) { - auto cbs = (RegisterAccessCallbacks*)i.src3.offset; - e.mov(e.rcx, (uint64_t)cbs->context); - e.mov(e.rdx, i.src1.value->AsUint64()); - CallNative(e, cbs->read); - switch (i.dest->type) { - case INT8_TYPE: - break; - case INT16_TYPE: - e.xchg(e.al, e.ah); - break; - case INT32_TYPE: - e.bswap(e.eax); - break; - case INT64_TYPE: - e.bswap(e.rax); - break; - default: ASSERT_INVALID_TYPE(); break; - } - e.mov(dest_src, e.rax); - }); - i = e.Advance(i); - return true; - } - cbs = cbs->next; - } - } - - // mov reg, [membase + address.32] - if (i->src1.value->IsConstant()) { - e.mov(e.eax, i->src1.value->AsUint32()); - } else { - Reg64 addr_off; - e.BeginOp(i->src1.value, addr_off, 0); - e.mov(e.eax, addr_off.cvt32()); // trunc to 32bits - e.EndOp(addr_off); - } - auto addr = e.rdx + e.rax; - -#if DYNAMIC_REGISTER_ACCESS_CHECK - e.inLocalLabel(); - // if ((address & 0xFF000000) == 0x7F000000) do check; - e.lea(e.r8d, e.ptr[addr]); - e.and(e.r8d, 0xFF000000); - e.cmp(e.r8d, 0x7F000000); - e.jne(".normal_addr"); - if (IsIntType(i->dest->type)) { - e.mov(e.rdx, e.rax); - CallNative(e, DynamicRegisterLoad); - Reg64 dyn_dest; - e.BeginOp(i->dest, dyn_dest, REG_DEST); - switch (i->dest->type) { - case INT8_TYPE: - e.movzx(dyn_dest, e.al); - break; - case INT16_TYPE: - e.xchg(e.al, e.ah); - e.movzx(dyn_dest, e.ax); - break; - case INT32_TYPE: - e.bswap(e.eax); - e.mov(dyn_dest.cvt32(), e.eax); - break; - case INT64_TYPE: - e.bswap(e.rax); - e.mov(dyn_dest, e.rax); - break; - default: - e.db(0xCC); - break; - } - e.EndOp(dyn_dest); - } else { - e.db(0xCC); - } - e.jmp(".skip_access"); - e.L(".normal_addr"); -#endif // DYNAMIC_REGISTER_ACCESS_CHECK - - if (i->Match(SIG_TYPE_I8, SIG_TYPE_IGNORE)) { - Reg8 dest; - e.BeginOp(i->dest, dest, REG_DEST); 
- e.mov(dest, e.byte[addr]); - e.EndOp(dest); -#if DTRACE - e.lea(e.rdx, e.ptr[addr]); - e.mov(e.r8b, dest); - CallNative(e, TraceMemoryLoadI8); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_IGNORE)) { - Reg16 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.word[addr]); - e.EndOp(dest); -#if DTRACE - e.lea(e.rdx, e.ptr[addr]); - e.mov(e.r8w, dest); - CallNative(e, TraceMemoryLoadI16); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_IGNORE)) { - Reg32 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.dword[addr]); - e.EndOp(dest); -#if DTRACE - e.lea(e.rdx, e.ptr[addr]); - e.mov(e.r8d, dest); - CallNative(e, TraceMemoryLoadI32); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_IGNORE)) { - Reg64 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.qword[addr]); - e.EndOp(dest); -#if DTRACE - e.lea(e.rdx, e.ptr[addr]); - e.mov(e.r8, dest); - CallNative(e, TraceMemoryLoadI64); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_F32, SIG_TYPE_IGNORE)) { - Xmm dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.movss(dest, e.dword[addr]); - e.EndOp(dest); -#if DTRACE - e.lea(e.rdx, e.ptr[addr]); - e.lea(e.r8, Stash(e, dest)); - CallNative(e, TraceMemoryLoadF32); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_F64, SIG_TYPE_IGNORE)) { - Xmm dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.movsd(dest, e.qword[addr]); - e.EndOp(dest); -#if DTRACE - e.lea(e.rdx, e.ptr[addr]); - e.lea(e.r8, Stash(e, dest)); - CallNative(e, TraceMemoryLoadF64); -#endif // DTRACE - } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_IGNORE)) { - Xmm dest; - e.BeginOp(i->dest, dest, REG_DEST); - // TODO(benvanik): we should try to stick to movaps if possible. - e.movups(dest, e.ptr[addr]); - e.EndOp(dest); -#if DTRACE - e.lea(e.rdx, e.ptr[addr]); - e.lea(e.r8, Stash(e, dest)); - CallNative(e, TraceMemoryLoadV128); -#endif // DTRACE - } else { - ASSERT_INVALID_TYPE(); - } - -#if DYNAMIC_REGISTER_ACCESS_CHECK - e.L(".skip_access"); - e.outLocalLabel(); -#endif // DYNAMIC_REGISTER_ACCESS_CHECK - - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_STORE, [](X64Emitter& e, Instr*& i) { - // If this is a constant address store, check to see if it's in a - // register range. We'll also probably want a dynamic check for - // unverified stores. So far, most games use constants. 
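// The callback walk below mirrors OPCODE_LOAD above: a constant guest
// address is matched against each registered MMIO range at translate time,
// and a hit lowers the store straight into a call to that range's write
// handler. Shape of the chain as inferred from its use here (sketch; the
// real declaration lives in the runtime headers):
//   struct RegisterAccessCallbacks {
//     void* context;                                      // passed in rcx
//     bool (*handles)(void* context, uint64_t address);   // range predicate
//     uint64_t (*read)(void* context, uint64_t address);  // result in rax
//     void (*write)(void* context, uint64_t address, uint64_t value);
//     RegisterAccessCallbacks* next;                      // singly-linked
//   };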
- if (i->src1.value->IsConstant()) {
- uint64_t address = i->src1.value->AsUint64();
- auto cbs = e.runtime()->access_callbacks();
- while (cbs) {
- if (cbs->handles(cbs->context, address)) {
- e.mov(e.rcx, (uint64_t)cbs->context);
- e.mov(e.rdx, address);
- if (i->src2.value->IsConstant()) {
- e.mov(e.r8, i->src2.value->AsUint64());
- } else {
- Reg64 src2;
- e.BeginOp(i->src2.value, src2, 0);
- switch (i->src2.value->type) {
- case INT8_TYPE:
- e.movzx(e.r8d, src2.cvt8());
- break;
- case INT16_TYPE:
- e.movzx(e.rax, src2.cvt16());
- e.xchg(e.al, e.ah);
- e.mov(e.r8, e.rax);
- break;
- case INT32_TYPE:
- e.mov(e.r8d, src2.cvt32()); // no movzx r64,r32 form; mov zero-extends
- e.bswap(e.r8d);
- break;
- case INT64_TYPE:
- e.mov(e.r8, src2);
- e.bswap(e.r8);
- break;
- default: ASSERT_INVALID_TYPE(); break;
- }
- e.EndOp(src2);
- }
- CallNative(e, cbs->write);
- i = e.Advance(i);
- return true;
- }
- cbs = cbs->next;
- }
- }
-
- // mov [membase + address.32], reg
- if (i->src1.value->IsConstant()) {
- e.mov(e.eax, i->src1.value->AsUint32());
- } else {
- Reg64 addr_off;
- e.BeginOp(i->src1.value, addr_off, 0);
- e.mov(e.eax, addr_off.cvt32()); // trunc to 32bits
- e.EndOp(addr_off);
- }
- auto addr = e.rdx + e.rax;
-
-#if DYNAMIC_REGISTER_ACCESS_CHECK
- // if ((address & 0xFF000000) == 0x7F000000) do check;
- e.lea(e.r8d, e.ptr[addr]);
- e.and(e.r8d, 0xFF000000);
- e.cmp(e.r8d, 0x7F000000);
- e.inLocalLabel();
- e.jne(".normal_addr");
- if (IsIntType(i->src2.value->type)) {
- Reg64 dyn_src;
- e.BeginOp(i->src2.value, dyn_src, 0);
- // Pass the guest address first; the INT16 swap below clobbers rax.
- e.mov(e.rdx, e.rax);
- switch (i->src2.value->type) {
- case INT8_TYPE:
- e.movzx(e.r8, dyn_src.cvt8());
- break;
- case INT16_TYPE:
- e.movzx(e.rax, dyn_src.cvt16());
- e.xchg(e.al, e.ah);
- e.mov(e.r8, e.rax);
- break;
- case INT32_TYPE:
- e.mov(e.r8d, dyn_src.cvt32());
- e.bswap(e.r8d);
- break;
- case INT64_TYPE:
- e.mov(e.r8, dyn_src);
- e.bswap(e.r8);
- break;
- default:
- e.db(0xCC);
- break;
- }
- e.EndOp(dyn_src);
- CallNative(e, DynamicRegisterStore);
- } else {
- e.db(0xCC);
- }
- e.jmp(".skip_access");
- e.L(".normal_addr");
-#endif // DYNAMIC_REGISTER_ACCESS_CHECK
-
- if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I8)) {
- Reg8 src;
- e.BeginOp(i->src2.value, src, 0);
- e.mov(e.byte[addr], src);
- e.EndOp(src);
-#if DTRACE
- e.lea(e.rdx, e.ptr[addr]);
- e.mov(e.r8b, src);
- CallNative(e, TraceMemoryStoreI8);
-#endif // DTRACE
- } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I8C)) {
- e.mov(e.byte[addr], i->src2.value->constant.i8);
-#if DTRACE
- e.lea(e.rdx, e.ptr[addr]);
- e.mov(e.r8b, i->src2.value->constant.i8);
- CallNative(e, TraceMemoryStoreI8);
-#endif // DTRACE
- } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16)) {
- Reg16 src;
- e.BeginOp(i->src2.value, src, 0);
- e.mov(e.word[addr], src);
- e.EndOp(src);
-#if DTRACE
- e.lea(e.rdx, e.ptr[addr]);
- e.mov(e.r8w, src);
- CallNative(e, TraceMemoryStoreI16);
-#endif // DTRACE
- } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16C)) {
- e.mov(e.word[addr], i->src2.value->constant.i16);
-#if DTRACE
- e.lea(e.rdx, e.ptr[addr]);
- e.mov(e.r8w, i->src2.value->constant.i16);
- CallNative(e, TraceMemoryStoreI16);
-#endif // DTRACE
- } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32)) {
- Reg32 src;
- e.BeginOp(i->src2.value, src, 0);
- e.mov(e.dword[addr], src);
- e.EndOp(src);
-#if DTRACE
- e.lea(e.rdx, e.ptr[addr]);
- e.mov(e.r8d, src);
- CallNative(e, TraceMemoryStoreI32);
-#endif // DTRACE
- } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32C)) {
- e.mov(e.dword[addr],
i->src2.value->constant.i32);
-#if DTRACE
- e.lea(e.rdx, e.ptr[addr]);
- e.mov(e.r8d, i->src2.value->constant.i32);
- CallNative(e, TraceMemoryStoreI32);
-#endif // DTRACE
- } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64)) {
- Reg64 src;
- e.BeginOp(i->src2.value, src, 0);
- e.mov(e.qword[addr], src);
- e.EndOp(src);
-#if DTRACE
- e.lea(e.rdx, e.ptr[addr]);
- e.mov(e.r8, src);
- CallNative(e, TraceMemoryStoreI64);
-#endif // DTRACE
- } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64C)) {
- MovMem64(e, addr, i->src2.value->constant.i64);
-#if DTRACE
- e.lea(e.rdx, e.ptr[addr]);
- e.mov(e.r8, i->src2.value->constant.i64);
- CallNative(e, TraceMemoryStoreI64);
-#endif // DTRACE
- } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32)) {
- Xmm src;
- e.BeginOp(i->src2.value, src, 0);
- e.movss(e.dword[addr], src);
- e.EndOp(src);
-#if DTRACE
- e.lea(e.rdx, e.ptr[addr]);
- e.lea(e.r8, Stash(e, src));
- CallNative(e, TraceMemoryStoreF32);
-#endif // DTRACE
- } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32C)) {
- e.mov(e.dword[addr], i->src2.value->constant.i32);
-#if DTRACE
- e.lea(e.rdx, e.ptr[addr]);
- e.mov(e.eax, i->src2.value->constant.i32);
- e.vmovd(e.xmm0, e.eax);
- e.lea(e.r8, Stash(e, e.xmm0));
- CallNative(e, TraceMemoryStoreF32);
-#endif // DTRACE
- } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64)) {
- Xmm src;
- e.BeginOp(i->src2.value, src, 0);
- e.movsd(e.qword[addr], src);
- e.EndOp(src);
-#if DTRACE
- e.lea(e.rdx, e.ptr[addr]);
- e.lea(e.r8, Stash(e, src));
- CallNative(e, TraceMemoryStoreF64);
-#endif // DTRACE
- } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64C)) {
- MovMem64(e, addr, i->src2.value->constant.i64);
-#if DTRACE
- e.lea(e.rdx, e.ptr[addr]);
- e.movsd(e.xmm0, e.ptr[addr]);
- // Pass the value by pointer in r8, matching the other F64 trace paths.
- e.lea(e.r8, Stash(e, e.xmm0));
- CallNative(e, TraceMemoryStoreF64);
-#endif // DTRACE
- } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128)) {
- Xmm src;
- e.BeginOp(i->src2.value, src, 0);
- // TODO(benvanik): we should try to stick to movaps if possible.
- e.movups(e.ptr[addr], src);
- e.EndOp(src);
-#if DTRACE
- e.lea(e.rdx, e.ptr[addr]);
- e.lea(e.r8, Stash(e, src));
- CallNative(e, TraceMemoryStoreV128);
-#endif // DTRACE
- } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128C)) {
- // TODO(benvanik): check zero
- // TODO(benvanik): correct order?
- MovMem64(e, addr, i->src2.value->constant.v128.low); - MovMem64(e, addr + 8, i->src2.value->constant.v128.high); -#if DTRACE - e.lea(e.rdx, e.ptr[addr]); - e.lea(e.r8, e.ptr[addr]); - CallNative(e, TraceMemoryStoreV128); -#endif // DTRACE - } else { - ASSERT_INVALID_TYPE(); - } - -#if DYNAMIC_REGISTER_ACCESS_CHECK - e.L(".skip_access"); - e.outLocalLabel(); -#endif // DYNAMIC_REGISTER_ACCESS_CHECK - - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_PREFETCH, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; -}); - -// -------------------------------------------------------------------------- -// Comparisons -// -------------------------------------------------------------------------- - -table->AddSequence(OPCODE_MAX, [](X64Emitter& e, Instr*& i) { - if (IsIntType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); - } else if (IsFloatType(i->dest->type)) { - XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { - if (i.src1.value->type == FLOAT32_TYPE) { - e.maxss(dest_src, src); - } else { - e.maxsd(dest_src, src); - } - }); - } else if (IsVecType(i->dest->type)) { - XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { - e.maxps(dest_src, src); - }); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_MIN, [](X64Emitter& e, Instr*& i) { - if (IsIntType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); - } else if (IsFloatType(i->dest->type)) { - XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { - if (i.src1.value->type == FLOAT32_TYPE) { - e.minss(dest_src, src); - } else { - e.minsd(dest_src, src); - } - }); - } else if (IsVecType(i->dest->type)) { - XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { - e.minps(dest_src, src); - }); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_SELECT, [](X64Emitter& e, Instr*& i) { - CheckBoolean(e, i->src1.value); - if (IsIntType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); - } else if (IsFloatType(i->dest->type) || IsVecType(i->dest->type)) { - Xmm dest, src2, src3; - e.BeginOp(i->dest, dest, REG_DEST, - i->src2.value, src2, 0, - i->src3.value, src3, 0); - // TODO(benvanik): find a way to do this without branches. 
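// One branch-free option, at the cost of rematerializing the condition in
// a GPR (sketch, untested here; blendvps keys off each lane's sign bit):
//   e.setnz(e.al);
//   e.movzx(e.eax, e.al);
//   e.neg(e.eax);                           // 0 or 0xFFFFFFFF
//   e.vmovd(e.xmm0, e.eax);
//   e.vpshufd(e.xmm0, e.xmm0, 0);           // broadcast the mask
//   e.vblendvps(dest, src3, src2, e.xmm0);  // mask ? src2 : src3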
- e.inLocalLabel(); - e.movaps(dest, src3); - e.jz(".skip"); - e.movaps(dest, src2); - e.L(".skip"); - e.outLocalLabel(); - e.EndOp(dest, src2, src3); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_IS_TRUE, [](X64Emitter& e, Instr*& i) { - CheckBoolean(e, i->src1.value); - Reg8 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.setnz(dest); - e.EndOp(dest); - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_IS_FALSE, [](X64Emitter& e, Instr*& i) { - CheckBoolean(e, i->src1.value); - Reg8 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.setz(dest); - e.EndOp(dest); - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_COMPARE_EQ, [](X64Emitter& e, Instr*& i) { - CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { - if (!invert) { - e.sete(dest); - } else { - e.setne(dest); - } - }); - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_COMPARE_NE, [](X64Emitter& e, Instr*& i) { - CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { - if (!invert) { - e.setne(dest); - } else { - e.sete(dest); - } - }); - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_COMPARE_SLT, [](X64Emitter& e, Instr*& i) { - CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { - if (!invert) { - e.setl(dest); - } else { - e.setge(dest); - } - }); - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_COMPARE_SLE, [](X64Emitter& e, Instr*& i) { - CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { - if (!invert) { - e.setle(dest); - } else { - e.setg(dest); - } - }); - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_COMPARE_SGT, [](X64Emitter& e, Instr*& i) { - CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { - if (!invert) { - e.setg(dest); - } else { - e.setle(dest); - } - }); - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_COMPARE_SGE, [](X64Emitter& e, Instr*& i) { - CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { - if (!invert) { - e.setge(dest); - } else { - e.setl(dest); - } - }); - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_COMPARE_ULT, [](X64Emitter& e, Instr*& i) { - CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { - if (!invert) { - e.setb(dest); - } else { - e.setae(dest); - } - }); - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_COMPARE_ULE, [](X64Emitter& e, Instr*& i) { - CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { - if (!invert) { - e.setbe(dest); - } else { - e.seta(dest); - } - }); - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_COMPARE_UGT, [](X64Emitter& e, Instr*& i) { - CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { - if (!invert) { - e.seta(dest); - } else { - e.setbe(dest); - } - }); - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_COMPARE_UGE, [](X64Emitter& e, Instr*& i) { - CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { - if (!invert) { - e.setae(dest); - } else { - e.setb(dest); - } - }); - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_DID_CARRY, [](X64Emitter& e, Instr*& i) { - Reg8 dest; - e.BeginOp(i->dest, dest, REG_DEST); - LoadEflags(e); - e.setc(dest); - e.EndOp(dest); - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_DID_OVERFLOW, [](X64Emitter& e, Instr*& i) { - Reg8 dest; - e.BeginOp(i->dest, dest, REG_DEST); - LoadEflags(e); - e.seto(dest); - e.EndOp(dest); - i = 
e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_DID_SATURATE, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_VECTOR_COMPARE_EQ, [](X64Emitter& e, Instr*& i) { - if (IsVecType(i->dest->type)) { - VectorCompareXX(e, i, VECTOR_CMP_EQ, true); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_VECTOR_COMPARE_SGT, [](X64Emitter& e, Instr*& i) { - if (IsVecType(i->dest->type)) { - VectorCompareXX(e, i, VECTOR_CMP_GT, true); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_VECTOR_COMPARE_SGE, [](X64Emitter& e, Instr*& i) { - if (IsVecType(i->dest->type)) { - VectorCompareXX(e, i, VECTOR_CMP_GE, true); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_VECTOR_COMPARE_UGT, [](X64Emitter& e, Instr*& i) { - if (IsVecType(i->dest->type)) { - VectorCompareXX(e, i, VECTOR_CMP_GT, false); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_VECTOR_COMPARE_UGE, [](X64Emitter& e, Instr*& i) { - if (IsVecType(i->dest->type)) { - VectorCompareXX(e, i, VECTOR_CMP_GE, false); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -// -------------------------------------------------------------------------- -// Math -// -------------------------------------------------------------------------- - -table->AddSequence(OPCODE_ADD, [](X64Emitter& e, Instr*& i) { - if (IsIntType(i->dest->type)) { - IntBinaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { - e.add(dest_src, src); - }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { - e.add(dest_src, src); - }); - } else if (IsFloatType(i->dest->type)) { - XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { - if (i.src1.value->type == FLOAT32_TYPE) { - e.addss(dest_src, src); - } else { - e.addsd(dest_src, src); - } - }); - } else if (IsVecType(i->dest->type)) { - XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { - e.addps(dest_src, src); - }); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_ADD_CARRY, [](X64Emitter& e, Instr*& i) { - if (IsIntType(i->dest->type)) { - // dest = src1 + src2 + src3.i8 - IntTernaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src2, const Operand& src3) { - Reg8 src3_8(src3.getIdx()); - if (src3.getIdx() <= 4) { - e.mov(e.ah, src3_8); - } else { - e.mov(e.al, src3_8); - e.mov(e.ah, e.al); - } - e.sahf(); - e.adc(dest_src, src2); - }, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src2, uint32_t src3) { - e.mov(e.eax, src3); - e.mov(e.ah, e.al); - e.sahf(); - e.adc(dest_src, src2); - }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src2, const Operand& src3) { - Reg8 src3_8(src3.getIdx()); - if (src3.getIdx() <= 4) { - e.mov(e.ah, src3_8); - } else { - e.mov(e.al, src3_8); - e.mov(e.ah, e.al); - } - e.sahf(); - e.adc(dest_src, src2); - }); - } else { - UNIMPLEMENTED_SEQ(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_VECTOR_ADD, [](X64Emitter& e, Instr*& i) { - if (IsVecType(i->dest->type)) { - if (i->flags == INT8_TYPE) { - UNIMPLEMENTED_SEQ(); - } else if (i->flags == INT16_TYPE) { - UNIMPLEMENTED_SEQ(); - } else if 
(i->flags == INT32_TYPE) { - UNIMPLEMENTED_SEQ(); - } else if (i->flags == FLOAT32_TYPE) { - UNIMPLEMENTED_SEQ(); - } else { - ASSERT_INVALID_TYPE(); - } - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_SUB, [](X64Emitter& e, Instr*& i) { - if (IsIntType(i->dest->type)) { - IntBinaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { - if (i.flags & ARITHMETIC_SET_CARRY) { - auto Nax = LIKE_REG(e.rax, src); - e.mov(Nax, src); - e.not(Nax); - e.stc(); - e.adc(dest_src, Nax); - } else { - e.sub(dest_src, src); - } - }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { - if (i.flags & ARITHMETIC_SET_CARRY) { - auto Nax = LIKE_REG(e.rax, dest_src); - e.mov(Nax, src); - e.not(Nax); - e.stc(); - e.adc(dest_src, Nax); - } else { - e.sub(dest_src, src); - } - }); - } else if (IsFloatType(i->dest->type)) { - XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { - if (i.src1.value->type == FLOAT32_TYPE) { - e.subss(dest_src, src); - } else { - e.subsd(dest_src, src); - } - }); - } else if (IsVecType(i->dest->type)) { - XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { - e.subps(dest_src, src); - }); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_MUL, [](X64Emitter& e, Instr*& i) { - if (IsIntType(i->dest->type)) { - IntBinaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { - // RAX = value, RDX = clobbered - // TODO(benvanik): make the register allocator put dest_src in RAX? - auto Nax = LIKE_REG(e.rax, dest_src); - e.mov(Nax, dest_src); - if (i.flags & ARITHMETIC_UNSIGNED) { - e.mul(src); - } else { - e.imul(src); - } - e.mov(dest_src, Nax); - ReloadRDX(e); - }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { - // RAX = value, RDX = clobbered - // TODO(benvanik): make the register allocator put dest_src in RAX? - auto Nax = LIKE_REG(e.rax, dest_src); - auto Ndx = LIKE_REG(e.rdx, dest_src); - e.mov(Nax, dest_src); - e.mov(Ndx, src); - if (i.flags & ARITHMETIC_UNSIGNED) { - e.mul(Ndx); - } else { - e.imul(Ndx); - } - e.mov(dest_src, Nax); - ReloadRDX(e); - }); - } else if (IsFloatType(i->dest->type)) { - XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { - if (i.flags & ARITHMETIC_UNSIGNED) { UNIMPLEMENTED_SEQ(); } - if (i.src1.value->type == FLOAT32_TYPE) { - e.mulss(dest_src, src); - } else { - e.mulsd(dest_src, src); - } - }); - } else if (IsVecType(i->dest->type)) { - XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { - if (i.flags & ARITHMETIC_UNSIGNED) { UNIMPLEMENTED_SEQ(); } - e.mulps(dest_src, src); - }); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_MUL_HI, [](X64Emitter& e, Instr*& i) { - if (IsIntType(i->dest->type)) { - IntBinaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { - // RAX = value, RDX = clobbered - // TODO(benvanik): make the register allocator put dest_src in RAX? 
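// (Refresher on why rdx matters here: one-operand mul/imul compute
// RDX:RAX = RAX * src, so the high half lands in rdx. Worked example,
// unsigned 64-bit: rax = 0x8000000000000000, src = 2 gives rax = 0,
// rdx = 1, i.e. the 128-bit product 2^64 split across the pair. In C this
// is the (unsigned __int128)a * b >> 64 idiom on GCC/Clang, or __umulh()
// with MSVC.)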
- auto Nax = LIKE_REG(e.rax, dest_src); - auto Ndx = LIKE_REG(e.rdx, dest_src); - e.mov(Nax, dest_src); - if (i.flags & ARITHMETIC_UNSIGNED) { - e.mul(src); - } else { - e.imul(src); - } - e.mov(dest_src, Ndx); - ReloadRDX(e); - }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { - // RAX = value, RDX = clobbered - // TODO(benvanik): make the register allocator put dest_src in RAX? - auto Nax = LIKE_REG(e.rax, dest_src); - auto Ndx = LIKE_REG(e.rdx, dest_src); - e.mov(Nax, dest_src); - e.mov(Ndx, src); - if (i.flags & ARITHMETIC_UNSIGNED) { - e.mul(Ndx); - } else { - e.imul(Ndx); - } - e.mov(dest_src, Ndx); - ReloadRDX(e); - }); - } else { - UNIMPLEMENTED_SEQ(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_DIV, [](X64Emitter& e, Instr*& i) { - if (IsIntType(i->dest->type)) { - IntBinaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { - // RAX = value, RDX = clobbered - // TODO(benvanik): make the register allocator put dest_src in RAX? - auto Nax = LIKE_REG(e.rax, dest_src); - auto Ndx = LIKE_REG(e.rdx, dest_src); - e.mov(Nax, dest_src); - e.xor(Ndx, Ndx); - if (i.flags & ARITHMETIC_UNSIGNED) { - e.div(src); - } else { - e.idiv(src); - } - e.mov(dest_src, Nax); - ReloadRDX(e); - }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { - // RAX = value, RDX = clobbered - // TODO(benvanik): make the register allocator put dest_src in RAX? - auto Nax = LIKE_REG(e.rax, dest_src); - auto Ndx = LIKE_REG(e.rdx, dest_src); - e.mov(Nax, dest_src); - e.mov(Ndx, src); - if (i.flags & ARITHMETIC_UNSIGNED) { - e.div(Ndx); - } else { - e.idiv(Ndx); - } - e.mov(dest_src, Nax); - ReloadRDX(e); - }); - } else if (IsFloatType(i->dest->type)) { - XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { - if (i.flags & ARITHMETIC_UNSIGNED) { UNIMPLEMENTED_SEQ(); } - if (i.src1.value->type == FLOAT32_TYPE) { - e.divss(dest_src, src); - } else { - e.divsd(dest_src, src); - } - }); - } else if (IsVecType(i->dest->type)) { - XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { - if (i.flags & ARITHMETIC_UNSIGNED) { UNIMPLEMENTED_SEQ(); } - e.divps(dest_src, src); - }); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_MUL_ADD, [](X64Emitter& e, Instr*& i) { - if (IsIntType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); - } else if (IsFloatType(i->dest->type)) { - XmmTernaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src2, const Xmm& src3) { - if (i.dest->type == FLOAT32_TYPE) { - e.vfmadd132ss(dest_src, src3, src2); - } else { - e.vfmadd132sd(dest_src, src3, src2); - } - }); - } else if (IsVecType(i->dest->type)) { - XmmTernaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src2, const Xmm& src3) { - e.vfmadd132ps(dest_src, src3, src2); - }); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_MUL_SUB, [](X64Emitter& e, Instr*& i) { - if (IsIntType(i->dest->type)) { - UNIMPLEMENTED_SEQ(); - } else if (IsFloatType(i->dest->type)) { - XmmTernaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src2, const Xmm& src3) { - if (i.dest->type == FLOAT32_TYPE) { - e.vfmsub132ss(dest_src, src3, src2); - } else { - e.vfmsub132sd(dest_src, src3, src2); - } - }); - } else if (IsVecType(i->dest->type)) { - XmmTernaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, 
const Xmm& dest_src, const Xmm& src2, const Xmm& src3) {
- e.vfmsub132ps(dest_src, src3, src2);
- });
- } else {
- ASSERT_INVALID_TYPE();
- }
- i = e.Advance(i);
- return true;
-});
-
-table->AddSequence(OPCODE_NEG, [](X64Emitter& e, Instr*& i) {
- if (IsIntType(i->dest->type)) {
- IntUnaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src) {
- e.neg(dest_src);
- });
- } else if (IsFloatType(i->dest->type)) {
- XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) {
- if (i.src1.value->type == FLOAT32_TYPE) {
- e.mov(e.rax, XMMCONSTBASE);
- e.vpxor(dest, src, XMMCONST(e.rax, XMMSignMaskPS));
- } else {
- e.mov(e.rax, XMMCONSTBASE);
- e.vpxor(dest, src, XMMCONST(e.rax, XMMSignMaskPD));
- }
- });
- } else if (IsVecType(i->dest->type)) {
- XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) {
- e.mov(e.rax, XMMCONSTBASE);
- e.vpxor(dest, src, XMMCONST(e.rax, XMMSignMaskPS));
- });
- } else {
- ASSERT_INVALID_TYPE();
- }
- i = e.Advance(i);
- return true;
-});
-
-table->AddSequence(OPCODE_ABS, [](X64Emitter& e, Instr*& i) {
- if (IsIntType(i->dest->type)) {
- UNIMPLEMENTED_SEQ();
- } else if (IsFloatType(i->dest->type)) {
- XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) {
- if (i.src1.value->type == FLOAT32_TYPE) {
- e.mov(e.rax, XMMCONSTBASE);
- e.movaps(e.xmm0, XMMCONST(e.rax, XMMSignMaskPS));
- e.vpandn(dest, e.xmm0, src);
- } else {
- e.mov(e.rax, XMMCONSTBASE);
- e.movaps(e.xmm0, XMMCONST(e.rax, XMMSignMaskPD));
- e.vpandn(dest, e.xmm0, src);
- }
- });
- } else if (IsVecType(i->dest->type)) {
- XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) {
- e.mov(e.rax, XMMCONSTBASE);
- e.movaps(e.xmm0, XMMCONST(e.rax, XMMSignMaskPS));
- e.vpandn(dest, e.xmm0, src);
- });
- } else {
- ASSERT_INVALID_TYPE();
- }
- i = e.Advance(i);
- return true;
-});
-
-table->AddSequence(OPCODE_SQRT, [](X64Emitter& e, Instr*& i) {
- if (IsFloatType(i->dest->type)) {
- XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) {
- if (i.dest->type == FLOAT32_TYPE) {
- e.sqrtss(dest, src);
- } else {
- e.sqrtsd(dest, src);
- }
- });
- } else if (IsVecType(i->dest->type)) {
- XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) {
- e.sqrtps(dest, src);
- });
- } else {
- ASSERT_INVALID_TYPE();
- }
- i = e.Advance(i);
- return true;
-});
-
-table->AddSequence(OPCODE_RSQRT, [](X64Emitter& e, Instr*& i) {
- if (IsFloatType(i->dest->type)) {
- XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) {
- if (i.dest->type == FLOAT32_TYPE) {
- e.rsqrtss(dest, src);
- } else {
- e.cvtsd2ss(dest, src);
- e.rsqrtss(dest, dest);
- e.cvtss2sd(dest, dest);
- }
- });
- } else if (IsVecType(i->dest->type)) {
- XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) {
- e.rsqrtps(dest, src);
- });
- } else {
- ASSERT_INVALID_TYPE();
- }
- i = e.Advance(i);
- return true;
-});
-
-table->AddSequence(OPCODE_POW2, [](X64Emitter& e, Instr*& i) {
- if (IsFloatType(i->dest->type)) {
- UNIMPLEMENTED_SEQ();
- } else if (IsVecType(i->dest->type)) {
- UNIMPLEMENTED_SEQ();
- } else {
- ASSERT_INVALID_TYPE();
- }
- i = e.Advance(i);
- return true;
-});
-
-table->AddSequence(OPCODE_LOG2, [](X64Emitter& e, Instr*& i) {
- if (IsFloatType(i->dest->type)) {
- UNIMPLEMENTED_SEQ();
- } else if (IsVecType(i->dest->type)) {
- UNIMPLEMENTED_SEQ();
- } else {
-
ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_DOT_PRODUCT_3, [](X64Emitter& e, Instr*& i) { - if (IsVecType(i->src1.value->type)) { - XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { - // http://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx - // TODO(benvanik): verify ordering - e.dpps(dest_src, src, B01110001); - }); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_DOT_PRODUCT_4, [](X64Emitter& e, Instr*& i) { - if (IsVecType(i->src1.value->type)) { - XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { - // http://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx - // TODO(benvanik): verify ordering - e.dpps(dest_src, src, B11110001); - }); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_AND, [](X64Emitter& e, Instr*& i) { - if (IsIntType(i->dest->type)) { - IntBinaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { - e.and(dest_src, src); - }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { - e.and(dest_src, src); - }); - } else if (IsVecType(i->dest->type)) { - XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { - e.pand(dest_src, src); - }); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_OR, [](X64Emitter& e, Instr*& i) { - if (IsIntType(i->dest->type)) { - IntBinaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { - e.or(dest_src, src); - }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { - e.or(dest_src, src); - }); - } else if (IsVecType(i->dest->type)) { - XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { - e.por(dest_src, src); - }); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_XOR, [](X64Emitter& e, Instr*& i) { - if (IsIntType(i->dest->type)) { - IntBinaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { - e.xor(dest_src, src); - }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { - e.xor(dest_src, src); - }); - } else if (IsVecType(i->dest->type)) { - XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { - e.pxor(dest_src, src); - }); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_NOT, [](X64Emitter& e, Instr*& i) { - if (IsIntType(i->dest->type)) { - IntUnaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src) { - e.not(dest_src); - }); - } else if (IsVecType(i->dest->type)) { - XmmUnaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) { - // dest_src ^= 0xFFFF... - if (dest.getIdx() != src.getIdx()) { - e.movaps(dest, src); - } - e.mov(e.rax, XMMCONSTBASE); - e.pxor(dest, XMMCONST(e.rax, XMMOne)); - }); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_SHL, [](X64Emitter& e, Instr*& i) { - if (IsIntType(i->dest->type)) { - // TODO(benvanik): use shlx if available. - IntBinaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { - // Can only shl by cl. Eww x86. 
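// (Hence the rax dance below: the count must live in cl, but rcx doubles as
// the context pointer in this convention, so it is saved and restored
// around the shift. The commented-out BMI2 route avoids cl entirely, e.g.
// shlx rax, rbx, rcx computes rax = rbx << (rcx & 63) without touching
// flags, but it needs a Haswell-class CPU and tooling that can decode it.)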
- Reg8 shamt(src.getIdx()); - e.mov(e.rax, e.rcx); - e.mov(e.cl, shamt); - e.shl(dest_src, e.cl); - e.mov(e.rcx, e.rax); - // BeaEngine can't disasm this, boo. - /*Reg32e dest_src_e(dest_src.getIdx(), MAX(dest_src.getBit(), 32)); - Reg32e src_e(src.getIdx(), MAX(dest_src.getBit(), 32)); - e.and(src_e, 0x3F); - e.shlx(dest_src_e, dest_src_e, src_e);*/ - }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { - e.shl(dest_src, src); - }); - } else { - UNIMPLEMENTED_SEQ(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_SHR, [](X64Emitter& e, Instr*& i) { - if (IsIntType(i->dest->type)) { - // TODO(benvanik): use shrx if available. - IntBinaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { - // Can only sar by cl. Eww x86. - Reg8 shamt(src.getIdx()); - e.mov(e.rax, e.rcx); - e.mov(e.cl, shamt); - e.shr(dest_src, e.cl); - e.mov(e.rcx, e.rax); - }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { - e.shr(dest_src, src); - }); - } else { - UNIMPLEMENTED_SEQ(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_SHA, [](X64Emitter& e, Instr*& i) { - if (IsIntType(i->dest->type)) { - // TODO(benvanik): use sarx if available. - IntBinaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { - // Can only sar by cl. Eww x86. - Reg8 shamt(src.getIdx()); - e.mov(e.rax, e.rcx); - e.mov(e.cl, shamt); - e.sar(dest_src, e.cl); - e.mov(e.rcx, e.rax); - }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { - e.sar(dest_src, src); - }); - } else { - UNIMPLEMENTED_SEQ(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_VECTOR_SHL, [](X64Emitter& e, Instr*& i) { - if (IsVecType(i->dest->type)) { - if (i->flags == INT8_TYPE) { - UNIMPLEMENTED_SEQ(); - } else if (i->flags == INT16_TYPE) { - UNIMPLEMENTED_SEQ(); - } else if (i->flags == INT32_TYPE) { - XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { - // src shift mask may have values >31, and x86 sets to zero when - // that happens so we mask. - e.mov(e.eax, 0x1F); - e.vmovd(e.xmm0, e.eax); - e.vpbroadcastd(e.xmm0, e.xmm0); - e.vandps(e.xmm0, src, e.xmm0); - e.vpsllvd(dest_src, dest_src, e.xmm0); - }); - } else { - ASSERT_INVALID_TYPE(); - } - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_VECTOR_SHR, [](X64Emitter& e, Instr*& i) { - if (IsVecType(i->dest->type)) { - if (i->flags == INT8_TYPE) { - UNIMPLEMENTED_SEQ(); - } else if (i->flags == INT16_TYPE) { - UNIMPLEMENTED_SEQ(); - } else if (i->flags == INT32_TYPE) { - XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) { - // src shift mask may have values >31, and x86 sets to zero when - // that happens so we mask. 
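// (Concretely: guest semantics are count mod 32 per lane, while vpsrlvd
// yields 0 for any count >= 32. Worked lane with count byte 0x21:
//   guest expects  x >> (0x21 & 31) = x >> 1
//   raw vpsrlvd    x >> 0x21        = 0
// so the broadcast 0x1F mask below reconciles the two.)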
- e.mov(e.eax, 0x1F);
- e.vmovd(e.xmm0, e.eax);
- e.vpbroadcastd(e.xmm0, e.xmm0);
- e.vandps(e.xmm0, src, e.xmm0);
- e.vpsrlvd(dest_src, dest_src, e.xmm0);
- });
- } else {
- ASSERT_INVALID_TYPE();
- }
- } else {
- ASSERT_INVALID_TYPE();
- }
- i = e.Advance(i);
- return true;
-});
-
-table->AddSequence(OPCODE_VECTOR_SHA, [](X64Emitter& e, Instr*& i) {
- if (IsVecType(i->dest->type)) {
- if (i->flags == INT8_TYPE) {
- UNIMPLEMENTED_SEQ();
- } else if (i->flags == INT16_TYPE) {
- UNIMPLEMENTED_SEQ();
- } else if (i->flags == INT32_TYPE) {
- XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) {
- // src shift mask may have values >31, and x86 sets to zero when
- // that happens so we mask.
- e.mov(e.eax, 0x1F);
- e.vmovd(e.xmm0, e.eax);
- e.vpbroadcastd(e.xmm0, e.xmm0);
- e.vandps(e.xmm0, src, e.xmm0);
- e.vpsravd(dest_src, dest_src, e.xmm0);
- });
- } else {
- ASSERT_INVALID_TYPE();
- }
- } else {
- ASSERT_INVALID_TYPE();
- }
- i = e.Advance(i);
- return true;
-});
-
-table->AddSequence(OPCODE_ROTATE_LEFT, [](X64Emitter& e, Instr*& i) {
- if (IsIntType(i->dest->type)) {
- IntBinaryOp(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) {
- // Can only rol by cl. Eww x86.
- Reg8 shamt(src.getIdx());
- e.mov(e.rax, e.rcx);
- e.mov(e.cl, shamt);
- e.rol(dest_src, e.cl);
- e.mov(e.rcx, e.rax);
- }, [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) {
- e.rol(dest_src, src);
- });
- } else {
- UNIMPLEMENTED_SEQ();
- }
- i = e.Advance(i);
- return true;
-});
-
-table->AddSequence(OPCODE_BYTE_SWAP, [](X64Emitter& e, Instr*& i) {
- if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16)) {
- Reg16 dest, src1;
- // TODO(benvanik): fix register allocator to put the value in ABCD
- //e.BeginOp(i->dest, d, REG_DEST | REG_ABCD,
- // i->src1.value, s1, 0);
- //if (d != s1) {
- // e.mov(d, s1);
- // e.xchg(d.cvt8(), Reg8(d.getIdx() + 4));
- //} else {
- // e.xchg(d.cvt8(), Reg8(d.getIdx() + 4));
- //}
- e.BeginOp(i->dest, dest, REG_DEST,
- i->src1.value, src1, 0);
- e.mov(e.ax, src1);
- e.xchg(e.ah, e.al);
- e.mov(dest, e.ax);
- e.EndOp(dest, src1);
- } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32)) {
- Reg32 dest, src1;
- e.BeginOp(i->dest, dest, REG_DEST,
- i->src1.value, src1, 0);
- if (dest.getIdx() != src1.getIdx()) {
- e.mov(dest, src1);
- e.bswap(dest);
- } else {
- e.bswap(dest);
- }
- e.EndOp(dest, src1);
- } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64)) {
- Reg64 dest, src1;
- e.BeginOp(i->dest, dest, REG_DEST,
- i->src1.value, src1, 0);
- if (dest.getIdx() != src1.getIdx()) {
- e.mov(dest, src1);
- e.bswap(dest);
- } else {
- e.bswap(dest);
- }
- e.EndOp(dest, src1);
- } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_V128)) {
- Xmm dest, src1;
- e.BeginOp(i->dest, dest, REG_DEST,
- i->src1.value, src1, 0);
- // TODO(benvanik): find a way to do this without the memory load.
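// (The load being lamented is the XMMByteSwapMask fetch below: the mask is
// a standard pshufb control reversing each 4-byte group, presumably
//   { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }
// so a single vpshufb swaps all four words at once. Without a register
// permanently holding the constant pool, the pointer is re-materialized
// through rax every time.)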
- e.mov(e.rax, XMMCONSTBASE);
- e.vpshufb(dest, src1, XMMCONST(e.rax, XMMByteSwapMask));
- e.EndOp(dest, src1);
- } else {
- ASSERT_INVALID_TYPE();
- }
- i = e.Advance(i);
- return true;
-});
-
-table->AddSequence(OPCODE_CNTLZ, [](X64Emitter& e, Instr*& i) {
- if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8)) {
- Reg8 dest;
- Reg8 src;
- e.BeginOp(i->dest, dest, REG_DEST,
- i->src1.value, src, 0);
- // Zero-extend so the 16-bit bsr below can't see stale upper bits.
- e.movzx(dest.cvt32(), src);
- e.bsr(dest.cvt16(), dest.cvt16());
- // ZF = 1 if zero
- e.mov(e.eax, 16 ^ 0xF);
- e.cmovz(dest.cvt32(), e.eax);
- // 16-bit count, then drop the 8 phantom leading zeros.
- e.xor(dest, 0xF);
- e.sub(dest, 8);
- e.EndOp(dest, src);
- } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16)) {
- Reg8 dest;
- Reg16 src;
- e.BeginOp(i->dest, dest, REG_DEST,
- i->src1.value, src, 0);
- e.bsr(dest.cvt16(), src);
- // ZF = 1 if zero
- e.mov(e.eax, 16 ^ 0xF);
- e.cmovz(dest.cvt32(), e.eax);
- e.xor(dest, 0xF);
- e.EndOp(dest, src);
- } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32)) {
- Reg8 dest;
- Reg32 src;
- e.BeginOp(i->dest, dest, REG_DEST,
- i->src1.value, src, 0);
- e.bsr(dest.cvt32(), src);
- // ZF = 1 if zero
- e.mov(e.eax, 32 ^ 0x1F);
- e.cmovz(dest.cvt32(), e.eax);
- e.xor(dest, 0x1F);
- e.EndOp(dest, src);
- } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64)) {
- Reg8 dest;
- Reg64 src;
- e.BeginOp(i->dest, dest, REG_DEST,
- i->src1.value, src, 0);
- e.bsr(dest.cvt64(), src); // operand sizes must match
- // ZF = 1 if zero
- e.mov(e.eax, 64 ^ 0x3F);
- e.cmovz(dest.cvt32(), e.eax);
- e.xor(dest, 0x3F);
- e.EndOp(dest, src);
- } else {
- UNIMPLEMENTED_SEQ();
- }
- i = e.Advance(i);
- return true;
-});
-
-table->AddSequence(OPCODE_INSERT, [](X64Emitter& e, Instr*& i) {
- if (IsVecType(i->dest->type)) {
- if (i->src3.value->type == INT8_TYPE) {
- UNIMPLEMENTED_SEQ();
- } else if (i->src3.value->type == INT16_TYPE) {
- UNIMPLEMENTED_SEQ();
- } else if (i->src3.value->type == INT32_TYPE) {
- UNIMPLEMENTED_SEQ();
- } else {
- ASSERT_INVALID_TYPE();
- }
- } else {
- ASSERT_INVALID_TYPE();
- }
- i = e.Advance(i);
- return true;
-});
-
-// TODO(benvanik): sequence extract/splat:
-// v0.i32 = extract v0.v128, 0
-// v0.v128 = splat v0.i32
-// This can be a single broadcast.
-
-table->AddSequence(OPCODE_EXTRACT, [](X64Emitter& e, Instr*& i) {
- if (IsVecType(i->src1.value->type)) {
- if (i->dest->type == INT8_TYPE) {
- Reg8 dest;
- Xmm src;
- e.BeginOp(i->dest, dest, REG_DEST,
- i->src1.value, src, 0);
- if (i->src2.value->IsConstant()) {
- e.pextrb(dest, src, i->src2.value->constant.i8);
- } else {
- UNIMPLEMENTED_SEQ();
- }
- e.EndOp(dest, src);
- } else if (i->dest->type == INT16_TYPE) {
- Reg16 dest;
- Xmm src;
- e.BeginOp(i->dest, dest, REG_DEST,
- i->src1.value, src, 0);
- if (i->src2.value->IsConstant()) {
- e.pextrw(dest, src, i->src2.value->constant.i8);
- } else {
- UNIMPLEMENTED_SEQ();
- }
- e.EndOp(dest, src);
- } else if (i->dest->type == INT32_TYPE) {
- if (i->src2.value->IsConstant()) {
- Reg32 dest;
- Xmm src;
- e.BeginOp(i->dest, dest, REG_DEST,
- i->src1.value, src, 0);
- e.pextrd(dest, src, i->src2.value->constant.i8);
- e.EndOp(dest, src);
- } else {
- Reg32 dest;
- Xmm src;
- Reg8 sel;
- e.BeginOp(i->dest, dest, REG_DEST,
- i->src1.value, src, 0,
- i->src2.value, sel, 0);
- // Get the desired word in xmm0, then extract that.
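// (extract_table_32, defined near the top of this file, is presumably four
// 16-byte pshufb controls where entry n moves word n into bytes 0..3, e.g.
//   entry 1 = { 4, 5, 6, 7, 0x80, 0x80, ... } // 0x80 lanes write zero
// The selector is masked to 0..3 and scaled by 16 to pick a row, trading a
// table load for a branch-free variable extract.)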
- e.mov(TEMP_REG, sel); - e.and(TEMP_REG, 0x03); - e.shl(TEMP_REG, 4); - e.mov(e.rax, (uintptr_t)extract_table_32); - e.movaps(e.xmm0, e.ptr[e.rax + TEMP_REG]); - e.vpshufb(e.xmm0, src, e.xmm0); - e.pextrd(dest, e.xmm0, 0); - e.EndOp(dest, src, sel); - } - } else if (i->dest->type == FLOAT32_TYPE) { - Reg32 dest; - Xmm src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - if (i->src2.value->IsConstant()) { - e.extractps(dest, src, i->src2.value->constant.i8); - } else { - UNIMPLEMENTED_SEQ(); - } - e.EndOp(dest, src); - } else { - ASSERT_INVALID_TYPE(); - } - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_SPLAT, [](X64Emitter& e, Instr*& i) { - if (IsVecType(i->dest->type)) { - if (i->Match(SIG_TYPE_V128, SIG_TYPE_I8)) { - Xmm dest; - Reg8 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.vmovd(e.xmm0, src.cvt32()); - e.vpbroadcastb(dest, e.xmm0); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_I8C)) { - Xmm dest; - e.BeginOp(i->dest, dest, REG_DEST); - // TODO(benvanik): faster constant splats. - e.mov(e.eax, i->src1.value->constant.i8); - e.vmovd(e.xmm0, e.eax); - e.vpbroadcastb(dest, e.xmm0); - e.EndOp(dest); - } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_I16)) { - Xmm dest; - Reg16 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.vmovd(e.xmm0, src.cvt32()); - e.vpbroadcastw(dest, e.xmm0); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_I16C)) { - Xmm dest; - e.BeginOp(i->dest, dest, REG_DEST); - // TODO(benvanik): faster constant splats. - e.mov(e.eax, i->src1.value->constant.i16); - e.vmovd(e.xmm0, e.eax); - e.vpbroadcastw(dest, e.xmm0); - e.EndOp(dest); - } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_I32)) { - Xmm dest; - Reg32 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.vmovd(e.xmm0, src); - e.vpbroadcastd(dest, e.xmm0); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_I32C)) { - Xmm dest; - e.BeginOp(i->dest, dest, REG_DEST); - // TODO(benvanik): faster constant splats. - e.mov(e.eax, i->src1.value->constant.i32); - e.vmovd(e.xmm0, e.eax); - e.vpbroadcastd(dest, e.xmm0); - e.EndOp(dest); - } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_F32)) { - Xmm dest, src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.vbroadcastss(dest, src); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_F32C)) { - Xmm dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(e.eax, i->src1.value->constant.i32); - e.vmovd(e.xmm0, e.eax); - e.vbroadcastss(dest, e.xmm0); - e.EndOp(dest); - } else { - ASSERT_INVALID_TYPE(); - } - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_PERMUTE, [](X64Emitter& e, Instr*& i) { - if (IsVecType(i->dest->type)) { - if (i->src1.value->type == INT32_TYPE) { - // Permute words between src2 and src3. - // TODO(benvanik): check src3 for zero. if 0, we can use pshufb. - if (i->src1.value->IsConstant()) { - uint32_t control = i->src1.value->AsUint32(); - Xmm dest, src2, src3; - e.BeginOp(i->dest, dest, REG_DEST, - i->src2.value, src2, 0, - i->src3.value, src3, 0); - // Shuffle things into the right places in dest & xmm0, - // then we blend them together. 
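// Each control byte is Altivec-style: bits [1:0] pick the element and
// bit [2] picks the source vector. Worked example, control = 0x00010405:
//   byte 3 = 0x00 -> dest lane 0 = src2[0]
//   byte 2 = 0x01 -> dest lane 1 = src2[1]
//   byte 1 = 0x04 -> dest lane 2 = src3[0]
//   byte 0 = 0x05 -> dest lane 3 = src3[1]
// src_control below packs the element picks for the pshufds and
// blend_control packs the source picks, one bit per lane, for blendps.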
- uint32_t src_control = - (((control >> 24) & 0x3) << 0) | - (((control >> 16) & 0x3) << 2) | - (((control >> 8) & 0x3) << 4) | - (((control >> 0) & 0x3) << 6); - uint32_t blend_control = - (((control >> 26) & 0x1) << 0) | - (((control >> 18) & 0x1) << 1) | - (((control >> 10) & 0x1) << 2) | - (((control >> 2) & 0x1) << 3); - if (dest.getIdx() != src3.getIdx()) { - e.pshufd(dest, src2, src_control); - e.pshufd(e.xmm0, src3, src_control); - e.blendps(dest, e.xmm0, blend_control); - } else { - e.movaps(e.xmm0, src3); - e.pshufd(dest, src2, src_control); - e.pshufd(e.xmm0, e.xmm0, src_control); - e.blendps(dest, e.xmm0, blend_control); - } - e.EndOp(dest, src2, src3); - } else { - Reg32 control; - Xmm dest, src2, src3; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, control, 0, - i->src2.value, src2, 0, - i->src3.value, src3, 0); - UNIMPLEMENTED_SEQ(); - e.EndOp(dest, control, src2, src3); - } - } else if (i->src1.value->type == VEC128_TYPE) { - // Permute bytes between src2 and src3. - if (i->src3.value->IsConstantZero()) { - // Permuting with src2/zero, so just shuffle/mask. - Xmm dest, control, src2; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, control, 0, - i->src2.value, src2, 0); - if (i->src2.value->IsConstantZero()) { - e.vpxor(dest, src2, src2); - } else { - if (i->src2.value->IsConstant()) { - LoadXmmConstant(e, src2, i->src2.value->constant.v128); - } - // Control mask needs to be shuffled. - e.mov(e.rax, XMMCONSTBASE); - e.vpshufb(e.xmm0, control, XMMCONST(e.rax, XMMByteSwapMask)); - e.vpshufb(dest, src2, e.xmm0); - // Build a mask with values in src2 having 0 and values in src3 having 1. - e.vpcmpgtb(e.xmm0, e.xmm0, XMMCONST(e.rax, XMMPermuteControl15)); - e.vpandn(dest, e.xmm0, dest); - } - e.EndOp(dest, control, src2); - } else { - // General permute. - Xmm dest, control, src2, src3; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, control, 0, - i->src2.value, src2, 0, - i->src3.value, src3, 0); - e.mov(e.rax, XMMCONSTBASE); - // Control mask needs to be shuffled. - e.vpshufb(e.xmm1, control, XMMCONST(e.rax, XMMByteSwapMask)); - // Build a mask with values in src2 having 0 and values in src3 having 1. - e.vpcmpgtb(dest, e.xmm1, XMMCONST(e.rax, XMMPermuteControl15)); - Xmm src2_shuf, src3_shuf; - if (i->src2.value->IsConstantZero()) { - e.vpxor(src2, src2); - src2_shuf = src2; - } else { - if (i->src2.value->IsConstant()) { - LoadXmmConstant(e, src2, i->src2.value->constant.v128); - } - src2_shuf = e.xmm0; - e.vpshufb(src2_shuf, src2, e.xmm1); - } - if (i->src3.value->IsConstantZero()) { - e.vpxor(src3, src3); - src3_shuf = src3; - } else { - if (i->src3.value->IsConstant()) { - LoadXmmConstant(e, src3, i->src3.value->constant.v128); - } - // NOTE: reusing xmm1 here. 
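// Before the final blend below: an illustrative scalar reference for the
// byte permute this whole branch emulates (assumed Altivec-vperm
// semantics, ignoring the host-order fixups done via XMMByteSwapMask
// above; control bytes 0-15 select from src2 and 16-31 from src3, which
// is exactly what the pcmpgtb-against-15 mask encodes):
#include <cstdint>
static void VPermReference(uint8_t dest[16], const uint8_t control[16],
                           const uint8_t src2[16], const uint8_t src3[16]) {
  for (int n = 0; n < 16; ++n) {
    uint8_t sel = control[n] & 0x1F;  // only the low 5 bits are significant
    dest[n] = sel < 16 ? src2[sel] : src3[sel - 16];
  }
}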
- src3_shuf = e.xmm1; - e.vpshufb(src3_shuf, src3, e.xmm1); - } - e.vpblendvb(dest, src2_shuf, src3_shuf, dest); - e.EndOp(dest, control, src2, src3); - } - } else { - ASSERT_INVALID_TYPE(); - } - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_SWIZZLE, [](X64Emitter& e, Instr*& i) { - if (IsVecType(i->dest->type)) { - // Defined by SWIZZLE_MASK() - if (i->flags == INT32_TYPE || i->flags == FLOAT32_TYPE) { - uint8_t swizzle_mask = (uint8_t)i->src2.offset; - swizzle_mask = - (((swizzle_mask >> 6) & 0x3) << 0) | - (((swizzle_mask >> 4) & 0x3) << 2) | - (((swizzle_mask >> 2) & 0x3) << 4) | - (((swizzle_mask >> 0) & 0x3) << 6); - Xmm dest, src1; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0); - e.pshufd(dest, src1, swizzle_mask); - e.EndOp(dest, src1); - } else { - UNIMPLEMENTED_SEQ(); - } - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_PACK, [](X64Emitter& e, Instr*& i) { - if (i->flags == PACK_TYPE_D3DCOLOR) { - UNIMPLEMENTED_SEQ(); - } else if (i->flags == PACK_TYPE_FLOAT16_2) { - UNIMPLEMENTED_SEQ(); - } else if (i->flags == PACK_TYPE_FLOAT16_4) { - UNIMPLEMENTED_SEQ(); - } else if (i->flags == PACK_TYPE_SHORT_2) { - UNIMPLEMENTED_SEQ(); - } else if (i->flags == PACK_TYPE_S8_IN_16_LO) { - UNIMPLEMENTED_SEQ(); - } else if (i->flags == PACK_TYPE_S8_IN_16_HI) { - UNIMPLEMENTED_SEQ(); - } else if (i->flags == PACK_TYPE_S16_IN_32_LO) { - UNIMPLEMENTED_SEQ(); - } else if (i->flags == PACK_TYPE_S16_IN_32_HI) { - UNIMPLEMENTED_SEQ(); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_UNPACK, [](X64Emitter& e, Instr*& i) { - if (i->flags == PACK_TYPE_D3DCOLOR) { - // ARGB (WXYZ) -> RGBA (XYZW) - // XMLoadColor - // int32_t src = (int32_t)src1.iw; - // dest.f4[0] = (float)((src >> 16) & 0xFF) * (1.0f / 255.0f); - // dest.f4[1] = (float)((src >> 8) & 0xFF) * (1.0f / 255.0f); - // dest.f4[2] = (float)(src & 0xFF) * (1.0f / 255.0f); - // dest.f4[3] = (float)((src >> 24) & 0xFF) * (1.0f / 255.0f); - XmmUnaryOp(e, i, 0, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) { - // src = ZZYYXXWW - // unpack to 000000ZZ,000000YY,000000XX,000000WW - e.mov(e.rax, XMMCONSTBASE); - e.vpshufb(dest, src, XMMCONST(e.rax, XMMUnpackD3DCOLOR)); - // mult by 1/255 - e.vmulps(dest, XMMCONST(e.rax, XMMOneOver255)); - }); - } else if (i->flags == PACK_TYPE_FLOAT16_2) { - // 1 bit sign, 5 bit exponent, 10 bit mantissa - // D3D10 half float format - // TODO(benvanik): http://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx - // Use _mm_cvtph_ps -- requires very modern processors (SSE5+) - // Unpacking half floats: http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/ - // Packing half floats: https://gist.github.com/rygorous/2156668 - // Load source, move from tight pack of X16Y16.... to X16...Y16... - // Also zero out the high end. - // TODO(benvanik): special case constant unpacks that just get 0/1/etc. - XmmUnaryOp(e, i, 0, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) { - // sx = src.iw >> 16; - // sy = src.iw & 0xFFFF; - // dest = { XMConvertHalfToFloat(sx), - // XMConvertHalfToFloat(sy), - // 0.0, - // 1.0 }; - auto addr = Stash(e, src); - e.lea(e.rdx, addr); - CallNative(e, Unpack_FLOAT16_2); - e.movaps(dest, addr); - }); - } else if (i->flags == PACK_TYPE_FLOAT16_4) { - // Could be shared with FLOAT16_2. 
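// Could indeed share a scalar helper: an illustrative sketch of the
// per-element conversion Unpack_FLOAT16_2 performs above, assuming
// standard IEEE half-precision as in the linked articles. (_mm_cvtph_ps,
// mentioned above, is provided by the F16C extension.)
#include <cstdint>
#include <cstring>
static float HalfToFloat(uint16_t h) {
  uint32_t sign = static_cast<uint32_t>(h >> 15) << 31;
  uint32_t exp = (h >> 10) & 0x1F;
  uint32_t mant = h & 0x3FF;
  uint32_t bits;
  if (exp == 0x1F) {
    bits = sign | 0x7F800000 | (mant << 13);  // Inf/NaN, payload preserved
  } else if (exp != 0) {
    bits = sign | ((exp - 15 + 127) << 23) | (mant << 13);  // normal value
  } else if (mant == 0) {
    bits = sign;  // +/- zero
  } else {
    int e = -1;  // subnormal: renormalize to recover a hidden leading 1
    do { ++e; mant <<= 1; } while (!(mant & 0x400));
    bits = sign | ((127 - 15 - e) << 23) | ((mant & 0x3FF) << 13);
  }
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}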
- UNIMPLEMENTED_SEQ(); - } else if (i->flags == PACK_TYPE_SHORT_2) { - // (VD.x) = 3.0 + (VB.x>>16)*2^-22 - // (VD.y) = 3.0 + (VB.x)*2^-22 - // (VD.z) = 0.0 - // (VD.w) = 1.0 - XmmUnaryOp(e, i, 0, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) { - // XMLoadShortN2 plus 3,3,0,3 (for some reason) - // src is (xx,xx,xx,VALUE) - e.mov(e.rax, XMMCONSTBASE); - // (VALUE,VALUE,VALUE,VALUE) - e.vbroadcastss(dest, src); - // (VALUE&0xFFFF,VALUE&0xFFFF0000,0,0) - e.andps(dest, XMMCONST(e.rax, XMMMaskX16Y16)); - // Sign extend. - e.xorps(dest, XMMCONST(e.rax, XMMFlipX16Y16)); - // Convert int->float. - e.cvtpi2ps(dest, Stash(e, dest)); - // 0x8000 to undo sign. - e.addps(dest, XMMCONST(e.rax, XMMFixX16Y16)); - // Normalize. - e.mulps(dest, XMMCONST(e.rax, XMMNormalizeX16Y16)); - // Clamp. - e.maxps(dest, XMMCONST(e.rax, XMMNegativeOne)); - // Add 3,3,0,1. - e.addps(dest, XMMCONST(e.rax, XMM3301)); - }); - } else if (i->flags == PACK_TYPE_S8_IN_16_LO) { - UNIMPLEMENTED_SEQ(); - } else if (i->flags == PACK_TYPE_S8_IN_16_HI) { - UNIMPLEMENTED_SEQ(); - } else if (i->flags == PACK_TYPE_S16_IN_32_LO) { - UNIMPLEMENTED_SEQ(); - } else if (i->flags == PACK_TYPE_S16_IN_32_HI) { - UNIMPLEMENTED_SEQ(); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -// -------------------------------------------------------------------------- -// Atomic -// -------------------------------------------------------------------------- - -table->AddSequence(OPCODE_COMPARE_EXCHANGE, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_ATOMIC_EXCHANGE, [](X64Emitter& e, Instr*& i) { - // dest = old_value = InterlockedExchange(src1 = address, src2 = new_value); - if (i->Match(SIG_TYPE_I32, SIG_TYPE_I64, SIG_TYPE_I32)) { - Reg32 dest, src2; - Reg64 src1; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src2.value, src2, 0); - Reg64 real_src1 = src1; - if (dest.getIdx() == src1.getIdx()) { - e.mov(TEMP_REG, src1); - real_src1 = TEMP_REG; - } - e.mov(dest, src2); - e.lock(); - e.xchg(e.dword[real_src1], dest); - e.EndOp(dest, src1, src2); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I64, SIG_TYPE_I32C)) { - Reg32 dest; - Reg64 src1; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0); - Reg64 real_src1 = src1; - if (dest.getIdx() == src1.getIdx()) { - e.mov(TEMP_REG, src1); - real_src1 = TEMP_REG; - } - e.mov(dest, i->src2.value->constant.i32); - e.lock(); - e.xchg(e.dword[real_src1], dest); - e.EndOp(dest, src1); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_ATOMIC_ADD, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; -}); - -table->AddSequence(OPCODE_ATOMIC_SUB, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; -}); -} diff --git a/src/alloy/backend/x64/lowering/lowering_table.cc b/src/alloy/backend/x64/lowering/lowering_table.cc deleted file mode 100644 index 6c5c8468b..000000000 --- a/src/alloy/backend/x64/lowering/lowering_table.cc +++ /dev/null @@ -1,71 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. 
* - ****************************************************************************** - */ - -#include - -#include -#include - -using namespace alloy; -using namespace alloy::backend::x64; -using namespace alloy::backend::x64::lowering; - - -LoweringTable::LoweringTable(X64Backend* backend) : - backend_(backend) { - xe_zero_struct(lookup_, sizeof(lookup_)); -} - -LoweringTable::~LoweringTable() { - for (size_t n = 0; n < XECOUNT(lookup_); n++) { - auto entry = lookup_[n]; - while (entry) { - auto next = entry->next; - delete entry; - entry = next; - } - } -} - -int LoweringTable::Initialize() { - RegisterSequences(this); - return 0; -} - -void LoweringTable::AddSequence(hir::Opcode starting_opcode, sequence_fn_t fn) { - auto existing_entry = lookup_[starting_opcode]; - auto new_entry = new sequence_fn_entry_t(); - new_entry->fn = fn; - new_entry->next = existing_entry; - lookup_[starting_opcode] = new_entry; -} - -int LoweringTable::ProcessBlock(X64Emitter& e, hir::Block* block) { - // Process instructions. - auto instr = block->instr_head; - while (instr) { - bool processed = false; - auto entry = lookup_[instr->opcode->num]; - while (entry) { - if ((*entry->fn)(e, instr)) { - processed = true; - break; - } - entry = entry->next; - } - if (!processed) { - // No sequence found! - XELOGE("Unable to process HIR opcode %s", instr->opcode->name); - return 1; - instr = e.Advance(instr); - } - } - - return 0; -} \ No newline at end of file diff --git a/src/alloy/backend/x64/lowering/lowering_table.h b/src/alloy/backend/x64/lowering/lowering_table.h deleted file mode 100644 index f62bfd777..000000000 --- a/src/alloy/backend/x64/lowering/lowering_table.h +++ /dev/null @@ -1,58 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. * - ****************************************************************************** - */ - -#ifndef ALLOY_BACKEND_X64_X64_LOWERING_LOWERING_TABLE_H_ -#define ALLOY_BACKEND_X64_X64_LOWERING_LOWERING_TABLE_H_ - -#include -#include - - -namespace alloy { -namespace backend { -namespace x64 { -class X64Backend; -class X64Emitter; -namespace lowering { - - -class LoweringTable { -public: - LoweringTable(X64Backend* backend); - ~LoweringTable(); - - int Initialize(); - - int ProcessBlock(X64Emitter& e, hir::Block* block); - -public: - typedef bool(*sequence_fn_t)(X64Emitter& e, hir::Instr*& instr); - void AddSequence(hir::Opcode starting_opcode, sequence_fn_t fn); - -private: - class sequence_fn_entry_t { - public: - sequence_fn_t fn; - sequence_fn_entry_t* next; - }; - - // NOTE: this class is shared by multiple threads and is not thread safe. - // Do not modify anything after init. 
- X64Backend* backend_; - sequence_fn_entry_t* lookup_[hir::__OPCODE_MAX_VALUE]; -}; - - -} // namespace lowering -} // namespace x64 -} // namespace backend -} // namespace alloy - - -#endif // ALLOY_BACKEND_X64_X64_LOWERING_LOWERING_TABLE_H_ diff --git a/src/alloy/backend/x64/lowering/op_utils.inl b/src/alloy/backend/x64/lowering/op_utils.inl deleted file mode 100644 index 749e84901..000000000 --- a/src/alloy/backend/x64/lowering/op_utils.inl +++ /dev/null @@ -1,1063 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2014 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. * - ****************************************************************************** - */ - -// NOTE: this file is only designed to be included by lowering_sequencies.cc! - -#ifndef ALLOY_BACKEND_X64_X64_LOWERING_OP_UTILS_INL_ -#define ALLOY_BACKEND_X64_X64_LOWERING_OP_UTILS_INL_ - -namespace { - -#define LIKE_REG(dest, like) Reg(dest.getIdx(), dest.getKind(), like.getBit(), false) -#define TEMP_REG e.r8 -#define TEMP_LIKE(like) Reg(TEMP_REG.getIdx(), TEMP_REG.getKind(), like.getBit(), false) - -#define STASH_OFFSET 32 - -// If we are running with tracing on we have to store the EFLAGS in the stack, -// otherwise our calls out to C to print will clear it before DID_CARRY/etc -// can get the value. -#define STORE_EFLAGS 1 - -void LoadEflags(X64Emitter& e) { -#if STORE_EFLAGS - e.mov(e.eax, e.dword[e.rsp + STASH_OFFSET]); - e.push(e.rax); - e.popf(); -#else - // EFLAGS already present. -#endif // STORE_EFLAGS -} -void StoreEflags(X64Emitter& e) { -#if STORE_EFLAGS - e.pushf(); - e.pop(e.qword[e.rsp + STASH_OFFSET]); -#else - // EFLAGS should have CA set? - // (so long as we don't fuck with it) -#endif // STORE_EFLAGS -} - -Address Stash(X64Emitter& e, const Xmm& r) { - // TODO(benvanik): ensure aligned. - auto addr = e.ptr[e.rsp + STASH_OFFSET]; - e.movups(addr, r); - return addr; -} - -void LoadXmmConstant(X64Emitter& e, const Xmm& dest, const vec128_t& v) { - if (!v.low && !v.high) { - // zero - e.vpxor(dest, dest); - //} else if (v.low == ~0ull && v.high == ~0ull) { - // one - // TODO(benvanik): XMMCONST? - } else { - // TODO(benvanik): more efficient loading of partial values? - e.mov(e.qword[e.rsp + STASH_OFFSET], v.low); - e.mov(e.qword[e.rsp + STASH_OFFSET + 8], v.high); - e.vmovaps(dest, e.ptr[e.rsp + STASH_OFFSET]); - } -} - -// Moves a 64bit immediate into memory. -void MovMem64(X64Emitter& e, RegExp& addr, uint64_t v) { - if ((v & ~0x7FFFFFFF) == 0) { - // Fits under 31 bits, so just load using normal mov. - e.mov(e.qword[addr], v); - } else if ((v & ~0x7FFFFFFF) == ~0x7FFFFFFF) { - // Negative number that fits in 32bits. - e.mov(e.qword[addr], v); - } else { - // 64bit number that needs double movs. - e.mov(e.rax, v); - e.mov(e.qword[addr], e.rax); - } -} - -void CallNative(X64Emitter& e, void* target) { - e.mov(e.rax, (uint64_t)target); - e.call(e.rax); - e.mov(e.rcx, e.qword[e.rsp + StackLayout::GUEST_RCX_HOME]); - e.mov(e.rdx, e.qword[e.rcx + 8]); // membase -} - -void ReloadRDX(X64Emitter& e) { - e.mov(e.rdx, e.qword[e.rcx + 8]); // membase -} - -// Sets EFLAGs with zf for the given value. -// ZF = 1 if false, 0 = true (so jz = jump if false) -void CheckBoolean(X64Emitter& e, Value* v) { - if (v->IsConstant()) { - e.mov(e.ah, (v->IsConstantZero() ? 
1 : 0) << 6); - e.sahf(); - } else if (v->type == INT8_TYPE) { - Reg8 src; - e.BeginOp(v, src, 0); - e.test(src, src); - e.EndOp(src); - } else if (v->type == INT16_TYPE) { - Reg16 src; - e.BeginOp(v, src, 0); - e.test(src, src); - e.EndOp(src); - } else if (v->type == INT32_TYPE) { - Reg32 src; - e.BeginOp(v, src, 0); - e.test(src, src); - e.EndOp(src); - } else if (v->type == INT64_TYPE) { - Reg64 src; - e.BeginOp(v, src, 0); - e.test(src, src); - e.EndOp(src); - } else if (v->type == FLOAT32_TYPE) { - // TODO(benvanik): mask? - Xmm src; - e.BeginOp(v, src, 0); - e.ptest(src, src); - e.EndOp(src); - } else if (v->type == FLOAT64_TYPE) { - // TODO(benvanik): mask? - Xmm src; - e.BeginOp(v, src, 0); - e.ptest(src, src); - e.EndOp(src); - } else if (v->type == VEC128_TYPE) { - Xmm src; - e.BeginOp(v, src, 0); - e.ptest(src, src); - e.EndOp(src); - } else { - ASSERT_INVALID_TYPE(); - } -} - -// Compares src1 and src2 and calls the given fn to set a byte based on EFLAGS. -void CompareXX(X64Emitter& e, Instr*& i, void(set_fn)(X64Emitter& e, Reg8& dest, bool invert)) { - if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8, SIG_TYPE_I8)) { - Reg8 dest; - Reg8 src1, src2; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src2.value, src2, 0); - e.cmp(src1, src2); - set_fn(e, dest, false); - e.EndOp(dest, src1, src2); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8, SIG_TYPE_I8C)) { - Reg8 dest; - Reg8 src1; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0); - e.cmp(src1, i->src2.value->constant.i8); - set_fn(e, dest, false); - e.EndOp(dest, src1); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8C, SIG_TYPE_I8)) { - Reg8 dest; - Reg8 src2; - e.BeginOp(i->dest, dest, REG_DEST, - i->src2.value, src2, 0); - e.cmp(src2, i->src1.value->constant.i8); - set_fn(e, dest, true); - e.EndOp(dest, src2); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16, SIG_TYPE_I16)) { - Reg8 dest; - Reg16 src1, src2; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src2.value, src2, 0); - e.cmp(src1, src2); - set_fn(e, dest, false); - e.EndOp(dest, src1, src2); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16, SIG_TYPE_I16C)) { - Reg8 dest; - Reg16 src1; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0); - e.cmp(src1, i->src2.value->constant.i16); - set_fn(e, dest, false); - e.EndOp(dest, src1); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16C, SIG_TYPE_I16)) { - Reg8 dest; - Reg16 src2; - e.BeginOp(i->dest, dest, REG_DEST, - i->src2.value, src2, 0); - e.cmp(src2, i->src1.value->constant.i16); - e.sete(dest); - set_fn(e, dest, true); - e.EndOp(dest, src2); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32, SIG_TYPE_I32)) { - Reg8 dest; - Reg32 src1, src2; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src2.value, src2, 0); - e.cmp(src1, src2); - set_fn(e, dest, false); - e.EndOp(dest, src1, src2); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32, SIG_TYPE_I32C)) { - Reg8 dest; - Reg32 src1; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0); - e.cmp(src1, i->src2.value->constant.i32); - set_fn(e, dest, false); - e.EndOp(dest, src1); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32C, SIG_TYPE_I32)) { - Reg8 dest; - Reg32 src2; - e.BeginOp(i->dest, dest, REG_DEST, - i->src2.value, src2, 0); - e.cmp(src2, i->src1.value->constant.i32); - set_fn(e, dest, true); - e.EndOp(dest, src2); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64, SIG_TYPE_I64)) { - Reg8 dest; - Reg64 src1, src2; - e.BeginOp(i->dest, dest, REG_DEST, - 
i->src1.value, src1, 0, - i->src2.value, src2, 0); - e.cmp(src1, src2); - set_fn(e, dest, false); - e.EndOp(dest, src1, src2); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64, SIG_TYPE_I64C)) { - Reg8 dest; - Reg64 src1; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0); - e.mov(e.rax, i->src2.value->constant.i64); - e.cmp(src1, e.rax); - set_fn(e, dest, false); - e.EndOp(dest, src1); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64C, SIG_TYPE_I64)) { - Reg8 dest; - Reg64 src2; - e.BeginOp(i->dest, dest, REG_DEST, - i->src2.value, src2, 0); - e.mov(e.rax, i->src1.value->constant.i64); - e.cmp(src2, e.rax); - set_fn(e, dest, true); - e.EndOp(dest, src2); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_F32, SIG_TYPE_F32)) { - Reg8 dest; - Xmm src1, src2; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src2.value, src2, 0); - e.comiss(src1, src2); - set_fn(e, dest, false); - e.EndOp(dest, src1, src2); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_F32, SIG_TYPE_F32C)) { - Reg8 dest; - Xmm src1; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0); - if (i->src2.value->IsConstantZero()) { - e.pxor(e.xmm0, e.xmm0); - } else { - e.mov(e.eax, (uint32_t)i->src2.value->constant.i32); - e.pinsrd(e.xmm0, e.eax, 0); - } - e.comiss(src1, e.xmm0); - set_fn(e, dest, false); - e.EndOp(dest, src1); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_F64, SIG_TYPE_F64)) { - Reg8 dest; - Xmm src1, src2; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src2.value, src2, 0); - e.comisd(src1, src2); - set_fn(e, dest, false); - e.EndOp(dest, src1, src2); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_F64, SIG_TYPE_F64C)) { - Reg8 dest; - Xmm src1; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0); - if (i->src2.value->IsConstantZero()) { - e.pxor(e.xmm0, e.xmm0); - } else { - e.mov(e.rax, (uint64_t)i->src2.value->constant.i64); - e.pinsrq(e.xmm0, e.rax, 0); - } - e.comisd(src1, e.xmm0); - set_fn(e, dest, false); - e.EndOp(dest, src1); - } else { - UNIMPLEMENTED_SEQ(); - } -}; - -enum VectoreCompareOp { - VECTOR_CMP_EQ, - VECTOR_CMP_GT, - VECTOR_CMP_GE, -}; -// Compares src1 to src2 with the given op and sets the dest. -// Dest will have each part set to all ones if the compare passes. -void VectorCompareXX(X64Emitter& e, Instr*& i, VectoreCompareOp op, bool as_signed) { - Xmm dest, src1, src2; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src2.value, src2, 0); - if (op == VECTOR_CMP_EQ) { - // Commutative, so simple. - Xmm real_src; - if (dest.getIdx() == src1.getIdx()) { - real_src = src2; - } else if (dest.getIdx() == src2.getIdx()) { - real_src = src1; - } else { - e.movaps(dest, src1); - real_src = src2; - } - if (i->flags == INT8_TYPE) { - e.pcmpeqb(dest, real_src); - } else if (i->flags == INT16_TYPE) { - e.pcmpeqw(dest, real_src); - } else if (i->flags == INT32_TYPE) { - e.pcmpeqd(dest, real_src); - } else if (i->flags == FLOAT32_TYPE) { - e.cmpeqps(dest, real_src); - } else { - ASSERT_INVALID_TYPE(); - } - } else if (i->flags == FLOAT32_TYPE) { - // Float GT/GE must be emulated. - if (op == VECTOR_CMP_GT) { - // Have to swap: src2 < src1. - if (dest.getIdx() == src2.getIdx()) { - e.cmpltps(dest, src1); - } else if (dest.getIdx() == src1.getIdx()) { - e.movaps(e.xmm0, src1); - e.movaps(dest, src2); - e.cmpltps(dest, e.xmm0); - } else { - e.movaps(dest, src2); - e.cmpltps(dest, src1); - } - } else if (op == VECTOR_CMP_GE) { - // Have to swap: src2 <= src1. 
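// cmpps' eight predicates cover EQ/LT/LE/UNORD and their negations but
// not an ordered GT/GE directly, hence the operand swap; the swapped
// lt/le forms also stay false for NaNs, unlike cmpnltps/cmpnleps.
// Intrinsic equivalent of what the cmpltps/cmpleps emission produces:
#include <xmmintrin.h>
static inline __m128 CmpGtPs(__m128 a, __m128 b) { return _mm_cmplt_ps(b, a); }
static inline __m128 CmpGePs(__m128 a, __m128 b) { return _mm_cmple_ps(b, a); }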
- if (dest.getIdx() == src2.getIdx()) { - e.cmpleps(dest, src1); - } else if (dest.getIdx() == src1.getIdx()) { - e.movaps(e.xmm0, src1); - e.movaps(dest, src2); - e.cmpleps(dest, e.xmm0); - } else { - e.movaps(dest, src2); - e.cmpleps(dest, src1); - } - } else { - ASSERT_INVALID_TYPE(); - } - } else { - // Integer types are easier. - Xmm real_src; - if (dest.getIdx() == src1.getIdx()) { - real_src = src2; - } else if (dest.getIdx() == src2.getIdx()) { - e.movaps(e.xmm0, src2); - e.movaps(dest, src1); - real_src = e.xmm0; - } else { - e.movaps(dest, src1); - real_src = src2; - } - if (op == VECTOR_CMP_GT) { - if (i->flags == INT8_TYPE) { - if (as_signed) { - e.pcmpgtb(dest, real_src); - } else { - UNIMPLEMENTED_SEQ(); - } - } else if (i->flags == INT16_TYPE) { - if (as_signed) { - e.pcmpgtw(dest, real_src); - } else { - UNIMPLEMENTED_SEQ(); - } - } else if (i->flags == INT32_TYPE) { - if (as_signed) { - e.pcmpgtd(dest, real_src); - } else { - UNIMPLEMENTED_SEQ(); - } - } else { - ASSERT_INVALID_TYPE(); - } - } else if (op == VECTOR_CMP_GE) { - if (i->flags == INT8_TYPE) { - if (as_signed) { - UNIMPLEMENTED_SEQ(); - } else { - UNIMPLEMENTED_SEQ(); - } - } else if (i->flags == INT16_TYPE) { - if (as_signed) { - UNIMPLEMENTED_SEQ(); - } else { - UNIMPLEMENTED_SEQ(); - } - } else if (i->flags == INT32_TYPE) { - if (as_signed) { - UNIMPLEMENTED_SEQ(); - } else { - UNIMPLEMENTED_SEQ(); - } - } else { - ASSERT_INVALID_TYPE(); - } - } else { - ASSERT_INVALID_TYPE(); - } - } - e.EndOp(dest, src1, src2); -}; - -typedef void(v_fn)(X64Emitter& e, Instr& i, const Reg& dest_src); -template -void IntUnaryOpV(X64Emitter& e, Instr*& i, v_fn v_fn, - T& dest, T& src1) { - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0); - if (dest.getIdx() == src1.getIdx()) { - v_fn(e, *i, dest); - } else { - e.mov(dest, src1); - v_fn(e, *i, dest); - } - e.EndOp(dest, src1); -} -template -void IntUnaryOpC(X64Emitter& e, Instr*& i, v_fn v_fn, - T& dest, Value* src1) { - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, (uint64_t)src1->get_constant(CT())); - v_fn(e, *i, dest); - e.EndOp(dest); -} -void IntUnaryOp(X64Emitter& e, Instr*& i, v_fn v_fn) { - if (i->Match(SIG_TYPE_I8, SIG_TYPE_I8)) { - Reg8 dest, src1; - IntUnaryOpV(e, i, v_fn, dest, src1); - } else if (i->Match(SIG_TYPE_I8, SIG_TYPE_I8C)) { - Reg8 dest; - IntUnaryOpC(e, i, v_fn, dest, i->src1.value); - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16)) { - Reg16 dest, src1; - IntUnaryOpV(e, i, v_fn, dest, src1); - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16C)) { - Reg16 dest; - IntUnaryOpC(e, i, v_fn, dest, i->src1.value); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32)) { - Reg32 dest, src1; - IntUnaryOpV(e, i, v_fn, dest, src1); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32C)) { - Reg32 dest; - IntUnaryOpC(e, i, v_fn, dest, i->src1.value); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64)) { - Reg64 dest, src1; - IntUnaryOpV(e, i, v_fn, dest, src1); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64C)) { - Reg64 dest; - IntUnaryOpC(e, i, v_fn, dest, i->src1.value); - } else { - ASSERT_INVALID_TYPE(); - } - if (i->flags & ARITHMETIC_SET_CARRY) { - StoreEflags(e); - } -}; - -typedef void(vv_fn)(X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src); -typedef void(vc_fn)(X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src); -template -void IntBinaryOpVV(X64Emitter& e, Instr*& i, vv_fn vv_fn, - TD& dest, TS1& src1, TS2& src2) { - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src2.value, src2, 0); - if 
(dest.getIdx() == src1.getIdx()) { - vv_fn(e, *i, dest, src2); - } else if (dest.getIdx() == src2.getIdx()) { - if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { - vv_fn(e, *i, dest, src1); - } else { - // Eww. - auto Ntx = TEMP_LIKE(src1); - e.mov(Ntx, src1); - vv_fn(e, *i, Ntx, src2); - e.mov(dest, Ntx); - } - } else { - e.mov(dest, src1); - vv_fn(e, *i, dest, src2); - } - e.EndOp(dest, src1, src2); -} -template -void IntBinaryOpVC(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn, - TD& dest, TS1& src1, Value* src2) { - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0); - if (dest.getBit() <= 32) { - // 32-bit. - if (dest.getIdx() == src1.getIdx()) { - vc_fn(e, *i, dest, (uint32_t)src2->get_constant(CT())); - } else { - e.mov(dest, src1); - vc_fn(e, *i, dest, (uint32_t)src2->get_constant(CT())); - } - } else { - // 64-bit. - if (dest.getIdx() == src1.getIdx()) { - e.mov(TEMP_REG, src2->constant.i64); - vv_fn(e, *i, dest, TEMP_REG); - } else { - e.mov(TEMP_REG, src2->constant.i64); - e.mov(dest, src1); - vv_fn(e, *i, dest, TEMP_REG); - } - } - e.EndOp(dest, src1); -} -template -void IntBinaryOpCV(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn, - TD& dest, Value* src1, TS2& src2) { - e.BeginOp(i->dest, dest, REG_DEST, - i->src2.value, src2, 0); - if (dest.getBit() <= 32) { - // 32-bit. - if (dest.getIdx() == src2.getIdx()) { - if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { - vc_fn(e, *i, dest, (uint32_t)src1->get_constant(CT())); - } else { - // Eww. - auto Ntx = TEMP_LIKE(src2); - e.mov(Ntx, src2); - e.mov(dest, (uint32_t)src1->get_constant(CT())); - vv_fn(e, *i, dest, Ntx); - } - } else { - if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { - e.mov(dest, src2); - vc_fn(e, *i, dest, (uint32_t)src1->get_constant(CT())); - } else { - // Need a cv_fn. Or a better way to do all of this. - e.mov(dest, (uint32_t)src1->get_constant(CT())); - vv_fn(e, *i, dest, src2); - } - } - } else { - // 64-bit. - if (dest.getIdx() == src2.getIdx()) { - if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { - e.mov(TEMP_REG, src1->constant.i64); - vv_fn(e, *i, dest, TEMP_REG); - } else { - // Eww. - e.mov(TEMP_REG, src1->constant.i64); - vv_fn(e, *i, TEMP_REG, src2); - e.mov(dest, TEMP_REG); - } - } else { - e.mov(TEMP_REG, src2); - e.mov(dest, src1->constant.i64); - vv_fn(e, *i, dest, TEMP_REG); - } - } - e.EndOp(dest, src2); -} -void IntBinaryOp(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn) { - // TODO(benvanik): table lookup. This linear scan is slow. - // Note: we assume DEST.type = SRC1.type, but that SRC2.type may vary. 
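// How this dispatcher is meant to be driven (illustrative OPCODE_ADD
// wiring, not quoted from this file): vv_fn covers register-register
// forms, vc_fn covers 32-bit-immediate forms, and the VV/VC/CV helpers
// above resolve operand aliasing and 64-bit immediates:
//   IntBinaryOp(e, i,
//       [](X64Emitter& e, Instr& i, const Reg& d, const Operand& s) {
//         e.add(d, s);
//       },
//       [](X64Emitter& e, Instr& i, const Reg& d, uint32_t s) {
//         e.add(d, s);
//       });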
- XEASSERT(i->dest->type == i->src1.value->type); - if (i->Match(SIG_TYPE_I8, SIG_TYPE_I8, SIG_TYPE_I8)) { - Reg8 dest, src1, src2; - IntBinaryOpVV(e, i, vv_fn, dest, src1, src2); - } else if (i->Match(SIG_TYPE_I8, SIG_TYPE_I8, SIG_TYPE_I8C)) { - Reg8 dest, src1; - IntBinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); - } else if (i->Match(SIG_TYPE_I8, SIG_TYPE_I8C, SIG_TYPE_I8)) { - Reg8 dest, src2; - IntBinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I16)) { - Reg16 dest, src1, src2; - IntBinaryOpVV(e, i, vv_fn, dest, src1, src2); - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I16C)) { - Reg16 dest, src1; - IntBinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16C, SIG_TYPE_I16)) { - Reg16 dest, src2; - IntBinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I32)) { - Reg32 dest, src1, src2; - IntBinaryOpVV(e, i, vv_fn, dest, src1, src2); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I32C)) { - Reg32 dest, src1; - IntBinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32C, SIG_TYPE_I32)) { - Reg32 dest, src2; - IntBinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I64)) { - Reg64 dest, src1, src2; - IntBinaryOpVV(e, i, vv_fn, dest, src1, src2); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I64C)) { - Reg64 dest, src1; - IntBinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64C, SIG_TYPE_I64)) { - Reg64 dest, src2; - IntBinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); - // Start forced src2=i8 - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I8)) { - Reg16 dest, src1; - Reg8 src2; - IntBinaryOpVV(e, i, vv_fn, dest, src1, src2); - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I8C)) { - Reg16 dest, src1; - IntBinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16C, SIG_TYPE_I8)) { - Reg16 dest; - Reg8 src2; - IntBinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I8)) { - Reg32 dest, src1; - Reg8 src2; - IntBinaryOpVV(e, i, vv_fn, dest, src1, src2); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I8C)) { - Reg32 dest, src1; - IntBinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32C, SIG_TYPE_I8)) { - Reg32 dest; - Reg8 src2; - IntBinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I8)) { - Reg64 dest, src1; - Reg8 src2; - IntBinaryOpVV(e, i, vv_fn, dest, src1, src2); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I8C)) { - Reg64 dest, src1; - IntBinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64C, SIG_TYPE_I8)) { - Reg64 dest; - Reg8 src2; - IntBinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); - } else { - ASSERT_INVALID_TYPE(); - } - if (i->flags & ARITHMETIC_SET_CARRY) { - StoreEflags(e); - } -}; - -typedef void(vvv_fn)(X64Emitter& e, Instr& i, const Reg& dest_src1, const Operand& src2, const Operand& src3); -typedef void(vvc_fn)(X64Emitter& e, Instr& i, const Reg& dest_src1, const Operand& src2, uint32_t src3); -typedef 
void(vcv_fn)(X64Emitter& e, Instr& i, const Reg& dest_src1, uint32_t src2, const Operand& src3); -template -void IntTernaryOpVVV(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, - TD& dest, TS1& src1, TS2& src2, TS3& src3) { - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src2.value, src2, 0, - i->src3.value, src3, 0); - if (dest.getIdx() == src1.getIdx()) { - vvv_fn(e, *i, dest, src2, src3); - } else if (dest.getIdx() == src2.getIdx()) { - if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { - vvv_fn(e, *i, dest, src1, src3); - } else { - UNIMPLEMENTED_SEQ(); - } - } else if (dest.getIdx() == src3.getIdx()) { - auto Ntx = TEMP_LIKE(src3); - e.mov(Ntx, src3); - e.mov(dest, src1); - vvv_fn(e, *i, dest, src2, Ntx); - } else { - e.mov(dest, src1); - vvv_fn(e, *i, dest, src2, src3); - } - e.EndOp(dest, src1, src2, src3); -} -template -void IntTernaryOpVVC(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vvc_fn vvc_fn, - TD& dest, TS1& src1, TS2& src2, Value* src3) { - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src2.value, src2, 0); - if (dest.getBit() <= 32) { - // 32-bit. - if (dest.getIdx() == src1.getIdx()) { - vvc_fn(e, *i, dest, src2, (uint32_t)src3->get_constant(CT())); - } else if (dest == src2) { - if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { - vvc_fn(e, *i, dest, src1, (uint32_t)src3->get_constant(CT())); - } else { - // Eww. - auto Ntx = TEMP_LIKE(src2); - e.mov(Ntx, src2); - e.mov(dest, src1); - vvc_fn(e, *i, dest, Ntx, (uint32_t)src3->get_constant(CT())); - } - } else { - e.mov(dest, src1); - vvc_fn(e, *i, dest, src2, (uint32_t)src3->get_constant(CT())); - } - } else { - // 64-bit. - if (dest.getIdx() == src1.getIdx()) { - e.mov(TEMP_REG, src3->constant.i64); - vvv_fn(e, *i, dest, src2, TEMP_REG); - } else if (dest.getIdx() == src2.getIdx()) { - if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { - e.mov(TEMP_REG, src3->constant.i64); - vvv_fn(e, *i, dest, src1, TEMP_REG); - } else { - // Eww. - e.mov(TEMP_REG, src1); - e.mov(src1, src2); - e.mov(dest, TEMP_REG); - e.mov(TEMP_REG, src3->constant.i64); - vvv_fn(e, *i, dest, src1, TEMP_REG); - } - } else { - e.mov(TEMP_REG, src3->constant.i64); - e.mov(dest, src1); - vvv_fn(e, *i, dest, src2, TEMP_REG); - } - } - e.EndOp(dest, src1, src2); -} -template -void IntTernaryOpVCV(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vcv_fn vcv_fn, - TD& dest, TS1& src1, Value* src2, TS3& src3) { - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src3.value, src3, 0); - if (dest.getBit() <= 32) { - // 32-bit. - if (dest.getIdx() == src1.getIdx()) { - vcv_fn(e, *i, dest, (uint32_t)src2->get_constant(CT()), src3); - } else if (dest.getIdx() == src3.getIdx()) { - if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { - vcv_fn(e, *i, dest, (uint32_t)src2->get_constant(CT()), src1); - } else { - // Eww. - auto Ntx = TEMP_LIKE(src3); - e.mov(Ntx, src3); - e.mov(dest, src1); - vcv_fn(e, *i, dest, (uint32_t)src2->get_constant(CT()), Ntx); - } - } else { - e.mov(dest, src1); - vcv_fn(e, *i, dest, (uint32_t)src2->get_constant(CT()), src3); - } - } else { - // 64-bit. - if (dest.getIdx() == src1.getIdx()) { - e.mov(TEMP_REG, src2->constant.i64); - vvv_fn(e, *i, dest, TEMP_REG, src3); - } else if (dest.getIdx() == src3.getIdx()) { - if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { - e.mov(TEMP_REG, src2->constant.i64); - vvv_fn(e, *i, dest, src1, TEMP_REG); - } else { - // Eww. 
- e.mov(TEMP_REG, src1); - e.mov(src1, src3); - e.mov(dest, TEMP_REG); - e.mov(TEMP_REG, src2->constant.i64); - vvv_fn(e, *i, dest, TEMP_REG, src1); - } - } else { - e.mov(TEMP_REG, src2->constant.i64); - e.mov(dest, src1); - vvv_fn(e, *i, dest, TEMP_REG, src3); - } - } - e.EndOp(dest, src1, src3); -} -void IntTernaryOp(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vvc_fn vvc_fn, vcv_fn vcv_fn) { - // TODO(benvanik): table lookup. This linear scan is slow. - // Note: we assume DEST.type = SRC1.type = SRC2.type, but that SRC3.type may vary. - XEASSERT(i->dest->type == i->src1.value->type && - i->dest->type == i->src2.value->type); - // TODO(benvanik): table lookup. - if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8, SIG_TYPE_I8, SIG_TYPE_I8)) { - Reg8 dest, src1, src2; - Reg8 src3; - IntTernaryOpVVV(e, i, vvv_fn, dest, src1, src2, src3); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8, SIG_TYPE_I8, SIG_TYPE_I8C)) { - Reg8 dest, src1, src2; - IntTernaryOpVVC(e, i, vvv_fn, vvc_fn, dest, src1, src2, i->src3.value); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I8)) { - Reg16 dest, src1, src2; - Reg8 src3; - IntTernaryOpVVV(e, i, vvv_fn, dest, src1, src2, src3); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I8C)) { - Reg16 dest, src1, src2; - IntTernaryOpVVC(e, i, vvv_fn, vvc_fn, dest, src1, src2, i->src3.value); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I8)) { - Reg32 dest, src1, src2; - Reg8 src3; - IntTernaryOpVVV(e, i,vvv_fn, dest, src1, src2, src3); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I8C)) { - Reg32 dest, src1, src2; - IntTernaryOpVVC(e, i, vvv_fn, vvc_fn, dest, src1, src2, i->src3.value); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I8)) { - Reg64 dest, src1, src2; - Reg8 src3; - IntTernaryOpVVV(e, i, vvv_fn, dest, src1, src2, src3); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I8C)) { - Reg64 dest, src1, src2; - IntTernaryOpVVC(e, i, vvv_fn, vvc_fn, dest, src1, src2, i->src3.value); - // - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8, SIG_TYPE_I8C, SIG_TYPE_I8)) { - Reg8 dest, src1; - Reg8 src3; - IntTernaryOpVCV(e, i, vvv_fn, vcv_fn, dest, src1, i->src2.value, src3); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16, SIG_TYPE_I16C, SIG_TYPE_I8)) { - Reg16 dest, src1; - Reg8 src3; - IntTernaryOpVCV(e, i, vvv_fn, vcv_fn, dest, src1, i->src2.value, src3); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32, SIG_TYPE_I32C, SIG_TYPE_I8)) { - Reg32 dest, src1; - Reg8 src3; - IntTernaryOpVCV(e, i, vvv_fn, vcv_fn, dest, src1, i->src2.value, src3); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64, SIG_TYPE_I64C, SIG_TYPE_I8)) { - Reg64 dest, src1; - Reg8 src3; - IntTernaryOpVCV(e, i, vvv_fn, vcv_fn, dest, src1, i->src2.value, src3); - } else { - ASSERT_INVALID_TYPE(); - } - if (i->flags & ARITHMETIC_SET_CARRY) { - StoreEflags(e); - } -} - -// Since alot of SSE ops can take dest + src, just do that. -// Worst case the callee can dedupe. 
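// The Xmm helpers below follow the same callback scheme as the integer
// ones above; note that for constant operands XmmUnaryOpC loads the
// constant into dest and invokes the callback with dest == src, so
// callbacks must tolerate aliasing. A standalone model of an
// aliasing-safe unary body (illustrative only):
#include <xmmintrin.h>
static inline __m128 NegatePs(__m128 src) {
  return _mm_xor_ps(src, _mm_set1_ps(-0.0f));  // flip the sign bits
}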
-typedef void(xmm_v_fn)(X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src); -void XmmUnaryOpV(X64Emitter& e, Instr*& i, xmm_v_fn v_fn, - Xmm& dest, Xmm& src1) { - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0); - v_fn(e, *i, dest, src1); - e.EndOp(dest, src1); -} -void XmmUnaryOpC(X64Emitter& e, Instr*& i, xmm_v_fn v_fn, - Xmm& dest, Value* src1) { - e.BeginOp(i->dest, dest, REG_DEST); - if (src1->type == FLOAT32_TYPE) { - e.mov(e.eax, (uint32_t)src1->constant.i32); - e.movd(dest, e.eax); - } else if (src1->type == FLOAT64_TYPE) { - e.mov(e.rax, (uint64_t)src1->constant.i64); - e.movq(dest, e.rax); - } else { - LoadXmmConstant(e, dest, src1->constant.v128); - } - v_fn(e, *i, dest, dest); - e.EndOp(dest); -} -void XmmUnaryOp(X64Emitter& e, Instr*& i, uint32_t flags, xmm_v_fn v_fn) { - if (IsFloatType(i->src1.value->type)) { - if (i->Match(SIG_TYPE_F32, SIG_TYPE_F32)) { - Xmm dest, src1; - XmmUnaryOpV(e, i, v_fn, dest, src1); - } else if (i->Match(SIG_TYPE_F32, SIG_TYPE_F32C)) { - Xmm dest; - XmmUnaryOpC(e, i, v_fn, dest, i->src1.value); - } else if (i->Match(SIG_TYPE_F64, SIG_TYPE_F64)) { - Xmm dest, src1; - XmmUnaryOpV(e, i, v_fn, dest, src1); - } else if (i->Match(SIG_TYPE_F64, SIG_TYPE_F64C)) { - Xmm dest; - XmmUnaryOpC(e, i, v_fn, dest, i->src1.value); - } else { - ASSERT_INVALID_TYPE(); - } - } else if (IsVecType(i->src1.value->type)) { - if (i->Match(SIG_TYPE_V128, SIG_TYPE_V128)) { - Xmm dest, src1; - XmmUnaryOpV(e, i, v_fn, dest, src1); - } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_V128C)) { - Xmm dest; - XmmUnaryOpC(e, i, v_fn, dest, i->src1.value); - } else { - ASSERT_INVALID_TYPE(); - } - } else { - ASSERT_INVALID_TYPE(); - } -}; - -// TODO(benvanik): allow a vvv form for dest = src1 + src2 that new SSE -// ops support. -typedef void(xmm_vv_fn)(X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src); -void XmmBinaryOpVV(X64Emitter& e, Instr*& i, xmm_vv_fn vv_fn, - Xmm& dest, Xmm& src1, Xmm& src2) { - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src2.value, src2, 0); - if (dest.getIdx() == src1.getIdx()) { - vv_fn(e, *i, dest, src2); - } else if (dest.getIdx() == src2.getIdx()) { - if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { - vv_fn(e, *i, dest, src1); - } else { - // Eww. 
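// Standalone model of the non-commutative dest==src2 case resolved just
// below: compute through a scratch register so src2 is read before dest
// is overwritten (subpd stands in for any vv_fn; illustrative only).
#include <emmintrin.h>
static __m128d SubIntoAliased(__m128d src1, __m128d src2_aliasing_dest) {
  __m128d scratch = src1;                             // movaps xmm0, src1
  scratch = _mm_sub_pd(scratch, src2_aliasing_dest);  // vv_fn(xmm0, src2)
  return scratch;                                     // movaps dest, xmm0
}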
- e.movaps(e.xmm0, src1); - vv_fn(e, *i, e.xmm0, src2); - e.movaps(dest, e.xmm0); - } - } else { - e.movaps(dest, src1); - vv_fn(e, *i, dest, src2); - } - e.EndOp(dest, src1, src2); -} -void XmmBinaryOpVC(X64Emitter& e, Instr*& i, xmm_vv_fn vv_fn, - Xmm& dest, Xmm& src1, Value* src2) { - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0); - if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { - if (src2->type == FLOAT32_TYPE) { - e.mov(e.eax, (uint32_t)src2->constant.i32); - e.movss(dest, e.eax); - } else if (src2->type == FLOAT64_TYPE) { - e.mov(e.rax, (uint64_t)src2->constant.i64); - e.movsd(dest, e.rax); - } else { - LoadXmmConstant(e, dest, src2->constant.v128); - } - vv_fn(e, *i, dest, src1); - } else { - if (dest.getIdx() != src1.getIdx()) { - e.movaps(dest, src1); - } - if (src2->type == FLOAT32_TYPE) { - e.mov(e.eax, (uint32_t)src2->constant.i32); - e.movss(e.xmm0, e.eax); - } else if (src2->type == FLOAT64_TYPE) { - e.mov(e.rax, (uint64_t)src2->constant.i64); - e.movsd(e.xmm0, e.rax); - } else { - LoadXmmConstant(e, e.xmm0, src2->constant.v128); - } - vv_fn(e, *i, dest, e.xmm0); - } - e.EndOp(dest, src1); -} -void XmmBinaryOpCV(X64Emitter& e, Instr*& i, xmm_vv_fn vv_fn, - Xmm& dest, Value* src1, Xmm& src2) { - e.BeginOp(i->dest, dest, REG_DEST, - i->src2.value, src2, 0); - if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { - if (src1->type == FLOAT32_TYPE) { - e.mov(e.eax, (uint32_t)src1->constant.i32); - e.movss(dest, e.eax); - } else if (src1->type == FLOAT64_TYPE) { - e.mov(e.rax, (uint64_t)src1->constant.i64); - e.movsd(dest, e.rax); - } else { - LoadXmmConstant(e, dest, src1->constant.v128); - } - vv_fn(e, *i, dest, src2); - } else { - auto real_src2 = src2; - if (dest.getIdx() == src2.getIdx()) { - e.movaps(e.xmm0, src2); - real_src2 = e.xmm0; - } - if (src1->type == FLOAT32_TYPE) { - e.mov(e.eax, (uint32_t)src1->constant.i32); - e.movss(dest, e.eax); - } else if (src1->type == FLOAT64_TYPE) { - e.mov(e.rax, (uint64_t)src1->constant.i64); - e.movsd(dest, e.rax); - } else { - LoadXmmConstant(e, dest, src1->constant.v128); - } - vv_fn(e, *i, dest, real_src2); - } - e.EndOp(dest, src2); -} -void XmmBinaryOp(X64Emitter& e, Instr*& i, uint32_t flags, xmm_vv_fn vv_fn) { - // TODO(benvanik): table lookup. This linear scan is slow. - if (!i->src1.value->IsConstant() && !i->src2.value->IsConstant()) { - Xmm dest, src1, src2; - XmmBinaryOpVV(e, i, vv_fn, dest, src1, src2); - } else if (!i->src1.value->IsConstant() && i->src2.value->IsConstant()) { - Xmm dest, src1; - XmmBinaryOpVC(e, i, vv_fn, dest, src1, i->src2.value); - } else if (i->src1.value->IsConstant() && !i->src2.value->IsConstant()) { - Xmm dest, src2; - XmmBinaryOpCV(e, i, vv_fn, dest, i->src1.value, src2); - } else { - ASSERT_INVALID_TYPE(); - } - if (flags & ARITHMETIC_SET_CARRY) { - StoreEflags(e); - } -}; - -typedef void(xmm_vvv_fn)(X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src2, const Xmm& src3); -void XmmTernaryOpVVV(X64Emitter& e, Instr*& i, xmm_vvv_fn vvv_fn, - Xmm& dest, Xmm& src1, Xmm& src2, Xmm& src3) { - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src2.value, src2, 0, - i->src3.value, src3, 0); - if (dest.getIdx() == src1.getIdx()) { - vvv_fn(e, *i, dest, src2, src3); - } else if (dest.getIdx() == src2.getIdx()) { - if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { - vvv_fn(e, *i, dest, src1, src3); - } else { - // Eww. 
- e.movaps(e.xmm0, src1); - vvv_fn(e, *i, e.xmm0, src2, src3); - e.movaps(dest, e.xmm0); - } - } else if (dest.getIdx() == src3.getIdx()) { - if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { - vvv_fn(e, *i, dest, src1, src2); - } else { - e.movaps(e.xmm0, src3); - e.movaps(dest, src1); - vvv_fn(e, *i, dest, src2, e.xmm0); - } - } else { - e.movaps(dest, src1); - vvv_fn(e, *i, dest, src2, src3); - } - e.EndOp(dest, src1, src2, src3); -} -void XmmTernaryOp(X64Emitter& e, Instr*& i, uint32_t flags, xmm_vvv_fn vvv_fn) { - // TODO(benvanik): table lookup. This linear scan is slow. - if (!i->src1.value->IsConstant() && !i->src2.value->IsConstant() && - !i->src3.value->IsConstant()) { - Xmm dest, src1, src2, src3; - XmmTernaryOpVVV(e, i, vvv_fn, dest, src1, src2, src3); - } else { - ASSERT_INVALID_TYPE(); - } - if (flags & ARITHMETIC_SET_CARRY) { - StoreEflags(e); - } -}; - -} // namespace - -#endif // ALLOY_BACKEND_X64_X64_LOWERING_OP_UTILS_INL_ diff --git a/src/alloy/backend/x64/lowering/sources.gypi b/src/alloy/backend/x64/lowering/sources.gypi deleted file mode 100644 index d6cdeb1bb..000000000 --- a/src/alloy/backend/x64/lowering/sources.gypi +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright 2013 Ben Vanik. All Rights Reserved. -{ - 'sources': [ - 'lowering_sequences.cc', - 'lowering_sequences.h', - 'lowering_table.cc', - 'lowering_table.h', - 'op_utils.inl', - 'tracers.cc', - 'tracers.h', - ], -} diff --git a/src/alloy/backend/x64/sources.gypi b/src/alloy/backend/x64/sources.gypi index 7ca63e25d..38167e3f1 100644 --- a/src/alloy/backend/x64/sources.gypi +++ b/src/alloy/backend/x64/sources.gypi @@ -12,11 +12,12 @@ 'x64_emitter.h', 'x64_function.cc', 'x64_function.h', + 'x64_sequence.inl', + 'x64_sequences.cc', + 'x64_sequences.h', 'x64_thunk_emitter.cc', 'x64_thunk_emitter.h', - ], - - 'includes': [ - 'lowering/sources.gypi', + 'x64_tracers.cc', + 'x64_tracers.h', ], } diff --git a/src/alloy/backend/x64/x64_backend.cc b/src/alloy/backend/x64/x64_backend.cc index 076ab1cbb..40283f6d2 100644 --- a/src/alloy/backend/x64/x64_backend.cc +++ b/src/alloy/backend/x64/x64_backend.cc @@ -12,26 +12,23 @@ #include #include #include +#include #include -#include -#include using namespace alloy; using namespace alloy::backend; using namespace alloy::backend::x64; -using namespace alloy::backend::x64::lowering; using namespace alloy::runtime; X64Backend::X64Backend(Runtime* runtime) : - code_cache_(0), lowering_table_(0), + code_cache_(0), Backend(runtime) { } X64Backend::~X64Backend() { alloy::tracing::WriteEvent(EventType::Deinit({ })); - delete lowering_table_; delete code_cache_; } @@ -41,6 +38,8 @@ int X64Backend::Initialize() { return result; } + RegisterSequences(); + machine_info_.register_sets[0] = { 0, "gpr", @@ -68,9 +67,6 @@ int X64Backend::Initialize() { delete thunk_emitter; delete allocator; - lowering_table_ = new LoweringTable(this); - RegisterSequences(lowering_table_); - alloy::tracing::WriteEvent(EventType::Init({ })); diff --git a/src/alloy/backend/x64/x64_backend.h b/src/alloy/backend/x64/x64_backend.h index dd12c0347..0ff3018cd 100644 --- a/src/alloy/backend/x64/x64_backend.h +++ b/src/alloy/backend/x64/x64_backend.h @@ -20,7 +20,6 @@ namespace backend { namespace x64 { class X64CodeCache; -namespace lowering { class LoweringTable; } #define ALLOY_HAS_X64_BACKEND 1 @@ -38,8 +37,6 @@ public: HostToGuestThunk host_to_guest_thunk() const { return host_to_guest_thunk_; } GuestToHostThunk guest_to_host_thunk() const { return guest_to_host_thunk_; } - lowering::LoweringTable* lowering_table() 
const { return lowering_table_; } - virtual int Initialize(); virtual Assembler* CreateAssembler(); @@ -48,8 +45,6 @@ private: X64CodeCache* code_cache_; HostToGuestThunk host_to_guest_thunk_; GuestToHostThunk guest_to_host_thunk_; - - lowering::LoweringTable* lowering_table_; }; diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index 80ed2cbca..ce1e4e70a 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -11,10 +11,14 @@ #include #include +#include +#include #include -#include #include #include +#include +#include +#include using namespace alloy; using namespace alloy::backend; @@ -31,6 +35,13 @@ namespace x64 { static const size_t MAX_CODE_SIZE = 1 * 1024 * 1024; +static const size_t STASH_OFFSET = 32; + +// If we are running with tracing on we have to store the EFLAGS in the stack, +// otherwise our calls out to C to print will clear it before DID_CARRY/etc +// can get the value. +#define STORE_EFLAGS 1 + } // namespace x64 } // namespace backend } // namespace alloy @@ -145,12 +156,9 @@ int X64Emitter::Emit(HIRBuilder* builder, size_t& out_stack_size) { mov(qword[rsp + StackLayout::GUEST_RCX_HOME], rcx); mov(qword[rsp + StackLayout::GUEST_RET_ADDR], rdx); mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], 0); - // ReloadRDX: mov(rdx, qword[rcx + 8]); // membase } - auto lowering_table = backend_->lowering_table(); - // Body. auto block = builder->first_block(); while (block) { @@ -161,12 +169,17 @@ int X64Emitter::Emit(HIRBuilder* builder, size_t& out_stack_size) { label = label->next; } - // Add instructions. - // The table will process sequences of instructions to (try to) - // generate optimal code. - current_instr_ = block->instr_head; - if (lowering_table->ProcessBlock(*this, block)) { - return 1; + // Process instructions. + const Instr* instr = block->instr_head; + while (instr) { + const Instr* new_tail = instr; + if (!SelectSequence(*this, instr, &new_tail)) { + // No sequence found! + XEASSERTALWAYS(); + XELOGE("Unable to process HIR opcode %s", instr->opcode->name); + break; + } + instr = new_tail; } block = block->next; @@ -191,16 +204,320 @@ int X64Emitter::Emit(HIRBuilder* builder, size_t& out_stack_size) { return 0; } -Instr* X64Emitter::Advance(Instr* i) { - auto next = i->next; - current_instr_ = next; - return next; -} - -void X64Emitter::MarkSourceOffset(Instr* i) { +void X64Emitter::MarkSourceOffset(const Instr* i) { auto entry = source_map_arena_.Alloc(); entry->source_offset = i->src1.offset; entry->hir_offset = uint32_t(i->block->ordinal << 16) | i->ordinal; entry->code_offset = getSize(); source_map_count_++; } + +void X64Emitter::DebugBreak() { + // TODO(benvanik): notify debugger. + db(0xCC); +} + +void X64Emitter::Trap() { + // TODO(benvanik): notify debugger. + db(0xCC); +} + +void X64Emitter::UnimplementedInstr(const hir::Instr* i) { + // TODO(benvanik): notify debugger. + db(0xCC); + XEASSERTALWAYS(); +} + +uint64_t ResolveFunctionSymbol(void* raw_context, uint64_t symbol_info_ptr) { + // TODO(benvanik): generate this thunk at runtime? or a shim? 
+ auto thread_state = *reinterpret_cast(raw_context); + auto symbol_info = reinterpret_cast(symbol_info_ptr); + + Function* fn = NULL; + thread_state->runtime()->ResolveFunction(symbol_info->address(), &fn); + XEASSERTNOTNULL(fn); + auto x64_fn = static_cast(fn); + return reinterpret_cast(x64_fn->machine_code()); +} + +void X64Emitter::Call(const hir::Instr* instr, runtime::FunctionInfo* symbol_info) { + auto fn = reinterpret_cast(symbol_info->function()); + // Resolve address to the function to call and store in rax. + // TODO(benvanik): caching/etc. For now this makes debugging easier. + if (fn) { + mov(rax, reinterpret_cast(fn->machine_code())); + } else { + CallNative(ResolveFunctionSymbol, reinterpret_cast(symbol_info)); + } + + // Actually jump/call to rax. + if (instr->flags & CALL_TAIL) { + // Pass the callers return address over. + mov(rdx, qword[rsp + StackLayout::GUEST_RET_ADDR]); + + add(rsp, static_cast(stack_size())); + jmp(rax); + } else { + // Return address is from the previous SET_RETURN_ADDRESS. + mov(rdx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]); + call(rax); + } +} + +uint64_t ResolveFunctionAddress(void* raw_context, uint64_t target_address) { + // TODO(benvanik): generate this thunk at runtime? or a shim? + auto thread_state = *reinterpret_cast(raw_context); + + // TODO(benvanik): required? + target_address &= 0xFFFFFFFF; + + Function* fn = NULL; + thread_state->runtime()->ResolveFunction(target_address, &fn); + XEASSERTNOTNULL(fn); + auto x64_fn = static_cast(fn); + return reinterpret_cast(x64_fn->machine_code()); +} + +void X64Emitter::CallIndirect(const hir::Instr* instr, const Reg64& reg) { + // Check if return. + if (instr->flags & CALL_POSSIBLE_RETURN) { + cmp(reg.cvt32(), dword[rsp + StackLayout::GUEST_RET_ADDR]); + je("epilog", CodeGenerator::T_NEAR); + } + + // Resolve address to the function to call and store in rax. + // TODO(benvanik): caching/etc. For now this makes debugging easier. + if (reg.getIdx() != rdx.getIdx()) { + mov(rdx, reg); + } + CallNative(ResolveFunctionAddress); + + // Actually jump/call to rax. + if (instr->flags & CALL_TAIL) { + // Pass the callers return address over. + mov(rdx, qword[rsp + StackLayout::GUEST_RET_ADDR]); + + add(rsp, static_cast(stack_size())); + jmp(rax); + } else { + // Return address is from the previous SET_RETURN_ADDRESS. 
+ mov(rdx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]); + call(rax); + } +} + +uint64_t UndefinedCallExtern(void* raw_context, uint64_t symbol_info_ptr) { + auto symbol_info = reinterpret_cast(symbol_info_ptr); + XELOGW("undefined extern call to %.8X %s", + symbol_info->address(), + symbol_info->name()); + return 0; +} +void X64Emitter::CallExtern(const hir::Instr* instr, const FunctionInfo* symbol_info) { + XEASSERT(symbol_info->behavior() == FunctionInfo::BEHAVIOR_EXTERN); + if (!symbol_info->extern_handler()) { + CallNative(UndefinedCallExtern, reinterpret_cast(symbol_info)); + } else { + // rcx = context + // rdx = target host function + // r8 = arg0 + // r9 = arg1 + mov(rdx, reinterpret_cast(symbol_info->extern_handler())); + mov(r8, reinterpret_cast(symbol_info->extern_arg0())); + mov(r9, reinterpret_cast(symbol_info->extern_arg1())); + auto thunk = backend()->guest_to_host_thunk(); + mov(rax, reinterpret_cast(thunk)); + call(rax); + ReloadECX(); + ReloadEDX(); + // rax = host return + } +} + +void X64Emitter::CallNative(void* fn) { + mov(rax, reinterpret_cast(fn)); + call(rax); + ReloadECX(); + ReloadEDX(); +} + +void X64Emitter::CallNative(uint64_t(*fn)(void* raw_context)) { + mov(rax, reinterpret_cast(fn)); + call(rax); + ReloadECX(); + ReloadEDX(); +} + +void X64Emitter::CallNative(uint64_t(*fn)(void* raw_context, uint64_t arg0)) { + mov(rax, reinterpret_cast(fn)); + call(rax); + ReloadECX(); + ReloadEDX(); +} + +void X64Emitter::CallNative(uint64_t(*fn)(void* raw_context, uint64_t arg0), uint64_t arg0) { + mov(rdx, arg0); + mov(rax, reinterpret_cast(fn)); + call(rax); + ReloadECX(); + ReloadEDX(); +} + +void X64Emitter::SetReturnAddress(uint64_t value) { + mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], value); +} + +void X64Emitter::ReloadECX() { + mov(rcx, qword[rsp + StackLayout::GUEST_RCX_HOME]); +} + +void X64Emitter::ReloadEDX() { + mov(rdx, qword[rcx + 8]); // membase +} + +void X64Emitter::LoadEflags() { +#if STORE_EFLAGS + mov(eax, dword[rsp + STASH_OFFSET]); + push(rax); + popf(); +#else + // EFLAGS already present. +#endif // STORE_EFLAGS +} + +void X64Emitter::StoreEflags() { +#if STORE_EFLAGS + pushf(); + pop(qword[rsp + STASH_OFFSET]); +#else + // EFLAGS should have CA set? + // (so long as we don't fuck with it) +#endif // STORE_EFLAGS +} + +bool X64Emitter::ConstantFitsIn32Reg(uint64_t v) { + if ((v & ~0x7FFFFFFF) == 0) { + // Fits under 31 bits, so just load using normal mov. + return true; + } else if ((v & ~0x7FFFFFFF) == ~0x7FFFFFFF) { + // Negative number that fits in 32bits. + return true; + } + return false; +} + +void X64Emitter::MovMem64(const RegExp& addr, uint64_t v) { + if ((v & ~0x7FFFFFFF) == 0) { + // Fits under 31 bits, so just load using normal mov. + mov(qword[addr], v); + } else if ((v & ~0x7FFFFFFF) == ~0x7FFFFFFF) { + // Negative number that fits in 32bits. + mov(qword[addr], v); + } else if (!(v >> 32)) { + // All high bits are zero. It'd be nice if we had a way to load a 32bit + // immediate without sign extending! + // TODO(benvanik): this is super common, find a better way. + mov(dword[addr], static_cast(v)); + mov(dword[addr + 4], 0); + } else { + // 64bit number that needs double movs. 
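// Standalone model of the four store strategies, with sample constants;
// the single-mov forms rely on x86-64 sign-extending 32-bit immediates
// (illustrative only, C++14 constexpr):
#include <cstdint>
enum class Imm64Kind { kMovImm32, kMovImm32Sx, kTwoDwordsZeroHigh, kTwoDwords };
constexpr Imm64Kind ClassifyImm64(uint64_t v) {
  if ((v & ~0x7FFFFFFFull) == 0) return Imm64Kind::kMovImm32;  // 0x7FFFFFFF
  if ((v & ~0x7FFFFFFFull) == ~0x7FFFFFFFull) {
    return Imm64Kind::kMovImm32Sx;  // e.g. 0xFFFFFFFF80000000
  }
  if (!(v >> 32)) return Imm64Kind::kTwoDwordsZeroHigh;  // e.g. 0x00000000FFFFFFFF
  return Imm64Kind::kTwoDwords;                          // e.g. 0x0123456789ABCDEF
}
static_assert(ClassifyImm64(0xFFFFFFFFull) == Imm64Kind::kTwoDwordsZeroHigh,
              "32 set bits need two stores: a qword imm32 would sign-extend");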
+    mov(dword[addr], static_cast<uint32_t>(v));
+    mov(dword[addr + 4], static_cast<uint32_t>(v >> 32));
+  }
+}
+
+Address X64Emitter::GetXmmConstPtr(XmmConst id) {
+  static const vec128_t xmm_consts[] = {
+    /* XMMZero                */ vec128f(0.0f, 0.0f, 0.0f, 0.0f),
+    /* XMMOne                 */ vec128f(1.0f, 1.0f, 1.0f, 1.0f),
+    /* XMMNegativeOne         */ vec128f(-1.0f, -1.0f, -1.0f, -1.0f),
+    /* XMMMaskX16Y16          */ vec128i(0x0000FFFF, 0xFFFF0000, 0x00000000, 0x00000000),
+    /* XMMFlipX16Y16          */ vec128i(0x00008000, 0x00000000, 0x00000000, 0x00000000),
+    /* XMMFixX16Y16           */ vec128f(-32768.0f, 0.0f, 0.0f, 0.0f),
+    /* XMMNormalizeX16Y16     */ vec128f(1.0f / 32767.0f, 1.0f / (32767.0f * 65536.0f), 0.0f, 0.0f),
+    /* XMM3301                */ vec128f(3.0f, 3.0f, 0.0f, 1.0f),
+    /* XMMSignMaskPS          */ vec128i(0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u),
+    /* XMMSignMaskPD          */ vec128i(0x00000000u, 0x80000000u, 0x00000000u, 0x80000000u),
+    /* XMMByteSwapMask        */ vec128i(0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu),
+    /* XMMPermuteControl15    */ vec128b(15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15),
+    /* XMMUnpackD3DCOLOR      */ vec128i(0xFFFFFF02, 0xFFFFFF01, 0xFFFFFF00, 0xFFFFFF02),
+    /* XMMOneOver255          */ vec128f(1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f),
+    /* XMMShiftMaskPS         */ vec128i(0x0000001Fu, 0x0000001Fu, 0x0000001Fu, 0x0000001Fu),
+    /* XMMOneMask             */ vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu),
+  };
+  // TODO(benvanik): cache base pointer somewhere? stack? It'd be nice to
+  //     prevent this move.
+  // TODO(benvanik): move to predictable location in PPCContext? could then
+  //     just do rcx relative addressing with no rax overwriting.
+  mov(rax, (uint64_t)&xmm_consts[id]);
+  return ptr[rax];
+}
+
+void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) {
+  // http://www.agner.org/optimize/optimizing_assembly.pdf
+  // 13.4 Generating constants
+  if (!v.low && !v.high) {
+    // 0000...
+    vpxor(dest, dest);
+  } else if (v.low == ~0ull && v.high == ~0ull) {
+    // 1111...
+    vmovaps(dest, GetXmmConstPtr(XMMOneMask));
+  } else {
+    // TODO(benvanik): see what other common values are.
+    // TODO(benvanik): build constant table - 99% are reused.
+    MovMem64(rsp + STASH_OFFSET, v.low);
+    MovMem64(rsp + STASH_OFFSET + 8, v.high);
+    vmovdqa(dest, ptr[rsp + STASH_OFFSET]);
+  }
+}
+
+void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, float v) {
+  union {
+    float f;
+    uint32_t i;
+  } x = { v };
+  if (!v) {
+    // 0
+    vpxor(dest, dest);
+  } else if (x.i == ~0u) {
+    // 1111...
+    vmovaps(dest, GetXmmConstPtr(XMMOneMask));
+  } else {
+    // TODO(benvanik): see what other common values are.
+    // TODO(benvanik): build constant table - 99% are reused.
+    mov(eax, x.i);
+    vmovd(dest, eax);
+  }
+}
+
+void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, double v) {
+  union {
+    double d;
+    uint64_t i;
+  } x = { v };
+  if (!v) {
+    // 0
+    vpxor(dest, dest);
+  } else if (x.i == ~0ULL) {
+    // 1111...
+    vmovaps(dest, GetXmmConstPtr(XMMOneMask));
+  } else {
+    // TODO(benvanik): see what other common values are.
+    // TODO(benvanik): build constant table - 99% are reused.
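+    // The union punning above just reinterprets the IEEE-754 bits: v = 1.0
+    // yields x.i = 0x3FF0000000000000, and the mov/vmovq pair below lands
+    // those bits in the xmm register unchanged (no int->float conversion).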
+ mov(rax, x.i); + vmovq(dest, rax); + } +} + +Address X64Emitter::StashXmm(const Xmm& r) { + auto addr = ptr[rsp + STASH_OFFSET]; + vmovups(addr, r); + return addr; +} + +Address X64Emitter::StashXmm(const vec128_t& v) { + auto addr = ptr[rsp + STASH_OFFSET]; + LoadConstantXmm(xmm0, v); + vmovups(addr, xmm0); + return addr; +} diff --git a/src/alloy/backend/x64/x64_emitter.h b/src/alloy/backend/x64/x64_emitter.h index e006bf3f9..93f859616 100644 --- a/src/alloy/backend/x64/x64_emitter.h +++ b/src/alloy/backend/x64/x64_emitter.h @@ -19,7 +19,9 @@ XEDECLARECLASS2(alloy, hir, HIRBuilder); XEDECLARECLASS2(alloy, hir, Instr); XEDECLARECLASS2(alloy, runtime, DebugInfo); +XEDECLARECLASS2(alloy, runtime, FunctionInfo); XEDECLARECLASS2(alloy, runtime, Runtime); +XEDECLARECLASS2(alloy, runtime, SymbolInfo); namespace alloy { namespace backend { @@ -33,6 +35,25 @@ enum RegisterFlags { REG_ABCD = (1 << 1), }; +enum XmmConst { + XMMZero = 0, + XMMOne = 1, + XMMNegativeOne = 2, + XMMMaskX16Y16 = 3, + XMMFlipX16Y16 = 4, + XMMFixX16Y16 = 5, + XMMNormalizeX16Y16 = 6, + XMM3301 = 7, + XMMSignMaskPS = 8, + XMMSignMaskPD = 9, + XMMByteSwapMask = 10, + XMMPermuteControl15 = 11, + XMMUnpackD3DCOLOR = 12, + XMMOneOver255 = 13, + XMMShiftMaskPS = 14, + XMMOneMask = 15, +}; + // Unfortunately due to the design of xbyak we have to pass this to the ctor. class XbyakAllocator : public Xbyak::Allocator { public: @@ -54,79 +75,68 @@ public: void*& out_code_address, size_t& out_code_size); public: - template - void BeginOp(hir::Value* v0, V0& r0, uint32_t r0_flags) { - SetupReg(v0, r0); - } - template - void BeginOp(hir::Value* v0, V0& r0, uint32_t r0_flags, - hir::Value* v1, V1& r1, uint32_t r1_flags) { - SetupReg(v0, r0); - SetupReg(v1, r1); - } - template - void BeginOp(hir::Value* v0, V0& r0, uint32_t r0_flags, - hir::Value* v1, V1& r1, uint32_t r1_flags, - hir::Value* v2, V2& r2, uint32_t r2_flags) { - SetupReg(v0, r0); - SetupReg(v1, r1); - SetupReg(v2, r2); - } - template - void BeginOp(hir::Value* v0, V0& r0, uint32_t r0_flags, - hir::Value* v1, V1& r1, uint32_t r1_flags, - hir::Value* v2, V2& r2, uint32_t r2_flags, - hir::Value* v3, V3& r3, uint32_t r3_flags) { - SetupReg(v0, r0); - SetupReg(v1, r1); - SetupReg(v2, r2); - SetupReg(v3, r3); - } - template - void EndOp(V0& r0) { - } - template - void EndOp(V0& r0, V1& r1) { - } - template - void EndOp(V0& r0, V1& r1, V2& r2) { - } - template - void EndOp(V0& r0, V1& r1, V2& r2, V3& r3) { - } - // Reserved: rsp // Scratch: rax/rcx/rdx - // xmm0-1 + // xmm0-2 (could be only xmm0 with some trickery) // Available: rbx, r12-r15 (save to get r8-r11, rbp, rsi, rdi?) 
- // xmm6-xmm15 (save to get xmm2-xmm5) + // xmm6-xmm15 (save to get xmm3-xmm5) static const int GPR_COUNT = 5; static const int XMM_COUNT = 10; - static void SetupReg(hir::Value* v, Xbyak::Reg8& r) { + static void SetupReg(const hir::Value* v, Xbyak::Reg8& r) { auto idx = gpr_reg_map_[v->reg.index]; r = Xbyak::Reg8(idx); } - static void SetupReg(hir::Value* v, Xbyak::Reg16& r) { + static void SetupReg(const hir::Value* v, Xbyak::Reg16& r) { auto idx = gpr_reg_map_[v->reg.index]; r = Xbyak::Reg16(idx); } - static void SetupReg(hir::Value* v, Xbyak::Reg32& r) { + static void SetupReg(const hir::Value* v, Xbyak::Reg32& r) { auto idx = gpr_reg_map_[v->reg.index]; r = Xbyak::Reg32(idx); } - static void SetupReg(hir::Value* v, Xbyak::Reg64& r) { + static void SetupReg(const hir::Value* v, Xbyak::Reg64& r) { auto idx = gpr_reg_map_[v->reg.index]; r = Xbyak::Reg64(idx); } - static void SetupReg(hir::Value* v, Xbyak::Xmm& r) { + static void SetupReg(const hir::Value* v, Xbyak::Xmm& r) { auto idx = xmm_reg_map_[v->reg.index]; r = Xbyak::Xmm(idx); } - hir::Instr* Advance(hir::Instr* i); + void MarkSourceOffset(const hir::Instr* i); - void MarkSourceOffset(hir::Instr* i); + void DebugBreak(); + void Trap(); + void UnimplementedInstr(const hir::Instr* i); + void UnimplementedExtern(const hir::Instr* i); + + void Call(const hir::Instr* instr, runtime::FunctionInfo* symbol_info); + void CallIndirect(const hir::Instr* instr, const Xbyak::Reg64& reg); + void CallExtern(const hir::Instr* instr, const runtime::FunctionInfo* symbol_info); + void CallNative(void* fn); + void CallNative(uint64_t(*fn)(void* raw_context)); + void CallNative(uint64_t(*fn)(void* raw_context, uint64_t arg0)); + void CallNative(uint64_t(*fn)(void* raw_context, uint64_t arg0), uint64_t arg0); + void SetReturnAddress(uint64_t value); + void ReloadECX(); + void ReloadEDX(); + + // TODO(benvanik): Label for epilog (don't use strings). + + void LoadEflags(); + void StoreEflags(); + + // Moves a 64bit immediate into memory. + bool ConstantFitsIn32Reg(uint64_t v); + void MovMem64(const Xbyak::RegExp& addr, uint64_t v); + + Xbyak::Address GetXmmConstPtr(XmmConst id); + void LoadConstantXmm(Xbyak::Xmm dest, float v); + void LoadConstantXmm(Xbyak::Xmm dest, double v); + void LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v); + Xbyak::Address StashXmm(const Xbyak::Xmm& r); + Xbyak::Address StashXmm(const vec128_t& v); size_t stack_size() const { return stack_size_; } diff --git a/src/alloy/backend/x64/x64_sequence.inl b/src/alloy/backend/x64/x64_sequence.inl new file mode 100644 index 000000000..ce2b8e36e --- /dev/null +++ b/src/alloy/backend/x64/x64_sequence.inl @@ -0,0 +1,714 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + + +namespace { + +enum KeyType { + KEY_TYPE_X = OPCODE_SIG_TYPE_X, + KEY_TYPE_L = OPCODE_SIG_TYPE_L, + KEY_TYPE_O = OPCODE_SIG_TYPE_O, + KEY_TYPE_S = OPCODE_SIG_TYPE_S, + KEY_TYPE_V_I8 = OPCODE_SIG_TYPE_V + INT8_TYPE, + KEY_TYPE_V_I16 = OPCODE_SIG_TYPE_V + INT16_TYPE, + KEY_TYPE_V_I32 = OPCODE_SIG_TYPE_V + INT32_TYPE, + KEY_TYPE_V_I64 = OPCODE_SIG_TYPE_V + INT64_TYPE, + KEY_TYPE_V_F32 = OPCODE_SIG_TYPE_V + FLOAT32_TYPE, + KEY_TYPE_V_F64 = OPCODE_SIG_TYPE_V + FLOAT64_TYPE, + KEY_TYPE_V_V128 = OPCODE_SIG_TYPE_V + VEC128_TYPE, +}; + +#pragma pack(push, 1) +union InstrKey { + struct { + uint32_t opcode : 8; + uint32_t dest : 5; + uint32_t src1 : 5; + uint32_t src2 : 5; + uint32_t src3 : 5; + uint32_t reserved : 4; + }; + uint32_t value; + + operator uint32_t() const { + return value; + } + + InstrKey() : value(0) {} + InstrKey(uint32_t v) : value(v) {} + InstrKey(const Instr* i) : value(0) { + opcode = i->opcode->num; + uint32_t sig = i->opcode->signature; + dest = GET_OPCODE_SIG_TYPE_DEST(sig) ? OPCODE_SIG_TYPE_V + i->dest->type : 0; + src1 = GET_OPCODE_SIG_TYPE_SRC1(sig); + if (src1 == OPCODE_SIG_TYPE_V) { + src1 += i->src1.value->type; + } + src2 = GET_OPCODE_SIG_TYPE_SRC2(sig); + if (src2 == OPCODE_SIG_TYPE_V) { + src2 += i->src2.value->type; + } + src3 = GET_OPCODE_SIG_TYPE_SRC3(sig); + if (src3 == OPCODE_SIG_TYPE_V) { + src3 += i->src3.value->type; + } + } + + template + struct Construct { + static const uint32_t value = + (OPCODE) | (DEST << 8) | (SRC1 << 13) | (SRC2 << 18) | (SRC3 << 23); + }; +}; +#pragma pack(pop) +static_assert(sizeof(InstrKey) <= 4, "Key must be 4 bytes"); + +template +struct CombinedStruct; +template <> +struct CombinedStruct<> {}; +template +struct CombinedStruct : T, CombinedStruct {}; + +struct OpBase {}; + +template +struct Op : OpBase { + static const KeyType key_type = KEY_TYPE; +}; + +struct VoidOp : Op { +protected: + template friend struct Op; + template friend struct I; + void Load(const Instr::Op& op) {} +}; + +struct OffsetOp : Op { + uint64_t value; +protected: + template friend struct Op; + template friend struct I; + void Load(const Instr::Op& op) { + this->value = op.offset; + } +}; + +struct SymbolOp : Op { + FunctionInfo* value; +protected: + template friend struct Op; + template friend struct I; + bool Load(const Instr::Op& op) { + this->value = op.symbol_info; + return true; + } +}; + +struct LabelOp : Op { + hir::Label* value; +protected: + template friend struct Op; + template friend struct I; + void Load(const Instr::Op& op) { + this->value = op.label; + } +}; + +template +struct ValueOp : Op, KEY_TYPE> { + typedef REG_TYPE reg_type; + static const int tag = TAG; + const Value* value; + bool is_constant; + virtual bool ConstantFitsIn32Reg() const { return true; } + const REG_TYPE& reg() const { + XEASSERT(!is_constant); + return reg_; + } + operator const REG_TYPE&() const { + return reg(); + } + bool IsEqual(const T& b) const { + if (is_constant && b.is_constant) { + return reinterpret_cast(this)->constant() == b.constant(); + } else if (!is_constant && !b.is_constant) { + return reg_.getIdx() == b.reg_.getIdx(); + } else { + return false; + } + } + bool IsEqual(const Xbyak::Reg& b) const { + if (is_constant) { + return false; + } else if (!is_constant) { + return reg_.getIdx() == b.getIdx(); + } else { + return false; + } + } + bool operator== (const T& b) const { + return IsEqual(b); + } + bool operator!= (const T& b) const { + return !IsEqual(b); + 
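+  // Note on the equality tests above: two operands compare equal only when
+  // both are constants with the same value or both live in the same host
+  // register; a constant never aliases a register, which is what lets the
+  // sequences below safely test things like i.dest == i.src1 before deciding
+  // whether a mov is needed.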
} + bool operator== (const Xbyak::Reg& b) const { + return IsEqual(b); + } + bool operator!= (const Xbyak::Reg& b) const { + return !IsEqual(b); + } + void Load(const Instr::Op& op) { + const Value* value = op.value; + this->value = value; + is_constant = value->IsConstant(); + if (!is_constant) { + X64Emitter::SetupReg(value, reg_); + } + } +protected: + REG_TYPE reg_; +}; + +template +struct I8 : ValueOp, KEY_TYPE_V_I8, Reg8, int8_t, TAG> { + const int8_t constant() const { + XEASSERT(is_constant); + return value->constant.i8; + } +}; +template +struct I16 : ValueOp, KEY_TYPE_V_I16, Reg16, int16_t, TAG> { + const int16_t constant() const { + XEASSERT(is_constant); + return value->constant.i16; + } +}; +template +struct I32 : ValueOp, KEY_TYPE_V_I32, Reg32, int32_t, TAG> { + const int32_t constant() const { + XEASSERT(is_constant); + return value->constant.i32; + } +}; +template +struct I64 : ValueOp, KEY_TYPE_V_I64, Reg64, int64_t, TAG> { + const int64_t constant() const { + XEASSERT(is_constant); + return value->constant.i64; + } + bool ConstantFitsIn32Reg() const override { + int64_t v = value->constant.i64; + if ((v & ~0x7FFFFFFF) == 0) { + // Fits under 31 bits, so just load using normal mov. + return true; + } else if ((v & ~0x7FFFFFFF) == ~0x7FFFFFFF) { + // Negative number that fits in 32bits. + return true; + } + return false; + } +}; +template +struct F32 : ValueOp, KEY_TYPE_V_F32, Xmm, float, TAG> { + const float constant() const { + XEASSERT(is_constant); + return value->constant.f32; + } +}; +template +struct F64 : ValueOp, KEY_TYPE_V_F64, Xmm, double, TAG> { + const double constant() const { + XEASSERT(is_constant); + return value->constant.f64; + } +}; +template +struct V128 : ValueOp, KEY_TYPE_V_V128, Xmm, vec128_t, TAG> { + const vec128_t& constant() const { + XEASSERT(is_constant); + return value->constant.v128; + } +}; + +struct TagTable { + struct { + bool valid; + Instr::Op op; + } table[16]; + + template ::type* = nullptr> + bool CheckTag(const Instr::Op& op) { + return true; + } + template ::type* = nullptr> + bool CheckTag(const Instr::Op& op) { + return true; + } + template ::type* = nullptr> + bool CheckTag(const Instr::Op& op) { + return true; + } + template ::type* = nullptr> + bool CheckTag(const Instr::Op& op) { + return true; + } + template = KEY_TYPE_V_I8>::type* = nullptr> + bool CheckTag(const Instr::Op& op) { + const Value* value = op.value; + if (T::tag == -1) { + return true; + } + if (table[T::tag].valid && + table[T::tag].op.value != value) { + return false; + } + table[T::tag].valid = true; + table[T::tag].op.value = (Value*)value; + return true; + } +}; + +template +struct DestField; +template +struct DestField { + DEST dest; +protected: + bool LoadDest(const Instr* i, TagTable& tag_table) { + Instr::Op op; + op.value = i->dest; + if (tag_table.CheckTag(op)) { + dest.Load(op); + return true; + } + return false; + } +}; +template <> +struct DestField { +protected: + bool LoadDest(const Instr* i, TagTable& tag_table) { + return true; + } +}; + +template +struct I; +template +struct I : DestField { + static const hir::Opcode opcode = OPCODE; + static const uint32_t key = InstrKey::Construct::value; + static const KeyType dest_type = DEST::key_type; + const Instr* instr; +protected: + template friend struct SequenceFields; + bool Load(const Instr* i, TagTable& tag_table) { + if (InstrKey(i).value == key && + LoadDest(i, tag_table)) { + instr = i; + return true; + } + return false; + } +}; +template +struct I : DestField { + static const hir::Opcode 
opcode = OPCODE; + static const uint32_t key = InstrKey::Construct::value; + static const KeyType dest_type = DEST::key_type; + static const KeyType src1_type = SRC1::key_type; + const Instr* instr; + SRC1 src1; +protected: + template friend struct SequenceFields; + bool Load(const Instr* i, TagTable& tag_table) { + if (InstrKey(i).value == key && + LoadDest(i, tag_table) && + tag_table.CheckTag(i->src1)) { + instr = i; + src1.Load(i->src1); + return true; + } + return false; + } +}; +template +struct I : DestField { + static const hir::Opcode opcode = OPCODE; + static const uint32_t key = InstrKey::Construct::value; + static const KeyType dest_type = DEST::key_type; + static const KeyType src1_type = SRC1::key_type; + static const KeyType src2_type = SRC2::key_type; + const Instr* instr; + SRC1 src1; + SRC2 src2; +protected: + template friend struct SequenceFields; + bool Load(const Instr* i, TagTable& tag_table) { + if (InstrKey(i).value == key && + LoadDest(i, tag_table) && + tag_table.CheckTag(i->src1) && + tag_table.CheckTag(i->src2)) { + instr = i; + src1.Load(i->src1); + src2.Load(i->src2); + return true; + } + return false; + } +}; +template +struct I : DestField { + static const hir::Opcode opcode = OPCODE; + static const uint32_t key = InstrKey::Construct::value; + static const KeyType dest_type = DEST::key_type; + static const KeyType src1_type = SRC1::key_type; + static const KeyType src2_type = SRC2::key_type; + static const KeyType src3_type = SRC3::key_type; + const Instr* instr; + SRC1 src1; + SRC2 src2; + SRC3 src3; +protected: + template friend struct SequenceFields; + bool Load(const Instr* i, TagTable& tag_table) { + if (InstrKey(i).value == key && + LoadDest(i, tag_table) && + tag_table.CheckTag(i->src1) && + tag_table.CheckTag(i->src2) && + tag_table.CheckTag(i->src3)) { + instr = i; + src1.Load(i->src1); + src2.Load(i->src2); + src3.Load(i->src3); + return true; + } + return false; + } +}; + +template +struct SequenceFields; +template +struct SequenceFields { + I1 i1; + typedef typename I1 I1Type; +protected: + template friend struct Sequence; + bool Check(const Instr* i, TagTable& tag_table, const Instr** new_tail) { + if (i1.Load(i, tag_table)) { + *new_tail = i->next; + return true; + } + return false; + } +}; +template +struct SequenceFields : SequenceFields { + I2 i2; +protected: + template friend struct Sequence; + bool Check(const Instr* i, TagTable& tag_table, const Instr** new_tail) { + if (SequenceFields::Check(i, tag_table, new_tail)) { + auto ni = i->next; + if (ni && i2.Load(ni, tag_table)) { + *new_tail = ni; + return i; + } + } + return false; + } +}; +template +struct SequenceFields : SequenceFields { + I3 i3; +protected: + template friend struct Sequence; + bool Check(const Instr* i, TagTable& tag_table, const Instr** new_tail) { + if (SequenceFields::Check(i, tag_table, new_tail)) { + auto ni = i->next; + if (ni && i3.Load(ni, tag_table)) { + *new_tail = ni; + return i; + } + } + return false; + } +}; +template +struct SequenceFields : SequenceFields { + I4 i4; +protected: + template friend struct Sequence; + bool Check(const Instr* i, TagTable& tag_table, const Instr** new_tail) { + if (SequenceFields::Check(i, tag_table, new_tail)) { + auto ni = i->next; + if (ni && i4.Load(ni, tag_table)) { + *new_tail = ni; + return i; + } + } + return false; + } +}; +template +struct SequenceFields : SequenceFields { + I5 i5; +protected: + template friend struct Sequence; + bool Check(const Instr* i, TagTable& tag_table, const Instr** new_tail) { + if 
(SequenceFields::Check(i, tag_table, new_tail)) { + auto ni = i->next; + if (ni && i5.Load(ni, tag_table)) { + *new_tail = ni; + return i; + } + } + return false; + } +}; + +template +struct Sequence { + struct EmitArgs : SequenceFields {}; + + static bool Select(X64Emitter& e, const Instr* i, const Instr** new_tail) { + EmitArgs args; + TagTable tag_table; + if (!args.Check(i, tag_table, new_tail)) { + return false; + } + SEQ::Emit(e, args); + return true; + } +}; + +template +const T GetTempReg(X64Emitter& e); +template <> +const Reg8 GetTempReg(X64Emitter& e) { + return e.al; +} +template <> +const Reg16 GetTempReg(X64Emitter& e) { + return e.ax; +} +template <> +const Reg32 GetTempReg(X64Emitter& e) { + return e.eax; +} +template <> +const Reg64 GetTempReg(X64Emitter& e) { + return e.rax; +} + +template +struct SingleSequence : public Sequence, T> { + typedef T EmitArgType; + static const uint32_t head_key = T::key; + static void Emit(X64Emitter& e, const EmitArgs& _) { + SEQ::Emit(e, _.i1); + } + + template + static void EmitUnaryOp( + X64Emitter& e, const EmitArgType& i, + const REG_FN& reg_fn) { + if (i.src1.is_constant) { + e.mov(i.dest, i.src1.constant()); + reg_fn(e, i.dest); + } else { + if (i.dest != i.src1) { + e.mov(i.dest, i.src1); + } + reg_fn(e, i.dest); + } + } + + template + static void EmitCommutativeBinaryOp( + X64Emitter& e, const EmitArgType& i, + const REG_REG_FN& reg_reg_fn, const REG_CONST_FN& reg_const_fn) { + if (i.src1.is_constant) { + XEASSERT(!i.src2.is_constant); + if (i.dest == i.src2) { + if (i.src1.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, static_cast(i.src1.constant())); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src1.constant()); + reg_reg_fn(e, i.dest, temp); + } + } else { + e.mov(i.dest, i.src1.constant()); + reg_reg_fn(e, i.dest, i.src2); + } + } else if (i.src2.is_constant) { + if (i.dest == i.src1) { + if (i.src2.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, static_cast(i.src2.constant())); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src2.constant()); + reg_reg_fn(e, i.dest, temp); + } + } else { + e.mov(i.dest, i.src2.constant()); + reg_reg_fn(e, i.dest, i.src1); + } + } else { + if (i.dest == i.src1) { + reg_reg_fn(e, i.dest, i.src2); + } else if (i.dest == i.src2) { + reg_reg_fn(e, i.dest, i.src1); + } else { + e.mov(i.dest, i.src1); + reg_reg_fn(e, i.dest, i.src2); + } + } + } + template + static void EmitAssociativeBinaryOp( + X64Emitter& e, const EmitArgType& i, + const REG_REG_FN& reg_reg_fn, const REG_CONST_FN& reg_const_fn) { + if (i.src1.is_constant) { + XEASSERT(!i.src2.is_constant); + if (i.dest == i.src2) { + auto temp = GetTempReg(e); + e.mov(temp, i.src2); + e.mov(i.dest, i.src1.constant()); + reg_reg_fn(e, i.dest, temp); + } else { + e.mov(i.dest, i.src1.constant()); + reg_reg_fn(e, i.dest, i.src2); + } + } else if (i.src2.is_constant) { + if (i.dest == i.src1) { + if (i.src2.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, static_cast(i.src2.constant())); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src2.constant()); + reg_reg_fn(e, i.dest, temp); + } + } else { + e.mov(i.dest, i.src1); + if (i.src2.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, static_cast(i.src2.constant())); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src2.constant()); + reg_reg_fn(e, i.dest, temp); + } + } + } else { + if (i.dest == i.src1) { + reg_reg_fn(e, i.dest, i.src2); + } else if (i.dest == i.src2) { + auto temp = GetTempReg(e); + e.mov(temp, i.src2); + e.mov(i.dest, i.src1); + 
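+        // Ordering matters here: dest aliases src2, so src2 was saved to the
+        // temp before dest was overwritten with src1; the callback below then
+        // computes dest = src1 <op> original src2.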
reg_reg_fn(e, i.dest, temp); + } else { + e.mov(i.dest, i.src1); + reg_reg_fn(e, i.dest, i.src2); + } + } + } + + template + static void EmitCommutativeCompareOp( + X64Emitter& e, const EmitArgType& i, + const REG_REG_FN& reg_reg_fn, const REG_CONST_FN& reg_const_fn) { + if (i.src1.is_constant) { + XEASSERT(!i.src2.is_constant); + if (i.src1.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.src2, static_cast(i.src1.constant())); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src1.constant()); + reg_reg_fn(e, i.src2, temp); + } + } else if (i.src2.is_constant) { + if (i.src2.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.src1, static_cast(i.src2.constant())); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src2.constant()); + reg_reg_fn(e, i.src1, temp); + } + } else { + reg_reg_fn(e, i.src1, i.src2); + } + } + template + static void EmitAssociativeCompareOp( + X64Emitter& e, const EmitArgType& i, + const REG_REG_FN& reg_reg_fn, const REG_CONST_FN& reg_const_fn) { + if (i.src1.is_constant) { + XEASSERT(!i.src2.is_constant); + if (i.src1.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, i.src2, static_cast(i.src1.constant()), true); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src1.constant()); + reg_reg_fn(e, i.dest, i.src2, temp, true); + } + } else if (i.src2.is_constant) { + if (i.src2.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, i.src1, static_cast(i.src2.constant()), false); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src2.constant()); + reg_reg_fn(e, i.dest, i.src1, temp, false); + } + } else { + reg_reg_fn(e, i.dest, i.src1, i.src2, false); + } + } +}; + +static const int ANY = -1; +typedef int tag_t; +static const tag_t TAG0 = 0; +static const tag_t TAG1 = 1; +static const tag_t TAG2 = 2; +static const tag_t TAG3 = 3; +static const tag_t TAG4 = 4; +static const tag_t TAG5 = 5; +static const tag_t TAG6 = 6; +static const tag_t TAG7 = 7; + +typedef bool (*SequenceSelectFn)(X64Emitter&, const Instr*, const Instr**); + +template +void Register() { + sequence_table.insert({ T::head_key, T::Select }); +} +template +void Register() { + Register(); + Register(); +}; +#define EMITTER_OPCODE_TABLE(name, ...) \ + void Register_##name() { \ + Register<__VA_ARGS__>(); \ + } + +#define MATCH(...) __VA_ARGS__ +#define EMITTER(name, match) struct name : SingleSequence +#define SEQUENCE(name, match) struct name : Sequence + +} // namespace diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc new file mode 100644 index 000000000..a48df3db5 --- /dev/null +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -0,0 +1,4488 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +// A note about vectors: +// Alloy represents vectors as xyzw pairs, with indices 0123. +// XMM registers are xyzw pairs with indices 3210, making them more like wzyx. +// This makes things somewhat confusing. It'd be nice to just shuffle the +// registers around on load/store, however certain operations require that +// data be in the right offset. 
+// Basically, this identity must hold: +// shuffle(vec, b00011011) -> {x,y,z,w} => {x,y,z,w} +// All indices and operations must respect that. +// +// Memory (big endian): +// [00 01 02 03] [04 05 06 07] [08 09 0A 0B] [0C 0D 0E 0F] (x, y, z, w) +// load into xmm register: +// [0F 0E 0D 0C] [0B 0A 09 08] [07 06 05 04] [03 02 01 00] (w, z, y, x) + +#include + +#include +#include +#include +#include + +// TODO(benvanik): reimplement packing functions +#include + +using namespace alloy; +using namespace alloy::backend; +using namespace alloy::backend::x64; +using namespace alloy::hir; +using namespace alloy::runtime; + +using namespace Xbyak; + +// Utilities/types used only in this file: +#include + +namespace { +static std::unordered_multimap sequence_table; +} // namespace + + +// ============================================================================ +// OPCODE_COMMENT +// ============================================================================ +EMITTER(COMMENT, MATCH(I)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (IsTracingInstr()) { + auto str = reinterpret_cast(i.src1.value); + // TODO(benvanik): pass through. + // TODO(benvanik): don't just leak this memory. + auto str_copy = xestrdupa(str); + e.mov(e.rdx, reinterpret_cast(str_copy)); + e.CallNative(TraceString); + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_COMMENT, + COMMENT); + + +// ============================================================================ +// OPCODE_NOP +// ============================================================================ +EMITTER(NOP, MATCH(I)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.nop(); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_NOP, + NOP); + + +// ============================================================================ +// OPCODE_SOURCE_OFFSET +// ============================================================================ +EMITTER(SOURCE_OFFSET, MATCH(I)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { +#if XE_DEBUG + e.nop(); + e.nop(); + e.mov(e.eax, (uint32_t)i.src1.value); + e.nop(); + e.nop(); +#endif // XE_DEBUG + e.MarkSourceOffset(i.instr); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_SOURCE_OFFSET, + SOURCE_OFFSET); + + +// ============================================================================ +// OPCODE_DEBUG_BREAK +// ============================================================================ +EMITTER(DEBUG_BREAK, MATCH(I)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.DebugBreak(); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_DEBUG_BREAK, + DEBUG_BREAK); + + +// ============================================================================ +// OPCODE_DEBUG_BREAK_TRUE +// ============================================================================ +EMITTER(DEBUG_BREAK_TRUE_I8, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.DebugBreak(); + e.L(skip); + } +}; +EMITTER(DEBUG_BREAK_TRUE_I16, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.DebugBreak(); + e.L(skip); + } +}; +EMITTER(DEBUG_BREAK_TRUE_I32, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.DebugBreak(); + e.L(skip); + } +}; +EMITTER(DEBUG_BREAK_TRUE_I64, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + 
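+    // (test r,r sets ZF exactly when the value is zero, so the jz above skips
+    // the break whenever the condition is false; every *_TRUE emitter in this
+    // file uses the same test-and-skip shape.)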
e.DebugBreak(); + e.L(skip); + } +}; +EMITTER(DEBUG_BREAK_TRUE_F32, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.DebugBreak(); + e.L(skip); + } +}; +EMITTER(DEBUG_BREAK_TRUE_F64, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.DebugBreak(); + e.L(skip); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_DEBUG_BREAK_TRUE, + DEBUG_BREAK_TRUE_I8, + DEBUG_BREAK_TRUE_I16, + DEBUG_BREAK_TRUE_I32, + DEBUG_BREAK_TRUE_I64, + DEBUG_BREAK_TRUE_F32, + DEBUG_BREAK_TRUE_F64); + + +// ============================================================================ +// OPCODE_TRAP +// ============================================================================ +EMITTER(TRAP, MATCH(I)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.Trap(); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_TRAP, + TRAP); + + +// ============================================================================ +// OPCODE_TRAP_TRUE +// ============================================================================ +EMITTER(TRAP_TRUE_I8, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Trap(); + e.L(skip); + } +}; +EMITTER(TRAP_TRUE_I16, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Trap(); + e.L(skip); + } +}; +EMITTER(TRAP_TRUE_I32, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Trap(); + e.L(skip); + } +}; +EMITTER(TRAP_TRUE_I64, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Trap(); + e.L(skip); + } +}; +EMITTER(TRAP_TRUE_F32, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Trap(); + e.L(skip); + } +}; +EMITTER(TRAP_TRUE_F64, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Trap(); + e.L(skip); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_TRAP_TRUE, + TRAP_TRUE_I8, + TRAP_TRUE_I16, + TRAP_TRUE_I32, + TRAP_TRUE_I64, + TRAP_TRUE_F32, + TRAP_TRUE_F64); + + +// ============================================================================ +// OPCODE_CALL +// ============================================================================ +EMITTER(CALL, MATCH(I)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.Call(i.instr, i.src1.value); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_CALL, + CALL); + + +// ============================================================================ +// OPCODE_CALL_TRUE +// ============================================================================ +EMITTER(CALL_TRUE_I8, MATCH(I, SymbolOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Call(i.instr, i.src2.value); + e.L(skip); + } +}; +EMITTER(CALL_TRUE_I16, MATCH(I, SymbolOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Call(i.instr, i.src2.value); + e.L(skip); + } +}; +EMITTER(CALL_TRUE_I32, MATCH(I, SymbolOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); 
+ Xbyak::Label skip; + e.jz(skip); + e.Call(i.instr, i.src2.value); + e.L(skip); + } +}; +EMITTER(CALL_TRUE_I64, MATCH(I, SymbolOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Call(i.instr, i.src2.value); + e.L(skip); + } +}; +EMITTER(CALL_TRUE_F32, MATCH(I, SymbolOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Call(i.instr, i.src2.value); + e.L(skip); + } +}; +EMITTER(CALL_TRUE_F64, MATCH(I, SymbolOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Call(i.instr, i.src2.value); + e.L(skip); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_CALL_TRUE, + CALL_TRUE_I8, + CALL_TRUE_I16, + CALL_TRUE_I32, + CALL_TRUE_I64, + CALL_TRUE_F32, + CALL_TRUE_F64); + + +// ============================================================================ +// OPCODE_CALL_INDIRECT +// ============================================================================ +EMITTER(CALL_INDIRECT, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.CallIndirect(i.instr, i.src1); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_CALL_INDIRECT, + CALL_INDIRECT); + + +// ============================================================================ +// OPCODE_CALL_INDIRECT_TRUE +// ============================================================================ +EMITTER(CALL_INDIRECT_TRUE_I8, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.CallIndirect(i.instr, i.src2); + e.L(skip); + } +}; +EMITTER(CALL_INDIRECT_TRUE_I16, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.CallIndirect(i.instr, i.src2); + e.L(skip); + } +}; +EMITTER(CALL_INDIRECT_TRUE_I32, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.CallIndirect(i.instr, i.src2); + e.L(skip); + } +}; +EMITTER(CALL_INDIRECT_TRUE_I64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.CallIndirect(i.instr, i.src2); + e.L(skip); + } +}; +EMITTER(CALL_INDIRECT_TRUE_F32, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.CallIndirect(i.instr, i.src2); + e.L(skip); + } +}; +EMITTER(CALL_INDIRECT_TRUE_F64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.CallIndirect(i.instr, i.src2); + e.L(skip); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_CALL_INDIRECT_TRUE, + CALL_INDIRECT_TRUE_I8, + CALL_INDIRECT_TRUE_I16, + CALL_INDIRECT_TRUE_I32, + CALL_INDIRECT_TRUE_I64, + CALL_INDIRECT_TRUE_F32, + CALL_INDIRECT_TRUE_F64); + + +// ============================================================================ +// OPCODE_CALL_EXTERN +// ============================================================================ +EMITTER(CALL_EXTERN, MATCH(I)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.CallExtern(i.instr, i.src1.value); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_CALL_EXTERN, + CALL_EXTERN); + + +// ============================================================================ +// 
OPCODE_RETURN +// ============================================================================ +EMITTER(RETURN, MATCH(I)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // If this is the last instruction in the last block, just let us + // fall through. + if (i.instr->next || i.instr->block->next) { + e.jmp("epilog", CodeGenerator::T_NEAR); + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_RETURN, + RETURN); + + +// ============================================================================ +// OPCODE_RETURN_TRUE +// ============================================================================ +EMITTER(RETURN_TRUE_I8, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz("epilog", CodeGenerator::T_NEAR); + } +}; +EMITTER(RETURN_TRUE_I16, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz("epilog", CodeGenerator::T_NEAR); + } +}; +EMITTER(RETURN_TRUE_I32, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz("epilog", CodeGenerator::T_NEAR); + } +}; +EMITTER(RETURN_TRUE_I64, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz("epilog", CodeGenerator::T_NEAR); + } +}; +EMITTER(RETURN_TRUE_F32, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.jnz("epilog", CodeGenerator::T_NEAR); + } +}; +EMITTER(RETURN_TRUE_F64, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.jnz("epilog", CodeGenerator::T_NEAR); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_RETURN_TRUE, + RETURN_TRUE_I8, + RETURN_TRUE_I16, + RETURN_TRUE_I32, + RETURN_TRUE_I64, + RETURN_TRUE_F32, + RETURN_TRUE_F64); + + +// ============================================================================ +// OPCODE_SET_RETURN_ADDRESS +// ============================================================================ +EMITTER(SET_RETURN_ADDRESS, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.SetReturnAddress(i.src1.constant()); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_SET_RETURN_ADDRESS, + SET_RETURN_ADDRESS); + + +// ============================================================================ +// OPCODE_BRANCH +// ============================================================================ +EMITTER(BRANCH, MATCH(I)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.jmp(i.src1.value->name, e.T_NEAR); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_BRANCH, + BRANCH); + + +// ============================================================================ +// OPCODE_BRANCH_TRUE +// ============================================================================ +EMITTER(BRANCH_TRUE_I8, MATCH(I, LabelOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz(i.src2.value->name, e.T_NEAR); + } +}; +EMITTER(BRANCH_TRUE_I16, MATCH(I, LabelOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz(i.src2.value->name, e.T_NEAR); + } +}; +EMITTER(BRANCH_TRUE_I32, MATCH(I, LabelOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz(i.src2.value->name, e.T_NEAR); + } +}; +EMITTER(BRANCH_TRUE_I64, MATCH(I, LabelOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz(i.src2.value->name, e.T_NEAR); + } +}; +EMITTER(BRANCH_TRUE_F32, MATCH(I, LabelOp>)) { + 
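+  // Float/vector truth tests in these emitters use vptest, which sets ZF only
+  // when every bit of the register is zero; note that negative zero (sign bit
+  // set) therefore still counts as true.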
static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.jnz(i.src2.value->name, e.T_NEAR); + } +}; +EMITTER(BRANCH_TRUE_F64, MATCH(I, LabelOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.jnz(i.src2.value->name, e.T_NEAR); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_BRANCH_TRUE, + BRANCH_TRUE_I8, + BRANCH_TRUE_I16, + BRANCH_TRUE_I32, + BRANCH_TRUE_I64, + BRANCH_TRUE_F32, + BRANCH_TRUE_F64); + + +// ============================================================================ +// OPCODE_BRANCH_FALSE +// ============================================================================ +EMITTER(BRANCH_FALSE_I8, MATCH(I, LabelOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jz(i.src2.value->name, e.T_NEAR); + } +}; +EMITTER(BRANCH_FALSE_I16, MATCH(I, LabelOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jz(i.src2.value->name, e.T_NEAR); + } +}; +EMITTER(BRANCH_FALSE_I32, MATCH(I, LabelOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jz(i.src2.value->name, e.T_NEAR); + } +}; +EMITTER(BRANCH_FALSE_I64, MATCH(I, LabelOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jz(i.src2.value->name, e.T_NEAR); + } +}; +EMITTER(BRANCH_FALSE_F32, MATCH(I, LabelOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.jz(i.src2.value->name, e.T_NEAR); + } +}; +EMITTER(BRANCH_FALSE_F64, MATCH(I, LabelOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.jz(i.src2.value->name, e.T_NEAR); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_BRANCH_FALSE, + BRANCH_FALSE_I8, + BRANCH_FALSE_I16, + BRANCH_FALSE_I32, + BRANCH_FALSE_I64, + BRANCH_FALSE_F32, + BRANCH_FALSE_F64); + + +// ============================================================================ +// OPCODE_ASSIGN +// ============================================================================ +EMITTER(ASSIGN_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(i.dest, i.src1); + } +}; +EMITTER(ASSIGN_I16, MATCH(I, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(i.dest, i.src1); + } +}; +EMITTER(ASSIGN_I32, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(i.dest, i.src1); + } +}; +EMITTER(ASSIGN_I64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(i.dest, i.src1); + } +}; +EMITTER(ASSIGN_F32, MATCH(I, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vmovaps(i.dest, i.src1); + } +}; +EMITTER(ASSIGN_F64, MATCH(I, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vmovaps(i.dest, i.src1); + } +}; +EMITTER(ASSIGN_V128, MATCH(I, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vmovaps(i.dest, i.src1); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_ASSIGN, + ASSIGN_I8, + ASSIGN_I16, + ASSIGN_I32, + ASSIGN_I64, + ASSIGN_F32, + ASSIGN_F64, + ASSIGN_V128); + + +// ============================================================================ +// OPCODE_CAST +// ============================================================================ +EMITTER(CAST_I32_F32, MATCH(I, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vmovd(i.dest, i.src1); + } +}; +EMITTER(CAST_I64_F64, MATCH(I, F64<>>)) { + static void Emit(X64Emitter& 
e, const EmitArgType& i) { + e.vmovq(i.dest, i.src1); + } +}; +EMITTER(CAST_F32_I32, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vmovd(i.dest, i.src1); + } +}; +EMITTER(CAST_F64_I64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vmovq(i.dest, i.src1); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_CAST, + CAST_I32_F32, + CAST_I64_F64, + CAST_F32_I32, + CAST_F64_I64); + + +// ============================================================================ +// OPCODE_ZERO_EXTEND +// ============================================================================ +EMITTER(ZERO_EXTEND_I16_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.movzx(i.dest, i.src1); + } +}; +EMITTER(ZERO_EXTEND_I32_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.movzx(i.dest, i.src1); + } +}; +EMITTER(ZERO_EXTEND_I64_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.movzx(i.dest, i.src1); + } +}; +EMITTER(ZERO_EXTEND_I32_I16, MATCH(I, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.movzx(i.dest, i.src1); + } +}; +EMITTER(ZERO_EXTEND_I64_I16, MATCH(I, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.movzx(i.dest, i.src1); + } +}; +EMITTER(ZERO_EXTEND_I64_I32, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(i.dest.reg().cvt32(), i.src1); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_ZERO_EXTEND, + ZERO_EXTEND_I16_I8, + ZERO_EXTEND_I32_I8, + ZERO_EXTEND_I64_I8, + ZERO_EXTEND_I32_I16, + ZERO_EXTEND_I64_I16, + ZERO_EXTEND_I64_I32); + + +// ============================================================================ +// OPCODE_SIGN_EXTEND +// ============================================================================ +EMITTER(SIGN_EXTEND_I16_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.movsx(i.dest, i.src1); + } +}; +EMITTER(SIGN_EXTEND_I32_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.movsx(i.dest, i.src1); + } +}; +EMITTER(SIGN_EXTEND_I64_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.movsx(i.dest, i.src1); + } +}; +EMITTER(SIGN_EXTEND_I32_I16, MATCH(I, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.movsx(i.dest, i.src1); + } +}; +EMITTER(SIGN_EXTEND_I64_I16, MATCH(I, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.movsx(i.dest, i.src1); + } +}; +EMITTER(SIGN_EXTEND_I64_I32, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.movsxd(i.dest, i.src1); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_SIGN_EXTEND, + SIGN_EXTEND_I16_I8, + SIGN_EXTEND_I32_I8, + SIGN_EXTEND_I64_I8, + SIGN_EXTEND_I32_I16, + SIGN_EXTEND_I64_I16, + SIGN_EXTEND_I64_I32); + + +// ============================================================================ +// OPCODE_TRUNCATE +// ============================================================================ +EMITTER(TRUNCATE_I8_I16, MATCH(I, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.movzx(i.dest.reg().cvt32(), i.src1.reg().cvt8()); + } +}; +EMITTER(TRUNCATE_I8_I32, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.movzx(i.dest.reg().cvt32(), i.src1.reg().cvt8()); + } +}; +EMITTER(TRUNCATE_I8_I64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.movzx(i.dest.reg().cvt32(), 
i.src1.reg().cvt8()); + } +}; +EMITTER(TRUNCATE_I16_I32, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.movzx(i.dest.reg().cvt32(), i.src1.reg().cvt16()); + } +}; +EMITTER(TRUNCATE_I16_I64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.movzx(i.dest.reg().cvt32(), i.src1.reg().cvt16()); + } +}; +EMITTER(TRUNCATE_I32_I64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(i.dest, i.src1.reg().cvt32()); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_TRUNCATE, + TRUNCATE_I8_I16, + TRUNCATE_I8_I32, + TRUNCATE_I8_I64, + TRUNCATE_I16_I32, + TRUNCATE_I16_I64, + TRUNCATE_I32_I64); + + +// ============================================================================ +// OPCODE_CONVERT +// ============================================================================ +EMITTER(CONVERT_I32_F32, MATCH(I, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): saturation check? cvtt* (trunc?) + e.vcvtss2si(i.dest, i.src1); + } +}; +EMITTER(CONVERT_I32_F64, MATCH(I, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): saturation check? cvtt* (trunc?) + e.vcvttsd2si(i.dest, i.src1); + } +}; +EMITTER(CONVERT_I64_F64, MATCH(I, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): saturation check? cvtt* (trunc?) + e.vcvttsd2si(i.dest, i.src1); + } +}; +EMITTER(CONVERT_F32_I32, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): saturation check? cvtt* (trunc?) + e.vcvtsi2ss(i.dest, i.src1); + } +}; +EMITTER(CONVERT_F32_F64, MATCH(I, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): saturation check? cvtt* (trunc?) + e.vcvtsd2ss(i.dest, i.src1); + } +}; +EMITTER(CONVERT_F64_I64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): saturation check? cvtt* (trunc?) 
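+    // vcvtsi2sd is exact only while the signed 64-bit value fits in the
+    // 53-bit double mantissa; larger magnitudes round per the current MXCSR
+    // rounding mode.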
+ e.vcvtsi2sd(i.dest, i.src1); + } +}; +EMITTER(CONVERT_F64_F32, MATCH(I, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vcvtss2sd(i.dest, i.src1); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_CONVERT, + CONVERT_I32_F32, + CONVERT_I32_F64, + CONVERT_I64_F64, + CONVERT_F32_I32, + CONVERT_F32_F64, + CONVERT_F64_I64, + CONVERT_F64_F32); + + +// ============================================================================ +// OPCODE_ROUND +// ============================================================================ +EMITTER(ROUND_F32, MATCH(I, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case ROUND_TO_ZERO: + e.vroundss(i.dest, i.src1, B00000011); + break; + case ROUND_TO_NEAREST: + e.vroundss(i.dest, i.src1, B00000000); + break; + case ROUND_TO_MINUS_INFINITY: + e.vroundss(i.dest, i.src1, B00000001); + break; + case ROUND_TO_POSITIVE_INFINITY: + e.vroundss(i.dest, i.src1, B00000010); + break; + } + } +}; +EMITTER(ROUND_F64, MATCH(I, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case ROUND_TO_ZERO: + e.vroundsd(i.dest, i.src1, B00000011); + break; + case ROUND_TO_NEAREST: + e.vroundsd(i.dest, i.src1, B00000000); + break; + case ROUND_TO_MINUS_INFINITY: + e.vroundsd(i.dest, i.src1, B00000001); + break; + case ROUND_TO_POSITIVE_INFINITY: + e.vroundsd(i.dest, i.src1, B00000010); + break; + } + } +}; +EMITTER(ROUND_V128, MATCH(I, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case ROUND_TO_ZERO: + e.vroundps(i.dest, i.src1, B00000011); + break; + case ROUND_TO_NEAREST: + e.vroundps(i.dest, i.src1, B00000000); + break; + case ROUND_TO_MINUS_INFINITY: + e.vroundps(i.dest, i.src1, B00000001); + break; + case ROUND_TO_POSITIVE_INFINITY: + e.vroundps(i.dest, i.src1, B00000010); + break; + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_ROUND, + ROUND_F32, + ROUND_F64, + ROUND_V128); + + +// ============================================================================ +// OPCODE_VECTOR_CONVERT_I2F +// ============================================================================ +EMITTER(VECTOR_CONVERT_I2F, MATCH(I, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // flags = ARITHMETIC_UNSIGNED + // TODO(benvanik): are these really the same? VC++ thinks so. + e.vcvtdq2ps(i.dest, i.src1); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_VECTOR_CONVERT_I2F, + VECTOR_CONVERT_I2F); + + +// ============================================================================ +// OPCODE_VECTOR_CONVERT_F2I +// ============================================================================ +EMITTER(VECTOR_CONVERT_F2I, MATCH(I, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // flags = ARITHMETIC_UNSIGNED | ARITHMETIC_UNSIGNED + // TODO(benvanik): are these really the same? VC++ thinks so. + e.vcvttps2dq(i.dest, i.src1); + if (i.instr->flags & ARITHMETIC_SATURATE) { + // TODO(benvanik): check saturation. 
+ e.UnimplementedInstr(i.instr); + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_VECTOR_CONVERT_F2I, + VECTOR_CONVERT_F2I); + + +// ============================================================================ +// OPCODE_LOAD_VECTOR_SHL +// ============================================================================ +static vec128_t lvsl_table[17] = { + vec128b( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), + vec128b( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), + vec128b( 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17), + vec128b( 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18), + vec128b( 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19), + vec128b( 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), + vec128b( 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21), + vec128b( 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22), + vec128b( 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23), + vec128b( 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24), + vec128b(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25), + vec128b(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), + vec128b(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27), + vec128b(13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28), + vec128b(14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29), + vec128b(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30), + vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), +}; +EMITTER(LOAD_VECTOR_SHL_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + auto sh = i.src1.constant(); + XEASSERT(sh < XECOUNT(lvsl_table)); + e.mov(e.rax, (uintptr_t)&lvsl_table[sh]); + e.vmovaps(i.dest, e.ptr[e.rax]); + } else { +#if XE_DEBUG + // We should only ever be getting values in [0,16]. Assert that. + Xbyak::Label skip; + e.cmp(i.src1, 17); + e.jb(skip); + e.Trap(); + e.L(skip); +#endif // XE_DEBUG + // TODO(benvanik): find a cheaper way of doing this. 
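+      // The lookup below computes &lvsl_table[src1] by hand: entries are
+      // 16 bytes each, so the zero-extended index is scaled with shl 4; rdx
+      // is clobbered as the index register, hence the ReloadEDX() afterward.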
+ e.movzx(e.rdx, i.src1); + e.shl(e.rdx, 4); + e.mov(e.rax, (uintptr_t)lvsl_table); + e.vmovaps(i.dest, e.ptr[e.rax + e.rdx]); + e.ReloadEDX(); + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_LOAD_VECTOR_SHL, + LOAD_VECTOR_SHL_I8); + + +// ============================================================================ +// OPCODE_LOAD_VECTOR_SHR +// ============================================================================ +static vec128_t lvsr_table[17] = { + vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), + vec128b(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30), + vec128b(14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29), + vec128b(13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28), + vec128b(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27), + vec128b(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), + vec128b(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25), + vec128b( 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24), + vec128b( 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23), + vec128b( 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22), + vec128b( 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21), + vec128b( 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), + vec128b( 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19), + vec128b( 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18), + vec128b( 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17), + vec128b( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), + vec128b( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), +}; +EMITTER(LOAD_VECTOR_SHR_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + auto sh = i.src1.constant(); + XEASSERT(sh < XECOUNT(lvsr_table)); + e.mov(e.rax, (uintptr_t)&lvsr_table[sh]); + e.vmovaps(i.dest, e.ptr[e.rax]); + } else { +#if XE_DEBUG + // We should only ever be getting values in [0,16]. Assert that. + Xbyak::Label skip; + e.cmp(i.src1, 17); + e.jb(skip); + e.Trap(); + e.L(skip); +#endif // XE_DEBUG + // TODO(benvanik): find a cheaper way of doing this. + e.movzx(e.rdx, i.src1); + e.shl(e.rdx, 4); + e.mov(e.rax, (uintptr_t)lvsr_table); + e.vmovaps(i.dest, e.ptr[e.rax + e.rdx]); + e.ReloadEDX(); + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_LOAD_VECTOR_SHR, + LOAD_VECTOR_SHR_I8); + + +// ============================================================================ +// OPCODE_LOAD_CLOCK +// ============================================================================ +EMITTER(LOAD_CLOCK, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // It'd be cool to call QueryPerformanceCounter directly, but w/e. + e.CallNative(LoadClock); + e.mov(i.dest, e.rax); + } + static uint64_t LoadClock(void* raw_context) { + LARGE_INTEGER counter; + uint64_t time = 0; + if (QueryPerformanceCounter(&counter)) { + time = counter.QuadPart; + } + return time; + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_LOAD_CLOCK, + LOAD_CLOCK); + + +// ============================================================================ +// OPCODE_LOAD_LOCAL +// ============================================================================ +// Note: all types are always aligned on the stack. 
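+// For illustration (the offset here is hypothetical; real ones come from the
+// stack layout computed at compile time), an i64 local at offset 0x20 would
+// emit:
+//   load_local  i64 [0x20]  ->  mov rax, qword [rsp+0x20]
+//   store_local i64 [0x20]  ->  mov qword [rsp+0x20], rax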
+EMITTER(LOAD_LOCAL_I8, MATCH(I<OPCODE_LOAD_LOCAL, I8<>, I32<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.mov(i.dest, e.byte[e.rsp + i.src1.constant()]);
+    //e.TraceLoadI8(DATA_LOCAL, i.src1.constant, i.dest);
+  }
+};
+EMITTER(LOAD_LOCAL_I16, MATCH(I<OPCODE_LOAD_LOCAL, I16<>, I32<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.mov(i.dest, e.word[e.rsp + i.src1.constant()]);
+    //e.TraceLoadI16(DATA_LOCAL, i.src1.constant, i.dest);
+  }
+};
+EMITTER(LOAD_LOCAL_I32, MATCH(I<OPCODE_LOAD_LOCAL, I32<>, I32<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.mov(i.dest, e.dword[e.rsp + i.src1.constant()]);
+    //e.TraceLoadI32(DATA_LOCAL, i.src1.constant, i.dest);
+  }
+};
+EMITTER(LOAD_LOCAL_I64, MATCH(I<OPCODE_LOAD_LOCAL, I64<>, I32<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.mov(i.dest, e.qword[e.rsp + i.src1.constant()]);
+    //e.TraceLoadI64(DATA_LOCAL, i.src1.constant, i.dest);
+  }
+};
+EMITTER(LOAD_LOCAL_F32, MATCH(I<OPCODE_LOAD_LOCAL, F32<>, I32<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.vmovss(i.dest, e.dword[e.rsp + i.src1.constant()]);
+    //e.TraceLoadF32(DATA_LOCAL, i.src1.constant, i.dest);
+  }
+};
+EMITTER(LOAD_LOCAL_F64, MATCH(I<OPCODE_LOAD_LOCAL, F64<>, I32<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.vmovsd(i.dest, e.qword[e.rsp + i.src1.constant()]);
+    //e.TraceLoadF64(DATA_LOCAL, i.src1.constant, i.dest);
+  }
+};
+EMITTER(LOAD_LOCAL_V128, MATCH(I<OPCODE_LOAD_LOCAL, V128<>, I32<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.vmovaps(i.dest, e.ptr[e.rsp + i.src1.constant()]);
+    //e.TraceLoadV128(DATA_LOCAL, i.src1.constant, i.dest);
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_LOAD_LOCAL,
+    LOAD_LOCAL_I8,
+    LOAD_LOCAL_I16,
+    LOAD_LOCAL_I32,
+    LOAD_LOCAL_I64,
+    LOAD_LOCAL_F32,
+    LOAD_LOCAL_F64,
+    LOAD_LOCAL_V128);
+
+
+// ============================================================================
+// OPCODE_STORE_LOCAL
+// ============================================================================
+// Note: all types are always aligned on the stack.
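+// The alignment guarantee above is what lets the V128 cases use vmovaps;
+// an unaligned stack slot would fault and would need vmovups instead.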
+EMITTER(STORE_LOCAL_I8, MATCH(I<OPCODE_STORE_LOCAL, VoidOp, I32<>, I8<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    //e.TraceStoreI8(DATA_LOCAL, i.src1.constant, i.src2);
+    e.mov(e.byte[e.rsp + i.src1.constant()], i.src2);
+  }
+};
+EMITTER(STORE_LOCAL_I16, MATCH(I<OPCODE_STORE_LOCAL, VoidOp, I32<>, I16<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    //e.TraceStoreI16(DATA_LOCAL, i.src1.constant, i.src2);
+    e.mov(e.word[e.rsp + i.src1.constant()], i.src2);
+  }
+};
+EMITTER(STORE_LOCAL_I32, MATCH(I<OPCODE_STORE_LOCAL, VoidOp, I32<>, I32<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    //e.TraceStoreI32(DATA_LOCAL, i.src1.constant, i.src2);
+    e.mov(e.dword[e.rsp + i.src1.constant()], i.src2);
+  }
+};
+EMITTER(STORE_LOCAL_I64, MATCH(I<OPCODE_STORE_LOCAL, VoidOp, I32<>, I64<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    //e.TraceStoreI64(DATA_LOCAL, i.src1.constant, i.src2);
+    e.mov(e.qword[e.rsp + i.src1.constant()], i.src2);
+  }
+};
+EMITTER(STORE_LOCAL_F32, MATCH(I<OPCODE_STORE_LOCAL, VoidOp, I32<>, F32<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    //e.TraceStoreF32(DATA_LOCAL, i.src1.constant, i.src2);
+    e.vmovss(e.dword[e.rsp + i.src1.constant()], i.src2);
+  }
+};
+EMITTER(STORE_LOCAL_F64, MATCH(I<OPCODE_STORE_LOCAL, VoidOp, I32<>, F64<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    //e.TraceStoreF64(DATA_LOCAL, i.src1.constant, i.src2);
+    e.vmovsd(e.qword[e.rsp + i.src1.constant()], i.src2);
+  }
+};
+EMITTER(STORE_LOCAL_V128, MATCH(I<OPCODE_STORE_LOCAL, VoidOp, I32<>, V128<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    //e.TraceStoreV128(DATA_LOCAL, i.src1.constant, i.src2);
+    e.vmovaps(e.ptr[e.rsp + i.src1.constant()], i.src2);
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_STORE_LOCAL,
+    STORE_LOCAL_I8,
+    STORE_LOCAL_I16,
+    STORE_LOCAL_I32,
+    STORE_LOCAL_I64,
+    STORE_LOCAL_F32,
+    STORE_LOCAL_F64,
+    STORE_LOCAL_V128);
+
+
+// ============================================================================
+// OPCODE_LOAD_CONTEXT
+// ============================================================================
+// Note: all types are always aligned in the context.
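+// ComputeContextAddress below relies on the emitter's convention of keeping
+// the guest context pointer pinned in rcx, so every context access is a
+// single [rcx + offset] operand, e.g. roughly:
+//   mov eax, dword [rcx + offset]   ; offset arrives via OffsetOp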
+RegExp ComputeContextAddress(X64Emitter& e, const OffsetOp& offset) {
+  return e.rcx + offset.value;
+}
+EMITTER(LOAD_CONTEXT_I8, MATCH(I<OPCODE_LOAD_CONTEXT, I8<>, OffsetOp>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    auto addr = ComputeContextAddress(e, i.src1);
+    e.mov(i.dest, e.byte[addr]);
+    if (IsTracingData()) {
+      e.mov(e.r8, e.byte[addr]);
+      e.mov(e.rdx, i.src1.value);
+      e.CallNative(TraceContextLoadI8);
+    }
+  }
+};
+EMITTER(LOAD_CONTEXT_I16, MATCH(I<OPCODE_LOAD_CONTEXT, I16<>, OffsetOp>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    auto addr = ComputeContextAddress(e, i.src1);
+    e.mov(i.dest, e.word[addr]);
+    if (IsTracingData()) {
+      e.mov(e.r8, e.word[addr]);
+      e.mov(e.rdx, i.src1.value);
+      e.CallNative(TraceContextLoadI16);
+    }
+  }
+};
+EMITTER(LOAD_CONTEXT_I32, MATCH(I<OPCODE_LOAD_CONTEXT, I32<>, OffsetOp>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    auto addr = ComputeContextAddress(e, i.src1);
+    e.mov(i.dest, e.dword[addr]);
+    if (IsTracingData()) {
+      e.mov(e.r8, e.dword[addr]);
+      e.mov(e.rdx, i.src1.value);
+      e.CallNative(TraceContextLoadI32);
+    }
+  }
+};
+EMITTER(LOAD_CONTEXT_I64, MATCH(I<OPCODE_LOAD_CONTEXT, I64<>, OffsetOp>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    auto addr = ComputeContextAddress(e, i.src1);
+    e.mov(i.dest, e.qword[addr]);
+    if (IsTracingData()) {
+      e.mov(e.r8, e.qword[addr]);
+      e.mov(e.rdx, i.src1.value);
+      e.CallNative(TraceContextLoadI64);
+    }
+  }
+};
+EMITTER(LOAD_CONTEXT_F32, MATCH(I<OPCODE_LOAD_CONTEXT, F32<>, OffsetOp>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    auto addr = ComputeContextAddress(e, i.src1);
+    e.vmovss(i.dest, e.dword[addr]);
+    if (IsTracingData()) {
+      e.lea(e.r8, e.dword[addr]);
+      e.mov(e.rdx, i.src1.value);
+      e.CallNative(TraceContextLoadF32);
+    }
+  }
+};
+EMITTER(LOAD_CONTEXT_F64, MATCH(I<OPCODE_LOAD_CONTEXT, F64<>, OffsetOp>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    auto addr = ComputeContextAddress(e, i.src1);
+    e.vmovsd(i.dest, e.qword[addr]);
+    if (IsTracingData()) {
+      e.lea(e.r8, e.qword[addr]);
+      e.mov(e.rdx, i.src1.value);
+      e.CallNative(TraceContextLoadF64);
+    }
+  }
+};
+EMITTER(LOAD_CONTEXT_V128, MATCH(I<OPCODE_LOAD_CONTEXT, V128<>, OffsetOp>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    auto addr = ComputeContextAddress(e, i.src1);
+    e.vmovaps(i.dest, e.ptr[addr]);
+    if (IsTracingData()) {
+      e.lea(e.r8, e.ptr[addr]);
+      e.mov(e.rdx, i.src1.value);
+      e.CallNative(TraceContextLoadV128);
+    }
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_LOAD_CONTEXT,
+    LOAD_CONTEXT_I8,
+    LOAD_CONTEXT_I16,
+    LOAD_CONTEXT_I32,
+    LOAD_CONTEXT_I64,
+    LOAD_CONTEXT_F32,
+    LOAD_CONTEXT_F64,
+    LOAD_CONTEXT_V128);
+
+
+// ============================================================================
+// OPCODE_STORE_CONTEXT
+// ============================================================================
+// Note: all types are always aligned in the context.
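+// In the tracing blocks below, rdx carries the context offset and r8 the
+// stored value (or its address for float/vector types, hence the lea),
+// which matches a native handler shaped roughly like this hedged sketch
+// (the actual declarations are not shown in this patch):
+//   void TraceContextStoreI32(void* raw_context, uint64_t offset,
+//                             uint32_t value);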
+EMITTER(STORE_CONTEXT_I8, MATCH(I<OPCODE_STORE_CONTEXT, VoidOp, OffsetOp, I8<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    auto addr = ComputeContextAddress(e, i.src1);
+    if (i.src2.is_constant) {
+      e.mov(e.byte[addr], i.src2.constant());
+    } else {
+      e.mov(e.byte[addr], i.src2);
+    }
+    if (IsTracingData()) {
+      e.mov(e.r8, e.byte[addr]);
+      e.mov(e.rdx, i.src1.value);
+      e.CallNative(TraceContextStoreI8);
+    }
+  }
+};
+EMITTER(STORE_CONTEXT_I16, MATCH(I<OPCODE_STORE_CONTEXT, VoidOp, OffsetOp, I16<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    auto addr = ComputeContextAddress(e, i.src1);
+    if (i.src2.is_constant) {
+      e.mov(e.word[addr], i.src2.constant());
+    } else {
+      e.mov(e.word[addr], i.src2);
+    }
+    if (IsTracingData()) {
+      e.mov(e.r8, e.word[addr]);
+      e.mov(e.rdx, i.src1.value);
+      e.CallNative(TraceContextStoreI16);
+    }
+  }
+};
+EMITTER(STORE_CONTEXT_I32, MATCH(I<OPCODE_STORE_CONTEXT, VoidOp, OffsetOp, I32<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    auto addr = ComputeContextAddress(e, i.src1);
+    if (i.src2.is_constant) {
+      e.mov(e.dword[addr], i.src2.constant());
+    } else {
+      e.mov(e.dword[addr], i.src2);
+    }
+    if (IsTracingData()) {
+      e.mov(e.r8, e.dword[addr]);
+      e.mov(e.rdx, i.src1.value);
+      e.CallNative(TraceContextStoreI32);
+    }
+  }
+};
+EMITTER(STORE_CONTEXT_I64, MATCH(I<OPCODE_STORE_CONTEXT, VoidOp, OffsetOp, I64<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    auto addr = ComputeContextAddress(e, i.src1);
+    if (i.src2.is_constant) {
+      e.MovMem64(addr, i.src2.constant());
+    } else {
+      e.mov(e.qword[addr], i.src2);
+    }
+    if (IsTracingData()) {
+      e.mov(e.r8, e.qword[addr]);
+      e.mov(e.rdx, i.src1.value);
+      e.CallNative(TraceContextStoreI64);
+    }
+  }
+};
+EMITTER(STORE_CONTEXT_F32, MATCH(I<OPCODE_STORE_CONTEXT, VoidOp, OffsetOp, F32<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    auto addr = ComputeContextAddress(e, i.src1);
+    if (i.src2.is_constant) {
+      e.mov(e.dword[addr], i.src2.value->constant.i32);
+    } else {
+      e.vmovss(e.dword[addr], i.src2);
+    }
+    if (IsTracingData()) {
+      e.lea(e.r8, e.dword[addr]);
+      e.mov(e.rdx, i.src1.value);
+      e.CallNative(TraceContextStoreF32);
+    }
+  }
+};
+EMITTER(STORE_CONTEXT_F64, MATCH(I<OPCODE_STORE_CONTEXT, VoidOp, OffsetOp, F64<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    auto addr = ComputeContextAddress(e, i.src1);
+    if (i.src2.is_constant) {
+      e.MovMem64(addr, i.src2.value->constant.i64);
+    } else {
+      e.vmovsd(e.qword[addr], i.src2);
+    }
+    if (IsTracingData()) {
+      e.lea(e.r8, e.qword[addr]);
+      e.mov(e.rdx, i.src1.value);
+      e.CallNative(TraceContextStoreF64);
+    }
+  }
+};
+EMITTER(STORE_CONTEXT_V128, MATCH(I<OPCODE_STORE_CONTEXT, VoidOp, OffsetOp, V128<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    auto addr = ComputeContextAddress(e, i.src1);
+    if (i.src2.is_constant) {
+      e.LoadConstantXmm(e.xmm0, i.src2.constant());
+      e.vmovaps(e.ptr[addr], e.xmm0);
+    } else {
+      e.vmovaps(e.ptr[addr], i.src2);
+    }
+    if (IsTracingData()) {
+      e.lea(e.r8, e.ptr[addr]);
+      e.mov(e.rdx, i.src1.value);
+      e.CallNative(TraceContextStoreV128);
+    }
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_STORE_CONTEXT,
+    STORE_CONTEXT_I8,
+    STORE_CONTEXT_I16,
+    STORE_CONTEXT_I32,
+    STORE_CONTEXT_I64,
+    STORE_CONTEXT_F32,
+    STORE_CONTEXT_F64,
+    STORE_CONTEXT_V128);
+
+
+// ============================================================================
+// OPCODE_LOAD
+// ============================================================================
+// Note: most *should* be aligned, but needs to be checked!
+template <typename T>
+bool CheckLoadAccessCallback(X64Emitter& e, const T& i) {
+  // If this is a constant address load, check to see if it's in a
+  // register range. We'll also probably want a dynamic check for
+  // unverified stores. So far, most games use constants.
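+  // From its use below, the callback list is assumed to look roughly like:
+  //   struct AccessCallbacks {
+  //     void* context;
+  //     bool (*handles)(void* context, uint64_t address);
+  //     uint64_t (*read)(void* context, uint64_t address);
+  //     void (*write)(void* context, uint64_t address, uint64_t value);
+  //     AccessCallbacks* next;
+  //   };
+  // (a sketch reconstructed from usage, not the declared type). Loaded
+  // values are byte-swapped below because guest memory is big-endian.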
+ if (!i.src1.is_constant) { + return false; + } + uint64_t address = i.src1.constant() & 0xFFFFFFFF; + auto cbs = e.runtime()->access_callbacks(); + while (cbs) { + if (cbs->handles(cbs->context, address)) { + e.mov(e.rcx, reinterpret_cast(cbs->context)); + e.mov(e.rdx, address); + e.CallNative(cbs->read); + if (T::dest_type == KEY_TYPE_V_I8) { + // No swap required. + e.mov(i.dest, e.al); + } else if (T::dest_type == KEY_TYPE_V_I16) { + e.ror(e.ax, 8); + e.mov(i.dest, e.ax); + } else if (T::dest_type == KEY_TYPE_V_I32) { + e.bswap(e.eax); + e.mov(i.dest, e.eax); + } else if (T::dest_type == KEY_TYPE_V_I64) { + e.bswap(e.rax); + e.mov(i.dest, e.rax); + } else { + XEASSERTALWAYS(); + } + return true; + } + cbs = cbs->next; + } + return false; +} +template +RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) { + if (guest.is_constant) { + // TODO(benvanik): figure out how to do this without a temp. + // Since the constant is often 0x8... if we tried to use that as a + // displacement it would be sign extended and mess things up. + e.mov(e.eax, static_cast(guest.constant())); + return e.rdx + e.rax; + } else { + // Clear the top 32 bits, as they are likely garbage. + // TODO(benvanik): find a way to avoid doing this. + e.mov(e.eax, guest.reg().cvt32()); + return e.rdx + e.rax; + } +} +EMITTER(LOAD_I8, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (CheckLoadAccessCallback(e, i)) { + return; + } + auto addr = ComputeMemoryAddress(e, i.src1); + e.mov(i.dest, e.byte[addr]); + if (IsTracingData()) { + e.mov(e.r8, i.dest); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(TraceMemoryLoadI8); + } + } +}; +EMITTER(LOAD_I16, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (CheckLoadAccessCallback(e, i)) { + return; + } + auto addr = ComputeMemoryAddress(e, i.src1); + e.mov(i.dest, e.word[addr]); + if (IsTracingData()) { + e.mov(e.r8, i.dest); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(TraceMemoryLoadI16); + } + } +}; +EMITTER(LOAD_I32, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (CheckLoadAccessCallback(e, i)) { + return; + } + auto addr = ComputeMemoryAddress(e, i.src1); + e.mov(i.dest, e.dword[addr]); + if (IsTracingData()) { + e.mov(e.r8, i.dest); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(TraceMemoryLoadI32); + } + } +}; +EMITTER(LOAD_I64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (CheckLoadAccessCallback(e, i)) { + return; + } + auto addr = ComputeMemoryAddress(e, i.src1); + e.mov(i.dest, e.qword[addr]); + if (IsTracingData()) { + e.mov(e.r8, i.dest); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(TraceMemoryLoadI64); + } + } +}; +EMITTER(LOAD_F32, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + e.vmovss(i.dest, e.dword[addr]); + if (IsTracingData()) { + e.lea(e.r8, e.dword[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(TraceMemoryLoadF32); + } + } +}; +EMITTER(LOAD_F64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + e.vmovsd(i.dest, e.qword[addr]); + if (IsTracingData()) { + e.lea(e.r8, e.qword[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(TraceMemoryLoadF64); + } + } +}; +EMITTER(LOAD_V128, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + // TODO(benvanik): we should try to stick to movaps if 
possible. + e.vmovups(i.dest, e.ptr[addr]); + if (IsTracingData()) { + e.lea(e.r8, e.ptr[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(TraceMemoryLoadV128); + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_LOAD, + LOAD_I8, + LOAD_I16, + LOAD_I32, + LOAD_I64, + LOAD_F32, + LOAD_F64, + LOAD_V128); + + +// ============================================================================ +// OPCODE_STORE +// ============================================================================ +// Note: most *should* be aligned, but needs to be checked! +template +bool CheckStoreAccessCallback(X64Emitter& e, const T& i) { + // If this is a constant address store, check to see if it's in a + // register range. We'll also probably want a dynamic check for + // unverified stores. So far, most games use constants. + if (!i.src1.is_constant) { + return false; + } + uint64_t address = i.src1.constant() & 0xFFFFFFFF; + auto cbs = e.runtime()->access_callbacks(); + while (cbs) { + if (cbs->handles(cbs->context, address)) { + e.mov(e.rcx, reinterpret_cast(cbs->context)); + e.mov(e.rdx, address); + if (i.src2.is_constant) { + e.mov(e.r8, i.src2.constant()); + } else { + if (T::src2_type == KEY_TYPE_V_I8) { + // No swap required. + e.movzx(e.r8, i.src2.reg().cvt8()); + } else if (T::src2_type == KEY_TYPE_V_I16) { + e.movzx(e.r8, i.src2.reg().cvt16()); + e.ror(e.r8w, 8); + } else if (T::src2_type == KEY_TYPE_V_I32) { + e.mov(e.r8d, i.src2.reg().cvt32()); + e.bswap(e.r8d); + } else if (T::src2_type == KEY_TYPE_V_I64) { + e.mov(e.r8, i.src2); + e.bswap(e.r8); + } else { + XEASSERTALWAYS(); + } + } + e.CallNative(cbs->write); + return true; + } + cbs = cbs->next; + } + return false; +} +EMITTER(STORE_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (CheckStoreAccessCallback(e, i)) { + return; + } + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.src2.is_constant) { + e.mov(e.byte[addr], i.src2.constant()); + } else { + e.mov(e.byte[addr], i.src2); + } + if (IsTracingData()) { + e.mov(e.r8, e.byte[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(TraceMemoryStoreI8); + } + } +}; +EMITTER(STORE_I16, MATCH(I, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (CheckStoreAccessCallback(e, i)) { + return; + } + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.src2.is_constant) { + e.mov(e.word[addr], i.src2.constant()); + } else { + e.mov(e.word[addr], i.src2); + } + if (IsTracingData()) { + e.mov(e.r8, e.word[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(TraceMemoryStoreI16); + } + } +}; +EMITTER(STORE_I32, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (CheckStoreAccessCallback(e, i)) { + return; + } + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.src2.is_constant) { + e.mov(e.dword[addr], i.src2.constant()); + } else { + e.mov(e.dword[addr], i.src2); + } + if (IsTracingData()) { + e.mov(e.r8, e.dword[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(TraceMemoryStoreI32); + } + } +}; +EMITTER(STORE_I64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (CheckStoreAccessCallback(e, i)) { + return; + } + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.src2.is_constant) { + e.MovMem64(addr, i.src2.constant()); + } else { + e.mov(e.qword[addr], i.src2); + } + if (IsTracingData()) { + e.mov(e.r8, e.qword[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(TraceMemoryStoreI64); + } + } +}; +EMITTER(STORE_F32, MATCH(I, F32<>>)) { + static void Emit(X64Emitter& e, const 
EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.src2.is_constant) { + e.mov(e.dword[addr], i.src2.value->constant.i32); + } else { + e.vmovss(e.dword[addr], i.src2); + } + if (IsTracingData()) { + e.lea(e.r8, e.ptr[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(TraceMemoryStoreF32); + } + } +}; +EMITTER(STORE_F64, MATCH(I, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.src2.is_constant) { + e.MovMem64(addr, i.src2.value->constant.i64); + } else { + e.vmovsd(e.qword[addr], i.src2); + } + if (IsTracingData()) { + e.lea(e.r8, e.ptr[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(TraceMemoryStoreF64); + } + } +}; +EMITTER(STORE_V128, MATCH(I, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.vmovaps(e.ptr[addr], e.xmm0); + } else { + e.vmovaps(e.ptr[addr], i.src2); + } + if (IsTracingData()) { + e.lea(e.r8, e.ptr[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(TraceMemoryStoreV128); + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_STORE, + STORE_I8, + STORE_I16, + STORE_I32, + STORE_I64, + STORE_F32, + STORE_F64, + STORE_V128); + + +// ============================================================================ +// OPCODE_PREFETCH +// ============================================================================ +EMITTER(PREFETCH, MATCH(I, OffsetOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): prefetch addr -> length. + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_PREFETCH, + PREFETCH); + + +// ============================================================================ +// OPCODE_MAX +// ============================================================================ +EMITTER(MAX_F32, MATCH(I, F32<>, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vmaxss(i.dest, i.src1, i.src2); + } +}; +EMITTER(MAX_F64, MATCH(I, F64<>, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vmaxsd(i.dest, i.src1, i.src2); + } +}; +EMITTER(MAX_V128, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vmaxps(i.dest, i.src1, i.src2); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_MAX, + MAX_F32, + MAX_F64, + MAX_V128); + + +// ============================================================================ +// OPCODE_MIN +// ============================================================================ +EMITTER(MIN_F32, MATCH(I, F32<>, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vminss(i.dest, i.src1, i.src2); + } +}; +EMITTER(MIN_F64, MATCH(I, F64<>, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vminsd(i.dest, i.src1, i.src2); + } +}; +EMITTER(MIN_V128, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vminps(i.dest, i.src1, i.src2); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_MIN, + MIN_F32, + MIN_F64, + MIN_V128); + + +// ============================================================================ +// OPCODE_SELECT +// ============================================================================ +EMITTER(SELECT_I8, MATCH(I, I8<>, I8<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.cmovnz(i.dest.reg().cvt32(), i.src2.reg().cvt32()); + e.cmovz(i.dest.reg().cvt32(), i.src3.reg().cvt32()); + } +}; +EMITTER(SELECT_I16, MATCH(I, 
I8<>, I16<>, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.cmovnz(i.dest.reg().cvt32(), i.src2.reg().cvt32()); + e.cmovz(i.dest.reg().cvt32(), i.src3.reg().cvt32()); + } +}; +EMITTER(SELECT_I32, MATCH(I, I8<>, I32<>, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.cmovnz(i.dest, i.src2); + e.cmovz(i.dest, i.src3); + } +}; +EMITTER(SELECT_I64, MATCH(I, I8<>, I64<>, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.cmovnz(i.dest, i.src2); + e.cmovz(i.dest, i.src3); + } +}; +EMITTER(SELECT_F32, MATCH(I, I8<>, F32<>, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + // TODO(benvanik): find a way to do this without branches. + Xbyak::Label skip; + e.vmovaps(i.dest, i.src3); + e.jz(skip); + e.vmovaps(i.dest, i.src2); + e.L(skip); + } +}; +EMITTER(SELECT_F64, MATCH(I, I8<>, F64<>, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + // TODO(benvanik): find a way to do this without branches. + Xbyak::Label skip; + e.vmovaps(i.dest, i.src3); + e.jz(skip); + e.vmovaps(i.dest, i.src2); + e.L(skip); + } +}; +EMITTER(SELECT_V128, MATCH(I, I8<>, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + // TODO(benvanik): find a way to do this without branches. + Xbyak::Label skip; + e.vmovaps(i.dest, i.src3); + e.jz(skip); + e.vmovaps(i.dest, i.src2); + e.L(skip); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_SELECT, + SELECT_I8, + SELECT_I16, + SELECT_I32, + SELECT_I64, + SELECT_F32, + SELECT_F64, + SELECT_V128); + + +// ============================================================================ +// OPCODE_IS_TRUE +// ============================================================================ +EMITTER(IS_TRUE_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.setnz(i.dest); + } +}; +EMITTER(IS_TRUE_I16, MATCH(I, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.setnz(i.dest); + } +}; +EMITTER(IS_TRUE_I32, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.setnz(i.dest); + } +}; +EMITTER(IS_TRUE_I64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.setnz(i.dest); + } +}; +EMITTER(IS_TRUE_F32, MATCH(I, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.setnz(i.dest); + } +}; +EMITTER(IS_TRUE_F64, MATCH(I, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.setnz(i.dest); + } +}; +EMITTER(IS_TRUE_V128, MATCH(I, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.setnz(i.dest); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_IS_TRUE, + IS_TRUE_I8, + IS_TRUE_I16, + IS_TRUE_I32, + IS_TRUE_I64, + IS_TRUE_F32, + IS_TRUE_F64, + IS_TRUE_V128); + + +// ============================================================================ +// OPCODE_IS_FALSE +// ============================================================================ +EMITTER(IS_FALSE_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.setz(i.dest); + } +}; +EMITTER(IS_FALSE_I16, MATCH(I, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) 
{ + e.test(i.src1, i.src1); + e.setz(i.dest); + } +}; +EMITTER(IS_FALSE_I32, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.setz(i.dest); + } +}; +EMITTER(IS_FALSE_I64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.setz(i.dest); + } +}; +EMITTER(IS_FALSE_F32, MATCH(I, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.setz(i.dest); + } +}; +EMITTER(IS_FALSE_F64, MATCH(I, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.setz(i.dest); + } +}; +EMITTER(IS_FALSE_V128, MATCH(I, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.setz(i.dest); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_IS_FALSE, + IS_FALSE_I8, + IS_FALSE_I16, + IS_FALSE_I32, + IS_FALSE_I64, + IS_FALSE_F32, + IS_FALSE_F64, + IS_FALSE_V128); + + +// ============================================================================ +// OPCODE_COMPARE_EQ +// ============================================================================ +EMITTER(COMPARE_EQ_I8, MATCH(I, I8<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, + [](X64Emitter& e, const Reg8& src1, const Reg8& src2) { e.cmp(src1, src2); }, + [](X64Emitter& e, const Reg8& src1, int32_t constant) { e.cmp(src1, constant); }); + e.sete(i.dest); + } +}; +EMITTER(COMPARE_EQ_I16, MATCH(I, I16<>, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, + [](X64Emitter& e, const Reg16& src1, const Reg16& src2) { e.cmp(src1, src2); }, + [](X64Emitter& e, const Reg16& src1, int32_t constant) { e.cmp(src1, constant); }); + e.sete(i.dest); + } +}; +EMITTER(COMPARE_EQ_I32, MATCH(I, I32<>, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, + [](X64Emitter& e, const Reg32& src1, const Reg32& src2) { e.cmp(src1, src2); }, + [](X64Emitter& e, const Reg32& src1, int32_t constant) { e.cmp(src1, constant); }); + e.sete(i.dest); + } +}; +EMITTER(COMPARE_EQ_I64, MATCH(I, I64<>, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, + [](X64Emitter& e, const Reg64& src1, const Reg64& src2) { e.cmp(src1, src2); }, + [](X64Emitter& e, const Reg64& src1, int32_t constant) { e.cmp(src1, constant); }); + e.sete(i.dest); + } +}; +EMITTER(COMPARE_EQ_F32, MATCH(I, F32<>, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vcomiss(i.src1, i.src2); + e.sete(i.dest); + } +}; +EMITTER(COMPARE_EQ_F64, MATCH(I, F64<>, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vcomisd(i.src1, i.src2); + e.sete(i.dest); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_COMPARE_EQ, + COMPARE_EQ_I8, + COMPARE_EQ_I16, + COMPARE_EQ_I32, + COMPARE_EQ_I64, + COMPARE_EQ_F32, + COMPARE_EQ_F64); + + +// ============================================================================ +// OPCODE_COMPARE_NE +// ============================================================================ +EMITTER(COMPARE_NE_I8, MATCH(I, I8<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, + [](X64Emitter& e, const Reg8& src1, const Reg8& src2) { e.cmp(src1, src2); }, + [](X64Emitter& e, const Reg8& src1, int32_t constant) { e.cmp(src1, constant); }); + e.setne(i.dest); + } +}; +EMITTER(COMPARE_NE_I16, 
MATCH(I, I16<>, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, + [](X64Emitter& e, const Reg16& src1, const Reg16& src2) { e.cmp(src1, src2); }, + [](X64Emitter& e, const Reg16& src1, int32_t constant) { e.cmp(src1, constant); }); + e.setne(i.dest); + } +}; +EMITTER(COMPARE_NE_I32, MATCH(I, I32<>, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, + [](X64Emitter& e, const Reg32& src1, const Reg32& src2) { e.cmp(src1, src2); }, + [](X64Emitter& e, const Reg32& src1, int32_t constant) { e.cmp(src1, constant); }); + e.setne(i.dest); + } +}; +EMITTER(COMPARE_NE_I64, MATCH(I, I64<>, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, + [](X64Emitter& e, const Reg64& src1, const Reg64& src2) { e.cmp(src1, src2); }, + [](X64Emitter& e, const Reg64& src1, int32_t constant) { e.cmp(src1, constant); }); + e.setne(i.dest); + } +}; +EMITTER(COMPARE_NE_F32, MATCH(I, F32<>, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vcomiss(i.src1, i.src2); + e.setne(i.dest); + } +}; +EMITTER(COMPARE_NE_F64, MATCH(I, F64<>, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vcomisd(i.src1, i.src2); + e.setne(i.dest); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_COMPARE_NE, + COMPARE_NE_I8, + COMPARE_NE_I16, + COMPARE_NE_I32, + COMPARE_NE_I64, + COMPARE_NE_F32, + COMPARE_NE_F64); + + +// ============================================================================ +// OPCODE_COMPARE_* +// ============================================================================ +#define EMITTER_ASSOCIATIVE_COMPARE_INT(op, instr, inverse_instr, type, reg_type) \ + EMITTER(COMPARE_##op##_##type, MATCH(I, type<>, type<>>)) { \ + static void Emit(X64Emitter& e, const EmitArgType& i) { \ + EmitAssociativeCompareOp( \ + e, i, \ + [](X64Emitter& e, const Reg8& dest, const reg_type& src1, const reg_type& src2, bool inverse) { \ + e.cmp(src1, src2); \ + if (!inverse) { e.instr(dest); } else { e.inverse_instr(dest); } \ + }, \ + [](X64Emitter& e, const Reg8& dest, const reg_type& src1, int32_t constant, bool inverse) { \ + e.cmp(src1, constant); \ + if (!inverse) { e.instr(dest); } else { e.inverse_instr(dest); } \ + }); \ + } \ + }; +#define EMITTER_ASSOCIATIVE_COMPARE_XX(op, instr, inverse_instr) \ + EMITTER_ASSOCIATIVE_COMPARE_INT(op, instr, inverse_instr, I8, Reg8); \ + EMITTER_ASSOCIATIVE_COMPARE_INT(op, instr, inverse_instr, I16, Reg16); \ + EMITTER_ASSOCIATIVE_COMPARE_INT(op, instr, inverse_instr, I32, Reg32); \ + EMITTER_ASSOCIATIVE_COMPARE_INT(op, instr, inverse_instr, I64, Reg64); \ + EMITTER(COMPARE_##op##_F32, MATCH(I, F32<>, F32<>>)) { \ + static void Emit(X64Emitter& e, const EmitArgType& i) { \ + e.vcomiss(i.src1, i.src2); \ + e.instr(i.dest); \ + } \ + }; \ + EMITTER(COMPARE_##op##_F64, MATCH(I, F64<>, F64<>>)) { \ + static void Emit(X64Emitter& e, const EmitArgType& i) { \ + if (i.src1.is_constant) { \ + e.LoadConstantXmm(e.xmm0, i.src1.constant()); \ + e.vcomisd(e.xmm0, i.src2); \ + } else if (i.src2.is_constant) { \ + e.LoadConstantXmm(e.xmm0, i.src2.constant()); \ + e.vcomisd(i.src1, e.xmm0); \ + } else { \ + e.vcomisd(i.src1, i.src2); \ + } \ + e.instr(i.dest); \ + } \ + }; \ + EMITTER_OPCODE_TABLE( \ + OPCODE_COMPARE_##op##, \ + COMPARE_##op##_I8, \ + COMPARE_##op##_I16, \ + COMPARE_##op##_I32, \ + COMPARE_##op##_I64, \ + COMPARE_##op##_F32, \ + COMPARE_##op##_F64); +EMITTER_ASSOCIATIVE_COMPARE_XX(SLT, setl, 
setge); +EMITTER_ASSOCIATIVE_COMPARE_XX(SLE, setle, setg); +EMITTER_ASSOCIATIVE_COMPARE_XX(SGT, setg, setle); +EMITTER_ASSOCIATIVE_COMPARE_XX(SGE, setge, setl); +EMITTER_ASSOCIATIVE_COMPARE_XX(ULT, setb, setae); +EMITTER_ASSOCIATIVE_COMPARE_XX(ULE, setbe, seta); +EMITTER_ASSOCIATIVE_COMPARE_XX(UGT, seta, setbe); +EMITTER_ASSOCIATIVE_COMPARE_XX(UGE, setae, setb); + + +// ============================================================================ +// OPCODE_DID_CARRY +// ============================================================================ +// TODO(benvanik): salc/setalc +// https://code.google.com/p/corkami/wiki/x86oddities +EMITTER(DID_CARRY_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.LoadEflags(); + e.setc(i.dest); + } +}; +EMITTER(DID_CARRY_I16, MATCH(I, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.LoadEflags(); + e.setc(i.dest); + } +}; +EMITTER(DID_CARRY_I32, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.LoadEflags(); + e.setc(i.dest); + } +}; +EMITTER(DID_CARRY_I64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.LoadEflags(); + e.setc(i.dest); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_DID_CARRY, + DID_CARRY_I8, + DID_CARRY_I16, + DID_CARRY_I32, + DID_CARRY_I64); + + +// ============================================================================ +// OPCODE_DID_OVERFLOW +// ============================================================================ +EMITTER(DID_OVERFLOW, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.LoadEflags(); + e.seto(i.dest); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_DID_OVERFLOW, + DID_OVERFLOW); + + +// ============================================================================ +// OPCODE_DID_SATURATE +// ============================================================================ +//EMITTER(DID_SATURATE, MATCH(I>)) { +// static void Emit(X64Emitter& e, const EmitArgType& i) { +// } +//}; +//EMITTER_OPCODE_TABLE(OPCODE_DID_SATURATE, +// DID_SATURATE); + + +// ============================================================================ +// OPCODE_VECTOR_COMPARE_EQ +// ============================================================================ +EMITTER(VECTOR_COMPARE_EQ_V128, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case INT8_TYPE: + e.vpcmpeqb(i.dest, i.src1, i.src2); + break; + case INT16_TYPE: + e.vpcmpeqw(i.dest, i.src1, i.src2); + break; + case INT32_TYPE: + e.vpcmpeqd(i.dest, i.src1, i.src2); + break; + case FLOAT32_TYPE: + e.vcmpeqps(i.dest, i.src1, i.src2); + break; + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_VECTOR_COMPARE_EQ, + VECTOR_COMPARE_EQ_V128); + + +// ============================================================================ +// OPCODE_VECTOR_COMPARE_SGT +// ============================================================================ +EMITTER(VECTOR_COMPARE_SGT_V128, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case INT8_TYPE: + e.vpcmpgtb(i.dest, i.src1, i.src2); + break; + case INT16_TYPE: + e.vpcmpgtw(i.dest, i.src1, i.src2); + break; + case INT32_TYPE: + e.vpcmpgtd(i.dest, i.src1, i.src2); + break; + case FLOAT32_TYPE: + e.vcmpgtps(i.dest, i.src1, i.src2); + break; + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_VECTOR_COMPARE_SGT, + VECTOR_COMPARE_SGT_V128); + + +// 
============================================================================
+// OPCODE_VECTOR_COMPARE_SGE
+// ============================================================================
+EMITTER(VECTOR_COMPARE_SGE_V128, MATCH(I<OPCODE_VECTOR_COMPARE_SGE, V128<>, V128<>, V128<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    switch (i.instr->flags) {
+      case INT8_TYPE:
+        e.vpcmpgtb(i.dest, i.src1, i.src2);
+        e.vpcmpeqb(e.xmm0, i.src1, i.src2);
+        e.vpor(i.dest, e.xmm0);
+        break;
+      case INT16_TYPE:
+        e.vpcmpgtw(i.dest, i.src1, i.src2);
+        e.vpcmpeqw(e.xmm0, i.src1, i.src2);
+        e.vpor(i.dest, e.xmm0);
+        break;
+      case INT32_TYPE:
+        e.vpcmpgtd(i.dest, i.src1, i.src2);
+        e.vpcmpeqd(e.xmm0, i.src1, i.src2);
+        e.vpor(i.dest, e.xmm0);
+        break;
+      case FLOAT32_TYPE:
+        e.vcmpgeps(i.dest, i.src1, i.src2);
+        break;
+    }
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_VECTOR_COMPARE_SGE,
+    VECTOR_COMPARE_SGE_V128);
+
+
+// ============================================================================
+// OPCODE_VECTOR_COMPARE_UGT
+// ============================================================================
+//EMITTER(VECTOR_COMPARE_UGT_V128, MATCH(I<OPCODE_VECTOR_COMPARE_UGT, V128<>, V128<>, V128<>>)) {
+//  static void Emit(X64Emitter& e, const EmitArgType& i) {
+//  }
+//};
+//EMITTER_OPCODE_TABLE(
+//    OPCODE_VECTOR_COMPARE_UGT,
+//    VECTOR_COMPARE_UGT_V128);
+
+
+// ============================================================================
+// OPCODE_VECTOR_COMPARE_UGE
+// ============================================================================
+//EMITTER(VECTOR_COMPARE_UGE_V128, MATCH(I<OPCODE_VECTOR_COMPARE_UGE, V128<>, V128<>, V128<>>)) {
+//  static void Emit(X64Emitter& e, const EmitArgType& i) {
+//  }
+//};
+//EMITTER_OPCODE_TABLE(
+//    OPCODE_VECTOR_COMPARE_UGE,
+//    VECTOR_COMPARE_UGE_V128);
+
+
+// ============================================================================
+// OPCODE_ADD
+// ============================================================================
+// TODO(benvanik): put dest/src1|2 together.
+template <typename SEQ, typename REG, typename ARGS>
+void EmitAddXX(X64Emitter& e, const ARGS& i) {
+  SEQ::EmitCommutativeBinaryOp(
+      e, i,
+      [](X64Emitter& e, const REG& dest_src, const REG& src) { e.add(dest_src, src); },
+      [](X64Emitter& e, const REG& dest_src, int32_t constant) { e.add(dest_src, constant); });
+  if (i.instr->flags & ARITHMETIC_SET_CARRY) {
+    // CF is set if carried.
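+    // (StoreEflags is assumed to cache EFLAGS into the context so that a
+    // later OPCODE_DID_CARRY can LoadEflags and recover CF even after
+    // intervening flag-clobbering instructions.)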
+ e.StoreEflags(); + } +} +EMITTER(ADD_I8, MATCH(I, I8<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAddXX(e, i); + } +}; +EMITTER(ADD_I16, MATCH(I, I16<>, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAddXX(e, i); + } +}; +EMITTER(ADD_I32, MATCH(I, I32<>, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAddXX(e, i); + } +}; +EMITTER(ADD_I64, MATCH(I, I64<>, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAddXX(e, i); + } +}; +EMITTER(ADD_F32, MATCH(I, F32<>, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vaddss(i.dest, i.src1, i.src2); + } +}; +EMITTER(ADD_F64, MATCH(I, F64<>, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vaddsd(i.dest, i.src1, i.src2); + } +}; +EMITTER(ADD_V128, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vaddps(i.dest, i.src1, i.src2); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_ADD, + ADD_I8, + ADD_I16, + ADD_I32, + ADD_I64, + ADD_F32, + ADD_F64, + ADD_V128); + + +// ============================================================================ +// OPCODE_ADD_CARRY +// ============================================================================ +// TODO(benvanik): put dest/src1|2 together. +template +void EmitAddCarryXX(X64Emitter& e, const ARGS& i) { + // TODO(benvanik): faster setting? we could probably do some fun math tricks + // here to get the carry flag set. + if (i.src3.is_constant) { + if (i.src3.constant()) { + e.stc(); + } else { + e.clc(); + } + } else { + if (i.src3.reg().getIdx() <= 4) { + // Can move from A/B/C/DX to AH. + e.mov(e.ah, i.src3.reg().cvt8()); + } else { + e.mov(e.al, i.src3); + e.mov(e.ah, e.al); + } + e.sahf(); + } + if (i.src1.is_constant && i.src2.is_constant) { + auto ab = i.src1.constant() + i.src2.constant(); + if (!ab) { + e.xor(i.dest, i.dest); + } else { + e.mov(i.dest, ab); + } + e.adc(i.dest, 0); + } else { + SEQ::EmitCommutativeBinaryOp( + e, i, [](X64Emitter& e, const REG& dest_src, const REG& src) { + e.adc(dest_src, src); + }, [](X64Emitter& e, const REG& dest_src, int32_t constant) { + e.adc(dest_src, constant); + }); + } + if (i.instr->flags & ARITHMETIC_SET_CARRY) { + // CF is set if carried. + e.StoreEflags(); + } +} +EMITTER(ADD_CARRY_I8, MATCH(I, I8<>, I8<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAddCarryXX(e, i); + } +}; +EMITTER(ADD_CARRY_I16, MATCH(I, I16<>, I16<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAddCarryXX(e, i); + } +}; +EMITTER(ADD_CARRY_I32, MATCH(I, I32<>, I32<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAddCarryXX(e, i); + } +}; +EMITTER(ADD_CARRY_I64, MATCH(I, I64<>, I64<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAddCarryXX(e, i); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_ADD_CARRY, + ADD_CARRY_I8, + ADD_CARRY_I16, + ADD_CARRY_I32, + ADD_CARRY_I64); + + +// ============================================================================ +// OPCODE_VECTOR_ADD +// ============================================================================ + + +// ============================================================================ +// OPCODE_SUB +// ============================================================================ +// TODO(benvanik): put dest/src1|2 together. 
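+// Editor's note: the carry path in EmitSubXX below uses the identity
+//   x - y == x + ~y + 1,
+// so "not src; stc; adc dest, src" leaves CF as the PPC-style carry-out
+// (CF = 1 when there is no borrow), whereas x86 sub sets CF with the
+// opposite sense, as a borrow flag. E.g. 8-bit 5 - 3: 5 + 0xFC + 1
+// overflows out of bit 7, so CF = 1.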
+template +void EmitSubXX(X64Emitter& e, const ARGS& i) { + if (i.instr->flags & ARITHMETIC_SET_CARRY) { + // TODO(benvanik): faster way of doing sub with CF set? + SEQ::EmitAssociativeBinaryOp( + e, i, + [](X64Emitter& e, const REG& dest_src, const REG& src) { + auto temp = GetTempReg(e); + e.mov(temp, src); + e.not(temp); + e.stc(); + e.adc(dest_src, temp); + }, + [](X64Emitter& e, const REG& dest_src, int32_t constant) { + auto temp = GetTempReg(e); + e.mov(temp, constant); + e.not(temp); + e.stc(); + e.adc(dest_src, temp); + }); + e.StoreEflags(); + } else { + SEQ::EmitAssociativeBinaryOp( + e, i, + [](X64Emitter& e, const REG& dest_src, const REG& src) { e.sub(dest_src, src); }, + [](X64Emitter& e, const REG& dest_src, int32_t constant) { e.sub(dest_src, constant); }); + } +} +EMITTER(SUB_I8, MATCH(I, I8<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitSubXX(e, i); + } +}; +EMITTER(SUB_I16, MATCH(I, I16<>, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitSubXX(e, i); + } +}; +EMITTER(SUB_I32, MATCH(I, I32<>, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitSubXX(e, i); + } +}; +EMITTER(SUB_I64, MATCH(I, I64<>, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitSubXX(e, i); + } +}; +EMITTER(SUB_F32, MATCH(I, F32<>, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + XEASSERT(!i.instr->flags); + e.vsubss(i.dest, i.src1, i.src2); + } +}; +EMITTER(SUB_F64, MATCH(I, F64<>, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + XEASSERT(!i.instr->flags); + e.vsubsd(i.dest, i.src1, i.src2); + } +}; +EMITTER(SUB_V128, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + XEASSERT(!i.instr->flags); + e.vsubps(i.dest, i.src1, i.src2); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_SUB, + SUB_I8, + SUB_I16, + SUB_I32, + SUB_I64, + SUB_F32, + SUB_F64, + SUB_V128); + + +// ============================================================================ +// OPCODE_MUL +// ============================================================================ +// Sign doesn't matter here, as we don't use the high bits. +// We exploit mulx here to avoid creating too much register pressure. +EMITTER(MUL_I8, MATCH(I, I8<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // dest hi, dest low = src * edx + // TODO(benvanik): place src2 in edx? + if (i.src1.is_constant) { + XEASSERT(!i.src2.is_constant); + e.movzx(e.edx, i.src2); + e.mov(e.eax, static_cast(i.src1.constant())); + e.mulx(e.edx, i.dest.reg().cvt32(), e.eax); + } else if (i.src2.is_constant) { + e.movzx(e.edx, i.src1); + e.mov(e.eax, static_cast(i.src2.constant())); + e.mulx(e.edx, i.dest.reg().cvt32(), e.eax); + } else { + e.movzx(e.edx, i.src2); + e.mulx(e.edx, i.dest.reg().cvt32(), i.src1.reg().cvt32()); + } + } +}; +EMITTER(MUL_I16, MATCH(I, I16<>, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // dest hi, dest low = src * edx + // TODO(benvanik): place src2 in edx? 
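+    // mulx (BMI2) takes its second source operand implicitly in edx/rdx,
+    // writes the high/low halves to two explicit destinations, and leaves
+    // EFLAGS untouched; that is why one operand is staged in edx and
+    // e.ReloadEDX() restores the register afterwards.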
+ if (i.src1.is_constant) { + XEASSERT(!i.src2.is_constant); + e.movzx(e.edx, i.src2); + e.mov(e.ax, static_cast(i.src1.constant())); + e.mulx(e.edx, i.dest.reg().cvt32(), e.eax); + } else if (i.src2.is_constant) { + e.movzx(e.edx, i.src1); + e.mov(e.ax, static_cast(i.src2.constant())); + e.mulx(e.edx, i.dest.reg().cvt32(), e.eax); + } else { + e.movzx(e.edx, i.src2); + e.mulx(e.edx, i.dest.reg().cvt32(), i.src1.reg().cvt32()); + } + e.ReloadEDX(); + } +}; +EMITTER(MUL_I32, MATCH(I, I32<>, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // dest hi, dest low = src * edx + // TODO(benvanik): place src2 in edx? + if (i.src1.is_constant) { + XEASSERT(!i.src2.is_constant); + e.mov(e.edx, i.src2); + e.mov(e.eax, i.src1.constant()); + e.mulx(e.edx, i.dest, e.eax); + } else if (i.src2.is_constant) { + e.mov(e.edx, i.src1); + e.mov(e.eax, i.src2.constant()); + e.mulx(e.edx, i.dest, e.eax); + } else { + e.mov(e.edx, i.src2); + e.mulx(e.edx, i.dest, i.src1); + } + e.ReloadEDX(); + } +}; +EMITTER(MUL_I64, MATCH(I, I64<>, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // dest hi, dest low = src * rdx + // TODO(benvanik): place src2 in edx? + if (i.src1.is_constant) { + XEASSERT(!i.src2.is_constant); + e.mov(e.rdx, i.src2); + e.mov(e.rax, i.src1.constant()); + e.mulx(e.rdx, i.dest, e.rax); + } else if (i.src2.is_constant) { + e.mov(e.rdx, i.src1); + e.mov(e.rax, i.src2.constant()); + e.mulx(e.rdx, i.dest, e.rax); + } else { + e.mov(e.rdx, i.src2); + e.mulx(e.rdx, i.dest, i.src1); + } + e.ReloadEDX(); + } +}; +EMITTER(MUL_F32, MATCH(I, F32<>, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + XEASSERT(!i.instr->flags); + e.vmulss(i.dest, i.src1, i.src2); + } +}; +EMITTER(MUL_F64, MATCH(I, F64<>, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + XEASSERT(!i.instr->flags); + e.vmulsd(i.dest, i.src1, i.src2); + } +}; +EMITTER(MUL_V128, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + XEASSERT(!i.instr->flags); + e.vmulps(i.dest, i.src1, i.src2); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_MUL, + MUL_I8, + MUL_I16, + MUL_I32, + MUL_I64, + MUL_F32, + MUL_F64, + MUL_V128); + + +// ============================================================================ +// OPCODE_MUL_HI +// ============================================================================ +EMITTER(MUL_HI_I8, MATCH(I, I8<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + // TODO(benvanik): place src1 in eax? still need to sign extend + e.movzx(e.eax, i.src1); + e.mulx(i.dest.reg().cvt32(), e.eax, i.src2.reg().cvt32()); + } else { + e.mov(e.al, i.src1); + e.imul(i.src2); + e.mov(i.dest, e.ah); + } + e.ReloadEDX(); + } +}; +EMITTER(MUL_HI_I16, MATCH(I, I16<>, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + // TODO(benvanik): place src1 in eax? still need to sign extend + e.movzx(e.eax, i.src1); + e.mulx(i.dest.reg().cvt32(), e.eax, i.src2.reg().cvt32()); + } else { + e.mov(e.ax, i.src1); + e.imul(i.src2); + e.mov(i.dest, e.dx); + } + e.ReloadEDX(); + } +}; +EMITTER(MUL_HI_I32, MATCH(I, I32<>, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + // TODO(benvanik): place src1 in eax? 
still need to sign extend + e.mov(e.eax, i.src1); + e.mulx(i.dest, e.eax, i.src2); + } else { + e.mov(e.eax, i.src1); + e.imul(i.src2); + e.mov(i.dest, e.edx); + } + e.ReloadEDX(); + } +}; +EMITTER(MUL_HI_I64, MATCH(I, I64<>, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + // TODO(benvanik): place src1 in eax? still need to sign extend + e.mov(e.rax, i.src1); + e.mulx(i.dest, e.rax, i.src2); + } else { + e.mov(e.rax, i.src1); + e.imul(i.src2); + e.mov(i.dest, e.rdx); + } + e.ReloadEDX(); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_MUL_HI, + MUL_HI_I8, + MUL_HI_I16, + MUL_HI_I32, + MUL_HI_I64); + + +// ============================================================================ +// OPCODE_DIV +// ============================================================================ +// TODO(benvanik): optimize common constant cases. +// TODO(benvanik): simplify code! +EMITTER(DIV_I8, MATCH(I, I8<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // NOTE: RDX clobbered. + bool clobbered_rcx = false; + if (i.src2.is_constant) { + XEASSERT(!i.src1.is_constant); + clobbered_rcx = true; + e.mov(e.cl, i.src2.constant()); + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + e.movzx(e.ax, i.src1); + e.div(e.cl); + } else { + e.movsx(e.ax, i.src1); + e.idiv(e.cl); + } + } else { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + if (i.src1.is_constant) { + e.mov(e.ax, static_cast(i.src1.constant())); + } else { + e.movzx(e.ax, i.src1); + } + e.div(i.src2); + } else { + if (i.src1.is_constant) { + e.mov(e.ax, static_cast(i.src1.constant())); + } else { + e.movsx(e.ax, i.src1); + } + e.idiv(i.src2); + } + } + e.mov(i.dest, e.al); + if (clobbered_rcx) { + e.ReloadECX(); + } + e.ReloadEDX(); + } +}; +EMITTER(DIV_I16, MATCH(I, I16<>, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // NOTE: RDX clobbered. + bool clobbered_rcx = false; + if (i.src2.is_constant) { + XEASSERT(!i.src1.is_constant); + clobbered_rcx = true; + e.mov(e.cx, i.src2.constant()); + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + e.mov(e.ax, i.src1); + // Zero upper bits. + e.xor(e.dx, e.dx); + e.div(e.cx); + } else { + e.mov(e.ax, i.src1); + // Set dx to sign bit of src1 (dx:ax = dx:ax / src). + e.mov(e.dx, e.ax); + e.sar(e.dx, 15); + e.idiv(e.cx); + } + } else { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + if (i.src1.is_constant) { + e.mov(e.ax, i.src1.constant()); + } else { + e.mov(e.ax, i.src1); + } + // Zero upper bits. + e.xor(e.dx, e.dx); + e.div(i.src2); + } else { + if (i.src1.is_constant) { + e.mov(e.ax, i.src1.constant()); + } else { + e.mov(e.ax, i.src1); + } + // Set dx to sign bit of src1 (dx:ax = dx:ax / src). + e.mov(e.dx, e.ax); + e.sar(e.dx, 15); + e.idiv(i.src2); + } + } + e.mov(i.dest, e.ax); + if (clobbered_rcx) { + e.ReloadECX(); + } + e.ReloadEDX(); + } +}; +EMITTER(DIV_I32, MATCH(I, I32<>, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // NOTE: RDX clobbered. + bool clobbered_rcx = false; + if (i.src2.is_constant) { + XEASSERT(!i.src1.is_constant); + clobbered_rcx = true; + e.mov(e.ecx, i.src2.constant()); + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + e.mov(e.eax, i.src1); + // Zero upper bits. + e.xor(e.edx, e.edx); + e.div(e.ecx); + } else { + e.mov(e.eax, i.src1); + // Set dx to sign bit of src1 (dx:ax = dx:ax / src). 
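+        // (mov edx, eax; sar edx, 31 is the cdq idiom spelled out: it
+        // broadcasts the sign bit so edx:eax holds the sign-extended
+        // 64-bit dividend that idiv expects.)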
+ e.mov(e.edx, e.eax); + e.sar(e.edx, 31); + e.idiv(e.ecx); + } + } else { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + if (i.src1.is_constant) { + e.mov(e.eax, i.src1.constant()); + } else { + e.mov(e.eax, i.src1); + } + // Zero upper bits. + e.xor(e.edx, e.edx); + e.div(i.src2); + } else { + if (i.src1.is_constant) { + e.mov(e.eax, i.src1.constant()); + } else { + e.mov(e.eax, i.src1); + } + // Set dx to sign bit of src1 (dx:ax = dx:ax / src). + e.mov(e.edx, e.eax); + e.sar(e.edx, 31); + e.idiv(i.src2); + } + } + e.mov(i.dest, e.eax); + if (clobbered_rcx) { + e.ReloadECX(); + } + e.ReloadEDX(); + } +}; +EMITTER(DIV_I64, MATCH(I, I64<>, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // NOTE: RDX clobbered. + bool clobbered_rcx = false; + if (i.src2.is_constant) { + XEASSERT(!i.src1.is_constant); + clobbered_rcx = true; + e.mov(e.rcx, i.src2.constant()); + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + e.mov(e.rax, i.src1); + // Zero upper bits. + e.xor(e.rdx, e.rdx); + e.div(e.rcx); + } else { + e.mov(e.rax, i.src1); + // Set dx to sign bit of src1 (dx:ax = dx:ax / src). + e.mov(e.rdx, e.rax); + e.sar(e.rdx, 63); + e.idiv(e.rcx); + } + } else { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + if (i.src1.is_constant) { + e.mov(e.rax, i.src1.constant()); + } else { + e.mov(e.rax, i.src1); + } + // Zero upper bits. + e.xor(e.rdx, e.rdx); + e.div(i.src2); + } else { + if (i.src1.is_constant) { + e.mov(e.rax, i.src1.constant()); + } else { + e.mov(e.rax, i.src1); + } + // Set dx to sign bit of src1 (dx:ax = dx:ax / src). + e.mov(e.rdx, e.rax); + e.sar(e.rdx, 63); + e.idiv(i.src2); + } + } + e.mov(i.dest, e.rax); + if (clobbered_rcx) { + e.ReloadECX(); + } + e.ReloadEDX(); + } +}; +EMITTER(DIV_F32, MATCH(I, F32<>, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + XEASSERT(!i.instr->flags); + e.vdivss(i.dest, i.src1, i.src2); + } +}; +EMITTER(DIV_F64, MATCH(I, F64<>, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + XEASSERT(!i.instr->flags); + e.vdivsd(i.dest, i.src1, i.src2); + } +}; +EMITTER(DIV_V128, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + XEASSERT(!i.instr->flags); + e.vdivps(i.dest, i.src1, i.src2); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_DIV, + DIV_I8, + DIV_I16, + DIV_I32, + DIV_I64, + DIV_F32, + DIV_F64, + DIV_V128); + + +// ============================================================================ +// OPCODE_MUL_ADD +// ============================================================================ +// d = 1 * 2 + 3 +// $0 = $1×$0 + $2 +// TODO(benvanik): use other forms (132/213/etc) to avoid register shuffling. 
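+// In the FMA mnemonics the digits name operand roles: vfmadd213ss a, b, c
+// computes a = b*a + c, so the HIR src1 must already be sitting in dest;
+// the mov shuffles below handle the cases where dest aliases src2 or src3.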
+// dest could be src2 or src3 - need to ensure it's not before overwriting dest +// perhaps use other 132/213/etc +EMITTER(MUL_ADD_F32, MATCH(I, F32<>, F32<>, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.dest == i.src1) { + e.vfmadd213ss(i.dest, i.src2, i.src3); + } else { + if (i.dest != i.src2 && i.dest != i.src3) { + e.vmovss(i.dest, i.src1); + e.vfmadd213ss(i.dest, i.src2, i.src3); + } else { + e.vmovss(e.xmm0, i.src1); + e.vfmadd213ss(e.xmm0, i.src2, i.src3); + e.vmovss(i.dest, e.xmm0); + } + } + } +}; +EMITTER(MUL_ADD_F64, MATCH(I, F64<>, F64<>, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.dest == i.src1) { + e.vfmadd213sd(i.dest, i.src2, i.src3); + } else { + if (i.dest != i.src2 && i.dest != i.src3) { + e.vmovsd(i.dest, i.src1); + e.vfmadd213sd(i.dest, i.src2, i.src3); + } else { + e.vmovsd(e.xmm0, i.src1); + e.vfmadd213sd(e.xmm0, i.src2, i.src3); + e.vmovsd(i.dest, e.xmm0); + } + } + } +}; +EMITTER(MUL_ADD_V128, MATCH(I, V128<>, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.dest == i.src1) { + e.vfmadd213ps(i.dest, i.src2, i.src3); + } else { + if (i.dest != i.src2 && i.dest != i.src3) { + e.vmovdqa(i.dest, i.src1); + e.vfmadd213ps(i.dest, i.src2, i.src3); + } else { + e.vmovdqa(e.xmm0, i.src1); + e.vfmadd213ps(e.xmm0, i.src2, i.src3); + e.vmovdqa(i.dest, e.xmm0); + } + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_MUL_ADD, + MUL_ADD_F32, + MUL_ADD_F64, + MUL_ADD_V128); + + +// ============================================================================ +// OPCODE_MUL_SUB +// ============================================================================ +// d = 1 * 2 - 3 +// $0 = $2×$0 - $3 +// TODO(benvanik): use other forms (132/213/etc) to avoid register shuffling. +// dest could be src2 or src3 - need to ensure it's not before overwriting dest +// perhaps use other 132/213/etc +EMITTER(MUL_SUB_F32, MATCH(I, F32<>, F32<>, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.dest == i.src1) { + e.vfmsub213ss(i.dest, i.src2, i.src3); + } else { + if (i.dest != i.src2 && i.dest != i.src3) { + e.vmovss(i.dest, i.src1); + e.vfmsub213ss(i.dest, i.src2, i.src3); + } else { + e.vmovss(e.xmm0, i.src1); + e.vfmsub213ss(e.xmm0, i.src2, i.src3); + e.vmovss(i.dest, e.xmm0); + } + } + } +}; +EMITTER(MUL_SUB_F64, MATCH(I, F64<>, F64<>, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.dest == i.src1) { + e.vfmsub213sd(i.dest, i.src2, i.src3); + } else { + if (i.dest != i.src2 && i.dest != i.src3) { + e.vmovsd(i.dest, i.src1); + e.vfmsub213sd(i.dest, i.src2, i.src3); + } else { + e.vmovsd(e.xmm0, i.src1); + e.vfmsub213sd(e.xmm0, i.src2, i.src3); + e.vmovsd(i.dest, e.xmm0); + } + } + } +}; +EMITTER(MUL_SUB_V128, MATCH(I, V128<>, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.dest == i.src1) { + e.vfmsub213ps(i.dest, i.src2, i.src3); + } else { + if (i.dest != i.src2 && i.dest != i.src3) { + e.vmovdqa(i.dest, i.src1); + e.vfmsub213ps(i.dest, i.src2, i.src3); + } else { + e.vmovdqa(e.xmm0, i.src1); + e.vfmsub213ps(e.xmm0, i.src2, i.src3); + e.vmovdqa(i.dest, e.xmm0); + } + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_MUL_SUB, + MUL_SUB_F32, + MUL_SUB_F64, + MUL_SUB_V128); + + +// ============================================================================ +// OPCODE_NEG +// ============================================================================ +// TODO(benvanik): put dest/src1 together. 
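+// Editor's note: SSE/AVX has no float negate instruction, so the float and
+// vector cases below flip the sign bit by xoring against a sign-mask
+// constant (e.g. four lanes of 0x80000000 for XMMSignMaskPS), while the
+// integer cases use the plain neg instruction.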
+template <typename SEQ, typename REG, typename ARGS>
+void EmitNegXX(X64Emitter& e, const ARGS& i) {
+  SEQ::EmitUnaryOp(
+      e, i,
+      [](X64Emitter& e, const REG& dest_src) { e.neg(dest_src); });
+}
+EMITTER(NEG_I8, MATCH(I<OPCODE_NEG, I8<>, I8<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    EmitNegXX<NEG_I8, Reg8>(e, i);
+  }
+};
+EMITTER(NEG_I16, MATCH(I<OPCODE_NEG, I16<>, I16<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    EmitNegXX<NEG_I16, Reg16>(e, i);
+  }
+};
+EMITTER(NEG_I32, MATCH(I<OPCODE_NEG, I32<>, I32<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    EmitNegXX<NEG_I32, Reg32>(e, i);
+  }
+};
+EMITTER(NEG_I64, MATCH(I<OPCODE_NEG, I64<>, I64<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    EmitNegXX<NEG_I64, Reg64>(e, i);
+  }
+};
+EMITTER(NEG_F32, MATCH(I<OPCODE_NEG, F32<>, F32<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.vpxor(i.dest, i.src1, e.GetXmmConstPtr(XMMSignMaskPS));
+  }
+};
+EMITTER(NEG_F64, MATCH(I<OPCODE_NEG, F64<>, F64<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.vpxor(i.dest, i.src1, e.GetXmmConstPtr(XMMSignMaskPD));
+  }
+};
+EMITTER(NEG_V128, MATCH(I<OPCODE_NEG, V128<>, V128<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    XEASSERT(!i.instr->flags);
+    e.vpxor(i.dest, i.src1, e.GetXmmConstPtr(XMMSignMaskPS));
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_NEG,
+    NEG_I8,
+    NEG_I16,
+    NEG_I32,
+    NEG_I64,
+    NEG_F32,
+    NEG_F64,
+    NEG_V128);
+
+
+// ============================================================================
+// OPCODE_ABS
+// ============================================================================
+EMITTER(ABS_F32, MATCH(I<OPCODE_ABS, F32<>, F32<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.vmovaps(e.xmm0, e.GetXmmConstPtr(XMMSignMaskPS));
+    e.vpandn(i.dest, e.xmm0, i.src1);
+  }
+};
+EMITTER(ABS_F64, MATCH(I<OPCODE_ABS, F64<>, F64<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.vmovaps(e.xmm0, e.GetXmmConstPtr(XMMSignMaskPD));
+    e.vpandn(i.dest, e.xmm0, i.src1);
+  }
+};
+EMITTER(ABS_V128, MATCH(I<OPCODE_ABS, V128<>, V128<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.vmovaps(e.xmm0, e.GetXmmConstPtr(XMMSignMaskPS));
+    e.vpandn(i.dest, e.xmm0, i.src1);
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_ABS,
+    ABS_F32,
+    ABS_F64,
+    ABS_V128);
+
+
+// ============================================================================
+// OPCODE_SQRT
+// ============================================================================
+EMITTER(SQRT_F32, MATCH(I<OPCODE_SQRT, F32<>, F32<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.vsqrtss(i.dest, i.src1);
+  }
+};
+EMITTER(SQRT_F64, MATCH(I<OPCODE_SQRT, F64<>, F64<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.vsqrtsd(i.dest, i.src1);
+  }
+};
+EMITTER(SQRT_V128, MATCH(I<OPCODE_SQRT, V128<>, V128<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.vsqrtps(i.dest, i.src1);
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_SQRT,
+    SQRT_F32,
+    SQRT_F64,
+    SQRT_V128);
+
+
+// ============================================================================
+// OPCODE_RSQRT
+// ============================================================================
+EMITTER(RSQRT_F32, MATCH(I<OPCODE_RSQRT, F32<>, F32<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.vrsqrtss(i.dest, i.src1);
+  }
+};
+EMITTER(RSQRT_F64, MATCH(I<OPCODE_RSQRT, F64<>, F64<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.vcvtsd2ss(i.dest, i.src1);
+    e.vrsqrtss(i.dest, i.dest);
+    e.vcvtss2sd(i.dest, i.dest);
+  }
+};
+EMITTER(RSQRT_V128, MATCH(I<OPCODE_RSQRT, V128<>, V128<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.vrsqrtps(i.dest, i.src1);
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_RSQRT,
+    RSQRT_F32,
+    RSQRT_F64,
+    RSQRT_V128);
+
+
+// 
============================================================================ +// OPCODE_POW2 +// ============================================================================ + + +// ============================================================================ +// OPCODE_LOG2 +// ============================================================================ + + +// ============================================================================ +// OPCODE_DOT_PRODUCT_3 +// ============================================================================ +EMITTER(DOT_PRODUCT_3_V128, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // http://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx + // TODO(benvanik): verify ordering + // TODO(benvanik): apparently this is very slow - find alternative? + e.vdpps(i.dest, i.src1, i.src2, B01110001); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_DOT_PRODUCT_3, + DOT_PRODUCT_3_V128); + + +// ============================================================================ +// OPCODE_DOT_PRODUCT_4 +// ============================================================================ +EMITTER(DOT_PRODUCT_4_V128, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // http://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx + // TODO(benvanik): verify ordering + // TODO(benvanik): apparently this is very slow - find alternative? + e.vdpps(i.dest, i.src1, i.src2, B11110001); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_DOT_PRODUCT_4, + DOT_PRODUCT_4_V128); + + +// ============================================================================ +// OPCODE_AND +// ============================================================================ +// TODO(benvanik): put dest/src1|2 together. +template +void EmitAndXX(X64Emitter& e, const ARGS& i) { + SEQ::EmitCommutativeBinaryOp( + e, i, + [](X64Emitter& e, const REG& dest_src, const REG& src) { e.and(dest_src, src); }, + [](X64Emitter& e, const REG& dest_src, int32_t constant) { e.and(dest_src, constant); }); +} +EMITTER(AND_I8, MATCH(I, I8<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAndXX(e, i); + } +}; +EMITTER(AND_I16, MATCH(I, I16<>, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAndXX(e, i); + } +}; +EMITTER(AND_I32, MATCH(I, I32<>, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAndXX(e, i); + } +}; +EMITTER(AND_I64, MATCH(I, I64<>, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAndXX(e, i); + } +}; +EMITTER(AND_V128, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vpand(i.dest, i.src1, i.src2); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_AND, + AND_I8, + AND_I16, + AND_I32, + AND_I64, + AND_V128); + + +// ============================================================================ +// OPCODE_OR +// ============================================================================ +// TODO(benvanik): put dest/src1|2 together. 
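As an aside on the vdpps immediates above (B01110001 and B11110001): in the SSE4.1 dot-product encoding, the high nibble selects which lanes are multiplied and summed and the low nibble selects which destination lanes receive the result. A compilable sketch using the intrinsic form (requires SSE4.1, e.g. -msse4.1):

    #include <smmintrin.h>
    #include <cstdio>

    int main() {
      __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 100.0f);
      __m128 b = _mm_setr_ps(4.0f, 5.0f, 6.0f, 100.0f);
      // 0x71: multiply lanes 0-2, ignore lane 3, write the sum to lane 0.
      __m128 dp3 = _mm_dp_ps(a, b, 0x71);
      printf("%f\n", _mm_cvtss_f32(dp3));  // 32.000000; the w lanes never mix in
      return 0;
    }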
+template +void EmitOrXX(X64Emitter& e, const ARGS& i) { + SEQ::EmitCommutativeBinaryOp( + e, i, + [](X64Emitter& e, const REG& dest_src, const REG& src) { e.or(dest_src, src); }, + [](X64Emitter& e, const REG& dest_src, int32_t constant) { e.or(dest_src, constant); }); +} +EMITTER(OR_I8, MATCH(I, I8<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitOrXX(e, i); + } +}; +EMITTER(OR_I16, MATCH(I, I16<>, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitOrXX(e, i); + } +}; +EMITTER(OR_I32, MATCH(I, I32<>, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitOrXX(e, i); + } +}; +EMITTER(OR_I64, MATCH(I, I64<>, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitOrXX(e, i); + } +}; +EMITTER(OR_V128, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vpor(i.dest, i.src1, i.src2); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_OR, + OR_I8, + OR_I16, + OR_I32, + OR_I64, + OR_V128); + + +// ============================================================================ +// OPCODE_XOR +// ============================================================================ +// TODO(benvanik): put dest/src1|2 together. +template +void EmitXorXX(X64Emitter& e, const ARGS& i) { + SEQ::EmitCommutativeBinaryOp( + e, i, + [](X64Emitter& e, const REG& dest_src, const REG& src) { e.xor(dest_src, src); }, + [](X64Emitter& e, const REG& dest_src, int32_t constant) { e.xor(dest_src, constant); }); +} +EMITTER(XOR_I8, MATCH(I, I8<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitXorXX(e, i); + } +}; +EMITTER(XOR_I16, MATCH(I, I16<>, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitXorXX(e, i); + } +}; +EMITTER(XOR_I32, MATCH(I, I32<>, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitXorXX(e, i); + } +}; +EMITTER(XOR_I64, MATCH(I, I64<>, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitXorXX(e, i); + } +}; +EMITTER(XOR_V128, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vpxor(i.dest, i.src1, i.src2); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_XOR, + XOR_I8, + XOR_I16, + XOR_I32, + XOR_I64, + XOR_V128); + + +// ============================================================================ +// OPCODE_NOT +// ============================================================================ +// TODO(benvanik): put dest/src1 together. +template +void EmitNotXX(X64Emitter& e, const ARGS& i) { + SEQ::EmitUnaryOp( + e, i, + [](X64Emitter& e, const REG& dest_src) { e.not(dest_src); }); +} +EMITTER(NOT_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitNotXX(e, i); + } +}; +EMITTER(NOT_I16, MATCH(I, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitNotXX(e, i); + } +}; +EMITTER(NOT_I32, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitNotXX(e, i); + } +}; +EMITTER(NOT_I64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitNotXX(e, i); + } +}; +EMITTER(NOT_V128, MATCH(I, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // dest = src ^ 0xFFFF... 
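Right at this point the NOT_V128 emitter XORs with an all-ones mask; as an aside, that is the standard lowering because SSE/AVX have no vector NOT instruction. A self-contained SSE2 sketch of the same idiom, here materializing the mask with a self-compare instead of a constant-table load:

    #include <emmintrin.h>
    #include <cstdint>
    #include <cstdio>

    int main() {
      __m128i src = _mm_set1_epi32(0x0F0F0F0F);
      __m128i all_ones = _mm_cmpeq_epi32(src, src);  // every bit set
      __m128i dest = _mm_xor_si128(src, all_ones);   // ~src
      printf("%08x\n",
             static_cast<uint32_t>(_mm_cvtsi128_si32(dest)));  // f0f0f0f0
      return 0;
    }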
+ e.vpxor(i.dest, i.src1, e.GetXmmConstPtr(XMMOne)); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_NOT, + NOT_I8, + NOT_I16, + NOT_I32, + NOT_I64, + NOT_V128); + + +// ============================================================================ +// OPCODE_SHL +// ============================================================================ +// TODO(benvanik): optimize common shifts. +template +void EmitShlXX(X64Emitter& e, const ARGS& i) { + SEQ::EmitAssociativeBinaryOp( + e, i, + [](X64Emitter& e, const REG& dest_src, const Reg8& src) { + if (dest_src.getBit() == 64) { + e.shlx(dest_src.cvt64(), dest_src.cvt64(), src.cvt64()); + } else { + e.shlx(dest_src.cvt32(), dest_src.cvt32(), src.cvt32()); + } + }, [](X64Emitter& e, const REG& dest_src, int8_t constant) { + e.shl(dest_src, constant); + }); +} +EMITTER(SHL_I8, MATCH(I, I8<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitShlXX(e, i); + } +}; +EMITTER(SHL_I16, MATCH(I, I16<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitShlXX(e, i); + } +}; +EMITTER(SHL_I32, MATCH(I, I32<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitShlXX(e, i); + } +}; +EMITTER(SHL_I64, MATCH(I, I64<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitShlXX(e, i); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_SHL, + SHL_I8, + SHL_I16, + SHL_I32, + SHL_I64); + + +// ============================================================================ +// OPCODE_SHR +// ============================================================================ +// TODO(benvanik): optimize common shifts. +template +void EmitShrXX(X64Emitter& e, const ARGS& i) { + SEQ::EmitAssociativeBinaryOp( + e, i, + [](X64Emitter& e, const REG& dest_src, const Reg8& src) { + if (dest_src.getBit() == 64) { + e.shrx(dest_src.cvt64(), dest_src.cvt64(), src.cvt64()); + } else if (dest_src.getBit() == 32) { + e.shrx(dest_src.cvt32(), dest_src.cvt32(), src.cvt32()); + } else { + e.movzx(dest_src.cvt32(), dest_src); + e.shrx(dest_src.cvt32(), dest_src.cvt32(), src.cvt32()); + } + }, [](X64Emitter& e, const REG& dest_src, int8_t constant) { + e.shr(dest_src, constant); + }); +} +EMITTER(SHR_I8, MATCH(I, I8<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitShrXX(e, i); + } +}; +EMITTER(SHR_I16, MATCH(I, I16<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitShrXX(e, i); + } +}; +EMITTER(SHR_I32, MATCH(I, I32<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitShrXX(e, i); + } +}; +EMITTER(SHR_I64, MATCH(I, I64<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitShrXX(e, i); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_SHR, + SHR_I8, + SHR_I16, + SHR_I32, + SHR_I64); + + +// ============================================================================ +// OPCODE_SHA +// ============================================================================ +// TODO(benvanik): optimize common shifts. 
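As an aside on the BMI2 helpers above: shlx/shrx exist only in 32- and 64-bit widths and implicitly reduce the shift count modulo the operand width, which is why the 8/16-bit paths widen with movzx/movsx first. A scalar model of the count behavior:

    #include <cassert>
    #include <cstdint>

    // shrx-style shift: the hardware uses only the low 5 bits of the count.
    static uint32_t shrx32(uint32_t value, uint32_t count) {
      return value >> (count & 31);
    }

    int main() {
      assert(shrx32(0x80000000u, 1) == 0x40000000u);
      assert(shrx32(0x80000000u, 33) == 0x40000000u);  // 33 & 31 == 1
      return 0;
    }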
+template +void EmitSarXX(X64Emitter& e, const ARGS& i) { + SEQ::EmitAssociativeBinaryOp( + e, i, + [](X64Emitter& e, const REG& dest_src, const Reg8& src) { + if (dest_src.getBit() == 64) { + e.sarx(dest_src.cvt64(), dest_src.cvt64(), src.cvt64()); + } else if (dest_src.getBit() == 32) { + e.sarx(dest_src.cvt32(), dest_src.cvt32(), src.cvt32()); + } else { + e.movsx(dest_src.cvt32(), dest_src); + e.sarx(dest_src.cvt32(), dest_src.cvt32(), src.cvt32()); + } + }, [](X64Emitter& e, const REG& dest_src, int8_t constant) { + e.sar(dest_src, constant); + }); +} +EMITTER(SHA_I8, MATCH(I, I8<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitSarXX(e, i); + } +}; +EMITTER(SHA_I16, MATCH(I, I16<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitSarXX(e, i); + } +}; +EMITTER(SHA_I32, MATCH(I, I32<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitSarXX(e, i); + } +}; +EMITTER(SHA_I64, MATCH(I, I64<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitSarXX(e, i); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_SHA, + SHA_I8, + SHA_I16, + SHA_I32, + SHA_I64); + + +// ============================================================================ +// OPCODE_VECTOR_SHL +// ============================================================================ +EMITTER(VECTOR_SHL_V128, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case INT32_TYPE: + // src shift mask may have values >31, and x86 sets to zero when + // that happens so we mask. + e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); + e.vpsllvd(i.dest, i.src1, e.xmm0); + break; + default: + XEASSERTALWAYS(); + break; + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_VECTOR_SHL, + VECTOR_SHL_V128); + + +// ============================================================================ +// OPCODE_VECTOR_SHR +// ============================================================================ +EMITTER(VECTOR_SHR_V128, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case INT32_TYPE: + // src shift mask may have values >31, and x86 sets to zero when + // that happens so we mask. + e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); + e.vpsrlvd(i.dest, i.src1, e.xmm0); + break; + default: + XEASSERTALWAYS(); + break; + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_VECTOR_SHR, + VECTOR_SHR_V128); + + +// ============================================================================ +// OPCODE_VECTOR_SHA +// ============================================================================ +EMITTER(VECTOR_SHA_V128, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case INT32_TYPE: + // src shift mask may have values >31, and x86 sets to zero when + // that happens so we mask. + e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); + e.vpsravd(i.dest, i.src1, e.xmm0); + break; + default: + XEASSERTALWAYS(); + break; + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_VECTOR_SHA, + VECTOR_SHA_V128); + + +// ============================================================================ +// OPCODE_ROTATE_LEFT +// ============================================================================ +// TODO(benvanik): put dest/src1 together, src2 in cl. +template +void EmitRotateLeftXX(X64Emitter& e, const ARGS& i) { + if (i.src2.is_constant) { + // Constant rotate. 
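An aside before the constant/variable rotate split below: x86 accepts a variable rotate count only in CL, which is why the helper moves src2 into cl and then calls ReloadECX() to restore whatever the backend keeps resident in rcx. A portable model of the operation itself:

    #include <cassert>
    #include <cstdint>

    static uint32_t rotl32(uint32_t v, uint32_t n) {
      n &= 31;  // like the hardware, only the low five bits matter
      return n ? (v << n) | (v >> (32 - n)) : v;
    }

    int main() {
      assert(rotl32(0x80000001u, 1) == 0x00000003u);
      assert(rotl32(0x12345678u, 32) == 0x12345678u);
      return 0;
    }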
+ if (i.dest != i.src1) { + if (i.src1.is_constant) { + e.mov(i.dest, i.src1.constant()); + } else { + e.mov(i.dest, i.src1); + } + } + e.rol(i.dest, i.src2.constant()); + } else { + // Variable rotate. + if (i.src2.reg().getIdx() != e.cl.getIdx()) { + e.mov(e.cl, i.src2); + } + if (i.dest != i.src1) { + if (i.src1.is_constant) { + e.mov(i.dest, i.src1.constant()); + } else { + e.mov(i.dest, i.src1); + } + } + e.rol(i.dest, e.cl); + e.ReloadECX(); + } +} +EMITTER(ROTATE_LEFT_I8, MATCH(I, I8<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitRotateLeftXX(e, i); + } +}; +EMITTER(ROTATE_LEFT_I16, MATCH(I, I16<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitRotateLeftXX(e, i); + } +}; +EMITTER(ROTATE_LEFT_I32, MATCH(I, I32<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitRotateLeftXX(e, i); + } +}; +EMITTER(ROTATE_LEFT_I64, MATCH(I, I64<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitRotateLeftXX(e, i); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_ROTATE_LEFT, + ROTATE_LEFT_I8, + ROTATE_LEFT_I16, + ROTATE_LEFT_I32, + ROTATE_LEFT_I64); + + +// ============================================================================ +// OPCODE_BYTE_SWAP +// ============================================================================ +// TODO(benvanik): put dest/src1 together. +EMITTER(BYTE_SWAP_I16, MATCH(I, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitUnaryOp( + e, i, + [](X64Emitter& e, const Reg16& dest_src) { e.ror(dest_src, 8); }); + } +}; +EMITTER(BYTE_SWAP_I32, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitUnaryOp( + e, i, + [](X64Emitter& e, const Reg32& dest_src) { e.bswap(dest_src); }); + } +}; +EMITTER(BYTE_SWAP_I64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitUnaryOp( + e, i, + [](X64Emitter& e, const Reg64& dest_src) { e.bswap(dest_src); }); + } +}; +EMITTER(BYTE_SWAP_V128, MATCH(I, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): find a way to do this without the memory load. + e.vpshufb(i.dest, i.src1, e.GetXmmConstPtr(XMMByteSwapMask)); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_BYTE_SWAP, + BYTE_SWAP_I16, + BYTE_SWAP_I32, + BYTE_SWAP_I64, + BYTE_SWAP_V128); + + +// ============================================================================ +// OPCODE_CNTLZ +// ============================================================================ +EMITTER(CNTLZ_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // No 8bit lzcnt, so do 16 and sub 8. 
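An aside on the CNTLZ_I8 widening trick here (movzx to 16 bits, lzcnt, subtract 8): counting leading zeros of the zero-extended byte overcounts by exactly the eight padding bits, so subtracting 8 recovers the 8-bit answer, including lzcnt's zero-input case (16 - 8 = 8). A scalar model:

    #include <cassert>
    #include <cstdint>

    static uint32_t cntlz8(uint8_t v) {
      // Count leading zeros in the 16-bit widening, as lzcnt16 would.
      uint32_t lz16 = 16;
      for (uint16_t x = v; x; x >>= 1) --lz16;
      return lz16 - 8;
    }

    int main() {
      assert(cntlz8(0x80) == 0);
      assert(cntlz8(0x01) == 7);
      assert(cntlz8(0x00) == 8);
      return 0;
    }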
+    e.movzx(i.dest.reg().cvt16(), i.src1);
+    e.lzcnt(i.dest.reg().cvt16(), i.dest.reg().cvt16());
+    e.sub(i.dest, 8);
+  }
+};
+EMITTER(CNTLZ_I16, MATCH(I<OPCODE_CNTLZ, I8<>, I16<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.lzcnt(i.dest.reg().cvt32(), i.src1);
+  }
+};
+EMITTER(CNTLZ_I32, MATCH(I<OPCODE_CNTLZ, I8<>, I32<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.lzcnt(i.dest.reg().cvt32(), i.src1);
+  }
+};
+EMITTER(CNTLZ_I64, MATCH(I<OPCODE_CNTLZ, I8<>, I64<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.lzcnt(i.dest.reg().cvt64(), i.src1);
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_CNTLZ,
+    CNTLZ_I8,
+    CNTLZ_I16,
+    CNTLZ_I32,
+    CNTLZ_I64);
+
+
+// ============================================================================
+// OPCODE_INSERT
+// ============================================================================
+
+
+// ============================================================================
+// OPCODE_EXTRACT
+// ============================================================================
+// TODO(benvanik): sequence extract/splat:
+//  v0.i32 = extract v0.v128, 0
+//  v0.v128 = splat v0.i32
+// This can be a single broadcast.
+EMITTER(EXTRACT_I8, MATCH(I<OPCODE_EXTRACT, I8<>, V128<>, I8<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    if (i.src2.is_constant) {
+      e.vpextrb(i.dest, i.src1, i.src2.constant());
+    } else {
+      XEASSERTALWAYS();
+    }
+  }
+};
+EMITTER(EXTRACT_I16, MATCH(I<OPCODE_EXTRACT, I16<>, V128<>, I8<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    if (i.src2.is_constant) {
+      e.vpextrw(i.dest, i.src1, i.src2.constant());
+    } else {
+      XEASSERTALWAYS();
+    }
+  }
+};
+EMITTER(EXTRACT_I32, MATCH(I<OPCODE_EXTRACT, I32<>, V128<>, I8<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    static vec128_t extract_table_32[4] = {
+      vec128b( 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
+      vec128b( 7, 6, 5, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
+      vec128b(11, 10, 9, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
+      vec128b(15, 14, 13, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
+    };
+    if (i.src2.is_constant) {
+      e.vpextrd(i.dest, i.src1, i.src2.constant());
+    } else {
+      // Get the desired word in xmm0, then extract that.
+      // TODO(benvanik): find a better way, this sequence is terrible.
+      e.xor(e.rax, e.rax);
+      e.mov(e.al, i.src2);
+      e.and(e.al, 0x03);
+      e.shl(e.al, 4);
+      e.mov(e.rdx, reinterpret_cast<uintptr_t>(extract_table_32));
+      e.vmovaps(e.xmm0, e.ptr[e.rdx + e.rax]);
+      e.vpshufb(e.xmm0, i.src1, e.xmm0);
+      e.vpextrd(i.dest, e.xmm0, 0);
+      e.ReloadEDX();
+    }
+  }
+};
+EMITTER(EXTRACT_F32, MATCH(I<OPCODE_EXTRACT, F32<>, V128<>, I8<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    if (i.src2.is_constant) {
+      e.vextractps(i.dest, i.src1, i.src2.constant());
+    } else {
+      XEASSERTALWAYS();
+    }
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_EXTRACT,
+    EXTRACT_I8,
+    EXTRACT_I16,
+    EXTRACT_I32,
+    EXTRACT_F32);
+
+
+// ============================================================================
+// OPCODE_SPLAT
+// ============================================================================
+EMITTER(SPLAT_I8, MATCH(I<OPCODE_SPLAT, V128<>, I8<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    if (i.src1.is_constant) {
+      // TODO(benvanik): faster constant splats.
+      e.mov(e.al, i.src1.constant());
+      e.vmovd(e.xmm0, e.eax);
+      e.vpbroadcastb(i.dest, e.xmm0);
+    } else {
+      e.vmovd(e.xmm0, i.src1.reg().cvt32());
+      e.vpbroadcastb(i.dest, e.xmm0);
+    }
+  }
+};
+EMITTER(SPLAT_I16, MATCH(I<OPCODE_SPLAT, V128<>, I16<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    if (i.src1.is_constant) {
+      // TODO(benvanik): faster constant splats.
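Stepping back to the variable-index EXTRACT_I32 path above: a small table of byte-gather masks lets one pshufb move the selected word into lane 0, where a fixed-index extract can reach it. A simplified SSSE3 sketch in identity byte order (the emulator's table above also folds in the guest byte swap):

    #include <tmmintrin.h>  // SSSE3, e.g. -mssse3
    #include <cstdint>
    #include <cstdio>

    int main() {
      alignas(16) static const int8_t masks[4][16] = {
        { 0,  1,  2,  3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        { 4,  5,  6,  7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        { 8,  9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
      };
      __m128i v = _mm_setr_epi32(10, 20, 30, 40);
      uint32_t index = 2;  // not known at compile time
      __m128i mask = _mm_load_si128(
          reinterpret_cast<const __m128i*>(masks[index & 3]));
      __m128i gathered = _mm_shuffle_epi8(v, mask);  // word 2 -> lane 0
      printf("%d\n", _mm_cvtsi128_si32(gathered));   // 30
      return 0;
    }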
+ e.mov(e.ax, i.src1.constant()); + e.vmovd(e.xmm0, e.eax); + e.vpbroadcastw(i.dest, e.xmm0); + } else { + e.vmovd(e.xmm0, i.src1.reg().cvt32()); + e.vpbroadcastw(i.dest, e.xmm0); + } + } +}; +EMITTER(SPLAT_I32, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + // TODO(benvanik): faster constant splats. + e.mov(e.eax, i.src1.constant()); + e.vmovd(e.xmm0, e.eax); + e.vpbroadcastd(i.dest, e.xmm0); + } else { + e.vmovd(e.xmm0, i.src1); + e.vpbroadcastd(i.dest, e.xmm0); + } + } +}; +EMITTER(SPLAT_F32, MATCH(I, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + // TODO(benvanik): faster constant splats. + e.mov(e.eax, i.src1.value->constant.i32); + e.vmovd(e.xmm0, e.eax); + e.vbroadcastss(i.dest, e.xmm0); + } else { + e.vbroadcastss(i.dest, i.src1); + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_SPLAT, + SPLAT_I8, + SPLAT_I16, + SPLAT_I32, + SPLAT_F32); + + +// ============================================================================ +// OPCODE_PERMUTE +// ============================================================================ +EMITTER(PERMUTE_I32, MATCH(I, I32<>, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // Permute words between src2 and src3. + // TODO(benvanik): check src3 for zero. if 0, we can use pshufb. + if (i.src1.is_constant) { + uint32_t control = i.src1.constant(); + // Shuffle things into the right places in dest & xmm0, + // then we blend them together. + uint32_t src_control = + (((control >> 24) & 0x3) << 0) | + (((control >> 16) & 0x3) << 2) | + (((control >> 8) & 0x3) << 4) | + (((control >> 0) & 0x3) << 6); + uint32_t blend_control = + (((control >> 26) & 0x1) << 0) | + (((control >> 18) & 0x1) << 1) | + (((control >> 10) & 0x1) << 2) | + (((control >> 2) & 0x1) << 3); + if (i.dest != i.src3) { + e.vpshufd(i.dest, i.src2, src_control); + e.vpshufd(e.xmm0, i.src3, src_control); + e.vpblendd(i.dest, e.xmm0, blend_control); + } else { + e.vmovaps(e.xmm0, i.src3); + e.vpshufd(i.dest, i.src2, src_control); + e.vpshufd(e.xmm0, e.xmm0, src_control); + e.vpblendd(i.dest, e.xmm0, blend_control); + } + } else { + // Permute by non-constant. + XEASSERTALWAYS(); + } + } +}; +EMITTER(PERMUTE_V128, MATCH(I, V128<>, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): find out how to do this with only one temp register! + // Permute bytes between src2 and src3. + if (i.src3.value->IsConstantZero()) { + // Permuting with src2/zero, so just shuffle/mask. + if (i.src2.value->IsConstantZero()) { + // src2 & src3 are zero, so result will always be zero. + e.vpxor(i.dest, i.dest); + } else { + // Control mask needs to be shuffled. + e.vpshufb(e.xmm0, i.src1, e.GetXmmConstPtr(XMMByteSwapMask)); + if (i.src2.is_constant) { + e.LoadConstantXmm(i.dest, i.src2.constant()); + e.vpshufb(i.dest, i.dest, e.xmm0); + } else { + e.vpshufb(i.dest, i.src2, e.xmm0); + } + // Build a mask with values in src2 having 0 and values in src3 having 1. + e.vpcmpgtb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMPermuteControl15)); + e.vpandn(i.dest, e.xmm0, i.dest); + } + } else { + // General permute. + // Control mask needs to be shuffled. + if (i.src1.is_constant) { + e.LoadConstantXmm(e.xmm2, i.src1.constant()); + e.vpshufb(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMByteSwapMask)); + } else { + e.vpshufb(e.xmm2, i.src1, e.GetXmmConstPtr(XMMByteSwapMask)); + } + // Build a mask with values in src2 having 0 and values in src3 having 1. 
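An aside on the PERMUTE_I32 constant path above: each control byte picks one of eight source words (0-3 from src2, 4-7 from src3), so two bits per lane feed the vpshufd immediate and the third bit per lane feeds the vpblendd mask. Working that bit-slicing through for one concrete control value:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Lanes take src2[0], src2[1], src3[0], src3[1].
      uint32_t control = 0x00010405;
      uint32_t src_control =
          (((control >> 24) & 0x3) << 0) |
          (((control >> 16) & 0x3) << 2) |
          (((control >> 8) & 0x3) << 4) |
          (((control >> 0) & 0x3) << 6);
      uint32_t blend_control =
          (((control >> 26) & 0x1) << 0) |
          (((control >> 18) & 0x1) << 1) |
          (((control >> 10) & 0x1) << 2) |
          (((control >> 2) & 0x1) << 3);
      // pshufd imm 0x44 (words 0,1,0,1); blend imm 0xC (lanes 2,3 from src3).
      printf("pshufd=%02x blend=%x\n", src_control, blend_control);
      return 0;
    }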
+ e.vpcmpgtb(i.dest, e.xmm2, e.GetXmmConstPtr(XMMPermuteControl15)); + Xmm src2_shuf = e.xmm0; + if (i.src2.value->IsConstantZero()) { + e.vpxor(src2_shuf, src2_shuf); + } else if (i.src2.is_constant) { + e.LoadConstantXmm(src2_shuf, i.src2.constant()); + e.vpshufb(src2_shuf, src2_shuf, e.xmm2); + } else { + e.vpshufb(src2_shuf, i.src2, e.xmm2); + } + Xmm src3_shuf = e.xmm1; + if (i.src3.value->IsConstantZero()) { + e.vpxor(src3_shuf, src3_shuf); + } else if (i.src3.is_constant) { + e.LoadConstantXmm(src3_shuf, i.src3.constant()); + e.vpshufb(src3_shuf, src3_shuf, e.xmm2); + } else { + e.vpshufb(src3_shuf, i.src3, e.xmm2); + } + e.vpblendvb(i.dest, src2_shuf, src3_shuf, i.dest); + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_PERMUTE, + PERMUTE_I32, + PERMUTE_V128); + + +// ============================================================================ +// OPCODE_SWIZZLE +// ============================================================================ +EMITTER(SWIZZLE, MATCH(I, V128<>, OffsetOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto element_type = i.instr->flags; + if (element_type == INT8_TYPE) { + XEASSERTALWAYS(); + } else if (element_type == INT16_TYPE) { + XEASSERTALWAYS(); + } else if (element_type == INT32_TYPE || element_type == FLOAT32_TYPE) { + uint8_t swizzle_mask = static_cast(i.src2.value); + swizzle_mask = + (((swizzle_mask >> 6) & 0x3) << 0) | + (((swizzle_mask >> 4) & 0x3) << 2) | + (((swizzle_mask >> 2) & 0x3) << 4) | + (((swizzle_mask >> 0) & 0x3) << 6); + e.vpshufd(i.dest, i.src1, swizzle_mask); + } else if (element_type == INT64_TYPE || element_type == FLOAT64_TYPE) { + XEASSERTALWAYS(); + } else { + XEASSERTALWAYS(); + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_SWIZZLE, + SWIZZLE); + + +// ============================================================================ +// OPCODE_PACK +// ============================================================================ +EMITTER(PACK, MATCH(I, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case PACK_TYPE_D3DCOLOR: + EmitD3DCOLOR(e, i); + break; + case PACK_TYPE_FLOAT16_2: + EmitFLOAT16_2(e, i); + break; + case PACK_TYPE_FLOAT16_4: + EmitFLOAT16_4(e, i); + break; + case PACK_TYPE_SHORT_2: + EmitSHORT_2(e, i); + break; + case PACK_TYPE_S8_IN_16_LO: + EmitS8_IN_16_LO(e, i); + break; + case PACK_TYPE_S8_IN_16_HI: + EmitS8_IN_16_HI(e, i); + break; + case PACK_TYPE_S16_IN_32_LO: + EmitS16_IN_32_LO(e, i); + break; + case PACK_TYPE_S16_IN_32_HI: + EmitS16_IN_32_HI(e, i); + break; + default: XEASSERTALWAYS(); break; + } + } + static void EmitD3DCOLOR(X64Emitter& e, const EmitArgType& i) { + XEASSERTALWAYS(); + } + static void EmitFLOAT16_2(X64Emitter& e, const EmitArgType& i) { + XEASSERTALWAYS(); + } + static void EmitFLOAT16_4(X64Emitter& e, const EmitArgType& i) { + XEASSERTALWAYS(); + } + static void EmitSHORT_2(X64Emitter& e, const EmitArgType& i) { + XEASSERTALWAYS(); + } + static void EmitS8_IN_16_LO(X64Emitter& e, const EmitArgType& i) { + XEASSERTALWAYS(); + } + static void EmitS8_IN_16_HI(X64Emitter& e, const EmitArgType& i) { + XEASSERTALWAYS(); + } + static void EmitS16_IN_32_LO(X64Emitter& e, const EmitArgType& i) { + XEASSERTALWAYS(); + } + static void EmitS16_IN_32_HI(X64Emitter& e, const EmitArgType& i) { + XEASSERTALWAYS(); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_PACK, + PACK); + + +// ============================================================================ +// OPCODE_UNPACK +// 
============================================================================
+EMITTER(UNPACK, MATCH(I<OPCODE_UNPACK, V128<>, V128<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    switch (i.instr->flags) {
+      case PACK_TYPE_D3DCOLOR:
+        EmitD3DCOLOR(e, i);
+        break;
+      case PACK_TYPE_FLOAT16_2:
+        EmitFLOAT16_2(e, i);
+        break;
+      case PACK_TYPE_FLOAT16_4:
+        EmitFLOAT16_4(e, i);
+        break;
+      case PACK_TYPE_SHORT_2:
+        EmitSHORT_2(e, i);
+        break;
+      case PACK_TYPE_S8_IN_16_LO:
+        EmitS8_IN_16_LO(e, i);
+        break;
+      case PACK_TYPE_S8_IN_16_HI:
+        EmitS8_IN_16_HI(e, i);
+        break;
+      case PACK_TYPE_S16_IN_32_LO:
+        EmitS16_IN_32_LO(e, i);
+        break;
+      case PACK_TYPE_S16_IN_32_HI:
+        EmitS16_IN_32_HI(e, i);
+        break;
+      default: XEASSERTALWAYS(); break;
+    }
+  }
+  static void EmitD3DCOLOR(X64Emitter& e, const EmitArgType& i) {
+    // ARGB (WXYZ) -> RGBA (XYZW)
+    // XMLoadColor
+    // int32_t src = (int32_t)src1.iw;
+    // dest.f4[0] = (float)((src >> 16) & 0xFF) * (1.0f / 255.0f);
+    // dest.f4[1] = (float)((src >> 8) & 0xFF) * (1.0f / 255.0f);
+    // dest.f4[2] = (float)(src & 0xFF) * (1.0f / 255.0f);
+    // dest.f4[3] = (float)((src >> 24) & 0xFF) * (1.0f / 255.0f);
+
+    // src = ZZYYXXWW
+    // unpack to 000000ZZ,000000YY,000000XX,000000WW
+    e.vpshufb(i.dest, i.src1, e.GetXmmConstPtr(XMMUnpackD3DCOLOR));
+    // mult by 1/255
+    e.vmulps(i.dest, e.GetXmmConstPtr(XMMOneOver255));
+  }
+  static void Unpack_FLOAT16_2(void* raw_context, __m128& v) {
+    uint32_t src = v.m128_i32[3];
+    v.m128_f32[0] = DirectX::PackedVector::XMConvertHalfToFloat((uint16_t)src);
+    v.m128_f32[1] = DirectX::PackedVector::XMConvertHalfToFloat((uint16_t)(src >> 16));
+    v.m128_f32[2] = 0.0f;
+    v.m128_f32[3] = 1.0f;
+  }
+  static void EmitFLOAT16_2(X64Emitter& e, const EmitArgType& i) {
+    // 1 bit sign, 5 bit exponent, 10 bit mantissa
+    // D3D10 half float format
+    // TODO(benvanik): http://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx
+    // Use _mm_cvtph_ps -- requires F16C support (Ivy Bridge and newer).
+    // Unpacking half floats: http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
+    // Packing half floats: https://gist.github.com/rygorous/2156668
+    // Load source, move from tight pack of X16Y16.... to X16...Y16...
+    // Also zero out the high end.
+    // TODO(benvanik): special case constant unpacks that just get 0/1/etc.
+
+    // sx = src.iw >> 16;
+    // sy = src.iw & 0xFFFF;
+    // dest = { XMConvertHalfToFloat(sx),
+    //          XMConvertHalfToFloat(sy),
+    //          0.0,
+    //          1.0 };
+    auto addr = e.StashXmm(i.src1);
+    e.lea(e.rdx, addr);
+    e.CallNative(Unpack_FLOAT16_2);
+    e.vmovaps(i.dest, addr);
+  }
+  static void EmitFLOAT16_4(X64Emitter& e, const EmitArgType& i) {
+    // Could be shared with FLOAT16_2.
+    XEASSERTALWAYS();
+  }
+  static void EmitSHORT_2(X64Emitter& e, const EmitArgType& i) {
+    // (VD.x) = 3.0 + (VB.x>>16)*2^-22
+    // (VD.y) = 3.0 + (VB.x)*2^-22
+    // (VD.z) = 0.0
+    // (VD.w) = 1.0
+
+    // XMLoadShortN2 plus 3,3,0,3 (for some reason)
+    // src is (xx,xx,xx,VALUE)
+    // (VALUE,VALUE,VALUE,VALUE)
+    if (i.src1.is_constant) {
+      if (i.src1.value->IsConstantZero()) {
+        e.vpxor(i.dest, i.dest);
+      } else {
+        // TODO(benvanik): check other common constants.
+        e.LoadConstantXmm(i.dest, i.src1.constant());
+        e.vbroadcastss(i.dest, i.src1);
+      }
+    } else {
+      e.vbroadcastss(i.dest, i.src1);
+    }
+    // (VALUE&0xFFFF,VALUE&0xFFFF0000,0,0)
+    e.vandps(i.dest, e.GetXmmConstPtr(XMMMaskX16Y16));
+    // Sign extend.
+    e.vxorps(i.dest, e.GetXmmConstPtr(XMMFlipX16Y16));
+    // Convert int->float.
+    e.cvtpi2ps(i.dest, e.StashXmm(i.dest));
+    // 0x8000 to undo sign.
+ e.vaddps(i.dest, e.GetXmmConstPtr(XMMFixX16Y16)); + // Normalize. + e.vmulps(i.dest, e.GetXmmConstPtr(XMMNormalizeX16Y16)); + // Clamp. + e.vmaxps(i.dest, e.GetXmmConstPtr(XMMNegativeOne)); + // Add 3,3,0,1. + e.vaddps(i.dest, e.GetXmmConstPtr(XMM3301)); + } + static void EmitS8_IN_16_LO(X64Emitter& e, const EmitArgType& i) { + XEASSERTALWAYS(); + } + static void EmitS8_IN_16_HI(X64Emitter& e, const EmitArgType& i) { + XEASSERTALWAYS(); + } + static void EmitS16_IN_32_LO(X64Emitter& e, const EmitArgType& i) { + XEASSERTALWAYS(); + } + static void EmitS16_IN_32_HI(X64Emitter& e, const EmitArgType& i) { + XEASSERTALWAYS(); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_UNPACK, + UNPACK); + + +// ============================================================================ +// OPCODE_COMPARE_EXCHANGE +// ============================================================================ + + +// ============================================================================ +// OPCODE_ATOMIC_EXCHANGE +// ============================================================================ +// Note that the address we use here is a real, host address! +// This is weird, and should be fixed. +template +void EmitAtomicExchangeXX(X64Emitter& e, const ARGS& i) { + if (i.dest == i.src1) { + e.mov(e.rax, i.src1); + if (i.dest != i.src2) { + if (i.src2.is_constant) { + e.mov(i.dest, i.src2.constant()); + } else { + e.mov(i.dest, i.src2); + } + } + e.lock(); + e.xchg(e.dword[e.rax], i.dest); + } else { + if (i.dest != i.src2) { + if (i.src2.is_constant) { + e.mov(i.dest, i.src2.constant()); + } else { + e.mov(i.dest, i.src2); + } + } + e.lock(); + e.xchg(e.dword[i.src1.reg()], i.dest); + } +} +EMITTER(ATOMIC_EXCHANGE_I8, MATCH(I, I64<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAtomicExchangeXX(e, i); + } +}; +EMITTER(ATOMIC_EXCHANGE_I16, MATCH(I, I64<>, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAtomicExchangeXX(e, i); + } +}; +EMITTER(ATOMIC_EXCHANGE_I32, MATCH(I, I64<>, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAtomicExchangeXX(e, i); + } +}; +EMITTER(ATOMIC_EXCHANGE_I64, MATCH(I, I64<>, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAtomicExchangeXX(e, i); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_ATOMIC_EXCHANGE, + ATOMIC_EXCHANGE_I8, + ATOMIC_EXCHANGE_I16, + ATOMIC_EXCHANGE_I32, + ATOMIC_EXCHANGE_I64); + + +// ============================================================================ +// OPCODE_ATOMIC_ADD +// ============================================================================ + + +// ============================================================================ +// OPCODE_ATOMIC_SUB +// ============================================================================ + + + + +//SEQUENCE(ADD_ADD_BRANCH, MATCH( +// I, I32<>, I32C<>>, +// I, I32, I32C<>>, +// I)) { +// static void Emit(X64Emitter& e, const EmitArgs& _) { +// } +//}; + + + +void alloy::backend::x64::RegisterSequences() { + #define REGISTER_EMITTER_OPCODE_TABLE(opcode) Register_##opcode() + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMMENT); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_NOP); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_SOURCE_OFFSET); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_DEBUG_BREAK); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_DEBUG_BREAK_TRUE); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_TRAP); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_TRAP_TRUE); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_CALL); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_CALL_TRUE); 
+ REGISTER_EMITTER_OPCODE_TABLE(OPCODE_CALL_INDIRECT); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_CALL_INDIRECT_TRUE); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_CALL_EXTERN); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_RETURN); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_RETURN_TRUE); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_SET_RETURN_ADDRESS); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_BRANCH); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_BRANCH_TRUE); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_BRANCH_FALSE); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_ASSIGN); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_CAST); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_ZERO_EXTEND); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_SIGN_EXTEND); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_TRUNCATE); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_CONVERT); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_ROUND); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_CONVERT_I2F); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_CONVERT_F2I); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_LOAD_VECTOR_SHL); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_LOAD_VECTOR_SHR); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_LOAD_CLOCK); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_LOAD_LOCAL); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_STORE_LOCAL); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_LOAD_CONTEXT); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_STORE_CONTEXT); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_LOAD); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_STORE); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_PREFETCH); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_MAX); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_MIN); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_SELECT); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_IS_TRUE); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_IS_FALSE); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_EQ); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_NE); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_SLT); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_SLE); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_SGT); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_SGE); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_ULT); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_ULE); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_UGT); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_UGE); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_DID_CARRY); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_DID_OVERFLOW); + //REGISTER_EMITTER_OPCODE_TABLE(OPCODE_DID_SATURATE); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_EQ); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_SGT); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_SGE); + //REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_UGT); + //REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_UGE); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_ADD); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_ADD_CARRY); + //REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ADD); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_SUB); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_MUL); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_MUL_HI); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_DIV); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_MUL_ADD); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_MUL_SUB); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_NEG); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_ABS); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_SQRT); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_RSQRT); + //REGISTER_EMITTER_OPCODE_TABLE(OPCODE_POW2); + //REGISTER_EMITTER_OPCODE_TABLE(OPCODE_LOG2); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_DOT_PRODUCT_3); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_DOT_PRODUCT_4); + 
REGISTER_EMITTER_OPCODE_TABLE(OPCODE_AND); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_OR); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_XOR); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_NOT); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_SHL); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_SHR); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_SHA); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHL); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHR); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHA); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_ROTATE_LEFT); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_BYTE_SWAP); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_CNTLZ); + //REGISTER_EMITTER_OPCODE_TABLE(OPCODE_INSERT); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_EXTRACT); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_SPLAT); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_PERMUTE); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_SWIZZLE); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_PACK); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_UNPACK); + //REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_EXCHANGE); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_ATOMIC_EXCHANGE); + //REGISTER_EMITTER_OPCODE_TABLE(OPCODE_ATOMIC_ADD); + //REGISTER_EMITTER_OPCODE_TABLE(OPCODE_ATOMIC_SUB); +} + +bool alloy::backend::x64::SelectSequence(X64Emitter& e, const Instr* i, const Instr** new_tail) { + const InstrKey key(i); + const auto its = sequence_table.equal_range(key); + for (auto it = its.first; it != its.second; ++it) { + if (it->second(e, i, new_tail)) { + return true; + } + } + XELOGE("No sequence match for variant %s", i->opcode->name); + return false; +} diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.h b/src/alloy/backend/x64/x64_sequences.h similarity index 59% rename from src/alloy/backend/x64/lowering/lowering_sequences.h rename to src/alloy/backend/x64/x64_sequences.h index 634d52f47..5a77e9987 100644 --- a/src/alloy/backend/x64/lowering/lowering_sequences.h +++ b/src/alloy/backend/x64/x64_sequences.h @@ -2,32 +2,32 @@ ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * + * Copyright 2014 Ben Vanik. All rights reserved. * * Released under the BSD license - see LICENSE in the root for more details. 
* ****************************************************************************** */ -#ifndef ALLOY_BACKEND_X64_X64_LOWERING_LOWERING_SEQUENCES_H_ -#define ALLOY_BACKEND_X64_X64_LOWERING_LOWERING_SEQUENCES_H_ +#ifndef ALLOY_BACKEND_X64_X64_SEQUENCES_H_ +#define ALLOY_BACKEND_X64_X64_SEQUENCES_H_ #include -#include +XEDECLARECLASS2(alloy, hir, Instr); namespace alloy { namespace backend { namespace x64 { -namespace lowering { -class LoweringTable; - -void RegisterSequences(LoweringTable* table); +class X64Emitter; + + +void RegisterSequences(); +bool SelectSequence(X64Emitter& e, const hir::Instr* i, const hir::Instr** new_tail); -} // namespace lowering } // namespace x64 } // namespace backend } // namespace alloy -#endif // ALLOY_BACKEND_X64_X64_LOWERING_LOWERING_SEQUENCES_H_ +#endif // ALLOY_BACKEND_X64_X64_SEQUENCES_H_ diff --git a/src/alloy/backend/x64/lowering/tracers.cc b/src/alloy/backend/x64/x64_tracers.cc similarity index 96% rename from src/alloy/backend/x64/lowering/tracers.cc rename to src/alloy/backend/x64/x64_tracers.cc index f1c18f882..0ebb699cb 100644 --- a/src/alloy/backend/x64/lowering/tracers.cc +++ b/src/alloy/backend/x64/x64_tracers.cc @@ -7,7 +7,7 @@ ****************************************************************************** */ -#include +#include #include #include @@ -15,19 +15,14 @@ using namespace alloy; using namespace alloy::backend::x64; -using namespace alloy::backend::x64::lowering; using namespace alloy::runtime; namespace alloy { namespace backend { namespace x64 { -namespace lowering { - -#define IFLUSH() -#define IPRINT -#define DFLUSH() -#define DPRINT +#define ITRACE 0 +#define DTRACE 0 #define TARGET_THREAD 1 @@ -36,6 +31,16 @@ namespace lowering { #define DFLUSH() fflush(stdout) #define DPRINT DFLUSH(); if (thread_state->thread_id() == TARGET_THREAD) printf +uint32_t GetTracingMode() { + uint32_t mode = 0; +#if ITRACE + mode |= TRACING_INSTR; +#endif // ITRACE +#if DTRACE + mode |= TRACING_DATA; +#endif // DTRACE + return mode; +} void TraceString(void* raw_context, const char* str) { auto thread_state = *((ThreadState**)raw_context); @@ -190,7 +195,6 @@ void TraceMemoryStoreV128(void* raw_context, uint64_t address, __m128 value) { } -} // namespace lowering } // namespace x64 } // namespace backend } // namespace alloy diff --git a/src/alloy/backend/x64/lowering/tracers.h b/src/alloy/backend/x64/x64_tracers.h similarity index 89% rename from src/alloy/backend/x64/lowering/tracers.h rename to src/alloy/backend/x64/x64_tracers.h index 7201b4f25..64c788ff3 100644 --- a/src/alloy/backend/x64/lowering/tracers.h +++ b/src/alloy/backend/x64/x64_tracers.h @@ -7,8 +7,8 @@ ****************************************************************************** */ -#ifndef ALLOY_BACKEND_X64_X64_LOWERING_TRACERS_H_ -#define ALLOY_BACKEND_X64_X64_LOWERING_TRACERS_H_ +#ifndef ALLOY_BACKEND_X64_X64_TRACERS_H_ +#define ALLOY_BACKEND_X64_X64_TRACERS_H_ #include @@ -33,7 +33,15 @@ namespace alloy { namespace backend { namespace x64 { class X64Emitter; -namespace lowering { + +enum TracingMode { + TRACING_INSTR = (1 << 1), + TRACING_DATA = (1 << 2), +}; + +uint32_t GetTracingMode(); +inline bool IsTracingInstr() { return (GetTracingMode() & TRACING_INSTR) != 0; } +inline bool IsTracingData() { return (GetTracingMode() & TRACING_DATA) != 0; } void TraceString(void* raw_context, const char* str); @@ -69,10 +77,9 @@ void TraceMemoryStoreF32(void* raw_context, uint64_t address, __m128 value); void TraceMemoryStoreF64(void* raw_context, uint64_t address, __m128 value); void 
TraceMemoryStoreV128(void* raw_context, uint64_t address, __m128 value); -} // namespace lowering } // namespace x64 } // namespace backend } // namespace alloy -#endif // ALLOY_BACKEND_X64_X64_LOWERING_TRACERS_H_ +#endif // ALLOY_BACKEND_X64_X64_TRACERS_H_ diff --git a/src/alloy/compiler/passes/constant_propagation_pass.cc b/src/alloy/compiler/passes/constant_propagation_pass.cc index 03a514a94..a481d18af 100644 --- a/src/alloy/compiler/passes/constant_propagation_pass.cc +++ b/src/alloy/compiler/passes/constant_propagation_pass.cc @@ -368,6 +368,13 @@ int ConstantPropagationPass::Run(HIRBuilder* builder) { i->Remove(); } break; + case OPCODE_CNTLZ: + if (i->src1.value->IsConstant()) { + v->set_zero(v->type); + v->CountLeadingZeros(i->src1.value->constant); + i->Remove(); + } + break; // TODO(benvanik): INSERT/EXTRACT // TODO(benvanik): SPLAT/PERMUTE/SWIZZLE case OPCODE_SPLAT: diff --git a/src/alloy/compiler/passes/context_promotion_pass.cc b/src/alloy/compiler/passes/context_promotion_pass.cc index a5123486b..c880c4f0e 100644 --- a/src/alloy/compiler/passes/context_promotion_pass.cc +++ b/src/alloy/compiler/passes/context_promotion_pass.cc @@ -9,6 +9,8 @@ #include +#include + #include #include @@ -20,6 +22,10 @@ using namespace alloy::hir; using namespace alloy::runtime; +DEFINE_bool(store_all_context_values, false, + "Don't strip dead context stores to aid in debugging."); + + ContextPromotionPass::ContextPromotionPass() : context_values_size_(0), context_values_(0), CompilerPass() { @@ -69,10 +75,12 @@ int ContextPromotionPass::Run(HIRBuilder* builder) { } // Remove all dead stores. - block = builder->first_block(); - while (block) { - RemoveDeadStoresBlock(block); - block = block->next; + if (!FLAGS_store_all_context_values) { + block = builder->first_block(); + while (block) { + RemoveDeadStoresBlock(block); + block = block->next; + } } return 0; diff --git a/src/alloy/compiler/passes/control_flow_analysis_pass.cc b/src/alloy/compiler/passes/control_flow_analysis_pass.cc index bff651fe2..9c1abf118 100644 --- a/src/alloy/compiler/passes/control_flow_analysis_pass.cc +++ b/src/alloy/compiler/passes/control_flow_analysis_pass.cc @@ -13,12 +13,6 @@ #include #include -#pragma warning(push) -#pragma warning(disable : 4244) -#pragma warning(disable : 4267) -#include -#pragma warning(pop) - using namespace alloy; using namespace alloy::backend; using namespace alloy::compiler; diff --git a/src/alloy/compiler/passes/data_flow_analysis_pass.cc b/src/alloy/compiler/passes/data_flow_analysis_pass.cc index b4e1ea644..2a44f076d 100644 --- a/src/alloy/compiler/passes/data_flow_analysis_pass.cc +++ b/src/alloy/compiler/passes/data_flow_analysis_pass.cc @@ -36,8 +36,6 @@ DataFlowAnalysisPass::~DataFlowAnalysisPass() { } int DataFlowAnalysisPass::Run(HIRBuilder* builder) { - auto arena = builder->arena(); - // Linearize blocks so that we can detect cycles and propagate dependencies. 
uint32_t block_count = LinearizeBlocks(builder); diff --git a/src/alloy/compiler/passes/register_allocation_pass.cc b/src/alloy/compiler/passes/register_allocation_pass.cc index 20b4b021f..a89e1415c 100644 --- a/src/alloy/compiler/passes/register_allocation_pass.cc +++ b/src/alloy/compiler/passes/register_allocation_pass.cc @@ -9,6 +9,8 @@ #include +#include + using namespace alloy; using namespace alloy::backend; using namespace alloy::compiler; @@ -16,180 +18,135 @@ using namespace alloy::compiler::passes; using namespace alloy::hir; -struct RegisterAllocationPass::Interval { - uint32_t start_ordinal; - uint32_t end_ordinal; - Value* value; - RegisterFreeUntilSet* free_until_set; - // TODO(benvanik): reduce to offsets in arena? - struct Interval* next; - struct Interval* prev; +#define ASSERT_NO_CYCLES 0 - void AddToList(Interval** list_head) { - auto list_next = *list_head; - this->next = list_next; - if (list_next) { - list_next->prev = this; - } - *list_head = this; - } - - void InsertIntoList(Interval** list_head) { - auto it = *list_head; - while (it) { - if (it->start_ordinal > this->start_ordinal) { - // Went too far. Insert before this interval. - this->prev = it->prev; - this->next = it; - if (it->prev) { - it->prev->next = this; - } else { - *list_head = this; - } - it->prev = this; - return; - } - if (!it->next) { - // None found, add at tail. - it->next = this; - this->prev = it; - return; - } - it = it->next; - } - } - - void RemoveFromList(Interval** list_head) { - if (this->next) { - this->next->prev = this->prev; - } - if (this->prev) { - this->prev->next = this->next; - } else { - *list_head = this->next; - } - this->next = this->prev = NULL; - } -}; - -struct RegisterAllocationPass::Intervals { - Interval* unhandled; - Interval* active; - Interval* handled; -}; RegisterAllocationPass::RegisterAllocationPass( const MachineInfo* machine_info) : machine_info_(machine_info), CompilerPass() { - // Initialize register sets. The values of these will be - // cleared before use, so just the structure is required. + // Initialize register sets. + // TODO(benvanik): rewrite in a way that makes sense - this is terrible. 
auto mi_sets = machine_info->register_sets; - xe_zero_struct(&free_until_sets_, sizeof(free_until_sets_)); + xe_zero_struct(&usage_sets_, sizeof(usage_sets_)); uint32_t n = 0; while (mi_sets[n].count) { auto& mi_set = mi_sets[n]; - auto free_until_set = new RegisterFreeUntilSet(); - free_until_sets_.all_sets[n] = free_until_set; - free_until_set->count = mi_set.count; - free_until_set->set = &mi_set; + auto usage_set = new RegisterSetUsage(); + usage_sets_.all_sets[n] = usage_set; + usage_set->count = mi_set.count; + usage_set->set = &mi_set; if (mi_set.types & MachineInfo::RegisterSet::INT_TYPES) { - free_until_sets_.int_set = free_until_set; + usage_sets_.int_set = usage_set; } if (mi_set.types & MachineInfo::RegisterSet::FLOAT_TYPES) { - free_until_sets_.float_set = free_until_set; + usage_sets_.float_set = usage_set; } if (mi_set.types & MachineInfo::RegisterSet::VEC_TYPES) { - free_until_sets_.vec_set = free_until_set; + usage_sets_.vec_set = usage_set; } n++; } } RegisterAllocationPass::~RegisterAllocationPass() { - for (size_t n = 0; n < XECOUNT(free_until_sets_.all_sets); n++) { - if (!free_until_sets_.all_sets[n]) { + for (size_t n = 0; n < XECOUNT(usage_sets_.all_sets); n++) { + if (!usage_sets_.all_sets[n]) { break; } - delete free_until_sets_.all_sets[n]; + delete usage_sets_.all_sets[n]; } } int RegisterAllocationPass::Run(HIRBuilder* builder) { - // A (probably broken) implementation of a linear scan register allocator - // that operates directly on SSA form: - // http://www.christianwimmer.at/Publications/Wimmer10a/Wimmer10a.pdf - // - // Requirements: - // - SSA form (single definition for variables) - // - block should be in linear order: - // - dominators *should* come before (a->b->c) - // - loop block sequences *should not* have intervening non-loop blocks + // Simple per-block allocator that operates on SSA form. + // Registers do not move across blocks, though this could be + // optimized with some intra-block analysis (dominators/etc). + // Really, it'd just be nice to have someone who knew what they + // were doing lower SSA and do this right. - auto arena = scratch_arena(); - - // Renumber everything. uint32_t block_ordinal = 0; uint32_t instr_ordinal = 0; auto block = builder->first_block(); while (block) { // Sequential block ordinals. block->ordinal = block_ordinal++; + + // Reset all state. + PrepareBlockState(); + + // Renumber all instructions in the block. This is required so that + // we can sort the usage pointers below. auto instr = block->instr_head; while (instr) { // Sequential global instruction ordinals. instr->ordinal = instr_ordinal++; instr = instr->next; } - block = block->next; - } - // Compute all liveness ranges by walking forward through all - // blocks/instructions and checking the last use of each value. This lets - // us know the exact order in (block#,instr#) form, which is then used to - // setup the range. - // TODO(benvanik): ideally we would have a list of all values and not have - // to keep walking instructions over and over. - Interval* prev_interval = NULL; - Interval* head_interval = NULL; - block = builder->first_block(); - while (block) { - auto instr = block->instr_head; + instr = block->instr_head; while (instr) { - // Compute last-use for the dest value. - // Since we know all values of importance must be defined, we can avoid - // having to check every value and just look at dest. 
const OpcodeInfo* info = instr->opcode; - if (GET_OPCODE_SIG_TYPE_DEST(info->signature) == OPCODE_SIG_TYPE_V) { - auto v = instr->dest; - if (!v->last_use) { - ComputeLastUse(v); - } + uint32_t signature = info->signature; - // Add interval. - auto interval = arena->Alloc(); - interval->start_ordinal = instr->ordinal; - interval->end_ordinal = v->last_use ? - v->last_use->ordinal : v->def->ordinal; - interval->value = v; - interval->next = NULL; - interval->prev = prev_interval; - if (prev_interval) { - prev_interval->next = interval; - } else { - head_interval = interval; - } - prev_interval = interval; + // Update the register use heaps. + AdvanceUses(instr); - // Grab register set to use. - // We do this now so it's only once per interval, and it makes it easy - // to only compare intervals that overlap their sets. - if (v->type <= INT64_TYPE) { - interval->free_until_set = free_until_sets_.int_set; - } else if (v->type <= FLOAT64_TYPE) { - interval->free_until_set = free_until_sets_.float_set; + // Check sources for retirement. If any are unused after this instruction + // we can eagerly evict them to speed up register allocation. + // Since X64 (and other platforms) can often take advantage of dest==src1 + // register mappings we track retired src1 so that we can attempt to + // reuse it. + // NOTE: these checks require that the usage list be sorted! + bool has_preferred_reg = false; + RegAssignment preferred_reg = { 0 }; + if (GET_OPCODE_SIG_TYPE_SRC1(signature) == OPCODE_SIG_TYPE_V && + !instr->src1.value->IsConstant()) { + if (!instr->src1_use->next) { + // Pull off preferred register. We will try to reuse this for the + // dest. + has_preferred_reg = true; + preferred_reg = instr->src1.value->reg; + XEASSERTNOTNULL(preferred_reg.set); + } + } + + if (GET_OPCODE_SIG_TYPE_DEST(signature) == OPCODE_SIG_TYPE_V) { + // Must not have been set already. + XEASSERTNULL(instr->dest->reg.set); + + // Sort the usage list. We depend on this in future uses of this variable. + SortUsageList(instr->dest); + + // If we have a preferred register, use that. + // This way we can help along the stupid X86 two opcode instructions. + bool allocated; + if (has_preferred_reg) { + // Allocate with the given preferred register. If the register is in + // the wrong set it will not be reused. + allocated = TryAllocateRegister(instr->dest, preferred_reg); } else { - interval->free_until_set = free_until_sets_.vec_set; + // Allocate a register. This will either reserve a free one or + // spill and reuse an active one. + allocated = TryAllocateRegister(instr->dest); + } + if (!allocated) { + // Failed to allocate register -- need to spill and try again. + // We spill only those registers we aren't using. + if (!SpillOneRegister(builder, instr->dest->type)) { + // Unable to spill anything - this shouldn't happen. + XELOGE("Unable to spill any registers"); + XEASSERTALWAYS(); + return 1; + } + + // Demand allocation. + if (!TryAllocateRegister(instr->dest)) { + // Boned. + XELOGE("Register allocation failed"); + XEASSERTALWAYS(); + return 1; + } } } @@ -198,228 +155,266 @@ int RegisterAllocationPass::Run(HIRBuilder* builder) { block = block->next; } - // Now have a sorted list of intervals, minus their ending ordinals. - Intervals intervals; - intervals.unhandled = head_interval; - intervals.active = intervals.handled = NULL; - while (intervals.unhandled) { - // Get next unhandled interval. 
- auto current = intervals.unhandled; - intervals.unhandled = intervals.unhandled->next; - current->RemoveFromList(&intervals.unhandled); - - // Check for intervals in active that are handled or inactive. - auto it = intervals.active; - while (it) { - auto next = it->next; - if (it->end_ordinal <= current->start_ordinal) { - // Move from active to handled. - it->RemoveFromList(&intervals.active); - it->AddToList(&intervals.handled); - } - it = next; - } - - // Find a register for current. - if (!TryAllocateFreeReg(current, intervals)) { - // Failed, spill. - AllocateBlockedReg(builder, current, intervals); - } - - if (current->value->reg.index!= -1) { - // Add current to active. - current->AddToList(&intervals.active); - } - } - return 0; } -void RegisterAllocationPass::ComputeLastUse(Value* value) { - // TODO(benvanik): compute during construction? - // Note that this list isn't sorted (unfortunately), so we have to scan - // them all. - uint32_t max_ordinal = 0; - Value::Use* last_use = NULL; - auto use = value->use_head; - while (use) { - if (!last_use || use->instr->ordinal >= max_ordinal) { - last_use = use; - max_ordinal = use->instr->ordinal; - } - use = use->next; - } - value->last_use = last_use ? last_use->instr : NULL; -} - -bool RegisterAllocationPass::TryAllocateFreeReg( - Interval* current, Intervals& intervals) { - // Reset all registers in the set to unused. - auto free_until_set = current->free_until_set; - for (uint32_t n = 0; n < free_until_set->count; n++) { - free_until_set->pos[n] = -1; - } - - // Mark all active registers as used. - // TODO(benvanik): keep some kind of bitvector so that this is instant? - auto it = intervals.active; - while (it) { - if (it->free_until_set == free_until_set) { - free_until_set->pos[it->value->reg.index] = 0; - } - it = it->next; - } - - uint32_t max_pos = 0; - for (uint32_t n = 0; n < free_until_set->count; n++) { - if (max_pos == -1) { - max_pos = n; - } else { - if (free_until_set->pos[n] > free_until_set->pos[max_pos]) { - max_pos = n; +void RegisterAllocationPass::DumpUsage(const char* name) { +#if 0 + fprintf(stdout, "\n%s:\n", name); + for (size_t i = 0; i < XECOUNT(usage_sets_.all_sets); ++i) { + auto usage_set = usage_sets_.all_sets[i]; + if (usage_set) { + fprintf(stdout, "set %s:\n", usage_set->set->name); + fprintf(stdout, " avail: %s\n", usage_set->availability.to_string().c_str()); + fprintf(stdout, " upcoming uses:\n"); + for (auto it = usage_set->upcoming_uses.begin(); + it != usage_set->upcoming_uses.end(); ++it) { + fprintf(stdout, " v%d, used at %d\n", + it->value->ordinal, + it->use->instr->ordinal); } } } - if (!free_until_set->pos[max_pos]) { - // No register available without spilling. - return false; - } - if (current->end_ordinal < free_until_set->pos[max_pos]) { - // Register available for the whole interval. - current->value->reg.set = free_until_set->set; - current->value->reg.index = max_pos; - } else { - // Register available for the first part of the interval. - // Split the interval at where it hits the next one. - //current->value->reg = max_pos; - //SplitRange(current, free_until_set->pos[max_pos]); - // TODO(benvanik): actually split -- for now we just spill. - return false; - } - - return true; + fflush(stdout); +#endif } -void RegisterAllocationPass::AllocateBlockedReg( - HIRBuilder* builder, Interval* current, Intervals& intervals) { - auto free_until_set = current->free_until_set; - // TODO(benvanik): smart heuristics. 
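
[editor note] The availability bitset that DumpUsage prints above is the entire free/used bookkeeping: bit n set means register n of the set is free. A self-contained illustration of the convention (32 is the same per-set cap the pass assumes):

    #include <bitset>
    #include <cassert>

    int main() {
      std::bitset<32> availability;
      availability.set();             // block entry: every register free
      availability.set(3, false);     // MarkRegUsed: register 3 allocated
      assert(!availability.test(3));  // IsRegInUse would report true
      availability.set(3, true);      // MarkRegAvailable at the last use
      assert(availability.test(3));
      return 0;
    }
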
- // wimmer AllocateBlockedReg has some stuff for deciding whether to - // spill current or some other active interval - which we ignore. - - // Pick a random interval. Maybe the first. Sure. - auto spill_interval = intervals.active; - Value* spill_value = NULL; - Instr* prev_use = NULL; - Instr* next_use = NULL; - while (spill_interval) { - if (spill_interval->free_until_set != free_until_set || - spill_interval->start_ordinal == current->start_ordinal) { - // Only interested in ones of the same register set. - // We also ensure that ones at the same ordinal as us are ignored, - // which can happen with multiple local inserts/etc. - spill_interval = spill_interval->next; - continue; +void RegisterAllocationPass::PrepareBlockState() { + for (size_t i = 0; i < XECOUNT(usage_sets_.all_sets); ++i) { + auto usage_set = usage_sets_.all_sets[i]; + if (usage_set) { + usage_set->availability.set(); + usage_set->upcoming_uses.clear(); } - spill_value = spill_interval->value; + } + DumpUsage("PrepareBlockState"); +} - // Find the uses right before/after current. - auto use = spill_value->use_head; - while (use) { - if (use->instr->ordinal != -1) { - if (use->instr->ordinal < current->start_ordinal) { - if (!prev_use || prev_use->ordinal < use->instr->ordinal) { - prev_use = use->instr; - } - } else if (use->instr->ordinal > current->start_ordinal) { - if (!next_use || next_use->ordinal > use->instr->ordinal) { - next_use = use->instr; - } +void RegisterAllocationPass::AdvanceUses(Instr* instr) { + for (size_t i = 0; i < XECOUNT(usage_sets_.all_sets); ++i) { + auto usage_set = usage_sets_.all_sets[i]; + if (!usage_set) { + break; + } + auto& upcoming_uses = usage_set->upcoming_uses; + for (auto it = upcoming_uses.begin(); it != upcoming_uses.end();) { + if (!it->use) { + // No uses at all - we can remove right away. + // This comes up from instructions where the dest is never used, + // like the ATOMIC ops. + MarkRegAvailable(it->value->reg); + it = upcoming_uses.erase(it); + continue; + } + if (it->use->instr != instr) { + // Not yet at this instruction. + ++it; + continue; + } + // The use is from this instruction. + if (!it->use->next) { + // Last use of the value. We can retire it now. + MarkRegAvailable(it->value->reg); + it = upcoming_uses.erase(it); + } else { + // Used again. Push back the next use. + // Note that we may be used multiple times this instruction, so + // eat those. + auto next_use = it->use->next; + while (next_use->next && next_use->instr == instr) { + next_use = next_use->next; } + // Remove the iterator. + auto value = it->value; + it = upcoming_uses.erase(it); + upcoming_uses.emplace_back(value, next_use); } - use = use->next; } - if (!prev_use) { - prev_use = spill_value->def; - } - if (prev_use->next == next_use) { - // Uh, this interval is way too short. - spill_interval = spill_interval->next; - continue; - } - XEASSERT(prev_use->ordinal != -1); - XEASSERTNOTNULL(next_use); - break; } - XEASSERT(spill_interval->free_until_set == free_until_set); + DumpUsage("AdvanceUses"); +} - // Find the real last use -- paired ops may require sequences to stay - // intact. This is a bad design. 
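
[editor note] AdvanceUses above encodes the whole retirement policy: since each value's use list is sorted by ordinal, hitting a use with no successor means the register can be returned immediately, and several reads by one instruction are consumed as a group. A condensed model of that decision, mirroring the loop above (Value::Use as in the HIR):

    // Returns the use to keep waiting on, or nullptr when the value is dead
    // after `instr` and its register can be marked available.
    Value::Use* ConsumeUseAt(Value::Use* use, Instr* instr) {
      if (!use->next) {
        return nullptr;                    // last use: retire the register
      }
      Value::Use* next = use->next;
      while (next->next && next->instr == instr) {
        next = next->next;                 // eat repeat reads by this instr
      }
      return next;                         // re-queued as the upcoming use
    }
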
- auto prev_def_tail = prev_use; - while (prev_def_tail && - prev_def_tail->opcode->flags & OPCODE_FLAG_PAIRED_PREV) { - prev_def_tail = prev_def_tail->prev; +bool RegisterAllocationPass::IsRegInUse(const RegAssignment& reg) { + RegisterSetUsage* usage_set; + if (reg.set == usage_sets_.int_set->set) { + usage_set = usage_sets_.int_set; + } else if (reg.set == usage_sets_.float_set->set) { + usage_set = usage_sets_.float_set; + } else { + usage_set = usage_sets_.vec_set; + } + return !usage_set->availability.test(reg.index); +} + +RegisterAllocationPass::RegisterSetUsage* +RegisterAllocationPass::MarkRegUsed(const RegAssignment& reg, + Value* value, Value::Use* use) { + auto usage_set = RegisterSetForValue(value); + usage_set->availability.set(reg.index, false); + usage_set->upcoming_uses.emplace_back(value, use); + DumpUsage("MarkRegUsed"); + return usage_set; +} + +RegisterAllocationPass::RegisterSetUsage* +RegisterAllocationPass::MarkRegAvailable(const hir::RegAssignment& reg) { + RegisterSetUsage* usage_set; + if (reg.set == usage_sets_.int_set->set) { + usage_set = usage_sets_.int_set; + } else if (reg.set == usage_sets_.float_set->set) { + usage_set = usage_sets_.float_set; + } else { + usage_set = usage_sets_.vec_set; + } + usage_set->availability.set(reg.index, true); + return usage_set; +} + +bool RegisterAllocationPass::TryAllocateRegister( + Value* value, const RegAssignment& preferred_reg) { + // If the preferred register matches type and is available, use it. + auto usage_set = RegisterSetForValue(value); + if (usage_set->set == preferred_reg.set) { + // Check if available. + if (!IsRegInUse(preferred_reg)) { + // Mark as in-use and return. Best case. + MarkRegUsed(preferred_reg, value, value->use_head); + value->reg = preferred_reg; + return true; + } } - Value* new_value; - uint32_t end_ordinal; + // Otherwise, fallback to allocating like normal. + return TryAllocateRegister(value); +} + +bool RegisterAllocationPass::TryAllocateRegister(Value* value) { + // Get the set this register is in. + RegisterSetUsage* usage_set = RegisterSetForValue(value); + + // Find the first free register, if any. + // We have to ensure it's a valid one (in our count). + unsigned long first_unused = 0; + bool all_used = _BitScanForward(&first_unused, usage_set->availability.to_ulong()) == 0; + if (!all_used && first_unused < usage_set->count) { + // Available! Use it!. + value->reg.set = usage_set->set; + value->reg.index = first_unused; + MarkRegUsed(value->reg, value, value->use_head); + return true; + } + + // None available! Spill required. + return false; +} + +bool RegisterAllocationPass::SpillOneRegister( + HIRBuilder* builder, TypeName required_type) { + // Get the set that we will be picking from. + RegisterSetUsage* usage_set; + if (required_type <= INT64_TYPE) { + usage_set = usage_sets_.int_set; + } else if (required_type <= FLOAT64_TYPE) { + usage_set = usage_sets_.float_set; + } else { + usage_set = usage_sets_.vec_set; + } + + DumpUsage("SpillOneRegister (pre)"); + // Pick the one with the furthest next use. 
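
[editor note] One portability note on TryAllocateRegister above: _BitScanForward is an MSVC-only intrinsic. The same lowest-set-bit (first free register) scan can be written against std::bitset directly; a sketch, not what the pass ships:

    #include <bitset>
    #include <cstdint>

    // Lowest set (= free) bit under `count`, or -1 when every register in
    // the set is occupied and a spill is required.
    int FirstFreeRegister(const std::bitset<32>& availability, uint32_t count) {
      for (uint32_t n = 0; n < count; ++n) {
        if (availability.test(n)) return static_cast<int>(n);
      }
      return -1;
    }
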
+ XEASSERT(!usage_set->upcoming_uses.empty()); + auto furthest_usage = std::max_element( + usage_set->upcoming_uses.begin(), usage_set->upcoming_uses.end(), + RegisterUsage::Comparer()); + Value* spill_value = furthest_usage->value; + Value::Use* prev_use = furthest_usage->use->prev; + Value::Use* next_use = furthest_usage->use; + XEASSERTNOTNULL(next_use); + usage_set->upcoming_uses.erase(furthest_usage); + DumpUsage("SpillOneRegister (post)"); + const auto reg = spill_value->reg; + + // We know the spill_value use list is sorted, so we can cut it right now. + // This makes it easier down below. + auto new_head_use = next_use; + + // Allocate local. if (spill_value->local_slot) { - // Value is already assigned a slot, so load from that. - // We can then split the interval right after the previous use to - // before the next use. - - // Update the last use of the spilled interval/value. - end_ordinal = spill_interval->end_ordinal; - spill_interval->end_ordinal = current->start_ordinal;//prev_def_tail->ordinal; - XEASSERT(end_ordinal != -1); - XEASSERT(spill_interval->end_ordinal != -1); - - // Insert a load right before the next use. - new_value = builder->LoadLocal(spill_value->local_slot); - builder->last_instr()->MoveBefore(next_use); - - // Update last use info. - new_value->last_use = spill_value->last_use; - spill_value->last_use = prev_use; + // Value is already assigned a slot. Since we allocate in order and this is + // all SSA we know the stored value will be exactly what we want. Yay, + // we can prevent the redundant store! + // In fact, we may even want to pin this spilled value so that we always + // use the spilled value and prevent the need for more locals. } else { // Allocate a local slot. spill_value->local_slot = builder->AllocLocal(spill_value->type); - // Insert a spill right after the def. + // Add store. builder->StoreLocal(spill_value->local_slot, spill_value); auto spill_store = builder->last_instr(); - spill_store->MoveBefore(prev_def_tail->next); + auto spill_store_use = spill_store->src2_use; + XEASSERTNULL(spill_store_use->prev); + if (prev_use && prev_use->instr->opcode->flags & OPCODE_FLAG_PAIRED_PREV) { + // Instruction is paired. This is bad. We will insert the spill after the + // paired instruction. + XEASSERTNOTNULL(prev_use->instr->next); + spill_store->MoveBefore(prev_use->instr->next); - // Update last use of spilled interval/value. - end_ordinal = spill_interval->end_ordinal; - spill_interval->end_ordinal = current->start_ordinal;//prev_def_tail->ordinal; - XEASSERT(end_ordinal != -1); - XEASSERT(spill_interval->end_ordinal != -1); + // Update last use. + spill_value->last_use = spill_store; + } else if (prev_use) { + // We insert the store immediately before the previous use. + // If we were smarter we could then re-run allocation and reuse the register + // once dropped. + spill_store->MoveBefore(prev_use->instr); - // Insert a load right before the next use. - new_value = builder->LoadLocal(spill_value->local_slot); - builder->last_instr()->MoveBefore(next_use); + // Update last use. + spill_value->last_use = prev_use->instr; + } else { + // This is the first use, so the only thing we have is the define. + // Move the store to right after that. + spill_store->MoveBefore(spill_value->def->next); - // Update last use info. - new_value->last_use = spill_value->last_use; - spill_value->last_use = spill_store; + // Update last use. + spill_value->last_use = spill_store; + } } - // Reuse the same local slot. Hooray SSA. 
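
[editor note] The store-elision comment above is the payoff of staying on SSA: a value is written exactly once, so whatever its local slot holds from an earlier spill is still correct, and only the reload is needed. Condensed shape of that path, assuming the pass's builder API (store placement near the def/previous use elided here):

    Value* SpillAndReload(HIRBuilder* builder, Value* v, Instr* next_use_instr) {
      if (!v->local_slot) {                      // first spill of this value
        v->local_slot = builder->AllocLocal(v->type);
        builder->StoreLocal(v->local_slot, v);   // SSA: one store suffices
      }
      Value* loaded = builder->LoadLocal(v->local_slot);
      builder->last_instr()->MoveBefore(next_use_instr);  // reload at next use
      loaded->local_slot = v->local_slot;        // share the slot from here on
      return loaded;
    }
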
+#if ASSERT_NO_CYCLES + builder->AssertNoCycles(); + spill_value->def->block->AssertNoCycles(); +#endif // ASSERT_NO_CYCLES + + // Add load. + // Inserted immediately before the next use. Since by definition the next + // use is after the instruction requesting the spill we know we haven't + // done allocation for that code yet and can let that be handled + // automatically when we get to it. + auto new_value = builder->LoadLocal(spill_value->local_slot); + auto spill_load = builder->last_instr(); + spill_load->MoveBefore(next_use->instr); + // Note: implicit first use added. + +#if ASSERT_NO_CYCLES + builder->AssertNoCycles(); + spill_value->def->block->AssertNoCycles(); +#endif // ASSERT_NO_CYCLES + + // Set the local slot of the new value to our existing one. This way we will + // reuse that same memory if needed. new_value->local_slot = spill_value->local_slot; - // Rename all future uses to that loaded value. - auto use = spill_value->use_head; - while (use) { - // TODO(benvanik): keep use list sorted so we don't have to do this. - if (use->instr->ordinal <= spill_interval->end_ordinal || - use->instr->ordinal == -1) { - use = use->next; - continue; - } - auto next = use->next; - auto instr = use->instr; + // Rename all future uses of the SSA value to the new value as loaded + // from the local. + // We can quickly do this by walking the use list. Because the list is + // already sorted we know we are going to end up with a sorted list. + auto walk_use = new_head_use; + auto new_use_tail = walk_use; + while (walk_use) { + auto next_walk_use = walk_use->next; + auto instr = walk_use->instr; + uint32_t signature = instr->opcode->signature; if (GET_OPCODE_SIG_TYPE_SRC1(signature) == OPCODE_SIG_TYPE_V) { if (instr->src1.value == spill_value) { @@ -436,36 +431,107 @@ void RegisterAllocationPass::AllocateBlockedReg( instr->set_src3(new_value); } } - use = next; + + walk_use = next_walk_use; + if (walk_use) { + new_use_tail = walk_use; + } } + new_value->last_use = new_use_tail->instr; - // Create new interval. - auto arena = scratch_arena(); - auto new_interval = arena->Alloc(); - new_interval->start_ordinal = new_value->def->ordinal; - new_interval->end_ordinal = end_ordinal; - new_interval->value = new_value; - new_interval->next = NULL; - new_interval->prev = NULL; - if (new_value->type <= INT64_TYPE) { - new_interval->free_until_set = free_until_sets_.int_set; - } else if (new_value->type <= FLOAT64_TYPE) { - new_interval->free_until_set = free_until_sets_.float_set; - } else { - new_interval->free_until_set = free_until_sets_.vec_set; - } + // Update tracking. + MarkRegAvailable(reg); - // Remove the old interval from the active list, as it's been spilled. - spill_interval->RemoveFromList(&intervals.active); - spill_interval->AddToList(&intervals.handled); - - // Insert interval into the right place in the list. - // We know it's ahead of us. - new_interval->InsertIntoList(&intervals.unhandled); - - // TODO(benvanik): use the register we just freed? 
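
[editor note] The rename walk above works in a single pass only because the use list is already sorted: everything from the reload onward gets redirected, and the resulting list stays sorted for free. One step of that walk, using the same signature accessors as the pass:

    void RenameSources(Instr* instr, Value* spilled, Value* reloaded) {
      uint32_t sig = instr->opcode->signature;
      if (GET_OPCODE_SIG_TYPE_SRC1(sig) == OPCODE_SIG_TYPE_V &&
          instr->src1.value == spilled) instr->set_src1(reloaded);
      if (GET_OPCODE_SIG_TYPE_SRC2(sig) == OPCODE_SIG_TYPE_V &&
          instr->src2.value == spilled) instr->set_src2(reloaded);
      if (GET_OPCODE_SIG_TYPE_SRC3(sig) == OPCODE_SIG_TYPE_V &&
          instr->src3.value == spilled) instr->set_src3(reloaded);
    }
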
- //current->value->reg.set = free_until_set->set; - //current->value->reg.index = spill_interval->value->reg.index; - bool allocated = TryAllocateFreeReg(current, intervals); - XEASSERTTRUE(allocated); + return true; +} + +RegisterAllocationPass::RegisterSetUsage* +RegisterAllocationPass::RegisterSetForValue( + const Value* value) { + if (value->type <= INT64_TYPE) { + return usage_sets_.int_set; + } else if (value->type <= FLOAT64_TYPE) { + return usage_sets_.float_set; + } else { + return usage_sets_.vec_set; + } +} + +namespace { +int CompareValueUse(const Value::Use* a, const Value::Use* b) { + return a->instr->ordinal - b->instr->ordinal; +} +} // namespace +void RegisterAllocationPass::SortUsageList(Value* value) { + // Modified in-place linked list sort from: + // http://www.chiark.greenend.org.uk/~sgtatham/algorithms/listsort.c + if (!value->use_head) { + return; + } + Value::Use* head = value->use_head; + Value::Use* tail = nullptr; + int insize = 1; + while (true) { + auto p = head; + head = nullptr; + tail = nullptr; + // count number of merges we do in this pass + int nmerges = 0; + while (p) { + // there exists a merge to be done + nmerges++; + // step 'insize' places along from p + auto q = p; + int psize = 0; + for (int i = 0; i < insize; i++) { + psize++; + q = q->next; + if (!q) break; + } + // if q hasn't fallen off end, we have two lists to merge + int qsize = insize; + // now we have two lists; merge them + while (psize > 0 || (qsize > 0 && q)) { + // decide whether next element of merge comes from p or q + Value::Use* e = nullptr; + if (psize == 0) { + // p is empty; e must come from q + e = q; q = q->next; qsize--; + } else if (qsize == 0 || !q) { + // q is empty; e must come from p + e = p; p = p->next; psize--; + } else if (CompareValueUse(p, q) <= 0) { + // First element of p is lower (or same); e must come from p + e = p; p = p->next; psize--; + } else { + // First element of q is lower; e must come from q + e = q; q = q->next; qsize--; + } + // add the next element to the merged list + if (tail) { + tail->next = e; + } else { + head = e; + } + // Maintain reverse pointers in a doubly linked list. + e->prev = tail; + tail = e; + } + // now p has stepped 'insize' places along, and q has too + p = q; + } + if (tail) { + tail->next = nullptr; + } + // If we have done only one merge, we're finished + if (nmerges <= 1) { + // allow for nmerges==0, the empty list case + break; + } + // Otherwise repeat, merging lists twice the size + insize *= 2; + } + + value->use_head = head; + value->last_use = tail->instr; } diff --git a/src/alloy/compiler/passes/register_allocation_pass.h b/src/alloy/compiler/passes/register_allocation_pass.h index 3167000ec..aa5943aea 100644 --- a/src/alloy/compiler/passes/register_allocation_pass.h +++ b/src/alloy/compiler/passes/register_allocation_pass.h @@ -10,6 +10,10 @@ #ifndef ALLOY_COMPILER_PASSES_REGISTER_ALLOCATION_PASS_H_ #define ALLOY_COMPILER_PASSES_REGISTER_ALLOCATION_PASS_H_ +#include +#include +#include + #include #include @@ -27,28 +31,53 @@ public: virtual int Run(hir::HIRBuilder* builder); private: - struct Interval; - struct Intervals; - void ComputeLastUse(hir::Value* value); - bool TryAllocateFreeReg(Interval* current, Intervals& intervals); - void AllocateBlockedReg(hir::HIRBuilder* builder, - Interval* current, Intervals& intervals); + // TODO(benvanik): rewrite all this set shit -- too much indirection, the + // complexity is not needed. 
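
[editor note] SortUsageList above is the classic bottom-up mergesort for linked lists (run length doubles per pass), extended to keep the doubly linked prev pointers coherent. Since several spots now depend on the sorted order, a cheap invariant check is worth keeping behind a debug define; a sketch against the same Use layout:

    void AssertUsageListSorted(const Value* value) {
      const Value::Use* prev = nullptr;
      for (const Value::Use* u = value->use_head; u; u = u->next) {
        XEASSERT(u->prev == prev);  // prev links mirror next links
        XEASSERT(!prev || prev->instr->ordinal <= u->instr->ordinal);
        prev = u;
      }
    }
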
+ struct RegisterUsage {
+ hir::Value* value;
+ hir::Value::Use* use;
+ RegisterUsage() : value(nullptr), use(nullptr) {}
+ RegisterUsage(hir::Value* value_, hir::Value::Use* use_)
+ : value(value_), use(use_) {}
+ struct Comparer : std::binary_function<RegisterUsage, RegisterUsage, bool> {
+ bool operator()(const RegisterUsage& a, const RegisterUsage& b) const {
+ return a.use->instr->ordinal < b.use->instr->ordinal;
+ }
+ };
+ };
+ struct RegisterSetUsage {
+ const backend::MachineInfo::RegisterSet* set = nullptr;
+ uint32_t count = 0;
+ std::bitset<32> availability = 0;
+ // TODO(benvanik): another data type.
+ std::vector<RegisterUsage> upcoming_uses;
+ };
+
+ void DumpUsage(const char* name);
+ void PrepareBlockState();
+ void AdvanceUses(hir::Instr* instr);
+ bool IsRegInUse(const hir::RegAssignment& reg);
+ RegisterSetUsage* MarkRegUsed(const hir::RegAssignment& reg,
+ hir::Value* value, hir::Value::Use* use);
+ RegisterSetUsage* MarkRegAvailable(const hir::RegAssignment& reg);
+
+ bool TryAllocateRegister(hir::Value* value,
+ const hir::RegAssignment& preferred_reg);
+ bool TryAllocateRegister(hir::Value* value);
+ bool SpillOneRegister(hir::HIRBuilder* builder, hir::TypeName required_type);
+
+ RegisterSetUsage* RegisterSetForValue(const hir::Value* value);
+
+ void SortUsageList(hir::Value* value);

 private:
 const backend::MachineInfo* machine_info_;
-
- struct RegisterFreeUntilSet {
- uint32_t count;
- uint32_t pos[32];
- const backend::MachineInfo::RegisterSet* set;
- };
- struct RegisterFreeUntilSets {
- RegisterFreeUntilSet* int_set;
- RegisterFreeUntilSet* float_set;
- RegisterFreeUntilSet* vec_set;
- RegisterFreeUntilSet* all_sets[3];
- };
- RegisterFreeUntilSets free_until_sets_;
+ struct {
+ RegisterSetUsage* int_set = nullptr;
+ RegisterSetUsage* float_set = nullptr;
+ RegisterSetUsage* vec_set = nullptr;
+ RegisterSetUsage* all_sets[3];
+ } usage_sets_;
 };

diff --git a/src/alloy/compiler/passes/validation_pass.cc b/src/alloy/compiler/passes/validation_pass.cc
index 15e89bd67..bc77ab482 100644
--- a/src/alloy/compiler/passes/validation_pass.cc
+++ b/src/alloy/compiler/passes/validation_pass.cc
@@ -88,12 +88,12 @@ int ValidationPass::ValidateInstruction(Block* block, Instr* instr) {
 }

 int ValidationPass::ValidateValue(Block* block, Instr* instr, Value* value) {
- if (value->def) {
- /*auto def = value->def;
- XEASSERT(def->block == block);
- if (def->block != block) {
- return 1;
- }*/
- }
+ //if (value->def) {
+ // auto def = value->def;
+ // XEASSERT(def->block == block);
+ // if (def->block != block) {
+ // return 1;
+ // }
+ //}
 return 0;
 }

diff --git a/src/alloy/core.h b/src/alloy/core.h
index aef7e57c2..3beb11ba4 100644
--- a/src/alloy/core.h
+++ b/src/alloy/core.h
@@ -44,6 +44,10 @@ typedef struct XECACHEALIGN vec128_s {
 uint64_t high;
 };
 };
+
+ bool operator== (const vec128_s& b) const {
+ return low == b.low && high == b.high;
+ }
 } vec128_t;
 XEFORCEINLINE vec128_t vec128i(uint32_t x, uint32_t y, uint32_t z, uint32_t w) {
 vec128_t v;

diff --git a/src/alloy/frontend/ppc/ppc_emit_alu.cc b/src/alloy/frontend/ppc/ppc_emit_alu.cc
index 9b25e824c..ce023eb85 100644
--- a/src/alloy/frontend/ppc/ppc_emit_alu.cc
+++ b/src/alloy/frontend/ppc/ppc_emit_alu.cc
@@ -643,20 +643,20 @@ XEEMITTER(cmpli, 0x28000000, D )(PPCHIRBuilder& f, InstrData& i) {

 XEEMITTER(andx, 0x7C000038, X )(PPCHIRBuilder& f, InstrData& i) {
 // RA <- (RS) & (RB)
 Value* ra = f.And(f.LoadGPR(i.X.RT), f.LoadGPR(i.X.RB));
+ f.StoreGPR(i.X.RA, ra);
 if (i.X.Rc) {
 f.UpdateCR(0, ra);
 }
- f.StoreGPR(i.X.RA, ra);
 return 0;
 }

 XEEMITTER(andcx, 0x7C000078, X
)(PPCHIRBuilder& f, InstrData& i) { // RA <- (RS) & ¬(RB) Value* ra = f.And(f.LoadGPR(i.X.RT), f.Not(f.LoadGPR(i.X.RB))); + f.StoreGPR(i.X.RA, ra); if (i.X.Rc) { f.UpdateCR(0, ra); } - f.StoreGPR(i.X.RA, ra); return 0; } @@ -665,8 +665,8 @@ XEEMITTER(andix, 0x70000000, D )(PPCHIRBuilder& f, InstrData& i) { Value* ra = f.And( f.LoadGPR(i.D.RT), f.LoadConstant((uint64_t)i.D.DS)); - f.UpdateCR(0, ra); f.StoreGPR(i.D.RA, ra); + f.UpdateCR(0, ra); return 0; } @@ -675,8 +675,8 @@ XEEMITTER(andisx, 0x74000000, D )(PPCHIRBuilder& f, InstrData& i) { Value* ra = f.And( f.LoadGPR(i.D.RT), f.LoadConstant((uint64_t(i.D.DS) << 16))); - f.UpdateCR(0, ra); f.StoreGPR(i.D.RA, ra); + f.UpdateCR(0, ra); return 0; } @@ -688,10 +688,10 @@ XEEMITTER(cntlzdx, 0x7C000074, X )(PPCHIRBuilder& f, InstrData& i) { // RA <- n Value* v = f.CountLeadingZeros(f.LoadGPR(i.X.RT)); v = f.ZeroExtend(v, INT64_TYPE); + f.StoreGPR(i.X.RA, v); if (i.X.Rc) { f.UpdateCR(0, v); } - f.StoreGPR(i.X.RA, v); return 0; } @@ -704,10 +704,10 @@ XEEMITTER(cntlzwx, 0x7C000034, X )(PPCHIRBuilder& f, InstrData& i) { Value* v = f.CountLeadingZeros( f.Truncate(f.LoadGPR(i.X.RT), INT32_TYPE)); v = f.ZeroExtend(v, INT64_TYPE); + f.StoreGPR(i.X.RA, v); if (i.X.Rc) { f.UpdateCR(0, v); } - f.StoreGPR(i.X.RA, v); return 0; } @@ -715,10 +715,10 @@ XEEMITTER(eqvx, 0x7C000238, X )(PPCHIRBuilder& f, InstrData& i) { // RA <- (RS) == (RB) Value* ra = f.Xor(f.LoadGPR(i.X.RT), f.LoadGPR(i.X.RB)); ra = f.Not(ra); + f.StoreGPR(i.X.RA, ra); if (i.X.Rc) { f.UpdateCR(0, ra); } - f.StoreGPR(i.X.RA, ra); return 0; } @@ -728,10 +728,10 @@ XEEMITTER(extsbx, 0x7C000774, X )(PPCHIRBuilder& f, InstrData& i) { // RA[0:55] <- i56.s Value* rt = f.LoadGPR(i.X.RT); rt = f.SignExtend(f.Truncate(rt, INT8_TYPE), INT64_TYPE); + f.StoreGPR(i.X.RA, rt); if (i.X.Rc) { f.UpdateCR(0, rt); } - f.StoreGPR(i.X.RA, rt); return 0; } @@ -741,10 +741,10 @@ XEEMITTER(extshx, 0x7C000734, X )(PPCHIRBuilder& f, InstrData& i) { // RA[0:47] <- 48.s Value* rt = f.LoadGPR(i.X.RT); rt = f.SignExtend(f.Truncate(rt, INT16_TYPE), INT64_TYPE); + f.StoreGPR(i.X.RA, rt); if (i.X.Rc) { f.UpdateCR(0, rt); } - f.StoreGPR(i.X.RA, rt); return 0; } @@ -754,10 +754,10 @@ XEEMITTER(extswx, 0x7C0007B4, X )(PPCHIRBuilder& f, InstrData& i) { // RA[0:31] <- i32.s Value* rt = f.LoadGPR(i.X.RT); rt = f.SignExtend(f.Truncate(rt, INT32_TYPE), INT64_TYPE); + f.StoreGPR(i.X.RA, rt); if (i.X.Rc) { f.UpdateCR(0, rt); } - f.StoreGPR(i.X.RA, rt); return 0; } @@ -767,10 +767,10 @@ XEEMITTER(nandx, 0x7C0003B8, X )(PPCHIRBuilder& f, InstrData& i) { f.LoadGPR(i.X.RT), f.LoadGPR(i.X.RB)); ra = f.Not(ra); + f.StoreGPR(i.X.RA, ra); if (i.X.Rc) { f.UpdateCR(0, ra); } - f.StoreGPR(i.X.RA, ra); return 0; } @@ -780,10 +780,10 @@ XEEMITTER(norx, 0x7C0000F8, X )(PPCHIRBuilder& f, InstrData& i) { f.LoadGPR(i.X.RT), f.LoadGPR(i.X.RB)); ra = f.Not(ra); + f.StoreGPR(i.X.RA, ra); if (i.X.Rc) { f.UpdateCR(0, ra); } - f.StoreGPR(i.X.RA, ra); return 0; } @@ -803,10 +803,10 @@ XEEMITTER(orx, 0x7C000378, X )(PPCHIRBuilder& f, InstrData& i) { f.LoadGPR(i.X.RT), f.LoadGPR(i.X.RB)); } + f.StoreGPR(i.X.RA, ra); if (i.X.Rc) { f.UpdateCR(0, ra); } - f.StoreGPR(i.X.RA, ra); return 0; } @@ -815,10 +815,10 @@ XEEMITTER(orcx, 0x7C000338, X )(PPCHIRBuilder& f, InstrData& i) { Value* ra = f.Or( f.LoadGPR(i.X.RT), f.Not(f.LoadGPR(i.X.RB))); + f.StoreGPR(i.X.RA, ra); if (i.X.Rc) { f.UpdateCR(0, ra); } - f.StoreGPR(i.X.RA, ra); return 0; } @@ -849,10 +849,10 @@ XEEMITTER(xorx, 0x7C000278, X )(PPCHIRBuilder& f, InstrData& i) { Value* ra = f.Xor( f.LoadGPR(i.X.RT), 
f.LoadGPR(i.X.RB)); + f.StoreGPR(i.X.RA, ra); if (i.X.Rc) { f.UpdateCR(0, ra); } - f.StoreGPR(i.X.RA, ra); return 0; } @@ -895,10 +895,10 @@ XEEMITTER(rld, 0x78000000, MDS)(PPCHIRBuilder& f, InstrData& i) { if (m != 0xFFFFFFFFFFFFFFFF) { v = f.And(v, f.LoadConstant(m)); } + f.StoreGPR(i.MD.RA, v); if (i.MD.Rc) { f.UpdateCR(0, v); } - f.StoreGPR(i.MD.RA, v); return 0; } else if (i.MD.idx == 1) { // XEEMITTER(rldicrx, 0x78000004, MD ) @@ -922,10 +922,10 @@ XEEMITTER(rld, 0x78000000, MDS)(PPCHIRBuilder& f, InstrData& i) { v = f.And(v, f.LoadConstant(m)); } } + f.StoreGPR(i.MD.RA, v); if (i.MD.Rc) { f.UpdateCR(0, v); } - f.StoreGPR(i.MD.RA, v); return 0; } else if (i.MD.idx == 2) { // XEEMITTER(rldicx, 0x78000008, MD ) @@ -959,10 +959,10 @@ XEEMITTER(rld, 0x78000000, MDS)(PPCHIRBuilder& f, InstrData& i) { f.And(v, f.LoadConstant(m)), f.And(ra, f.LoadConstant(~m))); } + f.StoreGPR(i.MD.RA, v); if (i.MD.Rc) { f.UpdateCR(0, v); } - f.StoreGPR(i.MD.RA, v); return 0; } else { XEINSTRNOTIMPLEMENTED(); @@ -987,10 +987,10 @@ XEEMITTER(rlwimix, 0x50000000, M )(PPCHIRBuilder& f, InstrData& i) { } v = f.ZeroExtend(v, INT64_TYPE); v = f.Or(v, f.And(f.LoadGPR(i.M.RA), f.LoadConstant((~(uint64_t)m)))); + f.StoreGPR(i.M.RA, v); if (i.M.Rc) { f.UpdateCR(0, v); } - f.StoreGPR(i.M.RA, v); return 0; } @@ -1014,10 +1014,10 @@ XEEMITTER(rlwinmx, 0x54000000, M )(PPCHIRBuilder& f, InstrData& i) { v = f.And(v, f.LoadConstant((uint32_t)XEMASK(i.M.MB + 32, i.M.ME + 32))); } v = f.ZeroExtend(v, INT64_TYPE); + f.StoreGPR(i.M.RA, v); if (i.M.Rc) { f.UpdateCR(0, v); } - f.StoreGPR(i.M.RA, v); return 0; } @@ -1036,10 +1036,10 @@ XEEMITTER(rlwnmx, 0x5C000000, M )(PPCHIRBuilder& f, InstrData& i) { v = f.And(v, f.LoadConstant((uint32_t)XEMASK(i.M.MB + 32, i.M.ME + 32))); } v = f.ZeroExtend(v, INT64_TYPE); + f.StoreGPR(i.M.RA, v); if (i.M.Rc) { f.UpdateCR(0, v); } - f.StoreGPR(i.M.RA, v); return 0; } @@ -1146,7 +1146,7 @@ XEEMITTER(sradx, 0x7C000634, X )(PPCHIRBuilder& f, InstrData& i) { // CA is set to 1 if the low-order 32 bits of (RS) contain a negative number // and any 1-bits are shifted out of position 63; otherwise CA is set to 0. // We already have ca set to indicate the pos 63 bit, now just and in sign. - ca = f.And(ca, f.Shr(v, 63)); + ca = f.And(ca, f.Truncate(f.Shr(v, 63), INT8_TYPE)); f.StoreCA(ca); f.StoreGPR(i.X.RA, v); @@ -1174,15 +1174,15 @@ XEEMITTER(sradix, 0x7C000674, XS )(PPCHIRBuilder& f, InstrData& i) { XEASSERT(sh); uint64_t mask = XEMASK(64 - sh, 63); Value* ca = f.And( - f.Shr(v, 63), + f.Truncate(f.Shr(v, 63), INT8_TYPE), f.IsTrue(f.And(v, f.LoadConstant(mask)))); f.StoreCA(ca); v = f.Sha(v, sh); + f.StoreGPR(i.XS.RA, v); if (i.XS.Rc) { f.UpdateCR(0, v); } - f.StoreGPR(i.XS.RA, v); return 0; } @@ -1203,7 +1203,7 @@ XEEMITTER(srawx, 0x7C000630, X )(PPCHIRBuilder& f, InstrData& i) { // is negative. Value* mask = f.Not(f.Shl(f.LoadConstant(-1), sh)); Value* ca = f.And( - f.Shr(v, 31), + f.Truncate(f.Shr(v, 31), INT8_TYPE), f.IsTrue(f.And(v, mask))); f.StoreCA(ca); v = f.Sha(v, sh), @@ -1235,8 +1235,8 @@ XEEMITTER(srawix, 0x7C000670, X )(PPCHIRBuilder& f, InstrData& i) { // is negative. 
uint32_t mask = (uint32_t)XEMASK(64 - i.X.RB, 63); ca = f.And( - f.Shr(v, 31), - f.ZeroExtend(f.IsTrue(f.And(v, f.LoadConstant(mask))), INT32_TYPE)); + f.Truncate(f.Shr(v, 31), INT8_TYPE), + f.IsTrue(f.And(v, f.LoadConstant(mask)))); v = f.Sha(v, (int8_t)i.X.RB), v = f.SignExtend(v, INT64_TYPE); diff --git a/src/alloy/frontend/ppc/ppc_hir_builder.cc b/src/alloy/frontend/ppc/ppc_hir_builder.cc index dd25c4f8a..1b254ea4e 100644 --- a/src/alloy/frontend/ppc/ppc_hir_builder.cc +++ b/src/alloy/frontend/ppc/ppc_hir_builder.cc @@ -240,18 +240,18 @@ void PPCHIRBuilder::UpdateCR( void PPCHIRBuilder::UpdateCR( uint32_t n, Value* lhs, Value* rhs, bool is_signed) { - Value* lt; - Value* gt; if (is_signed) { - lt = CompareSLT(lhs, rhs); - gt = CompareSGT(lhs, rhs); + Value* lt = CompareSLT(lhs, rhs); + StoreContext(offsetof(PPCContext, cr0) + (4 * n) + 0, lt); + Value* gt = CompareSGT(lhs, rhs); + StoreContext(offsetof(PPCContext, cr0) + (4 * n) + 1, gt); } else { - lt = CompareULT(lhs, rhs); - gt = CompareUGT(lhs, rhs); + Value* lt = CompareULT(lhs, rhs); + StoreContext(offsetof(PPCContext, cr0) + (4 * n) + 0, lt); + Value* gt = CompareUGT(lhs, rhs); + StoreContext(offsetof(PPCContext, cr0) + (4 * n) + 1, gt); } Value* eq = CompareEQ(lhs, rhs); - StoreContext(offsetof(PPCContext, cr0) + (4 * n) + 0, lt); - StoreContext(offsetof(PPCContext, cr0) + (4 * n) + 1, gt); StoreContext(offsetof(PPCContext, cr0) + (4 * n) + 2, eq); // Value* so = AllocValue(UINT8_TYPE); @@ -280,7 +280,7 @@ Value* PPCHIRBuilder::LoadCA() { } void PPCHIRBuilder::StoreCA(Value* value) { - value = Truncate(value, INT8_TYPE); + XEASSERT(value->type == INT8_TYPE); StoreContext(offsetof(PPCContext, xer_ca), value); } diff --git a/src/alloy/hir/block.cc b/src/alloy/hir/block.cc new file mode 100644 index 000000000..ebace67fa --- /dev/null +++ b/src/alloy/hir/block.cc @@ -0,0 +1,39 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include + +using namespace alloy; +using namespace alloy::hir; + + +void Block::AssertNoCycles() { + Instr* hare = instr_head; + Instr* tortoise = instr_head; + if (!hare) { + return; + } + while (hare = hare->next) { + if (hare == tortoise) { + // Cycle! + XEASSERTALWAYS(); + } + hare = hare->next; + if (hare == tortoise) { + // Cycle! 
+ XEASSERTALWAYS(); + } + tortoise = tortoise->next; + if (!hare || !tortoise) { + return; + } + } +} diff --git a/src/alloy/hir/block.h b/src/alloy/hir/block.h index 1683b333c..f60dd83c5 100644 --- a/src/alloy/hir/block.h +++ b/src/alloy/hir/block.h @@ -61,6 +61,8 @@ public: Instr* instr_tail; uint16_t ordinal; + + void AssertNoCycles(); }; diff --git a/src/alloy/hir/hir_builder.cc b/src/alloy/hir/hir_builder.cc index cad24c32c..f93a310e8 100644 --- a/src/alloy/hir/hir_builder.cc +++ b/src/alloy/hir/hir_builder.cc @@ -92,7 +92,7 @@ void HIRBuilder::DumpValue(StringBuffer* str, Value* value) { case INT8_TYPE: str->Append("%X", value->constant.i8); break; case INT16_TYPE: str->Append("%X", value->constant.i16); break; case INT32_TYPE: str->Append("%X", value->constant.i32); break; - case INT64_TYPE: str->Append("%X", value->constant.i64); break; + case INT64_TYPE: str->Append("%llX", value->constant.i64); break; case FLOAT32_TYPE: str->Append("%F", value->constant.f32); break; case FLOAT64_TYPE: str->Append("%F", value->constant.f64); break; case VEC128_TYPE: str->Append("(%F,%F,%F,%F)", @@ -252,6 +252,29 @@ void HIRBuilder::Dump(StringBuffer* str) { } } +void HIRBuilder::AssertNoCycles() { + Block* hare = block_head_; + Block* tortoise = block_head_; + if (!hare) { + return; + } + while (hare = hare->next) { + if (hare == tortoise) { + // Cycle! + XEASSERTALWAYS(); + } + hare = hare->next; + if (hare == tortoise) { + // Cycle! + XEASSERTALWAYS(); + } + tortoise = tortoise->next; + if (!hare || !tortoise) { + return; + } + } +} + Block* HIRBuilder::current_block() const { return current_block_; } @@ -1729,16 +1752,19 @@ Value* HIRBuilder::Extract(Value* value, Value* index, TypeName target_type) { // TODO(benvanik): could do some of this as constants. + Value* trunc_index = index->type != INT8_TYPE ? + Truncate(index, INT8_TYPE) : index; + Instr* i = AppendInstr( OPCODE_EXTRACT_info, 0, AllocValue(target_type)); i->set_src1(value); - i->set_src2(ZeroExtend(index, INT64_TYPE)); + i->set_src2(trunc_index); i->src3.value = NULL; return i->dest; } -Value* HIRBuilder::Extract(Value* value, uint64_t index, +Value* HIRBuilder::Extract(Value* value, uint8_t index, TypeName target_type) { return Extract(value, LoadConstant(index), target_type); } diff --git a/src/alloy/hir/hir_builder.h b/src/alloy/hir/hir_builder.h index 1ebdb01a1..6568a5a49 100644 --- a/src/alloy/hir/hir_builder.h +++ b/src/alloy/hir/hir_builder.h @@ -35,6 +35,7 @@ public: virtual int Finalize(); void Dump(StringBuffer* str); + void AssertNoCycles(); Arena* arena() const { return arena_; } @@ -196,7 +197,7 @@ public: Value* Insert(Value* value, Value* index, Value* part); Value* Insert(Value* value, uint64_t index, Value* part); Value* Extract(Value* value, Value* index, TypeName target_type); - Value* Extract(Value* value, uint64_t index, TypeName target_type); + Value* Extract(Value* value, uint8_t index, TypeName target_type); // i8->i16/i32/... (i8|i8 / i8|i8|i8|i8 / ...) // i8/i16/i32 -> vec128 Value* Splat(Value* value, TypeName target_type); diff --git a/src/alloy/hir/instr.cc b/src/alloy/hir/instr.cc index 51de2da2c..dc489ef4b 100644 --- a/src/alloy/hir/instr.cc +++ b/src/alloy/hir/instr.cc @@ -48,19 +48,6 @@ void Instr::set_src3(Value* value) { src3_use = value ? value->AddUse(block->arena, this) : NULL; } -bool Instr::Match(SignatureType dest_req, - SignatureType src1_req, - SignatureType src2_req, - SignatureType src3_req) const { - #define TO_SIG_TYPE(v) \ - (v ? (v->IsConstant() ? 
SignatureType((v->type + 1) | SIG_TYPE_C) : SignatureType(v->type + 1)) : SIG_TYPE_X) - return - ((dest_req == SIG_TYPE_IGNORE) || (dest_req == TO_SIG_TYPE(dest))) && - ((src1_req == SIG_TYPE_IGNORE) || (src1_req == TO_SIG_TYPE(src1.value))) && - ((src2_req == SIG_TYPE_IGNORE) || (src2_req == TO_SIG_TYPE(src2.value))) && - ((src3_req == SIG_TYPE_IGNORE) || (src3_req == TO_SIG_TYPE(src3.value))); -} - void Instr::MoveBefore(Instr* other) { if (next == other) { return; diff --git a/src/alloy/hir/instr.h b/src/alloy/hir/instr.h index 62983401d..b128c534a 100644 --- a/src/alloy/hir/instr.h +++ b/src/alloy/hir/instr.h @@ -24,26 +24,6 @@ namespace hir { class Block; class Label; -enum SignatureType { - SIG_TYPE_X = 0, - SIG_TYPE_I8 = 1, - SIG_TYPE_I16 = 2, - SIG_TYPE_I32 = 3, - SIG_TYPE_I64 = 4, - SIG_TYPE_F32 = 5, - SIG_TYPE_F64 = 6, - SIG_TYPE_V128 = 7, - SIG_TYPE_C = (1 << 3), - SIG_TYPE_I8C = SIG_TYPE_C | SIG_TYPE_I8, - SIG_TYPE_I16C = SIG_TYPE_C | SIG_TYPE_I16, - SIG_TYPE_I32C = SIG_TYPE_C | SIG_TYPE_I32, - SIG_TYPE_I64C = SIG_TYPE_C | SIG_TYPE_I64, - SIG_TYPE_F32C = SIG_TYPE_C | SIG_TYPE_F32, - SIG_TYPE_F64C = SIG_TYPE_C | SIG_TYPE_F64, - SIG_TYPE_V128C = SIG_TYPE_C | SIG_TYPE_V128, - SIG_TYPE_IGNORE = 0xFF, -}; - class Instr { public: Block* block; @@ -74,11 +54,6 @@ public: void set_src2(Value* value); void set_src3(Value* value); - bool Match(SignatureType dest = SIG_TYPE_X, - SignatureType src1 = SIG_TYPE_X, - SignatureType src2 = SIG_TYPE_X, - SignatureType src3 = SIG_TYPE_X) const; - void MoveBefore(Instr* other); void Replace(const OpcodeInfo* opcode, uint16_t flags); void Remove(); diff --git a/src/alloy/hir/opcodes.inl b/src/alloy/hir/opcodes.inl index baf214f25..deb789675 100644 --- a/src/alloy/hir/opcodes.inl +++ b/src/alloy/hir/opcodes.inl @@ -11,590 +11,590 @@ DEFINE_OPCODE( OPCODE_COMMENT, "comment", - OPCODE_SIG_X, - OPCODE_FLAG_IGNORE); + OPCODE_SIG_X_O, + OPCODE_FLAG_IGNORE) DEFINE_OPCODE( OPCODE_NOP, "nop", OPCODE_SIG_X, - OPCODE_FLAG_IGNORE); + OPCODE_FLAG_IGNORE) DEFINE_OPCODE( OPCODE_SOURCE_OFFSET, "source_offset", OPCODE_SIG_X_O, - OPCODE_FLAG_IGNORE | OPCODE_FLAG_HIDE); + OPCODE_FLAG_IGNORE | OPCODE_FLAG_HIDE) DEFINE_OPCODE( OPCODE_DEBUG_BREAK, "debug_break", OPCODE_SIG_X, - OPCODE_FLAG_VOLATILE); + OPCODE_FLAG_VOLATILE) DEFINE_OPCODE( OPCODE_DEBUG_BREAK_TRUE, "debug_break_true", OPCODE_SIG_X_V, - OPCODE_FLAG_VOLATILE); + OPCODE_FLAG_VOLATILE) DEFINE_OPCODE( OPCODE_TRAP, "trap", OPCODE_SIG_X, - OPCODE_FLAG_VOLATILE); + OPCODE_FLAG_VOLATILE) DEFINE_OPCODE( OPCODE_TRAP_TRUE, "trap_true", OPCODE_SIG_X_V, - OPCODE_FLAG_VOLATILE); + OPCODE_FLAG_VOLATILE) DEFINE_OPCODE( OPCODE_CALL, "call", OPCODE_SIG_X_S, - OPCODE_FLAG_BRANCH); + OPCODE_FLAG_BRANCH) DEFINE_OPCODE( OPCODE_CALL_TRUE, "call_true", OPCODE_SIG_X_V_S, - OPCODE_FLAG_BRANCH); + OPCODE_FLAG_BRANCH) DEFINE_OPCODE( OPCODE_CALL_INDIRECT, "call_indirect", OPCODE_SIG_X_V, - OPCODE_FLAG_BRANCH); + OPCODE_FLAG_BRANCH) DEFINE_OPCODE( OPCODE_CALL_INDIRECT_TRUE, "call_indirect_true", OPCODE_SIG_X_V_V, - OPCODE_FLAG_BRANCH); + OPCODE_FLAG_BRANCH) DEFINE_OPCODE( OPCODE_CALL_EXTERN, "call_extern", OPCODE_SIG_X_S, - OPCODE_FLAG_BRANCH); + OPCODE_FLAG_BRANCH) DEFINE_OPCODE( OPCODE_RETURN, "return", OPCODE_SIG_X, - OPCODE_FLAG_BRANCH); + OPCODE_FLAG_BRANCH) DEFINE_OPCODE( OPCODE_RETURN_TRUE, "return_true", OPCODE_SIG_X_V, - OPCODE_FLAG_BRANCH); + OPCODE_FLAG_BRANCH) DEFINE_OPCODE( OPCODE_SET_RETURN_ADDRESS, "set_return_address", OPCODE_SIG_X_V, - 0); + 0) DEFINE_OPCODE( OPCODE_BRANCH, "branch", OPCODE_SIG_X_L, - 
OPCODE_FLAG_BRANCH); + OPCODE_FLAG_BRANCH) DEFINE_OPCODE( OPCODE_BRANCH_TRUE, "branch_true", OPCODE_SIG_X_V_L, - OPCODE_FLAG_BRANCH); + OPCODE_FLAG_BRANCH) DEFINE_OPCODE( OPCODE_BRANCH_FALSE, "branch_false", OPCODE_SIG_X_V_L, - OPCODE_FLAG_BRANCH); + OPCODE_FLAG_BRANCH) DEFINE_OPCODE( OPCODE_ASSIGN, "assign", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_CAST, "cast", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_ZERO_EXTEND, "zero_extend", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_SIGN_EXTEND, "sign_extend", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_TRUNCATE, "truncate", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_CONVERT, "convert", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_ROUND, "round", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_VECTOR_CONVERT_I2F, "vector_convert_i2f", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_VECTOR_CONVERT_F2I, "vector_convert_f2i", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_LOAD_VECTOR_SHL, "load_vector_shl", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_LOAD_VECTOR_SHR, "load_vector_shr", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_LOAD_CLOCK, "load_clock", OPCODE_SIG_V, - 0); + 0) DEFINE_OPCODE( OPCODE_LOAD_LOCAL, "load_local", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_STORE_LOCAL, "store_local", OPCODE_SIG_X_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_LOAD_CONTEXT, "load_context", OPCODE_SIG_V_O, - 0); + 0) DEFINE_OPCODE( OPCODE_STORE_CONTEXT, "store_context", OPCODE_SIG_X_O_V, - 0); + 0) DEFINE_OPCODE( OPCODE_LOAD, "load", OPCODE_SIG_V_V, - OPCODE_FLAG_MEMORY); + OPCODE_FLAG_MEMORY) DEFINE_OPCODE( OPCODE_STORE, "store", OPCODE_SIG_X_V_V, - OPCODE_FLAG_MEMORY); + OPCODE_FLAG_MEMORY) DEFINE_OPCODE( OPCODE_PREFETCH, "prefetch", OPCODE_SIG_X_V_O, - 0); + 0) DEFINE_OPCODE( OPCODE_MAX, "max", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_MIN, "min", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_SELECT, "select", OPCODE_SIG_V_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_IS_TRUE, "is_true", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_IS_FALSE, "is_false", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_COMPARE_EQ, "compare_eq", OPCODE_SIG_V_V_V, - OPCODE_FLAG_COMMUNATIVE); + OPCODE_FLAG_COMMUNATIVE) DEFINE_OPCODE( OPCODE_COMPARE_NE, "compare_ne", OPCODE_SIG_V_V_V, - OPCODE_FLAG_COMMUNATIVE); + OPCODE_FLAG_COMMUNATIVE) DEFINE_OPCODE( OPCODE_COMPARE_SLT, "compare_slt", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_COMPARE_SLE, "compare_sle", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_COMPARE_SGT, "compare_sgt", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_COMPARE_SGE, "compare_sge", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_COMPARE_ULT, "compare_ult", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_COMPARE_ULE, "compare_ule", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_COMPARE_UGT, "compare_ugt", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_COMPARE_UGE, "compare_uge", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_DID_CARRY, "did_carry", OPCODE_SIG_V_V, - OPCODE_FLAG_PAIRED_PREV); + OPCODE_FLAG_PAIRED_PREV) DEFINE_OPCODE( OPCODE_DID_OVERFLOW, "did_overflow", OPCODE_SIG_V_V, - OPCODE_FLAG_PAIRED_PREV); + OPCODE_FLAG_PAIRED_PREV) DEFINE_OPCODE( OPCODE_DID_SATURATE, "did_saturate", OPCODE_SIG_V_V, - OPCODE_FLAG_PAIRED_PREV); + OPCODE_FLAG_PAIRED_PREV) DEFINE_OPCODE( OPCODE_VECTOR_COMPARE_EQ, "vector_compare_eq", OPCODE_SIG_V_V_V, - OPCODE_FLAG_COMMUNATIVE); + OPCODE_FLAG_COMMUNATIVE) DEFINE_OPCODE( OPCODE_VECTOR_COMPARE_SGT, "vector_compare_sgt", 
OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_VECTOR_COMPARE_SGE, "vector_compare_sge", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_VECTOR_COMPARE_UGT, "vector_compare_ugt", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_VECTOR_COMPARE_UGE, "vector_compare_uge", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_ADD, "add", OPCODE_SIG_V_V_V, - OPCODE_FLAG_COMMUNATIVE); + OPCODE_FLAG_COMMUNATIVE) DEFINE_OPCODE( OPCODE_ADD_CARRY, "add_carry", OPCODE_SIG_V_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_VECTOR_ADD, "vector_add", OPCODE_SIG_V_V_V, - OPCODE_FLAG_COMMUNATIVE); + OPCODE_FLAG_COMMUNATIVE) DEFINE_OPCODE( OPCODE_SUB, "sub", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_MUL, "mul", OPCODE_SIG_V_V_V, - OPCODE_FLAG_COMMUNATIVE); + OPCODE_FLAG_COMMUNATIVE) DEFINE_OPCODE( OPCODE_MUL_HI, "mul_hi", OPCODE_SIG_V_V_V, - OPCODE_FLAG_COMMUNATIVE); + OPCODE_FLAG_COMMUNATIVE) DEFINE_OPCODE( OPCODE_DIV, "div", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_MUL_ADD, "mul_add", OPCODE_SIG_V_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_MUL_SUB, "mul_sub", OPCODE_SIG_V_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_NEG, "neg", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_ABS, "abs", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_SQRT, "sqrt", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_RSQRT, "rsqrt", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_POW2, "pow2", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_LOG2, "log2", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_DOT_PRODUCT_3, "dot_product_3", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_DOT_PRODUCT_4, "dot_product_4", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_AND, "and", OPCODE_SIG_V_V_V, - OPCODE_FLAG_COMMUNATIVE); + OPCODE_FLAG_COMMUNATIVE) DEFINE_OPCODE( OPCODE_OR, "or", OPCODE_SIG_V_V_V, - OPCODE_FLAG_COMMUNATIVE); + OPCODE_FLAG_COMMUNATIVE) DEFINE_OPCODE( OPCODE_XOR, "xor", OPCODE_SIG_V_V_V, - OPCODE_FLAG_COMMUNATIVE); + OPCODE_FLAG_COMMUNATIVE) DEFINE_OPCODE( OPCODE_NOT, "not", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_SHL, "shl", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_VECTOR_SHL, "vector_shl", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_SHR, "shr", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_VECTOR_SHR, "vector_shr", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_SHA, "sha", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_VECTOR_SHA, "vector_sha", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_ROTATE_LEFT, "rotate_left", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_BYTE_SWAP, "byte_swap", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_CNTLZ, "cntlz", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_INSERT, "insert", OPCODE_SIG_V_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_EXTRACT, "extract", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_SPLAT, "splat", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_PERMUTE, "permute", OPCODE_SIG_V_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_SWIZZLE, "swizzle", OPCODE_SIG_V_V_O, - 0); + 0) DEFINE_OPCODE( OPCODE_PACK, "pack", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_UNPACK, "unpack", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_COMPARE_EXCHANGE, "compare_exchange", OPCODE_SIG_V_V_V_V, - OPCODE_FLAG_VOLATILE); + OPCODE_FLAG_VOLATILE) DEFINE_OPCODE( OPCODE_ATOMIC_EXCHANGE, "atomic_exchange", OPCODE_SIG_V_V_V, - OPCODE_FLAG_VOLATILE); + OPCODE_FLAG_VOLATILE) DEFINE_OPCODE( OPCODE_ATOMIC_ADD, "atomic_add", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_ATOMIC_SUB, "atomic_sub", OPCODE_SIG_V_V_V, - 0); + 0) 
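
[editor note] The mass `);` -> `)` change above makes opcodes.inl a proper X-macro table: every including site defines DEFINE_OPCODE to expand however it needs and supplies its own punctuation, which a trailing semicolon in the table would break. A sketch of two hypothetical consumers (the real expansions live in opcodes.h/opcodes.cc and may differ):

    // Consumer 1: enum of opcode ids.
    #define DEFINE_OPCODE(num, name, sig, flags) num,
    enum Opcode {
    #include <alloy/hir/opcodes.inl>
    };
    #undef DEFINE_OPCODE

    // Consumer 2: parallel metadata table (field order illustrative only).
    #define DEFINE_OPCODE(num, name, sig, flags) { name, sig, flags },
    static const OpcodeInfo opcode_table[] = {
    #include <alloy/hir/opcodes.inl>
    };
    #undef DEFINE_OPCODE
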
diff --git a/src/alloy/hir/sources.gypi b/src/alloy/hir/sources.gypi index 948b43dd8..1ea2d7783 100644 --- a/src/alloy/hir/sources.gypi +++ b/src/alloy/hir/sources.gypi @@ -1,6 +1,7 @@ # Copyright 2013 Ben Vanik. All Rights Reserved. { 'sources': [ + 'block.cc', 'block.h', 'hir_builder.cc', 'hir_builder.h', diff --git a/src/alloy/hir/value.cc b/src/alloy/hir/value.cc index a684c6f2b..f70d6ceb2 100644 --- a/src/alloy/hir/value.cc +++ b/src/alloy/hir/value.cc @@ -560,6 +560,26 @@ void Value::ByteSwap() { } } +void Value::CountLeadingZeros(const ConstantValue& src) { + switch (type) { + case INT8_TYPE: + constant.i8 = __lzcnt16(src.i8) - 8; + break; + case INT16_TYPE: + constant.i8 = __lzcnt16(src.i16); + break; + case INT32_TYPE: + constant.i8 = __lzcnt(src.i32); + break; + case INT64_TYPE: + constant.i8 = __lzcnt64(src.i64); + break; + default: + XEASSERTALWAYS(); + break; + } +} + bool Value::Compare(Opcode opcode, Value* other) { // TODO(benvanik): big matrix. XEASSERTALWAYS(); diff --git a/src/alloy/hir/value.h b/src/alloy/hir/value.h index 4587efb19..e3af4906f 100644 --- a/src/alloy/hir/value.h +++ b/src/alloy/hir/value.h @@ -68,6 +68,10 @@ enum ValueFlags { VALUE_IS_ALLOCATED = (1 << 2), // Used by backends. Do not set. }; +struct RegAssignment { + const backend::MachineInfo::RegisterSet* set; + int32_t index; +}; class Value { public: @@ -91,10 +95,7 @@ public: TypeName type; uint32_t flags; - struct { - const backend::MachineInfo::RegisterSet* set; - int32_t index; - } reg; + RegAssignment reg; ConstantValue constant; Instr* def; @@ -392,6 +393,7 @@ public: void Shr(Value* other); void Sha(Value* other); void ByteSwap(); + void CountLeadingZeros(const ConstantValue& src); bool Compare(Opcode opcode, Value* other); }; diff --git a/third_party/xbyak b/third_party/xbyak index 702d6e668..2d599b3bd 160000 --- a/third_party/xbyak +++ b/third_party/xbyak @@ -1 +1 @@ -Subproject commit 702d6e6683c322f08a36ea059f6d6f8263b1bd0d +Subproject commit 2d599b3bd64a6d13c8b47a5f7410c67837bfff5d diff --git a/xenia.gyp b/xenia.gyp index e59823058..a765e5c00 100644 --- a/xenia.gyp +++ b/xenia.gyp @@ -24,6 +24,18 @@ 'target_arch%': 'x64', }, + 'conditions': [ + ['OS=="win"', { + 'variables': { + 'move_command%': 'move' + }, + }, { + 'variables': { + 'move_command%': 'mv' + }, + }] + ], + 'target_defaults': { 'include_dirs': [ 'include/', @@ -255,6 +267,7 @@ 'include_dirs': [ '.', 'src/', + '<(INTERMEDIATE_DIR)', ], 'includes': [ From efa056539892dc5c105c65ba26c6f8d3b6d13be9 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 27 May 2014 07:53:19 -0700 Subject: [PATCH 097/184] Fixing permute. --- src/alloy/backend/x64/x64_sequences.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index a48df3db5..2d02a2118 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -4028,8 +4028,6 @@ EMITTER(PERMUTE_V128, MATCH(I, V128<>, V128<>, V128<>>)) } else { e.vpshufb(e.xmm2, i.src1, e.GetXmmConstPtr(XMMByteSwapMask)); } - // Build a mask with values in src2 having 0 and values in src3 having 1. - e.vpcmpgtb(i.dest, e.xmm2, e.GetXmmConstPtr(XMMPermuteControl15)); Xmm src2_shuf = e.xmm0; if (i.src2.value->IsConstantZero()) { e.vpxor(src2_shuf, src2_shuf); @@ -4048,6 +4046,8 @@ EMITTER(PERMUTE_V128, MATCH(I, V128<>, V128<>, V128<>>)) } else { e.vpshufb(src3_shuf, i.src3, e.xmm2); } + // Build a mask with values in src2 having 0 and values in src3 having 1. 
+ e.vpcmpgtb(i.dest, e.xmm2, e.GetXmmConstPtr(XMMPermuteControl15)); e.vpblendvb(i.dest, src2_shuf, src3_shuf, i.dest); } } From 16bac6d9c567f781cde8f8dacafe989928b94193 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 27 May 2014 11:56:55 -0700 Subject: [PATCH 098/184] Constant support for a lot of vector ops. --- src/alloy/backend/x64/x64_sequence.inl | 30 ++++ src/alloy/backend/x64/x64_sequences.cc | 196 ++++++++++++++++--------- 2 files changed, 157 insertions(+), 69 deletions(-) diff --git a/src/alloy/backend/x64/x64_sequence.inl b/src/alloy/backend/x64/x64_sequence.inl index ce2b8e36e..eae1096eb 100644 --- a/src/alloy/backend/x64/x64_sequence.inl +++ b/src/alloy/backend/x64/x64_sequence.inl @@ -628,6 +628,36 @@ struct SingleSequence : public Sequence, T> { } } + template + static void EmitCommutativeBinaryXmmOp( + X64Emitter& e, const EmitArgType& i, const FN& fn) { + if (i.src1.is_constant) { + XEASSERT(!i.src2.is_constant); + e.LoadConstantXmm(e.xmm0, i.src1.constant()); + fn(e, i.dest, e.xmm0, i.src2); + } else if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + fn(e, i.dest, i.src1, e.xmm0); + } else { + fn(e, i.dest, i.src1, i.src2); + } + } + + template + static void EmitAssociativeBinaryXmmOp( + X64Emitter& e, const EmitArgType& i, const FN& fn) { + if (i.src1.is_constant) { + XEASSERT(!i.src2.is_constant); + e.LoadConstantXmm(e.xmm0, i.src1.constant()); + fn(e, i.dest, e.xmm0, i.src2); + } else if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + fn(e, i.dest, i.src1, e.xmm0); + } else { + fn(e, i.dest, i.src1, i.src2); + } + } + template static void EmitCommutativeCompareOp( X64Emitter& e, const EmitArgType& i, diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index 2d02a2118..9e94888cb 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -2272,20 +2272,23 @@ EMITTER_OPCODE_TABLE( // ============================================================================ EMITTER(VECTOR_COMPARE_EQ_V128, MATCH(I, V128<>, V128<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - switch (i.instr->flags) { - case INT8_TYPE: - e.vpcmpeqb(i.dest, i.src1, i.src2); - break; - case INT16_TYPE: - e.vpcmpeqw(i.dest, i.src1, i.src2); - break; - case INT32_TYPE: - e.vpcmpeqd(i.dest, i.src1, i.src2); - break; - case FLOAT32_TYPE: - e.vcmpeqps(i.dest, i.src1, i.src2); - break; - } + EmitCommutativeBinaryXmmOp(e, i, + [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + switch (i.instr->flags) { + case INT8_TYPE: + e.vpcmpeqb(dest, src1, src2); + break; + case INT16_TYPE: + e.vpcmpeqw(dest, src1, src2); + break; + case INT32_TYPE: + e.vpcmpeqd(dest, src1, src2); + break; + case FLOAT32_TYPE: + e.vcmpeqps(dest, src1, src2); + break; + } + }); } }; EMITTER_OPCODE_TABLE( @@ -2298,20 +2301,23 @@ EMITTER_OPCODE_TABLE( // ============================================================================ EMITTER(VECTOR_COMPARE_SGT_V128, MATCH(I, V128<>, V128<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - switch (i.instr->flags) { - case INT8_TYPE: - e.vpcmpgtb(i.dest, i.src1, i.src2); - break; - case INT16_TYPE: - e.vpcmpgtw(i.dest, i.src1, i.src2); - break; - case INT32_TYPE: - e.vpcmpgtd(i.dest, i.src1, i.src2); - break; - case FLOAT32_TYPE: - e.vcmpgtps(i.dest, i.src1, i.src2); - break; - } + EmitAssociativeBinaryXmmOp(e, i, + [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + switch (i.instr->flags) { + case INT8_TYPE: + e.vpcmpgtb(dest, 
src1, src2); + break; + case INT16_TYPE: + e.vpcmpgtw(dest, src1, src2); + break; + case INT32_TYPE: + e.vpcmpgtd(dest, src1, src2); + break; + case FLOAT32_TYPE: + e.vcmpgtps(dest, src1, src2); + break; + } + }); } }; EMITTER_OPCODE_TABLE( @@ -2324,26 +2330,29 @@ EMITTER_OPCODE_TABLE( // ============================================================================ EMITTER(VECTOR_COMPARE_SGE_V128, MATCH(I, V128<>, V128<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - switch (i.instr->flags) { - case INT8_TYPE: - e.vpcmpgtb(i.dest, i.src1, i.src2); - e.vpcmpeqb(e.xmm0, i.src1, i.src2); - e.vpor(i.dest, e.xmm0); - break; - case INT16_TYPE: - e.vpcmpgtw(i.dest, i.src1, i.src2); - e.vpcmpeqw(e.xmm0, i.src1, i.src2); - e.vpor(i.dest, e.xmm0); - break; - case INT32_TYPE: - e.vpcmpgtd(i.dest, i.src1, i.src2); - e.vpcmpeqd(e.xmm0, i.src1, i.src2); - e.vpor(i.dest, e.xmm0); - break; - case FLOAT32_TYPE: - e.vcmpgeps(i.dest, i.src1, i.src2); - break; - } + EmitAssociativeBinaryXmmOp(e, i, + [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + switch (i.instr->flags) { + case INT8_TYPE: + e.vpcmpgtb(dest, src1, src2); + e.vpcmpeqb(e.xmm0, src1, src2); + e.vpor(dest, e.xmm0); + break; + case INT16_TYPE: + e.vpcmpgtw(dest, src1, src2); + e.vpcmpeqw(e.xmm0, src1, src2); + e.vpor(dest, e.xmm0); + break; + case INT32_TYPE: + e.vpcmpgtd(dest, src1, src2); + e.vpcmpeqd(e.xmm0, src1, src2); + e.vpor(dest, e.xmm0); + break; + case FLOAT32_TYPE: + e.vcmpgeps(i.dest, i.src1, i.src2); + break; + } + }); } }; EMITTER_OPCODE_TABLE( @@ -2412,17 +2421,26 @@ EMITTER(ADD_I64, MATCH(I, I64<>, I64<>>)) { }; EMITTER(ADD_F32, MATCH(I, F32<>, F32<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vaddss(i.dest, i.src1, i.src2); + EmitCommutativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vaddss(dest, src1, src2); + }); } }; EMITTER(ADD_F64, MATCH(I, F64<>, F64<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vaddsd(i.dest, i.src1, i.src2); + EmitCommutativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vaddsd(dest, src1, src2); + }); } }; EMITTER(ADD_V128, MATCH(I, V128<>, V128<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vaddps(i.dest, i.src1, i.src2); + EmitCommutativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vaddps(dest, src1, src2); + }); } }; EMITTER_OPCODE_TABLE( @@ -2569,19 +2587,28 @@ EMITTER(SUB_I64, MATCH(I, I64<>, I64<>>)) { EMITTER(SUB_F32, MATCH(I, F32<>, F32<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { XEASSERT(!i.instr->flags); - e.vsubss(i.dest, i.src1, i.src2); + EmitAssociativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vsubss(dest, src1, src2); + }); } }; EMITTER(SUB_F64, MATCH(I, F64<>, F64<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { XEASSERT(!i.instr->flags); - e.vsubsd(i.dest, i.src1, i.src2); + EmitAssociativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vsubsd(dest, src1, src2); + }); } }; EMITTER(SUB_V128, MATCH(I, V128<>, V128<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { XEASSERT(!i.instr->flags); - e.vsubps(i.dest, i.src1, i.src2); + EmitAssociativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vsubps(dest, src1, src2); + }); } }; EMITTER_OPCODE_TABLE( @@ -2682,19 +2709,28 @@ EMITTER(MUL_I64, MATCH(I, I64<>, I64<>>)) { EMITTER(MUL_F32, MATCH(I, F32<>, F32<>>)) { static void Emit(X64Emitter& e, const 
EmitArgType& i) { XEASSERT(!i.instr->flags); - e.vmulss(i.dest, i.src1, i.src2); + EmitCommutativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vmulss(dest, src1, src2); + }); } }; EMITTER(MUL_F64, MATCH(I, F64<>, F64<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { XEASSERT(!i.instr->flags); - e.vmulsd(i.dest, i.src1, i.src2); + EmitCommutativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vmulsd(dest, src1, src2); + }); } }; EMITTER(MUL_V128, MATCH(I, V128<>, V128<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { XEASSERT(!i.instr->flags); - e.vmulps(i.dest, i.src1, i.src2); + EmitCommutativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vmulps(dest, src1, src2); + }); } }; EMITTER_OPCODE_TABLE( @@ -2969,19 +3005,28 @@ EMITTER(DIV_I64, MATCH(I, I64<>, I64<>>)) { EMITTER(DIV_F32, MATCH(I, F32<>, F32<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { XEASSERT(!i.instr->flags); - e.vdivss(i.dest, i.src1, i.src2); + EmitAssociativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vdivss(dest, src1, src2); + }); } }; EMITTER(DIV_F64, MATCH(I, F64<>, F64<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { XEASSERT(!i.instr->flags); - e.vdivsd(i.dest, i.src1, i.src2); + EmitAssociativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vdivsd(dest, src1, src2); + }); } }; EMITTER(DIV_V128, MATCH(I, V128<>, V128<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { XEASSERT(!i.instr->flags); - e.vdivps(i.dest, i.src1, i.src2); + EmitAssociativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vdivps(dest, src1, src2); + }); } }; EMITTER_OPCODE_TABLE( @@ -3274,9 +3319,11 @@ EMITTER_OPCODE_TABLE( EMITTER(DOT_PRODUCT_3_V128, MATCH(I, V128<>, V128<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { // http://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx - // TODO(benvanik): verify ordering - // TODO(benvanik): apparently this is very slow - find alternative? - e.vdpps(i.dest, i.src1, i.src2, B01110001); + EmitCommutativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + // TODO(benvanik): apparently this is very slow - find alternative? + e.vdpps(dest, src1, src2, B01110001); + }); } }; EMITTER_OPCODE_TABLE( @@ -3290,9 +3337,11 @@ EMITTER_OPCODE_TABLE( EMITTER(DOT_PRODUCT_4_V128, MATCH(I, V128<>, V128<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { // http://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx - // TODO(benvanik): verify ordering - // TODO(benvanik): apparently this is very slow - find alternative? - e.vdpps(i.dest, i.src1, i.src2, B11110001); + EmitCommutativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + // TODO(benvanik): apparently this is very slow - find alternative? 
+ e.vdpps(dest, src1, src2, B11110001); + }); } }; EMITTER_OPCODE_TABLE( @@ -3333,7 +3382,10 @@ EMITTER(AND_I64, MATCH(I, I64<>, I64<>>)) { }; EMITTER(AND_V128, MATCH(I, V128<>, V128<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vpand(i.dest, i.src1, i.src2); + EmitCommutativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vpand(dest, src1, src2); + }); } }; EMITTER_OPCODE_TABLE( @@ -3378,7 +3430,10 @@ EMITTER(OR_I64, MATCH(I, I64<>, I64<>>)) { }; EMITTER(OR_V128, MATCH(I, V128<>, V128<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vpor(i.dest, i.src1, i.src2); + EmitCommutativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vpor(dest, src1, src2); + }); } }; EMITTER_OPCODE_TABLE( @@ -3423,7 +3478,10 @@ EMITTER(XOR_I64, MATCH(I, I64<>, I64<>>)) { }; EMITTER(XOR_V128, MATCH(I, V128<>, V128<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vpxor(i.dest, i.src1, i.src2); + EmitCommutativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vpxor(dest, src1, src2); + }); } }; EMITTER_OPCODE_TABLE( From d85665bb068024db5642391f01d7e0d03099101d Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 27 May 2014 11:57:20 -0700 Subject: [PATCH 099/184] More efficient 11111... vec loading. --- src/alloy/backend/x64/x64_emitter.cc | 7 +++---- src/alloy/backend/x64/x64_emitter.h | 1 - 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index ce1e4e70a..aad925ef7 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -443,7 +443,6 @@ Address X64Emitter::GetXmmConstPtr(XmmConst id) { /* XMMUnpackD3DCOLOR */ vec128i(0xFFFFFF02, 0xFFFFFF01, 0xFFFFFF00, 0xFFFFFF02), /* XMMOneOver255 */ vec128f(1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f), /* XMMShiftMaskPS */ vec128i(0x0000001Fu, 0x0000001Fu, 0x0000001Fu, 0x0000001Fu), - /* XMMOneMask */ vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu), }; // TODO(benvanik): cache base pointer somewhere? stack? It'd be nice to // prevent this move. @@ -461,7 +460,7 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) { vpxor(dest, dest); } else if (v.low == ~0ull && v.high == ~0ull) { // 1111... - vmovaps(dest, GetXmmConstPtr(XMMOneMask)); + vpcmpeqb(dest, dest); } else { // TODO(benvanik): see what other common values are. // TODO(benvanik): build constant table - 99% are reused. @@ -481,7 +480,7 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, float v) { vpxor(dest, dest); } else if (x.i == ~0UL) { // 1111... - vmovaps(dest, GetXmmConstPtr(XMMOneMask)); + vpcmpeqb(dest, dest); } else { // TODO(benvanik): see what other common values are. // TODO(benvanik): build constant table - 99% are reused. @@ -500,7 +499,7 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, double v) { vpxor(dest, dest); } else if (x.i == ~0ULL) { // 1111... - vmovaps(dest, GetXmmConstPtr(XMMOneMask)); + vpcmpeqb(dest, dest); } else { // TODO(benvanik): see what other common values are. // TODO(benvanik): build constant table - 99% are reused. 
diff --git a/src/alloy/backend/x64/x64_emitter.h b/src/alloy/backend/x64/x64_emitter.h index 93f859616..7a36e3837 100644 --- a/src/alloy/backend/x64/x64_emitter.h +++ b/src/alloy/backend/x64/x64_emitter.h @@ -51,7 +51,6 @@ enum XmmConst { XMMUnpackD3DCOLOR = 12, XMMOneOver255 = 13, XMMShiftMaskPS = 14, - XMMOneMask = 15, }; // Unfortunately due to the design of xbyak we have to pass this to the ctor. From 2d765461ff9321dcea90399988cc3914b0866f41 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 27 May 2014 12:03:13 -0700 Subject: [PATCH 100/184] Common constant vector shifts. --- src/alloy/backend/x64/x64_sequences.cc | 196 ++++++++++++++++++++++++- 1 file changed, 188 insertions(+), 8 deletions(-) diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index 9e94888cb..b0cd63095 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -3688,17 +3688,107 @@ EMITTER_OPCODE_TABLE( EMITTER(VECTOR_SHL_V128, MATCH(I, V128<>, V128<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { switch (i.instr->flags) { + case INT8_TYPE: + EmitInt8(e, i); + break; + case INT16_TYPE: + EmitInt16(e, i); + break; case INT32_TYPE: - // src shift mask may have values >31, and x86 sets to zero when - // that happens so we mask. - e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); - e.vpsllvd(i.dest, i.src1, e.xmm0); + EmitInt32(e, i); break; default: XEASSERTALWAYS(); break; } } + static void EmitInt8(X64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 15; ++n) { + if (shamt.b16[n] != shamt.b16[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same. + uint8_t sh = shamt.b16[0] & 0x7; + if (!sh) { + // No shift? + e.vmovaps(i.dest, i.src1); + } else { + // Even bytes. + e.vpsrlw(e.xmm0, i.src1, 8); + e.vpsllw(e.xmm0, sh + 8); + // Odd bytes; shift left past the word edge so the bits that + // would carry into the next byte drop, then realign. + e.vpsllw(i.dest, i.src1, 8 + sh); + e.vpsrlw(i.dest, 8); + // Mix. + e.vpor(i.dest, e.xmm0); + } + } else { + // Counts differ, so pre-mask and load constant. + XEASSERTALWAYS(); + } + } else { + // Fully variable shift. + XEASSERTALWAYS(); + } + } + static void EmitInt16(X64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 7; ++n) { + if (shamt.s8[n] != shamt.s8[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use vpsllw. + e.vpsllw(i.dest, i.src1, shamt.s8[0] & 0xF); + } else { + // Counts differ, so pre-mask and load constant. + XEASSERTALWAYS(); + } + } else { + // Fully variable shift. + XEASSERTALWAYS(); + } + } + static void EmitInt32(X64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 3; ++n) { + if (shamt.i4[n] != shamt.i4[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use vpslld. + e.vpslld(i.dest, i.src1, shamt.b16[0] & 0x1F); + } else { + // Counts differ, so pre-mask and load constant. + vec128_t masked = i.src2.constant(); + for (size_t n = 0; n < 4; ++n) { + masked.i4[n] &= 0x1F; + } + e.LoadConstantXmm(e.xmm0, masked); + e.vpsllvd(i.dest, i.src1, e.xmm0); + } + } else { + // Fully variable shift. 
+ // src shift mask may have values >31, and x86 sets to zero when + // that happens so we mask. + e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); + e.vpsllvd(i.dest, i.src1, e.xmm0); + } + } }; EMITTER_OPCODE_TABLE( OPCODE_VECTOR_SHL, @@ -3711,17 +3801,107 @@ EMITTER_OPCODE_TABLE( EMITTER(VECTOR_SHR_V128, MATCH(I, V128<>, V128<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { switch (i.instr->flags) { + case INT8_TYPE: + EmitInt8(e, i); + break; + case INT16_TYPE: + EmitInt16(e, i); + break; case INT32_TYPE: - // src shift mask may have values >31, and x86 sets to zero when - // that happens so we mask. - e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); - e.vpsrlvd(i.dest, i.src1, e.xmm0); + EmitInt32(e, i); break; default: XEASSERTALWAYS(); break; } } + static void EmitInt8(X64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 15; ++n) { + if (shamt.b16[n] != shamt.b16[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same. + uint8_t sh = shamt.b16[0] & 0x7; + if (!sh) { + // No shift? + e.vmovaps(i.dest, i.src1); + } else { + // Even bytes. + e.vpsllw(e.xmm0, i.src1, 8); + e.vpsrlw(e.xmm0, sh + 8); + // Odd bytes; shift right past the word edge so the bits that + // would carry into the next byte drop, then realign. + e.vpsrlw(i.dest, i.src1, 8 + sh); + e.vpsllw(i.dest, 8); + // Mix. + e.vpor(i.dest, e.xmm0); + } + } else { + // Counts differ, so pre-mask and load constant. + XEASSERTALWAYS(); + } + } else { + // Fully variable shift. + XEASSERTALWAYS(); + } + } + static void EmitInt16(X64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 7; ++n) { + if (shamt.s8[n] != shamt.s8[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use vpsrlw. + e.vpsrlw(i.dest, i.src1, shamt.s8[0] & 0xF); + } else { + // Counts differ, so pre-mask and load constant. + XEASSERTALWAYS(); + } + } else { + // Fully variable shift. + XEASSERTALWAYS(); + } + } + static void EmitInt32(X64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 3; ++n) { + if (shamt.i4[n] != shamt.i4[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use vpsrld. + e.vpsrld(i.dest, i.src1, shamt.b16[0] & 0x1F); + } else { + // Counts differ, so pre-mask and load constant. + vec128_t masked = i.src2.constant(); + for (size_t n = 0; n < 4; ++n) { + masked.i4[n] &= 0x1F; + } + e.LoadConstantXmm(e.xmm0, masked); + e.vpsrlvd(i.dest, i.src1, e.xmm0); + } + } else { + // Fully variable shift. + // src shift mask may have values >31, and x86 sets to zero when + // that happens so we mask. + e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); + e.vpsrlvd(i.dest, i.src1, e.xmm0); + } + } }; EMITTER_OPCODE_TABLE( OPCODE_VECTOR_SHR, From 7635bb71a0ef354a797c8897ebace90aef1a762f Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 27 May 2014 12:03:40 -0700 Subject: [PATCH 101/184] Constant permute. 
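When both vector operands of the dword permute are constants there is no register for vpshufd to read, so materialize them into scratch xmms first. For reference, a scalar sketch of what the vpshufd + vpblendd pair computes (hypothetical helper, assuming the control-byte layout the emitter decodes; not part of this change):

    // Each control byte picks one of eight source dwords: the low two
    // bits select the lane, bit 2 selects src2 vs. src3.
    vec128_t Permute32Ref(uint32_t control, const vec128_t& src2,
                          const vec128_t& src3) {
      vec128_t dest;
      for (size_t n = 0; n < 4; ++n) {
        uint32_t sel = (control >> ((3 - n) * 8)) & 0x7;
        const vec128_t& src = (sel & 0x4) ? src3 : src2;
        dest.i4[n] = src.i4[sel & 0x3];
      }
      return dest;
    }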
--- src/alloy/backend/x64/x64_sequences.cc | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index b0cd63095..1a3f90abc 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -4219,13 +4219,28 @@ EMITTER(PERMUTE_I32, MATCH(I, I32<>, V128<>, V128<>>)) { (((control >> 18) & 0x1) << 1) | (((control >> 10) & 0x1) << 2) | (((control >> 2) & 0x1) << 3); - if (i.dest != i.src3) { - e.vpshufd(i.dest, i.src2, src_control); - e.vpshufd(e.xmm0, i.src3, src_control); + // TODO(benvanik): if src2/src3 are constants, shuffle now! + Xmm src2; + if (i.src2.is_constant) { + src2 = e.xmm1; + e.LoadConstantXmm(src2, i.src2.constant()); + } else { + src2 = i.src2; + } + Xmm src3; + if (i.src3.is_constant) { + src3 = e.xmm2; + e.LoadConstantXmm(src3, i.src3.constant()); + } else { + src3 = i.src3; + } + if (i.dest != src3) { + e.vpshufd(i.dest, src2, src_control); + e.vpshufd(e.xmm0, src3, src_control); e.vpblendd(i.dest, e.xmm0, blend_control); } else { - e.vmovaps(e.xmm0, i.src3); - e.vpshufd(i.dest, i.src2, src_control); + e.vmovaps(e.xmm0, src3); + e.vpshufd(i.dest, src2, src_control); e.vpshufd(e.xmm0, e.xmm0, src_control); e.vpblendd(i.dest, e.xmm0, blend_control); } From e32342e956568128b620d0dd133145cb3e069938 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 27 May 2014 13:02:00 -0700 Subject: [PATCH 102/184] Variable vector_shl int8. --- src/alloy/backend/x64/x64_emitter.cc | 1 + src/alloy/backend/x64/x64_emitter.h | 1 + src/alloy/backend/x64/x64_sequences.cc | 36 +++++++++++++++++++++++++- 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index aad925ef7..6e94a660a 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -443,6 +443,7 @@ Address X64Emitter::GetXmmConstPtr(XmmConst id) { /* XMMUnpackD3DCOLOR */ vec128i(0xFFFFFF02, 0xFFFFFF01, 0xFFFFFF00, 0xFFFFFF02), /* XMMOneOver255 */ vec128f(1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f), /* XMMShiftMaskPS */ vec128i(0x0000001Fu, 0x0000001Fu, 0x0000001Fu, 0x0000001Fu), + /* XMMShiftByteMask */ vec128i(0x000000FFu, 0x000000FFu, 0x000000FFu, 0x000000FFu), }; // TODO(benvanik): cache base pointer somewhere? stack? It'd be nice to // prevent this move. diff --git a/src/alloy/backend/x64/x64_emitter.h b/src/alloy/backend/x64/x64_emitter.h index 7a36e3837..d67348e18 100644 --- a/src/alloy/backend/x64/x64_emitter.h +++ b/src/alloy/backend/x64/x64_emitter.h @@ -51,6 +51,7 @@ enum XmmConst { XMMUnpackD3DCOLOR = 12, XMMOneOver255 = 13, XMMShiftMaskPS = 14, + XMMShiftByteMask = 15, }; // Unfortunately due to the design of xbyak we have to pass this to the ctor. diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index 1a3f90abc..7f0850d6c 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -3734,7 +3734,41 @@ EMITTER(VECTOR_SHL_V128, MATCH(I, V128<>, V128<>>)) { } } else { // Fully variable shift. - XEASSERTALWAYS(); + // TODO(benvanik): find a better sequence. 
+ Xmm temp = i.dest; + if (i.dest == i.src1 || i.dest == i.src2) { + temp = e.xmm2; + } + auto byte_mask = e.GetXmmConstPtr(XMMShiftByteMask); + // AABBCCDD|EEFFGGHH|IIJJKKLL|MMNNOOPP + // DD| HH| LL| PP + e.vpand(e.xmm0, i.src1, byte_mask); + e.vpand(e.xmm1, i.src2, byte_mask); + e.vpsllvd(temp, e.xmm0, e.xmm1); + // CC | GG | KK | OO + e.vpsrld(e.xmm0, i.src1, 8); + e.vpand(e.xmm0, byte_mask); + e.vpsrld(e.xmm1, i.src2, 8); + e.vpand(e.xmm1, byte_mask); + e.vpsllvd(e.xmm0, e.xmm0, e.xmm1); + e.vpslld(e.xmm0, 8); + e.vpor(temp, e.xmm0); + // BB | FF | JJ | NN + e.vpsrld(e.xmm0, i.src1, 16); + e.vpand(e.xmm0, byte_mask); + e.vpsrld(e.xmm1, i.src2, 16); + e.vpand(e.xmm1, byte_mask); + e.vpsllvd(e.xmm0, e.xmm0, e.xmm1); + e.vpslld(e.xmm0, 16); + e.vpor(temp, e.xmm0); + // AA |EE |II |MM + e.vpsrld(e.xmm0, i.src1, 24); + e.vpand(e.xmm0, byte_mask); + e.vpsrld(e.xmm1, i.src2, 24); + e.vpand(e.xmm1, byte_mask); + e.vpsllvd(e.xmm0, e.xmm0, e.xmm1); + e.vpslld(e.xmm0, 24); + e.vpor(i.dest, temp, e.xmm0); } } static void EmitInt16(X64Emitter& e, const EmitArgType& i) { From 87f8a4bb25a1a7dada434e75be909eb518232a9e Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 27 May 2014 13:02:23 -0700 Subject: [PATCH 103/184] vector_add. --- src/alloy/backend/x64/x64_sequences.cc | 65 ++++++++++++++++++++++---- 1 file changed, 57 insertions(+), 8 deletions(-) diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index 7f0850d6c..1f3c1efc8 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -2259,12 +2259,14 @@ EMITTER_OPCODE_TABLE( // ============================================================================ // OPCODE_DID_SATURATE // ============================================================================ -//EMITTER(DID_SATURATE, MATCH(I>)) { -// static void Emit(X64Emitter& e, const EmitArgType& i) { -// } -//}; -//EMITTER_OPCODE_TABLE(OPCODE_DID_SATURATE, -// DID_SATURATE); +EMITTER(DID_SATURATE, MATCH(I, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): implement saturation check (VECTOR_ADD, etc). 
+ e.xor(i.dest, i.dest); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_DID_SATURATE, + DID_SATURATE); // ============================================================================ @@ -2530,6 +2532,53 @@ EMITTER_OPCODE_TABLE( // ============================================================================ // OPCODE_VECTOR_ADD // ============================================================================ +EMITTER(VECTOR_ADD, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryXmmOp(e, i, + [&i](X64Emitter& e, const Xmm& dest, const Xmm& src1, const Xmm& src2) { + const TypeName part_type = static_cast<TypeName>(i.instr->flags & 0xFF); + const uint32_t arithmetic_flags = i.instr->flags >> 8; + bool is_unsigned = !!(arithmetic_flags & ARITHMETIC_UNSIGNED); + bool saturate = !!(arithmetic_flags & ARITHMETIC_SATURATE); + switch (part_type) { + case INT8_TYPE: + if (saturate) { + // TODO(benvanik): trace DID_SATURATE + if (is_unsigned) { + e.vpaddsb(dest, src1, src2); + } else { + e.vpaddusb(dest, src1, src2); + } + } else { + e.vpaddb(dest, src1, src2); + } + break; + case INT16_TYPE: + if (saturate) { + // TODO(benvanik): trace DID_SATURATE + if (is_unsigned) { + e.vpaddsw(dest, src1, src2); + } else { + e.vpaddusw(dest, src1, src2); + } + } else { + e.vpaddw(dest, src1, src2); + } + break; + case INT32_TYPE: + XEASSERTALWAYS(); + break; + case FLOAT32_TYPE: + e.vaddps(dest, src1, src2); + break; + default: XEASSERTALWAYS(); break; + } + }); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_VECTOR_ADD, + VECTOR_ADD); // ============================================================================ @@ -4713,7 +4762,7 @@ void alloy::backend::x64::RegisterSequences() { REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_UGE); REGISTER_EMITTER_OPCODE_TABLE(OPCODE_DID_CARRY); REGISTER_EMITTER_OPCODE_TABLE(OPCODE_DID_OVERFLOW); - //REGISTER_EMITTER_OPCODE_TABLE(OPCODE_DID_SATURATE); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_DID_SATURATE); REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_EQ); REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_SGT); REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_SGE); @@ -4721,7 +4770,7 @@ //REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_UGE); REGISTER_EMITTER_OPCODE_TABLE(OPCODE_ADD); REGISTER_EMITTER_OPCODE_TABLE(OPCODE_ADD_CARRY); - //REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ADD); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ADD); REGISTER_EMITTER_OPCODE_TABLE(OPCODE_SUB); REGISTER_EMITTER_OPCODE_TABLE(OPCODE_MUL); REGISTER_EMITTER_OPCODE_TABLE(OPCODE_MUL_HI); From 0612a68f80fef40d1d5ea9036c2f63b07bdc5220 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 27 May 2014 13:02:38 -0700 Subject: [PATCH 104/184] Fixing encoding of vpextr*. 
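vpextrb/vpextrw/vpextrd always produce a full r32 (or r64) result, so handing xbyak the narrow 8/16-bit destination register yields a bad encoding; widen the destination register before emitting. Intrinsics-level sketch of the same operation (illustrative only; the lane index must be an immediate, which is why only the constant path is implemented):

    #include <smmintrin.h>  // SSE4.1 _mm_extract_epi8
    #include <cstdint>

    template <int Lane>
    uint8_t ExtractByteRef(__m128i v) {
      // The instruction writes a 32-bit GPR; only the low byte matters here.
      return static_cast<uint8_t>(_mm_extract_epi8(v, Lane));
    }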
--- src/alloy/backend/x64/x64_sequences.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index 1f3c1efc8..93039b45e 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -4162,7 +4162,7 @@ EMITTER_OPCODE_TABLE( EMITTER(EXTRACT_I8, MATCH(I, V128<>, I8<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { if (i.src2.is_constant) { - e.vpextrb(i.dest, i.src1, i.src2.constant()); + e.vpextrb(i.dest.reg().cvt64(), i.src1, i.src2.constant()); } else { XEASSERTALWAYS(); } @@ -4171,7 +4171,7 @@ EMITTER(EXTRACT_I8, MATCH(I, V128<>, I8<>>)) { EMITTER(EXTRACT_I16, MATCH(I, V128<>, I8<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { if (i.src2.is_constant) { - e.vpextrw(i.dest, i.src1, i.src2.constant()); + e.vpextrw(i.dest.reg().cvt64(), i.src1, i.src2.constant()); } else { XEASSERTALWAYS(); } @@ -4186,9 +4186,9 @@ EMITTER(EXTRACT_I32, MATCH(I, V128<>, I8<>>)) { vec128b(15, 14, 13, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), }; if (i.src2.is_constant) { - e.vpextrd(i.dest, i.src1, i.src2.constant()); + e.vpextrd(i.dest.reg().cvt64(), i.src1, i.src2.constant()); } else { - // Get teh desired word in xmm0, then extract that. + // Get the desired word in xmm0, then extract that. // TODO(benvanik): find a better way, this sequence is terrible. e.xor(e.rax, e.rax); e.mov(e.al, i.src2); @@ -4197,7 +4197,7 @@ EMITTER(EXTRACT_I32, MATCH(I, V128<>, I8<>>)) { e.mov(e.rdx, reinterpret_cast(extract_table_32)); e.vmovaps(e.xmm0, e.ptr[e.rdx + e.rax]); e.vpshufb(e.xmm0, i.src1, e.xmm0); - e.vpextrd(i.dest, e.xmm0, 0); + e.vpextrd(i.dest.reg().cvt32(), e.xmm0, 0); e.ReloadEDX(); } } From 8619a15ee3be88b548756ee7de2a1063cc052ba4 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 27 May 2014 13:25:22 -0700 Subject: [PATCH 105/184] Emulated POW2/LOG2. Can probably be faked with polynomials. --- src/alloy/backend/x64/x64_sequences.cc | 97 +++++++++++++++++++++++++- 1 file changed, 94 insertions(+), 3 deletions(-) diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index 93039b45e..6e6d85da9 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -977,7 +977,7 @@ EMITTER(VECTOR_CONVERT_F2I, MATCH(I, V128<>>)) e.vcvttps2dq(i.dest, i.src1); if (i.instr->flags & ARITHMETIC_SATURATE) { // TODO(benvanik): check saturation. - e.UnimplementedInstr(i.instr); + // In theory cvt throws if it saturates. 
} } }; @@ -3355,11 +3355,102 @@ EMITTER_OPCODE_TABLE( // ============================================================================ // OPCODE_POW2 // ============================================================================ +// TODO(benvanik): use approx here: +// http://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html +EMITTER(POW2_F32, MATCH(I, F32<>>)) { + static __m128 EmulatePow2(__m128 src) { + float result = static_cast<float>(pow(2, src.m128_f32[0])); + return _mm_load_ss(&result); + } + static void Emit(X64Emitter& e, const EmitArgType& i) { + XEASSERTALWAYS(); + e.lea(e.r8, e.StashXmm(i.src1)); + e.CallNative(EmulatePow2); + e.vmovaps(i.dest, e.xmm0); + } +}; +EMITTER(POW2_F64, MATCH(I, F64<>>)) { + static __m128d EmulatePow2(__m128d src) { + double result = pow(2, src.m128d_f64[0]); + return _mm_load_sd(&result); + } + static void Emit(X64Emitter& e, const EmitArgType& i) { + XEASSERTALWAYS(); + e.lea(e.r8, e.StashXmm(i.src1)); + e.CallNative(EmulatePow2); + e.vmovaps(i.dest, e.xmm0); + } +}; +EMITTER(POW2_V128, MATCH(I, V128<>>)) { + static __m128 EmulatePow2(__m128 src) { + __m128 result; + for (size_t i = 0; i < 4; ++i) { + result.m128_f32[i] = static_cast<float>(pow(2, src.m128_f32[i])); + } + return result; + } + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.lea(e.r8, e.StashXmm(i.src1)); + e.CallNative(EmulatePow2); + e.vmovaps(i.dest, e.xmm0); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_POW2, + POW2_F32, + POW2_F64, + POW2_V128); // ============================================================================ // OPCODE_LOG2 // ============================================================================ +// TODO(benvanik): use approx here: +// http://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html +EMITTER(LOG2_F32, MATCH(I, F32<>>)) { + static __m128 EmulateLog2(__m128 src) { + float result = log2(src.m128_f32[0]); + return _mm_load_ss(&result); + } + static void Emit(X64Emitter& e, const EmitArgType& i) { + XEASSERTALWAYS(); + e.lea(e.r8, e.StashXmm(i.src1)); + e.CallNative(EmulateLog2); + e.vmovaps(i.dest, e.xmm0); + } +}; +EMITTER(LOG2_F64, MATCH(I, F64<>>)) { + static __m128d EmulateLog2(__m128d src) { + double result = log2(src.m128d_f64[0]); + return _mm_load_sd(&result); + } + static void Emit(X64Emitter& e, const EmitArgType& i) { + XEASSERTALWAYS(); + e.lea(e.r8, e.StashXmm(i.src1)); + e.CallNative(EmulateLog2); + e.vmovaps(i.dest, e.xmm0); + } +}; +EMITTER(LOG2_V128, MATCH(I, V128<>>)) { + static __m128 EmulateLog2(__m128 src) { + __m128 result; + for (size_t i = 0; i < 4; ++i) { + result.m128_f32[i] = log2(src.m128_f32[i]); + } + return result; + } + static void Emit(X64Emitter& e, const EmitArgType& i) { + XEASSERTALWAYS(); + e.lea(e.r8, e.StashXmm(i.src1)); + e.CallNative(EmulateLog2); + e.vmovaps(i.dest, e.xmm0); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_LOG2, + LOG2_F32, + LOG2_F64, + LOG2_V128); // ============================================================================ @@ -4781,8 +4872,8 @@ void alloy::backend::x64::RegisterSequences() { REGISTER_EMITTER_OPCODE_TABLE(OPCODE_ABS); REGISTER_EMITTER_OPCODE_TABLE(OPCODE_SQRT); REGISTER_EMITTER_OPCODE_TABLE(OPCODE_RSQRT); - //REGISTER_EMITTER_OPCODE_TABLE(OPCODE_POW2); - //REGISTER_EMITTER_OPCODE_TABLE(OPCODE_LOG2); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_POW2); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_LOG2); REGISTER_EMITTER_OPCODE_TABLE(OPCODE_DOT_PRODUCT_3); REGISTER_EMITTER_OPCODE_TABLE(OPCODE_DOT_PRODUCT_4); REGISTER_EMITTER_OPCODE_TABLE(OPCODE_AND); From 
5436cde0fc857e4d4c1b58432f48ab8c9ae33bbd Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 27 May 2014 14:27:07 -0700 Subject: [PATCH 106/184] Saturating unsigned VECTOR_ADD. --- src/alloy/backend/x64/x64_emitter.cc | 1 + src/alloy/backend/x64/x64_emitter.h | 1 + src/alloy/backend/x64/x64_sequences.cc | 36 +++++++++++++++++++++++++- 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index 6e94a660a..684fcaa86 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -444,6 +444,7 @@ Address X64Emitter::GetXmmConstPtr(XmmConst id) { /* XMMOneOver255 */ vec128f(1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f), /* XMMShiftMaskPS */ vec128i(0x0000001Fu, 0x0000001Fu, 0x0000001Fu, 0x0000001Fu), /* XMMShiftByteMask */ vec128i(0x000000FFu, 0x000000FFu, 0x000000FFu, 0x000000FFu), + /* XMMUnsignedDwordMax */ vec128i(0xFFFFFFFFu, 0x00000000u, 0xFFFFFFFFu, 0x00000000u), }; // TODO(benvanik): cache base pointer somewhere? stack? It'd be nice to // prevent this move. diff --git a/src/alloy/backend/x64/x64_emitter.h b/src/alloy/backend/x64/x64_emitter.h index d67348e18..3ac92be3f 100644 --- a/src/alloy/backend/x64/x64_emitter.h +++ b/src/alloy/backend/x64/x64_emitter.h @@ -52,6 +52,7 @@ enum XmmConst { XMMOneOver255 = 13, XMMShiftMaskPS = 14, XMMShiftByteMask = 15, + XMMUnsignedDwordMax = 16, }; // Unfortunately due to the design of xbyak we have to pass this to the ctor. diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index 6e6d85da9..9ece83e67 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -2566,7 +2566,41 @@ EMITTER(VECTOR_ADD, MATCH(I, V128<>, V128<>>)) { } break; case INT32_TYPE: - XEASSERTALWAYS(); + if (saturate) { + if (is_unsigned) { + // We reuse all these temps... + XEASSERT(src1 != e.xmm0 && src1 != e.xmm1 && src1 != e.xmm2); + XEASSERT(src2 != e.xmm0 && src2 != e.xmm1 && src2 != e.xmm2); + // Clamp to 0xFFFFFFFF. + // Wish there was a vpaddusd... + // | A | B | C | D | + // | B | D | + e.db(0xCC); + e.vpsllq(e.xmm0, src1, 32); + e.vpsllq(e.xmm1, src2, 32); + e.vpsrlq(e.xmm0, 32); + e.vpsrlq(e.xmm1, 32); + e.vpaddq(e.xmm0, e.xmm1); + e.vpcmpgtq(e.xmm0, e.GetXmmConstPtr(XMMUnsignedDwordMax)); + e.vpsllq(e.xmm0, 32); + e.vpsrlq(e.xmm0, 32); + // | A | C | + e.vpsrlq(e.xmm1, src1, 32); + e.vpsrlq(e.xmm2, src2, 32); + e.vpaddq(e.xmm1, e.xmm2); + e.vpcmpgtq(e.xmm1, e.GetXmmConstPtr(XMMUnsignedDwordMax)); + e.vpsllq(e.xmm1, 32); + // xmm0 = mask of saturated dwords == 111... + e.vpor(e.xmm0, e.xmm1); + e.vpaddd(dest, src1, src2); + // dest.f[n] = xmm0.f[n] ? xmm0.f[n] : dest.f[n]; + e.vblendvps(dest, dest, e.xmm0, e.xmm0); + } else { + XEASSERTALWAYS(); + } + } else { + e.vpaddd(dest, src1, src2); + } break; case FLOAT32_TYPE: e.vaddps(dest, src1, src2); From 2856d38024de34f155490778ca16ee7506f1ce2d Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 27 May 2014 14:31:19 -0700 Subject: [PATCH 107/184] Fixing some instructions. 
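Fallout from the constant-operand helpers: the SGE float path still referenced i.dest/i.src1/i.src2 inside its lambda, bypassing any temporary the helper had loaded for a constant operand; the signed/unsigned saturating adds were swapped; a leftover int3 and the vpextr destination widths are also cleaned up. The lambda pitfall, sketched with the emitter's own names:

    // Wrong: reads the raw operands, ignoring the xmm temp that
    // EmitAssociativeBinaryXmmOp substitutes for a constant:
    //   e.vcmpgeps(i.dest, i.src1, i.src2);
    // Right: always go through the registers handed to the lambda:
    //   e.vcmpgeps(dest, src1, src2);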
--- src/alloy/backend/x64/x64_sequences.cc | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index 9ece83e67..d60404aa1 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -2351,7 +2351,7 @@ EMITTER(VECTOR_COMPARE_SGE_V128, MATCH(I, V128 e.vpor(dest, e.xmm0); break; case FLOAT32_TYPE: - e.vcmpgeps(i.dest, i.src1, i.src2); + e.vcmpgeps(dest, src1, src2); break; } }); @@ -2545,9 +2545,9 @@ EMITTER(VECTOR_ADD, MATCH(I, V128<>, V128<>>)) { if (saturate) { // TODO(benvanik): trace DID_SATURATE if (is_unsigned) { - e.vpaddsb(dest, src1, src2); - } else { e.vpaddusb(dest, src1, src2); + } else { + e.vpaddsb(dest, src1, src2); } } else { e.vpaddb(dest, src1, src2); @@ -2557,9 +2557,9 @@ EMITTER(VECTOR_ADD, MATCH(I, V128<>, V128<>>)) { if (saturate) { // TODO(benvanik): trace DID_SATURATE if (is_unsigned) { - e.vpaddsw(dest, src1, src2); - } else { e.vpaddusw(dest, src1, src2); + } else { + e.vpaddsw(dest, src1, src2); } } else { e.vpaddw(dest, src1, src2); @@ -2575,7 +2575,6 @@ EMITTER(VECTOR_ADD, MATCH(I, V128<>, V128<>>)) { // Wish there was a vpaddusd... // | A | B | C | D | // | B | D | - e.db(0xCC); e.vpsllq(e.xmm0, src1, 32); e.vpsllq(e.xmm1, src2, 32); e.vpsrlq(e.xmm0, 32); @@ -4287,7 +4286,7 @@ EMITTER_OPCODE_TABLE( EMITTER(EXTRACT_I8, MATCH(I, V128<>, I8<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { if (i.src2.is_constant) { - e.vpextrb(i.dest.reg().cvt64(), i.src1, i.src2.constant()); + e.vpextrb(i.dest.reg().cvt32(), i.src1, i.src2.constant()); } else { XEASSERTALWAYS(); } @@ -4296,7 +4295,7 @@ EMITTER(EXTRACT_I8, MATCH(I, V128<>, I8<>>)) { EMITTER(EXTRACT_I16, MATCH(I, V128<>, I8<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { if (i.src2.is_constant) { - e.vpextrw(i.dest.reg().cvt64(), i.src1, i.src2.constant()); + e.vpextrw(i.dest.reg().cvt32(), i.src1, i.src2.constant()); } else { XEASSERTALWAYS(); } @@ -4311,7 +4310,7 @@ EMITTER(EXTRACT_I32, MATCH(I, V128<>, I8<>>)) { vec128b(15, 14, 13, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), }; if (i.src2.is_constant) { - e.vpextrd(i.dest.reg().cvt64(), i.src1, i.src2.constant()); + e.vpextrd(i.dest, i.src1, i.src2.constant()); } else { // Get the desired word in xmm0, then extract that. // TODO(benvanik): find a better way, this sequence is terrible. @@ -4322,7 +4321,7 @@ EMITTER(EXTRACT_I32, MATCH(I, V128<>, I8<>>)) { e.mov(e.rdx, reinterpret_cast(extract_table_32)); e.vmovaps(e.xmm0, e.ptr[e.rdx + e.rax]); e.vpshufb(e.xmm0, i.src1, e.xmm0); - e.vpextrd(i.dest.reg().cvt32(), e.xmm0, 0); + e.vpextrd(i.dest, e.xmm0, 0); e.ReloadEDX(); } } From b0034f3b4d2f75cfbefdb363f2410ad3774f5269 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 27 May 2014 15:15:52 -0700 Subject: [PATCH 108/184] Fix cntlz. 
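Constant folding of cntlz switched on the destination's type, which is always the (already zeroed) int8 result, so every width folded through the 8-bit path; pass the source Value and switch on its type instead. Worked check for the 8-bit case (sketch; __lzcnt16 is the MSVC intrinsic value.cc already uses):

    #include <cstdint>
    #include <intrin.h>

    int8_t CountLeadingZeros8(uint8_t v) {
      // __lzcnt16 counts across 16 bits, so an 8-bit input carries 8
      // extra leading zeros: v = 0x10 -> __lzcnt16(0x0010) == 11, and
      // 11 - 8 == 3, the correct count for the 8-bit value.
      return static_cast<int8_t>(__lzcnt16(v) - 8);
    }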
--- .../compiler/passes/constant_propagation_pass.cc | 2 +- src/alloy/hir/value.cc | 12 ++++++------ src/alloy/hir/value.h | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/alloy/compiler/passes/constant_propagation_pass.cc b/src/alloy/compiler/passes/constant_propagation_pass.cc index a481d18af..5804ed218 100644 --- a/src/alloy/compiler/passes/constant_propagation_pass.cc +++ b/src/alloy/compiler/passes/constant_propagation_pass.cc @@ -371,7 +371,7 @@ int ConstantPropagationPass::Run(HIRBuilder* builder) { case OPCODE_CNTLZ: if (i->src1.value->IsConstant()) { v->set_zero(v->type); - v->CountLeadingZeros(i->src1.value->constant); + v->CountLeadingZeros(i->src1.value); i->Remove(); } break; diff --git a/src/alloy/hir/value.cc b/src/alloy/hir/value.cc index f70d6ceb2..10fc62cad 100644 --- a/src/alloy/hir/value.cc +++ b/src/alloy/hir/value.cc @@ -560,19 +560,19 @@ void Value::ByteSwap() { } } -void Value::CountLeadingZeros(const ConstantValue& src) { - switch (type) { +void Value::CountLeadingZeros(const Value* other) { + switch (other->type) { case INT8_TYPE: - constant.i8 = __lzcnt16(src.i8) - 8; + constant.i8 = static_cast<int8_t>(__lzcnt16(other->constant.i8) - 8); break; case INT16_TYPE: - constant.i8 = __lzcnt16(src.i16); + constant.i8 = static_cast<int8_t>(__lzcnt16(other->constant.i16)); break; case INT32_TYPE: - constant.i8 = __lzcnt(src.i32); + constant.i8 = static_cast<int8_t>(__lzcnt(other->constant.i32)); break; case INT64_TYPE: - constant.i8 = __lzcnt64(src.i64); + constant.i8 = static_cast<int8_t>(__lzcnt64(other->constant.i64)); break; default: XEASSERTALWAYS(); diff --git a/src/alloy/hir/value.h b/src/alloy/hir/value.h index e3af4906f..9a1f668f5 100644 --- a/src/alloy/hir/value.h +++ b/src/alloy/hir/value.h @@ -393,7 +393,7 @@ public: void Shr(Value* other); void Sha(Value* other); void ByteSwap(); - void CountLeadingZeros(const ConstantValue& src); + void CountLeadingZeros(const Value* other); bool Compare(Opcode opcode, Value* other); }; From edf282abdceebfbce74cd04e0bb16e4dffaa3eb1 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 27 May 2014 15:26:16 -0700 Subject: [PATCH 109/184] Disabling logging. --- src/alloy/backend/ivm/ivm_intcode.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/alloy/backend/ivm/ivm_intcode.cc b/src/alloy/backend/ivm/ivm_intcode.cc index 6001cb15b..59646c067 100644 --- a/src/alloy/backend/ivm/ivm_intcode.cc +++ b/src/alloy/backend/ivm/ivm_intcode.cc @@ -40,10 +40,10 @@ namespace ivm { #define DPRINT #define DFLUSH() -#define IPRINT if (ics.thread_state->thread_id() == 1) printf -#define IFLUSH() fflush(stdout) -#define DPRINT if (ics.thread_state->thread_id() == 1) printf -#define DFLUSH() fflush(stdout) +//#define IPRINT if (ics.thread_state->thread_id() == 1) printf +//#define IFLUSH() fflush(stdout) +//#define DPRINT if (ics.thread_state->thread_id() == 1) printf +//#define DFLUSH() fflush(stdout) #if XE_CPU_BIGENDIAN #define VECB16(v,n) (v.b16[n]) From 192941eeb04d7dfd2bcaa1e745ff86bd71975835 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 27 May 2014 16:23:46 -0700 Subject: [PATCH 110/184] PACK D3DCOLOR. 
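Scalar reference for the new pack path, mirroring the comment in the emitter (sketch; note the emitted vcvttps2dq truncates after the multiply rather than rounding):

    #include <algorithm>
    #include <cstdint>

    // f2i(clamp(src, 0, 1) * 255), then swizzle RGBA -> ARGB.
    uint32_t PackD3DCOLOR(float r, float g, float b, float a) {
      auto sat = [](float v) {
        return static_cast<uint32_t>(
            std::min(std::max(v, 0.0f), 1.0f) * 255.0f);
      };
      return (sat(a) << 24) | (sat(r) << 16) | (sat(g) << 8) | sat(b);
    }

The closing vpshufb against XMMPackD3DCOLOR gathers one byte from each converted dword; XMMUnpackD3DCOLOR's last selector also changes from byte 2 to byte 3 so the unpack reads the fourth byte instead of duplicating the third.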
--- src/alloy/backend/x64/x64_emitter.cc | 8 +++++--- src/alloy/backend/x64/x64_emitter.h | 12 +++++++----- src/alloy/backend/x64/x64_sequences.cc | 22 +++++++++++++++++++++- 3 files changed, 33 insertions(+), 9 deletions(-) diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index 684fcaa86..3ee6ab59c 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -431,8 +431,8 @@ Address X64Emitter::GetXmmConstPtr(XmmConst id) { /* XMMZero */ vec128f(0.0f, 0.0f, 0.0f, 0.0f), /* XMMOne */ vec128f(1.0f, 1.0f, 1.0f, 1.0f), /* XMMNegativeOne */ vec128f(-1.0f, -1.0f, -1.0f, -1.0f), - /* XMMMaskX16Y16 */ vec128i(0x0000FFFF, 0xFFFF0000, 0x00000000, 0x00000000), - /* XMMFlipX16Y16 */ vec128i(0x00008000, 0x00000000, 0x00000000, 0x00000000), + /* XMMMaskX16Y16 */ vec128i(0x0000FFFFu, 0xFFFF0000u, 0x00000000u, 0x00000000u), + /* XMMFlipX16Y16 */ vec128i(0x00008000u, 0x00000000u, 0x00000000u, 0x00000000u), /* XMMFixX16Y16 */ vec128f(-32768.0f, 0.0f, 0.0f, 0.0f), /* XMMNormalizeX16Y16 */ vec128f(1.0f / 32767.0f, 1.0f / (32767.0f * 65536.0f), 0.0f, 0.0f), /* XMM3301 */ vec128f(3.0f, 3.0f, 0.0f, 1.0f), @@ -440,11 +440,13 @@ Address X64Emitter::GetXmmConstPtr(XmmConst id) { /* XMMSignMaskPD */ vec128i(0x00000000u, 0x80000000u, 0x00000000u, 0x80000000u), /* XMMByteSwapMask */ vec128i(0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu), /* XMMPermuteControl15 */ vec128b(15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15), - /* XMMUnpackD3DCOLOR */ vec128i(0xFFFFFF02, 0xFFFFFF01, 0xFFFFFF00, 0xFFFFFF02), + /* XMMPackD3DCOLOR */ vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0x0C000408u), + /* XMMUnpackD3DCOLOR */ vec128i(0xFFFFFF02u, 0xFFFFFF01u, 0xFFFFFF00u, 0xFFFFFF03u), /* XMMOneOver255 */ vec128f(1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f), /* XMMShiftMaskPS */ vec128i(0x0000001Fu, 0x0000001Fu, 0x0000001Fu, 0x0000001Fu), /* XMMShiftByteMask */ vec128i(0x000000FFu, 0x000000FFu, 0x000000FFu, 0x000000FFu), /* XMMUnsignedDwordMax */ vec128i(0xFFFFFFFFu, 0x00000000u, 0xFFFFFFFFu, 0x00000000u), + /* XMM255 */ vec128f(255.0f, 255.0f, 255.0f, 255.0f), }; // TODO(benvanik): cache base pointer somewhere? stack? It'd be nice to // prevent this move. diff --git a/src/alloy/backend/x64/x64_emitter.h b/src/alloy/backend/x64/x64_emitter.h index 3ac92be3f..4b05e5134 100644 --- a/src/alloy/backend/x64/x64_emitter.h +++ b/src/alloy/backend/x64/x64_emitter.h @@ -48,11 +48,13 @@ enum XmmConst { XMMSignMaskPD = 9, XMMByteSwapMask = 10, XMMPermuteControl15 = 11, - XMMUnpackD3DCOLOR = 12, - XMMOneOver255 = 13, - XMMShiftMaskPS = 14, - XMMShiftByteMask = 15, - XMMUnsignedDwordMax = 16, + XMMPackD3DCOLOR = 12, + XMMUnpackD3DCOLOR = 13, + XMMOneOver255 = 14, + XMMShiftMaskPS = 15, + XMMShiftByteMask = 16, + XMMUnsignedDwordMax = 17, + XMM255 = 18, }; // Unfortunately due to the design of xbyak we have to pass this to the ctor. diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index d60404aa1..3c5da0755 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -4582,7 +4582,27 @@ EMITTER(PACK, MATCH(I, V128<>>)) { } } static void EmitD3DCOLOR(X64Emitter& e, const EmitArgType& i) { - XEASSERTALWAYS(); + // RGBA (XYZW) -> ARGB (WXYZ) + // float r = roundf(((src1.x < 0) ? 0 : ((1 < src1.x) ? 1 : src1.x)) * 255); + // float g = roundf(((src1.y < 0) ? 0 : ((1 < src1.y) ? 1 : src1.y)) * 255); + // float b = roundf(((src1.z < 0) ? 0 : ((1 < src1.z) ? 
1 : src1.z)) * 255); + // float a = roundf(((src1.w < 0) ? 0 : ((1 < src1.w) ? 1 : src1.w)) * 255); + // dest.iw = ((uint32_t)a << 24) | + // ((uint32_t)r << 16) | + // ((uint32_t)g << 8) | + // ((uint32_t)b); + // f2i(clamp(src, 0, 1) * 255) + e.vpxor(e.xmm0, e.xmm0); + if (i.src1.is_constant) { + e.LoadConstantXmm(e.xmm1, i.src1.constant()); + e.vmaxps(e.xmm0, e.xmm1); + } else { + e.vmaxps(e.xmm0, i.src1); + } + e.vminps(e.xmm0, e.GetXmmConstPtr(XMMOne)); + e.vmulps(e.xmm0, e.GetXmmConstPtr(XMM255)); + e.vcvttps2dq(e.xmm0, e.xmm0); + e.vpshufb(i.dest, e.xmm0, e.GetXmmConstPtr(XMMPackD3DCOLOR)); } static void EmitFLOAT16_2(X64Emitter& e, const EmitArgType& i) { XEASSERTALWAYS(); From f289f90435a7319ab07beb0e4c690c413046d711 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 27 May 2014 16:34:47 -0700 Subject: [PATCH 111/184] Needs AVX2. --- README.md | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 590d2ee63..e51170837 100644 --- a/README.md +++ b/README.md @@ -84,11 +84,9 @@ Come on people. Jeez. ### What kind of machine do I need to run this? -You'll need 64-bit Windows 7 with a processor supporting at least AVX1. -It's only tested on Windows 8 and that may become a requirement as several of -the APIs exposed there are beneficial to emulation. In general if you have to -ask if your machine is good enough to run games at a decent speed the answer is -no. +You'll need 64-bit Windows 8 with a processor supporting at least AVX2 - in +other words, a Haswell. In general if you have to ask if your machine is good +enough to run games at a decent speed the answer is no. ### What about Linux/OSX? From ca22010502718387f4a796400b41fb5fee017ecf Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 27 May 2014 19:33:16 -0700 Subject: [PATCH 112/184] Adding memory snooping for mmio accesses. Yuck. 
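Guest accesses whose top address byte is 0x7F must be routed through the runtime's registered access callbacks rather than touching memory directly. What the emitted guard around each 32-bit access amounts to, in C form (sketch; the real sequence also byte-swaps around the callback, since callbacks deal in host-endian values):

    uint32_t GuardedLoad32(void* raw_context, const uint8_t* membase,
                           uint32_t address) {
      if ((address >> 24) == 0x7F) {
        // Slow path: walk the access-callback chain.
        return static_cast<uint32_t>(
            DynamicRegisterLoad(raw_context, address));
      }
      // Fast path: plain guest memory read.
      return *reinterpret_cast<const uint32_t*>(membase + address);
    }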
--- src/alloy/backend/x64/x64_sequences.cc | 88 ++++++++++++++++++++++++-- 1 file changed, 82 insertions(+), 6 deletions(-) diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index 3c5da0755..17502d137 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -1498,6 +1498,86 @@ RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) { return e.rdx + e.rax; } } +uint64_t DynamicRegisterLoad(void* raw_context, uint32_t address) { + auto thread_state = *((ThreadState**)raw_context); + auto cbs = thread_state->runtime()->access_callbacks(); + while (cbs) { + if (cbs->handles(cbs->context, address)) { + return cbs->read(cbs->context, address); + } + cbs = cbs->next; + } + return 0; +} +void DynamicRegisterStore(void* raw_context, uint32_t address, uint64_t value) { + auto thread_state = *((ThreadState**)raw_context); + auto cbs = thread_state->runtime()->access_callbacks(); + while (cbs) { + if (cbs->handles(cbs->context, address)) { + cbs->write(cbs->context, address, value); + return; + } + cbs = cbs->next; + } +} +template <typename DEST_REG> +void EmitLoadCheck(X64Emitter& e, const RegExp& addr, DEST_REG& dest) { + // rax = reserved + // if (address >> 24 == 0x7F) call register load handler; + e.lea(e.r8d, e.ptr[addr]); + e.shr(e.r8d, 24); + e.cmp(e.r8b, 0x7F); + e.inLocalLabel(); + Xbyak::Label normal_addr; + Xbyak::Label skip_load; + e.jne(normal_addr); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(DynamicRegisterLoad); + if (DEST_REG::key_type == KEY_TYPE_V_I32) { + e.bswap(e.eax); + e.mov(dest, e.eax); + } + e.jmp(skip_load); + e.L(normal_addr); + if (DEST_REG::key_type == KEY_TYPE_V_I32) { + e.mov(dest, e.dword[addr]); + } + e.L(skip_load); + e.outLocalLabel(); +} +template <typename SRC_REG> +void EmitStoreCheck(X64Emitter& e, const RegExp& addr, SRC_REG& src) { + // rax = reserved + // if (address >> 24 == 0x7F) call register store handler; + e.lea(e.r8d, e.ptr[addr]); + e.shr(e.r8d, 24); + e.cmp(e.r8b, 0x7F); + e.inLocalLabel(); + Xbyak::Label normal_addr; + Xbyak::Label skip_load; + e.jne(normal_addr); + e.lea(e.rdx, e.ptr[addr]); + if (SRC_REG::key_type == KEY_TYPE_V_I32) { + if (src.is_constant) { + e.mov(e.r8d, XESWAP32(src.constant())); + } else { + e.mov(e.r8d, src); + e.bswap(e.r8d); + } + } + e.CallNative(DynamicRegisterStore); + e.jmp(skip_load); + e.L(normal_addr); + if (SRC_REG::key_type == KEY_TYPE_V_I32) { + if (src.is_constant) { + e.mov(e.dword[addr], src.constant()); + } else { + e.mov(e.dword[addr], src); + } + } + e.L(skip_load); + e.outLocalLabel(); +} EMITTER(LOAD_I8, MATCH(I, I64<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { if (CheckLoadAccessCallback(e, i)) { @@ -1532,7 +1612,7 @@ EMITTER(LOAD_I32, MATCH(I, I64<>>)) { return; } auto addr = ComputeMemoryAddress(e, i.src1); - e.mov(i.dest, e.dword[addr]); + EmitLoadCheck(e, addr, i.dest); if (IsTracingData()) { e.mov(e.r8, i.dest); e.lea(e.rdx, e.ptr[addr]); @@ -1685,11 +1765,7 @@ EMITTER(STORE_I32, MATCH(I, I32<>>)) { return; } auto addr = ComputeMemoryAddress(e, i.src1); - if (i.src2.is_constant) { - e.mov(e.dword[addr], i.src2.constant()); - } else { - e.mov(e.dword[addr], i.src2); - } + EmitStoreCheck(e, addr, i.src2); if (IsTracingData()) { e.mov(e.r8, e.dword[addr]); e.lea(e.rdx, e.ptr[addr]); From 2ef6545927731292d95b48a65dbee8d520e0275a Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 27 May 2014 19:35:52 -0700 Subject: [PATCH 113/184] Trap sometimes called. 
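Games reach OPCODE_TRAP through the kernel's debug-print hook, so a bare int3 kills the process when no debugger is attached. A real handler would look roughly like this (hypothetical sketch, not implemented by this change; context/membase plumbing assumed):

    void HandleDebugPrintTrap(PPCContext* ppc_state, const uint8_t* membase) {
      // By convention r3 = guest buffer pointer, r4 = length.
      uint32_t buffer_ptr = static_cast<uint32_t>(ppc_state->r[3]);
      int length = static_cast<int>(ppc_state->r[4]);
      XELOGD("guest debug print: %.*s", length,
             reinterpret_cast<const char*>(membase + buffer_ptr));
    }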
--- src/alloy/backend/x64/x64_emitter.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index 3ee6ab59c..58ac912d9 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -218,7 +218,8 @@ void X64Emitter::DebugBreak() { } void X64Emitter::Trap() { - // TODO(benvanik): notify debugger. + // 0x0FE00014 is a 'debug print' where r3 = buffer r4 = length + // TODO(benvanik): post software interrupt to debugger. db(0xCC); } From f3f9d93017d059d906ad1057bc1bba3973a9cdcc Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 27 May 2014 19:38:47 -0700 Subject: [PATCH 114/184] Adding microprofile to third_party. --- third_party/microprofile/README.md | 1 + third_party/microprofile/microprofile.h | 4075 +++++++++++++++++++++++ 2 files changed, 4076 insertions(+) create mode 100644 third_party/microprofile/README.md create mode 100644 third_party/microprofile/microprofile.h diff --git a/third_party/microprofile/README.md b/third_party/microprofile/README.md new file mode 100644 index 000000000..8b8040b20 --- /dev/null +++ b/third_party/microprofile/README.md @@ -0,0 +1 @@ +https://bitbucket.org/jonasmeyer/microprofile diff --git a/third_party/microprofile/microprofile.h b/third_party/microprofile/microprofile.h new file mode 100644 index 000000000..f6b747d56 --- /dev/null +++ b/third_party/microprofile/microprofile.h @@ -0,0 +1,4075 @@ +#pragma once +// This is free and unencumbered software released into the public domain. +// Anyone is free to copy, modify, publish, use, compile, sell, or +// distribute this software, either in source code form or as a compiled +// binary, for any purpose, commercial or non-commercial, and by any +// means. +// In jurisdictions that recognize copyright laws, the author or authors +// of this software dedicate any and all copyright interest in the +// software to the public domain. We make this dedication for the benefit +// of the public at large and to the detriment of our heirs and +// successors. We intend this dedication to be an overt act of +// relinquishment in perpetuity of all present and future rights to this +// software under copyright law. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +// IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
+// For more information, please refer to +// +// *********************************************************************** +// +// +// +// +// Howto: +// Call these functions from your code: +// MicroProfileOnThreadCreate +// MicroProfileMouseButton +// MicroProfileMousePosition +// MicroProfileModKey +// MicroProfileFlip <-- Call this once per frame +// MicroProfileDraw <-- Call this once per frame +// MicroProfileToggleDisplayMode <-- Bind to a key to toggle profiling +// MicroProfileTogglePause <-- Bind to a key to toggle pause +// +// Use these macros in your code in blocks you want to time: +// +// MICROPROFILE_DECLARE +// MICROPROFILE_DEFINE +// MICROPROFILE_DECLARE_GPU +// MICROPROFILE_DEFINE_GPU +// MICROPROFILE_SCOPE +// MICROPROFILE_SCOPEI +// MICROPROFILE_SCOPEGPU +// MICROPROFILE_SCOPEGPUI +// MICROPROFILE_META +// +// +// Usage: +// +// { +// MICROPROFILE_SCOPEI("GroupName", "TimerName", nColorRgb): +// ..Code to be timed.. +// } +// +// MICROPROFILE_DECLARE / MICROPROFILE_DEFINE allows defining groups in a shared place, to ensure sorting of the timers +// +// (in global scope) +// MICROPROFILE_DEFINE(g_ProfileFisk, "Fisk", "Skalle", nSomeColorRgb); +// +// (in some other file) +// MICROPROFILE_DECLARE(g_ProfileFisk); +// +// void foo(){ +// MICROPROFILE_SCOPE(g_ProfileFisk); +// } +// +// Once code is instrumented the gui is activeted by calling MicroProfileToggleDisplayMode or by clicking in the upper left corner of +// the screen +// +// The following functions must be implemented before the profiler is usable +// debug render: +// void MicroProfileDrawText(int nX, int nY, uint32_t nColor, const char* pText, uint32_t nNumCharacters); +// void MicroProfileDrawBox(int nX, int nY, int nX1, int nY1, uint32_t nColor, MicroProfileBoxType = MicroProfileBoxTypeFlat); +// void MicroProfileDrawLine2D(uint32_t nVertices, float* pVertices, uint32_t nColor); +// Gpu time stamps: +// uint32_t MicroProfileGpuInsertTimeStamp(); +// uint64_t MicroProfileGpuGetTimeStamp(uint32_t nKey); +// uint64_t MicroProfileTicksPerSecondGpu(); +// threading: +// const char* MicroProfileGetThreadName(); Threadnames in detailed view + + +#ifndef MICROPROFILE_ENABLED +#define MICROPROFILE_ENABLED 1 +#endif + +#if 0 == MICROPROFILE_ENABLED + +#define MICROPROFILE_DECLARE(var) +#define MICROPROFILE_DEFINE(var, group, name, color) +#define MICROPROFILE_DECLARE_GPU(var) +#define MICROPROFILE_DEFINE_GPU(var, group, name, color) +#define MICROPROFILE_SCOPE(var) do{}while(0) +#define MICROPROFILE_SCOPEI(group, name, color) do{}while(0) +#define MICROPROFILE_SCOPEGPU(var) do{}while(0) +#define MICROPROFILE_SCOPEGPUI(group, name, color) do{}while(0) +#define MICROPROFILE_META(name, count) +#define MICROPROFILE_FORCEENABLECPUGROUP(s) do{} while(0) +#define MICROPROFILE_FORCEDISABLECPUGROUP(s) do{} while(0) +#define MICROPROFILE_FORCEENABLEGPUGROUP(s) do{} while(0) +#define MICROPROFILE_FORCEDISABLEGPUGROUP(s) do{} while(0) + +#define MicroProfileGetTime(group, name) 0.f +#define MicroProfileOnThreadCreate(foo) do{}while(0) +#define MicroProfileMouseButton(foo, bar) do{}while(0) +#define MicroProfileMousePosition(foo, bar) do{}while(0) +#define MicroProfileModKey(key) do{}while(0) +#define MicroProfileFlip() do{}while(0) +#define MicroProfileDraw(foo, bar) do{}while(0) +#define MicroProfileIsDrawing() 0 +#define MicroProfileToggleDisplayMode() do{}while(0) +#define MicroProfileSetDisplayMode() do{}while(0) +#define MicroProfileTogglePause() do{}while(0) +#define MicroProfileDumpTimers() do{}while(0) + +#else + 
+#include +#include + +#if defined(__APPLE__) +#include +#include +#include +#include +#include +#if TARGET_OS_IPHONE +#define MICROPROFILE_IOS +#endif + +#define MP_TICK() mach_absolute_time() +inline int64_t MicroProfileTicksPerSecondCpu() +{ + static int64_t nTicksPerSecond = 0; + if(nTicksPerSecond == 0) + { + mach_timebase_info_data_t sTimebaseInfo; + mach_timebase_info(&sTimebaseInfo); + nTicksPerSecond = 1000000000ll * sTimebaseInfo.denom / sTimebaseInfo.numer; + } + return nTicksPerSecond; +} + +#define MP_BREAK() __builtin_trap() +#define MP_THREAD_LOCAL __thread +#define MP_STRCASECMP strcasecmp +#define MP_GETCURRENTTHREADID() (uint64_t)pthread_self() +typedef uint64_t ThreadIdType; + +#elif defined(_WIN32) +int64_t MicroProfileTicksPerSecondCpu(); +int64_t MicroProfileGetTick(); +#define MP_TICK() MicroProfileGetTick() +#define MP_BREAK() __debugbreak() +#define MP_THREAD_LOCAL __declspec(thread) +#define MP_STRCASECMP _stricmp +#define MP_GETCURRENTTHREADID() GetCurrentThreadId() +typedef uint32_t ThreadIdType; + +#elif defined(__linux__) +#include +#include +inline int64_t MicroProfileTicksPerSecondCpu() +{ + return 1000000000ll; +} + +inline int64_t MicroProfileGetTick() +{ + timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + return 1000000000ll * ts.tv_sec + ts.tv_nsec; +} +#define MP_TICK() MicroProfileGetTick() +#define MP_BREAK() __builtin_trap() +#define MP_THREAD_LOCAL __thread +#define MP_STRCASECMP strcasecmp +#define MP_GETCURRENTTHREADID() (uint64_t)pthread_self() +typedef uint64_t ThreadIdType; +#endif + +#ifndef MP_GETCURRENTTHREADID +#define MP_GETCURRENTTHREADID() 0 +typedef uint32_t ThreadIdType; +#endif + +#ifndef MICROPROFILE_API +#define MICROPROFILE_API +#endif + +#define MP_ASSERT(a) do{if(!(a)){MP_BREAK();} }while(0) +#define MICROPROFILE_DECLARE(var) extern MicroProfileToken g_mp_##var +#define MICROPROFILE_DEFINE(var, group, name, color) MicroProfileToken g_mp_##var = MicroProfileGetToken(group, name, color, MicroProfileTokenTypeCpu) +#define MICROPROFILE_DECLARE_GPU(var) extern MicroProfileToken g_mp_##var +#define MICROPROFILE_DEFINE_GPU(var, group, name, color) MicroProfileToken g_mp_##var = MicroProfileGetToken(group, name, color, MicroProfileTokenTypeGpu) +#define MICROPROFILE_TOKEN_PASTE0(a, b) a ## b +#define MICROPROFILE_TOKEN_PASTE(a, b) MICROPROFILE_TOKEN_PASTE0(a,b) +#define MICROPROFILE_SCOPE(var) MicroProfileScopeHandler MICROPROFILE_TOKEN_PASTE(foo, __LINE__)(g_mp_##var) +#define MICROPROFILE_SCOPEI(group, name, color) static MicroProfileToken MICROPROFILE_TOKEN_PASTE(g_mp,__LINE__) = MicroProfileGetToken(group, name, color, MicroProfileTokenTypeCpu); MicroProfileScopeHandler MICROPROFILE_TOKEN_PASTE(foo,__LINE__)( MICROPROFILE_TOKEN_PASTE(g_mp,__LINE__)) +#define MICROPROFILE_SCOPEGPU(var) MicroProfileScopeGpuHandler MICROPROFILE_TOKEN_PASTE(foo, __LINE__)(g_mp_##var) +#define MICROPROFILE_SCOPEGPUI(group, name, color) static MicroProfileToken MICROPROFILE_TOKEN_PASTE(g_mp,__LINE__) = MicroProfileGetToken(group, name, color, MicroProfileTokenTypeGpu); MicroProfileScopeGpuHandler MICROPROFILE_TOKEN_PASTE(foo,__LINE__)( MICROPROFILE_TOKEN_PASTE(g_mp,__LINE__)) +#define MICROPROFILE_META_CPU(name, count) static MicroProfileToken MICROPROFILE_TOKEN_PASTE(g_mp_meta,__LINE__) = MicroProfileGetMetaToken(name); MicroProfileMetaUpdate(MICROPROFILE_TOKEN_PASTE(g_mp_meta,__LINE__), count, MicroProfileTokenTypeCpu) +#define MICROPROFILE_META_GPU(name, count) static MicroProfileToken MICROPROFILE_TOKEN_PASTE(g_mp_meta,__LINE__) = 
MicroProfileGetMetaToken(name); MicroProfileMetaUpdate(MICROPROFILE_TOKEN_PASTE(g_mp_meta,__LINE__), count, MicroProfileTokenTypeGpu) + +///configuration + +#ifndef MICROPROFILE_TEXT_WIDTH +#define MICROPROFILE_TEXT_WIDTH 5 +#endif + +#ifndef MICROPROFILE_TEXT_HEIGHT +#define MICROPROFILE_TEXT_HEIGHT 8 +#endif + +#ifndef MICROPROFILE_DETAILED_BAR_HEIGHT +#define MICROPROFILE_DETAILED_BAR_HEIGHT 12 +#endif + +#ifndef MICROPROFILE_DETAILED_CONTEXT_SWITCH_HEIGHT +#define MICROPROFILE_DETAILED_CONTEXT_SWITCH_HEIGHT 7 +#endif + +#ifndef MICROPROFILE_GRAPH_WIDTH +#define MICROPROFILE_GRAPH_WIDTH 256 +#endif + +#ifndef MICROPROFILE_GRAPH_HEIGHT +#define MICROPROFILE_GRAPH_HEIGHT 256 +#endif + +#ifndef MICROPROFILE_BORDER_SIZE +#define MICROPROFILE_BORDER_SIZE 1 +#endif + +#ifndef MICROPROFILE_USE_THREAD_NAME_CALLBACK +#define MICROPROFILE_USE_THREAD_NAME_CALLBACK 0 +#endif + +#ifndef MICROPROFILE_DRAWCURSOR +#define MICROPROFILE_DRAWCURSOR 0 +#endif + +#ifndef MICROPROFILE_DETAILED_BAR_NAMES +#define MICROPROFILE_DETAILED_BAR_NAMES 1 +#endif + +#ifndef MICROPROFILE_GPU_FRAME_DELAY +#define MICROPROFILE_GPU_FRAME_DELAY 3 //must be > 0 +#endif + +#ifndef MICROPROFILE_PER_THREAD_BUFFER_SIZE +#define MICROPROFILE_PER_THREAD_BUFFER_SIZE (2048<<10) +#endif + +#ifndef MICROPROFILE_HELP_LEFT +#define MICROPROFILE_HELP_LEFT "Left-Click" +#endif + +#ifndef MICROPROFILE_HELP_ALT +#define MICROPROFILE_HELP_ALT "Alt-Click" +#endif + +#ifndef MICROPROFILE_HELP_MOD +#define MICROPROFILE_HELP_MOD "Mod" +#endif + +#ifndef MICROPROFILE_PRINTF +#define MICROPROFILE_PRINTF printf +#endif + +#ifndef MICROPROFILE_META_MAX +#define MICROPROFILE_META_MAX 8 +#endif + + + +#define MICROPROFILE_FORCEENABLECPUGROUP(s) MicroProfileForceEnableGroup(s, MicroProfileTokenTypeCpu) +#define MICROPROFILE_FORCEDISABLECPUGROUP(s) MicroProfileForceDisableGroup(s, MicroProfileTokenTypeCpu) +#define MICROPROFILE_FORCEENABLEGPUGROUP(s) MicroProfileForceEnableGroup(s, MicroProfileTokenTypeGpu) +#define MICROPROFILE_FORCEDISABLEGPUGROUP(s) MicroProfileForceDisableGroup(s, MicroProfileTokenTypeGpu) + +#define MICROPROFILE_INVALID_TICK ((uint64_t)-1) +#define MICROPROFILE_GROUP_MASK_ALL 0xffffffffffff + + +typedef uint64_t MicroProfileToken; +typedef uint16_t MicroProfileGroupId; + +#define MICROPROFILE_INVALID_TOKEN (uint64_t)-1 + +enum MicroProfileTokenType +{ + MicroProfileTokenTypeCpu, + MicroProfileTokenTypeGpu, +}; +enum MicroProfileBoxType +{ + MicroProfileBoxTypeBar, + MicroProfileBoxTypeFlat, +}; + +struct MicroProfileState +{ + uint32_t nDisplay; + uint32_t nMenuAllGroups; + uint64_t nMenuActiveGroup; + uint32_t nMenuAllThreads; + uint32_t nAggregateFlip; + uint32_t nBars; + float fReferenceTime; +}; + + +MICROPROFILE_API void MicroProfileInit(); +MICROPROFILE_API void MicroProfileShutdown(); +MICROPROFILE_API MicroProfileToken MicroProfileFindToken(const char* sGroup, const char* sName); +MICROPROFILE_API MicroProfileToken MicroProfileGetToken(const char* sGroup, const char* sName, uint32_t nColor, MicroProfileTokenType Token = MicroProfileTokenTypeCpu); +MICROPROFILE_API MicroProfileToken MicroProfileGetMetaToken(const char* pName); +MICROPROFILE_API void MicroProfileMetaUpdate(MicroProfileToken, int nCount, MicroProfileTokenType eTokenType); +MICROPROFILE_API uint64_t MicroProfileEnter(MicroProfileToken nToken); +MICROPROFILE_API void MicroProfileLeave(MicroProfileToken nToken, uint64_t nTick); +MICROPROFILE_API uint64_t MicroProfileGpuEnter(MicroProfileToken nToken); +MICROPROFILE_API void MicroProfileGpuLeave(MicroProfileToken 
nToken, uint64_t nTick);
+inline uint16_t MicroProfileGetTimerIndex(MicroProfileToken t){ return (t&0xffff); }
+inline uint64_t MicroProfileGetGroupMask(MicroProfileToken t){ return ((t>>16)&MICROPROFILE_GROUP_MASK_ALL);}
+inline MicroProfileToken MicroProfileMakeToken(uint64_t nGroupMask, uint16_t nTimer){ return (nGroupMask<<16) | nTimer;}
+
+MICROPROFILE_API void MicroProfileFlip(); //! call once per frame.
+MICROPROFILE_API void MicroProfileDraw(uint32_t nWidth, uint32_t nHeight); //! call when drawing the profiler UI
+MICROPROFILE_API bool MicroProfileIsDrawing();
+MICROPROFILE_API void MicroProfileToggleGraph(MicroProfileToken nToken);
+MICROPROFILE_API bool MicroProfileDrawGraph(uint32_t nScreenWidth, uint32_t nScreenHeight);
+MICROPROFILE_API void MicroProfileSetAggregateCount(uint32_t nCount); //! set no. of frames to aggregate over; 0 for infinite
+MICROPROFILE_API void MicroProfileToggleDisplayMode(); //switch between off, bars, detailed
+MICROPROFILE_API void MicroProfileSetDisplayMode(int); //switch between off, bars, detailed
+MICROPROFILE_API void MicroProfileClearGraph();
+MICROPROFILE_API void MicroProfileTogglePause();
+MICROPROFILE_API void MicroProfileGetState(MicroProfileState* pStateOut);
+MICROPROFILE_API void MicroProfileSetState(MicroProfileState* pStateIn);
+MICROPROFILE_API void MicroProfileForceEnableGroup(const char* pGroup, MicroProfileTokenType Type);
+MICROPROFILE_API void MicroProfileForceDisableGroup(const char* pGroup, MicroProfileTokenType Type);
+MICROPROFILE_API float MicroProfileGetTime(const char* pGroup, const char* pName);
+MICROPROFILE_API void MicroProfileMousePosition(uint32_t nX, uint32_t nY, int nWheelDelta);
+MICROPROFILE_API void MicroProfileModKey(uint32_t nKeyState);
+MICROPROFILE_API void MicroProfileMouseButton(uint32_t nLeft, uint32_t nRight);
+MICROPROFILE_API void MicroProfileOnThreadCreate(const char* pThreadName); //should be called from newly created threads
+MICROPROFILE_API void MicroProfileOnThreadExit(); //call on exit to allow the log to be reused
+MICROPROFILE_API void MicroProfileInitThreadLog();
+MICROPROFILE_API void MicroProfileDrawLineVertical(int nX, int nTop, int nBottom, uint32_t nColor);
+MICROPROFILE_API void MicroProfileDrawLineHorizontal(int nLeft, int nRight, int nY, uint32_t nColor);
+MICROPROFILE_API void MicroProfileDumpTimers();
+
+
+
+//UNDEFINED: MUST BE IMPLEMENTED ELSEWHERE
+MICROPROFILE_API void MicroProfileDrawText(int nX, int nY, uint32_t nColor, const char* pText, uint32_t nNumCharacters);
+MICROPROFILE_API void MicroProfileDrawBox(int nX, int nY, int nX1, int nY1, uint32_t nColor, MicroProfileBoxType = MicroProfileBoxTypeFlat);
+MICROPROFILE_API void MicroProfileDrawLine2D(uint32_t nVertices, float* pVertices, uint32_t nColor);
+MICROPROFILE_API uint32_t MicroProfileGpuInsertTimeStamp();
+MICROPROFILE_API uint64_t MicroProfileGpuGetTimeStamp(uint32_t nKey);
+MICROPROFILE_API uint64_t MicroProfileTicksPerSecondGpu();
+#if MICROPROFILE_USE_THREAD_NAME_CALLBACK
+MICROPROFILE_API const char* MicroProfileGetThreadName();
+#else
+#define MicroProfileGetThreadName() ""
+#endif
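+
+// Example backend (a sketch, not part of the upstream header): every hook
+// above must resolve at link time. A build without GPU timing or on-screen UI
+// can stub them; returning 1 from MicroProfileTicksPerSecondGpu avoids a
+// divide by zero in MicroProfileTickToMsMultiplier.
+//
+//   void MicroProfileDrawText(int, int, uint32_t, const char*, uint32_t) {}
+//   void MicroProfileDrawBox(int, int, int, int, uint32_t, MicroProfileBoxType) {}
+//   void MicroProfileDrawLine2D(uint32_t, float*, uint32_t) {}
+//   uint32_t MicroProfileGpuInsertTimeStamp() { return 0; }
+//   uint64_t MicroProfileGpuGetTimeStamp(uint32_t) { return 0; }
+//   uint64_t MicroProfileTicksPerSecondGpu() { return 1; }
+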
+struct MicroProfileScopeHandler
+{
+ MicroProfileToken nToken;
+ uint64_t nTick;
+ MicroProfileScopeHandler(MicroProfileToken Token):nToken(Token)
+ {
+ nTick = MicroProfileEnter(nToken);
+ }
+ ~MicroProfileScopeHandler()
+ {
+ MicroProfileLeave(nToken, nTick);
+ }
+};
+
+struct MicroProfileScopeGpuHandler
+{
+ MicroProfileToken nToken;
+ uint64_t nTick;
+ MicroProfileScopeGpuHandler(MicroProfileToken Token):nToken(Token)
+ {
+ nTick = MicroProfileGpuEnter(nToken);
+ }
+ ~MicroProfileScopeGpuHandler()
+ {
+ MicroProfileGpuLeave(nToken, nTick);
+ }
+};
+
+
+
+
+#ifdef MICRO_PROFILE_IMPL
+
+#ifdef _WIN32
+#include <windows.h>
+#define snprintf _snprintf
+
+#pragma warning(push)
+#pragma warning(disable: 4244)
+int64_t MicroProfileTicksPerSecondCpu()
+{
+ static int64_t nTicksPerSecond = 0;
+ if(nTicksPerSecond == 0)
+ {
+ QueryPerformanceFrequency((LARGE_INTEGER*)&nTicksPerSecond);
+ }
+ return nTicksPerSecond;
+}
+int64_t MicroProfileGetTick()
+{
+ int64_t ticks;
+ QueryPerformanceCounter((LARGE_INTEGER*)&ticks);
+ return ticks;
+}
+
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <math.h>
+#include <thread>
+#include <mutex>
+#include <atomic>
+#include <algorithm>
+
+
+#define S g_MicroProfile
+#define MICROPROFILE_MAX_TIMERS 1024
+#define MICROPROFILE_MAX_GROUPS 48 //dont bump! no. of bits used in bitmask
+#define MICROPROFILE_MAX_GRAPHS 5
+#define MICROPROFILE_GRAPH_HISTORY 128
+#define MICROPROFILE_BUFFER_SIZE ((MICROPROFILE_PER_THREAD_BUFFER_SIZE)/sizeof(MicroProfileLogEntry))
+#define MICROPROFILE_MAX_THREADS 32
+#define MICROPROFILE_MAX_CONTEXT_SWITCH_THREADS 256
+#define MICROPROFILE_STACK_MAX 32
+#define MICROPROFILE_MAX_PRESETS 5
+#define MICROPROFILE_DEBUG 0
+#define MICROPROFILE_TOOLTIP_MAX_STRINGS (32 + MICROPROFILE_MAX_GROUPS*2)
+#define MICROPROFILE_TOOLTIP_STRING_BUFFER_SIZE 1024
+#define MICROPROFILE_TOOLTIP_MAX_LOCKED 3
+#define MICROPROFILE_MAX_FRAME_HISTORY 512
+#define MICROPROFILE_ANIM_DELAY_PRC 0.5f
+#define MICROPROFILE_GAP_TIME 50 //extra ms to fetch, to close timers from earlier frames
+
+#ifndef MICROPROFILE_CONTEXT_SWITCH_TRACE
+#ifdef _WIN32
+#define MICROPROFILE_CONTEXT_SWITCH_TRACE 1
+#else
+#define MICROPROFILE_CONTEXT_SWITCH_TRACE 0
+#endif
+#endif
+
+#if MICROPROFILE_CONTEXT_SWITCH_TRACE
+#define MICROPROFILE_CONTEXT_SWITCH_BUFFER_SIZE (128*1024) //2mb with 16 byte entry size
+#else
+#define MICROPROFILE_CONTEXT_SWITCH_BUFFER_SIZE (1)
+#endif
+
+enum MicroProfileDrawMask
+{
+ MP_DRAW_OFF = 0x0,
+ MP_DRAW_BARS = 0x1,
+ MP_DRAW_DETAILED = 0x2,
+ MP_DRAW_HIDDEN = 0x3,
+};
+
+enum MicroProfileDrawBarsMask
+{
+ MP_DRAW_TIMERS = 0x1,
+ MP_DRAW_AVERAGE = 0x2,
+ MP_DRAW_MAX = 0x4,
+ MP_DRAW_CALL_COUNT = 0x8,
+ MP_DRAW_TIMERS_EXCLUSIVE = 0x10,
+ MP_DRAW_AVERAGE_EXCLUSIVE = 0x20,
+ MP_DRAW_MAX_EXCLUSIVE = 0x40,
+ MP_DRAW_META_FIRST = 0x80,
+ MP_DRAW_ALL = 0xffffffff,
+};
+
+struct MicroProfileTimer
+{
+ uint64_t nTicks;
+ uint32_t nCount;
+};
+
+struct MicroProfileGroupInfo
+{
+ const char* pName;
+ uint32_t nNameLen;
+ uint32_t nGroupIndex;
+ uint32_t nNumTimers;
+ uint32_t nMaxTimerNameLen;
+ MicroProfileTokenType Type;
+};
+
+struct MicroProfileTimerInfo
+{
+ MicroProfileToken nToken;
+ uint32_t nTimerIndex;
+ uint32_t nGroupIndex;
+ const char* pName;
+ uint32_t nNameLen;
+ uint32_t nColor;
+};
+
+struct MicroProfileGraphState
+{
+ int64_t nHistory[MICROPROFILE_GRAPH_HISTORY];
+ MicroProfileToken nToken;
+ int32_t nKey;
+};
+
+struct MicroProfileContextSwitch
+{
+ ThreadIdType nThreadOut;
+ ThreadIdType nThreadIn;
+ int64_t nCpu : 8;
+ int64_t nTicks : 56;
+};
+
+
+#define MP_LOG_TICK_MASK  0x0000ffffffffffff
+#define MP_LOG_INDEX_MASK 0x3fff000000000000
+#define MP_LOG_BEGIN_MASK 0xc000000000000000
+#define MP_LOG_META 0x1
+#define MP_LOG_ENTER 0x2
+#define MP_LOG_LEAVE 0x0
+typedef uint64_t MicroProfileLogEntry;
+
+inline int MicroProfileLogType(MicroProfileLogEntry Index)
+{
+ return ((MP_LOG_BEGIN_MASK & Index)>>62) & 0x3;
+}
+
+inline uint64_t MicroProfileLogTimerIndex(MicroProfileLogEntry Index)
+{
+ return (0x3fff&(Index>>48));
+}
+
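+// Log entries pack three fields into 64 bits (see the masks above):
+//  bits 62..63: type (MP_LOG_ENTER / MP_LOG_LEAVE / MP_LOG_META)
+//  bits 48..61: timer index (the low 14 bits of the token)
+//  bits  0..47: tick, or the counter delta for MP_LOG_META entries
+// e.g. MicroProfileMakeLogIndex(MP_LOG_ENTER, 5, 1000), defined below, yields
+// (2ull<<62)|(5ull<<48)|1000.
+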
+inline MicroProfileLogEntry MicroProfileMakeLogIndex(uint64_t nBegin, MicroProfileToken nToken, int64_t nTick)
+{
+ MicroProfileLogEntry Entry = (nBegin<<62) | ((0x3fff&nToken)<<48) | (MP_LOG_TICK_MASK&nTick);
+ int t = MicroProfileLogType(Entry);
+ uint64_t nTimerIndex = MicroProfileLogTimerIndex(Entry);
+ MP_ASSERT(t == nBegin);
+ MP_ASSERT(nTimerIndex == (nToken&0x3fff));
+ return Entry;
+}
+
+inline int64_t MicroProfileLogTickDifference(MicroProfileLogEntry Start, MicroProfileLogEntry End)
+{
+ uint64_t nStart = Start;
+ uint64_t nEnd = End;
+ int64_t nDifference = ((nEnd<<16) - (nStart<<16));
+ return nDifference >> 16;
+}
+
+inline int64_t MicroProfileLogGetTick(MicroProfileLogEntry e)
+{
+ return MP_LOG_TICK_MASK & e;
+}
+
+inline int64_t MicroProfileLogSetTick(MicroProfileLogEntry e, int64_t nTick)
+{
+ return (MP_LOG_TICK_MASK & nTick) | (e & ~MP_LOG_TICK_MASK);
+}
+
+struct MicroProfileFrameState
+{
+ int64_t nFrameStartCpu;
+ int64_t nFrameStartGpu;
+ uint32_t nLogStart[MICROPROFILE_MAX_GROUPS];
+};
+
+struct MicroProfileThreadLog
+{
+ MicroProfileThreadLog* pNext;
+ MicroProfileLogEntry Log[MICROPROFILE_BUFFER_SIZE];
+
+ std::atomic<uint32_t> nPut;
+ std::atomic<uint32_t> nGet;
+ uint32_t nActive;
+ uint32_t nGpu;
+ ThreadIdType nThreadId;
+ enum
+ {
+ THREAD_MAX_LEN = 64,
+ };
+ char ThreadName[64];
+ int nFreeListNext;
+};
+
+struct MicroProfileStringArray
+{
+ const char* ppStrings[MICROPROFILE_TOOLTIP_MAX_STRINGS];
+ char Buffer[MICROPROFILE_TOOLTIP_STRING_BUFFER_SIZE];
+ char* pBufferPos;
+ uint32_t nNumStrings;
+};
+
+
+struct
+{
+ uint32_t nTotalTimers;
+ uint32_t nGroupCount;
+ uint32_t nAggregateFlip;
+ uint32_t nAggregateFlipCount;
+ uint32_t nAggregateFrames;
+
+ uint32_t nDisplay;
+ uint32_t nBars;
+ uint64_t nActiveGroup;
+ uint32_t nActiveBars;
+
+ uint64_t nForceGroup;
+
+ //menu/mouse over stuff
+ uint64_t nMenuActiveGroup;
+ uint32_t nMenuAllGroups;
+ uint32_t nMenuAllThreads;
+ uint64_t nHoverToken;
+ int64_t nHoverTime;
+ int nHoverFrame;
+#if MICROPROFILE_DEBUG
+ uint64_t nHoverAddressEnter;
+ uint64_t nHoverAddressLeave;
+#endif
+ uint32_t nOverflow;
+
+ uint64_t nGroupMask;
+ uint32_t nRunning;
+ uint32_t nMaxGroupSize;
+
+ float fGraphBaseTime; //old kill
+ float fGraphBaseTimePos; //old kill
+ float fReferenceTime;
+ float fRcpReferenceTime;
+ uint32_t nOpacityBackground;
+ uint32_t nOpacityForeground;
+
+ float fDetailedOffset; //display offset relative to start of latest displayable frame.
+ float fDetailedRange; //no. of ms to display
+
+ float fDetailedOffsetTarget;
+ float fDetailedRangeTarget;
+
+ int nOffsetY;
+
+ uint32_t nWidth;
+ uint32_t nHeight;
+
+ uint32_t nBarWidth;
+ uint32_t nBarHeight;
+
+
+ MicroProfileGroupInfo GroupInfo[MICROPROFILE_MAX_GROUPS];
+ MicroProfileTimerInfo TimerInfo[MICROPROFILE_MAX_TIMERS];
+
+ MicroProfileTimer AggregateTimers[MICROPROFILE_MAX_TIMERS];
+ uint64_t MaxTimers[MICROPROFILE_MAX_TIMERS];
+ uint64_t AggregateTimersExclusive[MICROPROFILE_MAX_TIMERS];
+ uint64_t MaxTimersExclusive[MICROPROFILE_MAX_TIMERS];
+
+ MicroProfileTimer Frame[MICROPROFILE_MAX_TIMERS];
+ uint64_t FrameExclusive[MICROPROFILE_MAX_TIMERS];
+
+ MicroProfileTimer Aggregate[MICROPROFILE_MAX_TIMERS];
+ uint64_t AggregateMax[MICROPROFILE_MAX_TIMERS];
+ uint64_t AggregateExclusive[MICROPROFILE_MAX_TIMERS];
+ uint64_t AggregateMaxExclusive[MICROPROFILE_MAX_TIMERS];
+
+ struct
+ {
+ uint64_t nCounters[MICROPROFILE_MAX_TIMERS];
+ const char* pName;
+ } MetaCounters[MICROPROFILE_META_MAX];
+
+ MicroProfileGraphState Graph[MICROPROFILE_MAX_GRAPHS];
+ uint32_t nGraphPut;
+
+ uint32_t nMouseX;
+ uint32_t nMouseY;
+ int nMouseWheelDelta;
+ uint32_t nMouseDownLeft;
+ uint32_t nMouseDownRight;
+ uint32_t nMouseLeft;
+ uint32_t nMouseRight;
+ uint32_t nMouseLeftMod;
+ uint32_t nMouseRightMod;
+ uint32_t nModDown;
+ uint32_t nActiveMenu;
+
+ uint32_t nThreadActive[MICROPROFILE_MAX_THREADS];
+ MicroProfileThreadLog* Pool[MICROPROFILE_MAX_THREADS];
+ uint32_t nNumLogs;
+ uint32_t nMemUsage;
+ int nFreeListHead;
+
+ uint32_t nFrameCurrent;
+ uint32_t nFramePut;
+
+ MicroProfileFrameState Frames[MICROPROFILE_MAX_FRAME_HISTORY];
+
+ MicroProfileLogEntry* pDisplayMouseOver;
+
+
+ uint64_t nFlipTicks;
+ uint64_t nFlipAggregate;
+ uint64_t nFlipMax;
+ uint64_t nFlipAggregateDisplay;
+ uint64_t nFlipMaxDisplay;
+
+
+ MicroProfileStringArray LockedToolTips[MICROPROFILE_TOOLTIP_MAX_LOCKED];
+ uint32_t nLockedToolTipColor[MICROPROFILE_TOOLTIP_MAX_LOCKED];
+ int LockedToolTipFront;
+
+
+ int64_t nRangeBegin;
+ int64_t nRangeEnd;
+ int64_t nRangeBeginGpu;
+ int64_t nRangeEndGpu;
+ uint32_t nRangeBeginIndex;
+ uint32_t nRangeEndIndex;
+ MicroProfileThreadLog* pRangeLog;
+ uint32_t nHoverColor;
+ uint32_t nHoverColorShared;
+
+
+ std::thread* pContextSwitchThread;
+ bool bContextSwitchRunning;
+ bool bContextSwitchStop;
+ bool bContextSwitchAllThreads;
+ bool bContextSwitchNoBars;
+ uint32_t nContextSwitchUsage;
+ uint32_t nContextSwitchLastPut;
+
+ int64_t nContextSwitchHoverTickIn;
+ int64_t nContextSwitchHoverTickOut;
+ uint32_t nContextSwitchHoverThread;
+ uint32_t nContextSwitchHoverThreadBefore;
+ uint32_t nContextSwitchHoverThreadAfter;
+ uint8_t nContextSwitchHoverCpu;
+ uint8_t nContextSwitchHoverCpuNext;
+
+ uint32_t nContextSwitchPut;
+ MicroProfileContextSwitch ContextSwitch[MICROPROFILE_CONTEXT_SWITCH_BUFFER_SIZE];
+
+} g_MicroProfile;
+
+MicroProfileThreadLog* g_MicroProfileGpuLog = 0;
+#ifdef MICROPROFILE_IOS
+// iOS doesn't support __thread
+static pthread_key_t g_MicroProfileThreadLogKey;
+static pthread_once_t g_MicroProfileThreadLogKeyOnce = PTHREAD_ONCE_INIT;
+static void MicroProfileCreateThreadLogKey()
+{
+ pthread_key_create(&g_MicroProfileThreadLogKey, NULL);
+}
+#else
+MP_THREAD_LOCAL MicroProfileThreadLog* g_MicroProfileThreadLog = 0;
+#endif
+static bool g_bUseLock = false; /// Used because Windows does not support taking mutexes during DLL init (which is where global initialization is handled).
+static uint32_t g_nMicroProfileBackColors[2] = { 0x474747, 0x313131 };
+
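+// Lazy per-thread setup: a thread that never called MicroProfileOnThreadCreate()
+// still gets a log on first use, because MicroProfileEnter() (below) does:
+//   if(!MicroProfileGetThreadLog())
+//       MicroProfileInitThreadLog();
+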
+#define MICROPROFILE_NUM_CONTEXT_SWITCH_COLORS 16
+static uint32_t g_nMicroProfileContextSwitchThreadColors[MICROPROFILE_NUM_CONTEXT_SWITCH_COLORS] = //palette generated by http://tools.medialab.sciences-po.fr/iwanthue/index.php
+{
+ 0x63607B,
+ 0x755E2B,
+ 0x326A55,
+ 0x523135,
+ 0x904F42,
+ 0x87536B,
+ 0x346875,
+ 0x5E6046,
+ 0x35404C,
+ 0x224038,
+ 0x413D1E,
+ 0x5E3A26,
+ 0x5D6161,
+ 0x4C6234,
+ 0x7D564F,
+ 0x5C4352,
+};
+static uint32_t g_MicroProfileAggregatePresets[] = {0, 10, 20, 30, 60, 120};
+static float g_MicroProfileReferenceTimePresets[] = {5.f, 10.f, 15.f, 20.f, 33.33f, 66.66f, 100.f};
+static uint32_t g_MicroProfileOpacityPresets[] = {0x40, 0x80, 0xc0, 0xff};
+static const char* g_MicroProfilePresetNames[] =
+{
+ "Default",
+ "Render",
+ "GPU",
+ "Lighting",
+ "AI",
+ "Visibility",
+ "Sound",
+};
+
+
+MICROPROFILE_DEFINE(g_MicroProfileDetailed, "MicroProfile", "Detailed View", 0x8888000);
+MICROPROFILE_DEFINE(g_MicroProfileDrawGraph, "MicroProfile", "Draw Graph", 0xff44ee00);
+MICROPROFILE_DEFINE(g_MicroProfileFlip, "MicroProfile", "MicroProfileFlip", 0x3355ee);
+MICROPROFILE_DEFINE(g_MicroProfileThreadLoop, "MicroProfile", "ThreadLoop", 0x3355ee);
+MICROPROFILE_DEFINE(g_MicroProfileClear, "MicroProfile", "Clear", 0x3355ee);
+MICROPROFILE_DEFINE(g_MicroProfileAccumulate, "MicroProfile", "Accumulate", 0x3355ee);
+MICROPROFILE_DEFINE(g_MicroProfileDrawBarView, "MicroProfile", "DrawBarView", 0x00dd77);
+MICROPROFILE_DEFINE(g_MicroProfileDraw, "MicroProfile", "Draw", 0x737373);
+MICROPROFILE_DEFINE(g_MicroProfileContextSwitchDraw, "MicroProfile", "ContextSwitchDraw", 0x730073);
+MICROPROFILE_DEFINE(g_MicroProfileContextSwitchSearch, "MicroProfile", "ContextSwitchSearch", 0xDD7300);
+
+void MicroProfileStartContextSwitchTrace();
+void MicroProfileStopContextSwitchTrace();
+bool MicroProfileIsLocalThread(uint32_t nThreadId);
+
+inline std::recursive_mutex& MicroProfileMutex()
+{
+ static std::recursive_mutex Mutex;
+ return Mutex;
+}
+
+template<typename T>
+T MicroProfileMin(T a, T b)
+{ return a < b ? a : b; }
+
+template<typename T>
+T MicroProfileMax(T a, T b)
+{ return a > b ? a : b; }
+
+
+
+void MicroProfileStringArrayClear(MicroProfileStringArray* pArray)
+{
+ pArray->nNumStrings = 0;
+ pArray->pBufferPos = &pArray->Buffer[0];
+}
+
+void MicroProfileStringArrayAddLiteral(MicroProfileStringArray* pArray, const char* pLiteral)
+{
+ pArray->ppStrings[pArray->nNumStrings++] = pLiteral;
+}
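+
+// The float-window helpers further below consume these arrays as label/value
+// pairs (even index = label, odd index = value), so entries are pushed two at
+// a time, e.g.:
+//   MicroProfileStringArrayAddLiteral(&Tip, "Time:");
+//   MicroProfileStringArrayFormat(&Tip, "%6.3fms", fMs);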
+void MicroProfileStringArrayFormat(MicroProfileStringArray* pArray, const char* fmt, ...)
+{
+ pArray->ppStrings[pArray->nNumStrings++] = pArray->pBufferPos;
+ va_list args;
+ va_start(args, fmt);
+ pArray->pBufferPos += 1 + vsprintf(pArray->pBufferPos, fmt, args);
+ va_end(args);
+ MP_ASSERT(pArray->pBufferPos < pArray->Buffer + MICROPROFILE_TOOLTIP_STRING_BUFFER_SIZE);
+}
+void MicroProfileStringArrayCopy(MicroProfileStringArray* pDest, MicroProfileStringArray* pSrc)
+{
+ memcpy(&pDest->ppStrings[0], &pSrc->ppStrings[0], sizeof(pDest->ppStrings));
+ memcpy(&pDest->Buffer[0], &pSrc->Buffer[0], sizeof(pDest->Buffer));
+ for(uint32_t i = 0; i < MICROPROFILE_TOOLTIP_MAX_STRINGS; ++i)
+ {
+ if(i < pSrc->nNumStrings)
+ {
+ if(pSrc->ppStrings[i] >= &pSrc->Buffer[0] && pSrc->ppStrings[i] < &pSrc->Buffer[0] + MICROPROFILE_TOOLTIP_STRING_BUFFER_SIZE)
+ {
+ pDest->ppStrings[i] += &pDest->Buffer[0] - &pSrc->Buffer[0];
+ }
+ }
+ }
+ pDest->nNumStrings = pSrc->nNumStrings;
+}
+
+MicroProfileThreadLog* MicroProfileCreateThreadLog(const char* pName);
+void MicroProfileLoadPreset(const char* pSuffix);
+void MicroProfileSavePreset(const char* pSuffix);
+
+
+inline int64_t MicroProfileMsToTick(float fMs, int64_t nTicksPerSecond)
+{
+ return (int64_t)(fMs*0.001f*nTicksPerSecond);
+}
+
+inline float MicroProfileTickToMsMultiplier(int64_t nTicksPerSecond)
+{
+ return 1000.f / nTicksPerSecond;
+}
+
+inline uint16_t MicroProfileGetGroupIndex(MicroProfileToken t)
+{
+ return (uint16_t)S.TimerInfo[MicroProfileGetTimerIndex(t)].nGroupIndex;
+}
+
+
+void MicroProfileInit()
+{
+ std::recursive_mutex& mutex = MicroProfileMutex();
+ bool bUseLock = g_bUseLock;
+ if(bUseLock)
+ mutex.lock();
+ static bool bOnce = true;
+ if(bOnce)
+ {
+ bOnce = false;
+ memset(&S, 0, sizeof(S));
+ S.nMemUsage += sizeof(S);
+ S.nGroupCount = 0;
+ S.nBarWidth = 100;
+ S.nBarHeight = MICROPROFILE_TEXT_HEIGHT;
+ S.nActiveGroup = 0;
+ S.nActiveBars = 0;
+ S.nForceGroup = 0;
+ S.nMenuAllGroups = 0;
+ S.nMenuActiveGroup = 0;
+ S.nMenuAllThreads = 1;
+ S.nAggregateFlip = 30;
+ S.nTotalTimers = 0;
+ for(uint32_t i = 0; i < MICROPROFILE_MAX_GRAPHS; ++i)
+ {
+ S.Graph[i].nToken = MICROPROFILE_INVALID_TOKEN;
+ }
+ S.nBars = MP_DRAW_ALL;
+ S.nRunning = 1;
+ S.fGraphBaseTime = 40.f;
+ S.nWidth = 100;
+ S.nHeight = 100;
+ S.nActiveMenu = (uint32_t)-1;
+ S.fReferenceTime = 33.33f;
+ S.fRcpReferenceTime = 1.f / S.fReferenceTime;
+ S.nFreeListHead = -1;
+ int64_t nTick = MP_TICK();
+ for(int i = 0; i < MICROPROFILE_MAX_FRAME_HISTORY; ++i)
+ {
+ S.Frames[i].nFrameStartCpu = nTick;
+ S.Frames[i].nFrameStartGpu = -1;
+ }
+
+ MicroProfileThreadLog* pGpu = MicroProfileCreateThreadLog("GPU");
+ g_MicroProfileGpuLog = pGpu;
+ MP_ASSERT(S.Pool[0] == pGpu);
+ pGpu->nGpu = 1;
+ pGpu->nThreadId = 0;
+
+
+ S.fDetailedOffsetTarget = S.fDetailedOffset = 0.f;
+ S.fDetailedRangeTarget = S.fDetailedRange = 50.f;
+
+ S.nOpacityBackground = 0xff<<24;
+ S.nOpacityForeground = 0xff<<24;
+ }
+ if(bUseLock)
+ mutex.unlock();
+}
+
+void MicroProfileShutdown()
+{
+#if MICROPROFILE_CONTEXT_SWITCH_TRACE
+ std::lock_guard<std::recursive_mutex> Lock(MicroProfileMutex());
+ if(S.pContextSwitchThread)
+ {
+ if(S.pContextSwitchThread->joinable())
+ {
+ S.bContextSwitchStop = true;
+ S.pContextSwitchThread->join();
+ }
+ delete S.pContextSwitchThread;
+ }
+#endif
+}
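+
+// Typical embedding (a sketch; the frame loop and the MyTimer token are
+// placeholders, the calls are the ones declared above):
+//   MICROPROFILE_DEFINE(MyTimer, "App", "MyTimer", 0xff00ff00);
+//   MicroProfileOnThreadCreate("Main");
+//   for(;;) // once per frame
+//   {
+//       { MICROPROFILE_SCOPE(MyTimer); /* timed work */ }
+//       MicroProfileFlip();
+//       MicroProfileDraw(nScreenWidth, nScreenHeight); // optional UI
+//   }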
+#ifdef MICROPROFILE_IOS
+inline MicroProfileThreadLog* MicroProfileGetThreadLog()
+{
+ pthread_once(&g_MicroProfileThreadLogKeyOnce, MicroProfileCreateThreadLogKey);
+ return (MicroProfileThreadLog*)pthread_getspecific(g_MicroProfileThreadLogKey);
+}
+
+inline void MicroProfileSetThreadLog(MicroProfileThreadLog* pLog)
+{
+ pthread_once(&g_MicroProfileThreadLogKeyOnce, MicroProfileCreateThreadLogKey);
+ pthread_setspecific(g_MicroProfileThreadLogKey, pLog);
+}
+#else
+MicroProfileThreadLog* MicroProfileGetThreadLog()
+{
+ return g_MicroProfileThreadLog;
+}
+inline void MicroProfileSetThreadLog(MicroProfileThreadLog* pLog)
+{
+ g_MicroProfileThreadLog = pLog;
+}
+#endif
+
+
+MicroProfileThreadLog* MicroProfileCreateThreadLog(const char* pName)
+{
+ MicroProfileThreadLog* pLog = 0;
+ if(S.nFreeListHead != -1)
+ {
+ pLog = S.Pool[S.nFreeListHead];
+ S.nFreeListHead = S.Pool[S.nFreeListHead]->nFreeListNext;
+ }
+ else
+ {
+ pLog = new MicroProfileThreadLog;
+ S.nMemUsage += sizeof(MicroProfileThreadLog);
+ S.Pool[S.nNumLogs++] = pLog;
+ }
+ memset(pLog, 0, sizeof(*pLog));
+ int len = (int)strlen(pName);
+ int maxlen = sizeof(pLog->ThreadName)-1;
+ len = len < maxlen ? len : maxlen;
+ memcpy(&pLog->ThreadName[0], pName, len);
+ pLog->ThreadName[len] = '\0';
+ pLog->nThreadId = MP_GETCURRENTTHREADID();
+ pLog->nFreeListNext = -1;
+ return pLog;
+}
+
+void MicroProfileOnThreadCreate(const char* pThreadName)
+{
+ g_bUseLock = true;
+ MicroProfileInit();
+ std::lock_guard<std::recursive_mutex> Lock(MicroProfileMutex());
+ MP_ASSERT(MicroProfileGetThreadLog() == 0);
+ MicroProfileThreadLog* pLog = MicroProfileCreateThreadLog(pThreadName ? pThreadName : MicroProfileGetThreadName());
+ MP_ASSERT(pLog);
+ MicroProfileSetThreadLog(pLog);
+}
+
+void MicroProfileOnThreadExit()
+{
+ MicroProfileThreadLog* pLog = MicroProfileGetThreadLog();
+ if(pLog)
+ {
+ int32_t nLogIndex = -1;
+ for(int i = 0; i < MICROPROFILE_MAX_THREADS; ++i)
+ {
+ if(pLog == S.Pool[i])
+ {
+ nLogIndex = i;
+ break;
+ }
+ }
+ MP_ASSERT(nLogIndex < MICROPROFILE_MAX_THREADS && nLogIndex > 0);
+ pLog->nFreeListNext = S.nFreeListHead;
+ pLog->nThreadId = 0;
+ S.nFreeListHead = nLogIndex;
+ }
+}
+
+void MicroProfileInitThreadLog()
+{
+ MicroProfileOnThreadCreate(nullptr);
+}
+
+
+struct MicroProfileScopeLock
+{
+ bool bUseLock;
+ std::recursive_mutex& m;
+ MicroProfileScopeLock(std::recursive_mutex& m) : bUseLock(g_bUseLock), m(m)
+ {
+ if(bUseLock)
+ m.lock();
+ }
+ ~MicroProfileScopeLock()
+ {
+ if(bUseLock)
+ m.unlock();
+ }
+};
+
+MicroProfileToken MicroProfileFindToken(const char* pGroup, const char* pName)
+{
+ MicroProfileInit();
+ MicroProfileScopeLock L(MicroProfileMutex());
+ for(uint32_t i = 0; i < S.nTotalTimers; ++i)
+ {
+ if(!MP_STRCASECMP(pName, S.TimerInfo[i].pName) && !MP_STRCASECMP(pGroup, S.GroupInfo[S.TimerInfo[i].nGroupIndex].pName))
+ {
+ return S.TimerInfo[i].nToken;
+ }
+ }
+ return MICROPROFILE_INVALID_TOKEN;
+}
+
+uint16_t MicroProfileGetGroup(const char* pGroup, MicroProfileTokenType Type)
+{
+ for(uint32_t i = 0; i < S.nGroupCount; ++i)
+ {
+ if(!MP_STRCASECMP(pGroup, S.GroupInfo[i].pName))
+ {
+ return i;
+ }
+ }
+ uint16_t nGroupIndex = 0xffff;
+ S.GroupInfo[S.nGroupCount].pName = pGroup;
+ S.GroupInfo[S.nGroupCount].nNameLen = (uint32_t)strlen(pGroup);
+ S.GroupInfo[S.nGroupCount].nGroupIndex = S.nGroupCount;
+ S.GroupInfo[S.nGroupCount].nNumTimers = 0;
+ S.GroupInfo[S.nGroupCount].Type = Type;
+ S.GroupInfo[S.nGroupCount].nMaxTimerNameLen = 0;
+ nGroupIndex = S.nGroupCount++;
+ S.nGroupMask = (S.nGroupMask<<1)|1;
+ MP_ASSERT(nGroupIndex < MICROPROFILE_MAX_GROUPS);
+ return nGroupIndex;
+}
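+
+// Each group owns one bit of the 48-bit group mask: the Nth group registered
+// gets mask 1<<N, and a timer records only while its group's bit is set in
+// S.nActiveGroup. E.g. if "GPU" were the third group created, its tokens
+// would satisfy MicroProfileGetGroupMask(t) == 0x4.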
+
+MicroProfileToken MicroProfileGetToken(const char* pGroup, const char* pName, uint32_t nColor, MicroProfileTokenType Type)
+{
+ MicroProfileInit();
+ MicroProfileScopeLock L(MicroProfileMutex());
+ MicroProfileToken ret = MicroProfileFindToken(pGroup, pName);
+ if(ret != MICROPROFILE_INVALID_TOKEN)
+ return ret;
+ uint16_t nGroupIndex = MicroProfileGetGroup(pGroup, Type);
+ uint16_t nTimerIndex = (uint16_t)(S.nTotalTimers++);
+ uint64_t nGroupMask = 1ll << nGroupIndex;
+ MicroProfileToken nToken = MicroProfileMakeToken(nGroupMask, nTimerIndex);
+ S.GroupInfo[nGroupIndex].nNumTimers++;
+ S.GroupInfo[nGroupIndex].nMaxTimerNameLen = MicroProfileMax(S.GroupInfo[nGroupIndex].nMaxTimerNameLen, (uint32_t)strlen(pName));
+ MP_ASSERT(S.GroupInfo[nGroupIndex].Type == Type); //dont mix cpu & gpu timers in the same group
+ S.nMaxGroupSize = MicroProfileMax(S.nMaxGroupSize, S.GroupInfo[nGroupIndex].nNumTimers);
+ S.TimerInfo[nTimerIndex].nToken = nToken;
+ S.TimerInfo[nTimerIndex].pName = pName;
+ S.TimerInfo[nTimerIndex].nNameLen = (uint32_t)strlen(pName);
+ S.TimerInfo[nTimerIndex].nColor = nColor&0xffffff;
+ S.TimerInfo[nTimerIndex].nGroupIndex = nGroupIndex;
+ return nToken;
+}
+
+MicroProfileToken MicroProfileGetMetaToken(const char* pName)
+{
+ MicroProfileInit();
+ MicroProfileScopeLock L(MicroProfileMutex());
+ for(uint32_t i = 0; i < MICROPROFILE_META_MAX; ++i)
+ {
+ if(!S.MetaCounters[i].pName)
+ {
+ S.MetaCounters[i].pName = pName;
+ return i;
+ }
+ else if(!MP_STRCASECMP(pName, S.MetaCounters[i].pName))
+ {
+ return i;
+ }
+ }
+ MP_ASSERT(0); //out of slots, increase MICROPROFILE_META_MAX
+ return (MicroProfileToken)-1;
+}
+
+
+inline void MicroProfileLogPut(MicroProfileToken nToken_, uint64_t nTick, uint64_t nBegin, MicroProfileThreadLog* pLog)
+{
+ MP_ASSERT(pLog != 0); //this assert is hit if MicroProfileOnThreadCreate is not called
+ uint32_t nPos = pLog->nPut.load(std::memory_order_relaxed);
+ uint32_t nNextPos = (nPos+1) % MICROPROFILE_BUFFER_SIZE;
+ if(nNextPos == pLog->nGet.load(std::memory_order_relaxed))
+ {
+ S.nOverflow = 100;
+ }
+ else
+ {
+ int64_t test = MicroProfileMakeLogIndex(nBegin, nToken_, nTick);
+ MP_ASSERT(MicroProfileLogType(test) == nBegin);
+ MP_ASSERT(MicroProfileLogTimerIndex(test) == MicroProfileGetTimerIndex(nToken_));
+ pLog->Log[nPos] = MicroProfileMakeLogIndex(nBegin, nToken_, nTick);
+ pLog->nPut.store(nNextPos, std::memory_order_release);
+ }
+}
+
+uint64_t MicroProfileEnter(MicroProfileToken nToken_)
+{
+ if(MicroProfileGetGroupMask(nToken_) & S.nActiveGroup)
+ {
+ if(!MicroProfileGetThreadLog())
+ {
+ MicroProfileInitThreadLog();
+ }
+ uint64_t nTick = MP_TICK();
+ MicroProfileLogPut(nToken_, nTick, MP_LOG_ENTER, MicroProfileGetThreadLog());
+ return nTick;
+ }
+ return MICROPROFILE_INVALID_TICK;
+}
+
+void MicroProfileMetaUpdate(MicroProfileToken nToken, int nCount, MicroProfileTokenType eTokenType)
+{
+ if((MP_DRAW_META_FIRST<<nToken) & S.nActiveBars)
+ {
+ MicroProfileThreadLog* pLog = MicroProfileTokenTypeCpu == eTokenType ? MicroProfileGetThreadLog() : g_MicroProfileGpuLog;
+ if(pLog)
+ {
+ MP_ASSERT(nToken < MICROPROFILE_META_MAX);
+ MicroProfileLogPut(nToken, nCount, MP_LOG_META, pLog);
+ }
+ }
+}
+
+void MicroProfileLeave(MicroProfileToken nToken_, uint64_t nTickStart)
+{
+ if(MICROPROFILE_INVALID_TICK != nTickStart)
+ {
+ if(!MicroProfileGetThreadLog())
+ {
+ MicroProfileInitThreadLog();
+ }
+ uint64_t nTick = MP_TICK();
+ MicroProfileLogPut(nToken_, nTick, MP_LOG_LEAVE, MicroProfileGetThreadLog());
+ }
+}
+
+uint64_t MicroProfileGpuEnter(MicroProfileToken nToken_)
+{
+ if(MicroProfileGetGroupMask(nToken_) & S.nActiveGroup)
+ {
+ uint64_t nTimer = MicroProfileGpuInsertTimeStamp();
+ MicroProfileLogPut(nToken_, nTimer, MP_LOG_ENTER, g_MicroProfileGpuLog);
+ return 1;
+ }
+ return 0;
+}
+
+void MicroProfileGpuLeave(MicroProfileToken nToken_, uint64_t nTick)
+{
+ if(nTick)
+ {
+ uint64_t nTimer = MicroProfileGpuInsertTimeStamp();
+ MicroProfileLogPut(nToken_, nTimer, MP_LOG_LEAVE, g_MicroProfileGpuLog);
+ }
+}
+
+void MicroProfileGetRange(uint32_t nPut, uint32_t nGet, uint32_t nRange[2][2])
+{
+ if(nPut > nGet)
+ {
+ nRange[0][0] = nGet;
+ nRange[0][1] = nPut;
+ nRange[1][0] = nRange[1][1] = 0;
+ }
+ else if(nPut != nGet)
+ {
+ MP_ASSERT(nGet != MICROPROFILE_BUFFER_SIZE);
+ uint32_t nCountEnd = MICROPROFILE_BUFFER_SIZE - nGet;
+ nRange[0][0] = nGet;
+ nRange[0][1] = nGet + nCountEnd;
+ nRange[1][0] = 0;
+ nRange[1][1] = nPut;
+ }
+}
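+
+// Worked example: MicroProfileGetRange splits the window [nGet, nPut) of the
+// ring buffer into at most two contiguous runs. With MICROPROFILE_BUFFER_SIZE
+// of 8, nGet == 6 and nPut == 3, the window wraps and the result is
+// nRange[0] == {6, 8} and nRange[1] == {0, 3}; unwrapped windows fill only
+// nRange[0].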
+
+void MicroProfileFlip()
+{
+ #if 0
+ //verify LogEntry wraps correctly
+ MicroProfileLogEntry c = MP_LOG_TICK_MASK-5000;
+ for(int i = 0; i < 10000; ++i, c += 1)
+ {
+ MicroProfileLogEntry l2 = (c+2500) & MP_LOG_TICK_MASK;
+ MP_ASSERT(2500 == MicroProfileLogTickDifference(c, l2));
+ }
+ #endif
+ MICROPROFILE_SCOPE(g_MicroProfileFlip);
+ std::lock_guard<std::recursive_mutex> Lock(MicroProfileMutex());
+
+ {
+ static int once = 0;
+ if(0 == once)
+ {
+ uint32_t nDisplay = S.nDisplay;
+ MicroProfileLoadPreset(g_MicroProfilePresetNames[0]);
+ once++;
+ S.nDisplay = nDisplay; //dont load the display mode, just the state
+ }
+ }
+
+ if(S.nRunning)
+ {
+ S.nFramePut = (S.nFramePut+1) % MICROPROFILE_MAX_FRAME_HISTORY;
+ S.nFrameCurrent = (S.nFramePut + MICROPROFILE_MAX_FRAME_HISTORY - MICROPROFILE_GPU_FRAME_DELAY - 1) % MICROPROFILE_MAX_FRAME_HISTORY;
+ uint32_t nFrameNext = (S.nFrameCurrent+1) % MICROPROFILE_MAX_FRAME_HISTORY;
+
+ uint32_t nContextSwitchPut = S.nContextSwitchPut;
+ if(S.nContextSwitchLastPut < nContextSwitchPut)
+ {
+ S.nContextSwitchUsage = (nContextSwitchPut - S.nContextSwitchLastPut);
+ }
+ else
+ {
+ S.nContextSwitchUsage = MICROPROFILE_CONTEXT_SWITCH_BUFFER_SIZE - S.nContextSwitchLastPut + nContextSwitchPut;
+ }
+ S.nContextSwitchLastPut = nContextSwitchPut;
+
+ MicroProfileFrameState* pFramePut = &S.Frames[S.nFramePut];
+ MicroProfileFrameState* pFrameCurrent = &S.Frames[S.nFrameCurrent];
+ MicroProfileFrameState* pFrameNext = &S.Frames[nFrameNext];
+
+ pFramePut->nFrameStartCpu = MP_TICK();
+ pFramePut->nFrameStartGpu = (uint32_t)MicroProfileGpuInsertTimeStamp();
+ if(pFrameNext->nFrameStartGpu != (uint64_t)-1)
+ pFrameNext->nFrameStartGpu = MicroProfileGpuGetTimeStamp((uint32_t)pFrameNext->nFrameStartGpu);
+
+ if(pFrameCurrent->nFrameStartGpu == (uint64_t)-1)
+ pFrameCurrent->nFrameStartGpu = pFrameNext->nFrameStartGpu + 1;
+
+ uint64_t nFrameStartCpu = pFrameCurrent->nFrameStartCpu;
+ uint64_t nFrameEndCpu = pFrameNext->nFrameStartCpu;
+ uint64_t nFrameStartGpu = pFrameCurrent->nFrameStartGpu;
+ uint64_t nFrameEndGpu = pFrameNext->nFrameStartGpu;
+
+ {
+ uint64_t nTick = nFrameEndCpu - nFrameStartCpu;
+ S.nFlipTicks = nTick;
+ S.nFlipAggregate += nTick;
+ S.nFlipMax = MicroProfileMax(S.nFlipMax, nTick);
+ }
+
+ for(uint32_t i = 0; i < MICROPROFILE_MAX_THREADS; ++i)
+ {
+ MicroProfileThreadLog* pLog = S.Pool[i];
+ if(!pLog)
+ {
+ pFramePut->nLogStart[i] = 0;
+ }
+ else
+ {
+ pFramePut->nLogStart[i] = pLog->nPut.load(std::memory_order_acquire);
+ //need to keep the last frame around to close timers; timers more than 1 frame old are ditched.
+ pLog->nGet.store(pFrameCurrent->nLogStart[i], std::memory_order_relaxed);
+ }
+ }
+
+ if(S.nRunning)
+ {
+ {
+ MICROPROFILE_SCOPE(g_MicroProfileClear);
+ for(uint32_t i = 0; i < S.nTotalTimers; ++i)
+ {
+ S.Frame[i].nTicks = 0;
+ S.Frame[i].nCount = 0;
+ S.FrameExclusive[i] = 0;
+ }
+ for(uint32_t j = 0; j < MICROPROFILE_META_MAX; ++j)
+ {
+ if(S.MetaCounters[j].pName)
+ {
+ for(uint32_t i = 0; i < S.nTotalTimers; ++i)
+ {
+ S.MetaCounters[j].nCounters[i] = 0;
+ }
+ }
+ }
+ }
+ {
+ MICROPROFILE_SCOPE(g_MicroProfileThreadLoop);
+ for(uint32_t i = 0; i < MICROPROFILE_MAX_THREADS; ++i)
+ {
+ MicroProfileThreadLog* pLog = S.Pool[i];
+ if(!pLog)
+ continue;
+
+ uint32_t nPut = pFrameNext->nLogStart[i];
+ uint32_t nGet = pFrameCurrent->nLogStart[i];
+ uint32_t nRange[2][2] = { {0, 0}, {0, 0}, };
+ MicroProfileGetRange(nPut, nGet, nRange);
+
+
+ uint64_t nFrameStart = pLog->nGpu ? nFrameStartGpu : nFrameStartCpu;
+ uint64_t nFrameEnd = pLog->nGpu ? nFrameEndGpu : nFrameEndCpu;
+ //fetch gpu results.
+ if(pLog->nGpu) + { + for(uint32_t j = 0; j < 2; ++j) + { + uint32_t nStart = nRange[j][0]; + uint32_t nEnd = nRange[j][1]; + for(uint32_t k = nStart; k < nEnd; ++k) + { + MicroProfileLogEntry L = pLog->Log[k]; + pLog->Log[k] = MicroProfileLogSetTick(L, MicroProfileGpuGetTimeStamp((uint32_t)MicroProfileLogGetTick(L))); + } + } + } + uint32_t nStack[MICROPROFILE_STACK_MAX]; + int64_t nChildTickStack[MICROPROFILE_STACK_MAX]; + uint32_t nStackPos = 0; + nChildTickStack[0] = 0; + + for(uint32_t j = 0; j < 2; ++j) + { + uint32_t nStart = nRange[j][0]; + uint32_t nEnd = nRange[j][1]; + for(uint32_t k = nStart; k < nEnd; ++k) + { + MicroProfileLogEntry LE = pLog->Log[k]; + int nType = MicroProfileLogType(LE); + if(MP_LOG_ENTER == nType) + { + MP_ASSERT(nStackPos < MICROPROFILE_STACK_MAX); + nStack[nStackPos++] = k; + nChildTickStack[nStackPos] = 0; + } + else if(MP_LOG_META == nType) + { + if(nStackPos) + { + int64_t nMetaIndex = MicroProfileLogTimerIndex(LE); + int64_t nMetaCount = MicroProfileLogGetTick(LE); + MP_ASSERT(nMetaIndex < MICROPROFILE_META_MAX); + int64_t nCounter = MicroProfileLogTimerIndex(pLog->Log[nStack[nStackPos-1]]); + S.MetaCounters[nMetaIndex].nCounters[nCounter] += nMetaCount; + } + } + else + { + MP_ASSERT(nType == MP_LOG_LEAVE); + //todo: reconsider the fallback for Leaves without enters + int64_t nTickStart = 0 != nStackPos ? pLog->Log[nStack[nStackPos-1]] : nFrameStart; + int64_t nTicks = MicroProfileLogTickDifference(nTickStart, LE); + int64_t nChildTicks = nChildTickStack[nStackPos]; + if(0 != nStackPos) + { + MP_ASSERT(MicroProfileLogTimerIndex(pLog->Log[nStack[nStackPos-1]]) == MicroProfileLogTimerIndex(LE)); + nStackPos--; + nChildTickStack[nStackPos] += nTicks; + } + uint32_t nTimerIndex = MicroProfileLogTimerIndex(LE); + S.Frame[nTimerIndex].nTicks += nTicks; + S.FrameExclusive[nTimerIndex] += (nTicks-nChildTicks); + S.Frame[nTimerIndex].nCount += 1; + } + } + } + //todo: reconsider the fallback for enters without leaves + for(uint32_t j = 0; j < nStackPos; ++j) + { + MicroProfileLogEntry LE = pLog->Log[nStack[j]]; + uint64_t nTicks = MicroProfileLogTickDifference(LE, nFrameEnd); + uint32_t nTimerIndex = MicroProfileLogTimerIndex(LE); + S.Frame[nTimerIndex].nTicks += nTicks; + } + } + } + { + MICROPROFILE_SCOPE(g_MicroProfileAccumulate); + for(uint32_t i = 0; i < S.nTotalTimers; ++i) + { + S.AggregateTimers[i].nTicks += S.Frame[i].nTicks; + S.AggregateTimers[i].nCount += S.Frame[i].nCount; + S.MaxTimers[i] = MicroProfileMax(S.MaxTimers[i], S.Frame[i].nTicks); + S.AggregateTimersExclusive[i] += S.FrameExclusive[i]; + S.MaxTimersExclusive[i] = MicroProfileMax(S.MaxTimersExclusive[i], S.FrameExclusive[i]); + } + } + for(uint32_t i = 0; i < MICROPROFILE_MAX_GRAPHS; ++i) + { + if(S.Graph[i].nToken != MICROPROFILE_INVALID_TOKEN) + { + MicroProfileToken nToken = S.Graph[i].nToken; + S.Graph[i].nHistory[S.nGraphPut] = S.Frame[MicroProfileGetTimerIndex(nToken)].nTicks; + } + } + S.nGraphPut = (S.nGraphPut+1) % MICROPROFILE_GRAPH_HISTORY; + + } + + + if(S.nRunning && S.nAggregateFlip <= ++S.nAggregateFlipCount) + { + memcpy(&S.Aggregate[0], &S.AggregateTimers[0], sizeof(S.Aggregate[0]) * S.nTotalTimers); + memcpy(&S.AggregateMax[0], &S.MaxTimers[0], sizeof(S.AggregateMax[0]) * S.nTotalTimers); + memcpy(&S.AggregateExclusive[0], &S.AggregateTimersExclusive[0], sizeof(S.AggregateExclusive[0]) * S.nTotalTimers); + memcpy(&S.AggregateMaxExclusive[0], &S.MaxTimersExclusive[0], sizeof(S.AggregateMaxExclusive[0]) * S.nTotalTimers); + + S.nAggregateFrames = S.nAggregateFlipCount; 
+ S.nFlipAggregateDisplay = S.nFlipAggregate; + S.nFlipMaxDisplay = S.nFlipMax; + + + if(S.nAggregateFlip) // if 0 accumulate indefinitely + { + memset(&S.AggregateTimers[0], 0, sizeof(S.Aggregate[0]) * S.nTotalTimers); + memset(&S.MaxTimers[0], 0, sizeof(S.MaxTimers[0]) * S.nTotalTimers); + memset(&S.AggregateTimersExclusive[0], 0, sizeof(S.AggregateExclusive[0]) * S.nTotalTimers); + memset(&S.MaxTimersExclusive[0], 0, sizeof(S.MaxTimersExclusive[0]) * S.nTotalTimers); + S.nAggregateFlipCount = 0; + S.nFlipAggregate = 0; + S.nFlipMax = 0; + } + } + } + uint64_t nNewActiveGroup = 0; + if(S.nDisplay && S.nRunning) + nNewActiveGroup = S.nMenuAllGroups ? S.nGroupMask : S.nMenuActiveGroup; + nNewActiveGroup |= S.nForceGroup; + if(S.nActiveGroup != nNewActiveGroup) + S.nActiveGroup = nNewActiveGroup; + uint32_t nNewActiveBars = 0; + if(S.nDisplay && S.nRunning) + nNewActiveBars = S.nBars; + if(nNewActiveBars != S.nActiveBars) + S.nActiveBars = nNewActiveBars; + + S.fDetailedOffset = S.fDetailedOffset + (S.fDetailedOffsetTarget - S.fDetailedOffset) * MICROPROFILE_ANIM_DELAY_PRC; + S.fDetailedRange = S.fDetailedRange + (S.fDetailedRangeTarget - S.fDetailedRange) * MICROPROFILE_ANIM_DELAY_PRC; + +} + +void MicroProfileSetDisplayMode(int nValue) +{ + nValue = nValue >= 0 && nValue < 4 ? nValue : S.nDisplay; + S.nDisplay = nValue; + S.fGraphBaseTime = 40.f; + S.nOffsetY = 0; +} + +void MicroProfileToggleDisplayMode() +{ + S.nDisplay = (S.nDisplay + 1) % 4; + S.nOffsetY = 0; + +} + + +void MicroProfileFloatWindowSize(const char** ppStrings, uint32_t nNumStrings, uint32_t* pColors, uint32_t& nWidth, uint32_t& nHeight, uint32_t* pStringLengths = 0) +{ + uint32_t* nStringLengths = pStringLengths ? pStringLengths : (uint32_t*)alloca(nNumStrings * sizeof(uint32_t)); + uint32_t nTextCount = nNumStrings/2; + for(uint32_t i = 0; i < nTextCount; ++i) + { + uint32_t i0 = i * 2; + uint32_t s0, s1; + nStringLengths[i0] = s0 = (uint32_t)strlen(ppStrings[i0]); + nStringLengths[i0+1] = s1 = (uint32_t)strlen(ppStrings[i0+1]); + nWidth = MicroProfileMax(s0+s1, nWidth); + } + nWidth = (MICROPROFILE_TEXT_WIDTH+1) * (2+nWidth) + 2 * MICROPROFILE_BORDER_SIZE; + if(pColors) + nWidth += MICROPROFILE_TEXT_WIDTH + 1; + nHeight = (MICROPROFILE_TEXT_HEIGHT+1) * nTextCount + 2 * MICROPROFILE_BORDER_SIZE; +} + +void MicroProfileDrawFloatWindow(uint32_t nX, uint32_t nY, const char** ppStrings, uint32_t nNumStrings, uint32_t nColor, uint32_t* pColors = 0) +{ + uint32_t nWidth = 0, nHeight = 0; + uint32_t* nStringLengths = (uint32_t*)alloca(nNumStrings * sizeof(uint32_t)); + MicroProfileFloatWindowSize(ppStrings, nNumStrings, pColors, nWidth, nHeight, nStringLengths); + uint32_t nTextCount = nNumStrings/2; + if(nX + nWidth > S.nWidth) + nX = S.nWidth - nWidth; + if(nY + nHeight > S.nHeight) + nY = S.nHeight - nHeight; + MicroProfileDrawBox(nX-1, nY-1, nX + nWidth+1, nY + nHeight+1, 0xff000000|nColor); + MicroProfileDrawBox(nX, nY, nX + nWidth, nY + nHeight, 0xff000000); + if(pColors) + { + nX += MICROPROFILE_TEXT_WIDTH+1; + nWidth -= MICROPROFILE_TEXT_WIDTH+1; + } + for(uint32_t i = 0; i < nTextCount; ++i) + { + int i0 = i * 2; + if(pColors) + { + MicroProfileDrawBox(nX-MICROPROFILE_TEXT_WIDTH, nY, nX, nY + MICROPROFILE_TEXT_WIDTH, pColors[i]|0xff000000); + } + MicroProfileDrawText(nX + 1, nY + 1, (uint32_t)-1, ppStrings[i0], strlen(ppStrings[i0])); + MicroProfileDrawText(nX + nWidth - nStringLengths[i0+1] * (MICROPROFILE_TEXT_WIDTH+1), nY + 1, (uint32_t)-1, ppStrings[i0+1], strlen(ppStrings[i0+1])); + nY += 
(MICROPROFILE_TEXT_HEIGHT+1); + } +} + +void MicroProfileDrawTextBox(uint32_t nX, uint32_t nY, const char** ppStrings, uint32_t nNumStrings, uint32_t nColor, uint32_t* pColors = 0) +{ + uint32_t nWidth = 0, nHeight = 0; + uint32_t* nStringLengths = (uint32_t*)alloca(nNumStrings * sizeof(uint32_t)); + for(uint32_t i = 0; i < nNumStrings; ++i) + { + nStringLengths[i] = (uint32_t)strlen(ppStrings[i]); + nWidth = MicroProfileMax(nWidth, nStringLengths[i]); + nHeight++; + } + nWidth = (MICROPROFILE_TEXT_WIDTH+1) * (2+nWidth) + 2 * MICROPROFILE_BORDER_SIZE; + nHeight = (MICROPROFILE_TEXT_HEIGHT+1) * nHeight + 2 * MICROPROFILE_BORDER_SIZE; + if(nX + nWidth > S.nWidth) + nX = S.nWidth - nWidth; + if(nY + nHeight > S.nHeight) + nY = S.nHeight - nHeight; + MicroProfileDrawBox(nX, nY, nX + nWidth, nY + nHeight, 0xff000000); + for(uint32_t i = 0; i < nNumStrings; ++i) + { + MicroProfileDrawText(nX + 1, nY + 1, (uint32_t)-1, ppStrings[i], strlen(ppStrings[i])); + nY += (MICROPROFILE_TEXT_HEIGHT+1); + } +} + + + +void MicroProfileToolTipMeta(MicroProfileStringArray* pToolTip) +{ + if(S.nRangeBeginIndex != S.nRangeEndIndex && S.pRangeLog) + { + uint64_t nMetaSum[MICROPROFILE_META_MAX] = {0}; + + uint32_t nRange[2][2]; + MicroProfileThreadLog* pLog = S.pRangeLog; + + + MicroProfileGetRange(S.nRangeEndIndex, S.nRangeBeginIndex, nRange); + for(uint32_t i = 0; i < 2; ++i) + { + uint32_t nStart = nRange[i][0]; + uint32_t nEnd = nRange[i][1]; + for(uint32_t j = nStart; j < nEnd; ++j) + { + MicroProfileLogEntry LE = pLog->Log[j]; + int nType = MicroProfileLogType(LE); + if(MP_LOG_META == nType) + { + int64_t nMetaIndex = MicroProfileLogTimerIndex(LE); + int64_t nMetaCount = MicroProfileLogGetTick(LE); + MP_ASSERT(nMetaIndex < MICROPROFILE_META_MAX); + nMetaSum[nMetaIndex] += nMetaCount; + } + } + } + bool bSpaced = false; + for(int i = 0; i < MICROPROFILE_META_MAX; ++i) + { + if(S.MetaCounters[i].pName && nMetaSum[i]) + { + if(!bSpaced) + { + bSpaced = true; + MicroProfileStringArrayAddLiteral(pToolTip, ""); + MicroProfileStringArrayAddLiteral(pToolTip, ""); + } + MicroProfileStringArrayFormat(pToolTip, "%s", S.MetaCounters[i].pName); + MicroProfileStringArrayFormat(pToolTip, "%5d", nMetaSum[i]); + } + } + } +} + + +void MicroProfileDrawFloatTooltip(uint32_t nX, uint32_t nY, uint32_t nToken, uint64_t nTime) +{ + uint32_t nIndex = MicroProfileGetTimerIndex(nToken); + uint32_t nAggregateFrames = S.nAggregateFrames ? S.nAggregateFrames : 1; + uint32_t nAggregateCount = S.Aggregate[nIndex].nCount ? S.Aggregate[nIndex].nCount : 1; + + uint32_t nGroupId = MicroProfileGetGroupIndex(nToken); + uint32_t nTimerId = MicroProfileGetTimerIndex(nToken); + bool bGpu = S.GroupInfo[nGroupId].Type == MicroProfileTokenTypeGpu; + + float fToMs = MicroProfileTickToMsMultiplier(bGpu ? 
MicroProfileTicksPerSecondGpu() : MicroProfileTicksPerSecondCpu()); + + float fMs = fToMs * (nTime); + float fFrameMs = fToMs * (S.Frame[nIndex].nTicks); + float fAverage = fToMs * (S.Aggregate[nIndex].nTicks/nAggregateFrames); + float fCallAverage = fToMs * (S.Aggregate[nIndex].nTicks / nAggregateCount); + float fMax = fToMs * (S.AggregateMax[nIndex]); + + float fFrameMsExclusive = fToMs * (S.FrameExclusive[nIndex]); + float fAverageExclusive = fToMs * (S.AggregateExclusive[nIndex]/nAggregateFrames); + float fMaxExclusive = fToMs * (S.AggregateMaxExclusive[nIndex]); + + + MicroProfileStringArray ToolTip; + MicroProfileStringArrayClear(&ToolTip); + const char* pGroupName = S.GroupInfo[nGroupId].pName; + const char* pTimerName = S.TimerInfo[nTimerId].pName; + MicroProfileStringArrayFormat(&ToolTip, "%s", pGroupName); + MicroProfileStringArrayFormat(&ToolTip,"%s", pTimerName); + +#if MICROPROFILE_DEBUG + MicroProfileStringArrayFormat(&ToolTip,"0x%p", S.nHoverAddressEnter); + MicroProfileStringArrayFormat(&ToolTip,"0x%p", S.nHoverAddressLeave); +#endif + + if(nTime != (uint64_t)0) + { + MicroProfileStringArrayAddLiteral(&ToolTip, "Time:"); + MicroProfileStringArrayFormat(&ToolTip,"%6.3fms", fMs); + MicroProfileStringArrayAddLiteral(&ToolTip, ""); + MicroProfileStringArrayAddLiteral(&ToolTip, ""); + } + + MicroProfileStringArrayAddLiteral(&ToolTip, "Frame Time:"); + MicroProfileStringArrayFormat(&ToolTip,"%6.3fms", fFrameMs); + + MicroProfileStringArrayAddLiteral(&ToolTip, "Average:"); + MicroProfileStringArrayFormat(&ToolTip,"%6.3fms", fAverage); + + MicroProfileStringArrayAddLiteral(&ToolTip, "Max:"); + MicroProfileStringArrayFormat(&ToolTip,"%6.3fms", fMax); + + MicroProfileStringArrayAddLiteral(&ToolTip, ""); + MicroProfileStringArrayAddLiteral(&ToolTip, ""); + + MicroProfileStringArrayAddLiteral(&ToolTip, "Frame Call Average:"); + MicroProfileStringArrayFormat(&ToolTip,"%6.3fms", fCallAverage); + + MicroProfileStringArrayAddLiteral(&ToolTip, "Frame Call Count:"); + MicroProfileStringArrayFormat(&ToolTip, "%6d", nAggregateCount / nAggregateFrames); + + MicroProfileStringArrayAddLiteral(&ToolTip, ""); + MicroProfileStringArrayAddLiteral(&ToolTip, ""); + + MicroProfileStringArrayAddLiteral(&ToolTip, "Exclusive Frame Time:"); + MicroProfileStringArrayFormat(&ToolTip, "%6.3fms", fFrameMsExclusive); + + MicroProfileStringArrayAddLiteral(&ToolTip, "Exclusive Average:"); + MicroProfileStringArrayFormat(&ToolTip, "%6.3fms", fAverageExclusive); + + MicroProfileStringArrayAddLiteral(&ToolTip, "Exclusive Max:"); + MicroProfileStringArrayFormat(&ToolTip, "%6.3fms", fMaxExclusive); + + MicroProfileToolTipMeta(&ToolTip); + + + MicroProfileDrawFloatWindow(nX, nY+20, &ToolTip.ppStrings[0], ToolTip.nNumStrings, S.TimerInfo[nTimerId].nColor); + + if(S.nMouseLeftMod) + { + int nIndex = (S.LockedToolTipFront + MICROPROFILE_TOOLTIP_MAX_LOCKED - 1) % MICROPROFILE_TOOLTIP_MAX_LOCKED; + S.nLockedToolTipColor[nIndex] = S.TimerInfo[nTimerId].nColor; + MicroProfileStringArrayCopy(&S.LockedToolTips[nIndex], &ToolTip); + S.LockedToolTipFront = nIndex; + + } +} + +#define MICROPROFILE_FRAME_HISTORY_HEIGHT 50 +#define MICROPROFILE_FRAME_HISTORY_WIDTH 7 +#define MICROPROFILE_FRAME_HISTORY_COLOR_CPU 0xffff7f27 //255 127 39 +#define MICROPROFILE_FRAME_HISTORY_COLOR_GPU 0xff37a0ee //55 160 238 +#define MICROPROFILE_FRAME_HISTORY_COLOR_HIGHTLIGHT 0x7733bb44 +#define MICROPROFILE_FRAME_COLOR_HIGHTLIGHT 0x20009900 +#define MICROPROFILE_FRAME_COLOR_HIGHTLIGHT_GPU 0x20996600 +#define MICROPROFILE_NUM_FRAMES 
(MICROPROFILE_MAX_FRAME_HISTORY - (MICROPROFILE_GPU_FRAME_DELAY+1))
+
+void MicroProfileZoomTo(int64_t nTickStart, int64_t nTickEnd)
+{
+ int64_t nStart = S.Frames[S.nFrameCurrent].nFrameStartCpu;
+ float fToMs = MicroProfileTickToMsMultiplier(MicroProfileTicksPerSecondCpu());
+ S.fDetailedOffsetTarget = MicroProfileLogTickDifference(nStart, nTickStart) * fToMs;
+ S.fDetailedRangeTarget = MicroProfileLogTickDifference(nTickStart, nTickEnd) * fToMs;
+}
+
+void MicroProfileCenter(int64_t nTickCenter)
+{
+ int64_t nStart = S.Frames[S.nFrameCurrent].nFrameStartCpu;
+ float fToMs = MicroProfileTickToMsMultiplier(MicroProfileTicksPerSecondCpu());
+ float fCenter = MicroProfileLogTickDifference(nStart, nTickCenter) * fToMs;
+ S.fDetailedOffsetTarget = S.fDetailedOffset = fCenter - 0.5f * S.fDetailedRange;
+}
+#if MICROPROFILE_DEBUG
+uint64_t* g_pMicroProfileDumpStart = 0;
+uint64_t* g_pMicroProfileDumpEnd = 0;
+void MicroProfileDebugDumpRange()
+{
+ if(g_pMicroProfileDumpStart != g_pMicroProfileDumpEnd)
+ {
+ uint64_t* pStart = g_pMicroProfileDumpStart;
+ uint64_t* pEnd = g_pMicroProfileDumpEnd;
+ while(pStart != pEnd)
+ {
+ uint64_t nTick = MicroProfileLogGetTick(*pStart);
+ uint64_t nToken = MicroProfileLogTimerIndex(*pStart);
+ uint32_t nTimerId = MicroProfileGetTimerIndex(nToken);
+
+ const char* pTimerName = S.TimerInfo[nTimerId].pName;
+ char buffer[256];
+ int type = MicroProfileLogType(*pStart);
+
+ const char* pBegin = type == MP_LOG_LEAVE ? "END" :
+ (type == MP_LOG_ENTER ? "BEGIN" : "META");
+ snprintf(buffer, 255, "DUMP 0x%p: %s :: %llx: %s\n", pStart, pBegin, nTick, pTimerName);
+#ifdef _WIN32
+ OutputDebugString(buffer);
+#else
+ printf("%s", buffer);
+#endif
+ pStart++;
+ }
+
+ g_pMicroProfileDumpStart = g_pMicroProfileDumpEnd;
+ }
+}
+#define MP_DEBUG_DUMP_RANGE() MicroProfileDebugDumpRange();
+#else
+#define MP_DEBUG_DUMP_RANGE() do{} while(0)
+#endif
+
+#define MICROPROFILE_HOVER_DIST 0.5f
+
+void MicroProfileDrawDetailedContextSwitchBars(uint32_t nY, uint32_t nThreadId, uint32_t nContextSwitchStart, uint32_t nContextSwitchEnd, int64_t nBaseTicks, uint32_t nBaseY)
+{
+ MICROPROFILE_SCOPE(g_MicroProfileContextSwitchDraw);
+ int64_t nTickIn = -1;
+ uint32_t nThreadBefore = -1;
+ float fToMs = MicroProfileTickToMsMultiplier(MicroProfileTicksPerSecondCpu());
+ float fMsToScreen = S.nWidth / S.fDetailedRange;
+ float fMouseX = (float)S.nMouseX;
+ float fMouseY = (float)S.nMouseY;
+
+
+ for(uint32_t j = nContextSwitchStart; j != nContextSwitchEnd; j = (j+1) % MICROPROFILE_CONTEXT_SWITCH_BUFFER_SIZE)
+ {
+ MP_ASSERT(j < MICROPROFILE_CONTEXT_SWITCH_BUFFER_SIZE);
+ MicroProfileContextSwitch CS = S.ContextSwitch[j];
+
+ if(nTickIn == -1)
+ {
+ if(CS.nThreadIn == nThreadId)
+ {
+ nTickIn = CS.nTicks;
+ nThreadBefore = CS.nThreadOut;
+ }
+ }
+ else
+ {
+ if(CS.nThreadOut == nThreadId)
+ {
+ int64_t nTickOut = CS.nTicks;
+ float fMsStart = fToMs * MicroProfileLogTickDifference(nBaseTicks, nTickIn);
+ float fMsEnd = fToMs * MicroProfileLogTickDifference(nBaseTicks, nTickOut);
+ if(fMsStart <= fMsEnd)
+ {
+ float fXStart = fMsStart * fMsToScreen;
+ float fXEnd = fMsEnd * fMsToScreen;
+ float fYStart = (float)nY;
+ float fYEnd = fYStart + (MICROPROFILE_DETAILED_CONTEXT_SWITCH_HEIGHT);
+ uint32_t nColor = g_nMicroProfileContextSwitchThreadColors[CS.nCpu%MICROPROFILE_NUM_CONTEXT_SWITCH_COLORS];
+ float fXDist = MicroProfileMax(fXStart - fMouseX, fMouseX - fXEnd);
+ bool bHover = fXDist < MICROPROFILE_HOVER_DIST && fYStart <= fMouseY && fMouseY <= fYEnd && nBaseY < fMouseY;
+ if(bHover)
+ {
+ S.nRangeBegin = nTickIn; + S.nRangeEnd = nTickOut; + S.nContextSwitchHoverTickIn = nTickIn; + S.nContextSwitchHoverTickOut = nTickOut; + S.nContextSwitchHoverThread = CS.nThreadOut; + S.nContextSwitchHoverThreadBefore = nThreadBefore; + S.nContextSwitchHoverThreadAfter = CS.nThreadIn; + S.nContextSwitchHoverCpuNext = CS.nCpu; + nColor = S.nHoverColor; + } + if(CS.nCpu == S.nContextSwitchHoverCpu) + { + nColor = S.nHoverColorShared; + } + MicroProfileDrawBox(fXStart, fYStart, fXEnd, fYEnd, nColor|S.nOpacityForeground, MicroProfileBoxTypeFlat); + } + nTickIn = -1; + } + } + } +} + +void MicroProfileDrawDetailedBars(uint32_t nWidth, uint32_t nHeight, int nBaseY, int nSelectedFrame) +{ + MP_DEBUG_DUMP_RANGE(); + int nY = nBaseY - S.nOffsetY; + int64_t nNumBoxes = 0; + int64_t nNumLines = 0; + + uint32_t nFrameNext = (S.nFrameCurrent+1) % MICROPROFILE_MAX_FRAME_HISTORY; + MicroProfileFrameState* pFrameCurrent = &S.Frames[S.nFrameCurrent]; + MicroProfileFrameState* pFrameNext = &S.Frames[nFrameNext]; + + S.nRangeBegin = 0; + S.nRangeEnd = 0; + S.nRangeBeginGpu = 0; + S.nRangeEndGpu = 0; + S.nRangeBeginIndex = S.nRangeEndIndex = 0; + S.pRangeLog = 0; + uint64_t nFrameStartCpu = pFrameCurrent->nFrameStartCpu; + uint64_t nFrameStartGpu = pFrameCurrent->nFrameStartGpu; + float fToMsCpu = MicroProfileTickToMsMultiplier(MicroProfileTicksPerSecondCpu()); + float fToMsGpu = MicroProfileTickToMsMultiplier(MicroProfileTicksPerSecondGpu()); + + float fDetailedOffset = S.fDetailedOffset; + float fDetailedRange = S.fDetailedRange; + int64_t nDetailedOffsetTicksCpu = MicroProfileMsToTick(fDetailedOffset, MicroProfileTicksPerSecondCpu()); + int64_t nDetailedOffsetTicksGpu = MicroProfileMsToTick(fDetailedOffset, MicroProfileTicksPerSecondGpu()); + int64_t nBaseTicksCpu = nDetailedOffsetTicksCpu + nFrameStartCpu; + int64_t nBaseTicksGpu = nDetailedOffsetTicksGpu + nFrameStartGpu; + int64_t nBaseTicksEndCpu = nBaseTicksCpu + MicroProfileMsToTick(fDetailedRange, MicroProfileTicksPerSecondCpu()); + + MicroProfileFrameState* pFrameFirst = pFrameCurrent; + int64_t nGapTime = MicroProfileTicksPerSecondCpu() * MICROPROFILE_GAP_TIME / 1000; + for(uint32_t i = 0; i < MICROPROFILE_MAX_FRAME_HISTORY - MICROPROFILE_GPU_FRAME_DELAY; ++i) + { + uint32_t nNextIndex = (S.nFrameCurrent + MICROPROFILE_MAX_FRAME_HISTORY - i) % MICROPROFILE_MAX_FRAME_HISTORY; + pFrameFirst = &S.Frames[nNextIndex]; + if(pFrameFirst->nFrameStartCpu <= nBaseTicksCpu-nGapTime) + break; + } + + float fMsBase = fToMsCpu * nDetailedOffsetTicksCpu; + float fMs = fDetailedRange; + float fMsEnd = fMs + fMsBase; + float fWidth = (float)nWidth; + float fMsToScreen = fWidth / fMs; + + { + float fRate = floor(2*(log10(fMs)-1))/2; + float fStep = powf(10.f, fRate); + float fRcpStep = 1.f / fStep; + int nColorIndex = (int)(floor(fMsBase*fRcpStep)); + float fStart = floor(fMsBase*fRcpStep) * fStep; + for(float f = fStart; f < fMsEnd; ) + { + float fStart = f; + float fNext = f + fStep; + MicroProfileDrawBox(((fStart-fMsBase) * fMsToScreen), nBaseY, (fNext-fMsBase) * fMsToScreen+1, nBaseY + nHeight, S.nOpacityBackground | g_nMicroProfileBackColors[nColorIndex++ & 1]); + f = fNext; + } + } + + nY += MICROPROFILE_TEXT_HEIGHT+1; + MicroProfileLogEntry* pMouseOver = S.pDisplayMouseOver; + MicroProfileLogEntry* pMouseOverNext = 0; + uint64_t nMouseOverToken = pMouseOver ? 
MicroProfileLogTimerIndex(*pMouseOver) : MICROPROFILE_INVALID_TOKEN; + float fMouseX = (float)S.nMouseX; + float fMouseY = (float)S.nMouseY; + uint64_t nHoverToken = MICROPROFILE_INVALID_TOKEN; + int64_t nHoverTime = 0; + + static int nHoverCounter = 155; + static int nHoverCounterDelta = 10; + nHoverCounter += nHoverCounterDelta; + if(nHoverCounter >= 245) + nHoverCounterDelta = -10; + else if(nHoverCounter < 100) + nHoverCounterDelta = 10; + S.nHoverColor = (nHoverCounter<<24)|(nHoverCounter<<16)|(nHoverCounter<<8)|nHoverCounter; + uint32_t nHoverCounterShared = nHoverCounter>>2; + S.nHoverColorShared = (nHoverCounterShared<<24)|(nHoverCounterShared<<16)|(nHoverCounterShared<<8)|nHoverCounterShared; + + uint32_t nLinesDrawn[MICROPROFILE_STACK_MAX]={0}; + + uint32_t nContextSwitchHoverThreadAfter = S.nContextSwitchHoverThreadAfter; + uint32_t nContextSwitchHoverThreadBefore = S.nContextSwitchHoverThreadBefore; + S.nContextSwitchHoverThread = S.nContextSwitchHoverThreadAfter = S.nContextSwitchHoverThreadBefore = -1; + + uint32_t nContextSwitchStart = -1; + uint32_t nContextSwitchEnd = -1; + S.nContextSwitchHoverCpuNext = 0xff; + S.nContextSwitchHoverTickIn = -1; + S.nContextSwitchHoverTickOut = -1; + if(S.bContextSwitchRunning) + { + MICROPROFILE_SCOPE(g_MicroProfileContextSwitchSearch); + uint32_t nContextSwitchPut = S.nContextSwitchPut; + nContextSwitchStart = nContextSwitchEnd = (nContextSwitchPut + MICROPROFILE_CONTEXT_SWITCH_BUFFER_SIZE - 1) % MICROPROFILE_CONTEXT_SWITCH_BUFFER_SIZE; + int64_t nSearchEnd = nBaseTicksEndCpu + MicroProfileMsToTick(30.f, MicroProfileTicksPerSecondCpu()); + int64_t nSearchBegin = nBaseTicksCpu - MicroProfileMsToTick(30.f, MicroProfileTicksPerSecondCpu()); + for(uint32_t i = 0; i < MICROPROFILE_CONTEXT_SWITCH_BUFFER_SIZE; ++i) + { + uint32_t nIndex = (nContextSwitchPut + MICROPROFILE_CONTEXT_SWITCH_BUFFER_SIZE - (i+1)) % MICROPROFILE_CONTEXT_SWITCH_BUFFER_SIZE; + MicroProfileContextSwitch& CS = S.ContextSwitch[nIndex]; + if(CS.nTicks > nSearchEnd) + { + nContextSwitchEnd = nIndex; + } + if(CS.nTicks > nSearchBegin) + { + nContextSwitchStart = nIndex; + } + } + } + + bool bSkipBarView = S.bContextSwitchRunning && S.bContextSwitchNoBars; + + if(!bSkipBarView) + { + for(uint32_t i = 0; i < MICROPROFILE_MAX_THREADS; ++i) + { + MicroProfileThreadLog* pLog = S.Pool[i]; + if(!pLog) + continue; + + uint32_t nPut = pFrameNext->nLogStart[i]; + ///note: this may display new samples as old data, but this will only happen when + // unpaused, where the detailed view is hardly perceptible + uint32_t nFront = S.Pool[i]->nPut.load(std::memory_order_relaxed); + MicroProfileFrameState* pFrameLogFirst = pFrameCurrent; + MicroProfileFrameState* pFrameLogLast = pFrameNext; + uint32_t nGet = pFrameLogFirst->nLogStart[i]; + do + { + MP_ASSERT(pFrameLogFirst >= &S.Frames[0] && pFrameLogFirst < &S.Frames[MICROPROFILE_MAX_FRAME_HISTORY]); + uint32_t nNewGet = pFrameLogFirst->nLogStart[i]; + bool bIsValid = false; + if(nPut < nFront) + { + bIsValid = nNewGet <= nPut || nNewGet >= nFront; + } + else + { + bIsValid = nNewGet <= nPut && nNewGet >= nFront; + } + if(bIsValid) + { + nGet = nNewGet; + if(pFrameLogFirst->nFrameStartCpu > nBaseTicksEndCpu) + { + pFrameLogLast = pFrameLogFirst;//pick the last frame that ends after + } + + + pFrameLogFirst--; + if(pFrameLogFirst < &S.Frames[0]) + pFrameLogFirst = &S.Frames[MICROPROFILE_MAX_FRAME_HISTORY-1]; + } + else + { + break; + } + }while(pFrameLogFirst != pFrameFirst); + + + if(nGet == (uint32_t)-1) + continue; + MP_ASSERT(nGet != 
(uint32_t)-1); + + nPut = pFrameLogLast->nLogStart[i]; + + uint32_t nRange[2][2] = { {0, 0}, {0, 0}, }; + + MicroProfileGetRange(nPut, nGet, nRange); + if(nPut == nGet) + continue; + if(0==S.nThreadActive[i] && 0==S.nMenuAllThreads) + continue; + uint32_t nMaxStackDepth = 0; + + bool bGpu = pLog->nGpu != 0; + float fToMs = bGpu ? fToMsGpu : fToMsCpu; + int64_t nBaseTicks = bGpu ? nBaseTicksGpu : nBaseTicksCpu; + char ThreadName[MicroProfileThreadLog::THREAD_MAX_LEN + 16]; + uint64_t nThreadId = pLog->nThreadId; + snprintf(ThreadName, sizeof(ThreadName)-1, "%04llx: %s", nThreadId, &pLog->ThreadName[0] ); + nY += 3; + uint32_t nThreadColor = -1; + if(pLog->nThreadId == nContextSwitchHoverThreadAfter || pLog->nThreadId == nContextSwitchHoverThreadBefore) + nThreadColor = S.nHoverColorShared|0x906060; + MicroProfileDrawText(0, nY, nThreadColor, &ThreadName[0], strlen(&ThreadName[0])); + nY += 3; + nY += MICROPROFILE_TEXT_HEIGHT + 1; + + if(S.bContextSwitchRunning) + { + MicroProfileDrawDetailedContextSwitchBars(nY, pLog->nThreadId, nContextSwitchStart, nContextSwitchEnd, nBaseTicks, nBaseY); + nY -= MICROPROFILE_DETAILED_BAR_HEIGHT; + nY += MICROPROFILE_DETAILED_CONTEXT_SWITCH_HEIGHT+1; + } + + uint32_t nYDelta = MICROPROFILE_DETAILED_BAR_HEIGHT; + uint32_t nStack[MICROPROFILE_STACK_MAX]; + uint32_t nStackPos = 0; + for(uint32_t j = 0; j < 2; ++j) + { + uint32_t nStart = nRange[j][0]; + uint32_t nEnd = nRange[j][1]; + for(uint32_t k = nStart; k < nEnd; ++k) + { + MicroProfileLogEntry* pEntry = pLog->Log + k; + int nType = MicroProfileLogType(*pEntry); + if(MP_LOG_ENTER == nType) + { + MP_ASSERT(nStackPos < MICROPROFILE_STACK_MAX); + nStack[nStackPos++] = k; + } + else if(MP_LOG_META == nType) + { + + } + else if(MP_LOG_LEAVE == nType) + { + if(0 == nStackPos) + { + continue; + } + + MicroProfileLogEntry* pEntryEnter = pLog->Log + nStack[nStackPos-1]; + if(MicroProfileLogTimerIndex(*pEntryEnter) != MicroProfileLogTimerIndex(*pEntry)) + { + //uprintf("mismatch %llx %llx\n", pEntryEnter->nToken, pEntry->nToken); + continue; + } + int64_t nTickStart = MicroProfileLogGetTick(*pEntryEnter); + int64_t nTickEnd = MicroProfileLogGetTick(*pEntry); + uint64_t nTimerIndex = MicroProfileLogTimerIndex(*pEntry); + uint32_t nColor = S.TimerInfo[nTimerIndex].nColor; + if(nMouseOverToken == nTimerIndex) + { + if(pEntry == pMouseOver) + { + nColor = S.nHoverColor; + if(bGpu) + { + S.nRangeBeginGpu = *pEntryEnter; + S.nRangeEndGpu = *pEntry; + S.nRangeBeginIndex = nStack[nStackPos-1]; + S.nRangeEndIndex = k; + S.pRangeLog = pLog; + } + else + { + S.nRangeBegin = *pEntryEnter; + S.nRangeEnd = *pEntry; + S.nRangeBeginIndex = nStack[nStackPos-1]; + S.nRangeEndIndex = k; + S.pRangeLog = pLog; + + } + } + else + { + nColor = S.nHoverColorShared; + } + } + + nMaxStackDepth = MicroProfileMax(nMaxStackDepth, nStackPos); + float fMsStart = fToMs * MicroProfileLogTickDifference(nBaseTicks, nTickStart); + float fMsEnd = fToMs * MicroProfileLogTickDifference(nBaseTicks, nTickEnd); + MP_ASSERT(fMsStart <= fMsEnd); + float fXStart = fMsStart * fMsToScreen; + float fXEnd = fMsEnd * fMsToScreen; + float fYStart = (float)(nY + nStackPos * nYDelta); + float fYEnd = fYStart + (MICROPROFILE_DETAILED_BAR_HEIGHT); + float fXDist = MicroProfileMax(fXStart - fMouseX, fMouseX - fXEnd); + bool bHover = fXDist < MICROPROFILE_HOVER_DIST && fYStart <= fMouseY && fMouseY <= fYEnd && nBaseY < fMouseY; + uint32_t nIntegerWidth = (uint32_t)(fXEnd - fXStart); + if(nIntegerWidth) + { + if(bHover && S.nActiveMenu == -1) + { + nHoverToken = 
MicroProfileLogTimerIndex(*pEntry); + #if MICROPROFILE_DEBUG + S.nHoverAddressEnter = (uint64_t)pEntryEnter; + S.nHoverAddressLeave = (uint64_t)pEntry; + #endif + nHoverTime = MicroProfileLogTickDifference(nTickStart, nTickEnd); + pMouseOverNext = pEntry; + } + + MicroProfileDrawBox(fXStart, fYStart, fXEnd, fYEnd, nColor|S.nOpacityForeground, MicroProfileBoxTypeBar); +#if MICROPROFILE_DETAILED_BAR_NAMES + if(nIntegerWidth>3*MICROPROFILE_TEXT_WIDTH) + { + int nCharacters = (nIntegerWidth - 2*MICROPROFILE_TEXT_WIDTH) / MICROPROFILE_TEXT_WIDTH; + MicroProfileDrawText(fXStart+1, fYStart+1, -1, S.TimerInfo[nTimerIndex].pName, MicroProfileMin(S.TimerInfo[nTimerIndex].nNameLen, nCharacters)); + } +#endif + ++nNumBoxes; + } + else + { + float fXAvg = 0.5f * (fXStart + fXEnd); + int nLineX = (int)floor(fXAvg+0.5f); + if(nLineX != (int)nLinesDrawn[nStackPos]) + { + if(bHover && S.nActiveMenu == -1) + { + nHoverToken = (uint32_t)MicroProfileLogTimerIndex(*pEntry); + nHoverTime = MicroProfileLogTickDifference(nTickStart, nTickEnd); + pMouseOverNext = pEntry; + } + nLinesDrawn[nStackPos] = nLineX; + MicroProfileDrawLineVertical(nLineX, fYStart + 0.5f, fYEnd + 0.5f, nColor|S.nOpacityForeground); + ++nNumLines; + } + } + nStackPos--; + } + } + } + nY += nMaxStackDepth * nYDelta + MICROPROFILE_DETAILED_BAR_HEIGHT+1; + } + } + if(S.bContextSwitchRunning && (S.bContextSwitchAllThreads||S.bContextSwitchNoBars)) + { + uint32_t nNumThreads = 0; + uint32_t nThreads[MICROPROFILE_MAX_CONTEXT_SWITCH_THREADS]; + for(uint32_t i = 0; i < MICROPROFILE_MAX_THREADS && S.Pool[i]; ++i) + nThreads[nNumThreads++] = S.Pool[i]->nThreadId; + uint32_t nNumThreadsBase = nNumThreads; + if(S.bContextSwitchAllThreads) + { + for(uint32_t i = nContextSwitchStart; i != nContextSwitchEnd; i = (i+1) % MICROPROFILE_CONTEXT_SWITCH_BUFFER_SIZE) + { + MicroProfileContextSwitch CS = S.ContextSwitch[i]; + uint32_t nThreadId = CS.nThreadIn; + if(nThreadId) + { + bool bSeen = false; + for(uint32_t j = 0; j < nNumThreads; ++j) + { + if(nThreads[j] == nThreadId) + { + bSeen = true; + break; + } + } + if(!bSeen) + { + nThreads[nNumThreads++] = nThreadId; + } + } + if(nNumThreads == MICROPROFILE_MAX_CONTEXT_SWITCH_THREADS) + { + S.nOverflow = 10; + break; + } + } + std::sort(&nThreads[nNumThreadsBase], &nThreads[nNumThreads]); + } + uint32_t nStart = nNumThreadsBase; + if(S.bContextSwitchNoBars) + nStart = 0; + for(uint32_t i = nStart; i < nNumThreads; ++i) + { + uint32_t nThreadId = nThreads[i]; + if(nThreadId) + { + char ThreadName[MicroProfileThreadLog::THREAD_MAX_LEN + 16]; + const char* cLocal = MicroProfileIsLocalThread(nThreadId) ? "*": " "; + int nStrLen = snprintf(ThreadName, sizeof(ThreadName)-1, "%04x: %s", nThreadId, i < nNumThreadsBase ? 
&S.Pool[i]->ThreadName[0] : cLocal ); + uint32_t nThreadColor = -1; + if(nThreadId == nContextSwitchHoverThreadAfter || nThreadId == nContextSwitchHoverThreadBefore) + nThreadColor = S.nHoverColorShared|0x906060; + MicroProfileDrawDetailedContextSwitchBars(nY+2, nThreadId, nContextSwitchStart, nContextSwitchEnd, nBaseTicksCpu, nBaseY); + MicroProfileDrawText(0, nY, nThreadColor, &ThreadName[0], nStrLen); + nY += MICROPROFILE_TEXT_HEIGHT+1; + } + } + } + + S.nContextSwitchHoverCpu = S.nContextSwitchHoverCpuNext; + + + + + S.pDisplayMouseOver = pMouseOverNext; + + if(!S.nRunning) + { + if(nHoverToken != MICROPROFILE_INVALID_TOKEN && nHoverTime) + { + S.nHoverToken = nHoverToken; + S.nHoverTime = nHoverTime; + } + + if(nSelectedFrame != -1) + { + S.nRangeBegin = S.Frames[nSelectedFrame].nFrameStartCpu; + S.nRangeEnd = S.Frames[(nSelectedFrame+1)%MICROPROFILE_MAX_FRAME_HISTORY].nFrameStartCpu; + S.nRangeBeginGpu = S.Frames[nSelectedFrame].nFrameStartGpu; + S.nRangeEndGpu = S.Frames[(nSelectedFrame+1)%MICROPROFILE_MAX_FRAME_HISTORY].nFrameStartGpu; + } + if(S.nRangeBegin != S.nRangeEnd) + { + float fMsStart = fToMsCpu * MicroProfileLogTickDifference(nBaseTicksCpu, S.nRangeBegin); + float fMsEnd = fToMsCpu * MicroProfileLogTickDifference(nBaseTicksCpu, S.nRangeEnd); + float fXStart = fMsStart * fMsToScreen; + float fXEnd = fMsEnd * fMsToScreen; + MicroProfileDrawBox(fXStart, nBaseY, fXEnd, nHeight, MICROPROFILE_FRAME_COLOR_HIGHTLIGHT, MicroProfileBoxTypeFlat); + MicroProfileDrawLineVertical(fXStart, nBaseY, nHeight, MICROPROFILE_FRAME_COLOR_HIGHTLIGHT | 0x44000000); + MicroProfileDrawLineVertical(fXEnd, nBaseY, nHeight, MICROPROFILE_FRAME_COLOR_HIGHTLIGHT | 0x44000000); + + fMsStart += fDetailedOffset; + fMsEnd += fDetailedOffset; + char sBuffer[32]; + uint32_t nLenStart = snprintf(sBuffer, sizeof(sBuffer)-1, "%.2fms", fMsStart); + float fStartTextWidth = (float)((1+MICROPROFILE_TEXT_WIDTH) * nLenStart); + float fStartTextX = fXStart - fStartTextWidth - 2; + MicroProfileDrawBox(fStartTextX, nBaseY, fStartTextX + fStartTextWidth + 2, MICROPROFILE_TEXT_HEIGHT + 2 + nBaseY, 0x33000000, MicroProfileBoxTypeFlat); + MicroProfileDrawText(fStartTextX+1, nBaseY, (uint32_t)-1, sBuffer, nLenStart); + uint32_t nLenEnd = snprintf(sBuffer, sizeof(sBuffer)-1, "%.2fms", fMsEnd); + MicroProfileDrawBox(fXEnd+1, nBaseY, fXEnd+1+(1+MICROPROFILE_TEXT_WIDTH) * nLenEnd + 3, MICROPROFILE_TEXT_HEIGHT + 2 + nBaseY, 0x33000000, MicroProfileBoxTypeFlat); + MicroProfileDrawText(fXEnd+2, nBaseY+1, (uint32_t)-1, sBuffer, nLenEnd); + + if(S.nMouseRight) + { + MicroProfileZoomTo(S.nRangeBegin, S.nRangeEnd); + } + } + + if(S.nRangeBeginGpu != S.nRangeEndGpu) + { + float fMsStart = fToMsGpu * MicroProfileLogTickDifference(nBaseTicksGpu, S.nRangeBeginGpu); + float fMsEnd = fToMsGpu * MicroProfileLogTickDifference(nBaseTicksGpu, S.nRangeEndGpu); + float fXStart = fMsStart * fMsToScreen; + float fXEnd = fMsEnd * fMsToScreen; + MicroProfileDrawBox(fXStart, nBaseY, fXEnd, nHeight, MICROPROFILE_FRAME_COLOR_HIGHTLIGHT_GPU, MicroProfileBoxTypeFlat); + MicroProfileDrawLineVertical(fXStart, nBaseY, nHeight, MICROPROFILE_FRAME_COLOR_HIGHTLIGHT_GPU | 0x44000000); + MicroProfileDrawLineVertical(fXEnd, nBaseY, nHeight, MICROPROFILE_FRAME_COLOR_HIGHTLIGHT_GPU | 0x44000000); + + nBaseY += MICROPROFILE_TEXT_HEIGHT+1; + + fMsStart += fDetailedOffset; + fMsEnd += fDetailedOffset; + char sBuffer[32]; + uint32_t nLenStart = snprintf(sBuffer, sizeof(sBuffer)-1, "%.2fms", fMsStart); + float fStartTextWidth = (float)((1+MICROPROFILE_TEXT_WIDTH) * 
nLenStart);
+ float fStartTextX = fXStart - fStartTextWidth - 2;
+ MicroProfileDrawBox(fStartTextX, nBaseY, fStartTextX + fStartTextWidth + 2, MICROPROFILE_TEXT_HEIGHT + 2 + nBaseY, 0x33000000, MicroProfileBoxTypeFlat);
+ MicroProfileDrawText(fStartTextX+1, nBaseY, (uint32_t)-1, sBuffer, nLenStart);
+ uint32_t nLenEnd = snprintf(sBuffer, sizeof(sBuffer)-1, "%.2fms", fMsEnd);
+ MicroProfileDrawBox(fXEnd+1, nBaseY, fXEnd+1+(1+MICROPROFILE_TEXT_WIDTH) * nLenEnd + 3, MICROPROFILE_TEXT_HEIGHT + 2 + nBaseY, 0x33000000, MicroProfileBoxTypeFlat);
+ MicroProfileDrawText(fXEnd+2, nBaseY+1, (uint32_t)-1, sBuffer, nLenEnd);
+ }
+ }
+}
+
+
+void MicroProfileDrawDetailedFrameHistory(uint32_t nWidth, uint32_t nHeight, uint32_t nBaseY, uint32_t nSelectedFrame)
+{
+ const uint32_t nBarHeight = MICROPROFILE_FRAME_HISTORY_HEIGHT;
+ float fBaseX = (float)nWidth;
+ float fDx = fBaseX / MICROPROFILE_NUM_FRAMES;
+
+ uint32_t nLastIndex = (S.nFrameCurrent+1) % MICROPROFILE_MAX_FRAME_HISTORY;
+ MicroProfileDrawBox(0, nBaseY, nWidth, nBaseY+MICROPROFILE_FRAME_HISTORY_HEIGHT, 0xff000000 | g_nMicroProfileBackColors[0], MicroProfileBoxTypeFlat);
+ float fToMs = MicroProfileTickToMsMultiplier(MicroProfileTicksPerSecondCpu()) * S.fRcpReferenceTime;
+ float fToMsGpu = MicroProfileTickToMsMultiplier(MicroProfileTicksPerSecondGpu()) * S.fRcpReferenceTime;
+
+
+ MicroProfileFrameState* pFrameCurrent = &S.Frames[S.nFrameCurrent];
+ uint64_t nFrameStartCpu = pFrameCurrent->nFrameStartCpu;
+ int64_t nDetailedOffsetTicksCpu = MicroProfileMsToTick(S.fDetailedOffset, MicroProfileTicksPerSecondCpu());
+ int64_t nCpuStart = nDetailedOffsetTicksCpu + nFrameStartCpu;
+ int64_t nCpuEnd = nCpuStart + MicroProfileMsToTick(S.fDetailedRange, MicroProfileTicksPerSecondCpu());
+
+
+ float fSelectionStart = (float)nWidth;
+ float fSelectionEnd = 0.f;
+ for(uint32_t i = 0; i < MICROPROFILE_NUM_FRAMES; ++i)
+ {
+ uint32_t nIndex = (S.nFrameCurrent + MICROPROFILE_MAX_FRAME_HISTORY - i) % MICROPROFILE_MAX_FRAME_HISTORY;
+ MicroProfileFrameState* pCurrent = &S.Frames[nIndex];
+ MicroProfileFrameState* pNext = &S.Frames[nLastIndex];
+
+ int64_t nTicks = pNext->nFrameStartCpu - pCurrent->nFrameStartCpu;
+ int64_t nTicksGpu = pNext->nFrameStartGpu - pCurrent->nFrameStartGpu;
+ float fScale = fToMs * nTicks;
+ float fScaleGpu = fToMsGpu * nTicksGpu;
+ fScale = fScale > 1.f ? 0.f : 1.f - fScale;
+ fScaleGpu = fScaleGpu > 1.f ? 0.f : 1.f - fScaleGpu;
+ float fXEnd = fBaseX;
+ float fXStart = fBaseX - fDx;
+ fBaseX = fXStart;
+ uint32_t nColor = MICROPROFILE_FRAME_HISTORY_COLOR_CPU;
+ if(nIndex == nSelectedFrame)
+ nColor = (uint32_t)-1;
+ MicroProfileDrawBox(fXStart, nBaseY + fScale * nBarHeight, fXEnd, nBaseY+MICROPROFILE_FRAME_HISTORY_HEIGHT, nColor, MicroProfileBoxTypeBar);
+ if(pNext->nFrameStartCpu > nCpuStart)
+ {
+ fSelectionStart = fXStart;
+ }
+ if(pCurrent->nFrameStartCpu < nCpuEnd && fSelectionEnd == 0.f)
+ {
+ fSelectionEnd = fXEnd;
+ }
+ nLastIndex = nIndex;
+ }
+ MicroProfileDrawBox(fSelectionStart, nBaseY, fSelectionEnd, nBaseY+MICROPROFILE_FRAME_HISTORY_HEIGHT, MICROPROFILE_FRAME_HISTORY_COLOR_HIGHTLIGHT, MicroProfileBoxTypeFlat);
+}
+void MicroProfileDrawDetailedView(uint32_t nWidth, uint32_t nHeight)
+{
+ MICROPROFILE_SCOPE(g_MicroProfileDetailed);
+ uint32_t nBaseY = S.nBarHeight + 1;
+
+ int nSelectedFrame = -1;
+ if(S.nMouseY > nBaseY && S.nMouseY <= nBaseY + MICROPROFILE_FRAME_HISTORY_HEIGHT && S.nActiveMenu == -1)
+ {
+
+ nSelectedFrame = ((MICROPROFILE_NUM_FRAMES) * (S.nWidth-S.nMouseX) / S.nWidth);
+ nSelectedFrame = (S.nFrameCurrent + MICROPROFILE_MAX_FRAME_HISTORY - nSelectedFrame) % MICROPROFILE_MAX_FRAME_HISTORY;
+ S.nHoverFrame = nSelectedFrame;
+ if(S.nMouseRight)
+ {
+ int64_t nRangeBegin = S.Frames[nSelectedFrame].nFrameStartCpu;
+ int64_t nRangeEnd = S.Frames[(nSelectedFrame+1)%MICROPROFILE_MAX_FRAME_HISTORY].nFrameStartCpu;
+ MicroProfileZoomTo(nRangeBegin, nRangeEnd);
+ }
+ if(S.nMouseDownLeft)
+ {
+ uint64_t nFrac = (1024 * (MICROPROFILE_NUM_FRAMES) * (S.nMouseX) / S.nWidth) % 1024;
+ int64_t nRangeBegin = S.Frames[nSelectedFrame].nFrameStartCpu;
+ int64_t nRangeEnd = S.Frames[(nSelectedFrame+1)%MICROPROFILE_MAX_FRAME_HISTORY].nFrameStartCpu;
+ MicroProfileCenter(nRangeBegin + (nRangeEnd-nRangeBegin) * nFrac / 1024);
+ }
+ }
+ else
+ {
+ S.nHoverFrame = -1;
+ }
+
+ MicroProfileDrawDetailedBars(nWidth, nHeight, nBaseY + MICROPROFILE_FRAME_HISTORY_HEIGHT, nSelectedFrame);
+ MicroProfileDrawDetailedFrameHistory(nWidth, nHeight, nBaseY, nSelectedFrame);
+}
+
+template<typename T>
+void MicroProfileLoopActiveGroupsDraw(int32_t nX, int32_t nY, const char* pName, T CB)
+{
+ if(pName)
+ MicroProfileDrawText(nX, nY, (uint32_t)-1, pName, strlen(pName));
+
+ nY += S.nBarHeight + 2;
+ uint64_t nGroup = S.nActiveGroup = S.nMenuAllGroups ? S.nGroupMask : S.nMenuActiveGroup;
+ uint32_t nCount = 0;
+ for(uint32_t j = 0; j < MICROPROFILE_MAX_GROUPS; ++j)
+ {
+ uint64_t nMask = 1ll << j;
+ if(nMask & nGroup)
+ {
+ nY += S.nBarHeight + 1;
+ for(uint32_t i = 0; i < S.nTotalTimers;++i)
+ {
+ uint64_t nTokenMask = MicroProfileGetGroupMask(S.TimerInfo[i].nToken);
+ if(nTokenMask & nMask)
+ {
+ if(nY >= 0)
+ CB(i, nCount, nMask, nX, nY);
+
+ nCount += 2;
+ nY += S.nBarHeight + 1;
+
+ if(nY > (int)S.nHeight)
+ return;
+ }
+ }
+
+ }
+ }
+}
+
+
+void MicroProfileCalcTimers(float* pTimers, float* pAverage, float* pMax, float* pCallAverage, float* pExclusive, float* pAverageExclusive, float* pMaxExclusive, uint64_t nGroup, uint32_t nSize)
+{
+ uint32_t nCount = 0;
+ uint64_t nMask = 1;
+
+ for(uint32_t j = 0; j < MICROPROFILE_MAX_GROUPS; ++j)
+ {
+ if(nMask & nGroup)
+ {
+ const float fToMs = MicroProfileTickToMsMultiplier(S.GroupInfo[j].Type == MicroProfileTokenTypeGpu ? 
MicroProfileTicksPerSecondGpu() : MicroProfileTicksPerSecondCpu()); + for(uint32_t i = 0; i < S.nTotalTimers;++i) + { + uint64_t nTokenMask = MicroProfileGetGroupMask(S.TimerInfo[i].nToken); + if(nTokenMask & nMask) + { + { + uint32_t nTimer = i; + uint32_t nIdx = nCount; + uint32_t nAggregateFrames = S.nAggregateFrames ? S.nAggregateFrames : 1; + uint32_t nAggregateCount = S.Aggregate[nTimer].nCount ? S.Aggregate[nTimer].nCount : 1; + float fToPrc = S.fRcpReferenceTime; + float fMs = fToMs * (S.Frame[nTimer].nTicks); + float fPrc = MicroProfileMin(fMs * fToPrc, 1.f); + float fAverageMs = fToMs * (S.Aggregate[nTimer].nTicks / nAggregateFrames); + float fAveragePrc = MicroProfileMin(fAverageMs * fToPrc, 1.f); + float fMaxMs = fToMs * (S.AggregateMax[nTimer]); + float fMaxPrc = MicroProfileMin(fMaxMs * fToPrc, 1.f); + float fCallAverageMs = fToMs * (S.Aggregate[nTimer].nTicks / nAggregateCount); + float fCallAveragePrc = MicroProfileMin(fCallAverageMs * fToPrc, 1.f); + float fMsExclusive = fToMs * (S.FrameExclusive[nTimer]); + float fPrcExclusive = MicroProfileMin(fMsExclusive * fToPrc, 1.f); + float fAverageMsExclusive = fToMs * (S.AggregateExclusive[nTimer] / nAggregateFrames); + float fAveragePrcExclusive = MicroProfileMin(fAverageMsExclusive * fToPrc, 1.f); + float fMaxMsExclusive = fToMs * (S.AggregateMaxExclusive[nTimer]); + float fMaxPrcExclusive = MicroProfileMin(fMaxMsExclusive * fToPrc, 1.f); + pTimers[nIdx] = fMs; + pTimers[nIdx+1] = fPrc; + pAverage[nIdx] = fAverageMs; + pAverage[nIdx+1] = fAveragePrc; + pMax[nIdx] = fMaxMs; + pMax[nIdx+1] = fMaxPrc; + pCallAverage[nIdx] = fCallAverageMs; + pCallAverage[nIdx+1] = fCallAveragePrc; + pExclusive[nIdx] = fMsExclusive; + pExclusive[nIdx+1] = fPrcExclusive; + pAverageExclusive[nIdx] = fAverageMsExclusive; + pAverageExclusive[nIdx+1] = fAveragePrcExclusive; + pMaxExclusive[nIdx] = fMaxMsExclusive; + pMaxExclusive[nIdx+1] = fMaxPrcExclusive; + } + nCount += 2; + } + } + } + nMask <<= 1ll; + } +} + +#define SBUF_MAX 32 + +uint32_t MicroProfileDrawBarArray(int32_t nX, int32_t nY, float* pTimers, const char* pName, uint32_t nTotalHeight) +{ + const uint32_t nHeight = S.nBarHeight; + const uint32_t nWidth = S.nBarWidth; + const uint32_t nTextWidth = 6 * (1+MICROPROFILE_TEXT_WIDTH); + const float fWidth = (float)S.nBarWidth; + + MicroProfileDrawLineVertical(nX-5, nY, nTotalHeight, S.nOpacityBackground|g_nMicroProfileBackColors[0]|g_nMicroProfileBackColors[1]); + + MicroProfileLoopActiveGroupsDraw(nX, nY, pName, + [=](uint32_t nTimer, uint32_t nIdx, uint64_t nGroupMask, uint32_t nX, uint32_t nY){ + char sBuffer[SBUF_MAX]; + int nLen = snprintf(sBuffer, SBUF_MAX-1, "%5.2f", pTimers[nIdx]); + MicroProfileDrawBox(nX + nTextWidth, nY, nX + nTextWidth + fWidth * pTimers[nIdx+1], nY + nHeight, S.nOpacityForeground|S.TimerInfo[nTimer].nColor, MicroProfileBoxTypeBar); + MicroProfileDrawText(nX, nY, (uint32_t)-1, sBuffer, nLen); + }); + return nWidth + 5 + nTextWidth; + +} + +uint32_t MicroProfileDrawBarCallCount(int32_t nX, int32_t nY, const char* pName) +{ + MicroProfileLoopActiveGroupsDraw(nX, nY, pName, + [](uint32_t nTimer, uint32_t nIdx, uint64_t nGroupMask, uint32_t nX, uint32_t nY){ + char sBuffer[SBUF_MAX]; + int nLen = snprintf(sBuffer, SBUF_MAX-1, "%5d", S.Frame[nTimer].nCount);//fix + MicroProfileDrawText(nX, nY, (uint32_t)-1, sBuffer, nLen); + }); + uint32_t nTextWidth = 6 * MICROPROFILE_TEXT_WIDTH; + return 5 + nTextWidth; +} + +uint32_t MicroProfileDrawBarMetaCount(int32_t nX, int32_t nY, uint64_t* pCounters, const char* pName, uint32_t 
nTotalHeight) +{ + MicroProfileDrawLineVertical(nX-5, nY, nTotalHeight, S.nOpacityBackground|g_nMicroProfileBackColors[0]|g_nMicroProfileBackColors[1]); + uint32_t nTextWidth = (1+MICROPROFILE_TEXT_WIDTH) * MicroProfileMax(6, strlen(pName)); + + + MicroProfileLoopActiveGroupsDraw(nX, nY, pName, + [=](uint32_t nTimer, uint32_t nIdx, uint64_t nGroupMask, uint32_t nX, uint32_t nY){ + char sBuffer[SBUF_MAX]; + int nLen = snprintf(sBuffer, SBUF_MAX-1, "%5llu", pCounters[nTimer]); + MicroProfileDrawText(nX + nTextWidth - nLen * (MICROPROFILE_TEXT_WIDTH+1), nY, (uint32_t)-1, sBuffer, nLen); + }); + return 5 + nTextWidth; +} + + +uint32_t MicroProfileDrawBarLegend(int32_t nX, int32_t nY, uint32_t nTotalHeight) +{ + MicroProfileDrawLineVertical(nX-5, nY, nTotalHeight, S.nOpacityBackground | g_nMicroProfileBackColors[0]|g_nMicroProfileBackColors[1]); + MicroProfileLoopActiveGroupsDraw(nX, nY, 0, + [](uint32_t nTimer, uint32_t nIdx, uint64_t nGroupMask, uint32_t nX, uint32_t nY){ + MicroProfileDrawText(nX, nY, S.TimerInfo[nTimer].nColor, S.TimerInfo[nTimer].pName, strlen(S.TimerInfo[nTimer].pName)); + if(S.nMouseY >= nY && S.nMouseY < nY + MICROPROFILE_TEXT_HEIGHT+1 && S.nMouseX < nX + 20 * (MICROPROFILE_TEXT_WIDTH+1)) + { + S.nHoverToken = nTimer; + S.nHoverTime = 0; + } + }); + return nX; +} + +bool MicroProfileDrawGraph(uint32_t nScreenWidth, uint32_t nScreenHeight) +{ + MICROPROFILE_SCOPE(g_MicroProfileDrawGraph); + bool bEnabled = false; + for(uint32_t i = 0; i < MICROPROFILE_MAX_GRAPHS; ++i) + if(S.Graph[i].nToken != MICROPROFILE_INVALID_TOKEN) + bEnabled = true; + if(!bEnabled) + return false; + + uint32_t nX = nScreenWidth - MICROPROFILE_GRAPH_WIDTH; + uint32_t nY = nScreenHeight - MICROPROFILE_GRAPH_HEIGHT; + MicroProfileDrawBox(nX, nY, nX + MICROPROFILE_GRAPH_WIDTH, nY + MICROPROFILE_GRAPH_HEIGHT, S.nOpacityBackground | g_nMicroProfileBackColors[0]|g_nMicroProfileBackColors[1]); + bool bMouseOver = S.nMouseX >= nX && S.nMouseY >= nY; + float fMouseXPrc =(float(S.nMouseX - nX)) / MICROPROFILE_GRAPH_WIDTH; + if(bMouseOver) + { + float fXAvg = fMouseXPrc * MICROPROFILE_GRAPH_WIDTH + nX; + MicroProfileDrawLineVertical(fXAvg, nY, nY + MICROPROFILE_GRAPH_HEIGHT, (uint32_t)-1); + } + + + float fY = (float)nScreenHeight; + float fDX = MICROPROFILE_GRAPH_WIDTH * 1.f / MICROPROFILE_GRAPH_HISTORY; + float fDY = MICROPROFILE_GRAPH_HEIGHT; + uint32_t nPut = S.nGraphPut; + float* pGraphData = (float*)alloca(sizeof(float)* MICROPROFILE_GRAPH_HISTORY*2); + for(uint32_t i = 0; i < MICROPROFILE_MAX_GRAPHS; ++i) + { + if(S.Graph[i].nToken != MICROPROFILE_INVALID_TOKEN) + { + uint32_t nGroupId = MicroProfileGetGroupIndex(S.Graph[i].nToken); + bool bGpu = S.GroupInfo[nGroupId].Type == MicroProfileTokenTypeGpu; + float fToMs = MicroProfileTickToMsMultiplier(bGpu ? 
MicroProfileTicksPerSecondGpu() : MicroProfileTicksPerSecondCpu()); + float fToPrc = fToMs * S.fRcpReferenceTime * 3 / 4; + + float fX = (float)nX; + for(uint32_t j = 0; j < MICROPROFILE_GRAPH_HISTORY; ++j) + { + float fWeigth = MicroProfileMin(fToPrc * (S.Graph[i].nHistory[(j+nPut)%MICROPROFILE_GRAPH_HISTORY]), 1.f); + pGraphData[(j*2)] = fX; + pGraphData[(j*2)+1] = fY - fDY * fWeigth; + fX += fDX; + } + MicroProfileDrawLine2D(MICROPROFILE_GRAPH_HISTORY, pGraphData, S.TimerInfo[MicroProfileGetTimerIndex(S.Graph[i].nToken)].nColor); + } + } + { + float fY1 = 0.25f * MICROPROFILE_GRAPH_HEIGHT + nY; + float fY2 = 0.50f * MICROPROFILE_GRAPH_HEIGHT + nY; + float fY3 = 0.75f * MICROPROFILE_GRAPH_HEIGHT + nY; + MicroProfileDrawLineHorizontal(nX, nX + MICROPROFILE_GRAPH_WIDTH, fY1, 0xffdd4444); + MicroProfileDrawLineHorizontal(nX, nX + MICROPROFILE_GRAPH_WIDTH, fY2, 0xff000000| g_nMicroProfileBackColors[0]); + MicroProfileDrawLineHorizontal(nX, nX + MICROPROFILE_GRAPH_WIDTH, fY3, 0xff000000|g_nMicroProfileBackColors[0]); + + char buf[32]; + int nLen = snprintf(buf, sizeof(buf)-1, "%5.2fms", S.fReferenceTime); + MicroProfileDrawText(nX+1, fY1 - (2+MICROPROFILE_TEXT_HEIGHT), (uint32_t)-1, buf, nLen); + } + + + + if(bMouseOver) + { + uint32_t pColors[MICROPROFILE_MAX_GRAPHS]; + MicroProfileStringArray Strings; + MicroProfileStringArrayClear(&Strings); + uint32_t nTextCount = 0; + uint32_t nGraphIndex = (S.nGraphPut + MICROPROFILE_GRAPH_HISTORY - int(MICROPROFILE_GRAPH_HISTORY*(1.f - fMouseXPrc))) % MICROPROFILE_GRAPH_HISTORY; + + uint32_t nX = S.nMouseX; + uint32_t nY = S.nMouseY + 20; + + for(uint32_t i = 0; i < MICROPROFILE_MAX_GRAPHS; ++i) + { + if(S.Graph[i].nToken != MICROPROFILE_INVALID_TOKEN) + { + uint32_t nGroupId = MicroProfileGetGroupIndex(S.Graph[i].nToken); + bool bGpu = S.GroupInfo[nGroupId].Type == MicroProfileTokenTypeGpu; + float fToMs = MicroProfileTickToMsMultiplier(bGpu ? 
MicroProfileTicksPerSecondGpu() : MicroProfileTicksPerSecondCpu()); + uint32_t nIndex = MicroProfileGetTimerIndex(S.Graph[i].nToken); + uint32_t nColor = S.TimerInfo[nIndex].nColor; + const char* pName = S.TimerInfo[nIndex].pName; + pColors[nTextCount++] = nColor; + MicroProfileStringArrayAddLiteral(&Strings, pName); + MicroProfileStringArrayFormat(&Strings, "%5.2fms", fToMs * (S.Graph[i].nHistory[nGraphIndex])); + } + } + if(nTextCount) + { + MicroProfileDrawFloatWindow(nX, nY, Strings.ppStrings, Strings.nNumStrings, 0, pColors); + } + + if(S.nMouseRight) + { + for(uint32_t i = 0; i < MICROPROFILE_MAX_GRAPHS; ++i) + { + S.Graph[i].nToken = MICROPROFILE_INVALID_TOKEN; + } + } + } + + return bMouseOver; +} + +void MicroProfileDumpTimers() +{ + uint64_t nActiveGroup = S.nGroupMask; + + uint32_t nNumTimers = S.nTotalTimers; + uint32_t nBlockSize = 2 * nNumTimers; + float* pTimers = (float*)alloca(nBlockSize * 7 * sizeof(float)); + float* pAverage = pTimers + nBlockSize; + float* pMax = pTimers + 2 * nBlockSize; + float* pCallAverage = pTimers + 3 * nBlockSize; + float* pTimersExclusive = pTimers + 4 * nBlockSize; + float* pAverageExclusive = pTimers + 5 * nBlockSize; + float* pMaxExclusive = pTimers + 6 * nBlockSize; + MicroProfileCalcTimers(pTimers, pAverage, pMax, pCallAverage, pTimersExclusive, pAverageExclusive, pMaxExclusive, nActiveGroup, nNumTimers); + + MICROPROFILE_PRINTF("%11s, ", "Time"); + MICROPROFILE_PRINTF("%11s, ", "Average"); + MICROPROFILE_PRINTF("%11s, ", "Max"); + MICROPROFILE_PRINTF("%11s, ", "Call Avg"); + MICROPROFILE_PRINTF("%9s, ", "Count"); + MICROPROFILE_PRINTF("%11s, ", "Excl"); + MICROPROFILE_PRINTF("%11s, ", "Avg Excl"); + MICROPROFILE_PRINTF("%11s, \n", "Max Excl"); + + for(uint32_t j = 0; j < MICROPROFILE_MAX_GROUPS; ++j) + { + uint64_t nMask = 1ll << j; + if(nMask & nActiveGroup) + { + MICROPROFILE_PRINTF("%s\n", S.GroupInfo[j].pName); + for(uint32_t i = 0; i < S.nTotalTimers;++i) + { + uint64_t nTokenMask = MicroProfileGetGroupMask(S.TimerInfo[i].nToken); + if(nTokenMask & nMask) + { + uint32_t nIdx = i * 2; + MICROPROFILE_PRINTF("%9.2fms, ", pTimers[nIdx]); + MICROPROFILE_PRINTF("%9.2fms, ", pAverage[nIdx]); + MICROPROFILE_PRINTF("%9.2fms, ", pMax[nIdx]); + MICROPROFILE_PRINTF("%9.2fms, ", pCallAverage[nIdx]); + MICROPROFILE_PRINTF("%9d, ", S.Frame[i].nCount); + MICROPROFILE_PRINTF("%9.2fms, ", pTimersExclusive[nIdx]); + MICROPROFILE_PRINTF("%9.2fms, ", pAverageExclusive[nIdx]); + MICROPROFILE_PRINTF("%9.2fms, ", pMaxExclusive[nIdx]); + MICROPROFILE_PRINTF("%s\n", S.TimerInfo[i].pName); + } + } + } + } +} + +void MicroProfileDrawBarView(uint32_t nScreenWidth, uint32_t nScreenHeight) +{ + uint64_t nActiveGroup = S.nMenuAllGroups ? 
S.nGroupMask : S.nMenuActiveGroup;
+ if(!nActiveGroup)
+ return;
+ MICROPROFILE_SCOPE(g_MicroProfileDrawBarView);
+
+ const uint32_t nHeight = S.nBarHeight;
+ int nColorIndex = 0;
+ uint32_t nX = 0;
+ uint32_t nY = nHeight + 1 - S.nOffsetY;
+ uint32_t nNumTimers = 0;
+ uint32_t nNumGroups = 0;
+ uint32_t nMaxTimerNameLen = 1;
+ for(uint32_t j = 0; j < MICROPROFILE_MAX_GROUPS; ++j)
+ {
+ if(nActiveGroup & (1ll << j))
+ {
+ nNumTimers += S.GroupInfo[j].nNumTimers;
+ nNumGroups += 1;
+ nMaxTimerNameLen = MicroProfileMax(nMaxTimerNameLen, S.GroupInfo[j].nMaxTimerNameLen);
+ }
+ }
+ uint32_t nBlockSize = 2 * nNumTimers;
+ float* pTimers = (float*)alloca(nBlockSize * 7 * sizeof(float));
+ float* pAverage = pTimers + nBlockSize;
+ float* pMax = pTimers + 2 * nBlockSize;
+ float* pCallAverage = pTimers + 3 * nBlockSize;
+ float* pTimersExclusive = pTimers + 4 * nBlockSize;
+ float* pAverageExclusive = pTimers + 5 * nBlockSize;
+ float* pMaxExclusive = pTimers + 6 * nBlockSize;
+ MicroProfileCalcTimers(pTimers, pAverage, pMax, pCallAverage, pTimersExclusive, pAverageExclusive, pMaxExclusive, nActiveGroup, nNumTimers);
+ {
+ uint32_t nWidth = 0;
+ for(uint32_t i = 1; i ; i <<= 1)
+ {
+ if(S.nBars & i)
+ {
+ nWidth += S.nBarWidth + 5 + 6 * (1+MICROPROFILE_TEXT_WIDTH);
+ if(i & MP_DRAW_CALL_COUNT)
+ nWidth += 5 + 6 * MICROPROFILE_TEXT_WIDTH;
+ }
+ }
+ nWidth += (1+nMaxTimerNameLen) * (MICROPROFILE_TEXT_WIDTH+1);
+ for(uint32_t i = 0; i < nNumTimers+nNumGroups+1; ++i)
+ {
+ int nY0 = nY + i * (nHeight + 1);
+ MicroProfileDrawBox(nX, nY0, nWidth, nY0 + (nHeight+1)+1, S.nOpacityBackground | g_nMicroProfileBackColors[nColorIndex++ & 1]);
+ }
+ }
+ int nTotalHeight = (nNumTimers+nNumGroups+2) * (nHeight+1);
+ uint32_t nLegendOffset = 1;
+ for(uint32_t j = 0; j < MICROPROFILE_MAX_GROUPS; ++j)
+ {
+ if(nActiveGroup & (1ll << j))
+ {
+ MicroProfileDrawText(nX, nY + (1+nHeight) * nLegendOffset, (uint32_t)-1, S.GroupInfo[j].pName, S.GroupInfo[j].nNameLen);
+ nLegendOffset += S.GroupInfo[j].nNumTimers+1;
+ }
+ }
+ if(S.nBars & MP_DRAW_TIMERS)
+ nX += MicroProfileDrawBarArray(nX, nY, pTimers, "Time", nTotalHeight) + 1;
+ if(S.nBars & MP_DRAW_AVERAGE)
+ nX += MicroProfileDrawBarArray(nX, nY, pAverage, "Average", nTotalHeight) + 1;
+ if(S.nBars & MP_DRAW_MAX)
+ nX += MicroProfileDrawBarArray(nX, nY, pMax, "Max Time", nTotalHeight) + 1;
+ if(S.nBars & MP_DRAW_CALL_COUNT)
+ {
+ nX += MicroProfileDrawBarArray(nX, nY, pCallAverage, "Call Average", nTotalHeight) + 1;
+ nX += MicroProfileDrawBarCallCount(nX, nY, "Count") + 1;
+ }
+ if(S.nBars & MP_DRAW_TIMERS_EXCLUSIVE)
+ nX += MicroProfileDrawBarArray(nX, nY, pTimersExclusive, "Exclusive Time", nTotalHeight) + 1;
+ if(S.nBars & MP_DRAW_AVERAGE_EXCLUSIVE)
+ nX += MicroProfileDrawBarArray(nX, nY, pAverageExclusive, "Exclusive Average", nTotalHeight) + 1;
+ if(S.nBars & MP_DRAW_MAX_EXCLUSIVE)
+ nX += MicroProfileDrawBarArray(nX, nY, pMaxExclusive, "Exclusive Max Time", nTotalHeight) + 1;
+
+ for(int i = 0; i < MICROPROFILE_META_MAX; ++i)
+ {
+ if(0 != (S.nBars & (MP_DRAW_META_FIRST<<i)) && S.MetaCounters[i].pName)
+ {
+ nX += MicroProfileDrawBarMetaCount(nX, nY, &S.MetaCounters[i].nCounters[0], S.MetaCounters[i].pName, nTotalHeight) + 1;
+ }
+ }
+ MicroProfileDrawBarLegend(nX, nY, nTotalHeight);
+}
+
+void MicroProfileDrawMenu(uint32_t nWidth, uint32_t nHeight)
+{
+ typedef std::function<const char* (int, bool&)> SubmenuCallback;
+ typedef std::function<void (int)> ClickCallback;
+ SubmenuCallback GroupCallback[] =
+ { [] (int index, bool& bSelected) -> const char*{
+ switch(index)
+ {
+ case 0:
+ bSelected = S.nDisplay == MP_DRAW_DETAILED;
+ return "Detailed";
+ case 1:
+ bSelected = S.nDisplay == MP_DRAW_BARS;
+ return "Timers";
+ case 2:
+ bSelected = S.nDisplay == MP_DRAW_HIDDEN;
+ return "Hidden";
+ case 3:
+ bSelected = false;
+ return "Off";
+
+ default: return 0;
+ }
+ },
+ [] (int index, bool& bSelected) 
-> const char*{
+ if(index == 0)
+ {
+ bSelected = S.nMenuAllGroups != 0;
+ return "ALL";
+ }
+ else
+ {
+ index = index-1;
+ bSelected = 0 != (S.nMenuActiveGroup & (1ll << index));
+ if(index < MICROPROFILE_MAX_GROUPS && S.GroupInfo[index].pName)
+ return S.GroupInfo[index].pName;
+ else
+ return 0;
+ }
+ },
+ [] (int index, bool& bSelected) -> const char*{
+ if(index < sizeof(g_MicroProfileAggregatePresets)/sizeof(g_MicroProfileAggregatePresets[0]))
+ {
+ int val = g_MicroProfileAggregatePresets[index];
+ bSelected = (int)S.nAggregateFlip == val;
+ if(0 == val)
+ return "Infinite";
+ else
+ {
+ static char buf[128];
+ snprintf(buf, sizeof(buf)-1, "%7d", val);
+ return buf;
+ }
+ }
+ return 0;
+ },
+ [] (int index, bool& bSelected) -> const char*{
+ bSelected = 0 != (S.nBars & (1 << index));
+ switch(index)
+ {
+ case 0: return "Time";
+ case 1: return "Average";
+ case 2: return "Max";
+ case 3: return "Call Count";
+ case 4: return "Exclusive Timers";
+ case 5: return "Exclusive Average";
+ case 6: return "Exclusive Max";
+ }
+ int nMetaIndex = index - 7;
+ if(nMetaIndex < MICROPROFILE_META_MAX)
+ {
+ return S.MetaCounters[nMetaIndex].pName;
+ }
+ return 0;
+ },
+ [] (int index, bool& bSelected) -> const char*{
+ if(index >= nOptionSize) return 0;
+ switch(Options[index].nSubType)
+ {
+ case 0:
+ bSelected = S.fReferenceTime == g_MicroProfileReferenceTimePresets[Options[index].nIndex];
+ break;
+ case 1:
+ bSelected = S.nOpacityBackground>>24 == g_MicroProfileOpacityPresets[Options[index].nIndex];
+ break;
+ case 2:
+ bSelected = S.nOpacityForeground>>24 == g_MicroProfileOpacityPresets[Options[index].nIndex];
+ break;
+#if MICROPROFILE_CONTEXT_SWITCH_TRACE
+ case 3:
+ {
+ switch(Options[index].nIndex)
+ {
+ case 0:
+ bSelected = S.bContextSwitchRunning;
+ break;
+ case 1:
+ bSelected = S.bContextSwitchAllThreads;
+ break;
+ case 2:
+ bSelected = S.bContextSwitchNoBars;
+ break;
+ }
+ }
+ break;
+#endif
+ }
+ return Options[index].Text;
+ },
+
+ [] (int index, bool& bSelected) -> const char*{
+ static char buf[128];
+ bSelected = false;
+ int nNumPresets = sizeof(g_MicroProfilePresetNames) / sizeof(g_MicroProfilePresetNames[0]);
+ int nIndexSave = index - nNumPresets - 1;
+ if(index == nNumPresets)
+ return "--";
+ else if(nIndexSave >=0 && nIndexSave < nNumPresets)
+ {
+ snprintf(buf, sizeof(buf)-1, "Save '%s'", g_MicroProfilePresetNames[nIndexSave]);
+ return buf;
+ }
+ else if(index < nNumPresets)
+ {
+ snprintf(buf, sizeof(buf)-1, "Load '%s'", g_MicroProfilePresetNames[index]);
+ return buf;
+ }
+ return 0;
+ },
+ [] (int index, bool& bSelected) -> const char*{
+ return 0;
+ },
+ [] (int index, bool& bSelected) -> const char*{
+ return 0;
+ },
+ [] (int index, bool& bSelected) -> const char*{
+ return 0;
+ },
+
+
+ };
+ ClickCallback CBClick[] =
+ {
+ [](int nIndex)
+ {
+ switch(nIndex)
+ {
+ case 0:
+ S.nDisplay = MP_DRAW_DETAILED;
+ break;
+ case 1:
+ S.nDisplay = MP_DRAW_BARS;
+ break;
+ case 2:
+ S.nDisplay = MP_DRAW_HIDDEN;
+ break;
+ case 3:
+ S.nDisplay = 0;
+ break;
+ }
+ },
+ [](int nIndex)
+ {
+ if(nIndex == 0)
+ S.nMenuAllGroups = 1-S.nMenuAllGroups;
+ else
+ S.nMenuActiveGroup ^= (1ll << (nIndex-1));
+ },
+ [](int nIndex)
+ {
+ S.nAggregateFlip = g_MicroProfileAggregatePresets[nIndex];
+ if(0 == S.nAggregateFlip)
+ {
+ memset(S.AggregateTimers, 0, sizeof(S.AggregateTimers));
+ memset(S.MaxTimers, 0, sizeof(S.MaxTimers));
+ memset(S.AggregateTimersExclusive, 0, sizeof(S.AggregateTimersExclusive));
+ memset(S.MaxTimersExclusive, 0, sizeof(S.MaxTimersExclusive));
+ S.nFlipAggregate = 0;
+ S.nFlipMax = 0;
+ S.nAggregateFlipCount = 0;
+ }
+ },
+ [](int nIndex)
+ {
+ S.nBars ^= (1 << nIndex);
+ },
+ [](int nIndex)
+ {
+ switch(Options[nIndex].nSubType)
+ {
+ case 0:
+ S.fReferenceTime = g_MicroProfileReferenceTimePresets[Options[nIndex].nIndex];
+ S.fRcpReferenceTime = 1.f / 
S.fReferenceTime; + break; + case 1: + S.nOpacityBackground = g_MicroProfileOpacityPresets[Options[nIndex].nIndex]<<24; + break; + case 2: + S.nOpacityForeground = g_MicroProfileOpacityPresets[Options[nIndex].nIndex]<<24; + break; +#if MICROPROFILE_CONTEXT_SWITCH_TRACE + case 3: + { + switch(Options[nIndex].nIndex) + { + case 0: + if(S.bContextSwitchRunning) + { + MicroProfileStopContextSwitchTrace(); + } + else + { + MicroProfileStartContextSwitchTrace(); + } + break; + case 1: + S.bContextSwitchAllThreads = !S.bContextSwitchAllThreads; + break; + case 2: + S.bContextSwitchNoBars= !S.bContextSwitchNoBars; + break; + + } + } + break; +#endif + } + }, + [](int nIndex) + { + int nNumPresets = sizeof(g_MicroProfilePresetNames) / sizeof(g_MicroProfilePresetNames[0]); + int nIndexSave = nIndex - nNumPresets - 1; + if(nIndexSave >= 0 && nIndexSave < nNumPresets) + { + MicroProfileSavePreset(g_MicroProfilePresetNames[nIndexSave]); + } + else if(nIndex >= 0 && nIndex < nNumPresets) + { + MicroProfileLoadPreset(g_MicroProfilePresetNames[nIndex]); + } + }, + [](int nIndex) + { + }, + [](int nIndex) + { + }, + [](int nIndex) + { + }, + }; + + uint32_t nSelectMenu = (uint32_t)-1; + for(uint32_t i = 0; i < nNumMenuItems; ++i) + { + nMenuX[i] = nX; + uint32_t nLen = (uint32_t)strlen(pMenuText[i]); + uint32_t nEnd = nX + nLen * (MICROPROFILE_TEXT_WIDTH+1); + if(S.nMouseY <= MICROPROFILE_TEXT_HEIGHT && S.nMouseX <= nEnd && S.nMouseX >= nX) + { + MicroProfileDrawBox(nX-1, nY, nX + nLen * (MICROPROFILE_TEXT_WIDTH+1), nY +(S.nBarHeight+1)+1, 0xff888888); + nSelectMenu = i; + if((S.nMouseLeft || S.nMouseRight) && i == (int)nPauseIndex) + { + S.nRunning = !S.nRunning; + } + } + MicroProfileDrawText(nX, nY, (uint32_t)-1, pMenuText[i], strlen(pMenuText[i])); + nX += (nLen+1) * (MICROPROFILE_TEXT_WIDTH+1); + } + uint32_t nMenu = nSelectMenu != (uint32_t)-1 ? nSelectMenu : S.nActiveMenu; + S.nActiveMenu = nMenu; + if((uint32_t)-1 != nMenu) + { + nX = nMenuX[nMenu]; + nY += MICROPROFILE_TEXT_HEIGHT+1; + SubmenuCallback CB = GroupCallback[nMenu]; + int nNumLines = 0; + bool bSelected = false; + const char* pString = CB(nNumLines, bSelected); + uint32_t nWidth = 0, nHeight = 0; + while(pString) + { + nWidth = MicroProfileMax(nWidth, (int)strlen(pString)); + nNumLines++; + pString = CB(nNumLines, bSelected); + } + nWidth = (2+nWidth) * (MICROPROFILE_TEXT_WIDTH+1); + nHeight = nNumLines * (MICROPROFILE_TEXT_HEIGHT+1); + if(S.nMouseY <= nY + nHeight+0 && S.nMouseY >= nY-0 && S.nMouseX <= nX + nWidth + 0 && S.nMouseX >= nX - 0) + { + S.nActiveMenu = nMenu; + } + else if(nSelectMenu == (uint32_t)-1) + { + S.nActiveMenu = (uint32_t)-1; + } + MicroProfileDrawBox(nX, nY, nX + nWidth, nY + nHeight, 0xff000000|g_nMicroProfileBackColors[1]); + for(int i = 0; i < nNumLines; ++i) + { + bool bSelected = false; + const char* pString = CB(i, bSelected); + if(S.nMouseY >= nY && S.nMouseY < nY + MICROPROFILE_TEXT_HEIGHT + 1) + { + bMouseOver = true; + if(S.nMouseLeft || S.nMouseRight) + { + CBClick[nMenu](i); + } + MicroProfileDrawBox(nX, nY, nX + nWidth, nY + MICROPROFILE_TEXT_HEIGHT + 1, 0xff888888); + } + int nLen = snprintf(buffer, SBUF_SIZE-1, "%c %s", bSelected ? '*' : ' ' ,pString); + MicroProfileDrawText(nX, nY, (uint32_t)-1, buffer, nLen); + nY += MICROPROFILE_TEXT_HEIGHT+1; + } + } + + + { + static char FrameTimeMessage[64]; + float fToMs = MicroProfileTickToMsMultiplier(MicroProfileTicksPerSecondCpu()); + uint32_t nAggregateFrames = S.nAggregateFrames ? 
S.nAggregateFrames : 1; + float fMs = fToMs * (S.nFlipTicks); + float fAverageMs = fToMs * (S.nFlipAggregateDisplay / nAggregateFrames); + float fMaxMs = fToMs * S.nFlipMaxDisplay; + int nLen = snprintf(FrameTimeMessage, sizeof(FrameTimeMessage)-1, "Time[%6.2f] Avg[%6.2f] Max[%6.2f]", fMs, fAverageMs, fMaxMs); + pMenuText[nNumMenuItems++] = &FrameTimeMessage[0]; + MicroProfileDrawText(nWidth - nLen * (MICROPROFILE_TEXT_WIDTH+1), 0, -1, FrameTimeMessage, nLen); + } +} + + +void MicroProfileMoveGraph() +{ + int nZoom = S.nMouseWheelDelta; + int nPanX = 0; + int nPanY = 0; + static int X = 0, Y = 0; + if(S.nMouseDownLeft && !S.nModDown) + { + nPanX = S.nMouseX - X; + nPanY = S.nMouseY - Y; + } + X = S.nMouseX; + Y = S.nMouseY; + + if(nZoom) + { + float fOldRange = S.fDetailedRange; + if(nZoom>0) + { + S.fDetailedRangeTarget = S.fDetailedRange *= S.nModDown ? 1.40 : 1.05f; + } + else + { + S.fDetailedRangeTarget = S.fDetailedRange /= S.nModDown ? 1.40 : 1.05f; + } + + float fDiff = fOldRange - S.fDetailedRange; + float fMousePrc = MicroProfileMax((float)S.nMouseX / S.nWidth ,0.f); + S.fDetailedOffsetTarget = S.fDetailedOffset += fDiff * fMousePrc; + + } + if(nPanX) + { + S.fDetailedOffsetTarget = S.fDetailedOffset += -nPanX * S.fDetailedRange / S.nWidth; + } + S.nOffsetY -= nPanY; + if(S.nOffsetY<0) + S.nOffsetY = 0; +} + +bool MicroProfileIsDrawing() +{ + return S.nDisplay != 0; +} +void MicroProfileDraw(uint32_t nWidth, uint32_t nHeight) +{ + MICROPROFILE_SCOPE(g_MicroProfileDraw); + + if(S.nDisplay) + { + MicroProfileScopeLock L(MicroProfileMutex()); + S.nWidth = nWidth; + S.nHeight = nHeight; + S.nHoverToken = MICROPROFILE_INVALID_TOKEN; + S.nHoverTime = 0; + S.nHoverFrame = -1; + if(S.nDisplay != MP_DRAW_DETAILED) + S.nContextSwitchHoverThread = S.nContextSwitchHoverThreadAfter = S.nContextSwitchHoverThreadBefore = -1; + MicroProfileMoveGraph(); + + + if(S.nDisplay == MP_DRAW_DETAILED) + { + MicroProfileDrawDetailedView(nWidth, nHeight); + } + else if(S.nDisplay == MP_DRAW_BARS && S.nBars) + { + MicroProfileDrawBarView(nWidth, nHeight); + } + + MicroProfileDrawMenu(nWidth, nHeight); + bool bMouseOverGraph = MicroProfileDrawGraph(nWidth, nHeight); + bool bHidden = S.nDisplay == MP_DRAW_HIDDEN; + if(!bHidden) + { + uint32_t nLockedToolTipX = 3; + bool bDeleted = false; + for(int i = 0; i < MICROPROFILE_TOOLTIP_MAX_LOCKED; ++i) + { + int nIndex = (S.LockedToolTipFront + i) % MICROPROFILE_TOOLTIP_MAX_LOCKED; + if(S.LockedToolTips[nIndex].ppStrings[0]) + { + uint32_t nToolTipWidth = 0, nToolTipHeight = 0; + MicroProfileFloatWindowSize(S.LockedToolTips[nIndex].ppStrings, S.LockedToolTips[nIndex].nNumStrings, 0, nToolTipWidth, nToolTipHeight, 0); + uint32_t nStartY = nHeight - nToolTipHeight - 2; + if(!bDeleted && S.nMouseY > nStartY && S.nMouseX > nLockedToolTipX && S.nMouseX <= nLockedToolTipX + nToolTipWidth && (S.nMouseLeft || S.nMouseRight) ) + { + bDeleted = true; + int j = i; + for(; j < MICROPROFILE_TOOLTIP_MAX_LOCKED-1; ++j) + { + int nIndex0 = (S.LockedToolTipFront + j) % MICROPROFILE_TOOLTIP_MAX_LOCKED; + int nIndex1 = (S.LockedToolTipFront + j+1) % MICROPROFILE_TOOLTIP_MAX_LOCKED; + MicroProfileStringArrayCopy(&S.LockedToolTips[nIndex0], &S.LockedToolTips[nIndex1]); + } + MicroProfileStringArrayClear(&S.LockedToolTips[(S.LockedToolTipFront + j) % MICROPROFILE_TOOLTIP_MAX_LOCKED]); + } + else + { + MicroProfileDrawFloatWindow(nLockedToolTipX, nHeight-nToolTipHeight-2, &S.LockedToolTips[nIndex].ppStrings[0], S.LockedToolTips[nIndex].nNumStrings, S.nLockedToolTipColor[nIndex]); + 
nLockedToolTipX += nToolTipWidth + 4; + } + } + } + + if(S.nActiveMenu == 7) + { + if(S.nDisplay & MP_DRAW_DETAILED) + { + MicroProfileStringArray DetailedHelp; + MicroProfileStringArrayClear(&DetailedHelp); + MicroProfileStringArrayFormat(&DetailedHelp, "%s", MICROPROFILE_HELP_LEFT); + MicroProfileStringArrayAddLiteral(&DetailedHelp, "Toggle Graph"); + MicroProfileStringArrayFormat(&DetailedHelp, "%s", MICROPROFILE_HELP_ALT); + MicroProfileStringArrayAddLiteral(&DetailedHelp, "Zoom"); + MicroProfileStringArrayFormat(&DetailedHelp, "%s + %s", MICROPROFILE_HELP_MOD, MICROPROFILE_HELP_LEFT); + MicroProfileStringArrayAddLiteral(&DetailedHelp, "Lock Tooltip"); + MicroProfileStringArrayAddLiteral(&DetailedHelp, "Drag"); + MicroProfileStringArrayAddLiteral(&DetailedHelp, "Pan View"); + MicroProfileStringArrayAddLiteral(&DetailedHelp, "Mouse Wheel"); + MicroProfileStringArrayAddLiteral(&DetailedHelp, "Zoom"); + MicroProfileDrawFloatWindow(nWidth, MICROPROFILE_FRAME_HISTORY_HEIGHT+20, DetailedHelp.ppStrings, DetailedHelp.nNumStrings, 0xff777777); + + MicroProfileStringArray DetailedHistoryHelp; + MicroProfileStringArrayClear(&DetailedHistoryHelp); + MicroProfileStringArrayFormat(&DetailedHistoryHelp, "%s", MICROPROFILE_HELP_LEFT); + MicroProfileStringArrayAddLiteral(&DetailedHistoryHelp, "Center View"); + MicroProfileStringArrayFormat(&DetailedHistoryHelp, "%s", MICROPROFILE_HELP_ALT); + MicroProfileStringArrayAddLiteral(&DetailedHistoryHelp, "Zoom to frame"); + MicroProfileDrawFloatWindow(nWidth, 20, DetailedHistoryHelp.ppStrings, DetailedHistoryHelp.nNumStrings, 0xff777777); + + + + } + else if(0 != (S.nDisplay & MP_DRAW_BARS) && S.nBars) + { + MicroProfileStringArray BarHelp; + MicroProfileStringArrayClear(&BarHelp); + MicroProfileStringArrayFormat(&BarHelp, "%s", MICROPROFILE_HELP_LEFT); + MicroProfileStringArrayAddLiteral(&BarHelp, "Toggle Graph"); + MicroProfileStringArrayFormat(&BarHelp, "%s + %s", MICROPROFILE_HELP_MOD, MICROPROFILE_HELP_LEFT); + MicroProfileStringArrayAddLiteral(&BarHelp, "Lock Tooltip"); + MicroProfileStringArrayAddLiteral(&BarHelp, "Drag"); + MicroProfileStringArrayAddLiteral(&BarHelp, "Pan View"); + MicroProfileDrawFloatWindow(nWidth, MICROPROFILE_FRAME_HISTORY_HEIGHT+20, BarHelp.ppStrings, BarHelp.nNumStrings, 0xff777777); + + } + MicroProfileStringArray Debug; + MicroProfileStringArrayClear(&Debug); + MicroProfileStringArrayAddLiteral(&Debug, "Memory Usage"); + MicroProfileStringArrayFormat(&Debug, "%4.2fmb", S.nMemUsage / (1024.f * 1024.f)); + uint32_t nFrameNext = (S.nFrameCurrent+1) % MICROPROFILE_MAX_FRAME_HISTORY; + MicroProfileFrameState* pFrameCurrent = &S.Frames[S.nFrameCurrent]; + MicroProfileFrameState* pFrameNext = &S.Frames[nFrameNext]; + + + MicroProfileStringArrayAddLiteral(&Debug, ""); + MicroProfileStringArrayAddLiteral(&Debug, ""); + MicroProfileStringArrayAddLiteral(&Debug, "Usage"); + MicroProfileStringArrayAddLiteral(&Debug, "markers [frames] "); + +#if MICROPROFILE_CONTEXT_SWITCH_TRACE + MicroProfileStringArrayAddLiteral(&Debug, "Context Switch"); + MicroProfileStringArrayFormat(&Debug, "%9d [%7d]", S.nContextSwitchUsage, MICROPROFILE_CONTEXT_SWITCH_BUFFER_SIZE / S.nContextSwitchUsage ); +#endif + + for(int i = 0; i < MICROPROFILE_MAX_GROUPS; ++i) + { + if(pFrameCurrent->nLogStart[i] && S.Pool[i]) + { + uint32_t nEnd = pFrameNext->nLogStart[i]; + uint32_t nStart = pFrameCurrent->nLogStart[i]; + uint32_t nUsage = nStart < nEnd ? 
(nEnd - nStart) : (nEnd + MICROPROFILE_BUFFER_SIZE - nStart); + uint32_t nFrameSupport = MICROPROFILE_BUFFER_SIZE / nUsage; + MicroProfileStringArrayFormat(&Debug, "%s", &S.Pool[i]->ThreadName[0]); + MicroProfileStringArrayFormat(&Debug, "%9d [%7d]", nUsage, nFrameSupport); + } + } + + MicroProfileDrawFloatWindow(0, nHeight-10, Debug.ppStrings, Debug.nNumStrings, 0xff777777); + } + + + + if(S.nActiveMenu == -1 && !bMouseOverGraph) + { + if(S.nHoverToken != MICROPROFILE_INVALID_TOKEN) + { + MicroProfileDrawFloatTooltip(S.nMouseX, S.nMouseY, S.nHoverToken, S.nHoverTime); + } + else if(S.nContextSwitchHoverThreadAfter != -1 && S.nContextSwitchHoverThreadBefore != -1) + { + float fToMs = MicroProfileTickToMsMultiplier(MicroProfileTicksPerSecondCpu()); + MicroProfileStringArray ToolTip; + MicroProfileStringArrayClear(&ToolTip); + MicroProfileStringArrayAddLiteral(&ToolTip, "Context Switch"); + MicroProfileStringArrayFormat(&ToolTip, "%04x", S.nContextSwitchHoverThread); + MicroProfileStringArrayAddLiteral(&ToolTip, "Before"); + MicroProfileStringArrayFormat(&ToolTip, "%04x", S.nContextSwitchHoverThreadBefore); + MicroProfileStringArrayAddLiteral(&ToolTip, "After"); + MicroProfileStringArrayFormat(&ToolTip, "%04x", S.nContextSwitchHoverThreadAfter); + MicroProfileStringArrayAddLiteral(&ToolTip, "Duration"); + int64_t nDifference = MicroProfileLogTickDifference(S.nContextSwitchHoverTickIn, S.nContextSwitchHoverTickOut); + MicroProfileStringArrayFormat(&ToolTip, "%6.2fms", fToMs * nDifference ); + MicroProfileStringArrayAddLiteral(&ToolTip, "CPU"); + MicroProfileStringArrayFormat(&ToolTip, "%d", S.nContextSwitchHoverCpu); + MicroProfileDrawFloatWindow(S.nMouseX, S.nMouseY+20, &ToolTip.ppStrings[0], ToolTip.nNumStrings, -1); + + + } + else if(S.nHoverFrame != -1) + { + uint32_t nNextFrame = (S.nHoverFrame+1)%MICROPROFILE_MAX_FRAME_HISTORY; + int64_t nTick = S.Frames[S.nHoverFrame].nFrameStartCpu; + int64_t nTickNext = S.Frames[nNextFrame].nFrameStartCpu; + int64_t nTickGpu = S.Frames[S.nHoverFrame].nFrameStartGpu; + int64_t nTickNextGpu = S.Frames[nNextFrame].nFrameStartGpu; + + float fToMs = MicroProfileTickToMsMultiplier(MicroProfileTicksPerSecondCpu()); + float fToMsGpu = MicroProfileTickToMsMultiplier(MicroProfileTicksPerSecondGpu()); + float fMs = fToMs * (nTickNext - nTick); + float fMsGpu = fToMsGpu * (nTickNextGpu - nTickGpu); + MicroProfileStringArray ToolTip; + MicroProfileStringArrayClear(&ToolTip); + MicroProfileStringArrayFormat(&ToolTip, "Frame %d", S.nHoverFrame); + #if MICROPROFILE_DEBUG + MicroProfileStringArrayFormat(&ToolTip, "%p", &S.Frames[S.nHoverFrame]); + #else + MicroProfileStringArrayAddLiteral(&ToolTip, ""); + #endif + MicroProfileStringArrayAddLiteral(&ToolTip, "CPU Time"); + MicroProfileStringArrayFormat(&ToolTip, "%6.2fms", fMs); + MicroProfileStringArrayAddLiteral(&ToolTip, "GPU Time"); + MicroProfileStringArrayFormat(&ToolTip, "%6.2fms", fMsGpu); + #if MICROPROFILE_DEBUG + for(int i = 0; i < MICROPROFILE_MAX_GROUPS; ++i) + { + if(S.Frames[S.nHoverFrame].nLogStart[i]) + { + MicroProfileStringArrayFormat(&ToolTip, "%d", i); + MicroProfileStringArrayFormat(&ToolTip, "%d", S.Frames[S.nHoverFrame].nLogStart[i]); + } + } + #endif + MicroProfileDrawFloatWindow(S.nMouseX, S.nMouseY+20, &ToolTip.ppStrings[0], ToolTip.nNumStrings, -1); + } + if(S.nMouseLeft) + { + if(S.nHoverToken != MICROPROFILE_INVALID_TOKEN) + MicroProfileToggleGraph(S.nHoverToken); + } + } + } +#if MICROPROFILE_DRAWCURSOR + { + float fCursor[8] = + { + MicroProfileMax(0, (int)S.nMouseX-3), S.nMouseY, + 
MicroProfileMin(nWidth, S.nMouseX+3), S.nMouseY, + S.nMouseX, MicroProfileMax((int)S.nMouseY-3, 0), + S.nMouseX, MicroProfileMin(nHeight, S.nMouseY+3), + }; + MicroProfileDrawLine2D(2, &fCursor[0], 0xff00ff00); + MicroProfileDrawLine2D(2, &fCursor[4], 0xff00ff00); + } +#endif + + } + S.nMouseLeft = S.nMouseRight = 0; + S.nMouseLeftMod = S.nMouseRightMod = 0; + S.nMouseWheelDelta = 0; + if(S.nOverflow) + S.nOverflow--; + +} +void MicroProfileMousePosition(uint32_t nX, uint32_t nY, int nWheelDelta) +{ + S.nMouseX = nX; + S.nMouseY = nY; + S.nMouseWheelDelta = nWheelDelta; +} + +void MicroProfileModKey(uint32_t nKeyState) +{ + S.nModDown = nKeyState ? 1 : 0; +} + +void MicroProfileClearGraph() +{ + for(uint32_t i = 0; i < MICROPROFILE_MAX_GRAPHS; ++i) + { + if(S.Graph[i].nToken != 0) + { + S.Graph[i].nToken = MICROPROFILE_INVALID_TOKEN; + } + } +} +void MicroProfileTogglePause() +{ + S.nRunning = !S.nRunning; +} + +void MicroProfileGetState(MicroProfileState* pStateOut) +{ + pStateOut->nDisplay = S.nDisplay; + pStateOut->nMenuAllGroups = S.nMenuAllGroups; + pStateOut->nMenuActiveGroup = S.nMenuActiveGroup; + pStateOut->nMenuAllThreads = S.nMenuAllThreads; + pStateOut->nAggregateFlip = S.nAggregateFlip; + pStateOut->nBars = S.nBars; + pStateOut->fReferenceTime = S.fReferenceTime; +} + +void MicroProfileSetState(MicroProfileState* pStateOut) +{ + MicroProfileScopeLock L(MicroProfileMutex()); + S.nDisplay = pStateOut->nDisplay; + S.nMenuAllGroups = pStateOut->nMenuAllGroups; + S.nMenuActiveGroup = pStateOut->nMenuActiveGroup; + S.nMenuAllThreads = pStateOut->nMenuAllThreads; + S.nAggregateFlip = pStateOut->nAggregateFlip; + S.nBars = pStateOut->nBars; + S.fReferenceTime = pStateOut->fReferenceTime; + S.fRcpReferenceTime = 1.f / S.fReferenceTime; +} + +void MicroProfileToggleGraph(MicroProfileToken nToken) +{ + nToken &= 0xffff; + int32_t nMinSort = 0x7fffffff; + int32_t nFreeIndex = -1; + int32_t nMinIndex = 0; + int32_t nMaxSort = 0x80000000; + for(uint32_t i = 0; i < MICROPROFILE_MAX_GRAPHS; ++i) + { + if(S.Graph[i].nToken == MICROPROFILE_INVALID_TOKEN) + nFreeIndex = i; + if(S.Graph[i].nToken == nToken) + { + S.Graph[i].nToken = MICROPROFILE_INVALID_TOKEN; + return; + } + if(S.Graph[i].nKey < nMinSort) + { + nMinSort = S.Graph[i].nKey; + nMinIndex = i; + } + if(S.Graph[i].nKey > nMaxSort) + { + nMaxSort = S.Graph[i].nKey; + } + } + int nIndex = nFreeIndex > -1 ? 
nFreeIndex : nMinIndex;
+ S.Graph[nIndex].nToken = nToken;
+ S.Graph[nIndex].nKey = nMaxSort+1;
+ memset(&S.Graph[nIndex].nHistory[0], 0, sizeof(S.Graph[nIndex].nHistory));
+}
+void MicroProfileMouseButton(uint32_t nLeft, uint32_t nRight)
+{
+ if(0 == nLeft && S.nMouseDownLeft)
+ {
+ if(S.nModDown)
+ S.nMouseLeftMod = 1;
+ else
+ S.nMouseLeft = 1;
+ }
+
+ if(0 == nRight && S.nMouseDownRight)
+ {
+ if(S.nModDown)
+ S.nMouseRightMod = 1;
+ else
+ S.nMouseRight = 1;
+ }
+
+ S.nMouseDownLeft = nLeft;
+ S.nMouseDownRight = nRight;
+
+}
+
+#include <stdio.h>
+
+#define MICROPROFILE_PRESET_HEADER_MAGIC 0x28586813
+#define MICROPROFILE_PRESET_HEADER_VERSION 0x00000100
+struct MicroProfilePresetHeader
+{
+ uint32_t nMagic;
+ uint32_t nVersion;
+ //groups, threads, aggregate, reference frame, graphs timers
+ uint32_t nGroups[MICROPROFILE_MAX_GROUPS];
+ uint32_t nThreads[MICROPROFILE_MAX_THREADS];
+ uint32_t nGraphName[MICROPROFILE_MAX_GRAPHS];
+ uint32_t nGraphGroupName[MICROPROFILE_MAX_GRAPHS];
+ uint32_t nMenuAllGroups;
+ uint32_t nMenuAllThreads;
+ uint32_t nAggregateFlip;
+ float fReferenceTime;
+ uint32_t nBars;
+ uint32_t nDisplay;
+ uint32_t nOpacityBackground;
+ uint32_t nOpacityForeground;
+};
+
+#ifndef MICROPROFILE_PRESET_FILENAME_FUNC
+#define MICROPROFILE_PRESET_FILENAME_FUNC MicroProfilePresetFilename
+static const char* MicroProfilePresetFilename(const char* pSuffix)
+{
+ static char filename[512];
+ snprintf(filename, sizeof(filename)-1, ".microprofilepreset.%s", pSuffix);
+ return filename;
+}
+#endif
+
+void MicroProfileSavePreset(const char* pPresetName)
+{
+ std::lock_guard<std::recursive_mutex> Lock(MicroProfileMutex());
+ FILE* F = fopen(MICROPROFILE_PRESET_FILENAME_FUNC(pPresetName), "w");
+ if(!F) return;
+
+ MicroProfilePresetHeader Header;
+ memset(&Header, 0, sizeof(Header));
+ Header.nAggregateFlip = S.nAggregateFlip;
+ Header.nBars = S.nBars;
+ Header.fReferenceTime = S.fReferenceTime;
+ Header.nMenuAllGroups = S.nMenuAllGroups;
+ Header.nMenuAllThreads = S.nMenuAllThreads;
+ Header.nMagic = MICROPROFILE_PRESET_HEADER_MAGIC;
+ Header.nVersion = MICROPROFILE_PRESET_HEADER_VERSION;
+ Header.nDisplay = S.nDisplay;
+ Header.nOpacityBackground = S.nOpacityBackground;
+ Header.nOpacityForeground = S.nOpacityForeground;
+ fwrite(&Header, sizeof(Header), 1, F);
+ uint64_t nMask = 1;
+ for(uint32_t i = 0; i < MICROPROFILE_MAX_GROUPS; ++i)
+ {
+ if(S.nMenuActiveGroup & nMask)
+ {
+ uint32_t offset = ftell(F);
+ const char* pName = S.GroupInfo[i].pName;
+ int nLen = (int)strlen(pName)+1;
+ fwrite(pName, nLen, 1, F);
+ Header.nGroups[i] = offset;
+ }
+ nMask <<= 1;
+ }
+ for(uint32_t i = 0; i < MICROPROFILE_MAX_THREADS; ++i)
+ {
+ MicroProfileThreadLog* pLog = S.Pool[i];
+ if(pLog && S.nThreadActive[i])
+ {
+ uint32_t nOffset = ftell(F);
+ const char* pName = &pLog->ThreadName[0];
+ int nLen = (int)strlen(pName)+1;
+ fwrite(pName, nLen, 1, F);
+ Header.nThreads[i] = nOffset;
+ }
+ }
+ for(uint32_t i = 0; i < MICROPROFILE_MAX_GRAPHS; ++i)
+ {
+ MicroProfileToken nToken = S.Graph[i].nToken;
+ if(nToken != MICROPROFILE_INVALID_TOKEN)
+ {
+ uint32_t nGroupIndex = MicroProfileGetGroupIndex(nToken);
+ uint32_t nTimerIndex = MicroProfileGetTimerIndex(nToken);
+ const char* pGroupName = S.GroupInfo[nGroupIndex].pName;
+ const char* pTimerName = S.TimerInfo[nTimerIndex].pName;
+ MP_ASSERT(pGroupName);
+ MP_ASSERT(pTimerName);
+ int nGroupLen = (int)strlen(pGroupName)+1;
+ int nTimerLen = (int)strlen(pTimerName)+1;
+
+ uint32_t nOffsetGroup = ftell(F);
+ fwrite(pGroupName, nGroupLen, 1, F);
+ uint32_t nOffsetTimer = ftell(F);
+ fwrite(pTimerName, nTimerLen, 1, F);
+ Header.nGraphName[i] = nOffsetTimer;
+ Header.nGraphGroupName[i] = nOffsetGroup;
+ }
+ }
+ fseek(F, 0, SEEK_SET);
+ fwrite(&Header, sizeof(Header), 1, F);
+
+ fclose(F);
+
+}
+
+
+
+void MicroProfileLoadPreset(const char* pSuffix)
+{
+ std::lock_guard<std::recursive_mutex> Lock(MicroProfileMutex());
+ FILE* F = fopen(MICROPROFILE_PRESET_FILENAME_FUNC(pSuffix), "r");
+ if(!F)
+ {
+ return;
+ }
+ fseek(F, 0, SEEK_END);
+ int nSize = ftell(F);
+ char* const pBuffer = (char*)alloca(nSize);
+ fseek(F, 0, SEEK_SET);
+ int nRead = (int)fread(pBuffer, nSize, 1, F);
+ fclose(F);
+ if(1 != nRead)
+ return;
+
+ MicroProfilePresetHeader& Header = *(MicroProfilePresetHeader*)pBuffer;
+
+ if(Header.nMagic != MICROPROFILE_PRESET_HEADER_MAGIC || Header.nVersion != MICROPROFILE_PRESET_HEADER_VERSION)
+ {
+ return;
+ }
+
+ S.nAggregateFlip = Header.nAggregateFlip;
+ S.nBars = Header.nBars;
+ S.fReferenceTime = Header.fReferenceTime;
+ S.fRcpReferenceTime = 1.f / Header.fReferenceTime;
+ S.nMenuAllGroups = Header.nMenuAllGroups;
+ S.nMenuAllThreads = Header.nMenuAllThreads;
+ S.nDisplay = Header.nDisplay;
+ S.nMenuActiveGroup = 0;
+ S.nOpacityBackground = Header.nOpacityBackground;
+ S.nOpacityForeground = Header.nOpacityForeground;
+
+ memset(&S.nThreadActive[0], 0, sizeof(S.nThreadActive));
+
+ for(uint32_t i = 0; i < MICROPROFILE_MAX_GROUPS; ++i)
+ {
+ if(Header.nGroups[i])
+ {
+ const char* pGroupName = pBuffer + Header.nGroups[i];
+ for(uint32_t j = 0; j < MICROPROFILE_MAX_GROUPS; ++j)
+ {
+ if(S.GroupInfo[j].pName && 0 == MP_STRCASECMP(pGroupName, S.GroupInfo[j].pName))
+ {
+ S.nMenuActiveGroup |= (1ll << j);
+ }
+ }
+ }
+ }
+ for(uint32_t i = 0; i < MICROPROFILE_MAX_THREADS; ++i)
+ {
+ if(Header.nThreads[i])
+ {
+ const char* pThreadName = pBuffer + Header.nThreads[i];
+ for(uint32_t j = 0; j < MICROPROFILE_MAX_THREADS; ++j)
+ {
+ MicroProfileThreadLog* pLog = S.Pool[j];
+ if(pLog && 0 == MP_STRCASECMP(pThreadName, &pLog->ThreadName[0]))
+ {
+ S.nThreadActive[j] = 1;
+ }
+ }
+ }
+ }
+ for(uint32_t i = 0; i < MICROPROFILE_MAX_GRAPHS; ++i)
+ {
+ MicroProfileToken nPrevToken = S.Graph[i].nToken;
+ S.Graph[i].nToken = MICROPROFILE_INVALID_TOKEN;
+ if(Header.nGraphName[i] && Header.nGraphGroupName[i])
+ {
+ const char* pGraphName = pBuffer + Header.nGraphName[i];
+ const char* pGraphGroupName = pBuffer + Header.nGraphGroupName[i];
+ for(uint32_t j = 0; j < S.nTotalTimers; ++j)
+ {
+ uint64_t nGroupIndex = S.TimerInfo[j].nGroupIndex;
+ if(0 == MP_STRCASECMP(pGraphName, S.TimerInfo[j].pName) && 0 == MP_STRCASECMP(pGraphGroupName, S.GroupInfo[nGroupIndex].pName))
+ {
+ MicroProfileToken nToken = MicroProfileMakeToken(1ll << nGroupIndex, (uint16_t)j);
+ S.Graph[i].nToken = nToken;
+ if(nToken != nPrevToken)
+ {
+ memset(&S.Graph[i].nHistory, 0, sizeof(S.Graph[i].nHistory));
+ }
+ break;
+ }
+ }
+ }
+ }
+}
+
+void MicroProfileDrawLineVertical(int nX, int nTop, int nBottom, uint32_t nColor)
+{
+ MicroProfileDrawBox(nX, nTop, nX + 1, nBottom, nColor);
+}
+
+void MicroProfileDrawLineHorizontal(int nLeft, int nRight, int nY, uint32_t nColor)
+{
+ MicroProfileDrawBox(nLeft, nY, nRight, nY + 1, nColor);
+}
+
+float MicroProfileGetTime(const char* pGroup, const char* pName)
+{
+ MicroProfileToken nToken = MicroProfileFindToken(pGroup, pName);
+ if(nToken == MICROPROFILE_INVALID_TOKEN)
+ {
+ return 0.f;
+ }
+ uint32_t nTimerIndex = MicroProfileGetTimerIndex(nToken);
+ uint32_t nGroupIndex = MicroProfileGetGroupIndex(nToken);
+ float fToMs = MicroProfileTickToMsMultiplier(S.GroupInfo[nGroupIndex].Type == MicroProfileTokenTypeGpu ? MicroProfileTicksPerSecondGpu() : MicroProfileTicksPerSecondCpu());
+ return S.Frame[nTimerIndex].nTicks * fToMs;
+}
+void MicroProfileForceEnableGroup(const char* pGroup, MicroProfileTokenType Type)
+{
+ MicroProfileInit();
+ std::lock_guard<std::recursive_mutex> Lock(MicroProfileMutex());
+ uint16_t nGroup = MicroProfileGetGroup(pGroup, Type);
+ S.nForceGroup |= (1ll << nGroup);
+}
+
+void MicroProfileForceDisableGroup(const char* pGroup, MicroProfileTokenType Type)
+{
+ MicroProfileInit();
+ std::lock_guard<std::recursive_mutex> Lock(MicroProfileMutex());
+ uint16_t nGroup = MicroProfileGetGroup(pGroup, Type);
+ S.nForceGroup &= ~(1ll << nGroup);
+}
+
+
+
+#if MICROPROFILE_CONTEXT_SWITCH_TRACE
+#ifdef _WIN32
+#define INITGUID
+#include <evntrace.h>
+#include <evntcons.h>
+#include <strsafe.h>
+
+
+static GUID g_MicroProfileThreadClassGuid = { 0x3d6fa8d1, 0xfe05, 0x11d0, 0x9d, 0xda, 0x00, 0xc0, 0x4f, 0xd7, 0xba, 0x7c };
+
+struct MicroProfileSCSwitch
+{
+ uint32_t NewThreadId;
+ uint32_t OldThreadId;
+ int8_t NewThreadPriority;
+ int8_t OldThreadPriority;
+ uint8_t PreviousCState;
+ int8_t SpareByte;
+ int8_t OldThreadWaitReason;
+ int8_t OldThreadWaitMode;
+ int8_t OldThreadState;
+ int8_t OldThreadWaitIdealProcessor;
+ uint32_t NewThreadWaitTime;
+ uint32_t Reserved;
+};
+
+
+VOID WINAPI MicroProfileContextSwitchCallback(PEVENT_TRACE pEvent)
+{
+ if (pEvent->Header.Guid == g_MicroProfileThreadClassGuid)
+ {
+ if (pEvent->Header.Class.Type == 36)
+ {
+ MicroProfileSCSwitch* pCSwitch = (MicroProfileSCSwitch*) pEvent->MofData;
+ if ((pCSwitch->NewThreadId != 0) || (pCSwitch->OldThreadId != 0))
+ {
+ MicroProfileContextSwitch Switch;
+ Switch.nThreadOut = pCSwitch->OldThreadId;
+ Switch.nThreadIn = pCSwitch->NewThreadId;
+ Switch.nCpu = pEvent->BufferContext.ProcessorNumber;
+ Switch.nTicks = pEvent->Header.TimeStamp.QuadPart;
+ MicroProfileContextSwitchPut(&Switch);
+ }
+ }
+ }
+}
+
+ULONG WINAPI MicroProfileBufferCallback(PEVENT_TRACE_LOGFILE Buffer)
+{
+ return (S.bContextSwitchStop || !S.bContextSwitchRunning) ? 
FALSE : TRUE; +} + + +struct MicroProfileKernelTraceProperties : public EVENT_TRACE_PROPERTIES +{ + char dummy[sizeof(KERNEL_LOGGER_NAME)]; +}; + + +void MicroProfileTraceThread(int unused) +{ + + { + TRACEHANDLE SessionHandle = 0; + MicroProfileKernelTraceProperties sessionProperties; + + ZeroMemory(&sessionProperties, sizeof(sessionProperties)); + sessionProperties.Wnode.BufferSize = sizeof(sessionProperties); + sessionProperties.Wnode.Flags = WNODE_FLAG_TRACED_GUID; + sessionProperties.Wnode.ClientContext = 1; //QPC clock resolution + sessionProperties.Wnode.Guid = SystemTraceControlGuid; + sessionProperties.BufferSize = 1; + sessionProperties.NumberOfBuffers = 128; + sessionProperties.EnableFlags = EVENT_TRACE_FLAG_CSWITCH; + sessionProperties.LogFileMode = EVENT_TRACE_REAL_TIME_MODE; + sessionProperties.MaximumFileSize = 0; + sessionProperties.LoggerNameOffset = sizeof(EVENT_TRACE_PROPERTIES); + sessionProperties.LogFileNameOffset = 0; + + EVENT_TRACE_LOGFILE log; + ZeroMemory(&log, sizeof(log)); + log.LoggerName = KERNEL_LOGGER_NAME; + log.ProcessTraceMode = 0; + TRACEHANDLE hLog = OpenTrace(&log); + if (hLog) + { + ControlTrace(SessionHandle, KERNEL_LOGGER_NAME, &sessionProperties, EVENT_TRACE_CONTROL_STOP); + } + CloseTrace(hLog); + + + } + ULONG status = ERROR_SUCCESS; + TRACEHANDLE SessionHandle = 0; + MicroProfileKernelTraceProperties sessionProperties; + + ZeroMemory(&sessionProperties, sizeof(sessionProperties)); + sessionProperties.Wnode.BufferSize = sizeof(sessionProperties); + sessionProperties.Wnode.Flags = WNODE_FLAG_TRACED_GUID; + sessionProperties.Wnode.ClientContext = 1; //QPC clock resolution + sessionProperties.Wnode.Guid = SystemTraceControlGuid; + sessionProperties.BufferSize = 1; + sessionProperties.NumberOfBuffers = 128; + sessionProperties.EnableFlags = EVENT_TRACE_FLAG_CSWITCH|EVENT_TRACE_FLAG_PROCESS; + sessionProperties.LogFileMode = EVENT_TRACE_REAL_TIME_MODE; + sessionProperties.MaximumFileSize = 0; + sessionProperties.LoggerNameOffset = sizeof(EVENT_TRACE_PROPERTIES); + sessionProperties.LogFileNameOffset = 0; + + + status = StartTrace((PTRACEHANDLE) &SessionHandle, KERNEL_LOGGER_NAME, &sessionProperties); + + if (ERROR_SUCCESS != status) + { + S.bContextSwitchRunning = false; + return; + } + + EVENT_TRACE_LOGFILE log; + ZeroMemory(&log, sizeof(log)); + + log.LoggerName = KERNEL_LOGGER_NAME; + log.ProcessTraceMode = PROCESS_TRACE_MODE_REAL_TIME | PROCESS_TRACE_MODE_RAW_TIMESTAMP; + log.EventCallback = MicroProfileContextSwitchCallback; + log.BufferCallback = MicroProfileBufferCallback; + + TRACEHANDLE hLog = OpenTrace(&log); + ProcessTrace(&hLog, 1, 0, 0); + CloseTrace(hLog); + S.bContextSwitchRunning = false; +} + +void MicroProfileStartContextSwitchTrace() +{ + if(!S.bContextSwitchRunning) + { + if(!S.pContextSwitchThread) + S.pContextSwitchThread = new std::thread(); + if(S.pContextSwitchThread->joinable()) + { + S.bContextSwitchStop = true; + S.pContextSwitchThread->join(); + } + S.bContextSwitchRunning = true; + S.bContextSwitchStop = false; + *S.pContextSwitchThread = std::thread(&MicroProfileTraceThread, 0); + } +} + +void MicroProfileStopContextSwitchTrace() +{ + if(S.bContextSwitchRunning && S.pContextSwitchThread) + { + S.bContextSwitchStop = true; + S.pContextSwitchThread->join(); + } +} + +bool MicroProfileIsLocalThread(uint32_t nThreadId) +{ + HANDLE h = OpenThread(THREAD_QUERY_LIMITED_INFORMATION, FALSE, nThreadId); + if(h == NULL) + return false; + DWORD hProcess = GetProcessIdOfThread(h); + CloseHandle(h); + return GetCurrentProcessId() == 
hProcess; +} + +#else +#error "context switch trace not supported/implemented on platform" +#endif +#else + +bool MicroProfileIsLocalThread(uint32_t nThreadId){return false;} +void MicroProfileStopContextSwitchTrace(){} +void MicroProfileStartContextSwitchTrace(){} + +#endif + + +#undef S + +#ifdef _WIN32 +#pragma warning(pop) +#endif +#endif +#endif From 6486e0a48ee5ad5a7f74be048e1952a630ece55c Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 27 May 2014 22:54:40 -0700 Subject: [PATCH 115/184] Profiler skeleton. --- src/xenia/common.h | 1 + src/xenia/config.h | 2 + src/xenia/core/thread.cc | 4 + src/xenia/kernel/objects/xthread.cc | 4 + src/xenia/profiling.cc | 88 +++++++++++++++++ src/xenia/profiling.h | 135 +++++++++++++++++++++++++++ src/xenia/sources.gypi | 2 + tools/alloy-sandbox/alloy-sandbox.cc | 5 + tools/xenia-run/xenia-run.cc | 4 + xenia.gyp | 4 + 10 files changed, 249 insertions(+) create mode 100644 src/xenia/profiling.cc create mode 100644 src/xenia/profiling.h diff --git a/src/xenia/common.h b/src/xenia/common.h index ff16b03c3..68d9d2eb7 100644 --- a/src/xenia/common.h +++ b/src/xenia/common.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/src/xenia/config.h b/src/xenia/config.h index b83aa8715..0804277bf 100644 --- a/src/xenia/config.h +++ b/src/xenia/config.h @@ -27,6 +27,8 @@ #define XE_OPTION_LOG_KERNEL 1 #define XE_OPTION_LOG_FS 1 +// Enable profiling. +#define XE_OPTION_PROFILING 1 // TODO(benvanik): make this a runtime option #define XE_OPTION_OPTIMIZED 0 diff --git a/src/xenia/core/thread.cc b/src/xenia/core/thread.cc index 8a48d8267..8aace9ee4 100644 --- a/src/xenia/core/thread.cc +++ b/src/xenia/core/thread.cc @@ -79,7 +79,9 @@ static uint32_t __stdcall xe_thread_callback_win32(void* param) { } } + xe::Profiler::ThreadEnter(thread->name); thread->callback(thread->callback_param); + xe::Profiler::ThreadExit(); return 0; } #pragma warning(default : 6320; default : 6322) @@ -118,7 +120,9 @@ static void* xe_thread_callback_pthreads(void* param) { #else pthread_setname_np(pthread_self(), thread->name); #endif // OSX + xe::Profiler::ThreadEnter(thread->name); thread->callback(thread->callback_param); + xe::Profiler::ThreadExit(); return 0; } diff --git a/src/xenia/kernel/objects/xthread.cc b/src/xenia/kernel/objects/xthread.cc index 33f5aa378..8acce8b27 100644 --- a/src/xenia/kernel/objects/xthread.cc +++ b/src/xenia/kernel/objects/xthread.cc @@ -253,10 +253,12 @@ X_STATUS XThread::Exit(int exit_code) { static uint32_t __stdcall XThreadStartCallbackWin32(void* param) { XThread* thread = reinterpret_cast(param); + xe::Profiler::ThreadEnter(thread->name()); xeKeTlsSetValue(current_thread_tls, (uint64_t)thread); thread->Execute(); xeKeTlsSetValue(current_thread_tls, NULL); thread->Release(); + xe::Profiler::ThreadExit(); return 0; } @@ -293,10 +295,12 @@ X_STATUS XThread::PlatformExit(int exit_code) { static void* XThreadStartCallbackPthreads(void* param) { XThread* thread = reinterpret_cast(param); + xe::Profiler::ThreadEnter(thread->name()); xeKeTlsSetValue(current_thread_tls, (uint64_t)thread); thread->Execute(); xeKeTlsSetValue(current_thread_tls, NULL); thread->Release(); + xe::Profiler::ThreadExit(); return 0; } diff --git a/src/xenia/profiling.cc b/src/xenia/profiling.cc new file mode 100644 index 000000000..c7b5f2eb2 --- /dev/null +++ b/src/xenia/profiling.cc @@ -0,0 +1,88 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + 
******************************************************************************
+ * Copyright 2014 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#define MICRO_PROFILE_IMPL
+#include
+
+namespace xe {
+
+std::unique_ptr Profiler::display_ = nullptr;
+
+void Profiler::Dump() {
+  MicroProfileDumpTimers();
+}
+
+void Profiler::Shutdown() {
+  display_.reset();
+  MicroProfileShutdown();
+}
+
+void Profiler::ThreadEnter(const char* name) {
+  MicroProfileOnThreadCreate(name);
+}
+
+void Profiler::ThreadExit() {
+  MicroProfileOnThreadExit();
+}
+
+void Profiler::set_display(std::unique_ptr display) {
+  display_ = std::move(display);
+}
+
+void Profiler::Present() {
+  MicroProfileFlip();
+  if (!display_) {
+    return;
+  }
+
+  display_->Begin();
+  MicroProfileDraw(display_->width(), display_->height());
+  display_->End();
+}
+
+}  // namespace xe
+
+uint32_t MicroProfileGpuInsertTimeStamp() {
+  return 0;
+}
+
+uint64_t MicroProfileGpuGetTimeStamp(uint32_t nKey) {
+  return 0;
+}
+
+uint64_t MicroProfileTicksPerSecondGpu() {
+  return 0;
+}
+
+void MicroProfileDrawBox(int nX, int nY, int nX1, int nY1, uint32_t nColor, MicroProfileBoxType type) {
+  auto display = xe::Profiler::display();
+  if (!display) {
+    return;
+  }
+  display->DrawBox(
+      nX, nY, nX1, nY1,
+      nColor,
+      static_cast(type));
+}
+
+void MicroProfileDrawLine2D(uint32_t nVertices, float* pVertices, uint32_t nColor) {
+  auto display = xe::Profiler::display();
+  if (!display) {
+    return;
+  }
+  display->DrawLine2D(nVertices, pVertices, nColor);
+}
+
+void MicroProfileDrawText(int nX, int nY, uint32_t nColor, const char* pText, uint32_t nLen) {
+  auto display = xe::Profiler::display();
+  if (!display) {
+    return;
+  }
+  display->DrawText(nX, nY, nColor, pText, nLen);
+}
diff --git a/src/xenia/profiling.h b/src/xenia/profiling.h
new file mode 100644
index 000000000..8ab3d3169
--- /dev/null
+++ b/src/xenia/profiling.h
@@ -0,0 +1,135 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2014 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#ifndef XENIA_PROFILING_H_
+#define XENIA_PROFILING_H_
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#if XE_OPTION_PROFILING
+// Pollutes the global namespace. Yuck.
+#include
+#endif  // XE_OPTION_PROFILING
+
+namespace xe {
+
+#if XE_OPTION_PROFILING
+
+// Defines a profiling scope for CPU tasks.
+// Use `SCOPE_profile_cpu(name)` to activate the scope.
+#define DEFINE_profile_cpu(name, group_name, scope_name, color) \
+    MICROPROFILE_DEFINE(name, group_name, scope_name, color)
+
+// Declares a previously defined profile scope. Use in a translation unit.
+#define DECLARE_profile_cpu(name) MICROPROFILE_DECLARE(name)
+
+// Defines a profiling scope for GPU tasks.
+// Use `SCOPE_profile_gpu(name)` to activate the scope.
+#define DEFINE_profile_gpu(name, group_name, scope_name, color) \
+    MICROPROFILE_DEFINE_GPU(name, group_name, scope_name, color)
+
+// Declares a previously defined profile scope. Use in a translation unit.
+#define DECLARE_profile_gpu(name) MICROPROFILE_DECLARE_GPU(name) + +// Enters a previously defined CPU profiling scope, active for the duration +// of the containing block. +#define SCOPE_profile_cpu(name) \ + MICROPROFILE_SCOPE(name) + +// Enters a CPU profiling scope, active for the duration of the containing +// block. No previous definition required. +#define SCOPE_profile_cpu_i(group_name, scope_name, color) \ + MICROPROFILE_SCOPEI(group_name, scope_name, color) + +// Enters a previously defined GPU profiling scope, active for the duration +// of the containing block. +#define SCOPE_profile_gpu(name) \ + MICROPROFILE_SCOPEGPU(name) + +// Enters a GPU profiling scope, active for the duration of the containing +// block. No previous definition required. +#define SCOPE_profile_gpu_i(group_name, scope_name, color) \ + MICROPROFILE_SCOPEGPUI(group_name, scope_name, color) + +// Tracks a CPU value counter. +#define COUNT_profile_cpu(name, count) MICROPROFILE_META_CPU(name, count) + +// Tracks a GPU value counter. +#define COUNT_profile_gpu(name, count) MICROPROFILE_META_GPU(name, count) + +#else + +#define DEFINE_profile_cpu(name, group_name, scope_name, color) +#define DEFINE_profile_gpu(name, group_name, scope_name, color) +#define DECLARE_profile_cpu(name) +#define DECLARE_profile_gpu(name) +#define SCOPE_profile_cpu(name) do {} while (false) +#define SCOPE_profile_cpu_i(group_name, scope_name, color) do {} while (false) +#define SCOPE_profile_gpu(name) do {} while (false) +#define SCOPE_profile_gpu_i(group_name, scope_name, color) do {} while (false) +#define COUNT_profile_cpu(name, count) do {} while (false) +#define COUNT_profile_gpu(name, count) do {} while (false) + +#endif // XE_OPTION_PROFILING + +class ProfilerDisplay { +public: + enum BoxType { + BOX_TYPE_BAR = MicroProfileBoxTypeBar, + BOX_TYPE_FLAT = MicroProfileBoxTypeFlat, + }; + + virtual uint32_t width() const = 0; + virtual uint32_t height() const = 0; + + // TODO(benvanik): GPU timestamping. + + virtual void Begin() = 0; + virtual void End() = 0; + virtual void DrawBox(int x, int y, int x1, int y1, uint32_t color, BoxType type) = 0; + virtual void DrawLine2D(uint32_t count, float* vertices, uint32_t color) = 0; + virtual void DrawText(int x, int y, uint32_t color, const char* text, size_t text_length) = 0; +}; + +class Profiler { +public: + // Dumps data to stdout. + static void Dump(); + // Cleans up profiling, releasing all memory. + static void Shutdown(); + + // Activates the calling thread for profiling. + // This must be called immediately after launching a thread. + static void ThreadEnter(const char* name = nullptr); + // Deactivates the calling thread for profiling. + static void ThreadExit(); + + // Gets the current display, if any. + static ProfilerDisplay* display() { return display_.get(); } + // Initializes drawing with the given display. + static void set_display(std::unique_ptr display); + // Presents the profiler to the bound display, if any. + static void Present(); + + // TODO(benvanik): display mode/pause/etc? 
+ // TODO(benvanik): mouse, keys + +private: + static std::unique_ptr display_; +}; + +} // namespace xe + +#endif // XENIA_PROFILING_H_ diff --git a/src/xenia/sources.gypi b/src/xenia/sources.gypi index 5d2c066b0..0d20898e0 100644 --- a/src/xenia/sources.gypi +++ b/src/xenia/sources.gypi @@ -18,6 +18,8 @@ 'platform.cc', 'platform.h', 'platform_includes.h', + 'profiling.cc', + 'profiling.h', 'string.cc', 'string.h', 'types.h', diff --git a/tools/alloy-sandbox/alloy-sandbox.cc b/tools/alloy-sandbox/alloy-sandbox.cc index e7f6bb2d6..e37a1d3d2 100644 --- a/tools/alloy-sandbox/alloy-sandbox.cc +++ b/tools/alloy-sandbox/alloy-sandbox.cc @@ -24,6 +24,8 @@ using namespace xe::cpu; int alloy_sandbox(int argc, xechar_t** argv) { + xe::Profiler::ThreadEnter("main"); + XenonMemory* memory = new XenonMemory(); ExportResolver* export_resolver = new ExportResolver(); @@ -57,6 +59,9 @@ int alloy_sandbox(int argc, xechar_t** argv) { delete runtime; delete memory; + xe::Profiler::Dump(); + xe::Profiler::ThreadExit(); + return 0; } // ehhh diff --git a/tools/xenia-run/xenia-run.cc b/tools/xenia-run/xenia-run.cc index 59bc70a24..909e26dc8 100644 --- a/tools/xenia-run/xenia-run.cc +++ b/tools/xenia-run/xenia-run.cc @@ -22,6 +22,8 @@ DEFINE_string(target, "", int xenia_run(int argc, xechar_t** argv) { int result_code = 1; + Profiler::ThreadEnter("main"); + Emulator* emulator = NULL; // Grab path from the flag or unnamed argument. @@ -89,6 +91,8 @@ XECLEANUP: if (result_code) { XEFATAL("Failed to launch emulator: %d", result_code); } + Profiler::Dump(); + Profiler::Shutdown(); return result_code; } XE_MAIN_WINDOW_THUNK(xenia_run, XETEXT("xenia-run"), "xenia-run some.xex"); diff --git a/xenia.gyp b/xenia.gyp index a765e5c00..8933eb750 100644 --- a/xenia.gyp +++ b/xenia.gyp @@ -39,6 +39,8 @@ 'target_defaults': { 'include_dirs': [ 'include/', + 'third_party/', + '.', ], 'defines': [ @@ -242,6 +244,7 @@ 'user32', 'ole32', 'ntdll', + 'advapi32', ], }], ['OS == "mac"', { @@ -318,6 +321,7 @@ 'xinput', 'xaudio2', 'Shell32', + 'advapi32', ], }], ['OS == "mac"', { From cd56c3033444ae482ce353b3c4f811f3c1c54c50 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 27 May 2014 22:54:52 -0700 Subject: [PATCH 116/184] Fixing warnings in microprofile. 
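
MSVC x64 warns on these: strlen() returns size_t, so storing it in the
uint32_t length fields microprofile uses raises C4267 (possible loss of
data), and under UNICODE the OutputDebugString macro resolves to
OutputDebugStringW, which rejects a char buffer. A minimal sketch of the
pattern being silenced (names here are illustrative, not from the patch):

    const char* pName = "timer";
    uint32_t nNameLen = strlen(pName);          // warns: C4267 on x64
    uint32_t nFixed = (uint32_t)strlen(pName);  // explicit narrowing, as below

    char buffer[256] = "message";
    OutputDebugStringA(buffer);  // ANSI entry point regardless of UNICODE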
--- third_party/microprofile/microprofile.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/third_party/microprofile/microprofile.h b/third_party/microprofile/microprofile.h index f6b747d56..a2a840ea6 100644 --- a/third_party/microprofile/microprofile.h +++ b/third_party/microprofile/microprofile.h @@ -1108,7 +1108,7 @@ uint16_t MicroProfileGetGroup(const char* pGroup, MicroProfileTokenType Type) } uint16_t nGroupIndex = 0xffff; S.GroupInfo[S.nGroupCount].pName = pGroup; - S.GroupInfo[S.nGroupCount].nNameLen = strlen(pGroup); + S.GroupInfo[S.nGroupCount].nNameLen = (uint32_t)strlen(pGroup); S.GroupInfo[S.nGroupCount].nGroupIndex = S.nGroupCount; S.GroupInfo[S.nGroupCount].nNumTimers = 0; S.GroupInfo[S.nGroupCount].Type = Type; @@ -1137,7 +1137,7 @@ MicroProfileToken MicroProfileGetToken(const char* pGroup, const char* pName, ui S.nMaxGroupSize = MicroProfileMax(S.nMaxGroupSize, S.GroupInfo[nGroupIndex].nNumTimers); S.TimerInfo[nTimerIndex].nToken = nToken; S.TimerInfo[nTimerIndex].pName = pName; - S.TimerInfo[nTimerIndex].nNameLen = strlen(pName); + S.TimerInfo[nTimerIndex].nNameLen = (uint32_t)strlen(pName); S.TimerInfo[nTimerIndex].nColor = nColor&0xffffff; S.TimerInfo[nTimerIndex].nGroupIndex = nGroupIndex; return nToken; @@ -1592,8 +1592,8 @@ void MicroProfileDrawFloatWindow(uint32_t nX, uint32_t nY, const char** ppString { MicroProfileDrawBox(nX-MICROPROFILE_TEXT_WIDTH, nY, nX, nY + MICROPROFILE_TEXT_WIDTH, pColors[i]|0xff000000); } - MicroProfileDrawText(nX + 1, nY + 1, (uint32_t)-1, ppStrings[i0], strlen(ppStrings[i0])); - MicroProfileDrawText(nX + nWidth - nStringLengths[i0+1] * (MICROPROFILE_TEXT_WIDTH+1), nY + 1, (uint32_t)-1, ppStrings[i0+1], strlen(ppStrings[i0+1])); + MicroProfileDrawText(nX + 1, nY + 1, (uint32_t)-1, ppStrings[i0], (uint32_t)strlen(ppStrings[i0])); + MicroProfileDrawText(nX + nWidth - nStringLengths[i0+1] * (MICROPROFILE_TEXT_WIDTH+1), nY + 1, (uint32_t)-1, ppStrings[i0+1], (uint32_t)strlen(ppStrings[i0+1])); nY += (MICROPROFILE_TEXT_HEIGHT+1); } } @@ -1617,7 +1617,7 @@ void MicroProfileDrawTextBox(uint32_t nX, uint32_t nY, const char** ppStrings, u MicroProfileDrawBox(nX, nY, nX + nWidth, nY + nHeight, 0xff000000); for(uint32_t i = 0; i < nNumStrings; ++i) { - MicroProfileDrawText(nX + 1, nY + 1, (uint32_t)-1, ppStrings[i], strlen(ppStrings[i])); + MicroProfileDrawText(nX + 1, nY + 1, (uint32_t)-1, ppStrings[i], (uint32_t)strlen(ppStrings[i])); nY += (MICROPROFILE_TEXT_HEIGHT+1); } } @@ -1808,7 +1808,7 @@ void MicroProfileDebugDumpRange() (type == MP_LOG_ENTER ? 
"BEGIN" : "META"); snprintf(buffer, 255, "DUMP 0x%p: %s :: %llx: %s\n", pStart, pBegin, nTick, pTimerName); #ifdef _WIN32 - OutputDebugString(buffer); + OutputDebugStringA(buffer); #else printf("%s", buffer); #endif @@ -2078,7 +2078,7 @@ void MicroProfileDrawDetailedBars(uint32_t nWidth, uint32_t nHeight, int nBaseY, uint32_t nThreadColor = -1; if(pLog->nThreadId == nContextSwitchHoverThreadAfter || pLog->nThreadId == nContextSwitchHoverThreadBefore) nThreadColor = S.nHoverColorShared|0x906060; - MicroProfileDrawText(0, nY, nThreadColor, &ThreadName[0], strlen(&ThreadName[0])); + MicroProfileDrawText(0, nY, nThreadColor, &ThreadName[0], (uint32_t)strlen(&ThreadName[0])); nY += 3; nY += MICROPROFILE_TEXT_HEIGHT + 1; @@ -2440,7 +2440,7 @@ template void MicroProfileLoopActiveGroupsDraw(int32_t nX, int32_t nY, const char* pName, T CB) { if(pName) - MicroProfileDrawText(nX, nY, (uint32_t)-1, pName, strlen(pName)); + MicroProfileDrawText(nX, nY, (uint32_t)-1, pName, (uint32_t)strlen(pName)); nY += S.nBarHeight + 2; uint64_t nGroup = S.nActiveGroup = S.nMenuAllGroups ? S.nGroupMask : S.nMenuActiveGroup; @@ -2567,7 +2567,7 @@ uint32_t MicroProfileDrawBarCallCount(int32_t nX, int32_t nY, const char* pName) uint32_t MicroProfileDrawBarMetaCount(int32_t nX, int32_t nY, uint64_t* pCounters, const char* pName, uint32_t nTotalHeight) { MicroProfileDrawLineVertical(nX-5, nY, nTotalHeight, S.nOpacityBackground|g_nMicroProfileBackColors[0]|g_nMicroProfileBackColors[1]); - uint32_t nTextWidth = (1+MICROPROFILE_TEXT_WIDTH) * MicroProfileMax(6, strlen(pName)); + uint32_t nTextWidth = (1+MICROPROFILE_TEXT_WIDTH) * MicroProfileMax(6, (uint32_t)strlen(pName)); MicroProfileLoopActiveGroupsDraw(nX, nY, pName, @@ -2585,7 +2585,7 @@ uint32_t MicroProfileDrawBarLegend(int32_t nX, int32_t nY, uint32_t nTotalHeight MicroProfileDrawLineVertical(nX-5, nY, nTotalHeight, S.nOpacityBackground | g_nMicroProfileBackColors[0]|g_nMicroProfileBackColors[1]); MicroProfileLoopActiveGroupsDraw(nX, nY, 0, [](uint32_t nTimer, uint32_t nIdx, uint64_t nGroupMask, uint32_t nX, uint32_t nY){ - MicroProfileDrawText(nX, nY, S.TimerInfo[nTimer].nColor, S.TimerInfo[nTimer].pName, strlen(S.TimerInfo[nTimer].pName)); + MicroProfileDrawText(nX, nY, S.TimerInfo[nTimer].nColor, S.TimerInfo[nTimer].pName, (uint32_t)strlen(S.TimerInfo[nTimer].pName)); if(S.nMouseY >= nY && S.nMouseY < nY + MICROPROFILE_TEXT_HEIGHT+1 && S.nMouseX < nX + 20 * (MICROPROFILE_TEXT_WIDTH+1)) { S.nHoverToken = nTimer; @@ -3201,7 +3201,7 @@ void MicroProfileDrawMenu(uint32_t nWidth, uint32_t nHeight) S.nRunning = !S.nRunning; } } - MicroProfileDrawText(nX, nY, (uint32_t)-1, pMenuText[i], strlen(pMenuText[i])); + MicroProfileDrawText(nX, nY, (uint32_t)-1, pMenuText[i], (uint32_t)strlen(pMenuText[i])); nX += (nLen+1) * (MICROPROFILE_TEXT_WIDTH+1); } uint32_t nMenu = nSelectMenu != (uint32_t)-1 ? nSelectMenu : S.nActiveMenu; From beb9bd11f03c7a7428c1b487f7d25b7169a03541 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Wed, 28 May 2014 13:59:43 -0700 Subject: [PATCH 117/184] Microprofile integration. 
--- src/xenia/cpu/processor.cc | 2 + src/xenia/gpu/d3d11/d3d11_graphics_system.cc | 8 +- src/xenia/gpu/d3d11/d3d11_profiler_display.cc | 630 ++++++++++++++++++ src/xenia/gpu/d3d11/d3d11_profiler_display.h | 86 +++ src/xenia/gpu/d3d11/d3d11_window.cc | 14 + src/xenia/gpu/d3d11/d3d11_window.h | 2 + src/xenia/gpu/d3d11/sources.gypi | 2 + src/xenia/kernel/objects/xthread.cc | 4 + src/xenia/kernel/xboxkrnl_rtl.cc | 3 +- src/xenia/logging.cc | 2 + src/xenia/profiling.cc | 57 ++ src/xenia/profiling.h | 45 +- src/xenia/ui/ui_event.h | 12 + src/xenia/ui/win32/win32_window.cc | 5 +- src/xenia/ui/window.h | 3 + tools/alloy-sandbox/alloy-sandbox.cc | 1 + tools/xenia-run/xenia-run.cc | 1 + 17 files changed, 864 insertions(+), 13 deletions(-) create mode 100644 src/xenia/gpu/d3d11/d3d11_profiler_display.cc create mode 100644 src/xenia/gpu/d3d11/d3d11_profiler_display.h diff --git a/src/xenia/cpu/processor.cc b/src/xenia/cpu/processor.cc index db11ef2ab..b77664482 100644 --- a/src/xenia/cpu/processor.cc +++ b/src/xenia/cpu/processor.cc @@ -193,6 +193,8 @@ uint64_t Processor::Execute( uint64_t Processor::ExecuteInterrupt( uint32_t cpu, uint64_t address, uint64_t arg0, uint64_t arg1) { + SCOPE_profile_cpu_f("cpu"); + // Acquire lock on interrupt thread (we can only dispatch one at a time). xe_mutex_lock(interrupt_thread_lock_); diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_system.cc b/src/xenia/gpu/d3d11/d3d11_graphics_system.cc index f1432dc50..03c91038c 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_system.cc +++ b/src/xenia/gpu/d3d11/d3d11_graphics_system.cc @@ -24,6 +24,12 @@ namespace { void __stdcall D3D11GraphicsSystemVsyncCallback( D3D11GraphicsSystem* gs, BOOLEAN) { + static bool thread_name_set = false; + if (!thread_name_set) { + thread_name_set = true; + Profiler::ThreadEnter("VsyncTimer"); + } + gs->MarkVblank(); gs->DispatchInterruptCallback(0); } @@ -53,7 +59,7 @@ void D3D11GraphicsSystem::Initialize() { (WAITORTIMERCALLBACK)D3D11GraphicsSystemVsyncCallback, this, 16, - 100, + 16, WT_EXECUTEINTIMERTHREAD); // Create DXGI factory so we can get a swap chain/etc. diff --git a/src/xenia/gpu/d3d11/d3d11_profiler_display.cc b/src/xenia/gpu/d3d11/d3d11_profiler_display.cc new file mode 100644 index 000000000..b0632b21f --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_profiler_display.cc @@ -0,0 +1,630 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include + +#include +#include + +#include + + +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::d3d11; + + +namespace { +const uint8_t profiler_font[] = { + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x10,0x78,0x38,0x78,0x7c,0x7c,0x3c,0x44,0x38,0x04,0x44,0x40,0x44,0x44,0x38,0x78, + 0x38,0x78,0x38,0x7c,0x44,0x44,0x44,0x44,0x44,0x7c,0x00,0x00,0x40,0x00,0x04,0x00, + 0x18,0x00,0x40,0x10,0x08,0x40,0x30,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x10,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x38,0x10,0x38,0x7c,0x08,0x7c,0x1c,0x7c,0x38,0x38, + 0x10,0x28,0x28,0x10,0x00,0x20,0x10,0x08,0x10,0x10,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x04,0x00,0x20,0x38,0x38,0x70,0x00,0x1c,0x10,0x00,0x1c,0x10,0x70,0x30,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x28,0x44,0x44,0x44,0x40,0x40,0x40,0x44,0x10,0x04,0x48,0x40,0x6c,0x44,0x44,0x44, + 0x44,0x44,0x44,0x10,0x44,0x44,0x44,0x44,0x44,0x04,0x00,0x00,0x40,0x00,0x04,0x00, + 0x24,0x00,0x40,0x00,0x00,0x40,0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x10,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x44,0x30,0x44,0x04,0x18,0x40,0x20,0x04,0x44,0x44, + 0x10,0x28,0x28,0x3c,0x44,0x50,0x10,0x10,0x08,0x54,0x10,0x00,0x00,0x00,0x04,0x00, + 0x00,0x08,0x00,0x10,0x44,0x44,0x40,0x40,0x04,0x28,0x00,0x30,0x10,0x18,0x58,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x44,0x44,0x40,0x44,0x40,0x40,0x40,0x44,0x10,0x04,0x50,0x40,0x54,0x64,0x44,0x44, + 0x44,0x44,0x40,0x10,0x44,0x44,0x44,0x28,0x28,0x08,0x00,0x38,0x78,0x3c,0x3c,0x38, + 0x20,0x38,0x78,0x30,0x18,0x44,0x10,0x6c,0x78,0x38,0x78,0x3c,0x5c,0x3c,0x3c,0x44, + 0x44,0x44,0x44,0x44,0x7c,0x00,0x4c,0x10,0x04,0x08,0x28,0x78,0x40,0x08,0x44,0x44, + 0x10,0x00,0x7c,0x50,0x08,0x50,0x00,0x20,0x04,0x38,0x10,0x00,0x00,0x00,0x08,0x10, + 0x10,0x10,0x7c,0x08,0x08,0x54,0x40,0x20,0x04,0x44,0x00,0x30,0x10,0x18,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x44,0x78,0x40,0x44,0x78,0x78,0x40,0x7c,0x10,0x04,0x60,0x40,0x54,0x54,0x44,0x78, + 0x44,0x78,0x38,0x10,0x44,0x44,0x54,0x10,0x10,0x10,0x00,0x04,0x44,0x40,0x44,0x44, + 0x78,0x44,0x44,0x10,0x08,0x48,0x10,0x54,0x44,0x44,0x44,0x44,0x60,0x40,0x10,0x44, + 0x44,0x44,0x28,0x44,0x08,0x00,0x54,0x10,0x18,0x18,0x48,0x04,0x78,0x10,0x38,0x3c, + 0x10,0x00,0x28,0x38,0x10,0x20,0x00,0x20,0x04,0x10,0x7c,0x00,0x7c,0x00,0x10,0x00, + 0x00,0x20,0x00,0x04,0x10,0x5c,0x40,0x10,0x04,0x00,0x00,0x60,0x10,0x0c,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x7c,0x44,0x40,0x44,0x40,0x40,0x4c,0x44,0x10,0x04,0x50,0x40,0x44,0x4c,0x44,0x40, + 0x54,0x50,0x04,0x10,0x44,0x44,0x54,0x28,0x10,0x20,0x00,0x3c,0x44,0x40,0x44,0x7c, + 0x20,0x44,0x44,0x10,0x08,0x70,0x10,0x54,0x44,0x44,0x44,0x44,0x40,0x38,0x10,0x44, + 0x44,0x54,0x10,0x44,0x10,0x00,0x64,0x10,0x20,0x04,0x7c,0x04,0x44,0x20,0x44,0x04, + 0x10,0x00,0x7c,0x14,0x20,0x54,0x00,0x20,0x04,0x38,0x10,0x10,0x00,0x00,0x20,0x10, + 0x10,0x10,0x7c,0x08,0x10,0x58,0x40,0x08,0x04,0x00,0x00,0x30,0x10,0x18,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x44,0x44,0x44,0x44,0x40,0x40,0x44,0x44,0x10,0x44,0x48,0x40,0x44,0x44,0x44,0x40, + 0x48,0x48,0x44,0x10,0x44,0x28,0x6c,0x44,0x10,0x40,0x00,0x44,0x44,0x40,0x44,0x40, + 0x20,0x3c,0x44,0x10,0x08,0x48,0x10,0x54,0x44,0x44,0x44,0x44,0x40,0x04,0x12,0x4c, + 0x28,0x54,0x28,0x3c,0x20,0x00,0x44,0x10,0x40,0x44,0x08,0x44,0x44,0x20,0x44,0x08, + 0x00,0x00,0x28,0x78,0x44,0x48,0x00,0x10,0x08,0x54,0x10,0x10,0x00,0x00,0x40,0x00, + 0x10,0x08,0x00,0x10,0x00,0x40,0x40,0x04,0x04,0x00,0x00,0x30,0x10,0x18,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x44,0x78,0x38,0x78,0x7c,0x40,0x3c,0x44,0x38,0x38,0x44,0x7c,0x44,0x44,0x38,0x40, + 0x34,0x44,0x38,0x10,0x38,0x10,0x44,0x44,0x10,0x7c,0x00,0x3c,0x78,0x3c,0x3c,0x3c, + 0x20,0x04,0x44,0x38,0x48,0x44,0x38,0x44,0x44,0x38,0x78,0x3c,0x40,0x78,0x0c,0x34, + 0x10,0x6c,0x44,0x04,0x7c,0x00,0x38,0x38,0x7c,0x38,0x08,0x38,0x38,0x20,0x38,0x70, + 0x10,0x00,0x28,0x10,0x00,0x34,0x00,0x08,0x10,0x10,0x00,0x20,0x00,0x10,0x00,0x00, + 0x20,0x04,0x00,0x20,0x10,0x3c,0x70,0x00,0x1c,0x00,0x7c,0x1c,0x10,0x70,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x38,0x00,0x00,0x30,0x00,0x00,0x00,0x00,0x00,0x40,0x04,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +}; + +const char* shader_code = " \ +cbuffer MatrixBuffer {\n \ + float4x4 projection_matrix;\n \ +};\n \ +Texture2D texture0;\n \ +SamplerState sampler0;\n \ +struct Vertex {\n \ + float2 position : POSITION0;\n \ + float2 tex : TEXCOORD0;\n \ + float4 color : COLOR0;\n \ +};\n \ +struct Pixel {\n \ + float4 position : SV_POSITION;\n \ + float2 tex : TEXCOORD0;\n \ + float4 color : COLOR0;\n \ +};\n \ +Pixel vs(Vertex v) {\n \ + Pixel p;\n \ + p.position = float4(mul(float4(v.position, 0.0f, 1.0f), projection_matrix).xy - float2(1.0f, -1.0f), 0.0f, 1.0f);\n \ + p.tex = v.tex;\n \ + p.color = v.color;\n \ + return p;\n \ +}\n \ +float4 ps(Pixel p) : SV_TARGET {\n \ + if (p.tex.x > 1.0f) {\n \ + return float4(p.color.rgb, 0.5f);\n \ + } else {\n \ + float4 sample = texture0.Sample(sampler0, p.tex);\n \ + 
if(sample.w < 0.5f) {\n \ + discard;\n \ + }\n \ + return p.color * sample;\n \ + }\n \ +}\n"; + +} // namespace + + +D3D11ProfilerDisplay::D3D11ProfilerDisplay(D3D11Window* window) : window_(window) { + draw_state_ = { 0 }; + if (!SetupState() || + !SetupShaders() || + !SetupFont()) { + // Hrm. + XEASSERTALWAYS(); + } + + // Pass through mouse events. + window->mouse_down.AddListener([](xe::ui::MouseEvent& e) { + Profiler::OnMouseDown( + e.button() == xe::ui::MouseEvent::MOUSE_BUTTON_LEFT, + e.button() == xe::ui::MouseEvent::MOUSE_BUTTON_RIGHT); + }); + window->mouse_up.AddListener([](xe::ui::MouseEvent& e) { + Profiler::OnMouseUp(); + }); + window->mouse_move.AddListener([](xe::ui::MouseEvent& e) { + Profiler::OnMouseMove(e.x(), e.y()); + }); + window->mouse_wheel.AddListener([](xe::ui::MouseEvent& e) { + Profiler::OnMouseWheel(e.x(), e.y(), -e.dy()); + }); + + // Watch for toggle/mode keys and such. + window->key_down.AddListener([](xe::ui::KeyEvent& e) { + Profiler::OnKeyDown(e.key_code()); + }); + window->key_up.AddListener([](xe::ui::KeyEvent& e) { + Profiler::OnKeyUp(e.key_code()); + }); +} + +bool D3D11ProfilerDisplay::SetupState() { + HRESULT hr; + auto device = window_->device(); + + D3D11_BLEND_DESC blend_desc; + xe_zero_struct(&blend_desc, sizeof(blend_desc)); + blend_desc.RenderTarget[0].BlendEnable = true; + blend_desc.RenderTarget[0].BlendOp = D3D11_BLEND_OP_ADD; + blend_desc.RenderTarget[0].BlendOpAlpha = D3D11_BLEND_OP_ADD; + blend_desc.RenderTarget[0].SrcBlend = D3D11_BLEND_SRC_ALPHA; + blend_desc.RenderTarget[0].SrcBlendAlpha = D3D11_BLEND_ZERO; + blend_desc.RenderTarget[0].DestBlend = D3D11_BLEND_INV_SRC_ALPHA; + blend_desc.RenderTarget[0].DestBlendAlpha = D3D11_BLEND_ZERO; + blend_desc.RenderTarget[0].RenderTargetWriteMask = 0x0F; + hr = device->CreateBlendState(&blend_desc, &blend_state_); + XEASSERT(SUCCEEDED(hr)); + + D3D11_DEPTH_STENCIL_DESC depth_stencil_desc; + xe_zero_struct(&depth_stencil_desc, sizeof(depth_stencil_desc)); + depth_stencil_desc.DepthEnable = false; + depth_stencil_desc.StencilEnable = false; + depth_stencil_desc.DepthWriteMask = D3D11_DEPTH_WRITE_MASK_ZERO; + hr = device->CreateDepthStencilState(&depth_stencil_desc, &depth_stencil_state_); + XEASSERT(SUCCEEDED(hr)); + + return true; +} + +bool D3D11ProfilerDisplay::SetupShaders() { + HRESULT hr; + auto device = window_->device(); + + ID3DBlob* vs_code_blob = nullptr; + ID3DBlob* vs_errors = nullptr; + hr = D3DCompile( + shader_code, xestrlena(shader_code), + "D3D11ProfilerDisplay.vs", + nullptr, + nullptr, + "vs", + "vs_5_0", + D3DCOMPILE_ENABLE_STRICTNESS, + 0, + &vs_code_blob, + &vs_errors); + if (FAILED(hr)) { + XELOGE("Failed to compile profiler vs: %s", + reinterpret_cast(vs_errors->GetBufferPointer())); + return false; + } + hr = device->CreateVertexShader(vs_code_blob->GetBufferPointer(), + vs_code_blob->GetBufferSize(), + nullptr, + &vertex_shader_); + if (FAILED(hr)) { + XELOGE("Failed to create profiler vs"); + return false; + } + ID3DBlob* ps_code_blob = nullptr; + ID3DBlob* ps_errors = nullptr; + hr = D3DCompile( + shader_code, xestrlena(shader_code), + "D3D11ProfilerDisplay.ps", + nullptr, + nullptr, + "ps", + "ps_5_0", + D3DCOMPILE_ENABLE_STRICTNESS, + 0, + &ps_code_blob, + &ps_errors); + if (FAILED(hr)) { + XELOGE("Failed to compile profiler ps: %s", + reinterpret_cast(ps_errors->GetBufferPointer())); + return false; + } + hr = device->CreatePixelShader(ps_code_blob->GetBufferPointer(), + ps_code_blob->GetBufferSize(), + nullptr, + &pixel_shader_); + if (FAILED(hr)) { + 
XELOGE("Failed to create profiler ps"); + return false; + } + + D3D11_BUFFER_DESC buffer_desc = { 0 }; + buffer_desc.Usage = D3D11_USAGE_DYNAMIC; + buffer_desc.BindFlags = D3D11_BIND_CONSTANT_BUFFER; + buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; + buffer_desc.ByteWidth = sizeof(float) * 16; + hr = device->CreateBuffer(&buffer_desc, nullptr, &shader_constants_); + if (FAILED(hr)) { + XELOGE("Failed to create profiler constant buffer"); + return false; + } + + D3D11_INPUT_ELEMENT_DESC element_descs[] = { + { "POSITION", 0, DXGI_FORMAT_R32G32_FLOAT, 0, 0, D3D11_INPUT_PER_VERTEX_DATA, 0, }, + { "TEXCOORD", 0, DXGI_FORMAT_R32G32_FLOAT, 0, D3D11_APPEND_ALIGNED_ELEMENT, D3D11_INPUT_PER_VERTEX_DATA, 0, }, + { "COLOR", 0, DXGI_FORMAT_R8G8B8A8_UNORM, 0, D3D11_APPEND_ALIGNED_ELEMENT, D3D11_INPUT_PER_VERTEX_DATA, 0, }, + }; + hr = device->CreateInputLayout(element_descs, (UINT)XECOUNT(element_descs), + vs_code_blob->GetBufferPointer(), + vs_code_blob->GetBufferSize(), + &shader_layout_); + if (FAILED(hr)) { + XELOGE("Failed to create profiler input layout"); + return false; + } + + buffer_desc.Usage = D3D11_USAGE_DYNAMIC; + buffer_desc.BindFlags = D3D11_BIND_VERTEX_BUFFER; + buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; + buffer_desc.ByteWidth = sizeof(draw_state_.vertex_buffer); + hr = device->CreateBuffer(&buffer_desc, nullptr, &vertex_buffer_); + if (FAILED(hr)) { + XELOGE("Failed to create profiler vertex buffer"); + return false; + } + + return true; +} + +bool D3D11ProfilerDisplay::SetupFont() { + HRESULT hr; + auto device = window_->device(); + + // Setup font lookup table. + for (uint32_t i = 0; i < XECOUNT(font_description_.char_offsets); ++i) { + font_description_.char_offsets[i] = 206; + } + for (uint32_t i = 'A'; i <= 'Z'; ++i) { + font_description_.char_offsets[i] = (i-'A')*8+1; + } + for (uint32_t i = 'a'; i <= 'z'; ++i) { + font_description_.char_offsets[i] = (i-'a')*8+217; + } + for (uint32_t i = '0'; i <= '9'; ++i) { + font_description_.char_offsets[i] = (i-'0')*8+433; + } + for (uint32_t i = '!'; i <= '/'; ++i) { + font_description_.char_offsets[i] = (i-'!')*8+513; + } + for (uint32_t i = ':'; i <= '@'; ++i) { + font_description_.char_offsets[i] = (i-':')*8+625+8; + } + for (uint32_t i = '['; i <= '_'; ++i) { + font_description_.char_offsets[i] = (i-'[')*8+681+8; + } + for (uint32_t i = '{'; i <= '~'; ++i) { + font_description_.char_offsets[i] = (i-'{')*8+721+8; + } + + // Unpack font bitmap into an RGBA texture. + const int FONT_TEX_X = 1024; + const int FONT_TEX_Y = 9; + const int UNPACKED_SIZE = FONT_TEX_X * FONT_TEX_Y * 4; + uint32_t unpacked[UNPACKED_SIZE]; + int idx = 0; + int end = FONT_TEX_X * FONT_TEX_Y / 8; + for (int i = 0; i < end; i++) { + uint8_t b = profiler_font[i]; + for (int j = 0; j < 8; ++j) { + unpacked[idx++] = b & 0x80 ? 
0xFFFFFFFFu : 0;
+      b <<= 1;
+    }
+  }
+
+  D3D11_TEXTURE2D_DESC texture_desc = { 0 };
+  texture_desc.Width = FONT_TEX_X;
+  texture_desc.Height = FONT_TEX_Y;
+  texture_desc.MipLevels = 1;
+  texture_desc.ArraySize = 1;
+  texture_desc.Format = DXGI_FORMAT_R8G8B8A8_UNORM;
+  texture_desc.SampleDesc.Count = 1;
+  texture_desc.SampleDesc.Quality = 0;
+  texture_desc.Usage = D3D11_USAGE_IMMUTABLE;
+  texture_desc.BindFlags = D3D11_BIND_SHADER_RESOURCE;
+  texture_desc.CPUAccessFlags = 0;
+  texture_desc.MiscFlags = 0;
+  D3D11_SUBRESOURCE_DATA initial_data = { 0 };
+  initial_data.pSysMem = unpacked;
+  initial_data.SysMemPitch = FONT_TEX_X * 4;
+  initial_data.SysMemSlicePitch = 0;
+  ID3D11Texture2D* font_texture = nullptr;
+  hr = device->CreateTexture2D(&texture_desc, &initial_data, &font_texture);
+  if (FAILED(hr)) {
+    XELOGE("Unable to create profiler font texture");
+    return false;
+  }
+
+  D3D11_SHADER_RESOURCE_VIEW_DESC texture_view_desc;
+  xe_zero_struct(&texture_view_desc, sizeof(texture_view_desc));
+  texture_view_desc.Format = texture_desc.Format;
+  texture_view_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D;
+  texture_view_desc.Texture2D.MipLevels = 1;
+  texture_view_desc.Texture2D.MostDetailedMip = 0;
+  hr = device->CreateShaderResourceView(
+      font_texture, &texture_view_desc, &font_texture_view_);
+  XESAFERELEASE(font_texture);
+  if (FAILED(hr)) {
+    XELOGE("Unable to create profiler font texture view");
+    return false;
+  }
+
+  D3D11_SAMPLER_DESC sampler_desc;
+  xe_zero_struct(&sampler_desc, sizeof(sampler_desc));
+  sampler_desc.Filter = D3D11_ENCODE_BASIC_FILTER(
+      D3D11_FILTER_TYPE_POINT, D3D11_FILTER_TYPE_POINT,
+      D3D11_FILTER_TYPE_POINT, false);
+  sampler_desc.AddressU = D3D11_TEXTURE_ADDRESS_CLAMP;
+  sampler_desc.AddressV = D3D11_TEXTURE_ADDRESS_CLAMP;
+  sampler_desc.AddressW = D3D11_TEXTURE_ADDRESS_CLAMP;
+  sampler_desc.MipLODBias = 0.0f;
+  sampler_desc.MaxAnisotropy = 1;
+  sampler_desc.ComparisonFunc = D3D11_COMPARISON_ALWAYS;
+  sampler_desc.BorderColor[0] = 0.0f;
+  sampler_desc.BorderColor[1] = 0.0f;
+  sampler_desc.BorderColor[2] = 0.0f;
+  sampler_desc.BorderColor[3] = 0.0f;
+  sampler_desc.MinLOD = 0.0f;
+  sampler_desc.MaxLOD = 0.0f;
+  hr = device->CreateSamplerState(
+      &sampler_desc, &font_sampler_state_);
+  if (FAILED(hr)) {
+    XELOGE("Unable to create profiler font sampler state");
+    return false;
+  }
+
+  return true;
+}
+
+D3D11ProfilerDisplay::~D3D11ProfilerDisplay() {
+  XESAFERELEASE(blend_state_);
+  XESAFERELEASE(depth_stencil_state_);
+  XESAFERELEASE(vertex_shader_);
+  XESAFERELEASE(pixel_shader_);
+  XESAFERELEASE(shader_constants_);
+  XESAFERELEASE(shader_layout_);
+  XESAFERELEASE(font_texture_view_);
+  XESAFERELEASE(font_sampler_state_);
+  XESAFERELEASE(vertex_buffer_);
+}
+
+uint32_t D3D11ProfilerDisplay::width() const {
+  return window_->width();
+}
+
+uint32_t D3D11ProfilerDisplay::height() const {
+  return window_->height();
+}
+
+void D3D11ProfilerDisplay::Begin() {
+  auto context = window_->context();
+
+  // Setup projection matrix.
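+  // Classic orthographic projection over the window rectangle (left/top
+  // at 0, right/bottom at the window size, so y points down): the
+  // 2/(r-l) and 2/(t-b) terms scale pixels into the 2-unit clip range,
+  // and the -(r+l)/(r-l)-style terms recenter it on the origin.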
+ float left = 0.0f; + float right = (float)width(); + float bottom = (float)height(); + float top = 0.0f; + float z_near = -1.0f; + float z_far = 1.0f; + float projection[16] = { 0 }; + projection[0] = 2.0f / (right - left); + projection[5] = 2.0f / (top - bottom); + projection[10] = -2.0f / (z_far - z_near); + projection[12] = -(right + left) / (right - left); + projection[13] = -(top + bottom) / (top - bottom); + projection[14] = -(z_far + z_near) / (z_far - z_near); + projection[15] = 1.0f; + D3D11_MAPPED_SUBRESOURCE res; + context->Map(shader_constants_, 0, D3D11_MAP_WRITE_DISCARD, 0, &res); + memcpy(res.pData, projection, sizeof(projection)); + context->Unmap(shader_constants_, 0); + + // Setup state. + context->OMSetBlendState(blend_state_, { 0 }, 0xFFFFFFFF); + context->OMSetDepthStencilState(depth_stencil_state_, 0); + + // Bind shaders. + context->GSSetShader(nullptr, nullptr, 0); + context->VSSetShader(vertex_shader_, nullptr, 0); + context->VSSetConstantBuffers(0, 1, &shader_constants_); + context->PSSetShader(pixel_shader_, nullptr, 0); + context->PSSetSamplers(0, 1, &font_sampler_state_); + context->PSSetConstantBuffers(0, 1, &shader_constants_); + context->PSSetShaderResources(0, 1, &font_texture_view_); + context->IASetInputLayout(shader_layout_); +} + +void D3D11ProfilerDisplay::End() { + Flush(); +} + +D3D11ProfilerDisplay::Vertex* D3D11ProfilerDisplay::AllocateVertices( + D3D_PRIMITIVE_TOPOLOGY primitive, size_t count) { + if (draw_state_.vertex_index + count > XECOUNT(draw_state_.vertex_buffer)) { + Flush(); + } + XEASSERT(draw_state_.vertex_index + count <= XECOUNT(draw_state_.vertex_buffer)); + + size_t head = draw_state_.vertex_index; + draw_state_.vertex_index += count; + + if (draw_state_.command_index && + draw_state_.commands[draw_state_.command_index - 1].primitive == primitive) { + draw_state_.commands[draw_state_.command_index - 1].vertex_count += count; + } else { + XEASSERT(draw_state_.command_index < XECOUNT(draw_state_.commands)); + draw_state_.commands[draw_state_.command_index].primitive = primitive; + draw_state_.commands[draw_state_.command_index].vertex_count = count; + ++draw_state_.command_index; + } + return &draw_state_.vertex_buffer[head]; +} + +void D3D11ProfilerDisplay::Flush() { + auto context = window_->context(); + if (!draw_state_.vertex_index) { + return; + } + + D3D11_MAPPED_SUBRESOURCE res; + context->Map(vertex_buffer_, 0, D3D11_MAP_WRITE_DISCARD, 0, &res); + memcpy(res.pData, draw_state_.vertex_buffer, sizeof(Vertex) * draw_state_.vertex_index); + context->Unmap(vertex_buffer_, 0); + + uint32_t stride = 20; + uint32_t offset = 0; + context->IASetVertexBuffers(0, 1, &vertex_buffer_, &stride, &offset); + + size_t vertex_index = 0; + for (int i = 0; i < draw_state_.command_index; ++i) { + size_t count = draw_state_.commands[i].vertex_count; + context->IASetPrimitiveTopology(draw_state_.commands[i].primitive); + context->Draw((UINT)count, (UINT)vertex_index); + vertex_index += count; + } + + draw_state_.vertex_index = 0; + draw_state_.command_index = 0; +} + +#define Q0(d, member, v) d[0].member = v +#define Q1(d, member, v) d[1].member = v; d[3].member = v +#define Q2(d, member, v) d[4].member = v +#define Q3(d, member, v) d[2].member = v; d[5].member = v + +void D3D11ProfilerDisplay::DrawBox( + int x, int y, int x1, int y1, uint32_t color, BoxType type) { + Vertex* v = AllocateVertices(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST, 6); + uint32_t color0; + uint32_t color1; + if (type == BOX_TYPE_FLAT) { + color0 = 0xFF000000 | + ((color & 0xFF) 
<< 16) | + (color & 0xFF00FF00) | + ((color >> 16) & 0xFF); + color1 = color0; + } else { + uint32_t r = 0xFF & (color >> 16); + uint32_t g = 0xFF & (color >> 8); + uint32_t b = 0xFF & color; + uint32_t max_c = MAX(MAX(MAX(r, g), b), 30u); + uint32_t min_c = MIN(MIN(MIN(r, g), b), 180u); + uint32_t r0 = 0xFF & ((r + max_c)/2); + uint32_t g0 = 0xFF & ((g + max_c)/2); + uint32_t b0 = 0xFF & ((b + max_c)/2); + uint32_t r1 = 0xFF & ((r + min_c) / 2); + uint32_t g1 = 0xFF & ((g + min_c) / 2); + uint32_t b1 = 0xFF & ((b + min_c) / 2); + color0 = r0 | (g0 << 8) | (b0 << 16) | (0xFF000000 & color); + color1 = r1 | (g1 << 8) | (b1 << 16) | (0xFF000000 & color); + } + Q0(v, x, (float)x); + Q0(v, y, (float)y); + Q0(v, color, color0); + Q0(v, u, 2.0f); + Q0(v, v, 2.0f); + Q1(v, x, (float)x1); + Q1(v, y, (float)y); + Q1(v, color, color0); + Q1(v, u, 3.0f); + Q1(v, v, 2.0f); + Q2(v, x, (float)x1); + Q2(v, y, (float)y1); + Q2(v, color, color1); + Q2(v, u, 3.0f); + Q2(v, v, 3.0f); + Q3(v, x, (float)x); + Q3(v, y, (float)y1); + Q3(v, color, color1); + Q3(v, u, 2.0f); + Q3(v, v, 3.0f); +} + +void D3D11ProfilerDisplay::DrawLine2D( + uint32_t count, float* vertices, uint32_t color) { + if (!count || !vertices) { + return; + } + color = 0xFF000000 | + ((color & 0xFF) << 16) | + (color & 0xFF00FF00) | + ((color >> 16) & 0xFF); + Vertex* v = AllocateVertices(D3D11_PRIMITIVE_TOPOLOGY_LINELIST, 2 * (count - 1)); + for (uint32_t i = 0; i < count - 1; ++i) { + v[0].x = vertices[i * 2]; + v[0].y = vertices[i * 2 + 1]; + v[0].color = color; + v[0].u = 2.0f; + v[0].v = 2.0f; + v[1].x = vertices[(i + 1) * 2]; + v[1].y = vertices[(i + 1) * 2 + 1] ; + v[1].color = color; + v[1].u = 2.0f; + v[1].v = 2.0f; + v += 2; + } +} + +void D3D11ProfilerDisplay::DrawText( + int x, int y, uint32_t color, const char* text, size_t text_length) { + const float offset_u = 5.0f / 1024.0f; + float fx = (float)x; + float fy = (float)y; + float fy2 = fy + (MICROPROFILE_TEXT_HEIGHT + 1); + color = 0xFF000000 | + ((color & 0xFF) << 16) | + (color & 0xFF00FF00) | + ((color >> 16) & 0xFF); + Vertex* v = AllocateVertices(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST, 6 * text_length); + const char* s = text; + for (uint32_t j = 0; j < text_length; ++j) { + int16_t nOffset = font_description_.char_offsets[(int)*s++]; + float fOffset = nOffset / 1024.f; + Q0(v, x, fx); + Q0(v, y, fy); + Q0(v, color, color); + Q0(v, u, fOffset); + Q0(v, v, 0.0f); + Q1(v, x, fx + MICROPROFILE_TEXT_WIDTH); + Q1(v, y, fy); + Q1(v, color, color); + Q1(v, u, fOffset + offset_u); + Q1(v, v, 0.0f); + Q2(v, x, fx + MICROPROFILE_TEXT_WIDTH); + Q2(v, y, fy2); + Q2(v, color, color); + Q2(v, u, fOffset + offset_u); + Q2(v, v, 1.0f); + Q3(v, x, fx); + Q3(v, y, fy2); + Q3(v, color, color); + Q3(v, u, fOffset); + Q3(v, v, 1.0f); + fx += MICROPROFILE_TEXT_WIDTH + 1; + v += 6; + } +} diff --git a/src/xenia/gpu/d3d11/d3d11_profiler_display.h b/src/xenia/gpu/d3d11/d3d11_profiler_display.h new file mode 100644 index 000000000..fd9f970f9 --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_profiler_display.h @@ -0,0 +1,86 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_GPU_D3D11_D3D11_PROFILER_DISPLAY_H_ +#define XENIA_GPU_D3D11_D3D11_PROFILER_DISPLAY_H_ + +#include + +#include + + +namespace xe { +namespace gpu { +namespace d3d11 { + +class D3D11Window; + + +class D3D11ProfilerDisplay : public ProfilerDisplay { +public: + D3D11ProfilerDisplay(D3D11Window* window); + virtual ~D3D11ProfilerDisplay(); + + uint32_t width() const override; + uint32_t height() const override; + + // TODO(benvanik): GPU timestamping. + + void Begin() override; + void End() override; + void DrawBox(int x, int y, int x1, int y1, uint32_t color, BoxType type) override; + void DrawLine2D(uint32_t count, float* vertices, uint32_t color) override; + void DrawText(int x, int y, uint32_t color, const char* text, size_t text_length) override; + +private: + bool SetupState(); + bool SetupShaders(); + bool SetupFont(); + + struct Vertex { + float x, y; + float u, v; + uint32_t color; + }; + struct { + size_t vertex_index; + Vertex vertex_buffer[16 << 10]; + struct { + D3D11_PRIMITIVE_TOPOLOGY primitive; + size_t vertex_count; + } commands[32]; + size_t command_index; + } draw_state_; + Vertex* AllocateVertices(D3D_PRIMITIVE_TOPOLOGY primitive, size_t count); + void Flush(); + + D3D11Window* window_; + ID3D11BlendState* blend_state_; + ID3D11DepthStencilState* depth_stencil_state_; + ID3D11VertexShader* vertex_shader_; + ID3D11PixelShader* pixel_shader_; + ID3D11Buffer* shader_constants_; + ID3D11InputLayout* shader_layout_; + ID3D11ShaderResourceView* font_texture_view_; + ID3D11SamplerState* font_sampler_state_; + ID3D11Buffer* vertex_buffer_; + + struct { + uint16_t char_offsets[256]; + } font_description_; +}; + + + +} // namespace d3d11 +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_D3D11_D3D11_PROFILER_DISPLAY_H_ diff --git a/src/xenia/gpu/d3d11/d3d11_window.cc b/src/xenia/gpu/d3d11/d3d11_window.cc index 930149694..64a9b0df6 100644 --- a/src/xenia/gpu/d3d11/d3d11_window.cc +++ b/src/xenia/gpu/d3d11/d3d11_window.cc @@ -9,6 +9,8 @@ #include +#include + using namespace xe; using namespace xe::gpu; @@ -36,6 +38,7 @@ D3D11Window::D3D11Window( } D3D11Window::~D3D11Window() { + Profiler::set_display(nullptr); if (context_) { context_->ClearState(); } @@ -100,10 +103,21 @@ int D3D11Window::Initialize(const char* title, uint32_t width, uint32_t height) } context_->OMSetRenderTargets(1, &render_target_view_, NULL); + // Setup profiler display. + if (Profiler::is_enabled()) { + std::unique_ptr profiler_display( + new D3D11ProfilerDisplay(this)); + Profiler::set_display(std::move(profiler_display)); + } + return 0; } void D3D11Window::Swap() { + // Present profiler. + context_->OMSetRenderTargets(1, &render_target_view_, NULL); + Profiler::Present(); + // Swap buffers. // TODO(benvanik): control vsync with flag. 
bool vsync = true; diff --git a/src/xenia/gpu/d3d11/d3d11_window.h b/src/xenia/gpu/d3d11/d3d11_window.h index f9e47723d..df470df0f 100644 --- a/src/xenia/gpu/d3d11/d3d11_window.h +++ b/src/xenia/gpu/d3d11/d3d11_window.h @@ -29,7 +29,9 @@ public: IDXGIFactory1* dxgi_factory, ID3D11Device* device); virtual ~D3D11Window(); + ID3D11Device* device() const { return device_; } IDXGISwapChain* swap_chain() const { return swap_chain_; } + ID3D11DeviceContext* context() const { return context_; } virtual int Initialize(const char* title, uint32_t width, uint32_t height); diff --git a/src/xenia/gpu/d3d11/sources.gypi b/src/xenia/gpu/d3d11/sources.gypi index b1ad47ff4..6e0b193f5 100644 --- a/src/xenia/gpu/d3d11/sources.gypi +++ b/src/xenia/gpu/d3d11/sources.gypi @@ -10,6 +10,8 @@ 'd3d11_graphics_driver.h', 'd3d11_graphics_system.cc', 'd3d11_graphics_system.h', + 'd3d11_profiler_display.cc', + 'd3d11_profiler_display.h', 'd3d11_shader.cc', 'd3d11_shader.h', 'd3d11_shader_cache.cc', diff --git a/src/xenia/kernel/objects/xthread.cc b/src/xenia/kernel/objects/xthread.cc index 8acce8b27..7a4c8f5d3 100644 --- a/src/xenia/kernel/objects/xthread.cc +++ b/src/xenia/kernel/objects/xthread.cc @@ -231,6 +231,10 @@ X_STATUS XThread::Create() { return return_code; } + char thread_name[32]; + xesnprintfa(thread_name, XECOUNT(thread_name), "XThread%04X", handle()); + set_name(thread_name); + module->Release(); return X_STATUS_SUCCESS; } diff --git a/src/xenia/kernel/xboxkrnl_rtl.cc b/src/xenia/kernel/xboxkrnl_rtl.cc index 18310e6a7..9fb0bcdbd 100644 --- a/src/xenia/kernel/xboxkrnl_rtl.cc +++ b/src/xenia/kernel/xboxkrnl_rtl.cc @@ -633,9 +633,10 @@ spin: cs->recursion_count = 1; } - SHIM_CALL RtlEnterCriticalSection_shim( PPCContext* ppc_state, KernelState* state) { + SCOPE_profile_cpu_f("kernel"); + uint32_t cs_ptr = SHIM_GET_ARG_32(0); XELOGD("RtlEnterCriticalSection(%.8X)", cs_ptr); diff --git a/src/xenia/logging.cc b/src/xenia/logging.cc index 14b19aaf2..3f9d840f9 100644 --- a/src/xenia/logging.cc +++ b/src/xenia/logging.cc @@ -58,6 +58,8 @@ void xe_format_log_line( void xe_log_line(const char* file_path, const uint32_t line_number, const char* function_name, const char level_char, const char* fmt, ...) 
{
+  SCOPE_profile_cpu_i("emu", "log_line");
+
   char buffer[2048];
   va_list args;
   va_start(args, fmt);
diff --git a/src/xenia/profiling.cc b/src/xenia/profiling.cc
index c7b5f2eb2..b72eba457 100644
--- a/src/xenia/profiling.cc
+++ b/src/xenia/profiling.cc
@@ -8,12 +8,18 @@
  */
 
 #define MICRO_PROFILE_IMPL
+#define MICROPROFILE_USE_THREAD_NAME_CALLBACK 1
 #include
 
 namespace xe {
 
 std::unique_ptr Profiler::display_ = nullptr;
 
+void Profiler::Initialize() {
+  MicroProfileInit();
+  MicroProfileSetDisplayMode(2);
+}
+
 void Profiler::Dump() {
   MicroProfileDumpTimers();
 }
@@ -23,6 +29,12 @@ void Profiler::Shutdown() {
   MicroProfileShutdown();
 }
 
+uint32_t Profiler::GetColor(const char* str) {
+  std::hash fn;
+  size_t value = fn(str);
+  return value & 0xFFFFFF;
+}
+
 void Profiler::ThreadEnter(const char* name) {
   MicroProfileOnThreadCreate(name);
 }
@@ -31,6 +43,47 @@ void Profiler::ThreadExit() {
   MicroProfileOnThreadExit();
 }
 
+bool Profiler::OnKeyDown(int key_code) {
+  // http://msdn.microsoft.com/en-us/library/windows/desktop/dd375731(v=vs.85).aspx
+  switch (key_code) {
+    case VK_TAB:
+      MicroProfileToggleDisplayMode();
+      return true;
+    case VK_OEM_3:  // `
+      MicroProfileTogglePause();
+      return true;
+    case 0x31:  // 1
+      MicroProfileModKey(1);
+      return true;
+  }
+  return false;
+}
+
+bool Profiler::OnKeyUp(int key_code) {
+  switch (key_code) {
+    case 0x31:  // 1
+      MicroProfileModKey(0);
+      return true;
+  }
+  return false;
+}
+
+void Profiler::OnMouseDown(bool left_button, bool right_button) {
+  MicroProfileMouseButton(left_button, right_button);
+}
+
+void Profiler::OnMouseUp() {
+  MicroProfileMouseButton(0, 0);
+}
+
+void Profiler::OnMouseMove(int x, int y) {
+  MicroProfileMousePosition(x, y, 0);
+}
+
+void Profiler::OnMouseWheel(int x, int y, int dy) {
+  MicroProfileMousePosition(x, y, dy);
+}
+
 void Profiler::set_display(std::unique_ptr display) {
   display_ = std::move(display);
 }
@@ -60,6 +113,10 @@ uint64_t MicroProfileTicksPerSecondGpu() {
   return 0;
 }
 
+const char* MicroProfileGetThreadName() {
+  return "TODO: get thread name!";
+}
+
 void MicroProfileDrawBox(int nX, int nY, int nX1, int nY1, uint32_t nColor, MicroProfileBoxType type) {
   auto display = xe::Profiler::display();
   if (!display) {
     return;
   }
diff --git a/src/xenia/profiling.h b/src/xenia/profiling.h
index 8ab3d3169..bf66c68a5 100644
--- a/src/xenia/profiling.h
+++ b/src/xenia/profiling.h
@@ -29,16 +29,16 @@ namespace xe {
 
 // Defines a profiling scope for CPU tasks.
 // Use `SCOPE_profile_cpu(name)` to activate the scope.
-#define DEFINE_profile_cpu(name, group_name, scope_name, color) \
-    MICROPROFILE_DEFINE(name, group_name, scope_name, color)
+#define DEFINE_profile_cpu(name, group_name, scope_name) \
+    MICROPROFILE_DEFINE(name, group_name, scope_name, xe::Profiler::GetColor(scope_name))
 
 // Declares a previously defined profile scope. Use in a translation unit.
 #define DECLARE_profile_cpu(name) MICROPROFILE_DECLARE(name)
 
 // Defines a profiling scope for GPU tasks.
 // Use `SCOPE_profile_gpu(name)` to activate the scope.
-#define DEFINE_profile_gpu(name, group_name, scope_name, color) \
-    MICROPROFILE_DEFINE_GPU(name, group_name, scope_name, color)
+#define DEFINE_profile_gpu(name, group_name, scope_name) \
+    MICROPROFILE_DEFINE_GPU(name, group_name, scope_name, xe::Profiler::GetColor(scope_name))
 
 // Declares a previously defined profile scope. Use in a translation unit.
 #define DECLARE_profile_gpu(name) MICROPROFILE_DECLARE_GPU(name)
 
@@ -50,8 +50,13 @@ namespace xe {
 
 // Enters a CPU profiling scope, active for the duration of the containing
 // block. 
No previous definition required. -#define SCOPE_profile_cpu_i(group_name, scope_name, color) \ - MICROPROFILE_SCOPEI(group_name, scope_name, color) +#define SCOPE_profile_cpu_i(group_name, scope_name) \ + MICROPROFILE_SCOPEI(group_name, scope_name, xe::Profiler::GetColor(scope_name)) + +// Enters a CPU profiling scope by function name, active for the duration of +// the containing block. No previous definition required. +#define SCOPE_profile_cpu_f(group_name) \ + MICROPROFILE_SCOPEI(group_name, XE_CURRENT_FUNCTION, xe::Profiler::GetColor(XE_CURRENT_FUNCTION)) // Enters a previously defined GPU profiling scope, active for the duration // of the containing block. @@ -60,8 +65,13 @@ namespace xe { // Enters a GPU profiling scope, active for the duration of the containing // block. No previous definition required. -#define SCOPE_profile_gpu_i(group_name, scope_name, color) \ - MICROPROFILE_SCOPEGPUI(group_name, scope_name, color) +#define SCOPE_profile_gpu_i(group_name, scope_name) \ + MICROPROFILE_SCOPEGPUI(group_name, scope_name, xe::Profiler::GetColor(scope_name)) + +// Enters a GPU profiling scope by function name, active for the duration of +// the containing block. No previous definition required. +#define SCOPE_profile_gpu_f(group_name) \ + MICROPROFILE_SCOPEGPUI(group_name, XE_CURRENT_FUNCTION, xe::Profiler::GetColor(XE_CURRENT_FUNCTION)) // Tracks a CPU value counter. #define COUNT_profile_cpu(name, count) MICROPROFILE_META_CPU(name, count) @@ -105,17 +115,35 @@ public: class Profiler { public: +#if XE_OPTION_PROFILING + static bool is_enabled() { return true; } +#else + static bool is_enabled() { return false; } +#endif // XE_OPTION_PROFILING + + // Initializes the profiler. Call at startup. + static void Initialize(); // Dumps data to stdout. static void Dump(); // Cleans up profiling, releasing all memory. static void Shutdown(); + // Computes a color from the given string. + static uint32_t GetColor(const char* str); + // Activates the calling thread for profiling. // This must be called immediately after launching a thread. static void ThreadEnter(const char* name = nullptr); // Deactivates the calling thread for profiling. static void ThreadExit(); + static bool OnKeyDown(int key_code); + static bool OnKeyUp(int key_code); + static void OnMouseDown(bool left_button, bool right_button); + static void OnMouseUp(); + static void OnMouseMove(int x, int y); + static void OnMouseWheel(int x, int y, int dy); + // Gets the current display, if any. static ProfilerDisplay* display() { return display_.get(); } // Initializes drawing with the given display. @@ -124,7 +152,6 @@ public: static void Present(); // TODO(benvanik): display mode/pause/etc? 
- // TODO(benvanik): mouse, keys private: static std::unique_ptr display_; diff --git a/src/xenia/ui/ui_event.h b/src/xenia/ui/ui_event.h index 0c973b86f..07fae340e 100644 --- a/src/xenia/ui/ui_event.h +++ b/src/xenia/ui/ui_event.h @@ -32,6 +32,18 @@ private: Window* window_; }; +class KeyEvent : public UIEvent { +public: + KeyEvent(Window* window, int key_code) : + key_code_(key_code), + UIEvent(window) {} + virtual ~KeyEvent() {} + + int key_code() const { return key_code_; } + +private: + int key_code_; +}; class MouseEvent : public UIEvent { public: diff --git a/src/xenia/ui/win32/win32_window.cc b/src/xenia/ui/win32/win32_window.cc index 4bf5cc0cc..5e4b12738 100644 --- a/src/xenia/ui/win32/win32_window.cc +++ b/src/xenia/ui/win32/win32_window.cc @@ -281,12 +281,13 @@ bool Win32Window::HandleMouse(UINT message, WPARAM wParam, LPARAM lParam) { } bool Win32Window::HandleKeyboard(UINT message, WPARAM wParam, LPARAM lParam) { + auto e = KeyEvent(this, (int)wParam); switch (message) { case WM_KEYDOWN: - (byte)wParam; + key_down(e); return true; case WM_KEYUP: - (byte)wParam; + key_up(e); return true; default: return false; diff --git a/src/xenia/ui/window.h b/src/xenia/ui/window.h index 49fcbf17a..cdc7b2c39 100644 --- a/src/xenia/ui/window.h +++ b/src/xenia/ui/window.h @@ -48,6 +48,9 @@ public: alloy::Delegate closing; alloy::Delegate closed; + alloy::Delegate key_down; + alloy::Delegate key_up; + alloy::Delegate mouse_down; alloy::Delegate mouse_move; alloy::Delegate mouse_up; diff --git a/tools/alloy-sandbox/alloy-sandbox.cc b/tools/alloy-sandbox/alloy-sandbox.cc index e37a1d3d2..038580c69 100644 --- a/tools/alloy-sandbox/alloy-sandbox.cc +++ b/tools/alloy-sandbox/alloy-sandbox.cc @@ -24,6 +24,7 @@ using namespace xe::cpu; int alloy_sandbox(int argc, xechar_t** argv) { + Profiler::Initialize(); xe::Profiler::ThreadEnter("main"); XenonMemory* memory = new XenonMemory(); diff --git a/tools/xenia-run/xenia-run.cc b/tools/xenia-run/xenia-run.cc index 909e26dc8..9f2151d8c 100644 --- a/tools/xenia-run/xenia-run.cc +++ b/tools/xenia-run/xenia-run.cc @@ -22,6 +22,7 @@ DEFINE_string(target, "", int xenia_run(int argc, xechar_t** argv) { int result_code = 1; + Profiler::Initialize(); Profiler::ThreadEnter("main"); Emulator* emulator = NULL; From c1812406f52085f26e0ee8a1dedb4f2277daedba Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Wed, 28 May 2014 19:19:39 -0700 Subject: [PATCH 118/184] Adding a bunch of profiling tracers. 
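
Each hot entry point gets a function-scoped tracer so compiler passes,
backends, and subsystems show up as named timers in the profiler. The
pattern, as applied throughout (SomePass here is a stand-in for any of
the files below; the scope name itself comes from XE_CURRENT_FUNCTION):

    int SomePass::Run(HIRBuilder* builder) {
      SCOPE_profile_cpu_f("alloy");  // times the entire function body
      // ... existing pass logic ...
      return 0;
    }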
--- src/alloy/backend/x64/x64_assembler.cc | 2 + src/alloy/backend/x64/x64_code_cache.cc | 2 + src/alloy/backend/x64/x64_emitter.cc | 2 + src/alloy/compiler/compiler.cc | 2 + .../passes/constant_propagation_pass.cc | 2 + .../compiler/passes/context_promotion_pass.cc | 2 + .../passes/control_flow_analysis_pass.cc | 2 + .../passes/data_flow_analysis_pass.cc | 2 + .../passes/dead_code_elimination_pass.cc | 2 + .../compiler/passes/finalization_pass.cc | 2 + .../passes/register_allocation_pass.cc | 2 + .../compiler/passes/simplification_pass.cc | 2 + src/alloy/compiler/passes/validation_pass.cc | 2 + .../compiler/passes/value_reduction_pass.cc | 2 + src/alloy/frontend/ppc/ppc_hir_builder.cc | 2 + src/alloy/frontend/ppc/ppc_scanner.cc | 4 ++ src/alloy/frontend/ppc/ppc_translator.cc | 2 + src/alloy/hir/hir_builder.cc | 4 ++ src/alloy/runtime/entry_table.cc | 2 + src/alloy/runtime/function.cc | 2 + src/alloy/runtime/module.cc | 4 ++ src/alloy/runtime/runtime.cc | 8 ++++ src/xenia/apu/audio_system.cc | 38 ++++++++++++------- src/xenia/apu/xaudio2/xaudio2_audio_driver.cc | 2 + src/xenia/cpu/processor.cc | 6 +++ src/xenia/gpu/d3d11/d3d11_geometry_shader.cc | 7 ++++ src/xenia/gpu/d3d11/d3d11_graphics_driver.cc | 35 +++++++++++++++++ src/xenia/gpu/d3d11/d3d11_graphics_system.cc | 3 ++ src/xenia/gpu/d3d11/d3d11_shader.cc | 7 ++++ src/xenia/gpu/d3d11/d3d11_shader_cache.cc | 1 + src/xenia/gpu/d3d11/d3d11_window.cc | 2 + src/xenia/gpu/ring_buffer_worker.cc | 2 + src/xenia/gpu/shader_cache.cc | 2 + src/xenia/hid/input_system.cc | 8 ++++ 34 files changed, 156 insertions(+), 13 deletions(-) diff --git a/src/alloy/backend/x64/x64_assembler.cc b/src/alloy/backend/x64/x64_assembler.cc index 5a7028e11..d70afe909 100644 --- a/src/alloy/backend/x64/x64_assembler.cc +++ b/src/alloy/backend/x64/x64_assembler.cc @@ -66,6 +66,8 @@ int X64Assembler::Assemble( FunctionInfo* symbol_info, HIRBuilder* builder, uint32_t debug_info_flags, DebugInfo* debug_info, Function** out_function) { + SCOPE_profile_cpu_f("alloy"); + int result = 0; // Lower HIR -> x64. diff --git a/src/alloy/backend/x64/x64_code_cache.cc b/src/alloy/backend/x64/x64_code_cache.cc index 7282c2e23..9d1c2ce60 100644 --- a/src/alloy/backend/x64/x64_code_cache.cc +++ b/src/alloy/backend/x64/x64_code_cache.cc @@ -75,6 +75,8 @@ int X64CodeCache::Initialize() { void* X64CodeCache::PlaceCode(void* machine_code, size_t code_size, size_t stack_size) { + SCOPE_profile_cpu_f("alloy"); + // Add unwind info into the allocation size. Keep things 16b aligned. code_size += XEROUNDUP(X64CodeChunk::UNWIND_INFO_SIZE, 16); diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index 58ac912d9..6616b0d52 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -77,6 +77,8 @@ int X64Emitter::Emit( HIRBuilder* builder, uint32_t debug_info_flags, runtime::DebugInfo* debug_info, void*& out_code_address, size_t& out_code_size) { + SCOPE_profile_cpu_f("alloy"); + // Reset. if (debug_info_flags & DEBUG_INFO_SOURCE_MAP) { source_map_count_ = 0; diff --git a/src/alloy/compiler/compiler.cc b/src/alloy/compiler/compiler.cc index a28f6b48b..62c6e5a4b 100644 --- a/src/alloy/compiler/compiler.cc +++ b/src/alloy/compiler/compiler.cc @@ -49,6 +49,8 @@ void Compiler::Reset() { } int Compiler::Compile(HIRBuilder* builder) { + SCOPE_profile_cpu_f("alloy"); + // TODO(benvanik): sophisticated stuff. Run passes in parallel, run until they // stop changing things, etc. 
for (auto it = passes_.begin(); it != passes_.end(); ++it) { diff --git a/src/alloy/compiler/passes/constant_propagation_pass.cc b/src/alloy/compiler/passes/constant_propagation_pass.cc index 5804ed218..f8430c509 100644 --- a/src/alloy/compiler/passes/constant_propagation_pass.cc +++ b/src/alloy/compiler/passes/constant_propagation_pass.cc @@ -23,6 +23,8 @@ ConstantPropagationPass::~ConstantPropagationPass() { } int ConstantPropagationPass::Run(HIRBuilder* builder) { + SCOPE_profile_cpu_f("alloy"); + // Once ContextPromotion has run there will likely be a whole slew of // constants that can be pushed through the function. // Example: diff --git a/src/alloy/compiler/passes/context_promotion_pass.cc b/src/alloy/compiler/passes/context_promotion_pass.cc index c880c4f0e..dc225aea6 100644 --- a/src/alloy/compiler/passes/context_promotion_pass.cc +++ b/src/alloy/compiler/passes/context_promotion_pass.cc @@ -51,6 +51,8 @@ int ContextPromotionPass::Initialize(Compiler* compiler) { } int ContextPromotionPass::Run(HIRBuilder* builder) { + SCOPE_profile_cpu_f("alloy"); + // Like mem2reg, but because context memory is unaliasable it's easier to // check and convert LoadContext/StoreContext into value operations. // Example of load->value promotion: diff --git a/src/alloy/compiler/passes/control_flow_analysis_pass.cc b/src/alloy/compiler/passes/control_flow_analysis_pass.cc index 9c1abf118..5cf6ea6a6 100644 --- a/src/alloy/compiler/passes/control_flow_analysis_pass.cc +++ b/src/alloy/compiler/passes/control_flow_analysis_pass.cc @@ -30,6 +30,8 @@ ControlFlowAnalysisPass::~ControlFlowAnalysisPass() { } int ControlFlowAnalysisPass::Run(HIRBuilder* builder) { + SCOPE_profile_cpu_f("alloy"); + // TODO(benvanik): reset edges for all blocks? Needed to be re-runnable. // Add edges. diff --git a/src/alloy/compiler/passes/data_flow_analysis_pass.cc b/src/alloy/compiler/passes/data_flow_analysis_pass.cc index 2a44f076d..209410016 100644 --- a/src/alloy/compiler/passes/data_flow_analysis_pass.cc +++ b/src/alloy/compiler/passes/data_flow_analysis_pass.cc @@ -36,6 +36,8 @@ DataFlowAnalysisPass::~DataFlowAnalysisPass() { } int DataFlowAnalysisPass::Run(HIRBuilder* builder) { + SCOPE_profile_cpu_f("alloy"); + // Linearize blocks so that we can detect cycles and propagate dependencies. uint32_t block_count = LinearizeBlocks(builder); diff --git a/src/alloy/compiler/passes/dead_code_elimination_pass.cc b/src/alloy/compiler/passes/dead_code_elimination_pass.cc index d295cebec..afb8d87b2 100644 --- a/src/alloy/compiler/passes/dead_code_elimination_pass.cc +++ b/src/alloy/compiler/passes/dead_code_elimination_pass.cc @@ -23,6 +23,8 @@ DeadCodeEliminationPass::~DeadCodeEliminationPass() { } int DeadCodeEliminationPass::Run(HIRBuilder* builder) { + SCOPE_profile_cpu_f("alloy"); + // ContextPromotion/DSE will likely leave around a lot of dead statements. // Code generated for comparison/testing produces many unused statements and // with proper use analysis it should be possible to remove most of them: diff --git a/src/alloy/compiler/passes/finalization_pass.cc b/src/alloy/compiler/passes/finalization_pass.cc index 7f827da15..e6358f242 100644 --- a/src/alloy/compiler/passes/finalization_pass.cc +++ b/src/alloy/compiler/passes/finalization_pass.cc @@ -30,6 +30,8 @@ FinalizationPass::~FinalizationPass() { } int FinalizationPass::Run(HIRBuilder* builder) { + SCOPE_profile_cpu_f("alloy"); + // Process the HIR and prepare it for lowering. // After this is done the HIR should be ready for emitting. 
diff --git a/src/alloy/compiler/passes/register_allocation_pass.cc b/src/alloy/compiler/passes/register_allocation_pass.cc index a89e1415c..7c3a0a7a9 100644 --- a/src/alloy/compiler/passes/register_allocation_pass.cc +++ b/src/alloy/compiler/passes/register_allocation_pass.cc @@ -59,6 +59,8 @@ RegisterAllocationPass::~RegisterAllocationPass() { } int RegisterAllocationPass::Run(HIRBuilder* builder) { + SCOPE_profile_cpu_f("alloy"); + // Simple per-block allocator that operates on SSA form. // Registers do not move across blocks, though this could be // optimized with some intra-block analysis (dominators/etc). diff --git a/src/alloy/compiler/passes/simplification_pass.cc b/src/alloy/compiler/passes/simplification_pass.cc index 14cea8681..7fc53c940 100644 --- a/src/alloy/compiler/passes/simplification_pass.cc +++ b/src/alloy/compiler/passes/simplification_pass.cc @@ -23,6 +23,8 @@ SimplificationPass::~SimplificationPass() { } int SimplificationPass::Run(HIRBuilder* builder) { + SCOPE_profile_cpu_f("alloy"); + EliminateConversions(builder); SimplifyAssignments(builder); return 0; diff --git a/src/alloy/compiler/passes/validation_pass.cc b/src/alloy/compiler/passes/validation_pass.cc index bc77ab482..265c82fe9 100644 --- a/src/alloy/compiler/passes/validation_pass.cc +++ b/src/alloy/compiler/passes/validation_pass.cc @@ -30,6 +30,8 @@ ValidationPass::~ValidationPass() { } int ValidationPass::Run(HIRBuilder* builder) { + SCOPE_profile_cpu_f("alloy"); + StringBuffer str; builder->Dump(&str); printf(str.GetString()); diff --git a/src/alloy/compiler/passes/value_reduction_pass.cc b/src/alloy/compiler/passes/value_reduction_pass.cc index 4eb61a38b..94453e294 100644 --- a/src/alloy/compiler/passes/value_reduction_pass.cc +++ b/src/alloy/compiler/passes/value_reduction_pass.cc @@ -53,6 +53,8 @@ void ValueReductionPass::ComputeLastUse(Value* value) { } int ValueReductionPass::Run(HIRBuilder* builder) { + SCOPE_profile_cpu_f("alloy"); + // Walk each block and reuse variable ordinals as much as possible. llvm::BitVector ordinals(builder->max_value_ordinal()); diff --git a/src/alloy/frontend/ppc/ppc_hir_builder.cc b/src/alloy/frontend/ppc/ppc_hir_builder.cc index 1b254ea4e..a8bec8435 100644 --- a/src/alloy/frontend/ppc/ppc_hir_builder.cc +++ b/src/alloy/frontend/ppc/ppc_hir_builder.cc @@ -44,6 +44,8 @@ void PPCHIRBuilder::Reset() { } int PPCHIRBuilder::Emit(FunctionInfo* symbol_info, bool with_debug_info) { + SCOPE_profile_cpu_f("alloy"); + Memory* memory = frontend_->memory(); const uint8_t* p = memory->membase(); diff --git a/src/alloy/frontend/ppc/ppc_scanner.cc b/src/alloy/frontend/ppc/ppc_scanner.cc index f75229b9e..9658bd595 100644 --- a/src/alloy/frontend/ppc/ppc_scanner.cc +++ b/src/alloy/frontend/ppc/ppc_scanner.cc @@ -38,6 +38,8 @@ bool PPCScanner::IsRestGprLr(uint64_t address) { } int PPCScanner::FindExtents(FunctionInfo* symbol_info) { + SCOPE_profile_cpu_f("alloy"); + // This is a simple basic block analyizer. It walks the start address to the // end address looking for branches. Each span of instructions between // branches is considered a basic block. 
When the last blr (that has no @@ -286,6 +288,8 @@ int PPCScanner::FindExtents(FunctionInfo* symbol_info) { } std::vector PPCScanner::FindBlocks(FunctionInfo* symbol_info) { + SCOPE_profile_cpu_f("alloy"); + Memory* memory = frontend_->memory(); const uint8_t* p = memory->membase(); diff --git a/src/alloy/frontend/ppc/ppc_translator.cc b/src/alloy/frontend/ppc/ppc_translator.cc index 61617db33..4f879336c 100644 --- a/src/alloy/frontend/ppc/ppc_translator.cc +++ b/src/alloy/frontend/ppc/ppc_translator.cc @@ -86,6 +86,8 @@ int PPCTranslator::Translate( FunctionInfo* symbol_info, uint32_t debug_info_flags, Function** out_function) { + SCOPE_profile_cpu_f("alloy"); + // Scan the function to find its extents. We only need to do this if we // haven't already been provided with them from some other source. if (!symbol_info->has_end_address()) { diff --git a/src/alloy/hir/hir_builder.cc b/src/alloy/hir/hir_builder.cc index f93a310e8..158e08224 100644 --- a/src/alloy/hir/hir_builder.cc +++ b/src/alloy/hir/hir_builder.cc @@ -51,6 +51,8 @@ void HIRBuilder::Reset() { } int HIRBuilder::Finalize() { + SCOPE_profile_cpu_f("alloy"); + // Scan blocks in order and add fallthrough branches. These are needed for // analysis passes to work. We may have also added blocks out of order and // need to ensure they fall through in the right order. @@ -141,6 +143,8 @@ void HIRBuilder::DumpOp( } void HIRBuilder::Dump(StringBuffer* str) { + SCOPE_profile_cpu_f("alloy"); + if (attributes_) { str->Append("; attributes = %.8X\n", attributes_); } diff --git a/src/alloy/runtime/entry_table.cc b/src/alloy/runtime/entry_table.cc index cf6da5d70..ebec56ea4 100644 --- a/src/alloy/runtime/entry_table.cc +++ b/src/alloy/runtime/entry_table.cc @@ -75,6 +75,8 @@ Entry::Status EntryTable::GetOrCreate(uint64_t address, Entry** out_entry) { } std::vector EntryTable::FindWithAddress(uint64_t address) { + SCOPE_profile_cpu_f("alloy"); + std::vector fns; LockMutex(lock_); for (auto it = map_.begin(); it != map_.end(); ++it) { diff --git a/src/alloy/runtime/function.cc b/src/alloy/runtime/function.cc index 853808d53..2dd0ddce5 100644 --- a/src/alloy/runtime/function.cc +++ b/src/alloy/runtime/function.cc @@ -74,6 +74,8 @@ Breakpoint* Function::FindBreakpoint(uint64_t address) { } int Function::Call(ThreadState* thread_state, uint64_t return_address) { + SCOPE_profile_cpu_f("alloy"); + ThreadState* original_thread_state = ThreadState::Get(); if (original_thread_state != thread_state) { ThreadState::Bind(thread_state); diff --git a/src/alloy/runtime/module.cc b/src/alloy/runtime/module.cc index ea056e0dd..5e38c3902 100644 --- a/src/alloy/runtime/module.cc +++ b/src/alloy/runtime/module.cc @@ -161,6 +161,8 @@ SymbolInfo::Status Module::DefineVariable(VariableInfo* symbol_info) { } void Module::ForEachFunction(std::function callback) { + SCOPE_profile_cpu_f("alloy"); + LockMutex(lock_); for (auto it = list_.begin(); it != list_.end(); ++it) { SymbolInfo* symbol_info = *it; @@ -174,6 +176,8 @@ void Module::ForEachFunction(std::function callback) { void Module::ForEachFunction(size_t since, size_t& version, std::function callback) { + SCOPE_profile_cpu_f("alloy"); + LockMutex(lock_); size_t count = list_.size(); version = count; diff --git a/src/alloy/runtime/runtime.cc b/src/alloy/runtime/runtime.cc index 3fc45a447..1aff92e04 100644 --- a/src/alloy/runtime/runtime.cc +++ b/src/alloy/runtime/runtime.cc @@ -159,6 +159,8 @@ std::vector Runtime::FindFunctionsWithAddress(uint64_t address) { } int Runtime::ResolveFunction(uint64_t address, 
Function** out_function) { + SCOPE_profile_cpu_f("alloy"); + *out_function = NULL; Entry* entry; Entry::Status status = entry_table_.GetOrCreate(address, &entry); @@ -192,6 +194,8 @@ int Runtime::ResolveFunction(uint64_t address, Function** out_function) { int Runtime::LookupFunctionInfo( uint64_t address, FunctionInfo** out_symbol_info) { + SCOPE_profile_cpu_f("alloy"); + *out_symbol_info = NULL; // TODO(benvanik): fast reject invalid addresses/log errors. @@ -220,6 +224,8 @@ int Runtime::LookupFunctionInfo( int Runtime::LookupFunctionInfo(Module* module, uint64_t address, FunctionInfo** out_symbol_info) { + SCOPE_profile_cpu_f("alloy"); + // Atomic create/lookup symbol in module. // If we get back the NEW flag we must declare it now. FunctionInfo* symbol_info = NULL; @@ -241,6 +247,8 @@ int Runtime::LookupFunctionInfo(Module* module, uint64_t address, int Runtime::DemandFunction( FunctionInfo* symbol_info, Function** out_function) { + SCOPE_profile_cpu_f("alloy"); + *out_function = NULL; // Lock function for generation. If it's already being generated diff --git a/src/xenia/apu/audio_system.cc b/src/xenia/apu/audio_system.cc index 1793fc92d..144d6bb15 100644 --- a/src/xenia/apu/audio_system.cc +++ b/src/xenia/apu/audio_system.cc @@ -82,21 +82,26 @@ void AudioSystem::ThreadStart() { if (result == WAIT_FAILED) { DWORD err = GetLastError(); XEASSERTALWAYS(); + break; } + size_t pumped = 0; - if (result >= WAIT_OBJECT_0 && result <= WAIT_OBJECT_0 + (maximum_client_count_ - 1)) { - size_t index = result - WAIT_OBJECT_0; - do { - xe_mutex_lock(lock_); - uint32_t client_callback = clients_[index].callback; - uint32_t client_callback_arg = clients_[index].wrapped_callback_arg; - xe_mutex_unlock(lock_); - if (client_callback) { - processor->Execute(thread_state_, client_callback, client_callback_arg, 0); - } - pumped++; - index++; - } while (index < maximum_client_count_ && WaitForSingleObject(client_wait_handles_[index], 0) == WAIT_OBJECT_0); + { + SCOPE_profile_cpu_i("apu", "Pump"); + if (result >= WAIT_OBJECT_0 && result <= WAIT_OBJECT_0 + (maximum_client_count_ - 1)) { + size_t index = result - WAIT_OBJECT_0; + do { + xe_mutex_lock(lock_); + uint32_t client_callback = clients_[index].callback; + uint32_t client_callback_arg = clients_[index].wrapped_callback_arg; + xe_mutex_unlock(lock_); + if (client_callback) { + processor->Execute(thread_state_, client_callback, client_callback_arg, 0); + } + pumped++; + index++; + } while (index < maximum_client_count_ && WaitForSingleObject(client_wait_handles_[index], 0) == WAIT_OBJECT_0); + } } if (!running_) { @@ -104,6 +109,7 @@ void AudioSystem::ThreadStart() { } if (!pumped) { + SCOPE_profile_cpu_i("apu", "Sleep"); Sleep(500); } } @@ -126,6 +132,8 @@ void AudioSystem::Shutdown() { X_STATUS AudioSystem::RegisterClient( uint32_t callback, uint32_t callback_arg, size_t* out_index) { + SCOPE_profile_cpu_f("apu"); + XEASSERTTRUE(unused_clients_.size()); xe_mutex_lock(lock_); @@ -157,6 +165,8 @@ X_STATUS AudioSystem::RegisterClient( } void AudioSystem::SubmitFrame(size_t index, uint32_t samples_ptr) { + SCOPE_profile_cpu_f("apu"); + xe_mutex_lock(lock_); XEASSERTTRUE(index < maximum_client_count_); XEASSERTTRUE(clients_[index].driver != NULL); @@ -166,6 +176,8 @@ void AudioSystem::SubmitFrame(size_t index, uint32_t samples_ptr) { } void AudioSystem::UnregisterClient(size_t index) { + SCOPE_profile_cpu_f("apu"); + xe_mutex_lock(lock_); XEASSERTTRUE(index < maximum_client_count_); DestroyDriver(clients_[index].driver); diff --git 
a/src/xenia/apu/xaudio2/xaudio2_audio_driver.cc b/src/xenia/apu/xaudio2/xaudio2_audio_driver.cc index 0155753c4..6f2cd6659 100644 --- a/src/xenia/apu/xaudio2/xaudio2_audio_driver.cc +++ b/src/xenia/apu/xaudio2/xaudio2_audio_driver.cc @@ -121,6 +121,8 @@ void XAudio2AudioDriver::Initialize() { } void XAudio2AudioDriver::SubmitFrame(uint32_t frame_ptr) { + SCOPE_profile_cpu_f("apu"); + // Process samples! They are big-endian floats. HRESULT hr; diff --git a/src/xenia/cpu/processor.cc b/src/xenia/cpu/processor.cc index b77664482..0c780ce22 100644 --- a/src/xenia/cpu/processor.cc +++ b/src/xenia/cpu/processor.cc @@ -147,6 +147,8 @@ void Processor::AddRegisterAccessCallbacks( } int Processor::Execute(XenonThreadState* thread_state, uint64_t address) { + SCOPE_profile_cpu_f("cpu"); + // Attempt to get the function. Function* fn; if (runtime_->ResolveFunction(address, &fn)) { @@ -171,6 +173,8 @@ int Processor::Execute(XenonThreadState* thread_state, uint64_t address) { uint64_t Processor::Execute( XenonThreadState* thread_state, uint64_t address, uint64_t arg0) { + SCOPE_profile_cpu_f("cpu"); + PPCContext* context = thread_state->context(); context->r[3] = arg0; if (Execute(thread_state, address)) { @@ -182,6 +186,8 @@ uint64_t Processor::Execute( uint64_t Processor::Execute( XenonThreadState* thread_state, uint64_t address, uint64_t arg0, uint64_t arg1) { + SCOPE_profile_cpu_f("cpu"); + PPCContext* context = thread_state->context(); context->r[3] = arg0; context->r[4] = arg1; diff --git a/src/xenia/gpu/d3d11/d3d11_geometry_shader.cc b/src/xenia/gpu/d3d11/d3d11_geometry_shader.cc index 5984631fe..ba677f7a0 100644 --- a/src/xenia/gpu/d3d11/d3d11_geometry_shader.cc +++ b/src/xenia/gpu/d3d11/d3d11_geometry_shader.cc @@ -34,6 +34,8 @@ D3D11GeometryShader::~D3D11GeometryShader() { } int D3D11GeometryShader::Prepare(D3D11VertexShader* vertex_shader) { + SCOPE_profile_cpu_f("gpu"); + if (handle_) { return 0; } @@ -74,6 +76,8 @@ int D3D11GeometryShader::Prepare(D3D11VertexShader* vertex_shader) { } ID3D10Blob* D3D11GeometryShader::Compile(const char* shader_source) { + SCOPE_profile_cpu_f("gpu"); + // TODO(benvanik): pick shared runtime mode defines. 
D3D10_SHADER_MACRO defines[] = { "TEST_DEFINE", "1", @@ -161,6 +165,7 @@ D3D11PointSpriteGeometryShader::~D3D11PointSpriteGeometryShader() { int D3D11PointSpriteGeometryShader::Generate(D3D11VertexShader* vertex_shader, alloy::StringBuffer* output) { + SCOPE_profile_cpu_f("gpu"); if (D3D11GeometryShader::Generate(vertex_shader, output)) { return 1; } @@ -215,6 +220,7 @@ D3D11RectListGeometryShader::~D3D11RectListGeometryShader() { int D3D11RectListGeometryShader::Generate(D3D11VertexShader* vertex_shader, alloy::StringBuffer* output) { + SCOPE_profile_cpu_f("gpu"); if (D3D11GeometryShader::Generate(vertex_shader, output)) { return 1; } @@ -259,6 +265,7 @@ D3D11QuadListGeometryShader::~D3D11QuadListGeometryShader() { int D3D11QuadListGeometryShader::Generate(D3D11VertexShader* vertex_shader, alloy::StringBuffer* output) { + SCOPE_profile_cpu_f("gpu"); if (D3D11GeometryShader::Generate(vertex_shader, output)) { return 1; } diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc b/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc index 25410bf6f..11518f71c 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc +++ b/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc @@ -190,6 +190,8 @@ void D3D11GraphicsDriver::SetShader( } int D3D11GraphicsDriver::SetupDraw(XE_GPU_PRIMITIVE_TYPE prim_type) { + SCOPE_profile_cpu_f("gpu"); + RegisterFile& rf = register_file_; // Ignore copies. @@ -296,6 +298,8 @@ void D3D11GraphicsDriver::DrawIndexBuffer( XE_GPU_PRIMITIVE_TYPE prim_type, bool index_32bit, uint32_t index_count, uint32_t index_base, uint32_t index_size, uint32_t endianness) { + SCOPE_profile_cpu_f("gpu"); + RegisterFile& rf = register_file_; XETRACED3D("D3D11: draw indexed %d (%d indicies) from %.8X", @@ -321,6 +325,8 @@ void D3D11GraphicsDriver::DrawIndexBuffer( void D3D11GraphicsDriver::DrawIndexAuto( XE_GPU_PRIMITIVE_TYPE prim_type, uint32_t index_count) { + SCOPE_profile_cpu_f("gpu"); + RegisterFile& rf = register_file_; XETRACED3D("D3D11: draw indexed %d (%d indicies)", @@ -346,6 +352,8 @@ int D3D11GraphicsDriver::RebuildRenderTargets( return 0; } + SCOPE_profile_cpu_f("gpu"); + // Remove old versions. for (int n = 0; n < XECOUNT(render_targets_.color_buffers); n++) { auto& cb = render_targets_.color_buffers[n]; @@ -426,6 +434,8 @@ int D3D11GraphicsDriver::RebuildRenderTargets( } int D3D11GraphicsDriver::UpdateState(uint32_t state_overrides) { + SCOPE_profile_cpu_f("gpu"); + // Most information comes from here: // https://chromium.googlesource.com/chromiumos/third_party/mesa/+/6173cc19c45d92ef0b7bc6aa008aa89bb29abbda/src/gallium/drivers/freedreno/freedreno_zsa.c // http://cgit.freedesktop.org/mesa/mesa/diff/?id=aac7f06ad843eaa696363e8e9c7781ca30cb4914 @@ -768,6 +778,8 @@ int D3D11GraphicsDriver::UpdateState(uint32_t state_overrides) { } int D3D11GraphicsDriver::UpdateConstantBuffers() { + SCOPE_profile_cpu_f("gpu"); + RegisterFile& rf = register_file_; D3D11_MAPPED_SUBRESOURCE res; @@ -799,6 +811,8 @@ int D3D11GraphicsDriver::UpdateConstantBuffers() { } int D3D11GraphicsDriver::BindShaders() { + SCOPE_profile_cpu_f("gpu"); + RegisterFile& rf = register_file_; xe_gpu_program_cntl_t program_cntl; program_cntl.dword_0 = rf.values[XE_GPU_REG_SQ_PROGRAM_CNTL].u32; @@ -892,6 +906,8 @@ int D3D11GraphicsDriver::BindShaders() { } int D3D11GraphicsDriver::PrepareFetchers() { + SCOPE_profile_cpu_f("gpu"); + // Input assembly. 
XEASSERTNOTNULL(state_.vertex_shader); auto vtx_inputs = state_.vertex_shader->GetVertexBufferInputs(); @@ -934,6 +950,8 @@ int D3D11GraphicsDriver::PrepareFetchers() { } int D3D11GraphicsDriver::PrepareVertexBuffer(Shader::vtx_buffer_desc_t& desc) { + SCOPE_profile_cpu_f("gpu"); + RegisterFile& rf = register_file_; int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + (desc.fetch_slot / 3) * 6; xe_gpu_fetch_group_t* group = (xe_gpu_fetch_group_t*)&rf.values[r]; @@ -1009,6 +1027,8 @@ int D3D11GraphicsDriver::PrepareVertexBuffer(Shader::vtx_buffer_desc_t& desc) { } int D3D11GraphicsDriver::PrepareTextureFetchers() { + SCOPE_profile_cpu_f("gpu"); + RegisterFile& rf = register_file_; for (int n = 0; n < XECOUNT(state_.texture_fetchers); n++) { @@ -1275,6 +1295,8 @@ int D3D11GraphicsDriver::FetchTexture1D( xe_gpu_texture_fetch_t& fetch, TextureInfo& info, ID3D11Resource** out_texture) { + SCOPE_profile_cpu_f("gpu"); + uint32_t address = (fetch.address << 12) + address_translation_; uint32_t width = 1 + fetch.size_1d.width; @@ -1299,6 +1321,8 @@ int D3D11GraphicsDriver::FetchTexture1D( } XEFORCEINLINE void TextureSwap(uint8_t* dest, const uint8_t* src, uint32_t pitch, XE_GPU_ENDIAN endianness) { + SCOPE_profile_cpu_f("gpu"); + switch (endianness) { case XE_GPU_ENDIAN_8IN16: for (uint32_t i = 0; i < pitch; i += 2, src += 2, dest += 2) { @@ -1344,6 +1368,8 @@ int D3D11GraphicsDriver::FetchTexture2D( xe_gpu_texture_fetch_t& fetch, TextureInfo& info, ID3D11Resource** out_texture) { + SCOPE_profile_cpu_f("gpu"); + XEASSERTTRUE(fetch.dimension == 1); uint32_t address = (fetch.address << 12) + address_translation_; @@ -1448,6 +1474,8 @@ int D3D11GraphicsDriver::FetchTexture3D( xe_gpu_texture_fetch_t& fetch, TextureInfo& info, ID3D11Resource** out_texture) { + SCOPE_profile_cpu_f("gpu"); + XELOGE("D3D11: FetchTexture2D not yet implemented"); XEASSERTALWAYS(); return 1; @@ -1470,6 +1498,8 @@ int D3D11GraphicsDriver::FetchTextureCube( xe_gpu_texture_fetch_t& fetch, TextureInfo& info, ID3D11Resource** out_texture) { + SCOPE_profile_cpu_f("gpu"); + XELOGE("D3D11: FetchTextureCube not yet implemented"); XEASSERTALWAYS(); return 1; @@ -1477,6 +1507,7 @@ int D3D11GraphicsDriver::FetchTextureCube( int D3D11GraphicsDriver::PrepareTextureSampler( xenos::XE_GPU_SHADER_TYPE shader_type, Shader::tex_buffer_desc_t& desc) { + SCOPE_profile_cpu_f("gpu"); auto& fetcher = state_.texture_fetchers[desc.fetch_slot]; auto& info = fetcher.info; @@ -1588,6 +1619,8 @@ int D3D11GraphicsDriver::PrepareTextureSampler( int D3D11GraphicsDriver::PrepareIndexBuffer( bool index_32bit, uint32_t index_count, uint32_t index_base, uint32_t index_size, uint32_t endianness) { + SCOPE_profile_cpu_f("gpu"); + RegisterFile& rf = register_file_; uint32_t address = index_base + address_translation_; @@ -1634,6 +1667,8 @@ int D3D11GraphicsDriver::PrepareIndexBuffer( } int D3D11GraphicsDriver::Resolve() { + SCOPE_profile_cpu_f("gpu"); + // No clue how this is supposed to work yet. 
ID3D11Texture2D* back_buffer = 0; swap_chain_->GetBuffer(0, __uuidof(ID3D11Texture2D), diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_system.cc b/src/xenia/gpu/d3d11/d3d11_graphics_system.cc index 03c91038c..825e9f1ff 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_system.cc +++ b/src/xenia/gpu/d3d11/d3d11_graphics_system.cc @@ -29,6 +29,7 @@ void __stdcall D3D11GraphicsSystemVsyncCallback( thread_name_set = true; Profiler::ThreadEnter("VsyncTimer"); } + SCOPE_profile_cpu_f("gpu"); gs->MarkVblank(); gs->DispatchInterruptCallback(0); @@ -151,6 +152,8 @@ void D3D11GraphicsSystem::Initialize() { } void D3D11GraphicsSystem::Pump() { + SCOPE_profile_cpu_f("gpu"); + if (swap_pending_) { swap_pending_ = false; diff --git a/src/xenia/gpu/d3d11/d3d11_shader.cc b/src/xenia/gpu/d3d11/d3d11_shader.cc index a60a7bdf3..97e0cb295 100644 --- a/src/xenia/gpu/d3d11/d3d11_shader.cc +++ b/src/xenia/gpu/d3d11/d3d11_shader.cc @@ -145,6 +145,8 @@ void D3D11Shader::set_translated_src(char* value) { } ID3D10Blob* D3D11Shader::Compile(const char* shader_source) { + SCOPE_profile_cpu_f("gpu"); + // TODO(benvanik): pick shared runtime mode defines. D3D10_SHADER_MACRO defines[] = { "TEST_DEFINE", "1", @@ -256,6 +258,7 @@ D3D11VertexShader::~D3D11VertexShader() { } int D3D11VertexShader::Prepare(xe_gpu_program_cntl_t* program_cntl) { + SCOPE_profile_cpu_f("gpu"); if (handle_) { return 0; } @@ -411,6 +414,8 @@ int D3D11VertexShader::Prepare(xe_gpu_program_cntl_t* program_cntl) { } const char* D3D11VertexShader::Translate(xe_gpu_program_cntl_t* program_cntl) { + SCOPE_profile_cpu_f("gpu"); + Output* output = new Output(); xe_gpu_translate_ctx_t ctx; ctx.output = output; @@ -599,6 +604,7 @@ D3D11PixelShader::~D3D11PixelShader() { int D3D11PixelShader::Prepare(xe_gpu_program_cntl_t* program_cntl, D3D11VertexShader* input_shader) { + SCOPE_profile_cpu_f("gpu"); if (handle_) { return 0; } @@ -641,6 +647,7 @@ int D3D11PixelShader::Prepare(xe_gpu_program_cntl_t* program_cntl, const char* D3D11PixelShader::Translate( xe_gpu_program_cntl_t* program_cntl, D3D11VertexShader* input_shader) { + SCOPE_profile_cpu_f("gpu"); Output* output = new Output(); xe_gpu_translate_ctx_t ctx; ctx.output = output; diff --git a/src/xenia/gpu/d3d11/d3d11_shader_cache.cc b/src/xenia/gpu/d3d11/d3d11_shader_cache.cc index 7f6a5a722..be9352b50 100644 --- a/src/xenia/gpu/d3d11/d3d11_shader_cache.cc +++ b/src/xenia/gpu/d3d11/d3d11_shader_cache.cc @@ -31,6 +31,7 @@ Shader* D3D11ShaderCache::CreateCore( xenos::XE_GPU_SHADER_TYPE type, const uint8_t* src_ptr, size_t length, uint64_t hash) { + SCOPE_profile_cpu_f("gpu"); switch (type) { case XE_GPU_SHADER_TYPE_VERTEX: return new D3D11VertexShader( diff --git a/src/xenia/gpu/d3d11/d3d11_window.cc b/src/xenia/gpu/d3d11/d3d11_window.cc index 64a9b0df6..da33ab6bb 100644 --- a/src/xenia/gpu/d3d11/d3d11_window.cc +++ b/src/xenia/gpu/d3d11/d3d11_window.cc @@ -114,6 +114,8 @@ int D3D11Window::Initialize(const char* title, uint32_t width, uint32_t height) } void D3D11Window::Swap() { + SCOPE_profile_cpu_f("gpu"); + // Present profiler. context_->OMSetRenderTargets(1, &render_target_view_, NULL); Profiler::Present(); diff --git a/src/xenia/gpu/ring_buffer_worker.cc b/src/xenia/gpu/ring_buffer_worker.cc index 3792c0b61..9999601bb 100644 --- a/src/xenia/gpu/ring_buffer_worker.cc +++ b/src/xenia/gpu/ring_buffer_worker.cc @@ -125,6 +125,8 @@ void RingBufferWorker::Pump() { void RingBufferWorker::ExecutePrimaryBuffer( uint32_t start_index, uint32_t end_index) { + SCOPE_profile_cpu_f("gpu"); + // Adjust pointer base. 
uint32_t ptr = primary_buffer_ptr_ + start_index * 4; ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (ptr & 0x1FFFFFFF); diff --git a/src/xenia/gpu/shader_cache.cc b/src/xenia/gpu/shader_cache.cc index 9aee3e2b7..33033bc36 100644 --- a/src/xenia/gpu/shader_cache.cc +++ b/src/xenia/gpu/shader_cache.cc @@ -55,6 +55,8 @@ Shader* ShaderCache::Find( Shader* ShaderCache::FindOrCreate( XE_GPU_SHADER_TYPE type, const uint8_t* src_ptr, size_t length) { + SCOPE_profile_cpu_f("gpu"); + uint64_t hash = Hash(src_ptr, length); unordered_map::iterator it = map_.find(hash); if (it != map_.end()) { diff --git a/src/xenia/hid/input_system.cc b/src/xenia/hid/input_system.cc index b82ca11af..6ad1ab177 100644 --- a/src/xenia/hid/input_system.cc +++ b/src/xenia/hid/input_system.cc @@ -42,6 +42,8 @@ void InputSystem::AddDriver(InputDriver* driver) { X_RESULT InputSystem::GetCapabilities( uint32_t user_index, uint32_t flags, X_INPUT_CAPABILITIES& out_caps) { + SCOPE_profile_cpu_f("hid"); + for (auto it = drivers_.begin(); it != drivers_.end(); ++it) { InputDriver* driver = *it; if (XSUCCEEDED(driver->GetCapabilities(user_index, flags, out_caps))) { @@ -52,6 +54,8 @@ X_RESULT InputSystem::GetCapabilities( } X_RESULT InputSystem::GetState(uint32_t user_index, X_INPUT_STATE& out_state) { + SCOPE_profile_cpu_f("hid"); + for (auto it = drivers_.begin(); it != drivers_.end(); ++it) { InputDriver* driver = *it; if (driver->GetState(user_index, out_state) == X_ERROR_SUCCESS) { @@ -63,6 +67,8 @@ X_RESULT InputSystem::GetState(uint32_t user_index, X_INPUT_STATE& out_state) { X_RESULT InputSystem::SetState( uint32_t user_index, X_INPUT_VIBRATION& vibration) { + SCOPE_profile_cpu_f("hid"); + for (auto it = drivers_.begin(); it != drivers_.end(); ++it) { InputDriver* driver = *it; if (XSUCCEEDED(driver->SetState(user_index, vibration))) { @@ -74,6 +80,8 @@ X_RESULT InputSystem::SetState( X_RESULT InputSystem::GetKeystroke( uint32_t user_index, uint32_t flags, X_INPUT_KEYSTROKE& out_keystroke) { + SCOPE_profile_cpu_f("hid"); + for (auto it = drivers_.begin(); it != drivers_.end(); ++it) { InputDriver* driver = *it; if (XSUCCEEDED(driver->GetKeystroke(user_index, flags, out_keystroke))) { From c06526e5df03261459730e6252809c541407cf96 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Wed, 28 May 2014 20:02:40 -0700 Subject: [PATCH 119/184] Backpatch function addresses. --- src/alloy/backend/x64/x64_emitter.cc | 52 +++++++++++++++++++- src/xenia/gpu/d3d11/d3d11_graphics_driver.cc | 2 - 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index 6616b0d52..e966a5103 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -231,16 +231,53 @@ void X64Emitter::UnimplementedInstr(const hir::Instr* i) { XEASSERTALWAYS(); } +// Total size of ResolveFunctionSymbol call site in bytes. +// Used to overwrite it with nops as needed. 
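+// Breakdown (matches the sequence emitted in X64Emitter::Call below):
+// mov rax, imm64 = 2+8 bytes, mov rdx, imm64 = 2+8, call rax = 2, plus a
+// 5-byte rcx reload, 27 bytes total. The return address observed inside
+// ResolveFunctionSymbol points just past the call, so the patchable
+// sequence begins at return_address - 22, which ASM_OFFSET encodes.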
+const size_t TOTAL_RESOLVE_SIZE = 27; +const size_t ASM_OFFSET = 2 + 2 + 8 + 2 + 8; + +// Length Assembly Byte Sequence +// ================================================================================= +// 2 bytes 66 NOP 66 90H +// 3 bytes NOP DWORD ptr [EAX] 0F 1F 00H +// 4 bytes NOP DWORD ptr [EAX + 00H] 0F 1F 40 00H +// 5 bytes NOP DWORD ptr [EAX + EAX*1 + 00H] 0F 1F 44 00 00H +// 6 bytes 66 NOP DWORD ptr [EAX + EAX*1 + 00H] 66 0F 1F 44 00 00H +// 7 bytes NOP DWORD ptr [EAX + 00000000H] 0F 1F 80 00 00 00 00H +// 8 bytes NOP DWORD ptr [EAX + EAX*1 + 00000000H] 0F 1F 84 00 00 00 00 00H +// 9 bytes 66 NOP DWORD ptr [EAX + EAX*1 + 00000000H] 66 0F 1F 84 00 00 00 00 00H + uint64_t ResolveFunctionSymbol(void* raw_context, uint64_t symbol_info_ptr) { // TODO(benvanik): generate this thunk at runtime? or a shim? auto thread_state = *reinterpret_cast(raw_context); auto symbol_info = reinterpret_cast(symbol_info_ptr); + // Resolve function. This will demand compile as required. Function* fn = NULL; thread_state->runtime()->ResolveFunction(symbol_info->address(), &fn); XEASSERTNOTNULL(fn); auto x64_fn = static_cast(fn); - return reinterpret_cast(x64_fn->machine_code()); + uint64_t addr = reinterpret_cast(x64_fn->machine_code()); + + // Overwrite the call site. + // The return address points to ReloadRCX work after the call. + uint64_t return_address = reinterpret_cast(_ReturnAddress()); + #pragma pack(push, 1) + struct Asm { + uint16_t mov_rax; + uint64_t rax_constant; + uint16_t mov_rdx; + uint64_t rdx_constant; + uint16_t call_rax; + uint8_t mov_rcx[5]; + }; + #pragma pack(pop) + Asm* code = reinterpret_cast(return_address - ASM_OFFSET); + code->rax_constant = addr; + code->call_rax = 0x9066; + + // We need to return the target in rax so that it gets called. + return addr; } void X64Emitter::Call(const hir::Instr* instr, runtime::FunctionInfo* symbol_info) { @@ -250,7 +287,18 @@ void X64Emitter::Call(const hir::Instr* instr, runtime::FunctionInfo* symbol_inf if (fn) { mov(rax, reinterpret_cast(fn->machine_code())); } else { - CallNative(ResolveFunctionSymbol, reinterpret_cast(symbol_info)); + size_t start = getSize(); + // 2b + 8b constant + mov(rax, reinterpret_cast(ResolveFunctionSymbol)); + // 2b + 8b constant + mov(rdx, reinterpret_cast(symbol_info)); + // 2b + call(rax); + // 5b + ReloadECX(); + size_t total_size = getSize() - start; + XEASSERT(total_size == TOTAL_RESOLVE_SIZE); + // EDX overwritten, don't bother reloading. } // Actually jump/call to rax. diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc b/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc index 11518f71c..fd7377693 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc +++ b/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc @@ -1321,8 +1321,6 @@ int D3D11GraphicsDriver::FetchTexture1D( } XEFORCEINLINE void TextureSwap(uint8_t* dest, const uint8_t* src, uint32_t pitch, XE_GPU_ENDIAN endianness) { - SCOPE_profile_cpu_f("gpu"); - switch (endianness) { case XE_GPU_ENDIAN_8IN16: for (uint32_t i = 0; i < pitch; i += 2, src += 2, dest += 2) { From 18b1f9f51382fb80b28dc6497e4fb886ebdf9a81 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Wed, 28 May 2014 20:19:28 -0700 Subject: [PATCH 120/184] Fixing controller input. 
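xinput now registers ahead of the winkey fallback. The input system walks its drivers in registration order and takes the first one that answers, so with winkey first the keyboard driver could shadow a real controller. A simplified sketch of that first-wins lookup (the real loop is in src/xenia/hid/input_system.cc; the failure code here is assumed for illustration):

    // First registered driver to report success wins, so registration
    // order is effectively driver priority.
    X_RESULT InputSystem::GetState(uint32_t user_index,
                                   X_INPUT_STATE& out_state) {
      for (auto it = drivers_.begin(); it != drivers_.end(); ++it) {
        if ((*it)->GetState(user_index, out_state) == X_ERROR_SUCCESS) {
          return X_ERROR_SUCCESS;
        }
      }
      return X_ERROR_DEVICE_NOT_CONNECTED;  // assumed failure code
    }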
--- src/xenia/apu/audio_system.cc | 29 +++++++++++++---------------- src/xenia/hid/hid.cc | 10 +++++----- 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/src/xenia/apu/audio_system.cc b/src/xenia/apu/audio_system.cc index 144d6bb15..efc762fc9 100644 --- a/src/xenia/apu/audio_system.cc +++ b/src/xenia/apu/audio_system.cc @@ -86,22 +86,19 @@ void AudioSystem::ThreadStart() { } size_t pumped = 0; - { - SCOPE_profile_cpu_i("apu", "Pump"); - if (result >= WAIT_OBJECT_0 && result <= WAIT_OBJECT_0 + (maximum_client_count_ - 1)) { - size_t index = result - WAIT_OBJECT_0; - do { - xe_mutex_lock(lock_); - uint32_t client_callback = clients_[index].callback; - uint32_t client_callback_arg = clients_[index].wrapped_callback_arg; - xe_mutex_unlock(lock_); - if (client_callback) { - processor->Execute(thread_state_, client_callback, client_callback_arg, 0); - } - pumped++; - index++; - } while (index < maximum_client_count_ && WaitForSingleObject(client_wait_handles_[index], 0) == WAIT_OBJECT_0); - } + if (result >= WAIT_OBJECT_0 && result <= WAIT_OBJECT_0 + (maximum_client_count_ - 1)) { + size_t index = result - WAIT_OBJECT_0; + do { + xe_mutex_lock(lock_); + uint32_t client_callback = clients_[index].callback; + uint32_t client_callback_arg = clients_[index].wrapped_callback_arg; + xe_mutex_unlock(lock_); + if (client_callback) { + processor->Execute(thread_state_, client_callback, client_callback_arg, 0); + } + pumped++; + index++; + } while (index < maximum_client_count_ && WaitForSingleObject(client_wait_handles_[index], 0) == WAIT_OBJECT_0); } if (!running_) { diff --git a/src/xenia/hid/hid.cc b/src/xenia/hid/hid.cc index fc3a3b9e6..fbd66630d 100644 --- a/src/xenia/hid/hid.cc +++ b/src/xenia/hid/hid.cc @@ -46,16 +46,16 @@ InputSystem* xe::hid::Create(Emulator* emulator) { // NOTE: in any mode we create as many as we can, falling back to nop. #if XE_PLATFORM_WIN32 - InputDriver* winkey_driver = xe::hid::winkey::Create(input_system); - if (winkey_driver) { - input_system->AddDriver(winkey_driver); - any_created = true; - } InputDriver* xinput_driver = xe::hid::xinput::Create(input_system); if (xinput_driver) { input_system->AddDriver(xinput_driver); any_created = true; } + InputDriver* winkey_driver = xe::hid::winkey::Create(input_system); + if (winkey_driver) { + input_system->AddDriver(winkey_driver); + any_created = true; + } #endif // WIN32 // Fallback to nop if none created. From 997f582d1f60384ad2db1b5d5fe8138e0a4850d5 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Wed, 28 May 2014 20:32:50 -0700 Subject: [PATCH 121/184] Removing useless counters. 
--- src/xenia/apu/audio_system.cc | 2 -- src/xenia/apu/xaudio2/xaudio2_audio_driver.cc | 2 -- src/xenia/gpu/d3d11/d3d11_shader_cache.cc | 1 - 3 files changed, 5 deletions(-) diff --git a/src/xenia/apu/audio_system.cc b/src/xenia/apu/audio_system.cc index efc762fc9..40d151bd2 100644 --- a/src/xenia/apu/audio_system.cc +++ b/src/xenia/apu/audio_system.cc @@ -129,8 +129,6 @@ void AudioSystem::Shutdown() { X_STATUS AudioSystem::RegisterClient( uint32_t callback, uint32_t callback_arg, size_t* out_index) { - SCOPE_profile_cpu_f("apu"); - XEASSERTTRUE(unused_clients_.size()); xe_mutex_lock(lock_); diff --git a/src/xenia/apu/xaudio2/xaudio2_audio_driver.cc b/src/xenia/apu/xaudio2/xaudio2_audio_driver.cc index 6f2cd6659..0155753c4 100644 --- a/src/xenia/apu/xaudio2/xaudio2_audio_driver.cc +++ b/src/xenia/apu/xaudio2/xaudio2_audio_driver.cc @@ -121,8 +121,6 @@ void XAudio2AudioDriver::Initialize() { } void XAudio2AudioDriver::SubmitFrame(uint32_t frame_ptr) { - SCOPE_profile_cpu_f("apu"); - // Process samples! They are big-endian floats. HRESULT hr; diff --git a/src/xenia/gpu/d3d11/d3d11_shader_cache.cc b/src/xenia/gpu/d3d11/d3d11_shader_cache.cc index be9352b50..7f6a5a722 100644 --- a/src/xenia/gpu/d3d11/d3d11_shader_cache.cc +++ b/src/xenia/gpu/d3d11/d3d11_shader_cache.cc @@ -31,7 +31,6 @@ Shader* D3D11ShaderCache::CreateCore( xenos::XE_GPU_SHADER_TYPE type, const uint8_t* src_ptr, size_t length, uint64_t hash) { - SCOPE_profile_cpu_f("gpu"); switch (type) { case XE_GPU_SHADER_TYPE_VERTEX: return new D3D11VertexShader( From 328ece538ac1223cbfa9cb9d45a474014558b061 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Wed, 28 May 2014 21:46:43 -0700 Subject: [PATCH 122/184] Fixing disabled profiling. --- src/xenia/profiling.cc | 27 ++++++++++++++++++++++++++- src/xenia/profiling.h | 18 ++++++++++++++---- 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/src/xenia/profiling.cc b/src/xenia/profiling.cc index b72eba457..ce3a4ece0 100644 --- a/src/xenia/profiling.cc +++ b/src/xenia/profiling.cc @@ -15,9 +15,11 @@ namespace xe { std::unique_ptr Profiler::display_ = nullptr; +#if XE_OPTION_PROFILING + void Profiler::Initialize() { MicroProfileInit(); - MicroProfileSetDisplayMode(2); + MicroProfileSetDisplayMode(1); } void Profiler::Dump() { @@ -99,8 +101,29 @@ void Profiler::Present() { display_->End(); } +#else + +void Profiler::Initialize() {} +void Profiler::Dump() {} +void Profiler::Shutdown() {} +uint32_t Profiler::GetColor(const char* str) { return 0; } +void Profiler::ThreadEnter(const char* name) {} +void Profiler::ThreadExit() {} +bool Profiler::OnKeyDown(int key_code) { return false; } +bool Profiler::OnKeyUp(int key_code) { return false; } +void Profiler::OnMouseDown(bool left_button, bool right_button) {} +void Profiler::OnMouseUp() {} +void Profiler::OnMouseMove(int x, int y) {} +void Profiler::OnMouseWheel(int x, int y, int dy) {} +void Profiler::set_display(std::unique_ptr display) {} +void Profiler::Present() {} + +#endif // XE_OPTION_PROFILING + } // namespace xe +#if XE_OPTION_PROFILING + uint32_t MicroProfileGpuInsertTimeStamp() { return 0; } @@ -143,3 +166,5 @@ void MicroProfileDrawText(int nX, int nY, uint32_t nColor, const char* pText, ui } display->DrawText(nX, nY, nColor, pText, nLen); } + +#endif // XE_OPTION_PROFILING diff --git a/src/xenia/profiling.h b/src/xenia/profiling.h index bf66c68a5..30ab996ac 100644 --- a/src/xenia/profiling.h +++ b/src/xenia/profiling.h @@ -81,24 +81,34 @@ namespace xe { #else -#define DEFINE_profile_cpu(name, group_name, scope_name, color) 
-#define DEFINE_profile_gpu(name, group_name, scope_name, color) +#define DEFINE_profile_cpu(name, group_name, scope_name) +#define DEFINE_profile_gpu(name, group_name, scope_name) #define DECLARE_profile_cpu(name) #define DECLARE_profile_gpu(name) #define SCOPE_profile_cpu(name) do {} while (false) -#define SCOPE_profile_cpu_i(group_name, scope_name, color) do {} while (false) +#define SCOPE_profile_cpu_f(name) do {} while (false) +#define SCOPE_profile_cpu_i(group_name, scope_name) do {} while (false) #define SCOPE_profile_gpu(name) do {} while (false) -#define SCOPE_profile_gpu_i(group_name, scope_name, color) do {} while (false) +#define SCOPE_profile_gpu_f(name) do {} while (false) +#define SCOPE_profile_gpu_i(group_name, scope_name) do {} while (false) #define COUNT_profile_cpu(name, count) do {} while (false) #define COUNT_profile_gpu(name, count) do {} while (false) +#define MICROPROFILE_TEXT_WIDTH 1 +#define MICROPROFILE_TEXT_HEIGHT 1 + #endif // XE_OPTION_PROFILING class ProfilerDisplay { public: enum BoxType { +#if XE_OPTION_PROFILING BOX_TYPE_BAR = MicroProfileBoxTypeBar, BOX_TYPE_FLAT = MicroProfileBoxTypeFlat, +#else + BOX_TYPE_BAR, + BOX_TYPE_FLAT, +#endif // XE_OPTION_PROFILING }; virtual uint32_t width() const = 0; From 0c553098267153e389242e6538df0082167b6469 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Thu, 29 May 2014 23:11:00 -0700 Subject: [PATCH 123/184] Fixing COMPARE and tweaking ABS/NEG. --- src/alloy/backend/x64/x64_emitter.cc | 2 + src/alloy/backend/x64/x64_emitter.h | 38 +++++++-------- src/alloy/backend/x64/x64_sequences.cc | 64 +++++++++++++++++--------- src/alloy/frontend/ppc/ppc_disasm.cc | 2 +- 4 files changed, 65 insertions(+), 41 deletions(-) diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index e966a5103..0096a08fa 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -489,6 +489,8 @@ Address X64Emitter::GetXmmConstPtr(XmmConst id) { /* XMM3301 */ vec128f(3.0f, 3.0f, 0.0f, 1.0f), /* XMMSignMaskPS */ vec128i(0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u), /* XMMSignMaskPD */ vec128i(0x00000000u, 0x80000000u, 0x00000000u, 0x80000000u), + /* XMMAbsMaskPS */ vec128i(0x7FFFFFFFu, 0x7FFFFFFFu, 0x7FFFFFFFu, 0x7FFFFFFFu), + /* XMMAbsMaskPD */ vec128i(0xFFFFFFFFu, 0x7FFFFFFFu, 0xFFFFFFFFu, 0x7FFFFFFFu), /* XMMByteSwapMask */ vec128i(0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu), /* XMMPermuteControl15 */ vec128b(15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15), /* XMMPackD3DCOLOR */ vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0x0C000408u), diff --git a/src/alloy/backend/x64/x64_emitter.h b/src/alloy/backend/x64/x64_emitter.h index 4b05e5134..a720e1970 100644 --- a/src/alloy/backend/x64/x64_emitter.h +++ b/src/alloy/backend/x64/x64_emitter.h @@ -37,24 +37,26 @@ enum RegisterFlags { enum XmmConst { XMMZero = 0, - XMMOne = 1, - XMMNegativeOne = 2, - XMMMaskX16Y16 = 3, - XMMFlipX16Y16 = 4, - XMMFixX16Y16 = 5, - XMMNormalizeX16Y16 = 6, - XMM3301 = 7, - XMMSignMaskPS = 8, - XMMSignMaskPD = 9, - XMMByteSwapMask = 10, - XMMPermuteControl15 = 11, - XMMPackD3DCOLOR = 12, - XMMUnpackD3DCOLOR = 13, - XMMOneOver255 = 14, - XMMShiftMaskPS = 15, - XMMShiftByteMask = 16, - XMMUnsignedDwordMax = 17, - XMM255 = 18, + XMMOne, + XMMNegativeOne, + XMMMaskX16Y16, + XMMFlipX16Y16, + XMMFixX16Y16, + XMMNormalizeX16Y16, + XMM3301, + XMMSignMaskPS, + XMMSignMaskPD, + XMMAbsMaskPS, + XMMAbsMaskPD, + XMMByteSwapMask, + XMMPermuteControl15, + XMMPackD3DCOLOR, + XMMUnpackD3DCOLOR, + 
XMMOneOver255, + XMMShiftMaskPS, + XMMShiftByteMask, + XMMUnsignedDwordMax, + XMM255, }; // Unfortunately due to the design of xbyak we have to pass this to the ctor. diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index 17502d137..865f93476 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -1946,6 +1946,8 @@ EMITTER(SELECT_F32, MATCH(I, I8<>, F32<>, F32<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { e.test(i.src1, i.src1); // TODO(benvanik): find a way to do this without branches. + // We may be able to load src1 into an xmm, cmp with zero, and use that + // as a selection mask to choose between src2 & src3. Xbyak::Label skip; e.vmovaps(i.dest, i.src3); e.jz(skip); @@ -2243,6 +2245,23 @@ EMITTER_OPCODE_TABLE( EMITTER_ASSOCIATIVE_COMPARE_INT(op, instr, inverse_instr, I16, Reg16); \ EMITTER_ASSOCIATIVE_COMPARE_INT(op, instr, inverse_instr, I32, Reg32); \ EMITTER_ASSOCIATIVE_COMPARE_INT(op, instr, inverse_instr, I64, Reg64); \ + EMITTER_OPCODE_TABLE( \ + OPCODE_COMPARE_##op##, \ + COMPARE_##op##_I8, \ + COMPARE_##op##_I16, \ + COMPARE_##op##_I32, \ + COMPARE_##op##_I64); +EMITTER_ASSOCIATIVE_COMPARE_XX(SLT, setl, setge); +EMITTER_ASSOCIATIVE_COMPARE_XX(SLE, setle, setg); +EMITTER_ASSOCIATIVE_COMPARE_XX(SGT, setg, setle); +EMITTER_ASSOCIATIVE_COMPARE_XX(SGE, setge, setl); +EMITTER_ASSOCIATIVE_COMPARE_XX(ULT, setb, setae); +EMITTER_ASSOCIATIVE_COMPARE_XX(ULE, setbe, seta); +EMITTER_ASSOCIATIVE_COMPARE_XX(UGT, seta, setbe); +EMITTER_ASSOCIATIVE_COMPARE_XX(UGE, setae, setb); + +// http://x86.renejeschke.de/html/file_module_x86_id_288.html +#define EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(op, instr) \ EMITTER(COMPARE_##op##_F32, MATCH(I, F32<>, F32<>>)) { \ static void Emit(X64Emitter& e, const EmitArgType& i) { \ e.vcomiss(i.src1, i.src2); \ @@ -2264,21 +2283,17 @@ EMITTER_OPCODE_TABLE( } \ }; \ EMITTER_OPCODE_TABLE( \ - OPCODE_COMPARE_##op##, \ - COMPARE_##op##_I8, \ - COMPARE_##op##_I16, \ - COMPARE_##op##_I32, \ - COMPARE_##op##_I64, \ + OPCODE_COMPARE_##op##_FLT, \ COMPARE_##op##_F32, \ COMPARE_##op##_F64); -EMITTER_ASSOCIATIVE_COMPARE_XX(SLT, setl, setge); -EMITTER_ASSOCIATIVE_COMPARE_XX(SLE, setle, setg); -EMITTER_ASSOCIATIVE_COMPARE_XX(SGT, setg, setle); -EMITTER_ASSOCIATIVE_COMPARE_XX(SGE, setge, setl); -EMITTER_ASSOCIATIVE_COMPARE_XX(ULT, setb, setae); -EMITTER_ASSOCIATIVE_COMPARE_XX(ULE, setbe, seta); -EMITTER_ASSOCIATIVE_COMPARE_XX(UGT, seta, setbe); -EMITTER_ASSOCIATIVE_COMPARE_XX(UGE, setae, setb); +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(SLT, setb); +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(SLE, setbe); +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(SGT, seta); +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(SGE, setae); +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(ULT, setb); +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(ULE, setbe); +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(UGT, seta); +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(UGE, setae); // ============================================================================ @@ -3356,18 +3371,18 @@ EMITTER(NEG_I64, MATCH(I, I64<>>)) { }; EMITTER(NEG_F32, MATCH(I, F32<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vpxor(i.dest, i.src1, e.GetXmmConstPtr(XMMSignMaskPS)); + e.vxorps(i.dest, i.src1, e.GetXmmConstPtr(XMMSignMaskPS)); } }; EMITTER(NEG_F64, MATCH(I, F64<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vpxor(i.dest, i.src1, e.GetXmmConstPtr(XMMSignMaskPD)); + e.vxorpd(i.dest, i.src1, e.GetXmmConstPtr(XMMSignMaskPD)); } }; EMITTER(NEG_V128, MATCH(I, 
V128<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { XEASSERT(!i.instr->flags); - e.vpxor(i.dest, i.src1, e.GetXmmConstPtr(XMMSignMaskPS)); + e.vxorps(i.dest, i.src1, e.GetXmmConstPtr(XMMSignMaskPS)); } }; EMITTER_OPCODE_TABLE( @@ -3386,20 +3401,17 @@ EMITTER_OPCODE_TABLE( // ============================================================================ EMITTER(ABS_F32, MATCH(I, F32<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vmovaps(e.xmm0, e.GetXmmConstPtr(XMMSignMaskPS)); - e.vpandn(i.dest, e.xmm0, i.src1); + e.vpand(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS)); } }; EMITTER(ABS_F64, MATCH(I, F64<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vmovaps(e.xmm0, e.GetXmmConstPtr(XMMSignMaskPD)); - e.vpandn(i.dest, e.xmm0, i.src1); + e.vpand(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPD)); } }; EMITTER(ABS_V128, MATCH(I, V128<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vmovaps(e.xmm0, e.GetXmmConstPtr(XMMSignMaskPS)); - e.vpandn(i.dest, e.xmm0, i.src1); + e.vpand(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS)); } }; EMITTER_OPCODE_TABLE( @@ -4980,6 +4992,14 @@ void alloy::backend::x64::RegisterSequences() { REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_ULE); REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_UGT); REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_UGE); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_SLT_FLT); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_SLE_FLT); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_SGT_FLT); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_SGE_FLT); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_ULT_FLT); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_ULE_FLT); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_UGT_FLT); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_UGE_FLT); REGISTER_EMITTER_OPCODE_TABLE(OPCODE_DID_CARRY); REGISTER_EMITTER_OPCODE_TABLE(OPCODE_DID_OVERFLOW); REGISTER_EMITTER_OPCODE_TABLE(OPCODE_DID_SATURATE); diff --git a/src/alloy/frontend/ppc/ppc_disasm.cc b/src/alloy/frontend/ppc/ppc_disasm.cc index ee9f21522..aa823a972 100644 --- a/src/alloy/frontend/ppc/ppc_disasm.cc +++ b/src/alloy/frontend/ppc/ppc_disasm.cc @@ -266,7 +266,7 @@ void Disasm_dcbz(InstrData& i, StringBuffer* str) { } void Disasm_fcmp(InstrData& i, StringBuffer* str) { - str->Append("%-8s cr%d, r%d, r%d", i.type->name, + str->Append("%-8s cr%d, f%d, f%d", i.type->name, i.X.RT >> 2, i.X.RA, i.X.RB); } From 0267efa5cc0065cef6c002829ff5f2d4a672ae65 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Fri, 30 May 2014 06:50:07 -0700 Subject: [PATCH 124/184] Always swapping the graphics system so that we see the profiler update. --- src/xenia/gpu/d3d11/d3d11_graphics_system.cc | 5 +++ src/xenia/gpu/d3d11/d3d11_profiler_display.cc | 33 ++++++++++++++----- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_system.cc b/src/xenia/gpu/d3d11/d3d11_graphics_system.cc index 825e9f1ff..70460d0cd 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_system.cc +++ b/src/xenia/gpu/d3d11/d3d11_graphics_system.cc @@ -170,6 +170,11 @@ void D3D11GraphicsSystem::Pump() { if (xe_pal_now() - last_interrupt_time_ > 500 / 1000.0) { DispatchInterruptCallback(0); } + + // Force a swap when profiling. 
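+ // Otherwise the profiler overlay only refreshes when the game itself
+ // presents a frame.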
+ if (Profiler::is_enabled()) { + window_->Swap(); + } } } diff --git a/src/xenia/gpu/d3d11/d3d11_profiler_display.cc b/src/xenia/gpu/d3d11/d3d11_profiler_display.cc index b0632b21f..276b73c40 100644 --- a/src/xenia/gpu/d3d11/d3d11_profiler_display.cc +++ b/src/xenia/gpu/d3d11/d3d11_profiler_display.cc @@ -423,13 +423,22 @@ uint32_t D3D11ProfilerDisplay::height() const { void D3D11ProfilerDisplay::Begin() { auto context = window_->context(); + D3D11_VIEWPORT viewport; + viewport.TopLeftX = 0.0f; + viewport.TopLeftY = 0.0f; + viewport.Width = static_cast(width()); + viewport.Height = static_cast(height()); + viewport.MinDepth = 0.0f; + viewport.MaxDepth = 1.0f; + context->RSSetViewports(1, &viewport); + // Setup projection matrix. - float left = 0.0f; - float right = (float)width(); - float bottom = (float)height(); - float top = 0.0f; - float z_near = -1.0f; - float z_far = 1.0f; + float left = viewport.TopLeftX; + float right = viewport.TopLeftX + viewport.Width; + float bottom = viewport.TopLeftY + viewport.Height; + float top = viewport.TopLeftY; + float z_near = viewport.MinDepth; + float z_far = viewport.MaxDepth; float projection[16] = { 0 }; projection[0] = 2.0f / (right - left); projection[5] = 2.0f / (top - bottom); @@ -452,9 +461,17 @@ void D3D11ProfilerDisplay::Begin() { context->VSSetShader(vertex_shader_, nullptr, 0); context->VSSetConstantBuffers(0, 1, &shader_constants_); context->PSSetShader(pixel_shader_, nullptr, 0); - context->PSSetSamplers(0, 1, &font_sampler_state_); context->PSSetConstantBuffers(0, 1, &shader_constants_); - context->PSSetShaderResources(0, 1, &font_texture_view_); + ID3D11SamplerState* ps_samplers[D3D11_COMMONSHADER_SAMPLER_SLOT_COUNT] = { + font_sampler_state_, + nullptr, + }; + context->PSSetSamplers(0, XECOUNT(ps_samplers), ps_samplers); + ID3D11ShaderResourceView* ps_resources[D3D11_COMMONSHADER_INPUT_RESOURCE_SLOT_COUNT] = { + font_texture_view_, + nullptr, + }; + context->PSSetShaderResources(0, XECOUNT(ps_resources), ps_resources); context->IASetInputLayout(shader_layout_); } From 6c92e6a5c5768790a473f5aab56bf1cf90b319bd Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Fri, 30 May 2014 11:44:24 -0700 Subject: [PATCH 125/184] Increasing microprofile stack size, as some of these trees are deep. 
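Every live SCOPE_profile_* scope occupies one slot in a fixed-size per-thread stack inside microprofile, and with the new tracers a guest call chain (Function::Call -> Processor::Execute -> resolved function -> nested guest calls) nests much deeper than a typical native profile. A sketch of the failure mode under the old limit (hypothetical code, not microprofile's actual internals):

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    // Hypothetical: one push per nested profiling scope.
    static const size_t kScopeStackMax = 32;  // old MICROPROFILE_STACK_MAX
    struct ScopeStack {
      uint64_t tokens[kScopeStackMax];
      size_t depth = 0;
      void Push(uint64_t token) {
        assert(depth < kScopeStackMax);  // trips once scopes nest past 32
        tokens[depth++] = token;
      }
      void Pop() { --depth; }
    };

Bumping MICROPROFILE_STACK_MAX to 128 leaves headroom for those trees.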
--- src/xenia/gpu/d3d11/d3d11_graphics_system.cc | 2 -- third_party/microprofile/microprofile.h | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_system.cc b/src/xenia/gpu/d3d11/d3d11_graphics_system.cc index 70460d0cd..88fe0f205 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_system.cc +++ b/src/xenia/gpu/d3d11/d3d11_graphics_system.cc @@ -152,8 +152,6 @@ void D3D11GraphicsSystem::Initialize() { } void D3D11GraphicsSystem::Pump() { - SCOPE_profile_cpu_f("gpu"); - if (swap_pending_) { swap_pending_ = false; diff --git a/third_party/microprofile/microprofile.h b/third_party/microprofile/microprofile.h index a2a840ea6..3e9dca24a 100644 --- a/third_party/microprofile/microprofile.h +++ b/third_party/microprofile/microprofile.h @@ -443,7 +443,7 @@ int64_t MicroProfileGetTick() #define MICROPROFILE_BUFFER_SIZE ((MICROPROFILE_PER_THREAD_BUFFER_SIZE)/sizeof(MicroProfileLogEntry)) #define MICROPROFILE_MAX_THREADS 32 #define MICROPROFILE_MAX_CONTEXT_SWITCH_THREADS 256 -#define MICROPROFILE_STACK_MAX 32 +#define MICROPROFILE_STACK_MAX 128 #define MICROPROFILE_MAX_PRESETS 5 #define MICROPROFILE_DEBUG 0 #define MICROPROFILE_TOOLTIP_MAX_STRINGS (32 + MICROPROFILE_MAX_GROUPS*2) From 8ba95efdcec57bf2bc5ad32e9bec3e1b21d1466a Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Fri, 30 May 2014 12:41:48 -0700 Subject: [PATCH 126/184] Fixing mulx usage. --- src/alloy/backend/x64/x64_sequences.cc | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index 865f93476..ec4163ed1 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -2924,7 +2924,7 @@ EMITTER(MUL_HI_I8, MATCH(I, I8<>, I8<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { if (i.instr->flags & ARITHMETIC_UNSIGNED) { // TODO(benvanik): place src1 in eax? still need to sign extend - e.movzx(e.eax, i.src1); + e.movzx(e.edx, i.src1); e.mulx(i.dest.reg().cvt32(), e.eax, i.src2.reg().cvt32()); } else { e.mov(e.al, i.src1); @@ -2938,7 +2938,7 @@ EMITTER(MUL_HI_I16, MATCH(I, I16<>, I16<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { if (i.instr->flags & ARITHMETIC_UNSIGNED) { // TODO(benvanik): place src1 in eax? still need to sign extend - e.movzx(e.eax, i.src1); + e.movzx(e.edx, i.src1); e.mulx(i.dest.reg().cvt32(), e.eax, i.src2.reg().cvt32()); } else { e.mov(e.ax, i.src1); @@ -2952,8 +2952,13 @@ EMITTER(MUL_HI_I32, MATCH(I, I32<>, I32<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { if (i.instr->flags & ARITHMETIC_UNSIGNED) { // TODO(benvanik): place src1 in eax? still need to sign extend - e.mov(e.eax, i.src1); - e.mulx(i.dest, e.eax, i.src2); + e.mov(e.edx, i.src1); + if (i.src2.is_constant) { + e.mov(e.eax, i.src2.constant()); + e.mulx(i.dest, e.edx, e.eax); + } else { + e.mulx(i.dest, e.edx, i.src2); + } } else { e.mov(e.eax, i.src1); e.imul(i.src2); @@ -2966,8 +2971,13 @@ EMITTER(MUL_HI_I64, MATCH(I, I64<>, I64<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { if (i.instr->flags & ARITHMETIC_UNSIGNED) { // TODO(benvanik): place src1 in eax? 
still need to sign extend - e.mov(e.rax, i.src1); - e.mulx(i.dest, e.rax, i.src2); + e.mov(e.rdx, i.src1); + if (i.src2.is_constant) { + e.mov(e.rax, i.src2.constant()); + e.mulx(i.dest, e.rdx, e.rax); + } else { + e.mulx(i.dest, e.rax, i.src2); + } } else { e.mov(e.rax, i.src1); e.imul(i.src2); From 1729c8ce6d58137261379be68d851062bef8ce9d Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Fri, 30 May 2014 14:00:28 -0700 Subject: [PATCH 127/184] Constant MIN/MAX. --- src/alloy/backend/x64/x64_sequences.cc | 30 ++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index ec4163ed1..0d7824c8a 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -1866,17 +1866,26 @@ EMITTER_OPCODE_TABLE( // ============================================================================ EMITTER(MAX_F32, MATCH(I, F32<>, F32<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vmaxss(i.dest, i.src1, i.src2); + EmitCommutativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vmaxss(dest, src1, src2); + }); } }; EMITTER(MAX_F64, MATCH(I, F64<>, F64<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vmaxsd(i.dest, i.src1, i.src2); + EmitCommutativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vmaxsd(dest, src1, src2); + }); } }; EMITTER(MAX_V128, MATCH(I, V128<>, V128<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vmaxps(i.dest, i.src1, i.src2); + EmitCommutativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vmaxps(dest, src1, src2); + }); } }; EMITTER_OPCODE_TABLE( @@ -1891,17 +1900,26 @@ EMITTER_OPCODE_TABLE( // ============================================================================ EMITTER(MIN_F32, MATCH(I, F32<>, F32<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vminss(i.dest, i.src1, i.src2); + EmitCommutativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vminss(dest, src1, src2); + }); } }; EMITTER(MIN_F64, MATCH(I, F64<>, F64<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vminsd(i.dest, i.src1, i.src2); + EmitCommutativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vminsd(dest, src1, src2); + }); } }; EMITTER(MIN_V128, MATCH(I, V128<>, V128<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vminps(i.dest, i.src1, i.src2); + EmitCommutativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vminps(dest, src1, src2); + }); } }; EMITTER_OPCODE_TABLE( From 691a3d4adec9f32029bc7987a770ade74d47cf21 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Fri, 30 May 2014 14:01:52 -0700 Subject: [PATCH 128/184] Fixing EXTRACT. --- src/alloy/backend/x64/x64_sequences.cc | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index 0d7824c8a..09a34a59c 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -48,6 +48,14 @@ static std::unordered_multimap sequence_table; } // namespace +// Selects the right byte/word/etc from a vector. We need to flip logical +// indices (0,1,2,3,4,5,6,7,...) = (3,2,1,0,7,6,5,4,...) 
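+// e.g. VEC128_B(0) = 3, VEC128_B(1) = 2, VEC128_B(5) = 6: the low two
+// bits of a byte index are inverted while the owning dword (upper bits)
+// stays put; VEC128_W inverts just the low bit of a word index. Dword
+// and float indices need no flip, hence the identity macros below.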
+#define VEC128_B(n) ((n) & 0xC) | ((~(n)) & 0x3) +#define VEC128_W(n) ((n) & 0x6) | ((~(n)) & 0x1) +#define VEC128_D(n) (n) +#define VEC128_F(n) (n) + + // ============================================================================ // OPCODE_COMMENT // ============================================================================ @@ -4402,7 +4410,7 @@ EMITTER_OPCODE_TABLE( EMITTER(EXTRACT_I8, MATCH(I, V128<>, I8<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { if (i.src2.is_constant) { - e.vpextrb(i.dest.reg().cvt32(), i.src1, i.src2.constant()); + e.vpextrb(i.dest.reg().cvt32(), i.src1, VEC128_B(i.src2.constant())); } else { XEASSERTALWAYS(); } @@ -4411,7 +4419,7 @@ EMITTER(EXTRACT_I8, MATCH(I, V128<>, I8<>>)) { EMITTER(EXTRACT_I16, MATCH(I, V128<>, I8<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { if (i.src2.is_constant) { - e.vpextrw(i.dest.reg().cvt32(), i.src1, i.src2.constant()); + e.vpextrw(i.dest.reg().cvt32(), i.src1, VEC128_W(i.src2.constant())); } else { XEASSERTALWAYS(); } @@ -4426,7 +4434,7 @@ EMITTER(EXTRACT_I32, MATCH(I, V128<>, I8<>>)) { vec128b(15, 14, 13, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), }; if (i.src2.is_constant) { - e.vpextrd(i.dest, i.src1, i.src2.constant()); + e.vpextrd(i.dest, i.src1, VEC128_D(i.src2.constant())); } else { // Get the desired word in xmm0, then extract that. // TODO(benvanik): find a better way, this sequence is terrible. @@ -4445,7 +4453,7 @@ EMITTER(EXTRACT_I32, MATCH(I, V128<>, I8<>>)) { EMITTER(EXTRACT_F32, MATCH(I, V128<>, I8<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { if (i.src2.is_constant) { - e.vextractps(i.dest, i.src1, i.src2.constant()); + e.vextractps(i.dest, i.src1, VEC128_F(i.src2.constant())); } else { XEASSERTALWAYS(); } From d65b5801f8745fa51393e91b21ffaf923c96ed92 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Fri, 30 May 2014 14:02:15 -0700 Subject: [PATCH 129/184] Adding hlide's comments for EXTRACT - need to implement/test. 
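(Editor's note, sketching the idea for context; illustrative, not code from
this change.) All of the suggested sequences build a VPSHUFB control mask from
the runtime lane index so the wanted element lands in the low bytes of xmm0,
then VMOVD it out. For the byte case, assuming src2 holds the index and
0 <= index < 16:

    e.mov(e.eax, 0x80808003);           // bytes 1-3 = 0x80 -> zeroed by pshufb
    e.xor(e.al, i.src2);                // xor 3 flips to the physical byte index
    e.vmovd(e.xmm0, e.eax);             // control mask
    e.vpshufb(e.xmm0, i.src1, e.xmm0);  // xmm0[7:0] = selected byte
    e.vmovd(i.dest.reg().cvt32(), e.xmm0);

The word/dword variants differ only in how the index is scaled into per-byte
shuffle offsets.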
--- src/alloy/backend/x64/x64_sequences.cc | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index 09a34a59c..3b7d386f0 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -4413,6 +4413,12 @@ EMITTER(EXTRACT_I8, MATCH(I, V128<>, I8<>>)) { e.vpextrb(i.dest.reg().cvt32(), i.src1, VEC128_B(i.src2.constant())); } else { XEASSERTALWAYS(); + // TODO(benvanik): try out hlide's version: + // mov eax, 0x80808080 + // mov al, i.src2 + // vmovd xmm0, eax + // vpshufb xmm0, i.src1, xmm0 + // vmovd i.dest.reg().cvt32(), xmm0 } } }; @@ -4421,6 +4427,14 @@ EMITTER(EXTRACT_I16, MATCH(I, V128<>, I8<>>)) { if (i.src2.is_constant) { e.vpextrw(i.dest.reg().cvt32(), i.src1, VEC128_W(i.src2.constant())); } else { + // TODO(benvanik): try out hlide's version: + // xor eax, eax + // mov al, i.src2 // eax = [i, 0, 0, 0] + // imul eax, eax, 0x00000202 // [i*2, i*2, 0, 0] supposedly that 0<= i < 8 + // add eax,0x80800100 // [i*2+0b00, i*2+0b01, 0x80, 0x80] + // vmovd xmm0, eax + // vpshufb xmm0, i.src1, xmm0 + // vmovd i.dest.reg().cvt32(), xmm0 XEASSERTALWAYS(); } } @@ -4436,8 +4450,15 @@ EMITTER(EXTRACT_I32, MATCH(I, V128<>, I8<>>)) { if (i.src2.is_constant) { e.vpextrd(i.dest, i.src1, VEC128_D(i.src2.constant())); } else { + // TODO(benvanik): try out hlide's version: + // xor eax, eax + // mov al, i.src2 // eax = [i, 0, 0, 0] + // imul eax, eax, 0x04040404 // [i*4, i*4, i*4, i*4] supposedly that 0<= i < 4 + // add eax,0x03020100 // [i*4+0b00, i*4+0b01, i*4+0b10, i*4+0b11] + // vmovd xmm0, eax + // vpshufb xmm0, i.src1, xmm0 + // vmovd i.dest.reg().cvt32(), xmm0 // Get the desired word in xmm0, then extract that. - // TODO(benvanik): find a better way, this sequence is terrible. e.xor(e.rax, e.rax); e.mov(e.al, i.src2); e.and(e.al, 0x03); From 0b42c72ec59b9eb4c4a1373172526201aa7c4c12 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Fri, 30 May 2014 20:09:00 -0700 Subject: [PATCH 130/184] Fixing const propagated DID_CARRY. 
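(Editor's note for context.) When both inputs of an ADD/SUB fold to constants
the instruction is deleted, so a DID_CARRY that would have read the host
EFLAGS afterwards has nothing left to read -- hence the new asserts in the
backend and the fold-time computation here. A minimal model of the checks, on
unsigned interpretations of the constants:

    bool add_did_carry = (uint64_t)b > ~(uint64_t)a;   // a + b wraps past the top
    bool sub_did_carry = b > a;                        // a - b borrows

Each DID_CARRY use of the folded value is then rewritten to a 0/1 constant and
the DID_CARRY instruction removed.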
--- src/alloy/backend/x64/x64_sequences.cc | 11 +++-- .../passes/constant_propagation_pass.cc | 49 +++++++++++++++++-- .../passes/constant_propagation_pass.h | 1 + src/alloy/hir/value.cc | 19 ++++++- src/alloy/hir/value.h | 4 +- 5 files changed, 74 insertions(+), 10 deletions(-) diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index 3b7d386f0..629c5bf33 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -2329,24 +2329,28 @@ EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(UGE, setae); // https://code.google.com/p/corkami/wiki/x86oddities EMITTER(DID_CARRY_I8, MATCH(I, I8<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { + XEASSERT(!i.src1.is_constant); e.LoadEflags(); e.setc(i.dest); } }; EMITTER(DID_CARRY_I16, MATCH(I, I16<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { + XEASSERT(!i.src1.is_constant); e.LoadEflags(); e.setc(i.dest); } }; EMITTER(DID_CARRY_I32, MATCH(I, I32<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { + XEASSERT(!i.src1.is_constant); e.LoadEflags(); e.setc(i.dest); } }; EMITTER(DID_CARRY_I64, MATCH(I, I64<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { + XEASSERT(!i.src1.is_constant); e.LoadEflags(); e.setc(i.dest); } @@ -4414,8 +4418,8 @@ EMITTER(EXTRACT_I8, MATCH(I, V128<>, I8<>>)) { } else { XEASSERTALWAYS(); // TODO(benvanik): try out hlide's version: - // mov eax, 0x80808080 - // mov al, i.src2 + // mov eax, 0x80808003 + // xor al, i.src2.cvt8() // vmovd xmm0, eax // vpshufb xmm0, i.src1, xmm0 // vmovd i.dest.reg().cvt32(), xmm0 @@ -4430,6 +4434,7 @@ EMITTER(EXTRACT_I16, MATCH(I, V128<>, I8<>>)) { // TODO(benvanik): try out hlide's version: // xor eax, eax // mov al, i.src2 // eax = [i, 0, 0, 0] + // xor eax, 0x80800203 // imul eax, eax, 0x00000202 // [i*2, i*2, 0, 0] supposedly that 0<= i < 8 // add eax,0x80800100 // [i*2+0b00, i*2+0b01, 0x80, 0x80] // vmovd xmm0, eax @@ -4454,7 +4459,7 @@ EMITTER(EXTRACT_I32, MATCH(I, V128<>, I8<>>)) { // xor eax, eax // mov al, i.src2 // eax = [i, 0, 0, 0] // imul eax, eax, 0x04040404 // [i*4, i*4, i*4, i*4] supposedly that 0<= i < 4 - // add eax,0x03020100 // [i*4+0b00, i*4+0b01, i*4+0b10, i*4+0b11] + // xor/add eax, 0x00010203 // [i*4+0b00, i*4+0b01, i*4+0b10, i*4+0b11] // vmovd xmm0, eax // vpshufb xmm0, i.src1, xmm0 // vmovd i.dest.reg().cvt32(), xmm0 diff --git a/src/alloy/compiler/passes/constant_propagation_pass.cc b/src/alloy/compiler/passes/constant_propagation_pass.cc index f8430c509..140d0bf9c 100644 --- a/src/alloy/compiler/passes/constant_propagation_pass.cc +++ b/src/alloy/compiler/passes/constant_propagation_pass.cc @@ -43,6 +43,14 @@ int ConstantPropagationPass::Run(HIRBuilder* builder) { // v1 = add 1000, 1000 // store_context +200, 2000 // A DCE run after this should clean up any of the values no longer needed. + // + // Special care needs to be taken with paired instructions. 
For example, + // DID_CARRY needs to be set as a constant: + // v1 = sub.2 20, 1 + // v2 = did_carry v1 + // should become: + // v1 = 19 + // v2 = 0 Block* block = builder->first_block(); while (block) { @@ -252,19 +260,41 @@ int ConstantPropagationPass::Run(HIRBuilder* builder) { } break; + case OPCODE_DID_CARRY: + XEASSERT(!i->src1.value->IsConstant()); + break; + case OPCODE_DID_OVERFLOW: + XEASSERT(!i->src1.value->IsConstant()); + break; + case OPCODE_DID_SATURATE: + XEASSERT(!i->src1.value->IsConstant()); + break; + case OPCODE_ADD: if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { v->set_from(i->src1.value); - v->Add(i->src2.value); + bool did_carry = v->Add(i->src2.value); + bool propagate_carry = !!(i->flags & ARITHMETIC_SET_CARRY); i->Remove(); + + // If carry is set find the DID_CARRY and fix it. + if (propagate_carry) { + PropagateCarry(v, did_carry); + } } break; - // TODO(benvanik): ADD_CARRY + // TODO(benvanik): ADD_CARRY (w/ ARITHMETIC_SET_CARRY) case OPCODE_SUB: if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { v->set_from(i->src1.value); - v->Sub(i->src2.value); + bool did_carry = v->Sub(i->src2.value); + bool propagate_carry = !!(i->flags & ARITHMETIC_SET_CARRY); i->Remove(); + + // If carry is set find the DID_CARRY and fix it. + if (propagate_carry) { + PropagateCarry(v, did_carry); + } } break; case OPCODE_MUL: @@ -393,3 +423,16 @@ int ConstantPropagationPass::Run(HIRBuilder* builder) { return 0; } + +void ConstantPropagationPass::PropagateCarry(hir::Value* v, bool did_carry) { + auto next = v->use_head; + while (next) { + auto use = next; + next = use->next; + if (use->instr->opcode == &OPCODE_DID_CARRY_info) { + // Replace carry value. + use->instr->dest->set_constant(did_carry ? 1 : 0); + use->instr->Remove(); + } + } +} diff --git a/src/alloy/compiler/passes/constant_propagation_pass.h b/src/alloy/compiler/passes/constant_propagation_pass.h index ce705522b..2220394ad 100644 --- a/src/alloy/compiler/passes/constant_propagation_pass.h +++ b/src/alloy/compiler/passes/constant_propagation_pass.h @@ -26,6 +26,7 @@ public: virtual int Run(hir::HIRBuilder* builder); private: + void PropagateCarry(hir::Value* v, bool did_carry); }; diff --git a/src/alloy/hir/value.cc b/src/alloy/hir/value.cc index 10fc62cad..0f723e943 100644 --- a/src/alloy/hir/value.cc +++ b/src/alloy/hir/value.cc @@ -187,19 +187,26 @@ void Value::Round(RoundMode round_mode) { XEASSERTALWAYS(); } -void Value::Add(Value* other) { +bool Value::Add(Value* other) { + #define CHECK_DID_CARRY(v1, v2) (((uint64_t)v2) > ~((uint64_t)v1)) + #define ADD_DID_CARRY(a, b) CHECK_DID_CARRY(a, b) XEASSERT(type == other->type); + bool did_carry = false; switch (type) { case INT8_TYPE: + did_carry = ADD_DID_CARRY(constant.i8, other->constant.i8); constant.i8 += other->constant.i8; break; case INT16_TYPE: + did_carry = ADD_DID_CARRY(constant.i16, other->constant.i16); constant.i16 += other->constant.i16; break; case INT32_TYPE: + did_carry = ADD_DID_CARRY(constant.i32, other->constant.i32); constant.i32 += other->constant.i32; break; case INT64_TYPE: + did_carry = ADD_DID_CARRY(constant.i64, other->constant.i64); constant.i64 += other->constant.i64; break; case FLOAT32_TYPE: @@ -212,21 +219,28 @@ void Value::Add(Value* other) { XEASSERTALWAYS(); break; } + return did_carry; } -void Value::Sub(Value* other) { +bool Value::Sub(Value* other) { + #define SUB_DID_CARRY(a, b) (b > a) XEASSERT(type == other->type); + bool did_carry = false; switch (type) { case INT8_TYPE: + did_carry = 
SUB_DID_CARRY(constant.i8, other->constant.i8); constant.i8 -= other->constant.i8; break; case INT16_TYPE: + did_carry = SUB_DID_CARRY(constant.i16, other->constant.i16); constant.i16 -= other->constant.i16; break; case INT32_TYPE: + did_carry = SUB_DID_CARRY(constant.i32, other->constant.i32); constant.i32 -= other->constant.i32; break; case INT64_TYPE: + did_carry = SUB_DID_CARRY(constant.i64, other->constant.i64); constant.i64 -= other->constant.i64; break; case FLOAT32_TYPE: @@ -239,6 +253,7 @@ void Value::Sub(Value* other) { XEASSERTALWAYS(); break; } + return did_carry; } void Value::Mul(Value* other) { diff --git a/src/alloy/hir/value.h b/src/alloy/hir/value.h index 9a1f668f5..3c4e82619 100644 --- a/src/alloy/hir/value.h +++ b/src/alloy/hir/value.h @@ -375,8 +375,8 @@ public: void Truncate(TypeName target_type); void Convert(TypeName target_type, RoundMode round_mode); void Round(RoundMode round_mode); - void Add(Value* other); - void Sub(Value* other); + bool Add(Value* other); + bool Sub(Value* other); void Mul(Value* other); void Div(Value* other); static void MulAdd(Value* dest, Value* value1, Value* value2, Value* value3); From 7acbf759e2942bc05cd10653b153c3b8ffe74ddf Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Fri, 30 May 2014 21:38:33 -0700 Subject: [PATCH 131/184] Fixing UNPACK D3DCOLOR. Probably. --- src/alloy/backend/x64/x64_emitter.cc | 2 +- src/alloy/backend/x64/x64_sequences.cc | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index 0096a08fa..090c8fe9a 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -494,7 +494,7 @@ Address X64Emitter::GetXmmConstPtr(XmmConst id) { /* XMMByteSwapMask */ vec128i(0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu), /* XMMPermuteControl15 */ vec128b(15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15), /* XMMPackD3DCOLOR */ vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0x0C000408u), - /* XMMUnpackD3DCOLOR */ vec128i(0xFFFFFF02u, 0xFFFFFF01u, 0xFFFFFF00u, 0xFFFFFF03u), + /* XMMUnpackD3DCOLOR */ vec128i(0xFFFFFF0Eu, 0xFFFFFF0Du, 0xFFFFFF0Cu, 0xFFFFFF0Fu), /* XMMOneOver255 */ vec128f(1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f), /* XMMShiftMaskPS */ vec128i(0x0000001Fu, 0x0000001Fu, 0x0000001Fu, 0x0000001Fu), /* XMMShiftByteMask */ vec128i(0x000000FFu, 0x000000FFu, 0x000000FFu, 0x000000FFu), diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index 629c5bf33..9de06e58f 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -4826,6 +4826,8 @@ EMITTER(UNPACK, MATCH(I, V128<>>)) { // src = ZZYYXXWW // unpack to 000000ZZ,000000YY,000000XX,000000WW e.vpshufb(i.dest, i.src1, e.GetXmmConstPtr(XMMUnpackD3DCOLOR)); + // int -> float + e.vcvtdq2ps(i.dest, i.dest); // mult by 1/255 e.vmulps(i.dest, e.GetXmmConstPtr(XMMOneOver255)); } From df5d86e78c2c21b059c869fda05d374e85c42bb4 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Fri, 30 May 2014 22:29:08 -0700 Subject: [PATCH 132/184] Tweaking load/store. Nasty. 
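(Editor's summary.) The I32 and I64 LOAD/STORE paths now funnel through the
shared EmitLoadCheck/EmitStoreCheck helpers, so the register-mapped range
test, the byte-swapped dynamic register call, and the tracing hooks live in
one place instead of being duplicated per width. The emitted shape, as
pseudo-C (handler names illustrative; only DynamicRegisterStore appears in
the diff):

    if ((address >> 24) == 0x7F) {
      value = dynamic_register_load(context, address);  // GPU/etc. registers
    } else {
      value = *(membase + address);                     // plain guest memory
    }
    if (tracing) TraceMemoryLoadI32(address, value);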
--- src/alloy/backend/x64/x64_sequences.cc | 68 +++++++++++++++----------- 1 file changed, 40 insertions(+), 28 deletions(-) diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index 9de06e58f..438e1d2d9 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -1529,9 +1529,10 @@ void DynamicRegisterStore(void* raw_context, uint32_t address, uint64_t value) { } } template -void EmitLoadCheck(X64Emitter& e, const RegExp& addr, DEST_REG& dest) { +void EmitLoadCheck(X64Emitter& e, const I64<>& addr_value, DEST_REG& dest) { // rax = reserved // if (address >> 24 == 0x7F) call register load handler; + auto addr = ComputeMemoryAddress(e, addr_value); e.lea(e.r8d, e.ptr[addr]); e.shr(e.r8d, 24); e.cmp(e.r8b, 0x7F); @@ -1550,13 +1551,23 @@ void EmitLoadCheck(X64Emitter& e, const RegExp& addr, DEST_REG& dest) { if (DEST_REG::key_type == KEY_TYPE_V_I32) { e.mov(dest, e.dword[addr]); } + if (IsTracingData()) { + e.mov(e.r8, dest); + e.lea(e.rdx, e.ptr[addr]); + if (DEST_REG::key_type == KEY_TYPE_V_I32) { + e.CallNative(TraceMemoryLoadI32); + } else if (DEST_REG::key_type == KEY_TYPE_V_I64) { + e.CallNative(TraceMemoryLoadI64); + } + } e.L(skip_load); e.outLocalLabel(); } template -void EmitStoreCheck(X64Emitter& e, const RegExp& addr, SRC_REG& src) { +void EmitStoreCheck(X64Emitter& e, const I64<>& addr_value, SRC_REG& src) { // rax = reserved // if (address >> 24 == 0x7F) call register store handler; + auto addr = ComputeMemoryAddress(e, addr_value); e.lea(e.r8d, e.ptr[addr]); e.shr(e.r8d, 24); e.cmp(e.r8b, 0x7F); @@ -1567,11 +1578,18 @@ void EmitStoreCheck(X64Emitter& e, const RegExp& addr, SRC_REG& src) { e.lea(e.rdx, e.ptr[addr]); if (SRC_REG::key_type == KEY_TYPE_V_I32) { if (src.is_constant) { - e.mov(e.r8d, XESWAP32(src.constant())); + e.mov(e.r8d, XESWAP32(static_cast(src.constant()))); } else { e.mov(e.r8d, src); e.bswap(e.r8d); } + } else if (SRC_REG::key_type == KEY_TYPE_V_I64) { + if (src.is_constant) { + e.mov(e.r8, XESWAP64(static_cast(src.constant()))); + } else { + e.mov(e.r8, src); + e.bswap(e.r8); + } } e.CallNative(DynamicRegisterStore); e.jmp(skip_load); @@ -1582,6 +1600,21 @@ void EmitStoreCheck(X64Emitter& e, const RegExp& addr, SRC_REG& src) { } else { e.mov(e.dword[addr], src); } + } else if (SRC_REG::key_type == KEY_TYPE_V_I64) { + if (src.is_constant) { + e.MovMem64(addr, src.constant()); + } else { + e.mov(e.qword[addr], src); + } + } + if (IsTracingData()) { + e.mov(e.r8, e.qword[addr]); + e.lea(e.rdx, e.ptr[addr]); + if (SRC_REG::key_type == KEY_TYPE_V_I32) { + e.CallNative(TraceMemoryStoreI32); + } else if (SRC_REG::key_type == KEY_TYPE_V_I64) { + e.CallNative(TraceMemoryStoreI64); + } } e.L(skip_load); e.outLocalLabel(); @@ -1619,13 +1652,7 @@ EMITTER(LOAD_I32, MATCH(I, I64<>>)) { if (CheckLoadAccessCallback(e, i)) { return; } - auto addr = ComputeMemoryAddress(e, i.src1); - EmitLoadCheck(e, addr, i.dest); - if (IsTracingData()) { - e.mov(e.r8, i.dest); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(TraceMemoryLoadI32); - } + EmitLoadCheck(e, i.src1, i.dest); } }; EMITTER(LOAD_I64, MATCH(I, I64<>>)) { @@ -1772,13 +1799,7 @@ EMITTER(STORE_I32, MATCH(I, I32<>>)) { if (CheckStoreAccessCallback(e, i)) { return; } - auto addr = ComputeMemoryAddress(e, i.src1); - EmitStoreCheck(e, addr, i.src2); - if (IsTracingData()) { - e.mov(e.r8, e.dword[addr]); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(TraceMemoryStoreI32); - } + EmitStoreCheck(e, i.src1, i.src2); } }; EMITTER(STORE_I64, MATCH(I, I64<>>)) 
{ @@ -1786,17 +1807,7 @@ EMITTER(STORE_I64, MATCH(I, I64<>>)) { if (CheckStoreAccessCallback(e, i)) { return; } - auto addr = ComputeMemoryAddress(e, i.src1); - if (i.src2.is_constant) { - e.MovMem64(addr, i.src2.constant()); - } else { - e.mov(e.qword[addr], i.src2); - } - if (IsTracingData()) { - e.mov(e.r8, e.qword[addr]); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(TraceMemoryStoreI64); - } + EmitStoreCheck(e, i.src1, i.src2); } }; EMITTER(STORE_F32, MATCH(I, F32<>>)) { @@ -1940,6 +1951,7 @@ EMITTER_OPCODE_TABLE( // ============================================================================ // OPCODE_SELECT // ============================================================================ +// dest = src1 ? src2 : src3 EMITTER(SELECT_I8, MATCH(I, I8<>, I8<>, I8<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { e.test(i.src1, i.src1); From 002aaab77de4a988c18c6fc6833089fbff8db2bf Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Fri, 30 May 2014 22:36:06 -0700 Subject: [PATCH 133/184] Fixing VECTOR_COMPARE_SGE. --- src/alloy/backend/x64/x64_sequences.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index 438e1d2d9..84e265592 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -2469,18 +2469,18 @@ EMITTER(VECTOR_COMPARE_SGE_V128, MATCH(I, V128 [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { switch (i.instr->flags) { case INT8_TYPE: - e.vpcmpgtb(dest, src1, src2); e.vpcmpeqb(e.xmm0, src1, src2); + e.vpcmpgtb(dest, src1, src2); e.vpor(dest, e.xmm0); break; case INT16_TYPE: - e.vpcmpgtw(dest, src1, src2); e.vpcmpeqw(e.xmm0, src1, src2); + e.vpcmpgtw(dest, src1, src2); e.vpor(dest, e.xmm0); break; case INT32_TYPE: - e.vpcmpgtd(dest, src1, src2); e.vpcmpeqd(e.xmm0, src1, src2); + e.vpcmpgtd(dest, src1, src2); e.vpor(dest, e.xmm0); break; case FLOAT32_TYPE: From 334f744fb812f63aacf777527ef05ca6d8433362 Mon Sep 17 00:00:00 2001 From: hlide Date: Sat, 31 May 2014 14:23:22 +0200 Subject: [PATCH 134/184] EXTRACT_I8, EXTRACT_I16, EXTRACT_I32, EXTRACT_F32: properly modified the code in comments --- src/alloy/backend/x64/x64_sequences.cc | 48 +++++++++++++++----------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index 438e1d2d9..4be6a15a2 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -4430,11 +4430,12 @@ EMITTER(EXTRACT_I8, MATCH(I, V128<>, I8<>>)) { } else { XEASSERTALWAYS(); // TODO(benvanik): try out hlide's version: - // mov eax, 0x80808003 - // xor al, i.src2.cvt8() - // vmovd xmm0, eax - // vpshufb xmm0, i.src1, xmm0 - // vmovd i.dest.reg().cvt32(), xmm0 + // e.mov(e.eax, 0x80808003); + // e.xor(e.al, i.src2); + // e.and(e.al, 15); + // e.vmovd(e.xmm0, e.eax); + // e.vpshufb(e.xmm0, i.src1, e.xmm0); + // e.vmovd(i.dest.reg().cvt32(), e.xmm0); } } }; @@ -4444,14 +4445,13 @@ EMITTER(EXTRACT_I16, MATCH(I, V128<>, I8<>>)) { e.vpextrw(i.dest.reg().cvt32(), i.src1, VEC128_W(i.src2.constant())); } else { // TODO(benvanik): try out hlide's version: - // xor eax, eax - // mov al, i.src2 // eax = [i, 0, 0, 0] - // xor eax, 0x80800203 - // imul eax, eax, 0x00000202 // [i*2, i*2, 0, 0] supposedly that 0<= i < 8 - // add eax,0x80800100 // [i*2+0b00, i*2+0b01, 0x80, 0x80] - // vmovd xmm0, eax - // vpshufb xmm0, i.src1, xmm0 - // vmovd i.dest.reg().cvt32(), xmm0 + // e.mov(e.eax, 7); + // 
e.and(e.al, i.src2); // eax = [i&7, 0, 0, 0] + // e.imul(e.eax, 0x00000202); // [(i&7)*2, (i&7)*2, 0, 0] + // e.xor(e.eax, 0x80800203); // [((i&7)*2)^3, ((i&7)*2)^2, 0x80, 0x80] + // e.vmovd(e.xmm0, e.eax); + // e.vpshufb(e.xmm0, i.src1, e.xmm0); + // e.vmovd(i.dest.reg().cvt32(), e.xmm0); XEASSERTALWAYS(); } } @@ -4468,13 +4468,13 @@ EMITTER(EXTRACT_I32, MATCH(I, V128<>, I8<>>)) { e.vpextrd(i.dest, i.src1, VEC128_D(i.src2.constant())); } else { // TODO(benvanik): try out hlide's version: - // xor eax, eax - // mov al, i.src2 // eax = [i, 0, 0, 0] - // imul eax, eax, 0x04040404 // [i*4, i*4, i*4, i*4] supposedly that 0<= i < 4 - // xor/add eax, 0x00010203 // [i*4+0b00, i*4+0b01, i*4+0b10, i*4+0b11] - // vmovd xmm0, eax - // vpshufb xmm0, i.src1, xmm0 - // vmovd i.dest.reg().cvt32(), xmm0 + // e.mov(e.eax, 3); + // e.and(e.al, i.src2); // eax = [(i&3), 0, 0, 0] + // e.imul(e.eax, 0x04040404); // [(i&3)*4, (i&3)*4, (i&3)*4, (i&3)*4] + // e.add(e.eax, 0x00010203); // [((i&3)*4)+3, ((i&3)*4)+2, ((i&3)*4)+1, ((i&3)*4)+0] + // e.vmovd(e.xmm0, e.eax); + // e.vpshufb(e.xmm0, i.src1, e.xmm0); + // e.vmovd(i.dest.reg().cvt32(), e.xmm0); // Get the desired word in xmm0, then extract that. e.xor(e.rax, e.rax); e.mov(e.al, i.src2); @@ -4494,6 +4494,14 @@ EMITTER(EXTRACT_F32, MATCH(I, V128<>, I8<>>)) { e.vextractps(i.dest, i.src1, VEC128_F(i.src2.constant())); } else { XEASSERTALWAYS(); + // TODO(benvanik): try out hlide's version: + // e.mov(e.eax, 3); + // e.and(e.al, i.src2); // eax = [(i&3), 0, 0, 0] + // e.imul(e.eax, 0x04040404); // [(i&3)*4, (i&3)*4, (i&3)*4, (i&3)*4] + // e.add(e.eax, 0x00010203); // [((i&3)*4)+3, ((i&3)*4)+2, ((i&3)*4)+1, ((i&3)*4)+0] + // e.vmovd(e.xmm0, e.eax); + // e.vpshufb(e.xmm0, i.src1, e.xmm0); + // e.vmovd(i.dest, e.xmm0); } } }; From 8525cf739ef63afac65629f607fbe20d394441d2 Mon Sep 17 00:00:00 2001 From: hlide Date: Sat, 31 May 2014 16:51:15 +0200 Subject: [PATCH 135/184] Update x64_sequences.cc --- src/alloy/backend/x64/x64_sequences.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index 4be6a15a2..00821cc14 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -3269,7 +3269,7 @@ EMITTER_OPCODE_TABLE( // OPCODE_MUL_ADD // ============================================================================ // d = 1 * 2 + 3 -// $0 = $1×$0 + $2 +// $0 = $1�$0 + $2 // TODO(benvanik): use other forms (132/213/etc) to avoid register shuffling. // dest could be src2 or src3 - need to ensure it's not before overwriting dest // perhaps use other 132/213/etc @@ -3332,7 +3332,7 @@ EMITTER_OPCODE_TABLE( // OPCODE_MUL_SUB // ============================================================================ // d = 1 * 2 - 3 -// $0 = $2×$0 - $3 +// $0 = $2�$0 - $3 // TODO(benvanik): use other forms (132/213/etc) to avoid register shuffling. 
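// (Editor's note on the 132/213/231 naming: the digits say which operands
// feed the multiply and the add -- vfmadd213ps a, b, c computes a = b*a + c,
// so choosing the right form lets dest alias either source without an
// extra mov.)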
// dest could be src2 or src3 - need to ensure it's not before overwriting dest // perhaps use other 132/213/etc @@ -4445,7 +4445,7 @@ EMITTER(EXTRACT_I16, MATCH(I, V128<>, I8<>>)) { e.vpextrw(i.dest.reg().cvt32(), i.src1, VEC128_W(i.src2.constant())); } else { // TODO(benvanik): try out hlide's version: - // e.mov(e.eax, 7); + // e.mov(e.eax, 7); // e.and(e.al, i.src2); // eax = [i&7, 0, 0, 0] // e.imul(e.eax, 0x00000202); // [(i&7)*2, (i&7)*2, 0, 0] // e.xor(e.eax, 0x80800203); // [((i&7)*2)^3, ((i&7)*2)^2, 0x80, 0x80] @@ -4468,7 +4468,7 @@ EMITTER(EXTRACT_I32, MATCH(I, V128<>, I8<>>)) { e.vpextrd(i.dest, i.src1, VEC128_D(i.src2.constant())); } else { // TODO(benvanik): try out hlide's version: - // e.mov(e.eax, 3); + // e.mov(e.eax, 3); // e.and(e.al, i.src2); // eax = [(i&3), 0, 0, 0] // e.imul(e.eax, 0x04040404); // [(i&3)*4, (i&3)*4, (i&3)*4, (i&3)*4] // e.add(e.eax, 0x00010203); // [((i&3)*4)+3, ((i&3)*4)+2, ((i&3)*4)+1, ((i&3)*4)+0] @@ -4495,7 +4495,7 @@ EMITTER(EXTRACT_F32, MATCH(I, V128<>, I8<>>)) { } else { XEASSERTALWAYS(); // TODO(benvanik): try out hlide's version: - // e.mov(e.eax, 3); + // e.mov(e.eax, 3); // e.and(e.al, i.src2); // eax = [(i&3), 0, 0, 0] // e.imul(e.eax, 0x04040404); // [(i&3)*4, (i&3)*4, (i&3)*4, (i&3)*4] // e.add(e.eax, 0x00010203); // [((i&3)*4)+3, ((i&3)*4)+2, ((i&3)*4)+1, ((i&3)*4)+0] From 430d8277709940c8231f4583e0c150af49b31cbd Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sat, 31 May 2014 08:06:04 -0700 Subject: [PATCH 136/184] Like, really fixing SELECT. Frogger now runs ^_^ --- src/alloy/backend/x64/x64_sequences.cc | 52 ++++++++++++++------------ 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index 84e265592..ad1864095 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -1952,6 +1952,8 @@ EMITTER_OPCODE_TABLE( // OPCODE_SELECT // ============================================================================ // dest = src1 ? src2 : src3 +// TODO(benvanik): match compare + select sequences, as often it's something +// like SELECT(VECTOR_COMPARE_SGE(a, b), a, b) EMITTER(SELECT_I8, MATCH(I, I8<>, I8<>, I8<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { e.test(i.src1, i.src1); @@ -1982,37 +1984,41 @@ EMITTER(SELECT_I64, MATCH(I, I8<>, I64<>, I64<>>)) { }; EMITTER(SELECT_F32, MATCH(I, I8<>, F32<>, F32<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - // TODO(benvanik): find a way to do this without branches. - // We may be able to load src1 into an xmm, cmp with zero, and use that - // as a selection mask to choose between src2 & src3. - Xbyak::Label skip; - e.vmovaps(i.dest, i.src3); - e.jz(skip); - e.vmovaps(i.dest, i.src2); - e.L(skip); + // TODO(benvanik): find a shorter sequence. + // xmm0 = src1 != 0 ? 1111... : 0000.... + e.movzx(e.eax, i.src1); + e.vmovd(e.xmm1, e.eax); + e.vxorps(e.xmm0, e.xmm0); + e.vcmpneqss(e.xmm0, e.xmm1); + e.vpand(e.xmm1, e.xmm0, i.src2); + e.vpandn(i.dest, e.xmm0, i.src3); + e.vpor(i.dest, e.xmm1); } }; EMITTER(SELECT_F64, MATCH(I, I8<>, F64<>, F64<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - // TODO(benvanik): find a way to do this without branches. - Xbyak::Label skip; - e.vmovaps(i.dest, i.src3); - e.jz(skip); - e.vmovaps(i.dest, i.src2); - e.L(skip); + // xmm0 = src1 != 0 ? 1111... : 0000.... 
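+ // (Editor's note: same branchless pattern as SELECT_F32 above --
+ //    mask = (src1 != 0) ? ~0 : 0            via vcmpneqsd against zero
+ //    dest = (mask & src2) | (~mask & src3)  via vpand / vpandn / vpor
+ //  so no jump is emitted on this path.)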
+ e.movzx(e.eax, i.src1); + e.vmovd(e.xmm1, e.eax); + e.vxorpd(e.xmm0, e.xmm0); + e.vcmpneqsd(e.xmm0, e.xmm1); + e.vpand(e.xmm1, e.xmm0, i.src2); + e.vpandn(i.dest, e.xmm0, i.src3); + e.vpor(i.dest, e.xmm1); } }; EMITTER(SELECT_V128, MATCH(I, I8<>, V128<>, V128<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - // TODO(benvanik): find a way to do this without branches. - Xbyak::Label skip; - e.vmovaps(i.dest, i.src3); - e.jz(skip); - e.vmovaps(i.dest, i.src2); - e.L(skip); + // TODO(benvanik): find a shorter sequence. + // xmm0 = src1 != 0 ? 1111... : 0000.... + e.movzx(e.eax, i.src1); + e.vmovd(e.xmm1, e.eax); + e.vpbroadcastd(e.xmm1, e.xmm1); + e.vxorps(e.xmm0, e.xmm0); + e.vcmpneqps(e.xmm0, e.xmm1); + e.vpand(e.xmm1, e.xmm0, i.src2); + e.vpandn(i.dest, e.xmm0, i.src3); + e.vpor(i.dest, e.xmm1); } }; EMITTER_OPCODE_TABLE( From e45fc3dc5658f107c1782990781ad5b7e4d0e5ab Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sat, 31 May 2014 08:13:13 -0700 Subject: [PATCH 137/184] Reducing profiling forced swap frequency. --- src/xenia/gpu/d3d11/d3d11_graphics_system.cc | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_system.cc b/src/xenia/gpu/d3d11/d3d11_graphics_system.cc index 88fe0f205..553ed8828 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_system.cc +++ b/src/xenia/gpu/d3d11/d3d11_graphics_system.cc @@ -164,14 +164,16 @@ void D3D11GraphicsSystem::Pump() { DispatchInterruptCallback(0); } else { - // If we have gone too long without an interrupt, fire one. - if (xe_pal_now() - last_interrupt_time_ > 500 / 1000.0) { + double time_since_last_interrupt = xe_pal_now() - last_interrupt_time_; + if (time_since_last_interrupt > 0.5) { + // If we have gone too long without an interrupt, fire one. DispatchInterruptCallback(0); } - - // Force a swap when profiling. - if (Profiler::is_enabled()) { - window_->Swap(); + if (time_since_last_interrupt > 0.3) { + // Force a swap when profiling. + if (Profiler::is_enabled()) { + window_->Swap(); + } } } } From 0598df1d9efe3fbe4872f23c204a6dcbd84df1be Mon Sep 17 00:00:00 2001 From: hlide Date: Sat, 31 May 2014 17:14:40 +0200 Subject: [PATCH 138/184] Removed weird symbols in x64_sequences.cc --- src/alloy/backend/x64/x64_sequences.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index 00821cc14..5bf7dcca3 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -3269,7 +3269,7 @@ EMITTER_OPCODE_TABLE( // OPCODE_MUL_ADD // ============================================================================ // d = 1 * 2 + 3 -// $0 = $1�$0 + $2 +// $0 = $1x$0 + $2 // TODO(benvanik): use other forms (132/213/etc) to avoid register shuffling. // dest could be src2 or src3 - need to ensure it's not before overwriting dest // perhaps use other 132/213/etc @@ -3332,7 +3332,7 @@ EMITTER_OPCODE_TABLE( // OPCODE_MUL_SUB // ============================================================================ // d = 1 * 2 - 3 -// $0 = $2�$0 - $3 +// $0 = $2x$0 - $3 // TODO(benvanik): use other forms (132/213/etc) to avoid register shuffling. 
// dest could be src2 or src3 - need to ensure it's not before overwriting dest // perhaps use other 132/213/etc @@ -4495,7 +4495,7 @@ EMITTER(EXTRACT_F32, MATCH(I, V128<>, I8<>>)) { } else { XEASSERTALWAYS(); // TODO(benvanik): try out hlide's version: - // e.mov(e.eax, 3); + // e.mov(e.eax, 3); // e.and(e.al, i.src2); // eax = [(i&3), 0, 0, 0] // e.imul(e.eax, 0x04040404); // [(i&3)*4, (i&3)*4, (i&3)*4, (i&3)*4] // e.add(e.eax, 0x00010203); // [((i&3)*4)+3, ((i&3)*4)+2, ((i&3)*4)+1, ((i&3)*4)+0] From a7c0c1327a6bc9127020b388aef84fa91f3e48c5 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sat, 31 May 2014 10:47:37 -0700 Subject: [PATCH 139/184] Latest xbyak with vcvtph2ps/ps2ph. --- third_party/xbyak | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/xbyak b/third_party/xbyak index 2d599b3bd..df27af3e4 160000 --- a/third_party/xbyak +++ b/third_party/xbyak @@ -1 +1 @@ -Subproject commit 2d599b3bd64a6d13c8b47a5f7410c67837bfff5d +Subproject commit df27af3e4e7ded756bfbb23d0f663df728442935 From e42460039f949a96136edf33a8dd22c2753307b3 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sat, 31 May 2014 11:23:10 -0700 Subject: [PATCH 140/184] Untested PACK float16_2/_4. --- src/alloy/backend/ivm/ivm_intcode.cc | 1 + src/alloy/backend/x64/x64_sequences.cc | 41 ++++++++++++++++++++++---- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/src/alloy/backend/ivm/ivm_intcode.cc b/src/alloy/backend/ivm/ivm_intcode.cc index 59646c067..0542a7277 100644 --- a/src/alloy/backend/ivm/ivm_intcode.cc +++ b/src/alloy/backend/ivm/ivm_intcode.cc @@ -3927,6 +3927,7 @@ uint32_t IntCode_PACK_FLOAT16_2(IntCodeState& ics, const IntCode* i) { uint32_t IntCode_PACK_FLOAT16_4(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; + dest.ix = dest.iy = 0; dest.iz = ((uint32_t)DirectX::PackedVector::XMConvertFloatToHalf(src1.x) << 16) | DirectX::PackedVector::XMConvertFloatToHalf(src1.y); diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index dccad4e37..8af3c5669 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -2976,7 +2976,12 @@ EMITTER(MUL_HI_I8, MATCH(I, I8<>, I8<>>)) { e.mulx(i.dest.reg().cvt32(), e.eax, i.src2.reg().cvt32()); } else { e.mov(e.al, i.src1); - e.imul(i.src2); + if (i.src2.is_constant) { + e.mov(e.al, i.src2.constant()); + e.imul(e.al); + } else { + e.imul(i.src2); + } e.mov(i.dest, e.ah); } e.ReloadEDX(); @@ -2990,7 +2995,12 @@ EMITTER(MUL_HI_I16, MATCH(I, I16<>, I16<>>)) { e.mulx(i.dest.reg().cvt32(), e.eax, i.src2.reg().cvt32()); } else { e.mov(e.ax, i.src1); - e.imul(i.src2); + if (i.src2.is_constant) { + e.mov(e.dx, i.src2.constant()); + e.imul(e.dx); + } else { + e.imul(i.src2); + } e.mov(i.dest, e.dx); } e.ReloadEDX(); @@ -3009,7 +3019,12 @@ EMITTER(MUL_HI_I32, MATCH(I, I32<>, I32<>>)) { } } else { e.mov(e.eax, i.src1); - e.imul(i.src2); + if (i.src2.is_constant) { + e.mov(e.edx, i.src2.constant()); + e.imul(e.edx); + } else { + e.imul(i.src2); + } e.mov(i.dest, e.edx); } e.ReloadEDX(); @@ -3028,7 +3043,12 @@ EMITTER(MUL_HI_I64, MATCH(I, I64<>, I64<>>)) { } } else { e.mov(e.rax, i.src1); - e.imul(i.src2); + if (i.src2.is_constant) { + e.mov(e.rdx, i.src2.constant()); + e.imul(e.rdx); + } else { + e.imul(i.src2); + } e.mov(i.dest, e.rdx); } e.ReloadEDX(); @@ -4781,10 +4801,19 @@ EMITTER(PACK, MATCH(I, V128<>>)) { e.vpshufb(i.dest, e.xmm0, e.GetXmmConstPtr(XMMPackD3DCOLOR)); } static void 
EmitFLOAT16_2(X64Emitter& e, const EmitArgType& i) { - XEASSERTALWAYS(); + // http://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx + // dest = [(src1.x | src1.y), 0, 0, 0] + e.db(0xCC); + e.vcvtps2ph(e.xmm0, i.src1, B00000011); + e.vxorps(i.dest, i.dest); + e.vpblendw(i.dest, e.xmm0, B00000011); } static void EmitFLOAT16_4(X64Emitter& e, const EmitArgType& i) { - XEASSERTALWAYS(); + // dest = [(src1.x | src1.y), (src1.z | src1.w), 0, 0] + e.db(0xCC); + e.vcvtps2ph(e.xmm0, i.src1, B00000011); + e.vxorps(i.dest, i.dest); + e.vpblendw(i.dest, e.xmm0, B00001111); } static void EmitSHORT_2(X64Emitter& e, const EmitArgType& i) { XEASSERTALWAYS(); From 6607606b15c48d5c2c863527d9ad3f7f060be180 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sat, 31 May 2014 15:42:21 -0700 Subject: [PATCH 141/184] Moving texture code into shared/separate files. Leaking a lot. --- src/xenia/gpu/d3d11/d3d11_graphics_driver.cc | 550 +------------------ src/xenia/gpu/d3d11/d3d11_graphics_driver.h | 25 +- src/xenia/gpu/d3d11/d3d11_texture.cc | 255 +++++++++ src/xenia/gpu/d3d11/d3d11_texture.h | 69 +++ src/xenia/gpu/d3d11/d3d11_texture_cache.cc | 113 ++++ src/xenia/gpu/d3d11/d3d11_texture_cache.h | 55 ++ src/xenia/gpu/d3d11/sources.gypi | 4 + src/xenia/gpu/sources.gypi | 4 + src/xenia/gpu/texture.cc | 264 +++++++++ src/xenia/gpu/texture.h | 74 +++ src/xenia/gpu/texture_cache.cc | 45 ++ src/xenia/gpu/texture_cache.h | 49 ++ 12 files changed, 961 insertions(+), 546 deletions(-) create mode 100644 src/xenia/gpu/d3d11/d3d11_texture.cc create mode 100644 src/xenia/gpu/d3d11/d3d11_texture.h create mode 100644 src/xenia/gpu/d3d11/d3d11_texture_cache.cc create mode 100644 src/xenia/gpu/d3d11/d3d11_texture_cache.h create mode 100644 src/xenia/gpu/texture.cc create mode 100644 src/xenia/gpu/texture.h create mode 100644 src/xenia/gpu/texture_cache.cc create mode 100644 src/xenia/gpu/texture_cache.h diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc b/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc index fd7377693..6c466ccac 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc +++ b/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc @@ -13,6 +13,8 @@ #include #include #include +#include +#include using namespace xe; using namespace xe::gpu; @@ -32,6 +34,7 @@ D3D11GraphicsDriver::D3D11GraphicsDriver( device_->AddRef(); device_->GetImmediateContext(&context_); shader_cache_ = new D3D11ShaderCache(device_); + texture_cache_ = new D3D11TextureCache(memory_, context_, device_); xe_zero_struct(&state_, sizeof(state_)); @@ -126,9 +129,6 @@ D3D11GraphicsDriver::D3D11GraphicsDriver( D3D11GraphicsDriver::~D3D11GraphicsDriver() { RebuildRenderTargets(0, 0); - for (size_t n = 0; n < XECOUNT(state_.texture_fetchers); n++) { - XESAFERELEASE(state_.texture_fetchers[n].view); - } XESAFERELEASE(state_.constant_buffers.float_constants); XESAFERELEASE(state_.constant_buffers.bool_constants); XESAFERELEASE(state_.constant_buffers.loop_constants); @@ -136,6 +136,7 @@ D3D11GraphicsDriver::~D3D11GraphicsDriver() { XESAFERELEASE(state_.constant_buffers.gs_consts); XESAFERELEASE(invalid_texture_view_); XESAFERELEASE(invalid_texture_sampler_state_); + delete texture_cache_; delete shader_cache_; XESAFERELEASE(context_); XESAFERELEASE(device_); @@ -1034,9 +1035,8 @@ int D3D11GraphicsDriver::PrepareTextureFetchers() { for (int n = 0; n < XECOUNT(state_.texture_fetchers); n++) { auto& fetcher = state_.texture_fetchers[n]; - // TODO(benvanik): caching. + // TODO(benvanik): quick validate without refetching. 
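+ // (Editor's sketch of one possible shape, hypothetical: once the fetch
+ //  constant has been read below, compare it against the copy stashed last
+ //  time and skip the cache lookup when unchanged, e.g.
+ //    if (!memcmp(&fetcher.fetch, &fetch, sizeof(fetch))) continue;)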
fetcher.enabled = false; - XESAFERELEASE(fetcher.view); fetcher.view = NULL; int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + n * 6; @@ -1049,468 +1049,34 @@ int D3D11GraphicsDriver::PrepareTextureFetchers() { // Stash a copy of the fetch register. fetcher.fetch = fetch; - fetcher.info = GetTextureInfo(fetch); - if (fetcher.info.format == DXGI_FORMAT_UNKNOWN) { + // Fetch texture from the cache. + uint32_t address = (fetch.address << 12) + address_translation_; + auto texture_view = texture_cache_->FetchTexture(address, fetch); + if (!texture_view) { + XELOGW("D3D11: unable to fetch texture at %.8X", address); + continue; + } + if (texture_view->format == DXGI_FORMAT_UNKNOWN) { XELOGW("D3D11: unknown texture format %d", fetch.format); continue; } + fetcher.view = static_cast(texture_view); - D3D11_SHADER_RESOURCE_VIEW_DESC texture_view_desc; - xe_zero_struct(&texture_view_desc, sizeof(texture_view_desc)); - // TODO(benvanik): this may need to be typed on the fetch instruction (float/int/etc?) - texture_view_desc.Format = fetcher.info.format; - - ID3D11Resource* texture = NULL; - D3D_SRV_DIMENSION dimension = D3D11_SRV_DIMENSION_UNKNOWN; - switch (fetch.dimension) { - case DIMENSION_1D: - texture_view_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE1D; - texture_view_desc.Texture1D.MipLevels = 1; - texture_view_desc.Texture1D.MostDetailedMip = 0; - if (FetchTexture1D(fetch, fetcher.info, &texture)) { - XELOGE("D3D11: failed to fetch Texture1D"); - return 1; - } - break; - case DIMENSION_2D: - texture_view_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D; - texture_view_desc.Texture2D.MipLevels = 1; - texture_view_desc.Texture2D.MostDetailedMip = 0; - if (FetchTexture2D(fetch, fetcher.info, &texture)) { - XELOGE("D3D11: failed to fetch Texture2D"); - return 1; - } - break; - case DIMENSION_3D: - texture_view_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE3D; - texture_view_desc.Texture3D.MipLevels = 1; - texture_view_desc.Texture3D.MostDetailedMip = 0; - if (FetchTexture3D(fetch, fetcher.info, &texture)) { - XELOGE("D3D11: failed to fetch Texture3D"); - return 1; - } - break; - case DIMENSION_CUBE: - texture_view_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURECUBE; - texture_view_desc.TextureCube.MipLevels = 1; - texture_view_desc.TextureCube.MostDetailedMip = 0; - if (FetchTextureCube(fetch, fetcher.info, &texture)) { - XELOGE("D3D11: failed to fetch TextureCube"); - return 1; - } - break; - } - - XEASSERTNOTNULL(texture); - - ID3D11ShaderResourceView* texture_view = NULL; - HRESULT hr = device_->CreateShaderResourceView( - texture, &texture_view_desc, &texture_view); - if (FAILED(hr)) { - XELOGE("D3D11: unable to create texture resource view"); - texture->Release(); - return 1; - } - texture->Release(); - + // Only enable if we get all the way down here successfully. 
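+ // (Editor's note: every continue above leaves enabled == false, which makes
+ //  PrepareTextureSampler() bind invalid_texture_view_ /
+ //  invalid_texture_sampler_state_ instead of a stale SRV.)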
fetcher.enabled = true; - fetcher.view = texture_view; } return 0; } -// http://msdn.microsoft.com/en-us/library/windows/desktop/cc308051(v=vs.85).aspx -D3D11GraphicsDriver::TextureInfo D3D11GraphicsDriver::GetTextureInfo( - xe_gpu_texture_fetch_t& fetch) { - // a2xx_sq_surfaceformat - TextureInfo info; - info.format = DXGI_FORMAT_UNKNOWN; - info.block_size = 0; - info.texel_pitch = 0; - info.is_compressed = false; - switch (fetch.format) { - case FMT_8: - switch (fetch.swizzle) { - case XE_GPU_SWIZZLE_RRR1: - info.format = DXGI_FORMAT_R8_UNORM; - break; - case XE_GPU_SWIZZLE_000R: - info.format = DXGI_FORMAT_A8_UNORM; - break; - default: - XELOGW("D3D11: unhandled swizzle for FMT_8"); - info.format = DXGI_FORMAT_A8_UNORM; - break; - } - info.block_size = 1; - info.texel_pitch = 1; - break; - case FMT_1_5_5_5: - switch (fetch.swizzle) { - case XE_GPU_SWIZZLE_BGRA: - info.format = DXGI_FORMAT_B5G5R5A1_UNORM; - break; - default: - XELOGW("D3D11: unhandled swizzle for FMT_1_5_5_5"); - info.format = DXGI_FORMAT_B5G5R5A1_UNORM; - break; - } - info.block_size = 1; - info.texel_pitch = 2; - break; - case FMT_8_8_8_8: - switch (fetch.swizzle) { - case XE_GPU_SWIZZLE_RGBA: - info.format = DXGI_FORMAT_R8G8B8A8_UNORM; - break; - case XE_GPU_SWIZZLE_BGRA: - info.format = DXGI_FORMAT_B8G8R8A8_UNORM; - break; - case XE_GPU_SWIZZLE_RGB1: - info.format = DXGI_FORMAT_R8G8B8A8_UNORM; // ? - break; - case XE_GPU_SWIZZLE_BGR1: - info.format = DXGI_FORMAT_B8G8R8X8_UNORM; - break; - default: - XELOGW("D3D11: unhandled swizzle for FMT_8_8_8_8"); - info.format = DXGI_FORMAT_R8G8B8A8_UNORM; - break; - } - info.block_size = 1; - info.texel_pitch = 4; - break; - case FMT_4_4_4_4: - switch (fetch.swizzle) { - case XE_GPU_SWIZZLE_BGRA: - info.format = DXGI_FORMAT_B4G4R4A4_UNORM; // only supported on Windows 8+ - break; - default: - XELOGW("D3D11: unhandled swizzle for FMT_4_4_4_4"); - info.format = DXGI_FORMAT_B4G4R4A4_UNORM; // only supported on Windows 8+ - break; - } - info.block_size = 1; - info.texel_pitch = 2; - break; - case FMT_16_16_16_16_FLOAT: - switch (fetch.swizzle) { - case XE_GPU_SWIZZLE_RGBA: - info.format = DXGI_FORMAT_R16G16B16A16_FLOAT; - break; - default: - XELOGW("D3D11: unhandled swizzle for FMT_16_16_16_16_FLOAT"); - info.format = DXGI_FORMAT_R16G16B16A16_FLOAT; - break; - } - info.block_size = 1; - info.texel_pitch = 8; - break; - case FMT_32_FLOAT: - switch (fetch.swizzle) { - case XE_GPU_SWIZZLE_R111: - info.format = DXGI_FORMAT_R32_FLOAT; - break; - default: - XELOGW("D3D11: unhandled swizzle for FMT_32_FLOAT"); - info.format = DXGI_FORMAT_R32_FLOAT; - break; - } - info.block_size = 1; - info.texel_pitch = 4; - break; - case FMT_DXT1: - info.format = DXGI_FORMAT_BC1_UNORM; - info.block_size = 4; - info.texel_pitch = 8; - info.is_compressed = true; - break; - case FMT_DXT2_3: - case FMT_DXT4_5: - info.format = (fetch.format == FMT_DXT4_5 ? 
DXGI_FORMAT_BC3_UNORM : DXGI_FORMAT_BC2_UNORM); - info.block_size = 4; - info.texel_pitch = 16; - info.is_compressed = true; - break; - case FMT_1_REVERSE: - case FMT_1: - case FMT_5_6_5: - case FMT_6_5_5: - case FMT_2_10_10_10: - case FMT_8_A: - case FMT_8_B: - case FMT_8_8: - case FMT_Cr_Y1_Cb_Y0: - case FMT_Y1_Cr_Y0_Cb: - case FMT_5_5_5_1: - case FMT_8_8_8_8_A: - case FMT_10_11_11: - case FMT_11_11_10: - case FMT_24_8: - case FMT_24_8_FLOAT: - case FMT_16: - case FMT_16_16: - case FMT_16_16_16_16: - case FMT_16_EXPAND: - case FMT_16_16_EXPAND: - case FMT_16_16_16_16_EXPAND: - case FMT_16_FLOAT: - case FMT_16_16_FLOAT: - case FMT_32: - case FMT_32_32: - case FMT_32_32_32_32: - case FMT_32_32_FLOAT: - case FMT_32_32_32_32_FLOAT: - case FMT_32_AS_8: - case FMT_32_AS_8_8: - case FMT_16_MPEG: - case FMT_16_16_MPEG: - case FMT_8_INTERLACED: - case FMT_32_AS_8_INTERLACED: - case FMT_32_AS_8_8_INTERLACED: - case FMT_16_INTERLACED: - case FMT_16_MPEG_INTERLACED: - case FMT_16_16_MPEG_INTERLACED: - case FMT_DXN: - case FMT_8_8_8_8_AS_16_16_16_16: - case FMT_DXT1_AS_16_16_16_16: - case FMT_DXT2_3_AS_16_16_16_16: - case FMT_DXT4_5_AS_16_16_16_16: - case FMT_2_10_10_10_AS_16_16_16_16: - case FMT_10_11_11_AS_16_16_16_16: - case FMT_11_11_10_AS_16_16_16_16: - case FMT_32_32_32_FLOAT: - case FMT_DXT3A: - case FMT_DXT5A: - case FMT_CTX1: - case FMT_DXT3A_AS_1_1_1_1: - info.format = DXGI_FORMAT_UNKNOWN; - break; - } - return info; -} - -int D3D11GraphicsDriver::FetchTexture1D( - xe_gpu_texture_fetch_t& fetch, - TextureInfo& info, - ID3D11Resource** out_texture) { - SCOPE_profile_cpu_f("gpu"); - - uint32_t address = (fetch.address << 12) + address_translation_; - - uint32_t width = 1 + fetch.size_1d.width; - - D3D11_TEXTURE1D_DESC texture_desc; - xe_zero_struct(&texture_desc, sizeof(texture_desc)); - texture_desc.Width = width; - texture_desc.MipLevels = 1; - texture_desc.ArraySize = 1; - texture_desc.Format = info.format; - texture_desc.Usage = D3D11_USAGE_DYNAMIC; - texture_desc.BindFlags = D3D11_BIND_SHADER_RESOURCE; - texture_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; - texture_desc.MiscFlags = 0; // D3D11_RESOURCE_MISC_GENERATE_MIPS? - HRESULT hr = device_->CreateTexture1D( - &texture_desc, NULL, (ID3D11Texture1D**)out_texture); - if (FAILED(hr)) { - return 1; - } - - return 0; -} - -XEFORCEINLINE void TextureSwap(uint8_t* dest, const uint8_t* src, uint32_t pitch, XE_GPU_ENDIAN endianness) { - switch (endianness) { - case XE_GPU_ENDIAN_8IN16: - for (uint32_t i = 0; i < pitch; i += 2, src += 2, dest += 2) { - *(uint16_t*)dest = XESWAP16(*(uint16_t*)src); - } - break; - case XE_GPU_ENDIAN_8IN32: // Swap bytes. - for (uint32_t i = 0; i < pitch; i += 4, src += 4, dest += 4) { - *(uint32_t*)dest = XESWAP32(*(uint32_t*)src); - } - break; - case XE_GPU_ENDIAN_16IN32: // Swap half words. 
- for (uint32_t i = 0; i < pitch; i += 4, src += 4, dest += 4) { - uint32_t value = *(uint32_t*)src; - *(uint32_t*)dest = ((value >> 16) & 0xFFFF) | (value << 16); - } - break; - default: - case XE_GPU_ENDIAN_NONE: - memcpy(dest, src, pitch); - break; - } -} - -// https://code.google.com/p/crunch/source/browse/trunk/inc/crn_decomp.h#4104 -XEFORCEINLINE uint32_t TiledOffset2DOuter(uint32_t y, uint32_t width, uint32_t log_bpp) -{ - uint32_t macro = ((y >> 5) * (width >> 5)) << (log_bpp + 7); - uint32_t micro = ((y & 6) << 2) << log_bpp; - return macro + ((micro & ~15) << 1) + (micro & 15) + ((y & 8) << (3 + log_bpp)) + ((y & 1) << 4); -} - -XEFORCEINLINE uint32_t TiledOffset2DInner(uint32_t x, uint32_t y, uint32_t bpp, uint32_t base_offset) -{ - uint32_t macro = (x >> 5) << (bpp + 7); - uint32_t micro = (x & 7) << bpp; - uint32_t offset = base_offset + (macro + ((micro & ~15) << 1) + (micro & 15)); - return ((offset & ~511) << 3) + ((offset & 448) << 2) + (offset & 63) + - ((y & 16) << 7) + (((((y & 8) >> 2) + (x >> 3)) & 3) << 6); -} - -int D3D11GraphicsDriver::FetchTexture2D( - xe_gpu_texture_fetch_t& fetch, - TextureInfo& info, - ID3D11Resource** out_texture) { - SCOPE_profile_cpu_f("gpu"); - - XEASSERTTRUE(fetch.dimension == 1); - - uint32_t address = (fetch.address << 12) + address_translation_; - - uint32_t logical_width = 1 + fetch.size_2d.width; - uint32_t logical_height = 1 + fetch.size_2d.height; - - uint32_t block_width = logical_width / info.block_size; - uint32_t block_height = logical_height / info.block_size; - - uint32_t input_width, input_height; - uint32_t output_width, output_height; - - if (!info.is_compressed) { - // must be 32x32, but also must have a pitch that is a multiple of 256 bytes - uint32_t bytes_per_block = info.block_size * info.block_size * info.texel_pitch; - uint32_t width_multiple = 32; - if (bytes_per_block) { - uint32_t minimum_multiple = 256 / bytes_per_block; - if (width_multiple < minimum_multiple) { - width_multiple = minimum_multiple; - } - } - - input_width = XEROUNDUP(logical_width, width_multiple); - input_height = XEROUNDUP(logical_height, 32); - output_width = logical_width; - output_height = logical_height; - } - else { - // must be 128x128 - input_width = XEROUNDUP(logical_width, 128); - input_height = XEROUNDUP(logical_height, 128); - output_width = XENEXTPOW2(logical_width); - output_height = XENEXTPOW2(logical_height); - } - - D3D11_TEXTURE2D_DESC texture_desc; - xe_zero_struct(&texture_desc, sizeof(texture_desc)); - texture_desc.Width = output_width; - texture_desc.Height = output_height; - texture_desc.MipLevels = 1; - texture_desc.ArraySize = 1; - texture_desc.Format = info.format; - texture_desc.SampleDesc.Count = 1; - texture_desc.SampleDesc.Quality = 0; - texture_desc.Usage = D3D11_USAGE_DYNAMIC; - texture_desc.BindFlags = D3D11_BIND_SHADER_RESOURCE; - texture_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; - texture_desc.MiscFlags = 0; // D3D11_RESOURCE_MISC_GENERATE_MIPS? - HRESULT hr = device_->CreateTexture2D( - &texture_desc, NULL, (ID3D11Texture2D**)out_texture); - if (FAILED(hr)) { - return 1; - } - - // TODO(benvanik): all mip levels. 
- D3D11_MAPPED_SUBRESOURCE res; - hr = context_->Map(*out_texture, 0, - D3D11_MAP_WRITE_DISCARD, 0, &res); - if (FAILED(hr)) { - XELOGE("D3D11: failed to map texture"); - return 1; - } - - auto logical_pitch = (logical_width / info.block_size) * info.texel_pitch; - auto input_pitch = (input_width / info.block_size) * info.texel_pitch; - auto output_pitch = res.RowPitch; // (output_width / info.block_size) * info.texel_pitch; - - const uint8_t* src = memory_->Translate(address); - uint8_t* dest = (uint8_t*)res.pData; - - memset(dest, 0, output_pitch * (output_height / info.block_size)); // TODO(gibbed): remove me later - - if (!fetch.tiled) { - dest = (uint8_t*)res.pData; - for (uint32_t y = 0; y < block_height; y++) { - for (uint32_t x = 0; x < logical_pitch; x += info.texel_pitch) { - TextureSwap(dest + x, src + x, info.texel_pitch, (XE_GPU_ENDIAN)fetch.endianness); - } - src += input_pitch; - dest += output_pitch; - } - } - else { - auto bpp = (info.texel_pitch >> 2) + ((info.texel_pitch >> 1) >> (info.texel_pitch >> 2)); - for (uint32_t y = 0, output_base_offset = 0; y < block_height; y++, output_base_offset += output_pitch) { - auto input_base_offset = TiledOffset2DOuter(y, (input_width / info.block_size), bpp); - for (uint32_t x = 0, output_offset = output_base_offset; x < block_width; x++, output_offset += info.texel_pitch) { - auto input_offset = TiledOffset2DInner(x, y, bpp, input_base_offset) >> bpp; - TextureSwap(dest + output_offset, - src + input_offset * info.texel_pitch, - info.texel_pitch, (XE_GPU_ENDIAN)fetch.endianness); - } - } - } - context_->Unmap(*out_texture, 0); - return 0; -} - -int D3D11GraphicsDriver::FetchTexture3D( - xe_gpu_texture_fetch_t& fetch, - TextureInfo& info, - ID3D11Resource** out_texture) { - SCOPE_profile_cpu_f("gpu"); - - XELOGE("D3D11: FetchTexture2D not yet implemented"); - XEASSERTALWAYS(); - return 1; - //D3D11_TEXTURE3D_DESC texture_desc; - //xe_zero_struct(&texture_desc, sizeof(texture_desc)); - //texture_desc.Width; - //texture_desc.Height; - //texture_desc.Depth; - //texture_desc.MipLevels; - //texture_desc.Format; - //texture_desc.Usage; - //texture_desc.BindFlags; - //texture_desc.CPUAccessFlags; - //texture_desc.MiscFlags; - //hr = device_->CreateTexture3D( - // &texture_desc, &initial_data, (ID3D11Texture3D**)&texture); -} - -int D3D11GraphicsDriver::FetchTextureCube( - xe_gpu_texture_fetch_t& fetch, - TextureInfo& info, - ID3D11Resource** out_texture) { - SCOPE_profile_cpu_f("gpu"); - - XELOGE("D3D11: FetchTextureCube not yet implemented"); - XEASSERTALWAYS(); - return 1; -} - int D3D11GraphicsDriver::PrepareTextureSampler( xenos::XE_GPU_SHADER_TYPE shader_type, Shader::tex_buffer_desc_t& desc) { SCOPE_profile_cpu_f("gpu"); + // If the fetcher is disabled or invalid, set some default textures. auto& fetcher = state_.texture_fetchers[desc.fetch_slot]; - auto& info = fetcher.info; if (!fetcher.enabled || - info.format == DXGI_FORMAT_UNKNOWN) { + fetcher.view->format == DXGI_FORMAT_UNKNOWN) { XELOGW("D3D11: ignoring texture fetch: disabled or an unknown format"); if (shader_type == XE_GPU_SHADER_TYPE_VERTEX) { context_->VSSetShaderResources(desc.input_index, @@ -1526,82 +1092,16 @@ int D3D11GraphicsDriver::PrepareTextureSampler( return 0; } - HRESULT hr; - + // Get and set the real shader resource views/samplers. 
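+ // (Editor's note: D3D11 keeps separate SRV/sampler slot namespaces per
+ //  shader stage, hence the VS/PS split below for the same fetch slot.)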
if (shader_type == XE_GPU_SHADER_TYPE_VERTEX) { - context_->VSSetShaderResources(desc.input_index, 1, &fetcher.view); + context_->VSSetShaderResources(desc.input_index, 1, &fetcher.view->srv); } else { - context_->PSSetShaderResources(desc.input_index, 1, &fetcher.view); + context_->PSSetShaderResources(desc.input_index, 1, &fetcher.view->srv); } - - D3D11_SAMPLER_DESC sampler_desc; - xe_zero_struct(&sampler_desc, sizeof(sampler_desc)); - uint32_t min_filter = desc.tex_fetch.min_filter == 3 ? - fetcher.fetch.min_filter : desc.tex_fetch.min_filter; - uint32_t mag_filter = desc.tex_fetch.mag_filter == 3 ? - fetcher.fetch.mag_filter : desc.tex_fetch.mag_filter; - uint32_t mip_filter = desc.tex_fetch.mip_filter == 3 ? - fetcher.fetch.mip_filter : desc.tex_fetch.mip_filter; - // MIN, MAG, MIP - static const D3D11_FILTER filter_matrix[2][2][3] = { - { - // min = POINT - { - // mag = POINT - D3D11_FILTER_MIN_MAG_MIP_POINT, - D3D11_FILTER_MIN_MAG_POINT_MIP_LINEAR, - D3D11_FILTER_MIN_MAG_POINT_MIP_LINEAR, // basemap? - }, - { - // mag = LINEAR - D3D11_FILTER_MIN_POINT_MAG_LINEAR_MIP_POINT, - D3D11_FILTER_MIN_POINT_MAG_MIP_LINEAR, - D3D11_FILTER_MIN_POINT_MAG_MIP_LINEAR, // basemap? - }, - }, - { - // min = LINEAR - { - // mag = POINT - D3D11_FILTER_MIN_LINEAR_MAG_MIP_POINT, - D3D11_FILTER_MIN_LINEAR_MAG_POINT_MIP_LINEAR, - D3D11_FILTER_MIN_LINEAR_MAG_POINT_MIP_LINEAR, // basemap? - }, - { - // mag = LINEAR - D3D11_FILTER_MIN_MAG_LINEAR_MIP_POINT, - D3D11_FILTER_MIN_MAG_MIP_LINEAR, - D3D11_FILTER_MIN_MAG_MIP_LINEAR, // basemap? - }, - }, - }; - sampler_desc.Filter = filter_matrix[min_filter][mag_filter][mip_filter]; - static const D3D11_TEXTURE_ADDRESS_MODE mode_map[] = { - D3D11_TEXTURE_ADDRESS_WRAP, - D3D11_TEXTURE_ADDRESS_MIRROR, - D3D11_TEXTURE_ADDRESS_CLAMP, // ? - D3D11_TEXTURE_ADDRESS_MIRROR_ONCE, // ? - D3D11_TEXTURE_ADDRESS_CLAMP, // ? - D3D11_TEXTURE_ADDRESS_MIRROR_ONCE, // ? - D3D11_TEXTURE_ADDRESS_BORDER, // ? - D3D11_TEXTURE_ADDRESS_MIRROR, // ? 
- }; - sampler_desc.AddressU = mode_map[fetcher.fetch.clamp_x]; - sampler_desc.AddressV = mode_map[fetcher.fetch.clamp_y]; - sampler_desc.AddressW = mode_map[fetcher.fetch.clamp_z]; - sampler_desc.MipLODBias; - sampler_desc.MaxAnisotropy = 1; - sampler_desc.ComparisonFunc = D3D11_COMPARISON_ALWAYS; - sampler_desc.BorderColor[0]; - sampler_desc.BorderColor[1]; - sampler_desc.BorderColor[2]; - sampler_desc.BorderColor[3]; - sampler_desc.MinLOD; - sampler_desc.MaxLOD; - ID3D11SamplerState* sampler_state = NULL; - hr = device_->CreateSamplerState(&sampler_desc, &sampler_state); - if (FAILED(hr)) { - XELOGE("D3D11: unable to create sampler state"); + ID3D11SamplerState* sampler_state = texture_cache_->GetSamplerState( + fetcher.fetch, desc); + if (!sampler_state) { + XELOGW("D3D11: failed to set sampler state; ignoring texture"); return 1; } if (shader_type == XE_GPU_SHADER_TYPE_VERTEX) { @@ -1609,7 +1109,7 @@ int D3D11GraphicsDriver::PrepareTextureSampler( } else { context_->PSSetSamplers(desc.input_index, 1, &sampler_state); } - sampler_state->Release(); + XESAFERELEASE(sampler_state); return 0; } diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_driver.h b/src/xenia/gpu/d3d11/d3d11_graphics_driver.h index 94ccfe748..5a289d255 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_driver.h +++ b/src/xenia/gpu/d3d11/d3d11_graphics_driver.h @@ -26,6 +26,8 @@ namespace d3d11 { class D3D11PixelShader; class D3D11ShaderCache; +class D3D11TextureCache; +struct D3D11TextureView; class D3D11VertexShader; @@ -66,25 +68,6 @@ private: int PrepareTextureFetchers(); int PrepareTextureSampler(xenos::XE_GPU_SHADER_TYPE shader_type, Shader::tex_buffer_desc_t& desc); - typedef struct { - DXGI_FORMAT format; - uint32_t block_size; - uint32_t texel_pitch; - bool is_compressed; - } TextureInfo; - TextureInfo GetTextureInfo(xenos::xe_gpu_texture_fetch_t& fetch); - int FetchTexture1D(xenos::xe_gpu_texture_fetch_t& fetch, - TextureInfo& info, - ID3D11Resource** out_texture); - int FetchTexture2D(xenos::xe_gpu_texture_fetch_t& fetch, - TextureInfo& info, - ID3D11Resource** out_texture); - int FetchTexture3D(xenos::xe_gpu_texture_fetch_t& fetch, - TextureInfo& info, - ID3D11Resource** out_texture); - int FetchTextureCube(xenos::xe_gpu_texture_fetch_t& fetch, - TextureInfo& info, - ID3D11Resource** out_texture); int PrepareIndexBuffer( bool index_32bit, uint32_t index_count, uint32_t index_base, uint32_t index_size, uint32_t endianness); @@ -94,6 +77,7 @@ private: ID3D11Device* device_; ID3D11DeviceContext* context_; D3D11ShaderCache* shader_cache_; + D3D11TextureCache* texture_cache_; ID3D11ShaderResourceView* invalid_texture_view_; ID3D11SamplerState* invalid_texture_sampler_state_; @@ -125,8 +109,7 @@ private: struct { bool enabled; xenos::xe_gpu_texture_fetch_t fetch; - TextureInfo info; - ID3D11ShaderResourceView* view; + D3D11TextureView* view; } texture_fetchers[32]; } state_; diff --git a/src/xenia/gpu/d3d11/d3d11_texture.cc b/src/xenia/gpu/d3d11/d3d11_texture.cc new file mode 100644 index 000000000..b4beb2ca4 --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_texture.cc @@ -0,0 +1,255 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include + +#include +#include +#include +#include + + +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::d3d11; +using namespace xe::gpu::xenos; + + +D3D11Texture::D3D11Texture(D3D11TextureCache* cache, uint32_t address) + : Texture(address), + cache_(cache) { +} + +D3D11Texture::~D3D11Texture() { + // views +} + +TextureView* D3D11Texture::Fetch( + const xenos::xe_gpu_texture_fetch_t& fetch) { + D3D11TextureView* view = new D3D11TextureView(); + view->texture = this; + if (!FillViewInfo(view, fetch)) { + return nullptr; + } + + D3D11_SHADER_RESOURCE_VIEW_DESC srv_desc; + xe_zero_struct(&srv_desc, sizeof(srv_desc)); + // TODO(benvanik): this may need to be typed on the fetch instruction (float/int/etc?) + srv_desc.Format = view->format; + + D3D_SRV_DIMENSION dimension = D3D11_SRV_DIMENSION_UNKNOWN; + switch (view->dimensions) { + case DIMENSION_1D: + srv_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE1D; + srv_desc.Texture1D.MipLevels = 1; + srv_desc.Texture1D.MostDetailedMip = 0; + if (!FetchTexture1D(view, fetch)) { + XELOGE("D3D11: failed to fetch Texture1D"); + return nullptr; + } + break; + case DIMENSION_2D: + srv_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D; + srv_desc.Texture2D.MipLevels = 1; + srv_desc.Texture2D.MostDetailedMip = 0; + if (!FetchTexture2D(view, fetch)) { + XELOGE("D3D11: failed to fetch Texture2D"); + return nullptr; + } + break; + case DIMENSION_3D: + srv_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE3D; + srv_desc.Texture3D.MipLevels = 1; + srv_desc.Texture3D.MostDetailedMip = 0; + if (!FetchTexture3D(view, fetch)) { + XELOGE("D3D11: failed to fetch Texture3D"); + return nullptr; + } + break; + case DIMENSION_CUBE: + srv_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURECUBE; + srv_desc.TextureCube.MipLevels = 1; + srv_desc.TextureCube.MostDetailedMip = 0; + if (!FetchTextureCube(view, fetch)) { + XELOGE("D3D11: failed to fetch TextureCube"); + return nullptr; + } + break; + } + + HRESULT hr = cache_->device()->CreateShaderResourceView( + view->resource, &srv_desc, &view->srv); + if (FAILED(hr)) { + XELOGE("D3D11: unable to create texture resource view"); + return nullptr; + } + + return view; +} + +bool D3D11Texture::FetchTexture1D( + D3D11TextureView* view, const xe_gpu_texture_fetch_t& fetch) { + SCOPE_profile_cpu_f("gpu"); + + uint32_t width = 1 + fetch.size_1d.width; + + D3D11_TEXTURE1D_DESC texture_desc; + xe_zero_struct(&texture_desc, sizeof(texture_desc)); + texture_desc.Width = width; + texture_desc.MipLevels = 1; + texture_desc.ArraySize = 1; + texture_desc.Format = view->format; + texture_desc.Usage = D3D11_USAGE_DYNAMIC; + texture_desc.BindFlags = D3D11_BIND_SHADER_RESOURCE; + texture_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; + texture_desc.MiscFlags = 0; // D3D11_RESOURCE_MISC_GENERATE_MIPS? + HRESULT hr = cache_->device()->CreateTexture1D( + &texture_desc, NULL, (ID3D11Texture1D**)&view->resource); + if (FAILED(hr)) { + return false; + } + + // TODO(benvanik): upload! 
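+  // A possible upload sketch for when this is implemented, mirroring the
+  // 2D path below (untested assumption: a linear, untiled source and a
+  // single mip level):
+  //   D3D11_MAPPED_SUBRESOURCE res;
+  //   hr = cache_->context()->Map(view->resource, 0,
+  //                               D3D11_MAP_WRITE_DISCARD, 0, &res);
+  //   if (SUCCEEDED(hr)) {
+  //     const uint8_t* src = cache_->memory()->Translate(address_);
+  //     TextureSwap((uint8_t*)res.pData, src, width * view->texel_pitch,
+  //                 (XE_GPU_ENDIAN)fetch.endianness);
+  //     cache_->context()->Unmap(view->resource, 0);
+  //   }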
+ XELOGE("D3D11: FetchTexture1D not yet implemented"); + return false; +} + +bool D3D11Texture::FetchTexture2D( + D3D11TextureView* view, const xe_gpu_texture_fetch_t& fetch) { + SCOPE_profile_cpu_f("gpu"); + + XEASSERTTRUE(fetch.dimension == 1); + + uint32_t logical_width = 1 + fetch.size_2d.width; + uint32_t logical_height = 1 + fetch.size_2d.height; + + uint32_t block_width = logical_width / view->block_size; + uint32_t block_height = logical_height / view->block_size; + + uint32_t input_width, input_height; + uint32_t output_width, output_height; + + if (!view->is_compressed) { + // must be 32x32, but also must have a pitch that is a multiple of 256 bytes + uint32_t bytes_per_block = view->block_size * view->block_size * + view->texel_pitch; + uint32_t width_multiple = 32; + if (bytes_per_block) { + uint32_t minimum_multiple = 256 / bytes_per_block; + if (width_multiple < minimum_multiple) { + width_multiple = minimum_multiple; + } + } + + input_width = XEROUNDUP(logical_width, width_multiple); + input_height = XEROUNDUP(logical_height, 32); + output_width = logical_width; + output_height = logical_height; + } + else { + // must be 128x128 + input_width = XEROUNDUP(logical_width, 128); + input_height = XEROUNDUP(logical_height, 128); + output_width = XENEXTPOW2(logical_width); + output_height = XENEXTPOW2(logical_height); + } + + D3D11_TEXTURE2D_DESC texture_desc; + xe_zero_struct(&texture_desc, sizeof(texture_desc)); + texture_desc.Width = output_width; + texture_desc.Height = output_height; + texture_desc.MipLevels = 1; + texture_desc.ArraySize = 1; + texture_desc.Format = view->format; + texture_desc.SampleDesc.Count = 1; + texture_desc.SampleDesc.Quality = 0; + texture_desc.Usage = D3D11_USAGE_DYNAMIC; + texture_desc.BindFlags = D3D11_BIND_SHADER_RESOURCE; + texture_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; + texture_desc.MiscFlags = 0; // D3D11_RESOURCE_MISC_GENERATE_MIPS? + HRESULT hr = cache_->device()->CreateTexture2D( + &texture_desc, NULL, (ID3D11Texture2D**)&view->resource); + if (FAILED(hr)) { + return false; + } + + // TODO(benvanik): all mip levels. 
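+  // Worked example of the pitch math used below, for a 64x64 DXT1 texture
+  // (block_size = 4, texel_pitch = 8 bytes per 4x4 block):
+  //   block_height  = 64 / 4 = 16 block rows to copy,
+  //   logical_pitch = (64 / 4) * 8 = 128 source bytes per block row.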
+ D3D11_MAPPED_SUBRESOURCE res; + hr = cache_->context()->Map(view->resource, 0, + D3D11_MAP_WRITE_DISCARD, 0, &res); + if (FAILED(hr)) { + XELOGE("D3D11: failed to map texture"); + return false; + } + + auto logical_pitch = (logical_width / view->block_size) * view->texel_pitch; + auto input_pitch = (input_width / view->block_size) * view->texel_pitch; + auto output_pitch = res.RowPitch; // (output_width / info.block_size) * info.texel_pitch; + + const uint8_t* src = cache_->memory()->Translate(address_); + uint8_t* dest = (uint8_t*)res.pData; + + //memset(dest, 0, output_pitch * (output_height / view->block_size)); // TODO(gibbed): remove me later + + if (!fetch.tiled) { + dest = (uint8_t*)res.pData; + for (uint32_t y = 0; y < block_height; y++) { + for (uint32_t x = 0; x < logical_pitch; x += view->texel_pitch) { + TextureSwap(dest + x, src + x, view->texel_pitch, (XE_GPU_ENDIAN)fetch.endianness); + } + src += input_pitch; + dest += output_pitch; + } + } + else { + auto bpp = (view->texel_pitch >> 2) + ((view->texel_pitch >> 1) >> (view->texel_pitch >> 2)); + for (uint32_t y = 0, output_base_offset = 0; y < block_height; y++, output_base_offset += output_pitch) { + auto input_base_offset = TiledOffset2DOuter(y, (input_width / view->block_size), bpp); + for (uint32_t x = 0, output_offset = output_base_offset; x < block_width; x++, output_offset += view->texel_pitch) { + auto input_offset = TiledOffset2DInner(x, y, bpp, input_base_offset) >> bpp; + TextureSwap(dest + output_offset, + src + input_offset * view->texel_pitch, + view->texel_pitch, (XE_GPU_ENDIAN)fetch.endianness); + } + } + } + cache_->context()->Unmap(view->resource, 0); + return true; +} + +bool D3D11Texture::FetchTexture3D( + D3D11TextureView* view, const xe_gpu_texture_fetch_t& fetch) { + SCOPE_profile_cpu_f("gpu"); + + XELOGE("D3D11: FetchTexture3D not yet implemented"); + XEASSERTALWAYS(); + return false; + //D3D11_TEXTURE3D_DESC texture_desc; + //xe_zero_struct(&texture_desc, sizeof(texture_desc)); + //texture_desc.Width; + //texture_desc.Height; + //texture_desc.Depth; + //texture_desc.MipLevels; + //texture_desc.Format; + //texture_desc.Usage; + //texture_desc.BindFlags; + //texture_desc.CPUAccessFlags; + //texture_desc.MiscFlags; + //hr = device_->CreateTexture3D( + // &texture_desc, &initial_data, (ID3D11Texture3D**)&view->resource); +} + +bool D3D11Texture::FetchTextureCube( + D3D11TextureView* view, const xe_gpu_texture_fetch_t& fetch) { + SCOPE_profile_cpu_f("gpu"); + + XELOGE("D3D11: FetchTextureCube not yet implemented"); + XEASSERTALWAYS(); + return false; +} diff --git a/src/xenia/gpu/d3d11/d3d11_texture.h b/src/xenia/gpu/d3d11/d3d11_texture.h new file mode 100644 index 000000000..06cceb041 --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_texture.h @@ -0,0 +1,69 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_GPU_D3D11_D3D11_TEXTURE_H_ +#define XENIA_GPU_D3D11_D3D11_TEXTURE_H_ + +#include + +#include + +#include + + +namespace xe { +namespace gpu { +namespace d3d11 { + +class D3D11TextureCache; + + +struct D3D11TextureView : TextureView { + ID3D11Resource* resource; + ID3D11ShaderResourceView* srv; + + D3D11TextureView() + : resource(nullptr), srv(nullptr) {} + virtual ~D3D11TextureView() { + XESAFERELEASE(srv); + XESAFERELEASE(resource); + } +}; + + +class D3D11Texture : public Texture { +public: + D3D11Texture(D3D11TextureCache* cache, uint32_t address); + virtual ~D3D11Texture(); + + TextureView* Fetch( + const xenos::xe_gpu_texture_fetch_t& fetch) override; + +protected: + bool FetchTexture1D( + D3D11TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch); + bool FetchTexture2D( + D3D11TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch); + bool FetchTexture3D( + D3D11TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch); + bool FetchTextureCube( + D3D11TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch); + + D3D11TextureCache* cache_; + + // views +}; + + +} // namespace d3d11 +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_D3D11_D3D11_TEXTURE_H_ diff --git a/src/xenia/gpu/d3d11/d3d11_texture_cache.cc b/src/xenia/gpu/d3d11/d3d11_texture_cache.cc new file mode 100644 index 000000000..ca8a2f88d --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_texture_cache.cc @@ -0,0 +1,113 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include + + +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::d3d11; + + +D3D11TextureCache::D3D11TextureCache( + Memory* memory, + ID3D11DeviceContext* context, ID3D11Device* device) + : TextureCache(memory), + context_(context), device_(device) { + context_->AddRef(); + device_->AddRef(); +} + +D3D11TextureCache::~D3D11TextureCache() { + XESAFERELEASE(device_); + XESAFERELEASE(context_); +} + +Texture* D3D11TextureCache::CreateTexture( + uint32_t address, const xenos::xe_gpu_texture_fetch_t& fetch) { + return new D3D11Texture(this, address); +} + +ID3D11SamplerState* D3D11TextureCache::GetSamplerState( + const xenos::xe_gpu_texture_fetch_t& fetch, + const Shader::tex_buffer_desc_t& desc) { + D3D11_SAMPLER_DESC sampler_desc; + xe_zero_struct(&sampler_desc, sizeof(sampler_desc)); + uint32_t min_filter = desc.tex_fetch.min_filter == 3 ? + fetch.min_filter : desc.tex_fetch.min_filter; + uint32_t mag_filter = desc.tex_fetch.mag_filter == 3 ? + fetch.mag_filter : desc.tex_fetch.mag_filter; + uint32_t mip_filter = desc.tex_fetch.mip_filter == 3 ? + fetch.mip_filter : desc.tex_fetch.mip_filter; + // MIN, MAG, MIP + static const D3D11_FILTER filter_matrix[2][2][3] = { + { + // min = POINT + { + // mag = POINT + D3D11_FILTER_MIN_MAG_MIP_POINT, + D3D11_FILTER_MIN_MAG_POINT_MIP_LINEAR, + D3D11_FILTER_MIN_MAG_POINT_MIP_LINEAR, // basemap? + }, + { + // mag = LINEAR + D3D11_FILTER_MIN_POINT_MAG_LINEAR_MIP_POINT, + D3D11_FILTER_MIN_POINT_MAG_MIP_LINEAR, + D3D11_FILTER_MIN_POINT_MAG_MIP_LINEAR, // basemap? 
+ }, + }, + { + // min = LINEAR + { + // mag = POINT + D3D11_FILTER_MIN_LINEAR_MAG_MIP_POINT, + D3D11_FILTER_MIN_LINEAR_MAG_POINT_MIP_LINEAR, + D3D11_FILTER_MIN_LINEAR_MAG_POINT_MIP_LINEAR, // basemap? + }, + { + // mag = LINEAR + D3D11_FILTER_MIN_MAG_LINEAR_MIP_POINT, + D3D11_FILTER_MIN_MAG_MIP_LINEAR, + D3D11_FILTER_MIN_MAG_MIP_LINEAR, // basemap? + }, + }, + }; + sampler_desc.Filter = filter_matrix[min_filter][mag_filter][mip_filter]; + static const D3D11_TEXTURE_ADDRESS_MODE mode_map[] = { + D3D11_TEXTURE_ADDRESS_WRAP, + D3D11_TEXTURE_ADDRESS_MIRROR, + D3D11_TEXTURE_ADDRESS_CLAMP, // ? + D3D11_TEXTURE_ADDRESS_MIRROR_ONCE, // ? + D3D11_TEXTURE_ADDRESS_CLAMP, // ? + D3D11_TEXTURE_ADDRESS_MIRROR_ONCE, // ? + D3D11_TEXTURE_ADDRESS_BORDER, // ? + D3D11_TEXTURE_ADDRESS_MIRROR, // ? + }; + sampler_desc.AddressU = mode_map[fetch.clamp_x]; + sampler_desc.AddressV = mode_map[fetch.clamp_y]; + sampler_desc.AddressW = mode_map[fetch.clamp_z]; + sampler_desc.MipLODBias; + sampler_desc.MaxAnisotropy = 1; + sampler_desc.ComparisonFunc = D3D11_COMPARISON_ALWAYS; + sampler_desc.BorderColor[0]; + sampler_desc.BorderColor[1]; + sampler_desc.BorderColor[2]; + sampler_desc.BorderColor[3]; + sampler_desc.MinLOD; + sampler_desc.MaxLOD; + ID3D11SamplerState* sampler_state = NULL; + HRESULT hr = device_->CreateSamplerState(&sampler_desc, &sampler_state); + if (FAILED(hr)) { + XELOGE("D3D11: unable to create sampler state"); + return nullptr; + } + return sampler_state; +} diff --git a/src/xenia/gpu/d3d11/d3d11_texture_cache.h b/src/xenia/gpu/d3d11/d3d11_texture_cache.h new file mode 100644 index 000000000..4405a331c --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_texture_cache.h @@ -0,0 +1,55 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_GPU_D3D11_D3D11_TEXTURE_CACHE_H_ +#define XENIA_GPU_D3D11_D3D11_TEXTURE_CACHE_H_ + +#include + +#include +#include +#include + +#include + + +namespace xe { +namespace gpu { +namespace d3d11 { + + +class D3D11TextureCache : public TextureCache { +public: + D3D11TextureCache(Memory* memory, + ID3D11DeviceContext* context, ID3D11Device* device); + virtual ~D3D11TextureCache(); + + ID3D11DeviceContext* context() const { return context_; } + ID3D11Device* device() const { return device_; } + + ID3D11SamplerState* GetSamplerState( + const xenos::xe_gpu_texture_fetch_t& fetch, + const Shader::tex_buffer_desc_t& desc); + +protected: + Texture* CreateTexture(uint32_t address, + const xenos::xe_gpu_texture_fetch_t& fetch) override; + +private: + ID3D11DeviceContext* context_; + ID3D11Device* device_; +}; + + +} // namespace d3d11 +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_D3D11_D3D11_TEXTURE_CACHE_H_ diff --git a/src/xenia/gpu/d3d11/sources.gypi b/src/xenia/gpu/d3d11/sources.gypi index 6e0b193f5..46c391ee6 100644 --- a/src/xenia/gpu/d3d11/sources.gypi +++ b/src/xenia/gpu/d3d11/sources.gypi @@ -16,6 +16,10 @@ 'd3d11_shader.h', 'd3d11_shader_cache.cc', 'd3d11_shader_cache.h', + 'd3d11_texture.cc', + 'd3d11_texture.h', + 'd3d11_texture_cache.cc', + 'd3d11_texture_cache.h', 'd3d11_window.cc', 'd3d11_window.h', ], diff --git a/src/xenia/gpu/sources.gypi b/src/xenia/gpu/sources.gypi index 3d3ced141..9309e0ec3 100644 --- a/src/xenia/gpu/sources.gypi +++ b/src/xenia/gpu/sources.gypi @@ -15,6 +15,10 @@ 'shader.h', 'shader_cache.cc', 'shader_cache.h', + 'texture.cc', + 'texture.h', + 'texture_cache.cc', + 'texture_cache.h', ], 'includes': [ diff --git a/src/xenia/gpu/texture.cc b/src/xenia/gpu/texture.cc new file mode 100644 index 000000000..0b6ef8105 --- /dev/null +++ b/src/xenia/gpu/texture.cc @@ -0,0 +1,264 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include +#include + +// TODO(benvanik): replace DXGI constants with xenia constants. 
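+// Note: width/height in a fetch constant are biased by one (a 1280x720
+// texture arrives as size_2d.width = 1279, size_2d.height = 719), which is
+// why the fetch paths decode dimensions as 1 + width and 1 + height.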
+#include + + +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::xenos; + + +Texture::Texture(uint32_t address) + : address_(address) { +} + +bool Texture::FillViewInfo(TextureView* view, + const xenos::xe_gpu_texture_fetch_t& fetch) { + // http://msdn.microsoft.com/en-us/library/windows/desktop/cc308051(v=vs.85).aspx + // a2xx_sq_surfaceformat + + view->dimensions = fetch.dimension; + switch (fetch.dimension) { + case DIMENSION_1D: + view->width = fetch.size_1d.width; + break; + case DIMENSION_2D: + view->width = fetch.size_2d.width; + view->height = fetch.size_2d.height; + break; + case DIMENSION_3D: + view->width = fetch.size_3d.width; + view->height = fetch.size_3d.height; + view->depth = fetch.size_3d.depth; + break; + case DIMENSION_CUBE: + view->width = fetch.size_stack.width; + view->height = fetch.size_stack.height; + view->depth = fetch.size_stack.depth; + break; + } + view->format = DXGI_FORMAT_UNKNOWN; + view->block_size = 0; + view->texel_pitch = 0; + view->is_compressed = false; + switch (fetch.format) { + case FMT_8: + switch (fetch.swizzle) { + case XE_GPU_SWIZZLE_RRR1: + view->format = DXGI_FORMAT_R8_UNORM; + break; + case XE_GPU_SWIZZLE_000R: + view->format = DXGI_FORMAT_A8_UNORM; + break; + default: + XELOGW("D3D11: unhandled swizzle for FMT_8"); + view->format = DXGI_FORMAT_A8_UNORM; + break; + } + view->block_size = 1; + view->texel_pitch = 1; + break; + case FMT_1_5_5_5: + switch (fetch.swizzle) { + case XE_GPU_SWIZZLE_BGRA: + view->format = DXGI_FORMAT_B5G5R5A1_UNORM; + break; + default: + XELOGW("D3D11: unhandled swizzle for FMT_1_5_5_5"); + view->format = DXGI_FORMAT_B5G5R5A1_UNORM; + break; + } + view->block_size = 1; + view->texel_pitch = 2; + break; + case FMT_8_8_8_8: + switch (fetch.swizzle) { + case XE_GPU_SWIZZLE_RGBA: + view->format = DXGI_FORMAT_R8G8B8A8_UNORM; + break; + case XE_GPU_SWIZZLE_BGRA: + view->format = DXGI_FORMAT_B8G8R8A8_UNORM; + break; + case XE_GPU_SWIZZLE_RGB1: + view->format = DXGI_FORMAT_R8G8B8A8_UNORM; // ? + break; + case XE_GPU_SWIZZLE_BGR1: + view->format = DXGI_FORMAT_B8G8R8X8_UNORM; + break; + default: + XELOGW("D3D11: unhandled swizzle for FMT_8_8_8_8"); + view->format = DXGI_FORMAT_R8G8B8A8_UNORM; + break; + } + view->block_size = 1; + view->texel_pitch = 4; + break; + case FMT_4_4_4_4: + switch (fetch.swizzle) { + case XE_GPU_SWIZZLE_BGRA: + view->format = DXGI_FORMAT_B4G4R4A4_UNORM; // only supported on Windows 8+ + break; + default: + XELOGW("D3D11: unhandled swizzle for FMT_4_4_4_4"); + view->format = DXGI_FORMAT_B4G4R4A4_UNORM; // only supported on Windows 8+ + break; + } + view->block_size = 1; + view->texel_pitch = 2; + break; + case FMT_16_16_16_16_FLOAT: + switch (fetch.swizzle) { + case XE_GPU_SWIZZLE_RGBA: + view->format = DXGI_FORMAT_R16G16B16A16_FLOAT; + break; + default: + XELOGW("D3D11: unhandled swizzle for FMT_16_16_16_16_FLOAT"); + view->format = DXGI_FORMAT_R16G16B16A16_FLOAT; + break; + } + view->block_size = 1; + view->texel_pitch = 8; + break; + case FMT_32_FLOAT: + switch (fetch.swizzle) { + case XE_GPU_SWIZZLE_R111: + view->format = DXGI_FORMAT_R32_FLOAT; + break; + default: + XELOGW("D3D11: unhandled swizzle for FMT_32_FLOAT"); + view->format = DXGI_FORMAT_R32_FLOAT; + break; + } + view->block_size = 1; + view->texel_pitch = 4; + break; + case FMT_DXT1: + view->format = DXGI_FORMAT_BC1_UNORM; + view->block_size = 4; + view->texel_pitch = 8; + view->is_compressed = true; + break; + case FMT_DXT2_3: + case FMT_DXT4_5: + view->format = (fetch.format == FMT_DXT4_5 ? 
DXGI_FORMAT_BC3_UNORM : DXGI_FORMAT_BC2_UNORM); + view->block_size = 4; + view->texel_pitch = 16; + view->is_compressed = true; + break; + case FMT_1_REVERSE: + case FMT_1: + case FMT_5_6_5: + case FMT_6_5_5: + case FMT_2_10_10_10: + case FMT_8_A: + case FMT_8_B: + case FMT_8_8: + case FMT_Cr_Y1_Cb_Y0: + case FMT_Y1_Cr_Y0_Cb: + case FMT_5_5_5_1: + case FMT_8_8_8_8_A: + case FMT_10_11_11: + case FMT_11_11_10: + case FMT_24_8: + case FMT_24_8_FLOAT: + case FMT_16: + case FMT_16_16: + case FMT_16_16_16_16: + case FMT_16_EXPAND: + case FMT_16_16_EXPAND: + case FMT_16_16_16_16_EXPAND: + case FMT_16_FLOAT: + case FMT_16_16_FLOAT: + case FMT_32: + case FMT_32_32: + case FMT_32_32_32_32: + case FMT_32_32_FLOAT: + case FMT_32_32_32_32_FLOAT: + case FMT_32_AS_8: + case FMT_32_AS_8_8: + case FMT_16_MPEG: + case FMT_16_16_MPEG: + case FMT_8_INTERLACED: + case FMT_32_AS_8_INTERLACED: + case FMT_32_AS_8_8_INTERLACED: + case FMT_16_INTERLACED: + case FMT_16_MPEG_INTERLACED: + case FMT_16_16_MPEG_INTERLACED: + case FMT_DXN: + case FMT_8_8_8_8_AS_16_16_16_16: + case FMT_DXT1_AS_16_16_16_16: + case FMT_DXT2_3_AS_16_16_16_16: + case FMT_DXT4_5_AS_16_16_16_16: + case FMT_2_10_10_10_AS_16_16_16_16: + case FMT_10_11_11_AS_16_16_16_16: + case FMT_11_11_10_AS_16_16_16_16: + case FMT_32_32_32_FLOAT: + case FMT_DXT3A: + case FMT_DXT5A: + case FMT_CTX1: + case FMT_DXT3A_AS_1_1_1_1: + view->format = DXGI_FORMAT_UNKNOWN; + break; + } + return true; +} + +void Texture::TextureSwap(uint8_t* dest, const uint8_t* src, uint32_t pitch, + XE_GPU_ENDIAN endianness) { + switch (endianness) { + case XE_GPU_ENDIAN_8IN16: + for (uint32_t i = 0; i < pitch; i += 2, src += 2, dest += 2) { + *(uint16_t*)dest = XESWAP16(*(uint16_t*)src); + } + break; + case XE_GPU_ENDIAN_8IN32: // Swap bytes. + for (uint32_t i = 0; i < pitch; i += 4, src += 4, dest += 4) { + *(uint32_t*)dest = XESWAP32(*(uint32_t*)src); + } + break; + case XE_GPU_ENDIAN_16IN32: // Swap half words. + for (uint32_t i = 0; i < pitch; i += 4, src += 4, dest += 4) { + uint32_t value = *(uint32_t*)src; + *(uint32_t*)dest = ((value >> 16) & 0xFFFF) | (value << 16); + } + break; + default: + case XE_GPU_ENDIAN_NONE: + memcpy(dest, src, pitch); + break; + } +} + +// https://code.google.com/p/crunch/source/browse/trunk/inc/crn_decomp.h#4104 +uint32_t Texture::TiledOffset2DOuter(uint32_t y, uint32_t width, + uint32_t log_bpp) { + uint32_t macro = ((y >> 5) * (width >> 5)) << (log_bpp + 7); + uint32_t micro = ((y & 6) << 2) << log_bpp; + return macro + + ((micro & ~15) << 1) + + (micro & 15) + + ((y & 8) << (3 + log_bpp)) + + ((y & 1) << 4); +} + +uint32_t Texture::TiledOffset2DInner(uint32_t x, uint32_t y, uint32_t bpp, + uint32_t base_offset) { + uint32_t macro = (x >> 5) << (bpp + 7); + uint32_t micro = (x & 7) << bpp; + uint32_t offset = base_offset + (macro + ((micro & ~15) << 1) + (micro & 15)); + return ((offset & ~511) << 3) + ((offset & 448) << 2) + (offset & 63) + + ((y & 16) << 7) + (((((y & 8) >> 2) + (x >> 3)) & 3) << 6); +} diff --git a/src/xenia/gpu/texture.h b/src/xenia/gpu/texture.h new file mode 100644 index 000000000..24d595162 --- /dev/null +++ b/src/xenia/gpu/texture.h @@ -0,0 +1,74 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_GPU_TEXTURE_H_ +#define XENIA_GPU_TEXTURE_H_ + +#include +#include + +// TODO(benvanik): replace DXGI constants with xenia constants. +#include + + +namespace xe { +namespace gpu { + + +class Texture; + + +struct TextureView { + Texture* texture; + int dimensions; + uint32_t width; + uint32_t height; + uint32_t depth; + uint32_t block_size; + uint32_t texel_pitch; + bool is_compressed; + DXGI_FORMAT format; + + TextureView() + : texture(nullptr), + dimensions(0), + width(0), height(0), depth(0), + block_size(0), texel_pitch(0), + is_compressed(false), format(DXGI_FORMAT_UNKNOWN) {} +}; + + +class Texture { +public: + Texture(uint32_t address); + virtual ~Texture() = default; + + virtual TextureView* Fetch( + const xenos::xe_gpu_texture_fetch_t& fetch) = 0; + +protected: + bool FillViewInfo(TextureView* view, + const xenos::xe_gpu_texture_fetch_t& fetch); + + static void TextureSwap(uint8_t* dest, const uint8_t* src, uint32_t pitch, + xenos::XE_GPU_ENDIAN endianness); + static uint32_t TiledOffset2DOuter(uint32_t y, uint32_t width, + uint32_t log_bpp); + static uint32_t TiledOffset2DInner(uint32_t x, uint32_t y, uint32_t bpp, + uint32_t base_offset); + + uint32_t address_; +}; + + +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_TEXTURE_H_ diff --git a/src/xenia/gpu/texture_cache.cc b/src/xenia/gpu/texture_cache.cc new file mode 100644 index 000000000..f205008c2 --- /dev/null +++ b/src/xenia/gpu/texture_cache.cc @@ -0,0 +1,45 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include + + +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::xenos; + + +// https://github.com/ivmai/bdwgc/blob/master/os_dep.c + +TextureCache::TextureCache(Memory* memory) + : memory_(memory) { +} + +TextureCache::~TextureCache() { + // textures +} + +TextureView* TextureCache::FetchTexture( + uint32_t address, const xenos::xe_gpu_texture_fetch_t& fetch) { + auto it = textures_.find(address); + if (it == textures_.end()) { + // Texture not found. + auto texture = CreateTexture(address, fetch); + if (!texture) { + return nullptr; + } + textures_.insert({ address, texture }); + return texture->Fetch(fetch); + } else { + // Texture found. + return it->second->Fetch(fetch); + } +} diff --git a/src/xenia/gpu/texture_cache.h b/src/xenia/gpu/texture_cache.h new file mode 100644 index 000000000..dc796fe50 --- /dev/null +++ b/src/xenia/gpu/texture_cache.h @@ -0,0 +1,49 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_TEXTURE_CACHE_H_ +#define XENIA_GPU_TEXTURE_CACHE_H_ + +#include +#include +#include + + +namespace xe { +namespace gpu { + + +// TODO(benvanik): overlapping textures. 
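+//
+// Expected usage from a graphics driver (hypothetical call site; error
+// handling elided):
+//   TextureView* view = texture_cache->FetchTexture(address, fetch);
+//   if (view) { /* bind the backend-specific view to the pipeline */ }
+//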
+// TODO(benvanik): multiple textures (differing formats/etc) per address.
+class TextureCache {
+public:
+  TextureCache(Memory* memory);
+  virtual ~TextureCache();
+
+  Memory* memory() const { return memory_; }
+
+  TextureView* FetchTexture(
+      uint32_t address, const xenos::xe_gpu_texture_fetch_t& fetch);
+
+protected:
+  virtual Texture* CreateTexture(
+      uint32_t address, const xenos::xe_gpu_texture_fetch_t& fetch) = 0;
+
+  Memory* memory_;
+
+  // Mapped by guest address.
+  std::unordered_map<uint32_t, Texture*> textures_;
+};
+
+
+}  // namespace gpu
+}  // namespace xe
+
+
+#endif  // XENIA_GPU_TEXTURE_CACHE_H_

From 529a1478d894fdbfafa51c7fb3be8e33e4030fa0 Mon Sep 17 00:00:00 2001
From: Ben Vanik
Date: Sat, 31 May 2014 16:34:05 -0700
Subject: [PATCH 142/184] Sampler state caching.

---
 src/xenia/gpu/d3d11/d3d11_graphics_driver.cc | 25 --------------------
 src/xenia/gpu/d3d11/d3d11_texture_cache.cc   | 25 ++++++++++++++++++++
 src/xenia/gpu/d3d11/d3d11_texture_cache.h    |  6 +++++
 src/xenia/types.h                            | 10 ++++++++
 4 files changed, 41 insertions(+), 25 deletions(-)

diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc b/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc
index 6c466ccac..f29afd285 100644
--- a/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc
+++ b/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc
@@ -845,9 +845,6 @@ int D3D11GraphicsDriver::BindShaders() {
 
     // Setup input layout (as encoded in vertex shader).
     context_->IASetInputLayout(vs->input_layout());
-
-    //context_->VSSetSamplers
-    //context_->VSSetShaderResources
   } else {
     context_->VSSetShader(NULL, NULL, 0);
     context_->IASetInputLayout(NULL);
@@ -877,27 +874,6 @@ int D3D11GraphicsDriver::BindShaders() {
     };
     context_->PSSetConstantBuffers(
         0, XECOUNT(vs_constant_buffers), vs_constant_buffers);
-
-    // TODO(benvanik): set samplers for all inputs.
-    D3D11_SAMPLER_DESC sampler_desc;
-    xe_zero_struct(&sampler_desc, sizeof(sampler_desc));
-    //sampler_desc.Filter = ?
-    sampler_desc.AddressU = D3D11_TEXTURE_ADDRESS_CLAMP;
-    sampler_desc.AddressV = D3D11_TEXTURE_ADDRESS_CLAMP;
-    sampler_desc.AddressW = D3D11_TEXTURE_ADDRESS_CLAMP;
-    sampler_desc.MipLODBias = 0;
-    sampler_desc.MaxAnisotropy = 1;
-    sampler_desc.ComparisonFunc = D3D11_COMPARISON_ALWAYS;
-    //sampler_desc.BorderColor = ...;
-    sampler_desc.MinLOD = 0;
-    sampler_desc.MaxLOD = 0;
-    ID3D11SamplerState* sampler_state = NULL;
-    device_->CreateSamplerState(&sampler_desc, &sampler_state);
-    ID3D11SamplerState* sampler_states[] = { sampler_state };
-    context_->PSSetSamplers(0, XECOUNT(sampler_states), sampler_states);
-    sampler_state->Release();
-
-    //context_->PSSetShaderResources
   } else {
     context_->PSSetShader(NULL, NULL, 0);
     return 1;
@@ -1109,7 +1085,6 @@ int D3D11GraphicsDriver::PrepareTextureSampler(
   } else {
     context_->PSSetSamplers(desc.input_index, 1, &sampler_state);
   }
-  XESAFERELEASE(sampler_state);
   return 0;
 }
 
diff --git a/src/xenia/gpu/d3d11/d3d11_texture_cache.cc b/src/xenia/gpu/d3d11/d3d11_texture_cache.cc
index ca8a2f88d..ad8e4d09e 100644
--- a/src/xenia/gpu/d3d11/d3d11_texture_cache.cc
+++ b/src/xenia/gpu/d3d11/d3d11_texture_cache.cc
@@ -27,6 +27,12 @@ D3D11TextureCache::D3D11TextureCache(
 }
 
 D3D11TextureCache::~D3D11TextureCache() {
+  for (auto it = samplers_.begin(); it != samplers_.end(); ++it) {
+    auto& cached_state = it->second;
+    XESAFERELEASE(cached_state.state);
+  }
+  samplers_.clear();
+
   XESAFERELEASE(device_);
   XESAFERELEASE(context_);
 }
@@ -103,11 +109,30 @@ ID3D11SamplerState* D3D11TextureCache::GetSamplerState(
   sampler_desc.BorderColor[3];
   sampler_desc.MinLOD;
   sampler_desc.MaxLOD;
+
+  // TODO(benvanik): do this earlier without having to setup the whole struct?
+  size_t hash = hash_combine(
+      sampler_desc.Filter,
+      sampler_desc.AddressU,
+      sampler_desc.AddressV,
+      sampler_desc.AddressW);
+  auto range = samplers_.equal_range(hash);
+  for (auto it = range.first; it != range.second; ++it) {
+    const auto& cached_state = it->second;
+    // TODO(benvanik): faster compare?
+    if (memcmp(&sampler_desc, &cached_state.desc, sizeof(sampler_desc)) == 0) {
+      return cached_state.state;
+    }
+  }
+
   ID3D11SamplerState* sampler_state = NULL;
   HRESULT hr = device_->CreateSamplerState(&sampler_desc, &sampler_state);
   if (FAILED(hr)) {
     XELOGE("D3D11: unable to create sampler state");
     return nullptr;
   }
+
+  samplers_.insert({ hash, { sampler_desc, sampler_state } });
+
   return sampler_state;
 }
diff --git a/src/xenia/gpu/d3d11/d3d11_texture_cache.h b/src/xenia/gpu/d3d11/d3d11_texture_cache.h
index 4405a331c..ce0fdc310 100644
--- a/src/xenia/gpu/d3d11/d3d11_texture_cache.h
+++ b/src/xenia/gpu/d3d11/d3d11_texture_cache.h
@@ -44,6 +44,12 @@ protected:
 private:
   ID3D11DeviceContext* context_;
   ID3D11Device* device_;
+
+  struct CachedSamplerState {
+    D3D11_SAMPLER_DESC desc;
+    ID3D11SamplerState* state;
+  };
+  std::unordered_multimap<size_t, CachedSamplerState> samplers_;
 };
 
 
diff --git a/src/xenia/types.h b/src/xenia/types.h
index 42d6aa658..4cd3f5daf 100644
--- a/src/xenia/types.h
+++ b/src/xenia/types.h
@@ -134,6 +134,16 @@ typedef XECACHEALIGN volatile void xe_aligned_void_t;
 #endif  // GNUC
 #endif  // !MIN
 
+XEFORCEINLINE size_t hash_combine(size_t seed) {
+  return seed;
+}
+template <typename T, typename... Ts>
+size_t hash_combine(size_t seed, const T& v, const Ts&... vs) {
+  std::hash<T> hasher;
+  seed ^= hasher(v) + 0x9E3779B9 + (seed << 6) + (seed >> 2);
+  return hash_combine(seed, vs...);
+}
+
 #if XE_PLATFORM_WIN32
 #define XESAFERELEASE(p) if (p) { p->Release(); }
 #endif  // WIN32

From 19c48c7a90be614a8f4461455a2e73c2964caa5e Mon Sep 17 00:00:00 2001
From: Ben Vanik
Date: Sat, 31 May 2014 17:38:32 -0700
Subject: [PATCH 143/184] Partial texture cache - doesn't invalidate yet.

---
 src/xenia/gpu/d3d11/d3d11_texture.cc       | 141 +++++++++++----------
 src/xenia/gpu/d3d11/d3d11_texture.h        |  21 ++-
 src/xenia/gpu/d3d11/d3d11_texture_cache.cc |   5 +-
 src/xenia/gpu/d3d11/d3d11_texture_cache.h  |   2 +-
 src/xenia/gpu/texture.cc                   | 109 +++++++++++++++-
 src/xenia/gpu/texture.h                    |  44 ++++++-
 src/xenia/gpu/texture_cache.cc             |   9 +-
 src/xenia/gpu/texture_cache.h              |   3 +-
 8 files changed, 250 insertions(+), 84 deletions(-)

diff --git a/src/xenia/gpu/d3d11/d3d11_texture.cc b/src/xenia/gpu/d3d11/d3d11_texture.cc
index b4beb2ca4..809a971ac 100644
--- a/src/xenia/gpu/d3d11/d3d11_texture.cc
+++ b/src/xenia/gpu/d3d11/d3d11_texture.cc
@@ -21,19 +21,18 @@
 using namespace xe::gpu::d3d11;
 using namespace xe::gpu::xenos;
 
 
-D3D11Texture::D3D11Texture(D3D11TextureCache* cache, uint32_t address)
-  : Texture(address),
+D3D11Texture::D3D11Texture(D3D11TextureCache* cache, uint32_t address,
+                           const uint8_t* host_address)
+    : Texture(address, host_address),
       cache_(cache) {
 }
 
 D3D11Texture::~D3D11Texture() {
-  // views
 }
 
-TextureView* D3D11Texture::Fetch(
+TextureView* D3D11Texture::FetchNew(
     const xenos::xe_gpu_texture_fetch_t& fetch) {
   D3D11TextureView* view = new D3D11TextureView();
-  view->texture = this;
   if (!FillViewInfo(view, fetch)) {
     return nullptr;
   }
@@ -49,7 +48,7 @@ TextureView* D3D11Texture::Fetch(
       srv_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE1D;
       srv_desc.Texture1D.MipLevels = 1;
       srv_desc.Texture1D.MostDetailedMip = 0;
-      if (!FetchTexture1D(view, fetch)) {
+      if (!CreateTexture1D(view, fetch)) {
        XELOGE("D3D11: failed to fetch Texture1D");
         return nullptr;
       }
@@ -58,7 +57,7 @@ TextureView* D3D11Texture::Fetch(
       srv_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D;
       srv_desc.Texture2D.MipLevels = 1;
       srv_desc.Texture2D.MostDetailedMip = 0;
-      if (!FetchTexture2D(view, fetch)) {
+      if (!CreateTexture2D(view, fetch)) {
         XELOGE("D3D11: failed to fetch Texture2D");
         return nullptr;
       }
@@ -67,7 +66,7 @@ TextureView* D3D11Texture::Fetch(
       srv_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE3D;
       srv_desc.Texture3D.MipLevels = 1;
       srv_desc.Texture3D.MostDetailedMip = 0;
-      if (!FetchTexture3D(view, fetch)) {
+      if (!CreateTexture3D(view, fetch)) {
         XELOGE("D3D11: failed to fetch Texture3D");
         return nullptr;
       }
@@ -76,7 +75,7 @@ TextureView* D3D11Texture::Fetch(
       srv_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURECUBE;
       srv_desc.TextureCube.MipLevels = 1;
       srv_desc.TextureCube.MostDetailedMip = 0;
-      if (!FetchTextureCube(view, fetch)) {
+      if (!CreateTextureCube(view, fetch)) {
         XELOGE("D3D11: failed to fetch TextureCube");
         return nullptr;
       }
@@ -93,10 +92,24 @@ TextureView* D3D11Texture::Fetch(
   return view;
 }
 
-bool D3D11Texture::FetchTexture1D(
-    D3D11TextureView* view, const xe_gpu_texture_fetch_t& fetch) {
-  SCOPE_profile_cpu_f("gpu");
+bool D3D11Texture::FetchDirty(
+    TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch) {
+  auto d3d_view = static_cast<D3D11TextureView*>(view);
+  switch (view->dimensions) {
+    case DIMENSION_1D:
+      return FetchTexture1D(d3d_view, fetch);
+    case DIMENSION_2D:
+      return FetchTexture2D(d3d_view, fetch);
+    case DIMENSION_3D:
+      return FetchTexture3D(d3d_view, fetch);
+    case DIMENSION_CUBE:
return FetchTextureCube(d3d_view, fetch); + } + return false; +} +bool D3D11Texture::CreateTexture1D( + D3D11TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch) { uint32_t width = 1 + fetch.size_1d.width; D3D11_TEXTURE1D_DESC texture_desc; @@ -115,55 +128,26 @@ bool D3D11Texture::FetchTexture1D( return false; } + return FetchTexture1D(view, fetch); +} + +bool D3D11Texture::FetchTexture1D( + D3D11TextureView* view, const xe_gpu_texture_fetch_t& fetch) { + SCOPE_profile_cpu_f("gpu"); + // TODO(benvanik): upload! XELOGE("D3D11: FetchTexture1D not yet implemented"); return false; } -bool D3D11Texture::FetchTexture2D( - D3D11TextureView* view, const xe_gpu_texture_fetch_t& fetch) { - SCOPE_profile_cpu_f("gpu"); - +bool D3D11Texture::CreateTexture2D( + D3D11TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch) { XEASSERTTRUE(fetch.dimension == 1); - uint32_t logical_width = 1 + fetch.size_2d.width; - uint32_t logical_height = 1 + fetch.size_2d.height; - - uint32_t block_width = logical_width / view->block_size; - uint32_t block_height = logical_height / view->block_size; - - uint32_t input_width, input_height; - uint32_t output_width, output_height; - - if (!view->is_compressed) { - // must be 32x32, but also must have a pitch that is a multiple of 256 bytes - uint32_t bytes_per_block = view->block_size * view->block_size * - view->texel_pitch; - uint32_t width_multiple = 32; - if (bytes_per_block) { - uint32_t minimum_multiple = 256 / bytes_per_block; - if (width_multiple < minimum_multiple) { - width_multiple = minimum_multiple; - } - } - - input_width = XEROUNDUP(logical_width, width_multiple); - input_height = XEROUNDUP(logical_height, 32); - output_width = logical_width; - output_height = logical_height; - } - else { - // must be 128x128 - input_width = XEROUNDUP(logical_width, 128); - input_height = XEROUNDUP(logical_height, 128); - output_width = XENEXTPOW2(logical_width); - output_height = XENEXTPOW2(logical_height); - } - D3D11_TEXTURE2D_DESC texture_desc; xe_zero_struct(&texture_desc, sizeof(texture_desc)); - texture_desc.Width = output_width; - texture_desc.Height = output_height; + texture_desc.Width = view->sizes_2d.output_width; + texture_desc.Height = view->sizes_2d.output_height; texture_desc.MipLevels = 1; texture_desc.ArraySize = 1; texture_desc.Format = view->format; @@ -179,39 +163,50 @@ bool D3D11Texture::FetchTexture2D( return false; } + return FetchTexture2D(view, fetch); +} + +bool D3D11Texture::FetchTexture2D( + D3D11TextureView* view, const xe_gpu_texture_fetch_t& fetch) { + SCOPE_profile_cpu_f("gpu"); + + XEASSERTTRUE(fetch.dimension == 1); + + auto sizes = GetTextureSizes2D(view); + // TODO(benvanik): all mip levels. 
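+  // The tiled path below derives bpp as log2(texel_pitch), valid for the
+  // power-of-two pitches in use, e.g.:
+  //   texel_pitch =  4: (4 >> 2) + ((4 >> 1) >> (4 >> 2))    = 1 + 1 = 2
+  //   texel_pitch =  8: (8 >> 2) + ((8 >> 1) >> (8 >> 2))    = 2 + 1 = 3
+  //   texel_pitch = 16: (16 >> 2) + ((16 >> 1) >> (16 >> 2)) = 4 + 0 = 4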
D3D11_MAPPED_SUBRESOURCE res; - hr = cache_->context()->Map(view->resource, 0, - D3D11_MAP_WRITE_DISCARD, 0, &res); + HRESULT hr = cache_->context()->Map(view->resource, 0, + D3D11_MAP_WRITE_DISCARD, 0, &res); if (FAILED(hr)) { XELOGE("D3D11: failed to map texture"); return false; } - auto logical_pitch = (logical_width / view->block_size) * view->texel_pitch; - auto input_pitch = (input_width / view->block_size) * view->texel_pitch; - auto output_pitch = res.RowPitch; // (output_width / info.block_size) * info.texel_pitch; - const uint8_t* src = cache_->memory()->Translate(address_); uint8_t* dest = (uint8_t*)res.pData; //memset(dest, 0, output_pitch * (output_height / view->block_size)); // TODO(gibbed): remove me later + uint32_t output_pitch = res.RowPitch; // (output_width / info.block_size) * info.texel_pitch; if (!fetch.tiled) { dest = (uint8_t*)res.pData; - for (uint32_t y = 0; y < block_height; y++) { - for (uint32_t x = 0; x < logical_pitch; x += view->texel_pitch) { + for (uint32_t y = 0; y < sizes.block_height; y++) { + for (uint32_t x = 0; x < sizes.logical_pitch; x += view->texel_pitch) { TextureSwap(dest + x, src + x, view->texel_pitch, (XE_GPU_ENDIAN)fetch.endianness); } - src += input_pitch; + src += sizes.input_pitch; dest += output_pitch; } - } - else { + } else { auto bpp = (view->texel_pitch >> 2) + ((view->texel_pitch >> 1) >> (view->texel_pitch >> 2)); - for (uint32_t y = 0, output_base_offset = 0; y < block_height; y++, output_base_offset += output_pitch) { - auto input_base_offset = TiledOffset2DOuter(y, (input_width / view->block_size), bpp); - for (uint32_t x = 0, output_offset = output_base_offset; x < block_width; x++, output_offset += view->texel_pitch) { + for (uint32_t y = 0, output_base_offset = 0; + y < sizes.block_height; + y++, output_base_offset += output_pitch) { + auto input_base_offset = TiledOffset2DOuter(y, (sizes.input_width / view->block_size), bpp); + for (uint32_t x = 0, output_offset = output_base_offset; + x < sizes.block_width; + x++, output_offset += view->texel_pitch) { auto input_offset = TiledOffset2DInner(x, y, bpp, input_base_offset) >> bpp; TextureSwap(dest + output_offset, src + input_offset * view->texel_pitch, @@ -223,6 +218,13 @@ bool D3D11Texture::FetchTexture2D( return true; } +bool D3D11Texture::CreateTexture3D( + D3D11TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch) { + XELOGE("D3D11: CreateTexture3D not yet implemented"); + XEASSERTALWAYS(); + return false; +} + bool D3D11Texture::FetchTexture3D( D3D11TextureView* view, const xe_gpu_texture_fetch_t& fetch) { SCOPE_profile_cpu_f("gpu"); @@ -245,6 +247,13 @@ bool D3D11Texture::FetchTexture3D( // &texture_desc, &initial_data, (ID3D11Texture3D**)&view->resource); } +bool D3D11Texture::CreateTextureCube( + D3D11TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch) { + XELOGE("D3D11: CreateTextureCube not yet implemented"); + XEASSERTALWAYS(); + return false; +} + bool D3D11Texture::FetchTextureCube( D3D11TextureView* view, const xe_gpu_texture_fetch_t& fetch) { SCOPE_profile_cpu_f("gpu"); diff --git a/src/xenia/gpu/d3d11/d3d11_texture.h b/src/xenia/gpu/d3d11/d3d11_texture.h index 06cceb041..a8ee91662 100644 --- a/src/xenia/gpu/d3d11/d3d11_texture.h +++ b/src/xenia/gpu/d3d11/d3d11_texture.h @@ -39,25 +39,34 @@ struct D3D11TextureView : TextureView { class D3D11Texture : public Texture { public: - D3D11Texture(D3D11TextureCache* cache, uint32_t address); + D3D11Texture(D3D11TextureCache* cache, uint32_t address, + const uint8_t* host_address); virtual 
~D3D11Texture(); - TextureView* Fetch( - const xenos::xe_gpu_texture_fetch_t& fetch) override; - protected: + TextureView* FetchNew( + const xenos::xe_gpu_texture_fetch_t& fetch) override; + bool FetchDirty( + TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch) override; + + bool CreateTexture1D( + D3D11TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch); bool FetchTexture1D( D3D11TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch); + bool CreateTexture2D( + D3D11TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch); bool FetchTexture2D( D3D11TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch); + bool CreateTexture3D( + D3D11TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch); bool FetchTexture3D( D3D11TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch); + bool CreateTextureCube( + D3D11TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch); bool FetchTextureCube( D3D11TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch); D3D11TextureCache* cache_; - - // views }; diff --git a/src/xenia/gpu/d3d11/d3d11_texture_cache.cc b/src/xenia/gpu/d3d11/d3d11_texture_cache.cc index ad8e4d09e..eb3442bfc 100644 --- a/src/xenia/gpu/d3d11/d3d11_texture_cache.cc +++ b/src/xenia/gpu/d3d11/d3d11_texture_cache.cc @@ -38,8 +38,9 @@ D3D11TextureCache::~D3D11TextureCache() { } Texture* D3D11TextureCache::CreateTexture( - uint32_t address, const xenos::xe_gpu_texture_fetch_t& fetch) { - return new D3D11Texture(this, address); + uint32_t address, const uint8_t* host_address, + const xenos::xe_gpu_texture_fetch_t& fetch) { + return new D3D11Texture(this, address, host_address); } ID3D11SamplerState* D3D11TextureCache::GetSamplerState( diff --git a/src/xenia/gpu/d3d11/d3d11_texture_cache.h b/src/xenia/gpu/d3d11/d3d11_texture_cache.h index ce0fdc310..63f275d02 100644 --- a/src/xenia/gpu/d3d11/d3d11_texture_cache.h +++ b/src/xenia/gpu/d3d11/d3d11_texture_cache.h @@ -38,7 +38,7 @@ public: const Shader::tex_buffer_desc_t& desc); protected: - Texture* CreateTexture(uint32_t address, + Texture* CreateTexture(uint32_t address, const uint8_t* host_address, const xenos::xe_gpu_texture_fetch_t& fetch) override; private: diff --git a/src/xenia/gpu/texture.cc b/src/xenia/gpu/texture.cc index 0b6ef8105..d624d82ce 100644 --- a/src/xenia/gpu/texture.cc +++ b/src/xenia/gpu/texture.cc @@ -21,8 +21,54 @@ using namespace xe::gpu; using namespace xe::gpu::xenos; -Texture::Texture(uint32_t address) - : address_(address) { +Texture::Texture(uint32_t address, const uint8_t* host_address) + : address_(address), host_address_(host_address) { +} + +Texture::~Texture() { + for (auto it = views_.begin(); it != views_.end(); ++it) { + auto view = *it; + delete view; + } + views_.clear(); +} + +TextureView* Texture::Fetch( + const xenos::xe_gpu_texture_fetch_t& fetch) { + // TODO(benvanik): compute length for hash check. + size_t length = 0; + switch (fetch.dimension) { + case DIMENSION_1D: + break; + case DIMENSION_2D: + break; + case DIMENSION_3D: + break; + case DIMENSION_CUBE: + break; + } + uint64_t hash = xe_hash64(host_address_, length); + + for (auto it = views_.begin(); it != views_.end(); ++it) { + auto view = *it; + if (memcmp(&view->fetch, &fetch, sizeof(fetch))) { + continue; + } + bool dirty = hash != view->hash; + if (dirty) { + return FetchDirty(view, fetch) ? 
view : nullptr; + } else { + return view; + } + } + + auto new_view = FetchNew(fetch); + if (!new_view) { + return nullptr; + } + new_view->hash = hash; + views_.push_back(new_view); + return new_view; } bool Texture::FillViewInfo(TextureView* view, @@ -30,6 +76,9 @@ bool Texture::FillViewInfo(TextureView* view, // http://msdn.microsoft.com/en-us/library/windows/desktop/cc308051(v=vs.85).aspx // a2xx_sq_surfaceformat + view->texture = this; + view->fetch = fetch; + view->dimensions = fetch.dimension; switch (fetch.dimension) { case DIMENSION_1D: @@ -213,9 +262,65 @@ bool Texture::FillViewInfo(TextureView* view, view->format = DXGI_FORMAT_UNKNOWN; break; } + + if (view->format == DXGI_FORMAT_UNKNOWN) { + return false; + } + + switch (fetch.dimension) { + case DIMENSION_1D: + break; + case DIMENSION_2D: + view->sizes_2d = GetTextureSizes2D(view); + break; + case DIMENSION_3D: + break; + case DIMENSION_CUBE: + break; + } return true; } +const TextureSizes2D Texture::GetTextureSizes2D(TextureView* view) { + TextureSizes2D sizes; + + sizes.logical_width = 1 + view->fetch.size_2d.width; + sizes.logical_height = 1 + view->fetch.size_2d.height; + + sizes.block_width = sizes.logical_width / view->block_size; + sizes.block_height = sizes.logical_height / view->block_size; + + if (!view->is_compressed) { + // must be 32x32, but also must have a pitch that is a multiple of 256 bytes + uint32_t bytes_per_block = view->block_size * view->block_size * + view->texel_pitch; + uint32_t width_multiple = 32; + if (bytes_per_block) { + uint32_t minimum_multiple = 256 / bytes_per_block; + if (width_multiple < minimum_multiple) { + width_multiple = minimum_multiple; + } + } + sizes.input_width = XEROUNDUP(sizes.logical_width, width_multiple); + sizes.input_height = XEROUNDUP(sizes.logical_height, 32); + sizes.output_width = sizes.logical_width; + sizes.output_height = sizes.logical_height; + } else { + // must be 128x128 + sizes.input_width = XEROUNDUP(sizes.logical_width, 128); + sizes.input_height = XEROUNDUP(sizes.logical_height, 128); + sizes.output_width = XENEXTPOW2(sizes.logical_width); + sizes.output_height = XENEXTPOW2(sizes.logical_height); + } + + sizes.logical_pitch = + (sizes.logical_width / view->block_size) * view->texel_pitch; + sizes.input_pitch = + (sizes.input_width / view->block_size) * view->texel_pitch; + + return sizes; +} + void Texture::TextureSwap(uint8_t* dest, const uint8_t* src, uint32_t pitch, XE_GPU_ENDIAN endianness) { switch (endianness) { diff --git a/src/xenia/gpu/texture.h b/src/xenia/gpu/texture.h index 24d595162..9b919a5d9 100644 --- a/src/xenia/gpu/texture.h +++ b/src/xenia/gpu/texture.h @@ -23,9 +23,34 @@ namespace gpu { class Texture; +struct TextureSizes1D {}; +struct TextureSizes2D { + uint32_t logical_width; + uint32_t logical_height; + uint32_t block_width; + uint32_t block_height; + uint32_t input_width; + uint32_t input_height; + uint32_t output_width; + uint32_t output_height; + uint32_t logical_pitch; + uint32_t input_pitch; +}; +struct TextureSizes3D {}; +struct TextureSizesCube {}; struct TextureView { Texture* texture; + xenos::xe_gpu_texture_fetch_t fetch; + uint64_t hash; + + union { + TextureSizes1D sizes_1d; + TextureSizes2D sizes_2d; + TextureSizes3D sizes_3d; + TextureSizesCube sizes_cube; + }; + int dimensions; uint32_t width; uint32_t height; @@ -46,16 +71,23 @@ struct TextureView { class Texture { public: - Texture(uint32_t address); - virtual ~Texture() = default; + Texture(uint32_t address, const uint8_t* host_address); + virtual ~Texture(); - 
virtual TextureView* Fetch(
-      const xenos::xe_gpu_texture_fetch_t& fetch) = 0;
+  TextureView* Fetch(
+      const xenos::xe_gpu_texture_fetch_t& fetch);
 
 protected:
   bool FillViewInfo(TextureView* view,
                     const xenos::xe_gpu_texture_fetch_t& fetch);
 
+  virtual TextureView* FetchNew(
+      const xenos::xe_gpu_texture_fetch_t& fetch) = 0;
+  virtual bool FetchDirty(
+      TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch) = 0;
+
+  const TextureSizes2D GetTextureSizes2D(TextureView* view);
+
   static void TextureSwap(uint8_t* dest, const uint8_t* src, uint32_t pitch,
                           xenos::XE_GPU_ENDIAN endianness);
   static uint32_t TiledOffset2DOuter(uint32_t y, uint32_t width,
                                      uint32_t log_bpp);
@@ -64,6 +96,10 @@ protected:
                                      uint32_t base_offset);
 
   uint32_t address_;
+  const uint8_t* host_address_;
+
+  // TODO(benvanik): replace with LRU keyed list.
+  std::vector<TextureView*> views_;
 };
 
 
diff --git a/src/xenia/gpu/texture_cache.cc b/src/xenia/gpu/texture_cache.cc
index f205008c2..1f0a4a5ac 100644
--- a/src/xenia/gpu/texture_cache.cc
+++ b/src/xenia/gpu/texture_cache.cc
@@ -24,7 +24,11 @@ TextureCache::TextureCache(Memory* memory)
 }
 
 TextureCache::~TextureCache() {
-  // textures
+  for (auto it = textures_.begin(); it != textures_.end(); ++it) {
+    auto texture = it->second;
+    delete texture;
+  }
+  textures_.clear();
 }
 
 TextureView* TextureCache::FetchTexture(
@@ -32,7 +36,8 @@ TextureView* TextureCache::FetchTexture(
   auto it = textures_.find(address);
   if (it == textures_.end()) {
     // Texture not found.
-    auto texture = CreateTexture(address, fetch);
+    const uint8_t* host_address = memory_->Translate(address);
+    auto texture = CreateTexture(address, host_address, fetch);
     if (!texture) {
       return nullptr;
     }
diff --git a/src/xenia/gpu/texture_cache.h b/src/xenia/gpu/texture_cache.h
index dc796fe50..285ffe1d7 100644
--- a/src/xenia/gpu/texture_cache.h
+++ b/src/xenia/gpu/texture_cache.h
@@ -33,7 +33,8 @@ public:
 
 protected:
   virtual Texture* CreateTexture(
-      uint32_t address, const xenos::xe_gpu_texture_fetch_t& fetch) = 0;
+      uint32_t address, const uint8_t* host_address,
+      const xenos::xe_gpu_texture_fetch_t& fetch) = 0;
 
   Memory* memory_;
 
From 2d173ea62bb81eb250e17abdf349672d20283cac Mon Sep 17 00:00:00 2001
From: Ben Vanik
Date: Sat, 31 May 2014 22:26:39 -0700
Subject: [PATCH 144/184] Caching index buffers.
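
Index buffers are keyed by their fetch parameters and re-uploaded only
when the hash of the guest bytes changes. A sketch of the call shape
(hypothetical call site; the actual wiring is in the
d3d11_graphics_driver.cc changes below):

    IndexBufferInfo info = { index_32bit, index_count, index_size,
                             endianness };
    IndexBuffer* ib = buffer_cache_->FetchIndexBuffer(info, src_ptr, length);
    // On D3D11, ib->handle() is then ready to bind; its contents are only
    // rewritten when xe_hash64(src_ptr, length) differs from the cached hash.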
--- src/xenia/gpu/buffer.cc | 40 ++++++++++ src/xenia/gpu/buffer.h | 72 ++++++++++++++++++ src/xenia/gpu/buffer_cache.cc | 57 ++++++++++++++ src/xenia/gpu/buffer_cache.h | 47 ++++++++++++ src/xenia/gpu/d3d11/d3d11_buffer.cc | 79 ++++++++++++++++++++ src/xenia/gpu/d3d11/d3d11_buffer.h | 57 ++++++++++++++ src/xenia/gpu/d3d11/d3d11_buffer_cache.cc | 38 ++++++++++ src/xenia/gpu/d3d11/d3d11_buffer_cache.h | 50 +++++++++++++ src/xenia/gpu/d3d11/d3d11_graphics_driver.cc | 47 ++++-------- src/xenia/gpu/d3d11/d3d11_graphics_driver.h | 2 + src/xenia/gpu/d3d11/d3d11_shader_cache.h | 4 +- src/xenia/gpu/d3d11/sources.gypi | 4 + src/xenia/gpu/shader_cache.cc | 12 ++- src/xenia/gpu/sources.gypi | 4 + 14 files changed, 471 insertions(+), 42 deletions(-) create mode 100644 src/xenia/gpu/buffer.cc create mode 100644 src/xenia/gpu/buffer.h create mode 100644 src/xenia/gpu/buffer_cache.cc create mode 100644 src/xenia/gpu/buffer_cache.h create mode 100644 src/xenia/gpu/d3d11/d3d11_buffer.cc create mode 100644 src/xenia/gpu/d3d11/d3d11_buffer.h create mode 100644 src/xenia/gpu/d3d11/d3d11_buffer_cache.cc create mode 100644 src/xenia/gpu/d3d11/d3d11_buffer_cache.h diff --git a/src/xenia/gpu/buffer.cc b/src/xenia/gpu/buffer.cc new file mode 100644 index 000000000..0b7fe9ad6 --- /dev/null +++ b/src/xenia/gpu/buffer.cc @@ -0,0 +1,40 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include + + +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::xenos; + + +Buffer::Buffer( + const uint8_t* src_ptr, size_t length) : + src_(src_ptr), length_(length) { +} + +Buffer::~Buffer() { +} + +IndexBuffer::IndexBuffer(const IndexBufferInfo& info, + const uint8_t* src_ptr, size_t length) + : Buffer(src_ptr, length), + info_(info) { +} + +IndexBuffer::~IndexBuffer() {} + +VertexBuffer::VertexBuffer(const uint8_t* src_ptr, size_t length) + : Buffer(src_ptr, length) { +} + +VertexBuffer::~VertexBuffer() {} diff --git a/src/xenia/gpu/buffer.h b/src/xenia/gpu/buffer.h new file mode 100644 index 000000000..bc83ed20d --- /dev/null +++ b/src/xenia/gpu/buffer.h @@ -0,0 +1,72 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_GPU_BUFFER_H_ +#define XENIA_GPU_BUFFER_H_ + +#include +#include +#include + + +namespace xe { +namespace gpu { + + +class Buffer { +public: + Buffer(const uint8_t* src_ptr, size_t length); + virtual ~Buffer(); + + const uint8_t* src() const { return src_; } + size_t length() const { return length_; } + uint64_t hash() const { return hash_; } + + virtual bool FetchNew(uint64_t hash) = 0; + virtual bool FetchDirty(uint64_t hash) = 0; + +protected: + const uint8_t* src_; + size_t length_; + uint64_t hash_; +}; + + +struct IndexBufferInfo { + bool index_32bit; + uint32_t index_count; + uint32_t index_size; + uint32_t endianness; +}; + + +class IndexBuffer : public Buffer { +public: + IndexBuffer(const IndexBufferInfo& info, + const uint8_t* src_ptr, size_t length); + virtual ~IndexBuffer(); + +protected: + IndexBufferInfo info_; +}; + + +class VertexBuffer : public Buffer { +public: + VertexBuffer(const uint8_t* src_ptr, size_t length); + virtual ~VertexBuffer(); +}; + + + +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_BUFFER_H_ diff --git a/src/xenia/gpu/buffer_cache.cc b/src/xenia/gpu/buffer_cache.cc new file mode 100644 index 000000000..1f1d9ac00 --- /dev/null +++ b/src/xenia/gpu/buffer_cache.cc @@ -0,0 +1,57 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include + + +using namespace std; +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::xenos; + + +BufferCache::BufferCache() { +} + +BufferCache::~BufferCache() { + Clear(); +} + +IndexBuffer* BufferCache::FetchIndexBuffer( + const IndexBufferInfo& info, + const uint8_t* src_ptr, size_t length) { + size_t key = hash_combine(info.endianness, info.index_32bit, info.index_count, info.index_size); + size_t hash = xe_hash64(src_ptr, length); + auto it = index_buffer_map_.find(key); + if (it != index_buffer_map_.end()) { + if (hash == it->second->hash()) { + return it->second; + } else { + return it->second->FetchDirty(hash) ? it->second : nullptr; + } + } else { + auto buffer = CreateIndexBuffer(info, src_ptr, length); + index_buffer_map_.insert({ key, buffer }); + if (!buffer->FetchNew(hash)) { + return nullptr; + } + return buffer; + } +} + +void BufferCache::Clear() { + for (auto it = index_buffer_map_.begin(); + it != index_buffer_map_.end(); ++it) { + auto buffer = it->second; + delete buffer; + } + index_buffer_map_.clear(); +} diff --git a/src/xenia/gpu/buffer_cache.h b/src/xenia/gpu/buffer_cache.h new file mode 100644 index 000000000..21a057a0c --- /dev/null +++ b/src/xenia/gpu/buffer_cache.h @@ -0,0 +1,47 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_GPU_BUFFER_CACHE_H_ +#define XENIA_GPU_BUFFER_CACHE_H_ + +#include +#include +#include + + +namespace xe { +namespace gpu { + + +class BufferCache { +public: + BufferCache(); + virtual ~BufferCache(); + + IndexBuffer* FetchIndexBuffer( + const IndexBufferInfo& info, + const uint8_t* src_ptr, size_t length); + + void Clear(); + +protected: + virtual IndexBuffer* CreateIndexBuffer( + const IndexBufferInfo& info, + const uint8_t* src_ptr, size_t length) = 0; + +private: + std::unordered_map index_buffer_map_; +}; + + +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_BUFFER_CACHE_H_ diff --git a/src/xenia/gpu/d3d11/d3d11_buffer.cc b/src/xenia/gpu/d3d11/d3d11_buffer.cc new file mode 100644 index 000000000..98ea3ba9b --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_buffer.cc @@ -0,0 +1,79 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include +#include + + +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::d3d11; +using namespace xe::gpu::xenos; + + +D3D11IndexBuffer::D3D11IndexBuffer( + D3D11BufferCache* buffer_cache, + const IndexBufferInfo& info, + const uint8_t* src_ptr, size_t length) + : IndexBuffer(info, src_ptr, length), + buffer_cache_(buffer_cache), + handle_(nullptr) { +} + +D3D11IndexBuffer::~D3D11IndexBuffer() { + XESAFERELEASE(handle_); +} + +bool D3D11IndexBuffer::FetchNew(uint64_t hash) { + hash_ = hash; + + D3D11_BUFFER_DESC buffer_desc; + xe_zero_struct(&buffer_desc, sizeof(buffer_desc)); + buffer_desc.ByteWidth = info_.index_size; + buffer_desc.Usage = D3D11_USAGE_DYNAMIC; + buffer_desc.BindFlags = D3D11_BIND_INDEX_BUFFER; + buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; + HRESULT hr = buffer_cache_->device()->CreateBuffer(&buffer_desc, NULL, &handle_); + if (FAILED(hr)) { + XELOGW("D3D11: failed to create index buffer"); + return false; + } + + return FetchDirty(hash); +} + +bool D3D11IndexBuffer::FetchDirty(uint64_t hash) { + hash_ = hash; + + // All that's done so far: + XEASSERT(info_.endianness == 0x2); + + D3D11_MAPPED_SUBRESOURCE res; + buffer_cache_->context()->Map(handle_, 0, D3D11_MAP_WRITE_DISCARD, 0, &res); + if (info_.index_32bit) { + const uint32_t* src = reinterpret_cast(src_); + uint32_t* dest = reinterpret_cast(res.pData); + for (uint32_t n = 0; n < info_.index_count; n++) { + uint32_t d = { XESWAP32(src[n]) }; + dest[n] = d; + } + } else { + const uint16_t* src = reinterpret_cast(src_); + uint16_t* dest = reinterpret_cast(res.pData); + for (uint32_t n = 0; n < info_.index_count; n++) { + uint16_t d = XESWAP16(src[n]); + dest[n] = d; + } + } + buffer_cache_->context()->Unmap(handle_, 0); + + return true; +} diff --git a/src/xenia/gpu/d3d11/d3d11_buffer.h b/src/xenia/gpu/d3d11/d3d11_buffer.h new file mode 100644 index 000000000..02160db2e --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_buffer.h @@ -0,0 +1,57 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + 
****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_D3D11_D3D11_BUFFER_H_ +#define XENIA_GPU_D3D11_D3D11_BUFFER_H_ + +#include + +#include +#include + +#include + + +namespace xe { +namespace gpu { +namespace d3d11 { + +class D3D11BufferCache; + + +class D3D11IndexBuffer : public IndexBuffer { +public: + D3D11IndexBuffer(D3D11BufferCache* buffer_cache, + const IndexBufferInfo& info, + const uint8_t* src_ptr, size_t length); + virtual ~D3D11IndexBuffer(); + + ID3D11Buffer* handle() const { return handle_; } + + bool FetchNew(uint64_t hash) override; + bool FetchDirty(uint64_t hash) override; + +private: + D3D11BufferCache* buffer_cache_; + ID3D11Buffer* handle_; +}; + + +class D3D11VertexBuffer : public VertexBuffer { +public: +private: +}; + + +} // namespace d3d11 +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_D3D11_D3D11_BUFFER_H_ diff --git a/src/xenia/gpu/d3d11/d3d11_buffer_cache.cc b/src/xenia/gpu/d3d11/d3d11_buffer_cache.cc new file mode 100644 index 000000000..b6aac9d1b --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_buffer_cache.cc @@ -0,0 +1,38 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include +#include + + +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::d3d11; +using namespace xe::gpu::xenos; + + +D3D11BufferCache::D3D11BufferCache(ID3D11DeviceContext* context, + ID3D11Device* device) + : context_(context), device_(device) { + context->AddRef(); + device_->AddRef(); +} + +D3D11BufferCache::~D3D11BufferCache() { + XESAFERELEASE(device_); + XESAFERELEASE(context_); +} + +IndexBuffer* D3D11BufferCache::CreateIndexBuffer( + const IndexBufferInfo& info, + const uint8_t* src_ptr, size_t length) { + return new D3D11IndexBuffer(this, info, src_ptr, length); +} diff --git a/src/xenia/gpu/d3d11/d3d11_buffer_cache.h b/src/xenia/gpu/d3d11/d3d11_buffer_cache.h new file mode 100644 index 000000000..eca2f5b55 --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_buffer_cache.h @@ -0,0 +1,50 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_GPU_D3D11_D3D11_BUFFER_CACHE_H_ +#define XENIA_GPU_D3D11_D3D11_BUFFER_CACHE_H_ + +#include + +#include +#include + +#include + + +namespace xe { +namespace gpu { +namespace d3d11 { + + +class D3D11BufferCache : public BufferCache { +public: + D3D11BufferCache(ID3D11DeviceContext* context, ID3D11Device* device); + virtual ~D3D11BufferCache(); + + ID3D11DeviceContext* context() const { return context_; } + ID3D11Device* device() const { return device_; } + +protected: + IndexBuffer* CreateIndexBuffer( + const IndexBufferInfo& info, + const uint8_t* src_ptr, size_t length) override; + +protected: + ID3D11DeviceContext* context_; + ID3D11Device* device_; +}; + + +} // namespace d3d11 +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_D3D11_D3D11_BUFFER_CACHE_H_ diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc b/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc index f29afd285..209313091 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc +++ b/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc @@ -10,6 +10,8 @@ #include #include +#include +#include #include #include #include @@ -33,6 +35,7 @@ D3D11GraphicsDriver::D3D11GraphicsDriver( device_ = device; device_->AddRef(); device_->GetImmediateContext(&context_); + buffer_cache_ = new D3D11BufferCache(context_, device_); shader_cache_ = new D3D11ShaderCache(device_); texture_cache_ = new D3D11TextureCache(memory_, context_, device_); @@ -136,6 +139,7 @@ D3D11GraphicsDriver::~D3D11GraphicsDriver() { XESAFERELEASE(state_.constant_buffers.gs_consts); XESAFERELEASE(invalid_texture_view_); XESAFERELEASE(invalid_texture_sampler_state_); + delete buffer_cache_; delete texture_cache_; delete shader_cache_; XESAFERELEASE(context_); @@ -1098,43 +1102,20 @@ int D3D11GraphicsDriver::PrepareIndexBuffer( uint32_t address = index_base + address_translation_; - // All that's done so far: - XEASSERT(endianness == 0x2); - - ID3D11Buffer* buffer = 0; - D3D11_BUFFER_DESC buffer_desc; - xe_zero_struct(&buffer_desc, sizeof(buffer_desc)); - buffer_desc.ByteWidth = index_size; - buffer_desc.Usage = D3D11_USAGE_DYNAMIC; - buffer_desc.BindFlags = D3D11_BIND_INDEX_BUFFER; - buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; - device_->CreateBuffer(&buffer_desc, NULL, &buffer); - D3D11_MAPPED_SUBRESOURCE res; - context_->Map(buffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &res); - if (index_32bit) { - uint32_t* src = (uint32_t*)memory_->Translate(address); - uint32_t* dest = (uint32_t*)res.pData; - for (uint32_t n = 0; n < index_count; n++) { - uint32_t d = { XESWAP32(src[n]) }; - //XELOGGPU("i%.4d %0.8X", n, d); - dest[n] = d; - } - } else { - uint16_t* src = (uint16_t*)memory_->Translate(address); - uint16_t* dest = (uint16_t*)res.pData; - for (uint32_t n = 0; n < index_count; n++) { - uint16_t d = XESWAP16(src[n]); - //XELOGGPU("i%.4d, %.4X", n, d); - dest[n] = d; - } + IndexBufferInfo info; + info.endianness = endianness; + info.index_32bit = index_32bit; + info.index_count = index_count; + info.index_size = index_size; + auto ib = static_cast(buffer_cache_->FetchIndexBuffer( + info, memory_->Translate(address), index_size)); + if (!ib) { + return 1; } - context_->Unmap(buffer, 0); DXGI_FORMAT format; format = index_32bit ? 
DXGI_FORMAT_R32_UINT : DXGI_FORMAT_R16_UINT; - context_->IASetIndexBuffer(buffer, format, 0); - - buffer->Release(); + context_->IASetIndexBuffer(ib->handle(), format, 0); return 0; } diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_driver.h b/src/xenia/gpu/d3d11/d3d11_graphics_driver.h index 5a289d255..2f2316488 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_driver.h +++ b/src/xenia/gpu/d3d11/d3d11_graphics_driver.h @@ -24,6 +24,7 @@ namespace xe { namespace gpu { namespace d3d11 { +class D3D11BufferCache; class D3D11PixelShader; class D3D11ShaderCache; class D3D11TextureCache; @@ -76,6 +77,7 @@ private: IDXGISwapChain* swap_chain_; ID3D11Device* device_; ID3D11DeviceContext* context_; + D3D11BufferCache* buffer_cache_; D3D11ShaderCache* shader_cache_; D3D11TextureCache* texture_cache_; diff --git a/src/xenia/gpu/d3d11/d3d11_shader_cache.h b/src/xenia/gpu/d3d11/d3d11_shader_cache.h index 661fb38f8..8c33523b4 100644 --- a/src/xenia/gpu/d3d11/d3d11_shader_cache.h +++ b/src/xenia/gpu/d3d11/d3d11_shader_cache.h @@ -28,10 +28,10 @@ public: virtual ~D3D11ShaderCache(); protected: - virtual Shader* CreateCore( + Shader* CreateCore( xenos::XE_GPU_SHADER_TYPE type, const uint8_t* src_ptr, size_t length, - uint64_t hash); + uint64_t hash) override; protected: ID3D11Device* device_; diff --git a/src/xenia/gpu/d3d11/sources.gypi b/src/xenia/gpu/d3d11/sources.gypi index 46c391ee6..6dc7ae242 100644 --- a/src/xenia/gpu/d3d11/sources.gypi +++ b/src/xenia/gpu/d3d11/sources.gypi @@ -1,6 +1,10 @@ # Copyright 2013 Ben Vanik. All Rights Reserved. { 'sources': [ + 'd3d11_buffer.cc', + 'd3d11_buffer.h', + 'd3d11_buffer_cache.cc', + 'd3d11_buffer_cache.h', 'd3d11_geometry_shader.cc', 'd3d11_geometry_shader.h', 'd3d11_gpu-private.h', diff --git a/src/xenia/gpu/shader_cache.cc b/src/xenia/gpu/shader_cache.cc index 33033bc36..2c5e84294 100644 --- a/src/xenia/gpu/shader_cache.cc +++ b/src/xenia/gpu/shader_cache.cc @@ -30,7 +30,7 @@ Shader* ShaderCache::Create( const uint8_t* src_ptr, size_t length) { uint64_t hash = Hash(src_ptr, length); Shader* shader = CreateCore(type, src_ptr, length, hash); - map_.insert(pair(hash, shader)); + map_.insert({ hash, shader }); return shader; } @@ -45,7 +45,7 @@ Shader* ShaderCache::Find( XE_GPU_SHADER_TYPE type, const uint8_t* src_ptr, size_t length) { uint64_t hash = Hash(src_ptr, length); - unordered_map::iterator it = map_.find(hash); + auto it = map_.find(hash); if (it != map_.end()) { return it->second; } @@ -58,19 +58,17 @@ Shader* ShaderCache::FindOrCreate( SCOPE_profile_cpu_f("gpu"); uint64_t hash = Hash(src_ptr, length); - unordered_map::iterator it = map_.find(hash); + auto it = map_.find(hash); if (it != map_.end()) { return it->second; } Shader* shader = CreateCore(type, src_ptr, length, hash); - map_.insert(pair(hash, shader)); + map_.insert({ hash, shader }); return shader; } void ShaderCache::Clear() { - // TODO(benvanik): clear. - for (unordered_map::iterator it = map_.begin(); - it != map_.end(); ++it) { + for (auto it = map_.begin(); it != map_.end(); ++it) { Shader* shader = it->second; delete shader; } diff --git a/src/xenia/gpu/sources.gypi b/src/xenia/gpu/sources.gypi index 9309e0ec3..b2c9134c0 100644 --- a/src/xenia/gpu/sources.gypi +++ b/src/xenia/gpu/sources.gypi @@ -1,6 +1,10 @@ # Copyright 2013 Ben Vanik. All Rights Reserved. 
{ 'sources': [ + 'buffer.cc', + 'buffer.h', + 'buffer_cache.cc', + 'buffer_cache.h', 'command_buffer.h', 'gpu-private.h', 'gpu.cc', From 3a8065b7b13a05b1eb87d2b2c5ad988dd1bb45c8 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 1 Jun 2014 09:42:07 -0700 Subject: [PATCH 145/184] Vertex buffer caching. Doesn't help, though, as buffers are weird. Need to rethink all of this. --- src/xenia/gpu/buffer.cc | 6 +- src/xenia/gpu/buffer.h | 21 +++++- src/xenia/gpu/buffer_cache.cc | 22 ++++++ src/xenia/gpu/buffer_cache.h | 8 +++ src/xenia/gpu/d3d11/d3d11_buffer.cc | 73 +++++++++++++++++++- src/xenia/gpu/d3d11/d3d11_buffer.h | 12 ++++ src/xenia/gpu/d3d11/d3d11_buffer_cache.cc | 6 ++ src/xenia/gpu/d3d11/d3d11_buffer_cache.h | 3 + src/xenia/gpu/d3d11/d3d11_graphics_driver.cc | 70 ++++++++----------- 9 files changed, 175 insertions(+), 46 deletions(-) diff --git a/src/xenia/gpu/buffer.cc b/src/xenia/gpu/buffer.cc index 0b7fe9ad6..499cb43a6 100644 --- a/src/xenia/gpu/buffer.cc +++ b/src/xenia/gpu/buffer.cc @@ -33,8 +33,10 @@ IndexBuffer::IndexBuffer(const IndexBufferInfo& info, IndexBuffer::~IndexBuffer() {} -VertexBuffer::VertexBuffer(const uint8_t* src_ptr, size_t length) - : Buffer(src_ptr, length) { +VertexBuffer::VertexBuffer(const VertexBufferInfo& info, + const uint8_t* src_ptr, size_t length) + : Buffer(src_ptr, length), + info_(info) { } VertexBuffer::~VertexBuffer() {} diff --git a/src/xenia/gpu/buffer.h b/src/xenia/gpu/buffer.h index bc83ed20d..9c8e3c654 100644 --- a/src/xenia/gpu/buffer.h +++ b/src/xenia/gpu/buffer.h @@ -57,10 +57,29 @@ protected: }; +struct VertexBufferLayout { + uint32_t stride_words; + uint32_t element_count; + struct { + uint32_t format; + uint32_t offset_words; + uint32_t size_words; + } elements[16]; +}; + +struct VertexBufferInfo { + VertexBufferLayout layout; +}; + + class VertexBuffer : public Buffer { public: - VertexBuffer(const uint8_t* src_ptr, size_t length); + VertexBuffer(const VertexBufferInfo& info, + const uint8_t* src_ptr, size_t length); virtual ~VertexBuffer(); + +protected: + VertexBufferInfo info_; }; diff --git a/src/xenia/gpu/buffer_cache.cc b/src/xenia/gpu/buffer_cache.cc index 1f1d9ac00..cc963d817 100644 --- a/src/xenia/gpu/buffer_cache.cc +++ b/src/xenia/gpu/buffer_cache.cc @@ -47,6 +47,28 @@ IndexBuffer* BufferCache::FetchIndexBuffer( } } +VertexBuffer* BufferCache::FetchVertexBuffer( + const VertexBufferInfo& info, + const uint8_t* src_ptr, size_t length) { + size_t key = reinterpret_cast(src_ptr); + size_t hash = xe_hash64(src_ptr, length); + auto it = vertex_buffer_map_.find(key); + if (it != vertex_buffer_map_.end()) { + if (hash == it->second->hash()) { + return it->second; + } else { + return it->second->FetchDirty(hash) ? 
it->second : nullptr;
+ }
+ } else {
+ auto buffer = CreateVertexBuffer(info, src_ptr, length);
+ vertex_buffer_map_.insert({ key, buffer });
+ if (!buffer->FetchNew(hash)) {
+ return nullptr;
+ }
+ return buffer;
+ }
+}
+
 void BufferCache::Clear() {
 for (auto it = index_buffer_map_.begin();
 it != index_buffer_map_.end(); ++it) {
diff --git a/src/xenia/gpu/buffer_cache.h b/src/xenia/gpu/buffer_cache.h
index 21a057a0c..bcba6f9de 100644
--- a/src/xenia/gpu/buffer_cache.h
+++ b/src/xenia/gpu/buffer_cache.h
@@ -28,15 +28,23 @@ public:
 const IndexBufferInfo& info,
 const uint8_t* src_ptr, size_t length);
+ VertexBuffer* FetchVertexBuffer(
+ const VertexBufferInfo& info,
+ const uint8_t* src_ptr, size_t length);
+
 void Clear();
protected:
 virtual IndexBuffer* CreateIndexBuffer(
 const IndexBufferInfo& info,
 const uint8_t* src_ptr, size_t length) = 0;
+ virtual VertexBuffer* CreateVertexBuffer(
+ const VertexBufferInfo& info,
+ const uint8_t* src_ptr, size_t length) = 0;
private:
 std::unordered_map<size_t, IndexBuffer*> index_buffer_map_;
+ std::unordered_map<size_t, VertexBuffer*> vertex_buffer_map_;
};
diff --git a/src/xenia/gpu/d3d11/d3d11_buffer.cc b/src/xenia/gpu/d3d11/d3d11_buffer.cc
index 98ea3ba9b..84c0d901e 100644
--- a/src/xenia/gpu/d3d11/d3d11_buffer.cc
+++ b/src/xenia/gpu/d3d11/d3d11_buffer.cc
@@ -57,7 +57,13 @@ bool D3D11IndexBuffer::FetchDirty(uint64_t hash) {
 XEASSERT(info_.endianness == 0x2);

 D3D11_MAPPED_SUBRESOURCE res;
- buffer_cache_->context()->Map(handle_, 0, D3D11_MAP_WRITE_DISCARD, 0, &res);
+ HRESULT hr = buffer_cache_->context()->Map(
+ handle_, 0, D3D11_MAP_WRITE_DISCARD, 0, &res);
+ if (FAILED(hr)) {
+ XELOGE("D3D11: unable to map index buffer");
+ return false;
+ }
+
 if (info_.index_32bit) {
 const uint32_t* src = reinterpret_cast<const uint32_t*>(src_);
 uint32_t* dest = reinterpret_cast<uint32_t*>(res.pData);
@@ -77,3 +83,68 @@ bool D3D11IndexBuffer::FetchDirty(uint64_t hash) {
 return true;
}
+
+
+D3D11VertexBuffer::D3D11VertexBuffer(
+ D3D11BufferCache* buffer_cache,
+ const VertexBufferInfo& info,
+ const uint8_t* src_ptr, size_t length)
+ : VertexBuffer(info, src_ptr, length),
+ buffer_cache_(buffer_cache),
+ handle_(nullptr) {
+}
+
+D3D11VertexBuffer::~D3D11VertexBuffer() {
+ XESAFERELEASE(handle_);
+}
+
+bool D3D11VertexBuffer::FetchNew(uint64_t hash) {
+ hash_ = hash;
+
+ D3D11_BUFFER_DESC buffer_desc;
+ xe_zero_struct(&buffer_desc, sizeof(buffer_desc));
+ buffer_desc.ByteWidth = static_cast<UINT>(length_);
+ buffer_desc.Usage = D3D11_USAGE_DYNAMIC;
+ buffer_desc.BindFlags = D3D11_BIND_VERTEX_BUFFER;
+ buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
+ HRESULT hr = buffer_cache_->device()->CreateBuffer(&buffer_desc, NULL, &handle_);
+ if (FAILED(hr)) {
+ XELOGW("D3D11: failed to create vertex buffer");
+ return false;
+ }
+
+ return FetchDirty(hash);
+}
+
+bool D3D11VertexBuffer::FetchDirty(uint64_t hash) {
+ hash_ = hash;
+
+ D3D11_MAPPED_SUBRESOURCE res;
+ HRESULT hr = buffer_cache_->context()->Map(
+ handle_, 0, D3D11_MAP_WRITE_DISCARD, 0, &res);
+ if (FAILED(hr)) {
+ XELOGE("D3D11: unable to map vertex buffer");
+ return false;
+ }
+ uint8_t* dest = reinterpret_cast<uint8_t*>(res.pData);
+
+ // TODO(benvanik): rewrite to be faster/special case common/etc
+ uint32_t stride = info_.layout.stride_words;
+ size_t count = (length_ / 4) / stride;
+ for (size_t n = 0; n < info_.layout.element_count; n++) {
+ const auto& el = info_.layout.elements[n];
+ const uint32_t* src_ptr = (const uint32_t*)(src_ + el.offset_words * 4);
+ uint32_t* dest_ptr = (uint32_t*)(dest + el.offset_words * 4);
+ uint32_t o = 0;
+ for (uint32_t i = 0; i < count; i++) {
+ for
(uint32_t j = 0; j < el.size_words; j++) { + dest_ptr[o + j] = XESWAP32(src_ptr[o + j]); + } + o += stride; + } + } + + + buffer_cache_->context()->Unmap(handle_, 0); + return true; +} diff --git a/src/xenia/gpu/d3d11/d3d11_buffer.h b/src/xenia/gpu/d3d11/d3d11_buffer.h index 02160db2e..924fb3da4 100644 --- a/src/xenia/gpu/d3d11/d3d11_buffer.h +++ b/src/xenia/gpu/d3d11/d3d11_buffer.h @@ -45,7 +45,19 @@ private: class D3D11VertexBuffer : public VertexBuffer { public: + D3D11VertexBuffer(D3D11BufferCache* buffer_cache, + const VertexBufferInfo& info, + const uint8_t* src_ptr, size_t length); + virtual ~D3D11VertexBuffer(); + + ID3D11Buffer* handle() const { return handle_; } + + bool FetchNew(uint64_t hash) override; + bool FetchDirty(uint64_t hash) override; + private: + D3D11BufferCache* buffer_cache_; + ID3D11Buffer* handle_; }; diff --git a/src/xenia/gpu/d3d11/d3d11_buffer_cache.cc b/src/xenia/gpu/d3d11/d3d11_buffer_cache.cc index b6aac9d1b..48eb8fbf8 100644 --- a/src/xenia/gpu/d3d11/d3d11_buffer_cache.cc +++ b/src/xenia/gpu/d3d11/d3d11_buffer_cache.cc @@ -36,3 +36,9 @@ IndexBuffer* D3D11BufferCache::CreateIndexBuffer( const uint8_t* src_ptr, size_t length) { return new D3D11IndexBuffer(this, info, src_ptr, length); } + +VertexBuffer* D3D11BufferCache::CreateVertexBuffer( + const VertexBufferInfo& info, + const uint8_t* src_ptr, size_t length) { + return new D3D11VertexBuffer(this, info, src_ptr, length); +} diff --git a/src/xenia/gpu/d3d11/d3d11_buffer_cache.h b/src/xenia/gpu/d3d11/d3d11_buffer_cache.h index eca2f5b55..284536ab7 100644 --- a/src/xenia/gpu/d3d11/d3d11_buffer_cache.h +++ b/src/xenia/gpu/d3d11/d3d11_buffer_cache.h @@ -35,6 +35,9 @@ protected: IndexBuffer* CreateIndexBuffer( const IndexBufferInfo& info, const uint8_t* src_ptr, size_t length) override; + VertexBuffer* CreateVertexBuffer( + const VertexBufferInfo& info, + const uint8_t* src_ptr, size_t length) override; protected: ID3D11DeviceContext* context_; diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc b/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc index 209313091..886643e32 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc +++ b/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc @@ -933,6 +933,11 @@ int D3D11GraphicsDriver::PrepareFetchers() { int D3D11GraphicsDriver::PrepareVertexBuffer(Shader::vtx_buffer_desc_t& desc) { SCOPE_profile_cpu_f("gpu"); + D3D11VertexShader* vs = state_.vertex_shader; + if (!vs) { + return 1; + } + RegisterFile& rf = register_file_; int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + (desc.fetch_slot / 3) * 6; xe_gpu_fetch_group_t* group = (xe_gpu_fetch_group_t*)&rf.values[r]; @@ -953,56 +958,37 @@ int D3D11GraphicsDriver::PrepareVertexBuffer(Shader::vtx_buffer_desc_t& desc) { XEASSERT(fetch->type == 0x3); XEASSERTNOTZERO(fetch->size); - ID3D11Buffer* buffer = 0; - D3D11_BUFFER_DESC buffer_desc; - xe_zero_struct(&buffer_desc, sizeof(buffer_desc)); - buffer_desc.ByteWidth = fetch->size * 4; - buffer_desc.Usage = D3D11_USAGE_DYNAMIC; - buffer_desc.BindFlags = D3D11_BIND_VERTEX_BUFFER; - buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; - HRESULT hr = device_->CreateBuffer(&buffer_desc, NULL, &buffer); - if (FAILED(hr)) { + VertexBufferInfo info; + // TODO(benvanik): make these structs the same so we can share. 
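// Aside: to make the layout copy just below concrete, a hypothetical
// 24-byte vertex (three words of position, three of normal) would be
// described like this; the format value is a placeholder, not a real
// Xenos fetch format:
//
//   VertexBufferLayout layout = { 0 };
//   layout.stride_words = 6;    // 24 bytes / 4 bytes per word
//   layout.element_count = 2;
//   layout.elements[0] = { /* format */ 0, /* offset_words */ 0, /* size_words */ 3 };
//   layout.elements[1] = { /* format */ 0, /* offset_words */ 3, /* size_words */ 3 };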
+ info.layout.stride_words = desc.stride_words; + info.layout.element_count = desc.element_count; + for (uint32_t i = 0; i < desc.element_count; ++i) { + const auto& src_el = desc.elements[i]; + auto& dest_el = info.layout.elements[i]; + dest_el.format = src_el.format; + dest_el.offset_words = src_el.offset_words; + dest_el.size_words = src_el.size_words; + } + + uint32_t address = (fetch->address << 2) + address_translation_; + const uint8_t* src = reinterpret_cast( + memory_->Translate(address)); + + VertexBuffer* vertex_buffer = buffer_cache_->FetchVertexBuffer( + info, src, fetch->size * 4); + if (!vertex_buffer) { XELOGE("D3D11: unable to create vertex fetch buffer"); return 1; } - D3D11_MAPPED_SUBRESOURCE res; - hr = context_->Map(buffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &res); - if (FAILED(hr)) { - XELOGE("D3D11: unable to map vertex fetch buffer"); - XESAFERELEASE(buffer); - return 1; - } - uint32_t address = (fetch->address << 2) + address_translation_; - uint8_t* src = (uint8_t*)memory_->Translate(address); - uint8_t* dest = (uint8_t*)res.pData; - // TODO(benvanik): rewrite to be faster/special case common/etc - for (size_t n = 0; n < desc.element_count; n++) { - auto& el = desc.elements[n]; - uint32_t stride = desc.stride_words; - uint32_t count = fetch->size / stride; - uint32_t* src_ptr = (uint32_t*)(src + el.offset_words * 4); - uint32_t* dest_ptr = (uint32_t*)(dest + el.offset_words * 4); - uint32_t o = 0; - for (uint32_t i = 0; i < count; i++) { - for (uint32_t j = 0; j < el.size_words; j++) { - dest_ptr[o + j] = XESWAP32(src_ptr[o + j]); - } - o += stride; - } - } - context_->Unmap(buffer, 0); + auto d3d_vb = static_cast(vertex_buffer); - D3D11VertexShader* vs = state_.vertex_shader; - if (!vs) { - return 1; - } // TODO(benvanik): always dword aligned? uint32_t stride = desc.stride_words * 4; uint32_t offset = 0; int vb_slot = desc.input_index; - context_->IASetVertexBuffers(vb_slot, 1, &buffer, &stride, &offset); - - buffer->Release(); + ID3D11Buffer* buffers[] = { d3d_vb->handle() }; + context_->IASetVertexBuffers(vb_slot, XECOUNT(buffers), buffers, + &stride, &offset); return 0; } From 0e3854555d449cb9d00964ecd54809902f28d129 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 1 Jun 2014 23:36:18 -0700 Subject: [PATCH 146/184] Pure dynamic MMIO access. Prep for more complex GPU memory management. 
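The core of the change is a table of mapped ranges that is consulted for any
access falling outside normal RAM. A minimal standalone sketch of the same
pattern (MappedRange and DispatchRead/DispatchWrite are illustrative names,
not the ones the patch introduces; only the callback shapes mirror the patch):

#include <cstdint>
#include <vector>

// Callback shapes mirror the MMIOReadCallback/MMIOWriteCallback the patch
// registers via AddMappedRange().
typedef uint64_t (*MMIOReadCallback)(void* context, uint64_t address);
typedef void (*MMIOWriteCallback)(void* context, uint64_t address,
                                  uint64_t value);

struct MappedRange {  // stands in for the patch's MMIORange
  uint64_t address;   // base of the MMIO window, e.g. 0x7FEA0000 for audio
  uint64_t mask;      // bits that must match, e.g. 0xFFFF0000
  void* context;
  MMIOReadCallback read;
  MMIOWriteCallback write;
};

static std::vector<MappedRange> ranges;

// Ranges are few, so a linear scan is fine; the patch does the same inside
// its exception handler.
uint64_t DispatchRead(uint64_t address) {
  for (const auto& r : ranges) {
    if ((address & r.mask) == r.address) {
      return r.read(r.context, address);
    }
  }
  return 0;  // unmapped; a real implementation would fall back to RAM
}

void DispatchWrite(uint64_t address, uint64_t value) {
  for (const auto& r : ranges) {
    if ((address & r.mask) == r.address) {
      r.write(r.context, address, value);
      return;
    }
  }
}

Hiding the range check behind one Memory interface is also why the IVM
opcodes below can simply call memory()->LoadI32()/StoreI32() instead of
walking per-opcode callback lists at translation time.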
--- src/alloy/backend/ivm/ivm_assembler.cc | 1 - src/alloy/backend/ivm/ivm_function.cc | 1 - src/alloy/backend/ivm/ivm_intcode.cc | 263 +++---------------------- src/alloy/backend/ivm/ivm_intcode.h | 4 - src/alloy/backend/x64/x64_sequences.cc | 247 +++-------------------- src/alloy/memory.h | 9 + src/alloy/runtime/register_access.h | 38 ---- src/alloy/runtime/runtime.cc | 19 +- src/alloy/runtime/runtime.h | 9 - src/alloy/runtime/sources.gypi | 1 - src/xenia/apu/audio_system.cc | 17 +- src/xenia/apu/audio_system.h | 10 +- src/xenia/cpu/processor.cc | 5 - src/xenia/cpu/processor.h | 8 - src/xenia/cpu/xenon_memory.cc | 220 +++++++++++++++++++++ src/xenia/cpu/xenon_memory.h | 37 +++- src/xenia/emulator.h | 5 +- src/xenia/gpu/graphics_system.cc | 17 +- src/xenia/gpu/graphics_system.h | 10 +- 19 files changed, 335 insertions(+), 586 deletions(-) delete mode 100644 src/alloy/runtime/register_access.h diff --git a/src/alloy/backend/ivm/ivm_assembler.cc b/src/alloy/backend/ivm/ivm_assembler.cc index a95237f2e..0431f7ab2 100644 --- a/src/alloy/backend/ivm/ivm_assembler.cc +++ b/src/alloy/backend/ivm/ivm_assembler.cc @@ -61,7 +61,6 @@ int IVMAssembler::Assemble( fn->set_debug_info(debug_info); TranslationContext ctx; - ctx.access_callbacks = backend_->runtime()->access_callbacks(); ctx.register_count = 0; ctx.intcode_count = 0; ctx.intcode_arena = &intcode_arena_; diff --git a/src/alloy/backend/ivm/ivm_function.cc b/src/alloy/backend/ivm/ivm_function.cc index ff60f5994..72c564210 100644 --- a/src/alloy/backend/ivm/ivm_function.cc +++ b/src/alloy/backend/ivm/ivm_function.cc @@ -120,7 +120,6 @@ int IVMFunction::CallImpl(ThreadState* thread_state, uint64_t return_address) { ics.membase = memory->membase(); ics.did_carry = 0; ics.did_saturate = 0; - ics.access_callbacks = thread_state->runtime()->access_callbacks(); ics.thread_state = thread_state; ics.return_address = return_address; ics.call_return_address = 0; diff --git a/src/alloy/backend/ivm/ivm_intcode.cc b/src/alloy/backend/ivm/ivm_intcode.cc index 0542a7277..1f24bc6ea 100644 --- a/src/alloy/backend/ivm/ivm_intcode.cc +++ b/src/alloy/backend/ivm/ivm_intcode.cc @@ -196,213 +196,6 @@ int DispatchToC(TranslationContext& ctx, Instr* i, IntCodeFn fn) { return 0; } -uint32_t IntCode_LOAD_REGISTER_I8(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = (RegisterAccessCallbacks*) - (i->src2_reg | ((uint64_t)i->src3_reg << 32)); - ics.rf[i->dest_reg].i8 = (int8_t)cbs->read(cbs->context, address); - return IA_NEXT; -} -uint32_t IntCode_LOAD_REGISTER_I16(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = (RegisterAccessCallbacks*) - (i->src2_reg | ((uint64_t)i->src3_reg << 32)); - ics.rf[i->dest_reg].i16 = XESWAP16((int16_t)cbs->read(cbs->context, address)); - return IA_NEXT; -} -uint32_t IntCode_LOAD_REGISTER_I32(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = (RegisterAccessCallbacks*) - (i->src2_reg | ((uint64_t)i->src3_reg << 32)); - ics.rf[i->dest_reg].i32 = XESWAP32((int32_t)cbs->read(cbs->context, address)); - return IA_NEXT; -} -uint32_t IntCode_LOAD_REGISTER_I64(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = (RegisterAccessCallbacks*) - (i->src2_reg | ((uint64_t)i->src3_reg << 32)); - ics.rf[i->dest_reg].i64 = XESWAP64((int64_t)cbs->read(cbs->context, address)); - return IA_NEXT; 
-} -int DispatchRegisterRead( - TranslationContext& ctx, Instr* i, RegisterAccessCallbacks* cbs) { - static IntCodeFn fns[] = { - IntCode_LOAD_REGISTER_I8, - IntCode_LOAD_REGISTER_I16, - IntCode_LOAD_REGISTER_I32, - IntCode_LOAD_REGISTER_I64, - IntCode_INVALID_TYPE, - IntCode_INVALID_TYPE, - IntCode_INVALID_TYPE, - }; - IntCodeFn fn = fns[i->dest->type]; - XEASSERT(fn != IntCode_INVALID_TYPE); - uint32_t dest_reg = AllocDynamicRegister(ctx, i->dest); - uint32_t src1_reg = AllocOpRegister(ctx, OPCODE_SIG_TYPE_V, &i->src1); - ctx.intcode_count++; - IntCode* ic = ctx.intcode_arena->Alloc(); - ic->intcode_fn = fn; - ic->flags = i->flags; - ic->debug_flags = 0; - ic->dest_reg = dest_reg; - ic->src1_reg = src1_reg; - ic->src2_reg = (uint32_t)((uint64_t)cbs); - ic->src3_reg = (uint32_t)(((uint64_t)cbs) >> 32); - return 0; -} -uint32_t IntCode_LOAD_REGISTER_I8_DYNAMIC(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = ics.access_callbacks; - while (cbs) { - if (cbs->handles(cbs->context, address)) { - ics.rf[i->dest_reg].i8 = (int8_t)cbs->read(cbs->context, address); - return IA_NEXT; - } - cbs = cbs->next; - } - return IA_NEXT; -} -uint32_t IntCode_LOAD_REGISTER_I16_DYNAMIC(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = ics.access_callbacks; - while (cbs) { - if (cbs->handles(cbs->context, address)) { - ics.rf[i->dest_reg].i16 = XESWAP16((int16_t)cbs->read(cbs->context, address)); - return IA_NEXT; - } - cbs = cbs->next; - } - return IA_NEXT; -} -uint32_t IntCode_LOAD_REGISTER_I32_DYNAMIC(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = ics.access_callbacks; - while (cbs) { - if (cbs->handles(cbs->context, address)) { - ics.rf[i->dest_reg].i32 = XESWAP32((int32_t)cbs->read(cbs->context, address)); - return IA_NEXT; - } - cbs = cbs->next; - } - return IA_NEXT; -} -uint32_t IntCode_LOAD_REGISTER_I64_DYNAMIC(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = ics.access_callbacks; - while (cbs) { - if (cbs->handles(cbs->context, address)) { - ics.rf[i->dest_reg].i64 = XESWAP64((int64_t)cbs->read(cbs->context, address)); - return IA_NEXT; - } - cbs = cbs->next; - } - return IA_NEXT; -} - -uint32_t IntCode_STORE_REGISTER_I8(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = (RegisterAccessCallbacks*) - (i->src3_reg | ((uint64_t)i->dest_reg << 32)); - cbs->write(cbs->context, address, ics.rf[i->src2_reg].i8); - return IA_NEXT; -} -uint32_t IntCode_STORE_REGISTER_I16(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = (RegisterAccessCallbacks*) - (i->src3_reg | ((uint64_t)i->dest_reg << 32)); - cbs->write(cbs->context, address, XESWAP16(ics.rf[i->src2_reg].i16)); - return IA_NEXT; -} -uint32_t IntCode_STORE_REGISTER_I32(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = (RegisterAccessCallbacks*) - (i->src3_reg | ((uint64_t)i->dest_reg << 32)); - cbs->write(cbs->context, address, XESWAP32(ics.rf[i->src2_reg].i32)); - return IA_NEXT; -} -uint32_t IntCode_STORE_REGISTER_I64(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = (RegisterAccessCallbacks*) - (i->src3_reg | 
((uint64_t)i->dest_reg << 32)); - cbs->write(cbs->context, address, XESWAP64(ics.rf[i->src2_reg].i64)); - return IA_NEXT; -} -int DispatchRegisterWrite( - TranslationContext& ctx, Instr* i, RegisterAccessCallbacks* cbs) { - static IntCodeFn fns[] = { - IntCode_STORE_REGISTER_I8, - IntCode_STORE_REGISTER_I16, - IntCode_STORE_REGISTER_I32, - IntCode_STORE_REGISTER_I64, - IntCode_INVALID_TYPE, - IntCode_INVALID_TYPE, - IntCode_INVALID_TYPE, - }; - IntCodeFn fn = fns[i->src2.value->type]; - XEASSERT(fn != IntCode_INVALID_TYPE); - uint32_t src1_reg = AllocOpRegister(ctx, OPCODE_SIG_TYPE_V, &i->src1); - uint32_t src2_reg = AllocOpRegister(ctx, OPCODE_SIG_TYPE_V, &i->src2); - ctx.intcode_count++; - IntCode* ic = ctx.intcode_arena->Alloc(); - ic->intcode_fn = fn; - ic->flags = i->flags; - ic->debug_flags = 0; - ic->dest_reg = (uint32_t)(((uint64_t)cbs) >> 32); - ic->src1_reg = src1_reg; - ic->src2_reg = src2_reg; - ic->src3_reg = (uint32_t)((uint64_t)cbs); - return 0; -} -uint32_t IntCode_STORE_REGISTER_I8_DYNAMIC(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = ics.access_callbacks; - while (cbs) { - if (cbs->handles(cbs->context, address)) { - cbs->write(cbs->context, address, ics.rf[i->src2_reg].i8); - return IA_NEXT; - } - cbs = cbs->next; - } - return IA_NEXT; -} -uint32_t IntCode_STORE_REGISTER_I16_DYNAMIC(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = ics.access_callbacks; - while (cbs) { - if (cbs->handles(cbs->context, address)) { - cbs->write(cbs->context, address, XESWAP16(ics.rf[i->src2_reg].i16)); - return IA_NEXT; - } - cbs = cbs->next; - } - return IA_NEXT; -} -uint32_t IntCode_STORE_REGISTER_I32_DYNAMIC(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = ics.access_callbacks; - while (cbs) { - if (cbs->handles(cbs->context, address)) { - cbs->write(cbs->context, address, XESWAP32(ics.rf[i->src2_reg].i32)); - return IA_NEXT; - } - cbs = cbs->next; - } - return IA_NEXT; -} -uint32_t IntCode_STORE_REGISTER_I64_DYNAMIC(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = ics.access_callbacks; - while (cbs) { - if (cbs->handles(cbs->context, address)) { - cbs->write(cbs->context, address, XESWAP64(ics.rf[i->src2_reg].i64)); - return IA_NEXT; - } - cbs = cbs->next; - } - return IA_NEXT; -} - - uint32_t IntCode_INVALID(IntCodeState& ics, const IntCode* i) { XEASSERTALWAYS(); return IA_NEXT; @@ -1549,7 +1342,8 @@ int Translate_STORE_CONTEXT(TranslationContext& ctx, Instr* i) { uint32_t IntCode_LOAD_I8(IntCodeState& ics, const IntCode* i) { uint32_t address = ics.rf[i->src1_reg].u32; if (DYNAMIC_REGISTER_ACCESS_CHECK(address)) { - return IntCode_LOAD_REGISTER_I8_DYNAMIC(ics, i); + ics.rf[i->dest_reg].i8 = ics.thread_state->memory()->LoadI8(address); + return IA_NEXT; } DPRINT("%d (%X) = load.i8 %.8X\n", *((int8_t*)(ics.membase + address)), @@ -1562,7 +1356,9 @@ uint32_t IntCode_LOAD_I8(IntCodeState& ics, const IntCode* i) { uint32_t IntCode_LOAD_I16(IntCodeState& ics, const IntCode* i) { uint32_t address = ics.rf[i->src1_reg].u32; if (DYNAMIC_REGISTER_ACCESS_CHECK(address)) { - return IntCode_LOAD_REGISTER_I16_DYNAMIC(ics, i); + ics.rf[i->dest_reg].i16 = + XESWAP16(ics.thread_state->memory()->LoadI16(address)); + return IA_NEXT; } DPRINT("%d (%X) = load.i16 %.8X\n", *((int16_t*)(ics.membase + address)), @@ -1575,7 +1371,9 @@ 
uint32_t IntCode_LOAD_I16(IntCodeState& ics, const IntCode* i) { uint32_t IntCode_LOAD_I32(IntCodeState& ics, const IntCode* i) { uint32_t address = ics.rf[i->src1_reg].u32; if (DYNAMIC_REGISTER_ACCESS_CHECK(address)) { - return IntCode_LOAD_REGISTER_I32_DYNAMIC(ics, i); + ics.rf[i->dest_reg].i32 = + XESWAP32(ics.thread_state->memory()->LoadI32(address)); + return IA_NEXT; } DFLUSH(); DPRINT("%d (%X) = load.i32 %.8X\n", @@ -1589,7 +1387,9 @@ uint32_t IntCode_LOAD_I32(IntCodeState& ics, const IntCode* i) { uint32_t IntCode_LOAD_I64(IntCodeState& ics, const IntCode* i) { uint32_t address = ics.rf[i->src1_reg].u32; if (DYNAMIC_REGISTER_ACCESS_CHECK(address)) { - return IntCode_LOAD_REGISTER_I64(ics, i); + ics.rf[i->dest_reg].i64 = + XESWAP64(ics.thread_state->memory()->LoadI64(address)); + return IA_NEXT; } DPRINT("%lld (%llX) = load.i64 %.8X\n", *((int64_t*)(ics.membase + address)), @@ -1642,26 +1442,14 @@ int Translate_LOAD(TranslationContext& ctx, Instr* i) { IntCode_LOAD_F64, IntCode_LOAD_V128, }; - if (i->src1.value->IsConstant()) { - // Constant address - check register access callbacks. - // NOTE: we still will likely want to check on access in debug mode, as - // constant propagation may not have happened. - uint64_t address = i->src1.value->AsUint64(); - RegisterAccessCallbacks* cbs = ctx.access_callbacks; - while (cbs) { - if (cbs->handles(cbs->context, address)) { - return DispatchRegisterRead(ctx, i, cbs); - } - cbs = cbs->next; - } - } return DispatchToC(ctx, i, fns[i->dest->type]); } uint32_t IntCode_STORE_I8(IntCodeState& ics, const IntCode* i) { uint32_t address = ics.rf[i->src1_reg].u32; if (DYNAMIC_REGISTER_ACCESS_CHECK(address)) { - return IntCode_STORE_REGISTER_I8_DYNAMIC(ics, i); + ics.thread_state->memory()->StoreI8(address, ics.rf[i->src2_reg].i8); + return IA_NEXT; } DPRINT("store.i8 %.8X = %d (%X)\n", address, ics.rf[i->src2_reg].i8, ics.rf[i->src2_reg].u8); @@ -1672,7 +1460,9 @@ uint32_t IntCode_STORE_I8(IntCodeState& ics, const IntCode* i) { uint32_t IntCode_STORE_I16(IntCodeState& ics, const IntCode* i) { uint32_t address = ics.rf[i->src1_reg].u32; if (DYNAMIC_REGISTER_ACCESS_CHECK(address)) { - return IntCode_STORE_REGISTER_I16_DYNAMIC(ics, i); + ics.thread_state->memory()->StoreI16(address, + XESWAP16(ics.rf[i->src2_reg].i16)); + return IA_NEXT; } DPRINT("store.i16 %.8X = %d (%X)\n", address, ics.rf[i->src2_reg].i16, ics.rf[i->src2_reg].u16); @@ -1683,7 +1473,9 @@ uint32_t IntCode_STORE_I16(IntCodeState& ics, const IntCode* i) { uint32_t IntCode_STORE_I32(IntCodeState& ics, const IntCode* i) { uint32_t address = ics.rf[i->src1_reg].u32; if (DYNAMIC_REGISTER_ACCESS_CHECK(address)) { - return IntCode_STORE_REGISTER_I32_DYNAMIC(ics, i); + ics.thread_state->memory()->StoreI32(address, + XESWAP32(ics.rf[i->src2_reg].i32)); + return IA_NEXT; } DPRINT("store.i32 %.8X = %d (%X)\n", address, ics.rf[i->src2_reg].i32, ics.rf[i->src2_reg].u32); @@ -1694,7 +1486,9 @@ uint32_t IntCode_STORE_I32(IntCodeState& ics, const IntCode* i) { uint32_t IntCode_STORE_I64(IntCodeState& ics, const IntCode* i) { uint32_t address = ics.rf[i->src1_reg].u32; if (DYNAMIC_REGISTER_ACCESS_CHECK(address)) { - return IntCode_STORE_REGISTER_I64_DYNAMIC(ics, i); + ics.thread_state->memory()->StoreI64(address, + XESWAP64(ics.rf[i->src2_reg].i64)); + return IA_NEXT; } DPRINT("store.i64 %.8X = %lld (%llX)\n", address, ics.rf[i->src2_reg].i64, ics.rf[i->src2_reg].u64); @@ -1738,19 +1532,6 @@ int Translate_STORE(TranslationContext& ctx, Instr* i) { IntCode_STORE_F64, IntCode_STORE_V128, }; - if 
(i->src1.value->IsConstant()) { - // Constant address - check register access callbacks. - // NOTE: we still will likely want to check on access in debug mode, as - // constant propagation may not have happened. - uint64_t address = i->src1.value->AsUint64(); - RegisterAccessCallbacks* cbs = ctx.access_callbacks; - while (cbs) { - if (cbs->handles(cbs->context, address)) { - return DispatchRegisterWrite(ctx, i, cbs); - } - cbs = cbs->next; - } - } return DispatchToC(ctx, i, fns[i->src2.value->type]); } diff --git a/src/alloy/backend/ivm/ivm_intcode.h b/src/alloy/backend/ivm/ivm_intcode.h index ded43d5e1..340bb4dd3 100644 --- a/src/alloy/backend/ivm/ivm_intcode.h +++ b/src/alloy/backend/ivm/ivm_intcode.h @@ -14,7 +14,6 @@ #include #include -#include namespace alloy { namespace runtime { class ThreadState; } } @@ -46,7 +45,6 @@ typedef struct { uint8_t* membase; int8_t did_carry; int8_t did_saturate; - runtime::RegisterAccessCallbacks* access_callbacks; runtime::ThreadState* thread_state; uint64_t return_address; uint64_t call_return_address; @@ -97,8 +95,6 @@ typedef struct SourceMapEntry_s { typedef struct { - runtime::RegisterAccessCallbacks* access_callbacks; - uint32_t register_count; size_t intcode_count; Arena* intcode_arena; diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index 8af3c5669..e12491f6a 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -1456,42 +1456,6 @@ EMITTER_OPCODE_TABLE( // ============================================================================ // Note: most *should* be aligned, but needs to be checked! template -bool CheckLoadAccessCallback(X64Emitter& e, const T& i) { - // If this is a constant address load, check to see if it's in a - // register range. We'll also probably want a dynamic check for - // unverified stores. So far, most games use constants. - if (!i.src1.is_constant) { - return false; - } - uint64_t address = i.src1.constant() & 0xFFFFFFFF; - auto cbs = e.runtime()->access_callbacks(); - while (cbs) { - if (cbs->handles(cbs->context, address)) { - e.mov(e.rcx, reinterpret_cast(cbs->context)); - e.mov(e.rdx, address); - e.CallNative(cbs->read); - if (T::dest_type == KEY_TYPE_V_I8) { - // No swap required. - e.mov(i.dest, e.al); - } else if (T::dest_type == KEY_TYPE_V_I16) { - e.ror(e.ax, 8); - e.mov(i.dest, e.ax); - } else if (T::dest_type == KEY_TYPE_V_I32) { - e.bswap(e.eax); - e.mov(i.dest, e.eax); - } else if (T::dest_type == KEY_TYPE_V_I64) { - e.bswap(e.rax); - e.mov(i.dest, e.rax); - } else { - XEASSERTALWAYS(); - } - return true; - } - cbs = cbs->next; - } - return false; -} -template RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) { if (guest.is_constant) { // TODO(benvanik): figure out how to do this without a temp. 
@@ -1506,128 +1470,12 @@ RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) { return e.rdx + e.rax; } } -uint64_t DynamicRegisterLoad(void* raw_context, uint32_t address) { - auto thread_state = *((ThreadState**)raw_context); - auto cbs = thread_state->runtime()->access_callbacks(); - while (cbs) { - if (cbs->handles(cbs->context, address)) { - return cbs->read(cbs->context, address); - } - cbs = cbs->next; - } - return 0; -} -void DynamicRegisterStore(void* raw_context, uint32_t address, uint64_t value) { - auto thread_state = *((ThreadState**)raw_context); - auto cbs = thread_state->runtime()->access_callbacks(); - while (cbs) { - if (cbs->handles(cbs->context, address)) { - cbs->write(cbs->context, address, value); - return; - } - cbs = cbs->next; - } -} -template -void EmitLoadCheck(X64Emitter& e, const I64<>& addr_value, DEST_REG& dest) { - // rax = reserved - // if (address >> 24 == 0x7F) call register load handler; - auto addr = ComputeMemoryAddress(e, addr_value); - e.lea(e.r8d, e.ptr[addr]); - e.shr(e.r8d, 24); - e.cmp(e.r8b, 0x7F); - e.inLocalLabel(); - Xbyak::Label normal_addr; - Xbyak::Label skip_load; - e.jne(normal_addr); - e.lea(e.rdx, e.ptr[addr]); - e.CallNative(DynamicRegisterLoad); - if (DEST_REG::key_type == KEY_TYPE_V_I32) { - e.bswap(e.eax); - e.mov(dest, e.eax); - } - e.jmp(skip_load); - e.L(normal_addr); - if (DEST_REG::key_type == KEY_TYPE_V_I32) { - e.mov(dest, e.dword[addr]); - } - if (IsTracingData()) { - e.mov(e.r8, dest); - e.lea(e.rdx, e.ptr[addr]); - if (DEST_REG::key_type == KEY_TYPE_V_I32) { - e.CallNative(TraceMemoryLoadI32); - } else if (DEST_REG::key_type == KEY_TYPE_V_I64) { - e.CallNative(TraceMemoryLoadI64); - } - } - e.L(skip_load); - e.outLocalLabel(); -} -template -void EmitStoreCheck(X64Emitter& e, const I64<>& addr_value, SRC_REG& src) { - // rax = reserved - // if (address >> 24 == 0x7F) call register store handler; - auto addr = ComputeMemoryAddress(e, addr_value); - e.lea(e.r8d, e.ptr[addr]); - e.shr(e.r8d, 24); - e.cmp(e.r8b, 0x7F); - e.inLocalLabel(); - Xbyak::Label normal_addr; - Xbyak::Label skip_load; - e.jne(normal_addr); - e.lea(e.rdx, e.ptr[addr]); - if (SRC_REG::key_type == KEY_TYPE_V_I32) { - if (src.is_constant) { - e.mov(e.r8d, XESWAP32(static_cast(src.constant()))); - } else { - e.mov(e.r8d, src); - e.bswap(e.r8d); - } - } else if (SRC_REG::key_type == KEY_TYPE_V_I64) { - if (src.is_constant) { - e.mov(e.r8, XESWAP64(static_cast(src.constant()))); - } else { - e.mov(e.r8, src); - e.bswap(e.r8); - } - } - e.CallNative(DynamicRegisterStore); - e.jmp(skip_load); - e.L(normal_addr); - if (SRC_REG::key_type == KEY_TYPE_V_I32) { - if (src.is_constant) { - e.mov(e.dword[addr], src.constant()); - } else { - e.mov(e.dword[addr], src); - } - } else if (SRC_REG::key_type == KEY_TYPE_V_I64) { - if (src.is_constant) { - e.MovMem64(addr, src.constant()); - } else { - e.mov(e.qword[addr], src); - } - } - if (IsTracingData()) { - e.mov(e.r8, e.qword[addr]); - e.lea(e.rdx, e.ptr[addr]); - if (SRC_REG::key_type == KEY_TYPE_V_I32) { - e.CallNative(TraceMemoryStoreI32); - } else if (SRC_REG::key_type == KEY_TYPE_V_I64) { - e.CallNative(TraceMemoryStoreI64); - } - } - e.L(skip_load); - e.outLocalLabel(); -} EMITTER(LOAD_I8, MATCH(I, I64<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - if (CheckLoadAccessCallback(e, i)) { - return; - } auto addr = ComputeMemoryAddress(e, i.src1); e.mov(i.dest, e.byte[addr]); if (IsTracingData()) { - e.mov(e.r8, i.dest); + e.mov(e.r8b, i.dest); e.lea(e.rdx, e.ptr[addr]); 
e.CallNative(TraceMemoryLoadI8); } @@ -1635,13 +1483,10 @@ EMITTER(LOAD_I8, MATCH(I, I64<>>)) { }; EMITTER(LOAD_I16, MATCH(I, I64<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - if (CheckLoadAccessCallback(e, i)) { - return; - } auto addr = ComputeMemoryAddress(e, i.src1); e.mov(i.dest, e.word[addr]); if (IsTracingData()) { - e.mov(e.r8, i.dest); + e.mov(e.r8w, i.dest); e.lea(e.rdx, e.ptr[addr]); e.CallNative(TraceMemoryLoadI16); } @@ -1649,17 +1494,17 @@ EMITTER(LOAD_I16, MATCH(I, I64<>>)) { }; EMITTER(LOAD_I32, MATCH(I, I64<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - if (CheckLoadAccessCallback(e, i)) { - return; + auto addr = ComputeMemoryAddress(e, i.src1); + e.mov(i.dest, e.dword[addr]); + if (IsTracingData()) { + e.mov(e.r8d, i.dest); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(TraceMemoryLoadI32); } - EmitLoadCheck(e, i.src1, i.dest); } }; EMITTER(LOAD_I64, MATCH(I, I64<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - if (CheckLoadAccessCallback(e, i)) { - return; - } auto addr = ComputeMemoryAddress(e, i.src1); e.mov(i.dest, e.qword[addr]); if (IsTracingData()) { @@ -1718,51 +1563,8 @@ EMITTER_OPCODE_TABLE( // OPCODE_STORE // ============================================================================ // Note: most *should* be aligned, but needs to be checked! -template -bool CheckStoreAccessCallback(X64Emitter& e, const T& i) { - // If this is a constant address store, check to see if it's in a - // register range. We'll also probably want a dynamic check for - // unverified stores. So far, most games use constants. - if (!i.src1.is_constant) { - return false; - } - uint64_t address = i.src1.constant() & 0xFFFFFFFF; - auto cbs = e.runtime()->access_callbacks(); - while (cbs) { - if (cbs->handles(cbs->context, address)) { - e.mov(e.rcx, reinterpret_cast(cbs->context)); - e.mov(e.rdx, address); - if (i.src2.is_constant) { - e.mov(e.r8, i.src2.constant()); - } else { - if (T::src2_type == KEY_TYPE_V_I8) { - // No swap required. 
- e.movzx(e.r8, i.src2.reg().cvt8()); - } else if (T::src2_type == KEY_TYPE_V_I16) { - e.movzx(e.r8, i.src2.reg().cvt16()); - e.ror(e.r8w, 8); - } else if (T::src2_type == KEY_TYPE_V_I32) { - e.mov(e.r8d, i.src2.reg().cvt32()); - e.bswap(e.r8d); - } else if (T::src2_type == KEY_TYPE_V_I64) { - e.mov(e.r8, i.src2); - e.bswap(e.r8); - } else { - XEASSERTALWAYS(); - } - } - e.CallNative(cbs->write); - return true; - } - cbs = cbs->next; - } - return false; -} EMITTER(STORE_I8, MATCH(I, I8<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - if (CheckStoreAccessCallback(e, i)) { - return; - } auto addr = ComputeMemoryAddress(e, i.src1); if (i.src2.is_constant) { e.mov(e.byte[addr], i.src2.constant()); @@ -1770,7 +1572,7 @@ EMITTER(STORE_I8, MATCH(I, I8<>>)) { e.mov(e.byte[addr], i.src2); } if (IsTracingData()) { - e.mov(e.r8, e.byte[addr]); + e.mov(e.r8b, e.byte[addr]); e.lea(e.rdx, e.ptr[addr]); e.CallNative(TraceMemoryStoreI8); } @@ -1778,9 +1580,6 @@ EMITTER(STORE_I8, MATCH(I, I8<>>)) { }; EMITTER(STORE_I16, MATCH(I, I16<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - if (CheckStoreAccessCallback(e, i)) { - return; - } auto addr = ComputeMemoryAddress(e, i.src1); if (i.src2.is_constant) { e.mov(e.word[addr], i.src2.constant()); @@ -1788,7 +1587,7 @@ EMITTER(STORE_I16, MATCH(I, I16<>>)) { e.mov(e.word[addr], i.src2); } if (IsTracingData()) { - e.mov(e.r8, e.word[addr]); + e.mov(e.r8w, e.word[addr]); e.lea(e.rdx, e.ptr[addr]); e.CallNative(TraceMemoryStoreI16); } @@ -1796,18 +1595,32 @@ EMITTER(STORE_I16, MATCH(I, I16<>>)) { }; EMITTER(STORE_I32, MATCH(I, I32<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - if (CheckStoreAccessCallback(e, i)) { - return; + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.src2.is_constant) { + e.mov(e.dword[addr], i.src2.constant()); + } else { + e.mov(e.dword[addr], i.src2); + } + if (IsTracingData()) { + e.mov(e.r8d, e.dword[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(TraceMemoryStoreI32); } - EmitStoreCheck(e, i.src1, i.src2); } }; EMITTER(STORE_I64, MATCH(I, I64<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - if (CheckStoreAccessCallback(e, i)) { - return; + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.src2.is_constant) { + e.MovMem64(addr, i.src2.constant()); + } else { + e.mov(e.qword[addr], i.src2); + } + if (IsTracingData()) { + e.mov(e.r8, e.qword[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(TraceMemoryStoreI64); } - EmitStoreCheck(e, i.src1, i.src2); } }; EMITTER(STORE_F32, MATCH(I, F32<>>)) { diff --git a/src/alloy/memory.h b/src/alloy/memory.h index 9fa8c11fd..d51d4dc65 100644 --- a/src/alloy/memory.h +++ b/src/alloy/memory.h @@ -43,6 +43,15 @@ public: uint64_t SearchAligned(uint64_t start, uint64_t end, const uint32_t* values, size_t value_count); + virtual uint8_t LoadI8(uint64_t address) = 0; + virtual uint16_t LoadI16(uint64_t address) = 0; + virtual uint32_t LoadI32(uint64_t address) = 0; + virtual uint64_t LoadI64(uint64_t address) = 0; + virtual void StoreI8(uint64_t address, uint8_t value) = 0; + virtual void StoreI16(uint64_t address, uint16_t value) = 0; + virtual void StoreI32(uint64_t address, uint32_t value) = 0; + virtual void StoreI64(uint64_t address, uint64_t value) = 0; + virtual uint64_t HeapAlloc( uint64_t base_address, size_t size, uint32_t flags, uint32_t alignment = 0x20) = 0; diff --git a/src/alloy/runtime/register_access.h b/src/alloy/runtime/register_access.h deleted file mode 100644 index 21e3f1549..000000000 --- 
a/src/alloy/runtime/register_access.h +++ /dev/null @@ -1,38 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. * - ****************************************************************************** - */ - -#ifndef ALLOY_RUNTIME_REGISTER_ACCESS_H_ -#define ALLOY_RUNTIME_REGISTER_ACCESS_H_ - -#include - - -namespace alloy { -namespace runtime { - -typedef bool (*RegisterHandlesCallback)(void* context, uint64_t addr); -typedef uint64_t (*RegisterReadCallback)(void* context, uint64_t addr); -typedef void (*RegisterWriteCallback)(void* context, uint64_t addr, - uint64_t value); - -typedef struct RegisterAccessCallbacks_s { - void* context; - RegisterHandlesCallback handles; - RegisterReadCallback read; - RegisterWriteCallback write; - - RegisterAccessCallbacks_s* next; -} RegisterAccessCallbacks; - - -} // namespace runtime -} // namespace alloy - - -#endif // ALLOY_RUNTIME_REGISTER_ACCESS_H_ diff --git a/src/alloy/runtime/runtime.cc b/src/alloy/runtime/runtime.cc index 1aff92e04..db5e52d7d 100644 --- a/src/alloy/runtime/runtime.cc +++ b/src/alloy/runtime/runtime.cc @@ -25,8 +25,7 @@ DEFINE_string(runtime_backend, "any", Runtime::Runtime(Memory* memory) : - memory_(memory), debugger_(0), backend_(0), frontend_(0), - access_callbacks_(0) { + memory_(memory), debugger_(0), backend_(0), frontend_(0) { tracing::Initialize(); modules_lock_ = AllocMutex(10000); } @@ -41,14 +40,6 @@ Runtime::~Runtime() { UnlockMutex(modules_lock_); FreeMutex(modules_lock_); - RegisterAccessCallbacks* cbs = access_callbacks_; - while (cbs) { - RegisterAccessCallbacks* next = cbs->next; - delete cbs; - cbs = next; - } - access_callbacks_ = NULL; - delete frontend_; delete backend_; delete debugger_; @@ -281,11 +272,3 @@ int Runtime::DemandFunction( return 0; } - -void Runtime::AddRegisterAccessCallbacks( - const RegisterAccessCallbacks& callbacks) { - RegisterAccessCallbacks* cbs = new RegisterAccessCallbacks(); - xe_copy_struct(cbs, &callbacks, sizeof(callbacks)); - cbs->next = access_callbacks_; - access_callbacks_ = cbs; -} diff --git a/src/alloy/runtime/runtime.h b/src/alloy/runtime/runtime.h index 3ccd82fb6..a6c506fc5 100644 --- a/src/alloy/runtime/runtime.h +++ b/src/alloy/runtime/runtime.h @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -38,9 +37,6 @@ public: Debugger* debugger() const { return debugger_; } frontend::Frontend* frontend() const { return frontend_; } backend::Backend* backend() const { return backend_; } - RegisterAccessCallbacks* access_callbacks() const { - return access_callbacks_; - } int Initialize(frontend::Frontend* frontend, backend::Backend* backend = 0); @@ -55,9 +51,6 @@ public: FunctionInfo** out_symbol_info); int ResolveFunction(uint64_t address, Function** out_function); - void AddRegisterAccessCallbacks( - const RegisterAccessCallbacks& callbacks); - //uint32_t CreateCallback(void (*callback)(void* data), void* data); private: @@ -74,8 +67,6 @@ protected: EntryTable entry_table_; Mutex* modules_lock_; ModuleList modules_; - - RegisterAccessCallbacks* access_callbacks_; }; diff --git a/src/alloy/runtime/sources.gypi b/src/alloy/runtime/sources.gypi index be12e8f4e..399580ec0 100644 --- a/src/alloy/runtime/sources.gypi +++ b/src/alloy/runtime/sources.gypi @@ 
-15,7 +15,6 @@ 'module.h', 'raw_module.cc', 'raw_module.h', - 'register_access.h', 'runtime.cc', 'runtime.h', 'symbol_info.cc', diff --git a/src/xenia/apu/audio_system.cc b/src/xenia/apu/audio_system.cc index 40d151bd2..897e9bc03 100644 --- a/src/xenia/apu/audio_system.cc +++ b/src/xenia/apu/audio_system.cc @@ -42,12 +42,13 @@ X_STATUS AudioSystem::Setup() { processor_ = emulator_->processor(); // Let the processor know we want register access callbacks. - RegisterAccessCallbacks callbacks; - callbacks.context = this; - callbacks.handles = (RegisterHandlesCallback)HandlesRegisterThunk; - callbacks.read = (RegisterReadCallback)ReadRegisterThunk; - callbacks.write = (RegisterWriteCallback)WriteRegisterThunk; - emulator_->processor()->AddRegisterAccessCallbacks(callbacks); + emulator_->memory()->AddMappedRange( + 0x7FEA0000, + 0xFFFF0000, + 0x0000FFFF, + this, + reinterpret_cast(MMIOReadRegisterThunk), + reinterpret_cast(MMIOWriteRegisterThunk)); // Setup worker thread state. This lets us make calls into guest code. thread_state_ = new XenonThreadState( @@ -181,10 +182,6 @@ void AudioSystem::UnregisterClient(size_t index) { xe_mutex_unlock(lock_); } -bool AudioSystem::HandlesRegister(uint64_t addr) { - return (addr & 0xFFFF0000) == 0x7FEA0000; -} - // free60 may be useful here, however it looks like it's using a different // piece of hardware: // https://github.com/Free60Project/libxenon/blob/master/libxenon/drivers/xenon_sound/sound.c diff --git a/src/xenia/apu/audio_system.h b/src/xenia/apu/audio_system.h index 25d0b5829..964e331cf 100644 --- a/src/xenia/apu/audio_system.h +++ b/src/xenia/apu/audio_system.h @@ -42,7 +42,6 @@ public: virtual X_STATUS CreateDriver(size_t index, HANDLE wait_handle, AudioDriver** out_driver) = 0; virtual void DestroyDriver(AudioDriver* driver) = 0; - bool HandlesRegister(uint64_t addr); virtual uint64_t ReadRegister(uint64_t addr); virtual void WriteRegister(uint64_t addr, uint64_t value); @@ -55,14 +54,11 @@ private: } void ThreadStart(); - static bool HandlesRegisterThunk(AudioSystem* as, uint64_t addr) { - return as->HandlesRegister(addr); - } - static uint64_t ReadRegisterThunk(AudioSystem* as, uint64_t addr) { + static uint64_t MMIOReadRegisterThunk(AudioSystem* as, uint64_t addr) { return as->ReadRegister(addr); } - static void WriteRegisterThunk(AudioSystem* as, uint64_t addr, - uint64_t value) { + static void MMIOWriteRegisterThunk(AudioSystem* as, uint64_t addr, + uint64_t value) { as->WriteRegister(addr, value); } diff --git a/src/xenia/cpu/processor.cc b/src/xenia/cpu/processor.cc index 0c780ce22..3d3a76e72 100644 --- a/src/xenia/cpu/processor.cc +++ b/src/xenia/cpu/processor.cc @@ -141,11 +141,6 @@ int Processor::Setup() { return 0; } -void Processor::AddRegisterAccessCallbacks( - xe::cpu::RegisterAccessCallbacks callbacks) { - runtime_->AddRegisterAccessCallbacks(callbacks); -} - int Processor::Execute(XenonThreadState* thread_state, uint64_t address) { SCOPE_profile_cpu_f("cpu"); diff --git a/src/xenia/cpu/processor.h b/src/xenia/cpu/processor.h index d08912c88..25b367ff3 100644 --- a/src/xenia/cpu/processor.h +++ b/src/xenia/cpu/processor.h @@ -10,7 +10,6 @@ #ifndef XENIA_CPU_PROCESSOR_H_ #define XENIA_CPU_PROCESSOR_H_ -#include #include #include @@ -28,11 +27,6 @@ XEDECLARECLASS2(xe, cpu, XexModule); namespace xe { namespace cpu { -using RegisterAccessCallbacks = alloy::runtime::RegisterAccessCallbacks; -using RegisterHandlesCallback = alloy::runtime::RegisterHandlesCallback; -using RegisterReadCallback = alloy::runtime::RegisterReadCallback; 
-using RegisterWriteCallback = alloy::runtime::RegisterWriteCallback; - class Processor : public debug::DebugTarget { public: @@ -45,8 +39,6 @@ public: int Setup(); - void AddRegisterAccessCallbacks(RegisterAccessCallbacks callbacks); - int Execute( XenonThreadState* thread_state, uint64_t address); uint64_t Execute( diff --git a/src/xenia/cpu/xenon_memory.cc b/src/xenia/cpu/xenon_memory.cc index f730f99a4..adc96392f 100644 --- a/src/xenia/cpu/xenon_memory.cc +++ b/src/xenia/cpu/xenon_memory.cc @@ -119,6 +119,111 @@ private: }; uint32_t XenonMemoryHeap::next_heap_id_ = 1; +namespace { + +namespace BE { +#include +} + +struct MMIORange { + uint64_t address; + uint64_t mask; + uint64_t size; + void* context; + MMIOReadCallback read; + MMIOWriteCallback write; +}; +MMIORange g_mapped_ranges_[16] = { 0 }; +int g_mapped_range_count_ = 0; + +uint64_t* GetContextRegPtr(BE::Int32 arg_type, PCONTEXT context) { + DWORD index = 0; + _BitScanForward(&index, arg_type); + return &context->Rax + index; +} + +// Handles potential accesses to mmio. We look for access violations to +// addresses in our range and call into the registered handlers, if any. +// If there are none, we continue. +LONG CALLBACK CheckMMIOHandler(PEXCEPTION_POINTERS ex_info) { + // http://msdn.microsoft.com/en-us/library/ms679331(v=vs.85).aspx + // http://msdn.microsoft.com/en-us/library/aa363082(v=vs.85).aspx + auto code = ex_info->ExceptionRecord->ExceptionCode; + if (code == STATUS_ACCESS_VIOLATION) { + // Access violations are pretty rare, so we can do a linear search here. + auto address = ex_info->ExceptionRecord->ExceptionInformation[1]; + for (int i = 0; i < g_mapped_range_count_; ++i) { + const auto& range = g_mapped_ranges_[i]; + if ((address & range.mask) == range.address) { + // Within our range. + + // TODO(benvanik): replace with simple check of mov (that's all + // we care about). 
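+        // (Only two facts come out of the decode below: the instruction
+        //  length, so Rip can be advanced past the access once it has been
+        //  emulated, and which general purpose register the mov touches,
+        //  which GetContextRegPtr above maps to a slot in the saved
+        //  CONTEXT record.)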
+        BE::DISASM disasm = { 0 };
+        disasm.Archi = 64;
+        disasm.Options = BE::MasmSyntax + BE::PrefixedNumeral;
+        disasm.EIP = (BE::UIntPtr)ex_info->ExceptionRecord->ExceptionAddress;
+        BE::UIntPtr eip_end = disasm.EIP + 20;
+        size_t len = BE::Disasm(&disasm);
+        if (len == BE::UNKNOWN_OPCODE) {
+          break;
+        }
+
+        auto action = ex_info->ExceptionRecord->ExceptionInformation[0];
+        if (action == 0) {
+          uint64_t value = range.read(range.context, address & 0xFFFFFFFF);
+          XEASSERT((disasm.Argument1.ArgType & BE::REGISTER_TYPE) ==
+                   BE::REGISTER_TYPE);
+          uint64_t* reg_ptr = GetContextRegPtr(disasm.Argument1.ArgType,
+                                               ex_info->ContextRecord);
+          switch (disasm.Argument1.ArgSize) {
+            case 8:
+              *reg_ptr = static_cast<uint8_t>(value);
+              break;
+            case 16:
+              *reg_ptr = XESWAP16(static_cast<uint16_t>(value));
+              break;
+            case 32:
+              *reg_ptr = XESWAP32(static_cast<uint32_t>(value));
+              break;
+            case 64:
+              *reg_ptr = XESWAP64(static_cast<uint64_t>(value));
+              break;
+          }
+          ex_info->ContextRecord->Rip += len;
+          return EXCEPTION_CONTINUE_EXECUTION;
+        } else if (action == 1) {
+          XEASSERT((disasm.Argument2.ArgType & BE::REGISTER_TYPE) ==
+                   BE::REGISTER_TYPE);
+          uint64_t* reg_ptr = GetContextRegPtr(disasm.Argument2.ArgType,
+                                               ex_info->ContextRecord);
+          uint64_t value = *reg_ptr;
+          switch (disasm.Argument2.ArgSize) {
+            case 8:
+              value = static_cast<uint8_t>(value);
+              break;
+            case 16:
+              value = XESWAP16(static_cast<uint16_t>(value));
+              break;
+            case 32:
+              value = XESWAP32(static_cast<uint32_t>(value));
+              break;
+            case 64:
+              value = XESWAP64(static_cast<uint64_t>(value));
+              break;
+          }
+          range.write(range.context, address & 0xFFFFFFFF, value);
+          ex_info->ContextRecord->Rip += len;
+          return EXCEPTION_CONTINUE_EXECUTION;
+        }
+      }
+    }
+  }
+  return EXCEPTION_CONTINUE_SEARCH;
+}
+
+} // namespace
+
 XenonMemory::XenonMemory() :
     mapping_(0), mapping_base_(0),
@@ -204,6 +309,15 @@ int XenonMemory::Initialize() {
       0x00100000,
       MEM_COMMIT, PAGE_READWRITE);
 
+  // Add handlers for MMIO.
+  // If there is a debugger attached the normal exception handler will not
+  // fire and we must instead add the continue handler.
+  AddVectoredExceptionHandler(1, CheckMMIOHandler);
+  if (IsDebuggerPresent()) {
+    // TODO(benvanik): is this really required?
+    //AddVectoredContinueHandler(1, CheckMMIOHandler);
+  }
+
   return 0;
 
 XECLEANUP:
@@ -248,6 +362,112 @@ void XenonMemory::UnmapViews() {
   }
 }
 
+bool XenonMemory::AddMappedRange(uint64_t address, uint64_t mask,
+                                 uint64_t size, void* context,
+                                 MMIOReadCallback read_callback,
+                                 MMIOWriteCallback write_callback) {
+  DWORD protect = 0;
+  if (read_callback && write_callback) {
+    protect = PAGE_NOACCESS;
+  } else if (write_callback) {
+    protect = PAGE_READONLY;
+  } else {
+    // Write-only memory is not supported.
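+    // (PAGE_NOACCESS traps both reads and writes; PAGE_READONLY lets
+    //  reads hit real memory while writes fault. A read-only hook would
+    //  need a page that faults on reads but still allows writes, which
+    //  x86 page protection cannot express, hence the assert below.)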
+    XEASSERTALWAYS();
+  }
+  if (!VirtualAlloc(Translate(address),
+                    size,
+                    MEM_COMMIT, protect)) {
+    return false;
+  }
+  XEASSERT(g_mapped_range_count_ + 1 < XECOUNT(g_mapped_ranges_));
+  g_mapped_ranges_[g_mapped_range_count_++] = {
+    reinterpret_cast<uint64_t>(mapping_base_) | address,
+    0xFFFFFFFF00000000 | mask,
+    size, context,
+    read_callback, write_callback,
+  };
+  return true;
+}
+
+bool XenonMemory::CheckMMIOLoad(uint64_t address, uint64_t* out_value) {
+  for (int i = 0; i < g_mapped_range_count_; ++i) {
+    const auto& range = g_mapped_ranges_[i];
+    if (((address | (uint64_t)mapping_base_) & range.mask) == range.address) {
+      *out_value = static_cast<uint64_t>(range.read(range.context, address));
+      return true;
+    }
+  }
+  return false;
+}
+
+uint8_t XenonMemory::LoadI8(uint64_t address) {
+  uint64_t value;
+  if (!CheckMMIOLoad(address, &value)) {
+    value = *reinterpret_cast<uint8_t*>(Translate(address));
+  }
+  return static_cast<uint8_t>(value);
+}
+
+uint16_t XenonMemory::LoadI16(uint64_t address) {
+  uint64_t value;
+  if (!CheckMMIOLoad(address, &value)) {
+    value = *reinterpret_cast<uint16_t*>(Translate(address));
+  }
+  return static_cast<uint16_t>(value);
+}
+
+uint32_t XenonMemory::LoadI32(uint64_t address) {
+  uint64_t value;
+  if (!CheckMMIOLoad(address, &value)) {
+    value = *reinterpret_cast<uint32_t*>(Translate(address));
+  }
+  return static_cast<uint32_t>(value);
+}
+
+uint64_t XenonMemory::LoadI64(uint64_t address) {
+  uint64_t value;
+  if (!CheckMMIOLoad(address, &value)) {
+    value = *reinterpret_cast<uint64_t*>(Translate(address));
+  }
+  return static_cast<uint64_t>(value);
+}
+
+bool XenonMemory::CheckMMIOStore(uint64_t address, uint64_t value) {
+  for (int i = 0; i < g_mapped_range_count_; ++i) {
+    const auto& range = g_mapped_ranges_[i];
+    if (((address | (uint64_t)mapping_base_) & range.mask) == range.address) {
+      range.write(range.context, address, value);
+      return true;
+    }
+  }
+  return false;
+}
+
+void XenonMemory::StoreI8(uint64_t address, uint8_t value) {
+  if (!CheckMMIOStore(address, value)) {
+    *reinterpret_cast<uint8_t*>(Translate(address)) = value;
+  }
+}
+
+void XenonMemory::StoreI16(uint64_t address, uint16_t value) {
+  if (!CheckMMIOStore(address, value)) {
+    *reinterpret_cast<uint16_t*>(Translate(address)) = value;
+  }
+}
+
+void XenonMemory::StoreI32(uint64_t address, uint32_t value) {
+  if (!CheckMMIOStore(address, value)) {
+    *reinterpret_cast<uint32_t*>(Translate(address)) = value;
+  }
+}
+
+void XenonMemory::StoreI64(uint64_t address, uint64_t value) {
+  if (!CheckMMIOStore(address, value)) {
+    *reinterpret_cast<uint64_t*>(Translate(address)) = value;
+  }
+}
+
 uint64_t XenonMemory::HeapAlloc(
     uint64_t base_address, size_t size, uint32_t flags,
     uint32_t alignment) {
diff --git a/src/xenia/cpu/xenon_memory.h b/src/xenia/cpu/xenon_memory.h
index 96ba352fa..5c97649a4 100644
--- a/src/xenia/cpu/xenon_memory.h
+++ b/src/xenia/cpu/xenon_memory.h
@@ -15,33 +15,56 @@
 
 #include
 
+typedef struct xe_ppc_state xe_ppc_state_t;
+
 namespace xe {
 namespace cpu {
 
 class XenonMemoryHeap;
 
+typedef uint64_t (*MMIOReadCallback)(void* context, uint64_t addr);
+typedef void (*MMIOWriteCallback)(void* context, uint64_t addr,
+                                  uint64_t value);
 
 class XenonMemory : public alloy::Memory {
 public:
   XenonMemory();
   virtual ~XenonMemory();
 
-  virtual int Initialize();
+  int Initialize() override;
 
-  virtual uint64_t HeapAlloc(
+  bool AddMappedRange(uint64_t address, uint64_t mask,
+                      uint64_t size,
+                      void* context,
+                      MMIOReadCallback read_callback = nullptr,
+                      MMIOWriteCallback write_callback = nullptr);
+
+  uint8_t LoadI8(uint64_t address) override;
+  uint16_t LoadI16(uint64_t address) override;
+  uint32_t LoadI32(uint64_t
address) override; + uint64_t LoadI64(uint64_t address) override; + void StoreI8(uint64_t address, uint8_t value) override; + void StoreI16(uint64_t address, uint16_t value) override; + void StoreI32(uint64_t address, uint32_t value) override; + void StoreI64(uint64_t address, uint64_t value) override; + + uint64_t HeapAlloc( uint64_t base_address, size_t size, uint32_t flags, - uint32_t alignment = 0x20); - virtual int HeapFree(uint64_t address, size_t size); + uint32_t alignment = 0x20) override; + int HeapFree(uint64_t address, size_t size) override; - virtual size_t QuerySize(uint64_t base_address); + size_t QuerySize(uint64_t base_address) override; - virtual int Protect(uint64_t address, size_t size, uint32_t access); - virtual uint32_t QueryProtect(uint64_t address); + int Protect(uint64_t address, size_t size, uint32_t access) override; + uint32_t QueryProtect(uint64_t address) override; private: int MapViews(uint8_t* mapping_base); void UnmapViews(); + bool CheckMMIOLoad(uint64_t address, uint64_t* out_value); + bool CheckMMIOStore(uint64_t address, uint64_t value); + private: HANDLE mapping_; uint8_t* mapping_base_; diff --git a/src/xenia/emulator.h b/src/xenia/emulator.h index c94fa3771..82ede0ec6 100644 --- a/src/xenia/emulator.h +++ b/src/xenia/emulator.h @@ -13,6 +13,7 @@ #include #include #include +#include XEDECLARECLASS1(xe, ExportResolver); @@ -41,7 +42,7 @@ public: ui::Window* main_window() const { return main_window_; } void set_main_window(ui::Window* window); - Memory* memory() const { return memory_; } + cpu::XenonMemory* memory() const { return memory_; } debug::DebugServer* debug_server() const { return debug_server_; } @@ -68,7 +69,7 @@ private: ui::Window* main_window_; - Memory* memory_; + cpu::XenonMemory* memory_; debug::DebugServer* debug_server_; diff --git a/src/xenia/gpu/graphics_system.cc b/src/xenia/gpu/graphics_system.cc index fbcb1d744..c0a614d35 100644 --- a/src/xenia/gpu/graphics_system.cc +++ b/src/xenia/gpu/graphics_system.cc @@ -45,12 +45,13 @@ X_STATUS GraphicsSystem::Setup() { worker_ = new RingBufferWorker(this, memory_); // Let the processor know we want register access callbacks. - RegisterAccessCallbacks callbacks; - callbacks.context = this; - callbacks.handles = (RegisterHandlesCallback)HandlesRegisterThunk; - callbacks.read = (RegisterReadCallback)ReadRegisterThunk; - callbacks.write = (RegisterWriteCallback)WriteRegisterThunk; - emulator_->processor()->AddRegisterAccessCallbacks(callbacks); + emulator_->memory()->AddMappedRange( + 0x7FC80000, + 0xFFFF0000, + 0x0000FFFF, + this, + reinterpret_cast(MMIOReadRegisterThunk), + reinterpret_cast(MMIOWriteRegisterThunk)); // Create worker thread. // This will initialize the graphics system. 
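
Both the APU (0x7FEA0000) and the GPU (0x7FC80000) register windows now go through AddMappedRange, which protects the backing pages and lets CheckMMIOHandler emulate whichever instruction faults. The pattern is small enough to show standalone; the sketch below is illustrative only (not part of this patch, names hypothetical) and dodges instruction decoding by simply unprotecting the page and retrying the access:

    #include <windows.h>
    #include <cstdio>

    static char* g_page = nullptr;
    static volatile long g_trap_count = 0;

    // Fires on the access violation raised by touching the protected page.
    LONG CALLBACK TrapHandler(PEXCEPTION_POINTERS ex) {
      auto record = ex->ExceptionRecord;
      ULONG_PTR addr = record->ExceptionInformation[1];
      if (record->ExceptionCode == STATUS_ACCESS_VIOLATION &&
          addr >= (ULONG_PTR)g_page && addr < (ULONG_PTR)g_page + 4096) {
        InterlockedIncrement(&g_trap_count);
        // A real handler decodes the mov and emulates it (as above); here
        // we just make the page accessible and let the instruction re-run.
        DWORD old_protect;
        VirtualProtect(g_page, 4096, PAGE_READWRITE, &old_protect);
        return EXCEPTION_CONTINUE_EXECUTION;
      }
      return EXCEPTION_CONTINUE_SEARCH;
    }

    int main() {
      g_page = (char*)VirtualAlloc(nullptr, 4096, MEM_RESERVE | MEM_COMMIT,
                                   PAGE_NOACCESS);
      AddVectoredExceptionHandler(1, TrapHandler);
      char value = g_page[0];  // faults once, is retried, then succeeds
      printf("trapped %ld access(es), read %d\n", g_trap_count, value);
      return 0;
    }
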
@@ -132,10 +133,6 @@ void GraphicsSystem::EnableReadPointerWriteBack(uint32_t ptr, worker_->EnableReadPointerWriteBack(ptr, block_size); } -bool GraphicsSystem::HandlesRegister(uint64_t addr) { - return (addr & 0xFFFF0000) == 0x7FC80000; -} - uint64_t GraphicsSystem::ReadRegister(uint64_t addr) { uint32_t r = addr & 0xFFFF; XELOGGPU("ReadRegister(%.4X)", r); diff --git a/src/xenia/gpu/graphics_system.h b/src/xenia/gpu/graphics_system.h index 5c1f03f8d..c7c72fea5 100644 --- a/src/xenia/gpu/graphics_system.h +++ b/src/xenia/gpu/graphics_system.h @@ -40,7 +40,6 @@ public: void InitializeRingBuffer(uint32_t ptr, uint32_t page_count); void EnableReadPointerWriteBack(uint32_t ptr, uint32_t block_size); - bool HandlesRegister(uint64_t addr); virtual uint64_t ReadRegister(uint64_t addr); virtual void WriteRegister(uint64_t addr, uint64_t value); @@ -59,14 +58,11 @@ private: } void ThreadStart(); - static bool HandlesRegisterThunk(GraphicsSystem* gs, uint64_t addr) { - return gs->HandlesRegister(addr); - } - static uint64_t ReadRegisterThunk(GraphicsSystem* gs, uint64_t addr) { + static uint64_t MMIOReadRegisterThunk(GraphicsSystem* gs, uint64_t addr) { return gs->ReadRegister(addr); } - static void WriteRegisterThunk(GraphicsSystem* gs, uint64_t addr, - uint64_t value) { + static void MMIOWriteRegisterThunk(GraphicsSystem* gs, uint64_t addr, + uint64_t value) { gs->WriteRegister(addr, value); } From ddbebcda6c249e61d2164b62cf19ace3cd26ace0 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Mon, 2 Jun 2014 07:11:27 -0700 Subject: [PATCH 147/184] Cleanup memory. --- src/xenia/cpu/xenon_memory.cc | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/xenia/cpu/xenon_memory.cc b/src/xenia/cpu/xenon_memory.cc index adc96392f..1e4116bf3 100644 --- a/src/xenia/cpu/xenon_memory.cc +++ b/src/xenia/cpu/xenon_memory.cc @@ -233,6 +233,17 @@ XenonMemory::XenonMemory() : } XenonMemory::~XenonMemory() { + // Remove exception handlers. + RemoveVectoredExceptionHandler(CheckMMIOHandler); + RemoveVectoredContinueHandler(CheckMMIOHandler); + + // Unallocate mapped ranges. + for (int i = 0; i < g_mapped_range_count_; ++i) { + const auto& range = g_mapped_ranges_[i]; + VirtualFree(reinterpret_cast(range.address), range.size, + MEM_DECOMMIT); + } + if (mapping_base_) { // GPU writeback. VirtualFree( From beb5ee40ee58db2555bd70ca776fcc25ba5b6e32 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Mon, 2 Jun 2014 07:49:45 -0700 Subject: [PATCH 148/184] Dirty page table. Not yet used. 
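
The table added here tracks writes at 16KB granularity: a guest address is shifted right by 14 and masked with 0x7FFF, so covering the 512MB physical space takes (512 * 1024 * 1024) / (16 * 1024) = 32768 one-byte entries, a 32KB table. The same bookkeeping in plain C++ (a sketch; the DirtyPageTable name is hypothetical):

    #include <cstdint>
    #include <cstring>

    constexpr uint32_t kPageSize = 16 * 1024;                  // 16KB pages
    constexpr uint32_t kPageCount = (512u << 20) / kPageSize;  // 32768

    struct DirtyPageTable {
      uint8_t entries[kPageCount];

      // Matches MarkPageDirty in the IVM backend below.
      void MarkDirty(uint32_t address) {
        entries[(address >> 14) & 0x7FFF] = 1;
      }
      bool IsDirty(uint32_t address) const {
        return entries[(address >> 14) & 0x7FFF] != 0;
      }
      void Reset() { std::memset(entries, 0, sizeof(entries)); }
    };
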
--- src/alloy/backend/ivm/ivm_function.cc | 1 + src/alloy/backend/ivm/ivm_intcode.cc | 11 +++++++++++ src/alloy/backend/ivm/ivm_intcode.h | 1 + src/alloy/backend/x64/x64_emitter.cc | 5 +++++ src/alloy/backend/x64/x64_emitter.h | 2 ++ src/alloy/backend/x64/x64_sequences.cc | 20 ++++++++++++++++++++ src/alloy/memory.h | 2 ++ src/xenia/cpu/xenon_memory.cc | 13 ++++++++++--- src/xenia/cpu/xenon_memory.h | 4 ++++ 9 files changed, 56 insertions(+), 3 deletions(-) diff --git a/src/alloy/backend/ivm/ivm_function.cc b/src/alloy/backend/ivm/ivm_function.cc index 72c564210..88306b228 100644 --- a/src/alloy/backend/ivm/ivm_function.cc +++ b/src/alloy/backend/ivm/ivm_function.cc @@ -118,6 +118,7 @@ int IVMFunction::CallImpl(ThreadState* thread_state, uint64_t return_address) { ics.locals = local_stack; ics.context = (uint8_t*)thread_state->raw_context(); ics.membase = memory->membase(); + ics.page_table = ics.membase + memory->page_table(); ics.did_carry = 0; ics.did_saturate = 0; ics.thread_state = thread_state; diff --git a/src/alloy/backend/ivm/ivm_intcode.cc b/src/alloy/backend/ivm/ivm_intcode.cc index 1f24bc6ea..1badeab7e 100644 --- a/src/alloy/backend/ivm/ivm_intcode.cc +++ b/src/alloy/backend/ivm/ivm_intcode.cc @@ -1445,6 +1445,10 @@ int Translate_LOAD(TranslationContext& ctx, Instr* i) { return DispatchToC(ctx, i, fns[i->dest->type]); } +void MarkPageDirty(IntCodeState& ics, uint32_t address) { + // 16KB pages. + ics.page_table[(address >> 14) & 0x7FFF] = 1; +} uint32_t IntCode_STORE_I8(IntCodeState& ics, const IntCode* i) { uint32_t address = ics.rf[i->src1_reg].u32; if (DYNAMIC_REGISTER_ACCESS_CHECK(address)) { @@ -1455,6 +1459,7 @@ uint32_t IntCode_STORE_I8(IntCodeState& ics, const IntCode* i) { address, ics.rf[i->src2_reg].i8, ics.rf[i->src2_reg].u8); DFLUSH(); *((int8_t*)(ics.membase + address)) = ics.rf[i->src2_reg].i8; + MarkPageDirty(ics, address); return IA_NEXT; } uint32_t IntCode_STORE_I16(IntCodeState& ics, const IntCode* i) { @@ -1468,6 +1473,7 @@ uint32_t IntCode_STORE_I16(IntCodeState& ics, const IntCode* i) { address, ics.rf[i->src2_reg].i16, ics.rf[i->src2_reg].u16); DFLUSH(); *((int16_t*)(ics.membase + address)) = ics.rf[i->src2_reg].i16; + MarkPageDirty(ics, address); return IA_NEXT; } uint32_t IntCode_STORE_I32(IntCodeState& ics, const IntCode* i) { @@ -1481,6 +1487,7 @@ uint32_t IntCode_STORE_I32(IntCodeState& ics, const IntCode* i) { address, ics.rf[i->src2_reg].i32, ics.rf[i->src2_reg].u32); DFLUSH(); *((int32_t*)(ics.membase + address)) = ics.rf[i->src2_reg].i32; + MarkPageDirty(ics, address); return IA_NEXT; } uint32_t IntCode_STORE_I64(IntCodeState& ics, const IntCode* i) { @@ -1494,6 +1501,7 @@ uint32_t IntCode_STORE_I64(IntCodeState& ics, const IntCode* i) { address, ics.rf[i->src2_reg].i64, ics.rf[i->src2_reg].u64); DFLUSH(); *((int64_t*)(ics.membase + address)) = ics.rf[i->src2_reg].i64; + MarkPageDirty(ics, address); return IA_NEXT; } uint32_t IntCode_STORE_F32(IntCodeState& ics, const IntCode* i) { @@ -1502,6 +1510,7 @@ uint32_t IntCode_STORE_F32(IntCodeState& ics, const IntCode* i) { address, ics.rf[i->src2_reg].f32, ics.rf[i->src2_reg].u32); DFLUSH(); *((float*)(ics.membase + address)) = ics.rf[i->src2_reg].f32; + MarkPageDirty(ics, address); return IA_NEXT; } uint32_t IntCode_STORE_F64(IntCodeState& ics, const IntCode* i) { @@ -1510,6 +1519,7 @@ uint32_t IntCode_STORE_F64(IntCodeState& ics, const IntCode* i) { address, ics.rf[i->src2_reg].f64, ics.rf[i->src2_reg].u64); DFLUSH(); *((double*)(ics.membase + address)) = ics.rf[i->src2_reg].f64; + 
MarkPageDirty(ics, address); return IA_NEXT; } uint32_t IntCode_STORE_V128(IntCodeState& ics, const IntCode* i) { @@ -1520,6 +1530,7 @@ uint32_t IntCode_STORE_V128(IntCodeState& ics, const IntCode* i) { VECI4(ics.rf[i->src2_reg].v128,0), VECI4(ics.rf[i->src2_reg].v128,1), VECI4(ics.rf[i->src2_reg].v128,2), VECI4(ics.rf[i->src2_reg].v128,3)); DFLUSH(); *((vec128_t*)(ics.membase + address)) = ics.rf[i->src2_reg].v128; + MarkPageDirty(ics, address); return IA_NEXT; } int Translate_STORE(TranslationContext& ctx, Instr* i) { diff --git a/src/alloy/backend/ivm/ivm_intcode.h b/src/alloy/backend/ivm/ivm_intcode.h index 340bb4dd3..389ccbef2 100644 --- a/src/alloy/backend/ivm/ivm_intcode.h +++ b/src/alloy/backend/ivm/ivm_intcode.h @@ -43,6 +43,7 @@ typedef struct { uint8_t* locals; uint8_t* context; uint8_t* membase; + uint8_t* page_table; int8_t did_carry; int8_t did_saturate; runtime::ThreadState* thread_state; diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index 090c8fe9a..8674459bf 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -446,6 +446,11 @@ void X64Emitter::StoreEflags() { #endif // STORE_EFLAGS } +uint32_t X64Emitter::page_table_address() const { + uint64_t addr = runtime_->memory()->page_table(); + return static_cast(addr); +} + bool X64Emitter::ConstantFitsIn32Reg(uint64_t v) { if ((v & ~0x7FFFFFFF) == 0) { // Fits under 31 bits, so just load using normal mov. diff --git a/src/alloy/backend/x64/x64_emitter.h b/src/alloy/backend/x64/x64_emitter.h index a720e1970..785ff5ac7 100644 --- a/src/alloy/backend/x64/x64_emitter.h +++ b/src/alloy/backend/x64/x64_emitter.h @@ -132,6 +132,8 @@ public: void LoadEflags(); void StoreEflags(); + uint32_t page_table_address() const; + // Moves a 64bit immediate into memory. bool ConstantFitsIn32Reg(uint64_t v); void MovMem64(const Xbyak::RegExp& addr, uint64_t v); diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index e12491f6a..50981f1cb 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -1563,6 +1563,12 @@ EMITTER_OPCODE_TABLE( // OPCODE_STORE // ============================================================================ // Note: most *should* be aligned, but needs to be checked! +void EmitMarkPageDirty(X64Emitter& e, RegExp& addr) { + // 16KB pages. 
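+  // (512MB of physical space / 16KB pages = 32768 entries, so the page
+  //  index below is (address >> 14) & 0x7FFF, matching MarkPageDirty in
+  //  the IVM backend above; the table itself lives in guest memory at
+  //  page_table_address().)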
+  e.shr(e.eax, 14);
+  e.and(e.eax, 0x7FFF);
+  e.mov(e.byte[e.rdx + e.rax + e.page_table_address()], 1);
+}
 EMITTER(STORE_I8, MATCH(I<OPCODE_STORE, VoidOp, I64<>, I8<>>)) {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
     auto addr = ComputeMemoryAddress(e, i.src1);
@@ -1571,7 +1577,9 @@ EMITTER(STORE_I8, MATCH(I<OPCODE_STORE, VoidOp, I64<>, I8<>>)) {
     } else {
       e.mov(e.byte[addr], i.src2);
     }
+    EmitMarkPageDirty(e, addr);
     if (IsTracingData()) {
+      auto addr = ComputeMemoryAddress(e, i.src1);
       e.mov(e.r8b, e.byte[addr]);
       e.lea(e.rdx, e.ptr[addr]);
       e.CallNative(TraceMemoryStoreI8);
@@ -1586,7 +1594,9 @@ EMITTER(STORE_I16, MATCH(I<OPCODE_STORE, VoidOp, I64<>, I16<>>)) {
     } else {
       e.mov(e.word[addr], i.src2);
     }
+    EmitMarkPageDirty(e, addr);
     if (IsTracingData()) {
+      auto addr = ComputeMemoryAddress(e, i.src1);
       e.mov(e.r8w, e.word[addr]);
       e.lea(e.rdx, e.ptr[addr]);
       e.CallNative(TraceMemoryStoreI16);
@@ -1601,7 +1611,9 @@ EMITTER(STORE_I32, MATCH(I<OPCODE_STORE, VoidOp, I64<>, I32<>>)) {
     } else {
       e.mov(e.dword[addr], i.src2);
     }
+    EmitMarkPageDirty(e, addr);
     if (IsTracingData()) {
+      auto addr = ComputeMemoryAddress(e, i.src1);
       e.mov(e.r8d, e.dword[addr]);
       e.lea(e.rdx, e.ptr[addr]);
       e.CallNative(TraceMemoryStoreI32);
@@ -1616,7 +1628,9 @@ EMITTER(STORE_I64, MATCH(I<OPCODE_STORE, VoidOp, I64<>, I64<>>)) {
     } else {
       e.mov(e.qword[addr], i.src2);
     }
+    EmitMarkPageDirty(e, addr);
     if (IsTracingData()) {
+      auto addr = ComputeMemoryAddress(e, i.src1);
       e.mov(e.r8, e.qword[addr]);
       e.lea(e.rdx, e.ptr[addr]);
       e.CallNative(TraceMemoryStoreI64);
@@ -1631,7 +1645,9 @@ EMITTER(STORE_F32, MATCH(I<OPCODE_STORE, VoidOp, I64<>, F32<>>)) {
     } else {
       e.vmovss(e.dword[addr], i.src2);
     }
+    EmitMarkPageDirty(e, addr);
     if (IsTracingData()) {
+      auto addr = ComputeMemoryAddress(e, i.src1);
       e.lea(e.r8, e.ptr[addr]);
       e.lea(e.rdx, e.ptr[addr]);
       e.CallNative(TraceMemoryStoreF32);
@@ -1646,7 +1662,9 @@ EMITTER(STORE_F64, MATCH(I<OPCODE_STORE, VoidOp, I64<>, F64<>>)) {
     } else {
       e.vmovsd(e.qword[addr], i.src2);
     }
+    EmitMarkPageDirty(e, addr);
     if (IsTracingData()) {
+      auto addr = ComputeMemoryAddress(e, i.src1);
       e.lea(e.r8, e.ptr[addr]);
       e.lea(e.rdx, e.ptr[addr]);
       e.CallNative(TraceMemoryStoreF64);
@@ -1662,7 +1680,9 @@ EMITTER(STORE_V128, MATCH(I<OPCODE_STORE, VoidOp, I64<>, V128<>>)) {
     } else {
       e.vmovaps(e.ptr[addr], i.src2);
     }
+    EmitMarkPageDirty(e, addr);
     if (IsTracingData()) {
+      auto addr = ComputeMemoryAddress(e, i.src1);
       e.lea(e.r8, e.ptr[addr]);
       e.lea(e.rdx, e.ptr[addr]);
       e.CallNative(TraceMemoryStoreV128);
diff --git a/src/alloy/memory.h b/src/alloy/memory.h
index d51d4dc65..72719cc4a 100644
--- a/src/alloy/memory.h
+++ b/src/alloy/memory.h
@@ -34,6 +34,8 @@ public:
   };
   inline uint32_t* reserve_address() { return &reserve_address_; }
 
+  virtual uint64_t page_table() const = 0;
+
   virtual int Initialize();
 
   void Zero(uint64_t address, size_t size);
diff --git a/src/xenia/cpu/xenon_memory.cc b/src/xenia/cpu/xenon_memory.cc
index 1e4116bf3..22f928022 100644
--- a/src/xenia/cpu/xenon_memory.cc
+++ b/src/xenia/cpu/xenon_memory.cc
@@ -225,9 +225,9 @@ LONG CALLBACK CheckMMIOHandler(PEXCEPTION_POINTERS ex_info) {
 
 } // namespace
 
-XenonMemory::XenonMemory() :
-    mapping_(0), mapping_base_(0),
-    Memory() {
+XenonMemory::XenonMemory()
+    : Memory(),
+      mapping_(0), mapping_base_(0), page_table_(0) {
   virtual_heap_ = new XenonMemoryHeap(this, false);
   physical_heap_ = new XenonMemoryHeap(this, true);
 }
@@ -329,6 +329,13 @@ int XenonMemory::Initialize() {
     //AddVectoredContinueHandler(1, CheckMMIOHandler);
   }
 
+  // Allocate dirty page table.
+  // This must live within our low heap. Ideally we'd hardcode the address but
+  // this is more flexible.
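+  // (That is (512 * 1024 * 1024) / (16 * 1024) = 32768 one-byte entries,
+  //  a 32KB table, committed with 16KB alignment by the Alloc below.)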
+ page_table_ = physical_heap_->Alloc( + 0, (512 * 1024 * 1024) / (16 * 1024), + X_MEM_COMMIT, 16 * 1024); + return 0; XECLEANUP: diff --git a/src/xenia/cpu/xenon_memory.h b/src/xenia/cpu/xenon_memory.h index 5c97649a4..05872d12e 100644 --- a/src/xenia/cpu/xenon_memory.h +++ b/src/xenia/cpu/xenon_memory.h @@ -33,6 +33,8 @@ public: int Initialize() override; + uint64_t page_table() const override { return page_table_; } + bool AddMappedRange(uint64_t address, uint64_t mask, uint64_t size, void* context, @@ -83,6 +85,8 @@ private: XenonMemoryHeap* virtual_heap_; XenonMemoryHeap* physical_heap_; + uint64_t page_table_; + friend class XenonMemoryHeap; }; From 4072640a6493b865fe709291acaf4e1125353183 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Mon, 2 Jun 2014 19:24:33 -0700 Subject: [PATCH 149/184] Skeleton of new shared GPU files. --- src/xenia/gpu/buffer_resource.cc | 17 +++++++++++ src/xenia/gpu/buffer_resource.h | 30 ++++++++++++++++++ src/xenia/gpu/command_buffer.h | 49 ------------------------------ src/xenia/gpu/command_processor.cc | 17 +++++++++++ src/xenia/gpu/command_processor.h | 30 ++++++++++++++++++ src/xenia/gpu/register_file.cc | 17 +++++++++++ src/xenia/gpu/register_file.h | 30 ++++++++++++++++++ src/xenia/gpu/resource.cc | 17 +++++++++++ src/xenia/gpu/resource.h | 30 ++++++++++++++++++ src/xenia/gpu/resource_cache.cc | 17 +++++++++++ src/xenia/gpu/resource_cache.h | 30 ++++++++++++++++++ src/xenia/gpu/shader_resource.cc | 17 +++++++++++ src/xenia/gpu/shader_resource.h | 30 ++++++++++++++++++ src/xenia/gpu/sources.gypi | 29 +++++++++--------- src/xenia/gpu/texture_resource.cc | 17 +++++++++++ src/xenia/gpu/texture_resource.h | 33 ++++++++++++++++++++ 16 files changed, 346 insertions(+), 64 deletions(-) create mode 100644 src/xenia/gpu/buffer_resource.cc create mode 100644 src/xenia/gpu/buffer_resource.h delete mode 100644 src/xenia/gpu/command_buffer.h create mode 100644 src/xenia/gpu/command_processor.cc create mode 100644 src/xenia/gpu/command_processor.h create mode 100644 src/xenia/gpu/register_file.cc create mode 100644 src/xenia/gpu/register_file.h create mode 100644 src/xenia/gpu/resource.cc create mode 100644 src/xenia/gpu/resource.h create mode 100644 src/xenia/gpu/resource_cache.cc create mode 100644 src/xenia/gpu/resource_cache.h create mode 100644 src/xenia/gpu/shader_resource.cc create mode 100644 src/xenia/gpu/shader_resource.h create mode 100644 src/xenia/gpu/texture_resource.cc create mode 100644 src/xenia/gpu/texture_resource.h diff --git a/src/xenia/gpu/buffer_resource.cc b/src/xenia/gpu/buffer_resource.cc new file mode 100644 index 000000000..d6019d95f --- /dev/null +++ b/src/xenia/gpu/buffer_resource.cc @@ -0,0 +1,17 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include + + +using namespace std; +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::xenos; + diff --git a/src/xenia/gpu/buffer_resource.h b/src/xenia/gpu/buffer_resource.h new file mode 100644 index 000000000..385a5049a --- /dev/null +++ b/src/xenia/gpu/buffer_resource.h @@ -0,0 +1,30 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_BUFFER_RESOURCE_H_ +#define XENIA_GPU_BUFFER_RESOURCE_H_ + +#include +#include + + +namespace xe { +namespace gpu { + + +class BufferResource : public Resource { +public: +}; + + +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_BUFFER_RESOURCE_H_ diff --git a/src/xenia/gpu/command_buffer.h b/src/xenia/gpu/command_buffer.h deleted file mode 100644 index b601505f5..000000000 --- a/src/xenia/gpu/command_buffer.h +++ /dev/null @@ -1,49 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. * - ****************************************************************************** - */ - -#ifndef XENIA_GPU_COMMAND_BUFFER_H_ -#define XENIA_GPU_COMMAND_BUFFER_H_ - -#include - - -namespace xe { -namespace gpu { - - -// TODO(benvanik): command packet types. - - -class CommandBuffer { -public: - CommandBuffer(xe_memory_ref memory) { - memory_ = xe_memory_retain(memory); - } - - virtual ~CommandBuffer() { - xe_memory_release(memory_); - } - - xe_memory_ref memory() { - return memory_; - } - - // TODO(benvanik): command methods. - virtual void Foo() = 0; - -protected: - xe_memory_ref memory_; -}; - - -} // namespace gpu -} // namespace xe - - -#endif // XENIA_GPU_COMMAND_BUFFER_H_ diff --git a/src/xenia/gpu/command_processor.cc b/src/xenia/gpu/command_processor.cc new file mode 100644 index 000000000..bb7a02bc7 --- /dev/null +++ b/src/xenia/gpu/command_processor.cc @@ -0,0 +1,17 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include + + +using namespace std; +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::xenos; + diff --git a/src/xenia/gpu/command_processor.h b/src/xenia/gpu/command_processor.h new file mode 100644 index 000000000..65d5dfc71 --- /dev/null +++ b/src/xenia/gpu/command_processor.h @@ -0,0 +1,30 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_COMMAND_PROCESSOR_H_ +#define XENIA_GPU_COMMAND_PROCESSOR_H_ + +#include +#include + + +namespace xe { +namespace gpu { + + +class CommandProcessor { +public: +}; + + +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_COMMAND_PROCESSOR_H_ diff --git a/src/xenia/gpu/register_file.cc b/src/xenia/gpu/register_file.cc new file mode 100644 index 000000000..f6f119376 --- /dev/null +++ b/src/xenia/gpu/register_file.cc @@ -0,0 +1,17 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + + +using namespace std; +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::xenos; + diff --git a/src/xenia/gpu/register_file.h b/src/xenia/gpu/register_file.h new file mode 100644 index 000000000..2a530995f --- /dev/null +++ b/src/xenia/gpu/register_file.h @@ -0,0 +1,30 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_REGISTER_FILE_H_ +#define XENIA_GPU_REGISTER_FILE_H_ + +#include +#include + + +namespace xe { +namespace gpu { + + +class RegisterFile { +public: +}; + + +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_REGISTER_FILE_H_ diff --git a/src/xenia/gpu/resource.cc b/src/xenia/gpu/resource.cc new file mode 100644 index 000000000..88966aac5 --- /dev/null +++ b/src/xenia/gpu/resource.cc @@ -0,0 +1,17 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include + + +using namespace std; +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::xenos; + diff --git a/src/xenia/gpu/resource.h b/src/xenia/gpu/resource.h new file mode 100644 index 000000000..e9a0be7fa --- /dev/null +++ b/src/xenia/gpu/resource.h @@ -0,0 +1,30 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_RESOURCE_H_ +#define XENIA_GPU_RESOURCE_H_ + +#include +#include + + +namespace xe { +namespace gpu { + + +class Resource { +public: +}; + + +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_RESOURCE_H_ diff --git a/src/xenia/gpu/resource_cache.cc b/src/xenia/gpu/resource_cache.cc new file mode 100644 index 000000000..7a9a1c24d --- /dev/null +++ b/src/xenia/gpu/resource_cache.cc @@ -0,0 +1,17 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + + +using namespace std; +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::xenos; + diff --git a/src/xenia/gpu/resource_cache.h b/src/xenia/gpu/resource_cache.h new file mode 100644 index 000000000..7caaad51f --- /dev/null +++ b/src/xenia/gpu/resource_cache.h @@ -0,0 +1,30 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_RESOURCE_CACHE_H_ +#define XENIA_GPU_RESOURCE_CACHE_H_ + +#include +#include + + +namespace xe { +namespace gpu { + + +class ResourceCache { +public: +}; + + +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_RESOURCE_CACHE_H_ diff --git a/src/xenia/gpu/shader_resource.cc b/src/xenia/gpu/shader_resource.cc new file mode 100644 index 000000000..e2520db62 --- /dev/null +++ b/src/xenia/gpu/shader_resource.cc @@ -0,0 +1,17 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include + + +using namespace std; +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::xenos; + diff --git a/src/xenia/gpu/shader_resource.h b/src/xenia/gpu/shader_resource.h new file mode 100644 index 000000000..24b787ec4 --- /dev/null +++ b/src/xenia/gpu/shader_resource.h @@ -0,0 +1,30 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_SHADER_RESOURCE_H_ +#define XENIA_GPU_SHADER_RESOURCE_H_ + +#include +#include + + +namespace xe { +namespace gpu { + + +class ShaderResource : public Resource { +public: +}; + + +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_SHADER_RESOURCE_H_ diff --git a/src/xenia/gpu/sources.gypi b/src/xenia/gpu/sources.gypi index b2c9134c0..3d4462fd1 100644 --- a/src/xenia/gpu/sources.gypi +++ b/src/xenia/gpu/sources.gypi @@ -1,11 +1,10 @@ # Copyright 2013 Ben Vanik. All Rights Reserved. { 'sources': [ - 'buffer.cc', - 'buffer.h', - 'buffer_cache.cc', - 'buffer_cache.h', - 'command_buffer.h', + 'buffer_resource.cc', + 'buffer_resource.h', + 'command_processor.cc', + 'command_processor.h', 'gpu-private.h', 'gpu.cc', 'gpu.h', @@ -13,16 +12,16 @@ 'graphics_driver.h', 'graphics_system.cc', 'graphics_system.h', - 'ring_buffer_worker.cc', - 'ring_buffer_worker.h', - 'shader.cc', - 'shader.h', - 'shader_cache.cc', - 'shader_cache.h', - 'texture.cc', - 'texture.h', - 'texture_cache.cc', - 'texture_cache.h', + 'register_file.cc', + 'register_file.h', + 'resource.cc', + 'resource.h', + 'resource_cache.cc', + 'resource_cache.h', + 'shader_resource.cc', + 'shader_resource.h', + 'texture_resource.cc', + 'texture_resource.h', ], 'includes': [ diff --git a/src/xenia/gpu/texture_resource.cc b/src/xenia/gpu/texture_resource.cc new file mode 100644 index 000000000..5875e76f3 --- /dev/null +++ b/src/xenia/gpu/texture_resource.cc @@ -0,0 +1,17 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + + +using namespace std; +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::xenos; + diff --git a/src/xenia/gpu/texture_resource.h b/src/xenia/gpu/texture_resource.h new file mode 100644 index 000000000..35f83bcda --- /dev/null +++ b/src/xenia/gpu/texture_resource.h @@ -0,0 +1,33 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_GPU_TEXTURE_RESOURCE_H_ +#define XENIA_GPU_TEXTURE_RESOURCE_H_ + +#include +#include + +// TODO(benvanik): replace DXGI constants with xenia constants. +#include + + +namespace xe { +namespace gpu { + + +class TextureResource : public Resource { +public: +}; + + +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_TEXTURE_RESOURCE_H_ From 295910c3d8f562fe5a18e14a04797d4db0dcc977 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 8 Jun 2014 11:23:55 -0700 Subject: [PATCH 150/184] Initial refactoring of gpu/. Runs too fast - now there are ringbuffer wrapping issues. --- src/xenia/gpu/buffer.cc | 42 - src/xenia/gpu/buffer.h | 91 - src/xenia/gpu/buffer_cache.cc | 79 - src/xenia/gpu/buffer_cache.h | 55 - src/xenia/gpu/buffer_resource.cc | 39 + src/xenia/gpu/buffer_resource.h | 73 +- src/xenia/gpu/command_processor.cc | 775 ++++++- src/xenia/gpu/command_processor.h | 55 + src/xenia/gpu/d3d11/d3d11_buffer.cc | 150 -- src/xenia/gpu/d3d11/d3d11_buffer.h | 69 - src/xenia/gpu/d3d11/d3d11_buffer_cache.cc | 44 - src/xenia/gpu/d3d11/d3d11_buffer_cache.h | 53 - src/xenia/gpu/d3d11/d3d11_buffer_resource.cc | 149 ++ src/xenia/gpu/d3d11/d3d11_buffer_resource.h | 69 + src/xenia/gpu/d3d11/d3d11_geometry_shader.cc | 40 +- src/xenia/gpu/d3d11/d3d11_geometry_shader.h | 33 +- src/xenia/gpu/d3d11/d3d11_graphics_driver.cc | 854 +++---- src/xenia/gpu/d3d11/d3d11_graphics_driver.h | 68 +- src/xenia/gpu/d3d11/d3d11_graphics_system.cc | 6 + src/xenia/gpu/d3d11/d3d11_resource_cache.cc | 71 + src/xenia/gpu/d3d11/d3d11_resource_cache.h | 64 + ...che.cc => d3d11_sampler_state_resource.cc} | 83 +- .../gpu/d3d11/d3d11_sampler_state_resource.h | 48 + src/xenia/gpu/d3d11/d3d11_shader.cc | 2059 ----------------- src/xenia/gpu/d3d11/d3d11_shader.h | 125 - src/xenia/gpu/d3d11/d3d11_shader_cache.cc | 45 - src/xenia/gpu/d3d11/d3d11_shader_cache.h | 46 - src/xenia/gpu/d3d11/d3d11_shader_resource.cc | 381 +++ src/xenia/gpu/d3d11/d3d11_shader_resource.h | 91 + .../gpu/d3d11/d3d11_shader_translator.cc | 1625 +++++++++++++ src/xenia/gpu/d3d11/d3d11_shader_translator.h | 125 + src/xenia/gpu/d3d11/d3d11_texture.cc | 264 --- src/xenia/gpu/d3d11/d3d11_texture.h | 78 - src/xenia/gpu/d3d11/d3d11_texture_cache.h | 61 - src/xenia/gpu/d3d11/d3d11_texture_resource.cc | 219 ++ src/xenia/gpu/d3d11/d3d11_texture_resource.h | 60 + src/xenia/gpu/d3d11/sources.gypi | 24 +- .../{xenos/registers.cc => draw_command.cc} | 44 +- src/xenia/gpu/draw_command.h | 78 + src/xenia/gpu/graphics_driver.cc | 292 ++- src/xenia/gpu/graphics_driver.h | 49 +- src/xenia/gpu/graphics_system.cc | 44 +- src/xenia/gpu/graphics_system.h | 4 +- src/xenia/gpu/nop/nop_graphics_driver.cc | 63 +- src/xenia/gpu/nop/nop_graphics_driver.h | 26 +- src/xenia/gpu/register_file.cc | 17 +- src/xenia/gpu/register_file.h | 23 +- src/xenia/gpu/resource.cc | 20 + src/xenia/gpu/resource.h | 74 + src/xenia/gpu/resource_cache.cc | 137 ++ src/xenia/gpu/resource_cache.h | 97 + src/xenia/gpu/ring_buffer_worker.cc | 741 ------ src/xenia/gpu/ring_buffer_worker.h | 81 - src/xenia/gpu/sampler_state_resource.cc | 32 + src/xenia/gpu/sampler_state_resource.h | 67 + src/xenia/gpu/shader.cc | 266 --- src/xenia/gpu/shader.h | 104 - src/xenia/gpu/shader_cache.cc | 80 - src/xenia/gpu/shader_cache.h | 56 - src/xenia/gpu/shader_resource.cc | 258 +++ src/xenia/gpu/shader_resource.h | 102 +- src/xenia/gpu/sources.gypi | 4 + src/xenia/gpu/texture.cc | 369 --- src/xenia/gpu/texture.h | 110 - 
src/xenia/gpu/texture_cache.cc | 50 - src/xenia/gpu/texture_cache.h | 50 - src/xenia/gpu/texture_resource.cc | 333 +++ src/xenia/gpu/texture_resource.h | 81 +- src/xenia/gpu/xenos/registers.h | 51 - src/xenia/gpu/xenos/sources.gypi | 2 - 70 files changed, 5923 insertions(+), 6095 deletions(-) delete mode 100644 src/xenia/gpu/buffer.cc delete mode 100644 src/xenia/gpu/buffer.h delete mode 100644 src/xenia/gpu/buffer_cache.cc delete mode 100644 src/xenia/gpu/buffer_cache.h delete mode 100644 src/xenia/gpu/d3d11/d3d11_buffer.cc delete mode 100644 src/xenia/gpu/d3d11/d3d11_buffer.h delete mode 100644 src/xenia/gpu/d3d11/d3d11_buffer_cache.cc delete mode 100644 src/xenia/gpu/d3d11/d3d11_buffer_cache.h create mode 100644 src/xenia/gpu/d3d11/d3d11_buffer_resource.cc create mode 100644 src/xenia/gpu/d3d11/d3d11_buffer_resource.h create mode 100644 src/xenia/gpu/d3d11/d3d11_resource_cache.cc create mode 100644 src/xenia/gpu/d3d11/d3d11_resource_cache.h rename src/xenia/gpu/d3d11/{d3d11_texture_cache.cc => d3d11_sampler_state_resource.cc} (51%) create mode 100644 src/xenia/gpu/d3d11/d3d11_sampler_state_resource.h delete mode 100644 src/xenia/gpu/d3d11/d3d11_shader.cc delete mode 100644 src/xenia/gpu/d3d11/d3d11_shader.h delete mode 100644 src/xenia/gpu/d3d11/d3d11_shader_cache.cc delete mode 100644 src/xenia/gpu/d3d11/d3d11_shader_cache.h create mode 100644 src/xenia/gpu/d3d11/d3d11_shader_resource.cc create mode 100644 src/xenia/gpu/d3d11/d3d11_shader_resource.h create mode 100644 src/xenia/gpu/d3d11/d3d11_shader_translator.cc create mode 100644 src/xenia/gpu/d3d11/d3d11_shader_translator.h delete mode 100644 src/xenia/gpu/d3d11/d3d11_texture.cc delete mode 100644 src/xenia/gpu/d3d11/d3d11_texture.h delete mode 100644 src/xenia/gpu/d3d11/d3d11_texture_cache.h create mode 100644 src/xenia/gpu/d3d11/d3d11_texture_resource.cc create mode 100644 src/xenia/gpu/d3d11/d3d11_texture_resource.h rename src/xenia/gpu/{xenos/registers.cc => draw_command.cc} (56%) create mode 100644 src/xenia/gpu/draw_command.h delete mode 100644 src/xenia/gpu/ring_buffer_worker.cc delete mode 100644 src/xenia/gpu/ring_buffer_worker.h create mode 100644 src/xenia/gpu/sampler_state_resource.cc create mode 100644 src/xenia/gpu/sampler_state_resource.h delete mode 100644 src/xenia/gpu/shader.cc delete mode 100644 src/xenia/gpu/shader.h delete mode 100644 src/xenia/gpu/shader_cache.cc delete mode 100644 src/xenia/gpu/shader_cache.h delete mode 100644 src/xenia/gpu/texture.cc delete mode 100644 src/xenia/gpu/texture.h delete mode 100644 src/xenia/gpu/texture_cache.cc delete mode 100644 src/xenia/gpu/texture_cache.h delete mode 100644 src/xenia/gpu/xenos/registers.h diff --git a/src/xenia/gpu/buffer.cc b/src/xenia/gpu/buffer.cc deleted file mode 100644 index 499cb43a6..000000000 --- a/src/xenia/gpu/buffer.cc +++ /dev/null @@ -1,42 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2014 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. 
* - ****************************************************************************** - */ - -#include - -#include - - -using namespace xe; -using namespace xe::gpu; -using namespace xe::gpu::xenos; - - -Buffer::Buffer( - const uint8_t* src_ptr, size_t length) : - src_(src_ptr), length_(length) { -} - -Buffer::~Buffer() { -} - -IndexBuffer::IndexBuffer(const IndexBufferInfo& info, - const uint8_t* src_ptr, size_t length) - : Buffer(src_ptr, length), - info_(info) { -} - -IndexBuffer::~IndexBuffer() {} - -VertexBuffer::VertexBuffer(const VertexBufferInfo& info, - const uint8_t* src_ptr, size_t length) - : Buffer(src_ptr, length), - info_(info) { -} - -VertexBuffer::~VertexBuffer() {} diff --git a/src/xenia/gpu/buffer.h b/src/xenia/gpu/buffer.h deleted file mode 100644 index 9c8e3c654..000000000 --- a/src/xenia/gpu/buffer.h +++ /dev/null @@ -1,91 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2014 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. * - ****************************************************************************** - */ - -#ifndef XENIA_GPU_BUFFER_H_ -#define XENIA_GPU_BUFFER_H_ - -#include -#include -#include - - -namespace xe { -namespace gpu { - - -class Buffer { -public: - Buffer(const uint8_t* src_ptr, size_t length); - virtual ~Buffer(); - - const uint8_t* src() const { return src_; } - size_t length() const { return length_; } - uint64_t hash() const { return hash_; } - - virtual bool FetchNew(uint64_t hash) = 0; - virtual bool FetchDirty(uint64_t hash) = 0; - -protected: - const uint8_t* src_; - size_t length_; - uint64_t hash_; -}; - - -struct IndexBufferInfo { - bool index_32bit; - uint32_t index_count; - uint32_t index_size; - uint32_t endianness; -}; - - -class IndexBuffer : public Buffer { -public: - IndexBuffer(const IndexBufferInfo& info, - const uint8_t* src_ptr, size_t length); - virtual ~IndexBuffer(); - -protected: - IndexBufferInfo info_; -}; - - -struct VertexBufferLayout { - uint32_t stride_words; - uint32_t element_count; - struct { - uint32_t format; - uint32_t offset_words; - uint32_t size_words; - } elements[16]; -}; - -struct VertexBufferInfo { - VertexBufferLayout layout; -}; - - -class VertexBuffer : public Buffer { -public: - VertexBuffer(const VertexBufferInfo& info, - const uint8_t* src_ptr, size_t length); - virtual ~VertexBuffer(); - -protected: - VertexBufferInfo info_; -}; - - - -} // namespace gpu -} // namespace xe - - -#endif // XENIA_GPU_BUFFER_H_ diff --git a/src/xenia/gpu/buffer_cache.cc b/src/xenia/gpu/buffer_cache.cc deleted file mode 100644 index cc963d817..000000000 --- a/src/xenia/gpu/buffer_cache.cc +++ /dev/null @@ -1,79 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2014 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. 
* - ****************************************************************************** - */ - -#include - -#include - - -using namespace std; -using namespace xe; -using namespace xe::gpu; -using namespace xe::gpu::xenos; - - -BufferCache::BufferCache() { -} - -BufferCache::~BufferCache() { - Clear(); -} - -IndexBuffer* BufferCache::FetchIndexBuffer( - const IndexBufferInfo& info, - const uint8_t* src_ptr, size_t length) { - size_t key = hash_combine(info.endianness, info.index_32bit, info.index_count, info.index_size); - size_t hash = xe_hash64(src_ptr, length); - auto it = index_buffer_map_.find(key); - if (it != index_buffer_map_.end()) { - if (hash == it->second->hash()) { - return it->second; - } else { - return it->second->FetchDirty(hash) ? it->second : nullptr; - } - } else { - auto buffer = CreateIndexBuffer(info, src_ptr, length); - index_buffer_map_.insert({ key, buffer }); - if (!buffer->FetchNew(hash)) { - return nullptr; - } - return buffer; - } -} - -VertexBuffer* BufferCache::FetchVertexBuffer( - const VertexBufferInfo& info, - const uint8_t* src_ptr, size_t length) { - size_t key = reinterpret_cast(src_ptr); - size_t hash = xe_hash64(src_ptr, length); - auto it = vertex_buffer_map_.find(key); - if (it != vertex_buffer_map_.end()) { - if (hash == it->second->hash()) { - return it->second; - } else { - return it->second->FetchDirty(hash) ? it->second : nullptr; - } - } else { - auto buffer = CreateVertexBuffer(info, src_ptr, length); - vertex_buffer_map_.insert({ key, buffer }); - if (!buffer->FetchNew(hash)) { - return nullptr; - } - return buffer; - } -} - -void BufferCache::Clear() { - for (auto it = index_buffer_map_.begin(); - it != index_buffer_map_.end(); ++it) { - auto buffer = it->second; - delete buffer; - } - index_buffer_map_.clear(); -} diff --git a/src/xenia/gpu/buffer_cache.h b/src/xenia/gpu/buffer_cache.h deleted file mode 100644 index bcba6f9de..000000000 --- a/src/xenia/gpu/buffer_cache.h +++ /dev/null @@ -1,55 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2014 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. 
* - ****************************************************************************** - */ - -#ifndef XENIA_GPU_BUFFER_CACHE_H_ -#define XENIA_GPU_BUFFER_CACHE_H_ - -#include -#include -#include - - -namespace xe { -namespace gpu { - - -class BufferCache { -public: - BufferCache(); - virtual ~BufferCache(); - - IndexBuffer* FetchIndexBuffer( - const IndexBufferInfo& info, - const uint8_t* src_ptr, size_t length); - - VertexBuffer* FetchVertexBuffer( - const VertexBufferInfo& info, - const uint8_t* src_ptr, size_t length); - - void Clear(); - -protected: - virtual IndexBuffer* CreateIndexBuffer( - const IndexBufferInfo& info, - const uint8_t* src_ptr, size_t length) = 0; - virtual VertexBuffer* CreateVertexBuffer( - const VertexBufferInfo& info, - const uint8_t* src_ptr, size_t length) = 0; - -private: - std::unordered_map index_buffer_map_; - std::unordered_map vertex_buffer_map_; -}; - - -} // namespace gpu -} // namespace xe - - -#endif // XENIA_GPU_BUFFER_CACHE_H_ diff --git a/src/xenia/gpu/buffer_resource.cc b/src/xenia/gpu/buffer_resource.cc index d6019d95f..9f9accb9b 100644 --- a/src/xenia/gpu/buffer_resource.cc +++ b/src/xenia/gpu/buffer_resource.cc @@ -15,3 +15,42 @@ using namespace xe; using namespace xe::gpu; using namespace xe::gpu::xenos; + +BufferResource::BufferResource(const MemoryRange& memory_range) + : PagedResource(memory_range) { +} + +BufferResource::~BufferResource() = default; + +int BufferResource::Prepare() { + if (!handle()) { + if (CreateHandle()) { + XELOGE("Unable to create buffer handle"); + return 1; + } + } + + if (!dirtied_) { + return 0; + } + dirtied_ = false; + + // pass dirty regions? + return InvalidateRegion(memory_range_); +} + +IndexBufferResource::IndexBufferResource(const MemoryRange& memory_range, + const Info& info) + : BufferResource(memory_range), + info_(info) { +} + +IndexBufferResource::~IndexBufferResource() = default; + +VertexBufferResource::VertexBufferResource(const MemoryRange& memory_range, + const Info& info) + : BufferResource(memory_range), + info_(info) { +} + +VertexBufferResource::~VertexBufferResource() = default; diff --git a/src/xenia/gpu/buffer_resource.h b/src/xenia/gpu/buffer_resource.h index 385a5049a..a88d1ae06 100644 --- a/src/xenia/gpu/buffer_resource.h +++ b/src/xenia/gpu/buffer_resource.h @@ -10,7 +10,8 @@ #ifndef XENIA_GPU_BUFFER_RESOURCE_H_ #define XENIA_GPU_BUFFER_RESOURCE_H_ -#include +#include +#include #include @@ -18,8 +19,76 @@ namespace xe { namespace gpu { -class BufferResource : public Resource { +class BufferResource : public PagedResource { public: + BufferResource(const MemoryRange& memory_range); + ~BufferResource() override; + + virtual int Prepare(); + +protected: + virtual int CreateHandle() = 0; + virtual int InvalidateRegion(const MemoryRange& memory_range) = 0; +}; + + +enum IndexFormat { + INDEX_FORMAT_16BIT = 0, + INDEX_FORMAT_32BIT = 1, +}; + +class IndexBufferResource : public BufferResource { +public: + struct Info { + IndexFormat format; + xenos::XE_GPU_ENDIAN endianness; + }; + + IndexBufferResource(const MemoryRange& memory_range, + const Info& info); + ~IndexBufferResource() override; + + const Info& info() const { return info_; } + + bool Equals(const void* info_ptr, size_t info_length) override { + return info_length == sizeof(Info) && + memcmp(info_ptr, &info_, info_length) == 0; + } + +protected: + Info info_; +}; + + +class VertexBufferResource : public BufferResource { +public: + struct DeclElement { + xenos::instr_fetch_vtx_t vtx_fetch; + uint32_t format; + uint32_t offset_words; + 
uint32_t size_words; + bool is_signed; + bool is_normalized; + }; + struct Info { + uint32_t stride_words; + uint32_t element_count; + DeclElement elements[16]; + }; + + VertexBufferResource(const MemoryRange& memory_range, + const Info& info); + ~VertexBufferResource() override; + + const Info& info() const { return info_; } + + bool Equals(const void* info_ptr, size_t info_length) override { + return info_length == sizeof(Info) && + memcmp(info_ptr, &info_, info_length) == 0; + } + +protected: + Info info_; }; diff --git a/src/xenia/gpu/command_processor.cc b/src/xenia/gpu/command_processor.cc index bb7a02bc7..c7a6a166b 100644 --- a/src/xenia/gpu/command_processor.cc +++ b/src/xenia/gpu/command_processor.cc @@ -9,9 +9,782 @@ #include +#include +#include +#include +#include + -using namespace std; using namespace xe; using namespace xe::gpu; using namespace xe::gpu::xenos; + +#define XETRACECP(fmt, ...) if (FLAGS_trace_ring_buffer) XELOGGPU(fmt, ##__VA_ARGS__) + + +CommandProcessor::CommandProcessor( + GraphicsSystem* graphics_system, Memory* memory) : + graphics_system_(graphics_system), memory_(memory), driver_(0) { + write_ptr_index_event_ = CreateEvent(NULL, FALSE, FALSE, NULL); + + primary_buffer_ptr_ = 0; + primary_buffer_size_ = 0; + read_ptr_index_ = 0; + read_ptr_update_freq_ = 0; + read_ptr_writeback_ptr_ = 0; + write_ptr_index_ = 0; + write_ptr_max_index_ = 0; + + LARGE_INTEGER perf_counter; + QueryPerformanceCounter(&perf_counter); + time_base_ = perf_counter.QuadPart; + counter_ = 0; +} + +CommandProcessor::~CommandProcessor() { + SetEvent(write_ptr_index_event_); + CloseHandle(write_ptr_index_event_); +} + +uint64_t CommandProcessor::QueryTime() { + LARGE_INTEGER perf_counter; + QueryPerformanceCounter(&perf_counter); + return perf_counter.QuadPart - time_base_; +} + +void CommandProcessor::Initialize(GraphicsDriver* driver, + uint32_t ptr, uint32_t page_count) { + driver_ = driver; + primary_buffer_ptr_ = ptr; + // Not sure this is correct, but it's a way to take the page_count back to + // the number of bytes allocated by the physical alloc. + uint32_t original_size = 1 << (0x1C - page_count - 1); + primary_buffer_size_ = original_size; + read_ptr_index_ = 0; + + // Tell the driver what to use for translation. + driver_->set_address_translation(primary_buffer_ptr_ & ~0x1FFFFFFF); +} + +void CommandProcessor::EnableReadPointerWriteBack(uint32_t ptr, + uint32_t block_size) { + // CP_RB_RPTR_ADDR Ring Buffer Read Pointer Address 0x70C + // ptr = RB_RPTR_ADDR, pointer to write back the address to. + read_ptr_writeback_ptr_ = (primary_buffer_ptr_ & ~0x1FFFFFFF) + ptr; + // CP_RB_CNTL Ring Buffer Control 0x704 + // block_size = RB_BLKSZ, number of quadwords read between updates of the + // read pointer. + read_ptr_update_freq_ = (uint32_t)pow(2.0, (double)block_size) / 4; +} + +void CommandProcessor::UpdateWritePointer(uint32_t value) { + write_ptr_max_index_ = MAX(write_ptr_max_index_, value); + write_ptr_index_ = value; + SetEvent(write_ptr_index_event_); +} + +void CommandProcessor::Pump() { + uint8_t* p = memory_->membase(); + + while (write_ptr_index_ == 0xBAADF00D || + read_ptr_index_ == write_ptr_index_) { + // Check if the pointer has moved. + // We wait a short bit here to yield time. Since we are also running the + // main window display we don't want to pause too long, though. 
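+    // (UpdateWritePointer() signals write_ptr_index_event_, so this wait
+    //  wakes as soon as the guest advances the write pointer; the 1ms
+    //  timeout just keeps the caller responsive while the ring is idle.)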
+    // YieldProcessor();
+    const int wait_time_ms = 1;
+    if (WaitForSingleObject(write_ptr_index_event_,
+                            wait_time_ms) == WAIT_TIMEOUT) {
+      return;
+    }
+  }
+
+  // Copy to locals so we don't have to worry about them changing out from
+  // under us.
+  uint32_t write_ptr_index = write_ptr_index_;
+  uint32_t write_ptr_max_index = write_ptr_max_index_;
+  if (read_ptr_index_ == write_ptr_index) {
+    return;
+  }
+
+  // Process the new commands.
+  XETRACECP("Command processor thread work");
+
+  // Execute. Note that we handle wraparound transparently.
+  ExecutePrimaryBuffer(read_ptr_index_, write_ptr_index);
+  read_ptr_index_ = write_ptr_index;
+
+  // TODO(benvanik): use read_ptr_update_freq_ and only issue after moving
+  // that many indices.
+  if (read_ptr_writeback_ptr_) {
+    XESETUINT32BE(p + read_ptr_writeback_ptr_, read_ptr_index_);
+  }
+}
+
+void CommandProcessor::ExecutePrimaryBuffer(
+    uint32_t start_index, uint32_t end_index) {
+  SCOPE_profile_cpu_f("gpu");
+
+  // Adjust pointer base.
+  uint32_t ptr = primary_buffer_ptr_ + start_index * 4;
+  ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (ptr & 0x1FFFFFFF);
+  uint32_t end_ptr = primary_buffer_ptr_ + end_index * 4;
+  end_ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (end_ptr & 0x1FFFFFFF);
+
+  XETRACECP("[%.8X] ExecutePrimaryBuffer(%dw -> %dw)",
+            ptr, start_index, end_index);
+
+  // Execute commands!
+  PacketArgs args;
+  args.ptr = ptr;
+  args.base_ptr = primary_buffer_ptr_;
+  args.max_address = primary_buffer_ptr_ + primary_buffer_size_;
+  args.ptr_mask = (primary_buffer_size_ / 4) - 1;
+  uint32_t n = 0;
+  while (args.ptr != end_ptr) {
+    n += ExecutePacket(args);
+    XEASSERT(args.ptr < args.max_address);
+  }
+  if (end_index > start_index) {
+    XEASSERT(n == (end_index - start_index));
+  }
+
+  XETRACECP(" ExecutePrimaryBuffer End");
+}
+
+void CommandProcessor::ExecuteIndirectBuffer(uint32_t ptr, uint32_t length) {
+  XETRACECP("[%.8X] ExecuteIndirectBuffer(%dw)", ptr, length);
+
+  // Execute commands!
+  PacketArgs args;
+  args.ptr = ptr;
+  args.base_ptr = ptr;
+  args.max_address = ptr + length * 4;
+  args.ptr_mask = 0;
+  for (uint32_t n = 0; n < length;) {
+    n += ExecutePacket(args);
+    XEASSERT(n <= length);
+  }
+
+  XETRACECP(" ExecuteIndirectBuffer End");
+}
+
+#define LOG_DATA(count) \
+  for (uint32_t __m = 0; __m < count; __m++) { \
+    XETRACECP("[%.8X] %.8X", \
+              packet_ptr + (1 + __m) * 4, \
+              XEGETUINT32BE(packet_base + 1 * 4 + __m * 4)); \
+  }
+
+void CommandProcessor::AdvancePtr(PacketArgs& args, uint32_t n) {
+  args.ptr = args.ptr + n * 4;
+  if (args.ptr_mask) {
+    args.ptr =
+        args.base_ptr + (((args.ptr - args.base_ptr) / 4) & args.ptr_mask) * 4;
+  }
+}
+#define ADVANCE_PTR(n) AdvancePtr(args, n)
+#define PEEK_PTR() \
+    XEGETUINT32BE(p + args.ptr)
+#define READ_PTR() \
+    XEGETUINT32BE(p + args.ptr); ADVANCE_PTR(1);
+
+uint32_t CommandProcessor::ExecutePacket(PacketArgs& args) {
+  uint8_t* p = memory_->membase();
+  RegisterFile* regs = driver_->register_file();
+
+  uint32_t packet_ptr = args.ptr;
+  const uint8_t* packet_base = p + packet_ptr;
+  const uint32_t packet = PEEK_PTR();
+  ADVANCE_PTR(1);
+  const uint32_t packet_type = packet >> 30;
+  if (packet == 0) {
+    XETRACECP("[%.8X] Packet(%.8X): 0?",
+              packet_ptr, packet);
+    return 1;
+  }
+
+  switch (packet_type) {
+  case 0x00:
+    {
+      // Type-0 packet.
+      // Write count registers in sequence to the registers starting at
+      // (base_index << 2).
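+      //
+      // Worked example (illustrative only, using the field layout above):
+      // packet 0x00030004 decodes as
+      //   count         = ((0x00030004 >> 16) & 0x3FFF) + 1 = 4
+      //   write_one_reg = (0x00030004 >> 15) & 0x1          = 0
+      //   base_index    = 0x00030004 & 0x7FFF               = 0x0004
+      // i.e. the next four payload dwords land in registers 4..7.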
+ XETRACECP("[%.8X] Packet(%.8X): set registers:", + packet_ptr, packet); + uint32_t count = ((packet >> 16) & 0x3FFF) + 1; + uint32_t base_index = (packet & 0x7FFF); + uint32_t write_one_reg = (packet >> 15) & 0x1; + for (uint32_t m = 0; m < count; m++) { + uint32_t reg_data = PEEK_PTR(); + uint32_t target_index = write_one_reg ? base_index : base_index + m; + const char* reg_name = regs->GetRegisterName(target_index); + XETRACECP("[%.8X] %.8X -> %.4X %s", + args.ptr, + reg_data, target_index, reg_name ? reg_name : ""); + ADVANCE_PTR(1); + WriteRegister(packet_ptr, target_index, reg_data); + } + return 1 + count; + } + break; + case 0x01: + { + // Type-1 packet. + // Contains two registers of data. Type-0 should be more common. + XETRACECP("[%.8X] Packet(%.8X): set registers:", + packet_ptr, packet); + uint32_t reg_index_1 = packet & 0x7FF; + uint32_t reg_index_2 = (packet >> 11) & 0x7FF; + uint32_t reg_ptr_1 = args.ptr; + uint32_t reg_data_1 = READ_PTR(); + uint32_t reg_ptr_2 = args.ptr; + uint32_t reg_data_2 = READ_PTR(); + const char* reg_name_1 = regs->GetRegisterName(reg_index_1); + const char* reg_name_2 = regs->GetRegisterName(reg_index_2); + XETRACECP("[%.8X] %.8X -> %.4X %s", + reg_ptr_1, + reg_data_1, reg_index_1, reg_name_1 ? reg_name_1 : ""); + XETRACECP("[%.8X] %.8X -> %.4X %s", + reg_ptr_2, + reg_data_2, reg_index_2, reg_name_2 ? reg_name_2 : ""); + WriteRegister(packet_ptr, reg_index_1, reg_data_1); + WriteRegister(packet_ptr, reg_index_2, reg_data_2); + return 1 + 2; + } + break; + case 0x02: + // Type-2 packet. + // No-op. Do nothing. + XETRACECP("[%.8X] Packet(%.8X): padding", + packet_ptr, packet); + return 1; + case 0x03: + { + // Type-3 packet. + uint32_t count = ((packet >> 16) & 0x3FFF) + 1; + uint32_t opcode = (packet >> 8) & 0x7F; + // & 1 == predicate, maybe? + + switch (opcode) { + case PM4_ME_INIT: + // initialize CP's micro-engine + XETRACECP("[%.8X] Packet(%.8X): PM4_ME_INIT", + packet_ptr, packet); + LOG_DATA(count); + ADVANCE_PTR(count); + break; + + case PM4_NOP: + // skip N 32-bit words to get to the next packet + // No-op, ignore some data. + XETRACECP("[%.8X] Packet(%.8X): PM4_NOP", + packet_ptr, packet); + LOG_DATA(count); + ADVANCE_PTR(count); + break; + + case PM4_INTERRUPT: + // generate interrupt from the command stream + { + XETRACECP("[%.8X] Packet(%.8X): PM4_INTERRUPT", + packet_ptr, packet); + LOG_DATA(count); + uint32_t cpu_mask = READ_PTR(); + for (int n = 0; n < 6; n++) { + if (cpu_mask & (1 << n)) { + graphics_system_->DispatchInterruptCallback(1, n); + } + } + } + break; + + case PM4_INDIRECT_BUFFER: + // indirect buffer dispatch + { + uint32_t list_ptr = READ_PTR(); + uint32_t list_length = READ_PTR(); + XETRACECP("[%.8X] Packet(%.8X): PM4_INDIRECT_BUFFER %.8X (%dw)", + packet_ptr, packet, list_ptr, list_length); + ExecuteIndirectBuffer(GpuToCpu(list_ptr), list_length); + } + break; + + case PM4_WAIT_REG_MEM: + // wait until a register or memory location is a specific value + { + XETRACECP("[%.8X] Packet(%.8X): PM4_WAIT_REG_MEM", + packet_ptr, packet); + LOG_DATA(count); + uint32_t wait_info = READ_PTR(); + uint32_t poll_reg_addr = READ_PTR(); + uint32_t ref = READ_PTR(); + uint32_t mask = READ_PTR(); + uint32_t wait = READ_PTR(); + bool matched = false; + do { + uint32_t value; + if (wait_info & 0x10) { + // Memory. + XE_GPU_ENDIAN endianness = (XE_GPU_ENDIAN)(poll_reg_addr & 0x3); + poll_reg_addr &= ~0x3; + value = XEGETUINT32LE(p + GpuToCpu(packet_ptr, poll_reg_addr)); + value = GpuSwap(value, endianness); + } else { + // Register. 
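+                // The compare select below is a small predicate over
+                // (value & mask) vs. ref. An equivalent standalone sketch
+                // (illustrative only; same 3-bit encoding as wait_info):
+                //   bool Matched(uint32_t func, uint32_t v, uint32_t ref) {
+                //     switch (func & 0x7) {
+                //       case 0x0: return false;     // never
+                //       case 0x1: return v < ref;   // less than
+                //       case 0x2: return v <= ref;  // less than or equal
+                //       case 0x3: return v == ref;  // equal
+                //       case 0x4: return v != ref;  // not equal
+                //       case 0x5: return v >= ref;  // greater than or equal
+                //       case 0x6: return v > ref;   // greater than
+                //       default:  return true;      // always
+                //     }
+                //   }
+                // where v is the masked value, i.e.
+                // Matched(wait_info, value & mask, ref).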
+ XEASSERT(poll_reg_addr < RegisterFile::kRegisterCount); + + if (poll_reg_addr == XE_GPU_REG_COHER_STATUS_HOST) { + // Waiting for coherency. We should have all the info we need + // now (base+size+mode), so kick it off. + MakeCoherent(); + } + + value = regs->values[poll_reg_addr].u32; + } + switch (wait_info & 0x7) { + case 0x0: // Never. + matched = false; + break; + case 0x1: // Less than reference. + matched = (value & mask) < ref; + break; + case 0x2: // Less than or equal to reference. + matched = (value & mask) <= ref; + break; + case 0x3: // Equal to reference. + matched = (value & mask) == ref; + break; + case 0x4: // Not equal to reference. + matched = (value & mask) != ref; + break; + case 0x5: // Greater than or equal to reference. + matched = (value & mask) >= ref; + break; + case 0x6: // Greater than reference. + matched = (value & mask) > ref; + break; + case 0x7: // Always + matched = true; + break; + } + if (!matched) { + // Wait. + if (wait >= 0x100) { + Sleep(wait / 0x100); + } else { + SwitchToThread(); + } + } + } while (!matched); + } + break; + + case PM4_REG_RMW: + // register read/modify/write + // ? (used during shader upload and edram setup) + { + XETRACECP("[%.8X] Packet(%.8X): PM4_REG_RMW", + packet_ptr, packet); + LOG_DATA(count); + uint32_t rmw_info = READ_PTR(); + uint32_t and_mask = READ_PTR(); + uint32_t or_mask = READ_PTR(); + uint32_t value = regs->values[rmw_info & 0x1FFF].u32; + if ((rmw_info >> 30) & 0x1) { + // | reg + value |= regs->values[or_mask & 0x1FFF].u32; + } else { + // | imm + value |= or_mask; + } + if ((rmw_info >> 31) & 0x1) { + // & reg + value &= regs->values[and_mask & 0x1FFF].u32; + } else { + // & imm + value &= and_mask; + } + WriteRegister(packet_ptr, rmw_info & 0x1FFF, value); + } + break; + + case PM4_COND_WRITE: + // conditional write to memory or register + { + XETRACECP("[%.8X] Packet(%.8X): PM4_COND_WRITE", + packet_ptr, packet); + LOG_DATA(count); + uint32_t wait_info = READ_PTR(); + uint32_t poll_reg_addr = READ_PTR(); + uint32_t ref = READ_PTR(); + uint32_t mask = READ_PTR(); + uint32_t write_reg_addr = READ_PTR(); + uint32_t write_data = READ_PTR(); + uint32_t value; + if (wait_info & 0x10) { + // Memory. + XE_GPU_ENDIAN endianness = (XE_GPU_ENDIAN)(poll_reg_addr & 0x3); + poll_reg_addr &= ~0x3; + value = XEGETUINT32LE(p + GpuToCpu(packet_ptr, poll_reg_addr)); + value = GpuSwap(value, endianness); + } else { + // Register. + XEASSERT(poll_reg_addr < RegisterFile::kRegisterCount); + value = regs->values[poll_reg_addr].u32; + } + bool matched = false; + switch (wait_info & 0x7) { + case 0x0: // Never. + matched = false; + break; + case 0x1: // Less than reference. + matched = (value & mask) < ref; + break; + case 0x2: // Less than or equal to reference. + matched = (value & mask) <= ref; + break; + case 0x3: // Equal to reference. + matched = (value & mask) == ref; + break; + case 0x4: // Not equal to reference. + matched = (value & mask) != ref; + break; + case 0x5: // Greater than or equal to reference. + matched = (value & mask) >= ref; + break; + case 0x6: // Greater than reference. + matched = (value & mask) > ref; + break; + case 0x7: // Always + matched = true; + break; + } + if (matched) { + // Write. + if (wait_info & 0x100) { + // Memory. + XE_GPU_ENDIAN endianness = (XE_GPU_ENDIAN)(write_reg_addr & 0x3); + write_reg_addr &= ~0x3; + write_data = GpuSwap(write_data, endianness); + XESETUINT32LE(p + GpuToCpu(packet_ptr, write_reg_addr), + write_data); + } else { + // Register. 
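+                // (Same scheme as the poll source: wait_info bit 8 (0x100)
+                // selects a memory destination here, while bit 4 (0x10)
+                // selected the memory poll source above; otherwise the value
+                // goes to the register file.)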
+ WriteRegister(packet_ptr, write_reg_addr, write_data); + } + } + } + break; + + case PM4_EVENT_WRITE: + // generate an event that creates a write to memory when completed + { + XETRACECP("[%.8X] Packet(%.8X): PM4_EVENT_WRITE (unimplemented!)", + packet_ptr, packet); + LOG_DATA(count); + uint32_t initiator = READ_PTR(); + if (count == 1) { + // Just an event flag? Where does this write? + } else { + // Write to an address. + XEASSERTALWAYS(); + ADVANCE_PTR(count - 1); + } + } + break; + case PM4_EVENT_WRITE_SHD: + // generate a VS|PS_done event + { + XETRACECP("[%.8X] Packet(%.8X): PM4_EVENT_WRITE_SHD", + packet_ptr, packet); + LOG_DATA(count); + uint32_t initiator = READ_PTR(); + uint32_t address = READ_PTR(); + uint32_t value = READ_PTR(); + // Writeback initiator. + WriteRegister(packet_ptr, XE_GPU_REG_VGT_EVENT_INITIATOR, + initiator & 0x1F); + uint32_t data_value; + if ((initiator >> 31) & 0x1) { + // Write counter (GPU vblank counter?). + data_value = counter_; + } else { + // Write value. + data_value = value; + } + XE_GPU_ENDIAN endianness = (XE_GPU_ENDIAN)(address & 0x3); + address &= ~0x3; + data_value = GpuSwap(data_value, endianness); + XESETUINT32LE(p + GpuToCpu(address), data_value); + } + break; + + case PM4_DRAW_INDX: + // initiate fetch of index buffer and draw + { + XETRACECP("[%.8X] Packet(%.8X): PM4_DRAW_INDX", + packet_ptr, packet); + LOG_DATA(count); + // d0 = viz query info + uint32_t d0 = READ_PTR(); + uint32_t d1 = READ_PTR(); + uint32_t index_count = d1 >> 16; + uint32_t prim_type = d1 & 0x3F; + uint32_t src_sel = (d1 >> 6) & 0x3; + if (!driver_->PrepareDraw(draw_command_)) { + draw_command_.prim_type = (XE_GPU_PRIMITIVE_TYPE)prim_type; + draw_command_.start_index = 0; + draw_command_.index_count = index_count; + draw_command_.base_vertex = 0; + if (src_sel == 0x0) { + // Indexed draw. + // TODO(benvanik): detect subregions of larger index buffers! + uint32_t index_base = READ_PTR(); + uint32_t index_size = READ_PTR(); + uint32_t endianness = index_size >> 29; + index_size &= 0x00FFFFFF; + bool index_32bit = (d1 >> 11) & 0x1; + index_size *= index_32bit ? 4 : 2; + driver_->PrepareDrawIndexBuffer( + draw_command_, + index_base, index_size, + (XE_GPU_ENDIAN)endianness, + index_32bit ? INDEX_FORMAT_32BIT : INDEX_FORMAT_16BIT); + } else if (src_sel == 0x2) { + // Auto draw. + draw_command_.index_buffer = nullptr; + } else { + // Unknown source select. 
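+            // (Only src_sel 0x0, an explicitly fetched index buffer, and
+            // src_sel 0x2, auto-generated indices, are handled above; any
+            // other source select trips the assert below.)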
+ XEASSERTALWAYS(); + } + driver_->Draw(draw_command_); + } else { + if (src_sel == 0x0) { + ADVANCE_PTR(2); // skip + } + } + } + break; + case PM4_DRAW_INDX_2: + // draw using supplied indices in packet + { + XETRACECP("[%.8X] Packet(%.8X): PM4_DRAW_INDX_2", + packet_ptr, packet); + LOG_DATA(count); + uint32_t d0 = READ_PTR(); + uint32_t index_count = d0 >> 16; + uint32_t prim_type = d0 & 0x3F; + uint32_t src_sel = (d0 >> 6) & 0x3; + XEASSERT(src_sel == 0x2); // 'SrcSel=AutoIndex' + if (!driver_->PrepareDraw(draw_command_)) { + draw_command_.prim_type = (XE_GPU_PRIMITIVE_TYPE)prim_type; + draw_command_.start_index = 0; + draw_command_.index_count = index_count; + draw_command_.base_vertex = 0; + draw_command_.index_buffer = nullptr; + driver_->Draw(draw_command_); + } + } + break; + + case PM4_SET_CONSTANT: + // load constant into chip and to memory + { + XETRACECP("[%.8X] Packet(%.8X): PM4_SET_CONSTANT", + packet_ptr, packet); + // PM4_REG(reg) ((0x4 << 16) | (GSL_HAL_SUBBLOCK_OFFSET(reg))) + // reg - 0x2000 + uint32_t offset_type = READ_PTR(); + uint32_t index = offset_type & 0x7FF; + uint32_t type = (offset_type >> 16) & 0xFF; + switch (type) { + case 0x4: // REGISTER + index += 0x2000; // registers + for (uint32_t n = 0; n < count - 1; n++, index++) { + uint32_t data = READ_PTR(); + const char* reg_name = regs->GetRegisterName(index); + XETRACECP("[%.8X] %.8X -> %.4X %s", + packet_ptr + (1 + n) * 4, + data, index, reg_name ? reg_name : ""); + WriteRegister(packet_ptr, index, data); + } + break; + default: + XEASSERTALWAYS(); + break; + } + } + break; + case PM4_LOAD_ALU_CONSTANT: + // load constants from memory + { + XETRACECP("[%.8X] Packet(%.8X): PM4_LOAD_ALU_CONSTANT", + packet_ptr, packet); + uint32_t address = READ_PTR(); + address &= 0x3FFFFFFF; + uint32_t offset_type = READ_PTR(); + uint32_t index = offset_type & 0x7FF; + uint32_t size = READ_PTR(); + size &= 0xFFF; + index += 0x4000; // alu constants + for (uint32_t n = 0; n < size; n++, index++) { + uint32_t data = XEGETUINT32BE( + p + GpuToCpu(packet_ptr, address + n * 4)); + const char* reg_name = regs->GetRegisterName(index); + XETRACECP("[%.8X] %.8X -> %.4X %s", + packet_ptr, + data, index, reg_name ? reg_name : ""); + WriteRegister(packet_ptr, index, data); + } + } + break; + + case PM4_IM_LOAD: + // load sequencer instruction memory (pointer-based) + { + XETRACECP("[%.8X] Packet(%.8X): PM4_IM_LOAD", + packet_ptr, packet); + LOG_DATA(count); + uint32_t addr_type = READ_PTR(); + uint32_t type = addr_type & 0x3; + uint32_t addr = addr_type & ~0x3; + uint32_t start_size = READ_PTR(); + uint32_t start = start_size >> 16; + uint32_t size = start_size & 0xFFFF; // dwords + XEASSERT(start == 0); + driver_->LoadShader((XE_GPU_SHADER_TYPE)type, + GpuToCpu(packet_ptr, addr), size * 4, start); + } + break; + case PM4_IM_LOAD_IMMEDIATE: + // load sequencer instruction memory (code embedded in packet) + { + XETRACECP("[%.8X] Packet(%.8X): PM4_IM_LOAD_IMMEDIATE", + packet_ptr, packet); + LOG_DATA(count); + uint32_t type = READ_PTR(); + uint32_t start_size = READ_PTR(); + uint32_t start = start_size >> 16; + uint32_t size = start_size & 0xFFFF; // dwords + XEASSERT(start == 0); + // TODO(benvanik): figure out if this could wrap. 
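+          // A wrap-aware guard could look like this (illustrative only;
+          // mirrors the ring arithmetic in AdvancePtr above):
+          //   bool wraps = args.ptr_mask &&
+          //       ((args.ptr - args.base_ptr) / 4 + size) >
+          //           args.ptr_mask + 1;
+          // For now the assert below simply requires the embedded shader
+          // words to be contiguous.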
+ XEASSERT(args.ptr + size * 4 < args.max_address); + driver_->LoadShader((XE_GPU_SHADER_TYPE)type, + args.ptr, size * 4, start); + ADVANCE_PTR(size); + } + break; + + case PM4_INVALIDATE_STATE: + // selective invalidation of state pointers + { + XETRACECP("[%.8X] Packet(%.8X): PM4_INVALIDATE_STATE", + packet_ptr, packet); + LOG_DATA(count); + uint32_t mask = READ_PTR(); + //driver_->InvalidateState(mask); + } + break; + + case PM4_SET_BIN_MASK_LO: + { + uint32_t value = READ_PTR(); + XETRACECP("[%.8X] Packet(%.8X): PM4_SET_BIN_MASK_LO = %.8X", + packet_ptr, packet, value); + } + break; + case PM4_SET_BIN_MASK_HI: + { + uint32_t value = READ_PTR(); + XETRACECP("[%.8X] Packet(%.8X): PM4_SET_BIN_MASK_HI = %.8X", + packet_ptr, packet, value); + } + break; + case PM4_SET_BIN_SELECT_LO: + { + uint32_t value = READ_PTR(); + XETRACECP("[%.8X] Packet(%.8X): PM4_SET_BIN_SELECT_LO = %.8X", + packet_ptr, packet, value); + } + break; + case PM4_SET_BIN_SELECT_HI: + { + uint32_t value = READ_PTR(); + XETRACECP("[%.8X] Packet(%.8X): PM4_SET_BIN_SELECT_HI = %.8X", + packet_ptr, packet, value); + } + break; + + // Ignored packets - useful if breaking on the default handler below. + case 0x50: // 0xC0015000 usually 2 words, 0xFFFFFFFF / 0x00000000 + XETRACECP("[%.8X] Packet(%.8X): unknown!", + packet_ptr, packet); + LOG_DATA(count); + ADVANCE_PTR(count); + break; + + default: + XETRACECP("[%.8X] Packet(%.8X): unknown!", + packet_ptr, packet); + LOG_DATA(count); + ADVANCE_PTR(count); + break; + } + + return 1 + count; + } + break; + } + + return 0; +} + +void CommandProcessor::WriteRegister( + uint32_t packet_ptr, uint32_t index, uint32_t value) { + RegisterFile* regs = driver_->register_file(); + XEASSERT(index < RegisterFile::kRegisterCount); + regs->values[index].u32 = value; + + // If this is a COHER register, set the dirty flag. + // This will block the command processor the next time it WAIT_MEM_REGs and + // allow us to synchronize the memory. + if (index == XE_GPU_REG_COHER_STATUS_HOST) { + regs->values[index].u32 |= 0x80000000ul; + } + + // Scratch register writeback. + if (index >= XE_GPU_REG_SCRATCH_REG0 && index <= XE_GPU_REG_SCRATCH_REG7) { + uint32_t scratch_reg = index - XE_GPU_REG_SCRATCH_REG0; + if ((1 << scratch_reg) & regs->values[XE_GPU_REG_SCRATCH_UMSK].u32) { + // Enabled - write to address. + uint8_t* p = memory_->membase(); + uint32_t scratch_addr = regs->values[XE_GPU_REG_SCRATCH_ADDR].u32; + uint32_t mem_addr = scratch_addr + (scratch_reg * 4); + XESETUINT32BE(p + GpuToCpu(primary_buffer_ptr_, mem_addr), value); + } + } +} + +void CommandProcessor::MakeCoherent() { + RegisterFile* regs = driver_->register_file(); + auto status_host = regs->values[XE_GPU_REG_COHER_STATUS_HOST].u32; + auto base_host = regs->values[XE_GPU_REG_COHER_BASE_HOST].u32; + auto size_host = regs->values[XE_GPU_REG_COHER_SIZE_HOST].u32; + + // Status host often has 0x01000000 or 0x03000000. + // This is likely toggling VC (vertex cache) or TC (texture cache). + // Or, it also has a direction in here maybe - there is probably + // some way to check for dest coherency (what all the COHER_DEST_BASE_* + // registers are for). + + // TODO(benvanik): notify resource cache of base->size and type. + XETRACECP("Make %.8X -> %.8X (%db) coherent", + base_host, base_host + size_host, size_host); + driver_->resource_cache()->SyncRange(base_host, size_host); + + // Mark coherent. 
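+  // (Bit 31 is the dirty flag ORed in by WriteRegister whenever the guest
+  // touches COHER_STATUS_HOST; clearing it here releases any
+  // PM4_WAIT_REG_MEM that is polling on that register.)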
+ status_host &= ~0x80000000ul; + regs->values[XE_GPU_REG_COHER_STATUS_HOST].u32 = status_host; +} diff --git a/src/xenia/gpu/command_processor.h b/src/xenia/gpu/command_processor.h index 65d5dfc71..ba081aefb 100644 --- a/src/xenia/gpu/command_processor.h +++ b/src/xenia/gpu/command_processor.h @@ -11,15 +11,70 @@ #define XENIA_GPU_COMMAND_PROCESSOR_H_ #include +#include +#include #include namespace xe { namespace gpu { +class GraphicsDriver; +class GraphicsSystem; + class CommandProcessor { public: + CommandProcessor(GraphicsSystem* graphics_system, Memory* memory); + virtual ~CommandProcessor(); + + Memory* memory() const { return memory_; } + + uint64_t QueryTime(); + uint32_t counter() const { return counter_; } + void increment_counter() { counter_++; } + + void Initialize(GraphicsDriver* driver, uint32_t ptr, uint32_t page_count); + void EnableReadPointerWriteBack(uint32_t ptr, uint32_t block_size); + + void UpdateWritePointer(uint32_t value); + + void Pump(); + +private: + typedef struct { + uint32_t ptr; + uint32_t base_ptr; + uint32_t max_address; + uint32_t ptr_mask; + } PacketArgs; + + void AdvancePtr(PacketArgs& args, uint32_t n); + void ExecutePrimaryBuffer(uint32_t start_index, uint32_t end_index); + void ExecuteIndirectBuffer(uint32_t ptr, uint32_t length); + uint32_t ExecutePacket(PacketArgs& args); + void WriteRegister(uint32_t packet_ptr, uint32_t index, uint32_t value); + void MakeCoherent(); + + Memory* memory_; + GraphicsSystem* graphics_system_; + GraphicsDriver* driver_; + + uint64_t time_base_; + uint32_t counter_; + + uint32_t primary_buffer_ptr_; + uint32_t primary_buffer_size_; + + uint32_t read_ptr_index_; + uint32_t read_ptr_update_freq_; + uint32_t read_ptr_writeback_ptr_; + + HANDLE write_ptr_index_event_; + volatile uint32_t write_ptr_index_; + volatile uint32_t write_ptr_max_index_; + + DrawCommand draw_command_; }; diff --git a/src/xenia/gpu/d3d11/d3d11_buffer.cc b/src/xenia/gpu/d3d11/d3d11_buffer.cc deleted file mode 100644 index 84c0d901e..000000000 --- a/src/xenia/gpu/d3d11/d3d11_buffer.cc +++ /dev/null @@ -1,150 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2014 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. 
*
- ******************************************************************************
- */
-
-#include
-
-#include
-#include
-
-
-using namespace xe;
-using namespace xe::gpu;
-using namespace xe::gpu::d3d11;
-using namespace xe::gpu::xenos;
-
-
-D3D11IndexBuffer::D3D11IndexBuffer(
-    D3D11BufferCache* buffer_cache,
-    const IndexBufferInfo& info,
-    const uint8_t* src_ptr, size_t length)
-    : IndexBuffer(info, src_ptr, length),
-      buffer_cache_(buffer_cache),
-      handle_(nullptr) {
-}
-
-D3D11IndexBuffer::~D3D11IndexBuffer() {
-  XESAFERELEASE(handle_);
-}
-
-bool D3D11IndexBuffer::FetchNew(uint64_t hash) {
-  hash_ = hash;
-
-  D3D11_BUFFER_DESC buffer_desc;
-  xe_zero_struct(&buffer_desc, sizeof(buffer_desc));
-  buffer_desc.ByteWidth = info_.index_size;
-  buffer_desc.Usage = D3D11_USAGE_DYNAMIC;
-  buffer_desc.BindFlags = D3D11_BIND_INDEX_BUFFER;
-  buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
-  HRESULT hr = buffer_cache_->device()->CreateBuffer(&buffer_desc, NULL, &handle_);
-  if (FAILED(hr)) {
-    XELOGW("D3D11: failed to create index buffer");
-    return false;
-  }
-
-  return FetchDirty(hash);
-}
-
-bool D3D11IndexBuffer::FetchDirty(uint64_t hash) {
-  hash_ = hash;
-
-  // All that's done so far:
-  XEASSERT(info_.endianness == 0x2);
-
-  D3D11_MAPPED_SUBRESOURCE res;
-  HRESULT hr = buffer_cache_->context()->Map(
-      handle_, 0, D3D11_MAP_WRITE_DISCARD, 0, &res);
-  if (FAILED(hr)) {
-    XELOGE("D3D11: unable to map index buffer");
-    return false;
-  }
-
-  if (info_.index_32bit) {
-    const uint32_t* src = reinterpret_cast<const uint32_t*>(src_);
-    uint32_t* dest = reinterpret_cast<uint32_t*>(res.pData);
-    for (uint32_t n = 0; n < info_.index_count; n++) {
-      uint32_t d = { XESWAP32(src[n]) };
-      dest[n] = d;
-    }
-  } else {
-    const uint16_t* src = reinterpret_cast<const uint16_t*>(src_);
-    uint16_t* dest = reinterpret_cast<uint16_t*>(res.pData);
-    for (uint32_t n = 0; n < info_.index_count; n++) {
-      uint16_t d = XESWAP16(src[n]);
-      dest[n] = d;
-    }
-  }
-  buffer_cache_->context()->Unmap(handle_, 0);
-
-  return true;
-}
-
-
-D3D11VertexBuffer::D3D11VertexBuffer(
-    D3D11BufferCache* buffer_cache,
-    const VertexBufferInfo& info,
-    const uint8_t* src_ptr, size_t length)
-    : VertexBuffer(info, src_ptr, length),
-      buffer_cache_(buffer_cache),
-      handle_(nullptr) {
-}
-
-D3D11VertexBuffer::~D3D11VertexBuffer() {
-  XESAFERELEASE(handle_);
-}
-
-bool D3D11VertexBuffer::FetchNew(uint64_t hash) {
-  hash_ = hash;
-
-  D3D11_BUFFER_DESC buffer_desc;
-  xe_zero_struct(&buffer_desc, sizeof(buffer_desc));
-  buffer_desc.ByteWidth = static_cast<UINT>(length_);
-  buffer_desc.Usage = D3D11_USAGE_DYNAMIC;
-  buffer_desc.BindFlags = D3D11_BIND_VERTEX_BUFFER;
-  buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
-  HRESULT hr = buffer_cache_->device()->CreateBuffer(&buffer_desc, NULL, &handle_);
-  if (FAILED(hr)) {
-    XELOGW("D3D11: failed to create index buffer");
-    return false;
-  }
-
-  return FetchDirty(hash);
-}
-
-bool D3D11VertexBuffer::FetchDirty(uint64_t hash) {
-  hash_ = hash;
-
-  D3D11_MAPPED_SUBRESOURCE res;
-  HRESULT hr = buffer_cache_->context()->Map(
-      handle_, 0, D3D11_MAP_WRITE_DISCARD, 0, &res);
-  if (FAILED(hr)) {
-    XELOGE("D3D11: unable to map vertex buffer");
-    return false;
-  }
-  uint8_t* dest = reinterpret_cast<uint8_t*>(res.pData);
-
-  // TODO(benvanik): rewrite to be faster/special case common/etc
-  uint32_t stride = info_.layout.stride_words;
-  size_t count = (length_ / 4) / stride;
-  for (size_t n = 0; n < info_.layout.element_count; n++) {
-    const auto& el = info_.layout.elements[n];
-    const uint32_t* src_ptr = (const uint32_t*)(src_ + el.offset_words * 4);
-    uint32_t* dest_ptr =
(uint32_t*)(dest + el.offset_words * 4); - uint32_t o = 0; - for (uint32_t i = 0; i < count; i++) { - for (uint32_t j = 0; j < el.size_words; j++) { - dest_ptr[o + j] = XESWAP32(src_ptr[o + j]); - } - o += stride; - } - } - - - buffer_cache_->context()->Unmap(handle_, 0); - return true; -} diff --git a/src/xenia/gpu/d3d11/d3d11_buffer.h b/src/xenia/gpu/d3d11/d3d11_buffer.h deleted file mode 100644 index 924fb3da4..000000000 --- a/src/xenia/gpu/d3d11/d3d11_buffer.h +++ /dev/null @@ -1,69 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2014 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. * - ****************************************************************************** - */ - -#ifndef XENIA_GPU_D3D11_D3D11_BUFFER_H_ -#define XENIA_GPU_D3D11_D3D11_BUFFER_H_ - -#include - -#include -#include - -#include - - -namespace xe { -namespace gpu { -namespace d3d11 { - -class D3D11BufferCache; - - -class D3D11IndexBuffer : public IndexBuffer { -public: - D3D11IndexBuffer(D3D11BufferCache* buffer_cache, - const IndexBufferInfo& info, - const uint8_t* src_ptr, size_t length); - virtual ~D3D11IndexBuffer(); - - ID3D11Buffer* handle() const { return handle_; } - - bool FetchNew(uint64_t hash) override; - bool FetchDirty(uint64_t hash) override; - -private: - D3D11BufferCache* buffer_cache_; - ID3D11Buffer* handle_; -}; - - -class D3D11VertexBuffer : public VertexBuffer { -public: - D3D11VertexBuffer(D3D11BufferCache* buffer_cache, - const VertexBufferInfo& info, - const uint8_t* src_ptr, size_t length); - virtual ~D3D11VertexBuffer(); - - ID3D11Buffer* handle() const { return handle_; } - - bool FetchNew(uint64_t hash) override; - bool FetchDirty(uint64_t hash) override; - -private: - D3D11BufferCache* buffer_cache_; - ID3D11Buffer* handle_; -}; - - -} // namespace d3d11 -} // namespace gpu -} // namespace xe - - -#endif // XENIA_GPU_D3D11_D3D11_BUFFER_H_ diff --git a/src/xenia/gpu/d3d11/d3d11_buffer_cache.cc b/src/xenia/gpu/d3d11/d3d11_buffer_cache.cc deleted file mode 100644 index 48eb8fbf8..000000000 --- a/src/xenia/gpu/d3d11/d3d11_buffer_cache.cc +++ /dev/null @@ -1,44 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2014 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. 
* - ****************************************************************************** - */ - -#include - -#include -#include - - -using namespace xe; -using namespace xe::gpu; -using namespace xe::gpu::d3d11; -using namespace xe::gpu::xenos; - - -D3D11BufferCache::D3D11BufferCache(ID3D11DeviceContext* context, - ID3D11Device* device) - : context_(context), device_(device) { - context->AddRef(); - device_->AddRef(); -} - -D3D11BufferCache::~D3D11BufferCache() { - XESAFERELEASE(device_); - XESAFERELEASE(context_); -} - -IndexBuffer* D3D11BufferCache::CreateIndexBuffer( - const IndexBufferInfo& info, - const uint8_t* src_ptr, size_t length) { - return new D3D11IndexBuffer(this, info, src_ptr, length); -} - -VertexBuffer* D3D11BufferCache::CreateVertexBuffer( - const VertexBufferInfo& info, - const uint8_t* src_ptr, size_t length) { - return new D3D11VertexBuffer(this, info, src_ptr, length); -} diff --git a/src/xenia/gpu/d3d11/d3d11_buffer_cache.h b/src/xenia/gpu/d3d11/d3d11_buffer_cache.h deleted file mode 100644 index 284536ab7..000000000 --- a/src/xenia/gpu/d3d11/d3d11_buffer_cache.h +++ /dev/null @@ -1,53 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2014 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. * - ****************************************************************************** - */ - -#ifndef XENIA_GPU_D3D11_D3D11_BUFFER_CACHE_H_ -#define XENIA_GPU_D3D11_D3D11_BUFFER_CACHE_H_ - -#include - -#include -#include - -#include - - -namespace xe { -namespace gpu { -namespace d3d11 { - - -class D3D11BufferCache : public BufferCache { -public: - D3D11BufferCache(ID3D11DeviceContext* context, ID3D11Device* device); - virtual ~D3D11BufferCache(); - - ID3D11DeviceContext* context() const { return context_; } - ID3D11Device* device() const { return device_; } - -protected: - IndexBuffer* CreateIndexBuffer( - const IndexBufferInfo& info, - const uint8_t* src_ptr, size_t length) override; - VertexBuffer* CreateVertexBuffer( - const VertexBufferInfo& info, - const uint8_t* src_ptr, size_t length) override; - -protected: - ID3D11DeviceContext* context_; - ID3D11Device* device_; -}; - - -} // namespace d3d11 -} // namespace gpu -} // namespace xe - - -#endif // XENIA_GPU_D3D11_D3D11_BUFFER_CACHE_H_ diff --git a/src/xenia/gpu/d3d11/d3d11_buffer_resource.cc b/src/xenia/gpu/d3d11/d3d11_buffer_resource.cc new file mode 100644 index 000000000..8f03cfe58 --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_buffer_resource.cc @@ -0,0 +1,149 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
*
+ ******************************************************************************
+ */
+
+#include
+
+#include
+#include
+
+
+using namespace xe;
+using namespace xe::gpu;
+using namespace xe::gpu::d3d11;
+using namespace xe::gpu::xenos;
+
+
+D3D11IndexBufferResource::D3D11IndexBufferResource(
+    D3D11ResourceCache* resource_cache,
+    const MemoryRange& memory_range,
+    const Info& info)
+    : IndexBufferResource(memory_range, info),
+      resource_cache_(resource_cache),
+      handle_(nullptr) {
+}
+
+D3D11IndexBufferResource::~D3D11IndexBufferResource() {
+  XESAFERELEASE(handle_);
+}
+
+int D3D11IndexBufferResource::CreateHandle() {
+  D3D11_BUFFER_DESC buffer_desc;
+  xe_zero_struct(&buffer_desc, sizeof(buffer_desc));
+  buffer_desc.ByteWidth = static_cast<UINT>(memory_range_.length);
+  buffer_desc.Usage = D3D11_USAGE_DYNAMIC;
+  buffer_desc.BindFlags = D3D11_BIND_INDEX_BUFFER;
+  buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
+  HRESULT hr = resource_cache_->device()->CreateBuffer(
+      &buffer_desc, nullptr, &handle_);
+  if (FAILED(hr)) {
+    XELOGW("D3D11: failed to create index buffer");
+    return 1;
+  }
+  return 0;
+}
+
+int D3D11IndexBufferResource::InvalidateRegion(
+    const MemoryRange& memory_range) {
+  SCOPE_profile_cpu_f("gpu");
+
+  // All that's done so far:
+  XEASSERT(info_.endianness == 0x2);
+
+  D3D11_MAPPED_SUBRESOURCE res;
+  HRESULT hr = resource_cache_->context()->Map(
+      handle_, 0, D3D11_MAP_WRITE_DISCARD, 0, &res);
+  if (FAILED(hr)) {
+    XELOGE("D3D11: unable to map index buffer");
+    return 1;
+  }
+
+  if (info_.format == INDEX_FORMAT_32BIT) {
+    uint32_t index_count = memory_range_.length / 4;
+    const uint32_t* src = reinterpret_cast<const uint32_t*>(
+        memory_range_.host_base);
+    uint32_t* dest = reinterpret_cast<uint32_t*>(res.pData);
+    for (uint32_t n = 0; n < index_count; n++) {
+      dest[n] = XESWAP32(src[n]);
+    }
+  } else {
+    uint32_t index_count = memory_range_.length / 2;
+    const uint16_t* src = reinterpret_cast<const uint16_t*>(
+        memory_range_.host_base);
+    uint16_t* dest = reinterpret_cast<uint16_t*>(res.pData);
+    for (uint32_t n = 0; n < index_count; n++) {
+      dest[n] = XESWAP16(src[n]);
+    }
+  }
+  resource_cache_->context()->Unmap(handle_, 0);
+
+  return 0;
+}
+
+D3D11VertexBufferResource::D3D11VertexBufferResource(
+    D3D11ResourceCache* resource_cache,
+    const MemoryRange& memory_range,
+    const Info& info)
+    : VertexBufferResource(memory_range, info),
+      resource_cache_(resource_cache),
+      handle_(nullptr) {
+}
+
+D3D11VertexBufferResource::~D3D11VertexBufferResource() {
+  XESAFERELEASE(handle_);
+}
+
+int D3D11VertexBufferResource::CreateHandle() {
+  D3D11_BUFFER_DESC buffer_desc;
+  xe_zero_struct(&buffer_desc, sizeof(buffer_desc));
+  buffer_desc.ByteWidth = static_cast<UINT>(memory_range_.length);
+  buffer_desc.Usage = D3D11_USAGE_DYNAMIC;
+  buffer_desc.BindFlags = D3D11_BIND_VERTEX_BUFFER;
+  buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
+  HRESULT hr = resource_cache_->device()->CreateBuffer(
+      &buffer_desc, nullptr, &handle_);
+  if (FAILED(hr)) {
+    XELOGW("D3D11: failed to create vertex buffer");
+    return 1;
+  }
+  return 0;
+}
+
+int D3D11VertexBufferResource::InvalidateRegion(
+    const MemoryRange& memory_range) {
+  SCOPE_profile_cpu_f("gpu");
+
+  D3D11_MAPPED_SUBRESOURCE res;
+  HRESULT hr = resource_cache_->context()->Map(
+      handle_, 0, D3D11_MAP_WRITE_DISCARD, 0, &res);
+  if (FAILED(hr)) {
+    XELOGE("D3D11: unable to map vertex buffer");
+    return 1;
+  }
+  uint8_t* dest = reinterpret_cast<uint8_t*>(res.pData);
+
+  // TODO(benvanik): rewrite to be faster/special case common/etc
+  uint32_t stride = info_.stride_words;
+  size_t count =
(memory_range_.length / 4) / stride; + for (size_t n = 0; n < info_.element_count; n++) { + const auto& el = info_.elements[n]; + const uint32_t* src_ptr = (const uint32_t*)( + memory_range_.host_base + el.offset_words * 4); + uint32_t* dest_ptr = (uint32_t*)(dest + el.offset_words * 4); + uint32_t o = 0; + for (uint32_t i = 0; i < count; i++) { + for (uint32_t j = 0; j < el.size_words; j++) { + dest_ptr[o + j] = XESWAP32(src_ptr[o + j]); + } + o += stride; + } + } + + resource_cache_->context()->Unmap(handle_, 0); + return 0; +} diff --git a/src/xenia/gpu/d3d11/d3d11_buffer_resource.h b/src/xenia/gpu/d3d11/d3d11_buffer_resource.h new file mode 100644 index 000000000..2e8071ae1 --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_buffer_resource.h @@ -0,0 +1,69 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_D3D11_D3D11_BUFFER_RESOURCE_H_ +#define XENIA_GPU_D3D11_D3D11_BUFFER_RESOURCE_H_ + +#include +#include + +#include + + +namespace xe { +namespace gpu { +namespace d3d11 { + +class D3D11ResourceCache; + + +class D3D11IndexBufferResource : public IndexBufferResource { +public: + D3D11IndexBufferResource(D3D11ResourceCache* resource_cache, + const MemoryRange& memory_range, + const Info& info); + ~D3D11IndexBufferResource() override; + + void* handle() const override { return handle_; } + +protected: + int CreateHandle() override; + int InvalidateRegion(const MemoryRange& memory_range) override; + +private: + D3D11ResourceCache* resource_cache_; + ID3D11Buffer* handle_; +}; + + +class D3D11VertexBufferResource : public VertexBufferResource { +public: + D3D11VertexBufferResource(D3D11ResourceCache* resource_cache, + const MemoryRange& memory_range, + const Info& info); + ~D3D11VertexBufferResource() override; + + void* handle() const override { return handle_; } + +protected: + int CreateHandle() override; + int InvalidateRegion(const MemoryRange& memory_range) override; + +private: + D3D11ResourceCache* resource_cache_; + ID3D11Buffer* handle_; +}; + + +} // namespace d3d11 +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_D3D11_D3D11_BUFFER_RESOURCE_H_ diff --git a/src/xenia/gpu/d3d11/d3d11_geometry_shader.cc b/src/xenia/gpu/d3d11/d3d11_geometry_shader.cc index ba677f7a0..d8660cbfe 100644 --- a/src/xenia/gpu/d3d11/d3d11_geometry_shader.cc +++ b/src/xenia/gpu/d3d11/d3d11_geometry_shader.cc @@ -10,7 +10,8 @@ #include #include -#include +#include +#include #include #include @@ -22,8 +23,8 @@ using namespace xe::gpu::d3d11; using namespace xe::gpu::xenos; -D3D11GeometryShader::D3D11GeometryShader(ID3D11Device* device, uint64_t hash) : - hash_(hash), handle_(NULL) { +D3D11GeometryShader::D3D11GeometryShader(ID3D11Device* device) + : handle_(nullptr) { device_ = device; device_->AddRef(); } @@ -33,7 +34,7 @@ D3D11GeometryShader::~D3D11GeometryShader() { XESAFERELEASE(device_); } -int D3D11GeometryShader::Prepare(D3D11VertexShader* vertex_shader) { +int D3D11GeometryShader::Prepare(D3D11VertexShaderResource* vertex_shader) { SCOPE_profile_cpu_f("gpu"); if (handle_) { @@ -94,11 +95,12 @@ ID3D10Blob* D3D11GeometryShader::Compile(const char* shader_source) { if 
(FLAGS_dump_shaders.size()) { base_path = FLAGS_dump_shaders.c_str(); } + uint64_t hash = xe_hash64(shader_source, xestrlena(shader_source)); // ? char file_name[XE_MAX_PATH]; xesnprintfa(file_name, XECOUNT(file_name), "%s/gen_%.16llX.gs", base_path, - hash_); + hash); if (FLAGS_dump_shaders.size()) { FILE* f = fopen(file_name, "w"); @@ -128,7 +130,7 @@ ID3D10Blob* D3D11GeometryShader::Compile(const char* shader_source) { return shader_blob; } -int D3D11GeometryShader::Generate(D3D11VertexShader* vertex_shader, +int D3D11GeometryShader::Generate(D3D11VertexShaderResource* vertex_shader, alloy::StringBuffer* output) { output->Append( "struct VERTEX {\n" @@ -138,7 +140,7 @@ int D3D11GeometryShader::Generate(D3D11VertexShader* vertex_shader, // TODO(benvanik): only add used ones? output->Append( " float4 o[%d] : XE_O;\n", - D3D11Shader::MAX_INTERPOLATORS); + D3D11ShaderTranslator::kMaxInterpolators); } if (alloc_counts.point_size) { output->Append( @@ -156,15 +158,15 @@ int D3D11GeometryShader::Generate(D3D11VertexShader* vertex_shader, D3D11PointSpriteGeometryShader::D3D11PointSpriteGeometryShader( - ID3D11Device* device, uint64_t hash) : - D3D11GeometryShader(device, hash) { + ID3D11Device* device) : D3D11GeometryShader(device) { } D3D11PointSpriteGeometryShader::~D3D11PointSpriteGeometryShader() { } -int D3D11PointSpriteGeometryShader::Generate(D3D11VertexShader* vertex_shader, - alloy::StringBuffer* output) { +int D3D11PointSpriteGeometryShader::Generate( + D3D11VertexShaderResource* vertex_shader, + alloy::StringBuffer* output) { SCOPE_profile_cpu_f("gpu"); if (D3D11GeometryShader::Generate(vertex_shader, output)) { return 1; @@ -211,15 +213,15 @@ int D3D11PointSpriteGeometryShader::Generate(D3D11VertexShader* vertex_shader, D3D11RectListGeometryShader::D3D11RectListGeometryShader( - ID3D11Device* device, uint64_t hash) : - D3D11GeometryShader(device, hash) { + ID3D11Device* device) : D3D11GeometryShader(device) { } D3D11RectListGeometryShader::~D3D11RectListGeometryShader() { } -int D3D11RectListGeometryShader::Generate(D3D11VertexShader* vertex_shader, - alloy::StringBuffer* output) { +int D3D11RectListGeometryShader::Generate( + D3D11VertexShaderResource* vertex_shader, + alloy::StringBuffer* output) { SCOPE_profile_cpu_f("gpu"); if (D3D11GeometryShader::Generate(vertex_shader, output)) { return 1; @@ -256,15 +258,15 @@ int D3D11RectListGeometryShader::Generate(D3D11VertexShader* vertex_shader, D3D11QuadListGeometryShader::D3D11QuadListGeometryShader( - ID3D11Device* device, uint64_t hash) : - D3D11GeometryShader(device, hash) { + ID3D11Device* device) : D3D11GeometryShader(device) { } D3D11QuadListGeometryShader::~D3D11QuadListGeometryShader() { } -int D3D11QuadListGeometryShader::Generate(D3D11VertexShader* vertex_shader, - alloy::StringBuffer* output) { +int D3D11QuadListGeometryShader::Generate( + D3D11VertexShaderResource* vertex_shader, + alloy::StringBuffer* output) { SCOPE_profile_cpu_f("gpu"); if (D3D11GeometryShader::Generate(vertex_shader, output)) { return 1; diff --git a/src/xenia/gpu/d3d11/d3d11_geometry_shader.h b/src/xenia/gpu/d3d11/d3d11_geometry_shader.h index cdfebad5f..89529b2a4 100644 --- a/src/xenia/gpu/d3d11/d3d11_geometry_shader.h +++ b/src/xenia/gpu/d3d11/d3d11_geometry_shader.h @@ -21,7 +21,7 @@ namespace xe { namespace gpu { namespace d3d11 { -class D3D11VertexShader; +class D3D11VertexShaderResource; class D3D11GeometryShader { @@ -30,53 +30,52 @@ public: ID3D11GeometryShader* handle() const { return handle_; } - int Prepare(D3D11VertexShader* 
vertex_shader); + int Prepare(D3D11VertexShaderResource* vertex_shader); protected: - D3D11GeometryShader(ID3D11Device* device, uint64_t hash); + D3D11GeometryShader(ID3D11Device* device); ID3D10Blob* Compile(const char* shader_source); - virtual int Generate(D3D11VertexShader* vertex_shader, + virtual int Generate(D3D11VertexShaderResource* vertex_shader, alloy::StringBuffer* output); protected: ID3D11Device* device_; - uint64_t hash_; ID3D11GeometryShader* handle_; }; class D3D11PointSpriteGeometryShader : public D3D11GeometryShader { public: - D3D11PointSpriteGeometryShader(ID3D11Device* device, uint64_t hash); - virtual ~D3D11PointSpriteGeometryShader(); + D3D11PointSpriteGeometryShader(ID3D11Device* device); + ~D3D11PointSpriteGeometryShader() override; protected: - virtual int Generate(D3D11VertexShader* vertex_shader, - alloy::StringBuffer* output); + int Generate(D3D11VertexShaderResource* vertex_shader, + alloy::StringBuffer* output) override; }; class D3D11RectListGeometryShader : public D3D11GeometryShader { public: - D3D11RectListGeometryShader(ID3D11Device* device, uint64_t hash); - virtual ~D3D11RectListGeometryShader(); + D3D11RectListGeometryShader(ID3D11Device* device); + ~D3D11RectListGeometryShader() override; protected: - virtual int Generate(D3D11VertexShader* vertex_shader, - alloy::StringBuffer* output); + int Generate(D3D11VertexShaderResource* vertex_shader, + alloy::StringBuffer* output) override; }; class D3D11QuadListGeometryShader : public D3D11GeometryShader { public: - D3D11QuadListGeometryShader(ID3D11Device* device, uint64_t hash); - virtual ~D3D11QuadListGeometryShader(); + D3D11QuadListGeometryShader(ID3D11Device* device); + ~D3D11QuadListGeometryShader() override; protected: - virtual int Generate(D3D11VertexShader* vertex_shader, - alloy::StringBuffer* output); + int Generate(D3D11VertexShaderResource* vertex_shader, + alloy::StringBuffer* output) override; }; diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc b/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc index 886643e32..a671b4626 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc +++ b/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc @@ -10,13 +10,12 @@ #include #include -#include -#include +#include +#include +#include #include -#include -#include -#include -#include +#include + using namespace xe; using namespace xe::gpu; @@ -35,9 +34,8 @@ D3D11GraphicsDriver::D3D11GraphicsDriver( device_ = device; device_->AddRef(); device_->GetImmediateContext(&context_); - buffer_cache_ = new D3D11BufferCache(context_, device_); - shader_cache_ = new D3D11ShaderCache(device_); - texture_cache_ = new D3D11TextureCache(memory_, context_, device_); + + resource_cache_ = new D3D11ResourceCache(memory, device_, context_); xe_zero_struct(&state_, sizeof(state_)); @@ -64,7 +62,29 @@ D3D11GraphicsDriver::D3D11GraphicsDriver( buffer_desc.ByteWidth = (32) * sizeof(int); hr = device_->CreateBuffer( &buffer_desc, NULL, &state_.constant_buffers.gs_consts); +} +D3D11GraphicsDriver::~D3D11GraphicsDriver() { + RebuildRenderTargets(0, 0); + XESAFERELEASE(state_.constant_buffers.float_constants); + XESAFERELEASE(state_.constant_buffers.bool_constants); + XESAFERELEASE(state_.constant_buffers.loop_constants); + XESAFERELEASE(state_.constant_buffers.vs_consts); + XESAFERELEASE(state_.constant_buffers.gs_consts); + XESAFERELEASE(invalid_texture_view_); + XESAFERELEASE(invalid_texture_sampler_state_); + delete resource_cache_; + XESAFERELEASE(context_); + XESAFERELEASE(device_); + XESAFERELEASE(swap_chain_); +} + +int 
D3D11GraphicsDriver::Initialize() { + InitializeInvalidTexture(); + return 0; +} + +void D3D11GraphicsDriver::InitializeInvalidTexture() { // TODO(benvanik): pattern? D3D11_TEXTURE2D_DESC texture_desc; xe_zero_struct(&texture_desc, sizeof(texture_desc)); @@ -90,7 +110,7 @@ D3D11GraphicsDriver::D3D11GraphicsDriver( initial_data.SysMemSlicePitch = 0; initial_data.pSysMem = texture_data; ID3D11Texture2D* texture = NULL; - hr = device_->CreateTexture2D( + HRESULT hr = device_->CreateTexture2D( &texture_desc, &initial_data, (ID3D11Texture2D**)&texture); if (FAILED(hr)) { XEFATAL("D3D11: unable to create invalid texture"); @@ -130,315 +150,53 @@ D3D11GraphicsDriver::D3D11GraphicsDriver( } } -D3D11GraphicsDriver::~D3D11GraphicsDriver() { - RebuildRenderTargets(0, 0); - XESAFERELEASE(state_.constant_buffers.float_constants); - XESAFERELEASE(state_.constant_buffers.bool_constants); - XESAFERELEASE(state_.constant_buffers.loop_constants); - XESAFERELEASE(state_.constant_buffers.vs_consts); - XESAFERELEASE(state_.constant_buffers.gs_consts); - XESAFERELEASE(invalid_texture_view_); - XESAFERELEASE(invalid_texture_sampler_state_); - delete buffer_cache_; - delete texture_cache_; - delete shader_cache_; - XESAFERELEASE(context_); - XESAFERELEASE(device_); - XESAFERELEASE(swap_chain_); -} - -void D3D11GraphicsDriver::Initialize() { -} - -void D3D11GraphicsDriver::InvalidateState( - uint32_t mask) { - if (mask == XE_GPU_INVALIDATE_MASK_ALL) { - XETRACED3D("D3D11: (invalidate all)"); - } - if (mask & XE_GPU_INVALIDATE_MASK_VERTEX_SHADER) { - XETRACED3D("D3D11: invalidate vertex shader"); - } - if (mask & XE_GPU_INVALIDATE_MASK_PIXEL_SHADER) { - XETRACED3D("D3D11: invalidate pixel shader"); - } -} - -void D3D11GraphicsDriver::SetShader( - XE_GPU_SHADER_TYPE type, - uint32_t address, - uint32_t start, - uint32_t length) { - // Find or create shader in the cache. - uint8_t* p = memory_->Translate(address); - Shader* shader = shader_cache_->FindOrCreate( - type, p, length); - - if (!shader->is_prepared()) { - // Disassemble. - const char* source = shader->disasm_src(); - if (!source) { - source = ""; - } - XETRACED3D("D3D11: set shader %d at %0.8X (%db):\n%s", - type, address, length, source); - } - - // Stash for later. - switch (type) { - case XE_GPU_SHADER_TYPE_VERTEX: - state_.vertex_shader = (D3D11VertexShader*)shader; - break; - case XE_GPU_SHADER_TYPE_PIXEL: - state_.pixel_shader = (D3D11PixelShader*)shader; - break; - } -} - -int D3D11GraphicsDriver::SetupDraw(XE_GPU_PRIMITIVE_TYPE prim_type) { +int D3D11GraphicsDriver::Draw(const DrawCommand& command) { SCOPE_profile_cpu_f("gpu"); - RegisterFile& rf = register_file_; - - // Ignore copies. - uint32_t enable_mode = rf.values[XE_GPU_REG_RB_MODECONTROL].u32 & 0x7; - if (enable_mode != 4) { - XELOGW("D3D11: ignoring draw with enable mode %d", enable_mode); - return 1; - } - - uint32_t state_overrides = 0; - if (prim_type == XE_GPU_PRIMITIVE_TYPE_RECTANGLE_LIST) { - // Rect lists aren't culled. There may be other things they skip too. - state_overrides |= STATE_OVERRIDE_DISABLE_CULLING; - } - // Misc state. - if (UpdateState(state_overrides)) { + if (UpdateState(command)) { return 1; } // Build constant buffers. - if (UpdateConstantBuffers()) { + if (SetupConstantBuffers(command)) { return 1; } // Bind shaders. - if (BindShaders()) { + if (SetupShaders(command)) { return 1; } - // Switch primitive topology. - // Some are unsupported on D3D11 and must be emulated. 
-  D3D11_PRIMITIVE_TOPOLOGY primitive_topology;
-  D3D11GeometryShader* geometry_shader = NULL;
-  switch (prim_type) {
-  case XE_GPU_PRIMITIVE_TYPE_POINT_LIST:
-    primitive_topology = D3D_PRIMITIVE_TOPOLOGY_POINTLIST;
-    if (state_.vertex_shader) {
-      if (state_.vertex_shader->DemandGeometryShader(
-          D3D11VertexShader::POINT_SPRITE_SHADER, &geometry_shader)) {
-        return 1;
-      }
-    }
-    break;
-  case XE_GPU_PRIMITIVE_TYPE_LINE_LIST:
-    primitive_topology = D3D_PRIMITIVE_TOPOLOGY_LINELIST;
-    break;
-  case XE_GPU_PRIMITIVE_TYPE_LINE_STRIP:
-    primitive_topology = D3D_PRIMITIVE_TOPOLOGY_LINESTRIP;
-    break;
-  case XE_GPU_PRIMITIVE_TYPE_TRIANGLE_LIST:
-    primitive_topology = D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST;
-    break;
-  case XE_GPU_PRIMITIVE_TYPE_TRIANGLE_STRIP:
-    primitive_topology = D3D_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP;
-    break;
-  case XE_GPU_PRIMITIVE_TYPE_RECTANGLE_LIST:
-    primitive_topology = D3D_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP;
-    if (state_.vertex_shader) {
-      if (state_.vertex_shader->DemandGeometryShader(
-          D3D11VertexShader::RECT_LIST_SHADER, &geometry_shader)) {
-        return 1;
-      }
-    }
-    break;
-  case XE_GPU_PRIMITIVE_TYPE_QUAD_LIST:
-    primitive_topology = D3D_PRIMITIVE_TOPOLOGY_LINELIST_ADJ;
-    if (state_.vertex_shader) {
-      if (state_.vertex_shader->DemandGeometryShader(
-          D3D11VertexShader::QUAD_LIST_SHADER, &geometry_shader)) {
-        return 1;
-      }
-    }
-    break;
-  default:
-  case XE_GPU_PRIMITIVE_TYPE_TRIANGLE_FAN:
-  case XE_GPU_PRIMITIVE_TYPE_UNKNOWN_07:
-  case XE_GPU_PRIMITIVE_TYPE_LINE_LOOP:
-    primitive_topology = D3D_PRIMITIVE_TOPOLOGY_POINTLIST;
-    XELOGE("D3D11: unsupported primitive type %d", prim_type);
-    break;
+  // Bind vertex buffers/index buffer.
+  if (SetupInputAssembly(command)) {
+    return 1;
   }
-  context_->IASetPrimitiveTopology(primitive_topology);
-  if (geometry_shader) {
-    context_->GSSetShader(geometry_shader->handle(), NULL, NULL);
-    context_->GSSetConstantBuffers(
-        0, 1, &state_.constant_buffers.gs_consts);
+  // Bind texture fetchers.
+  if (SetupSamplers(command)) {
+    return 1;
+  }
+
+  if (command.index_buffer) {
+    // Have an actual index buffer.
+    XETRACED3D("D3D11: draw indexed %d (indices [%d,%d] (%d))",
+               command.prim_type, command.start_index,
+               command.start_index + command.index_count, command.index_count);
+    context_->DrawIndexed(command.index_count, command.start_index,
+                          command.base_vertex);
   } else {
-    context_->GSSetShader(NULL, NULL, NULL);
+    // Auto draw.
+    XETRACED3D("D3D11: draw indexed auto %d (indices [%d,%d] (%d))",
+               command.prim_type, command.start_index,
+               command.start_index + command.index_count, command.index_count);
+    context_->Draw(command.index_count, 0);
   }
-
-  // Setup all fetchers (vertices/textures).
-  if (PrepareFetchers()) {
-    return 1;
-  }
-
-  // All ready to draw (except index buffer)!
   return 0;
 }
-
-void D3D11GraphicsDriver::DrawIndexBuffer(
-    XE_GPU_PRIMITIVE_TYPE prim_type,
-    bool index_32bit, uint32_t index_count,
-    uint32_t index_base, uint32_t index_size, uint32_t endianness) {
-  SCOPE_profile_cpu_f("gpu");
-
-  RegisterFile& rf = register_file_;
-
-  XETRACED3D("D3D11: draw indexed %d (%d indicies) from %.8X",
-             prim_type, index_count, index_base);
-
-  // Setup shaders/etc.
-  if (SetupDraw(prim_type)) {
-    return;
-  }
-
-  // Setup index buffer.
-  if (PrepareIndexBuffer(
-      index_32bit, index_count, index_base, index_size, endianness)) {
-    return;
-  }
-
-  // Issue draw.
- uint32_t start_index = rf.values[XE_GPU_REG_VGT_INDX_OFFSET].u32; - uint32_t base_vertex = 0; - context_->DrawIndexed(index_count, start_index, base_vertex); -} - -void D3D11GraphicsDriver::DrawIndexAuto( - XE_GPU_PRIMITIVE_TYPE prim_type, - uint32_t index_count) { - SCOPE_profile_cpu_f("gpu"); - - RegisterFile& rf = register_file_; - - XETRACED3D("D3D11: draw indexed %d (%d indicies)", - prim_type, index_count); - - // Setup shaders/etc. - if (SetupDraw(prim_type)) { - return; - } - - // Issue draw. - uint32_t start_index = rf.values[XE_GPU_REG_VGT_INDX_OFFSET].u32; - uint32_t base_vertex = 0; - //context_->DrawIndexed(index_count, start_index, base_vertex); - context_->Draw(index_count, 0); -} - -int D3D11GraphicsDriver::RebuildRenderTargets( - uint32_t width, uint32_t height) { - if (width == render_targets_.width && - height == render_targets_.height) { - // Cached copies are good. - return 0; - } - - SCOPE_profile_cpu_f("gpu"); - - // Remove old versions. - for (int n = 0; n < XECOUNT(render_targets_.color_buffers); n++) { - auto& cb = render_targets_.color_buffers[n]; - XESAFERELEASE(cb.buffer); - XESAFERELEASE(cb.color_view_8888); - } - XESAFERELEASE(render_targets_.depth_buffer); - XESAFERELEASE(render_targets_.depth_view_d28s8); - XESAFERELEASE(render_targets_.depth_view_d28fs8); - - render_targets_.width = width; - render_targets_.height = height; - - if (!width || !height) { - // This should only happen when cleaning up. - return 0; - } - - for (int n = 0; n < XECOUNT(render_targets_.color_buffers); n++) { - auto& cb = render_targets_.color_buffers[n]; - D3D11_TEXTURE2D_DESC color_buffer_desc; - xe_zero_struct(&color_buffer_desc, sizeof(color_buffer_desc)); - color_buffer_desc.Width = width; - color_buffer_desc.Height = height; - color_buffer_desc.MipLevels = 1; - color_buffer_desc.ArraySize = 1; - color_buffer_desc.Format = DXGI_FORMAT_R8G8B8A8_UNORM; - color_buffer_desc.SampleDesc.Count = 1; - color_buffer_desc.SampleDesc.Quality = 0; - color_buffer_desc.Usage = D3D11_USAGE_DEFAULT; - color_buffer_desc.BindFlags = - D3D11_BIND_SHADER_RESOURCE | - D3D11_BIND_RENDER_TARGET; - color_buffer_desc.CPUAccessFlags = 0; - color_buffer_desc.MiscFlags = 0; - device_->CreateTexture2D( - &color_buffer_desc, NULL, &cb.buffer); - - D3D11_RENDER_TARGET_VIEW_DESC render_target_view_desc; - xe_zero_struct(&render_target_view_desc, sizeof(render_target_view_desc)); - render_target_view_desc.Format = DXGI_FORMAT_R8G8B8A8_UNORM; - render_target_view_desc.ViewDimension = D3D11_RTV_DIMENSION_TEXTURE2D; - // render_target_view_desc.Buffer ? 
- device_->CreateRenderTargetView( - cb.buffer, - &render_target_view_desc, - &cb.color_view_8888); - } - - D3D11_TEXTURE2D_DESC depth_stencil_desc; - xe_zero_struct(&depth_stencil_desc, sizeof(depth_stencil_desc)); - depth_stencil_desc.Width = width; - depth_stencil_desc.Height = height; - depth_stencil_desc.MipLevels = 1; - depth_stencil_desc.ArraySize = 1; - depth_stencil_desc.Format = DXGI_FORMAT_D24_UNORM_S8_UINT; - depth_stencil_desc.SampleDesc.Count = 1; - depth_stencil_desc.SampleDesc.Quality = 0; - depth_stencil_desc.Usage = D3D11_USAGE_DEFAULT; - depth_stencil_desc.BindFlags = - D3D11_BIND_DEPTH_STENCIL; - depth_stencil_desc.CPUAccessFlags = 0; - depth_stencil_desc.MiscFlags = 0; - device_->CreateTexture2D( - &depth_stencil_desc, NULL, &render_targets_.depth_buffer); - - D3D11_DEPTH_STENCIL_VIEW_DESC depth_stencil_view_desc; - xe_zero_struct(&depth_stencil_view_desc, sizeof(depth_stencil_view_desc)); - depth_stencil_view_desc.Format = DXGI_FORMAT_D24_UNORM_S8_UINT; - depth_stencil_view_desc.ViewDimension = D3D11_DSV_DIMENSION_TEXTURE2D; - depth_stencil_view_desc.Flags = 0; - device_->CreateDepthStencilView( - render_targets_.depth_buffer, - &depth_stencil_view_desc, - &render_targets_.depth_view_d28s8); - - return 0; -} - -int D3D11GraphicsDriver::UpdateState(uint32_t state_overrides) { +int D3D11GraphicsDriver::UpdateState(const DrawCommand& command) { SCOPE_profile_cpu_f("gpu"); // Most information comes from here: @@ -449,8 +207,8 @@ int D3D11GraphicsDriver::UpdateState(uint32_t state_overrides) { RegisterFile& rf = register_file_; - uint32_t window_scissor_tl = rf.values[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_TL].u32; - uint32_t window_scissor_br = rf.values[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_BR].u32; + uint32_t window_scissor_tl = register_file_[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_TL].u32; + uint32_t window_scissor_br = register_file_[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_BR].u32; //uint32_t window_width = // (window_scissor_br & 0x7FFF) - (window_scissor_tl & 0x7FFF); //uint32_t window_height = @@ -466,16 +224,16 @@ int D3D11GraphicsDriver::UpdateState(uint32_t state_overrides) { // RB_SURFACE_INFO ? // Enable buffers. - uint32_t enable_mode = rf.values[XE_GPU_REG_RB_MODECONTROL].u32 & 0x7; + uint32_t enable_mode = register_file_[XE_GPU_REG_RB_MODECONTROL].u32 & 0x7; // 4 = color + depth // 6 = copy ? // color_info[0-3] has format 8888 uint32_t color_info[4] = { - rf.values[XE_GPU_REG_RB_COLOR_INFO].u32, - rf.values[XE_GPU_REG_RB_COLOR1_INFO].u32, - rf.values[XE_GPU_REG_RB_COLOR2_INFO].u32, - rf.values[XE_GPU_REG_RB_COLOR3_INFO].u32, + register_file_[XE_GPU_REG_RB_COLOR_INFO].u32, + register_file_[XE_GPU_REG_RB_COLOR1_INFO].u32, + register_file_[XE_GPU_REG_RB_COLOR2_INFO].u32, + register_file_[XE_GPU_REG_RB_COLOR3_INFO].u32, }; ID3D11RenderTargetView* render_target_views[4] = { 0 }; for (int n = 0; n < XECOUNT(color_info); n++) { @@ -494,7 +252,7 @@ int D3D11GraphicsDriver::UpdateState(uint32_t state_overrides) { } // depth_info has format 24_8 - uint32_t depth_info = rf.values[XE_GPU_REG_RB_DEPTH_INFO].u32; + uint32_t depth_info = register_file_[XE_GPU_REG_RB_DEPTH_INFO].u32; uint32_t depth_format = (depth_info >> 16) & 0x1; ID3D11DepthStencilView* depth_stencil_view = 0; switch (depth_format) { @@ -514,7 +272,7 @@ int D3D11GraphicsDriver::UpdateState(uint32_t state_overrides) { context_->OMSetRenderTargets(4, render_target_views, depth_stencil_view); // General rasterizer state. 
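+  // PA_SU_SC_MODE_CNTL, as consumed below: bits 0-1 select face culling
+  // (front/back) and bit 0x4 clear means counter-clockwise front faces.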
- uint32_t mode_control = rf.values[XE_GPU_REG_PA_SU_SC_MODE_CNTL].u32; + uint32_t mode_control = register_file_[XE_GPU_REG_PA_SU_SC_MODE_CNTL].u32; D3D11_RASTERIZER_DESC rasterizer_desc; xe_zero_struct(&rasterizer_desc, sizeof(rasterizer_desc)); rasterizer_desc.FillMode = D3D11_FILL_SOLID; // D3D11_FILL_WIREFRAME; @@ -529,7 +287,8 @@ int D3D11GraphicsDriver::UpdateState(uint32_t state_overrides) { rasterizer_desc.CullMode = D3D11_CULL_BACK; break; } - if (state_overrides & STATE_OVERRIDE_DISABLE_CULLING) { + if (command.prim_type == XE_GPU_PRIMITIVE_TYPE_RECTANGLE_LIST) { + // Rect lists aren't culled. There may be other things they skip too. rasterizer_desc.CullMode = D3D11_CULL_NONE; } rasterizer_desc.FrontCounterClockwise = (mode_control & 0x4) == 0; @@ -547,7 +306,7 @@ int D3D11GraphicsDriver::UpdateState(uint32_t state_overrides) { // Viewport. // If we have resized the window we will want to change this. - uint32_t window_offset = rf.values[XE_GPU_REG_PA_SC_WINDOW_OFFSET].u32; + uint32_t window_offset = register_file_[XE_GPU_REG_PA_SC_WINDOW_OFFSET].u32; // signed? uint32_t window_offset_x = window_offset & 0x7FFF; uint32_t window_offset_y = (window_offset >> 16) & 0x7FFF; @@ -555,19 +314,19 @@ int D3D11GraphicsDriver::UpdateState(uint32_t state_overrides) { // ? // TODO(benvanik): figure out how to emulate viewports in D3D11. Could use // viewport above to scale, though that doesn't support negatives/etc. - uint32_t vte_control = rf.values[XE_GPU_REG_PA_CL_VTE_CNTL].u32; + uint32_t vte_control = register_file_[XE_GPU_REG_PA_CL_VTE_CNTL].u32; bool vport_xscale_enable = (vte_control & (1 << 0)) > 0; - float vport_xscale = rf.values[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32; // 640 + float vport_xscale = register_file_[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32; // 640 bool vport_xoffset_enable = (vte_control & (1 << 1)) > 0; - float vport_xoffset = rf.values[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32; // 640 + float vport_xoffset = register_file_[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32; // 640 bool vport_yscale_enable = (vte_control & (1 << 2)) > 0; - float vport_yscale = rf.values[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32; // -360 + float vport_yscale = register_file_[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32; // -360 bool vport_yoffset_enable = (vte_control & (1 << 3)) > 0; - float vport_yoffset = rf.values[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32; // 360 + float vport_yoffset = register_file_[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32; // 360 bool vport_zscale_enable = (vte_control & (1 << 4)) > 0; - float vport_zscale = rf.values[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32; // 1 + float vport_zscale = register_file_[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32; // 1 bool vport_zoffset_enable = (vte_control & (1 << 5)) > 0; - float vport_zoffset = rf.values[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32; // 0 + float vport_zoffset = register_file_[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32; // 0 // TODO(benvanik): compute viewport values. D3D11_VIEWPORT viewport; @@ -619,8 +378,8 @@ int D3D11GraphicsDriver::UpdateState(uint32_t state_overrides) { // Scissoring. // TODO(benvanik): pull from scissor registers. // ScissorEnable must be set in raster state above. 
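
// --- Illustrative sketch (not part of the patch) ---
// PA_SU_SC_MODE_CNTL decoding as used above: the low two bits select face
// culling (the switch right after mode_control is read) and bit 2 picks the
// winding order. The bit layout here is inferred from the surrounding code
// (assumes <d3d11.h>):
static inline D3D11_CULL_MODE CullModeFromModeControl(uint32_t mode_control) {
  switch (mode_control & 0x3) {
    case 1:  return D3D11_CULL_FRONT;
    case 2:  return D3D11_CULL_BACK;
    default: return D3D11_CULL_NONE;  // 0: no culling (rect lists force this too)
  }
}
// FrontCounterClockwise = (mode_control & 0x4) == 0, exactly as written above.
// --- end sketch ---
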
- uint32_t screen_scissor_tl = rf.values[XE_GPU_REG_PA_SC_SCREEN_SCISSOR_TL].u32; - uint32_t screen_scissor_br = rf.values[XE_GPU_REG_PA_SC_SCREEN_SCISSOR_BR].u32; + uint32_t screen_scissor_tl = register_file_[XE_GPU_REG_PA_SC_SCREEN_SCISSOR_TL].u32; + uint32_t screen_scissor_br = register_file_[XE_GPU_REG_PA_SC_SCREEN_SCISSOR_BR].u32; if (screen_scissor_tl != 0 && screen_scissor_br != 0x20002000) { D3D11_RECT scissor_rect; scissor_rect.top = (screen_scissor_tl >> 16) & 0x7FFF; @@ -654,8 +413,8 @@ int D3D11GraphicsDriver::UpdateState(uint32_t state_overrides) { }; // Depth-stencil state. - uint32_t depth_control = rf.values[XE_GPU_REG_RB_DEPTHCONTROL].u32; - uint32_t stencil_ref_mask = rf.values[XE_GPU_REG_RB_STENCILREFMASK].u32; + uint32_t depth_control = register_file_[XE_GPU_REG_RB_DEPTHCONTROL].u32; + uint32_t stencil_ref_mask = register_file_[XE_GPU_REG_RB_STENCILREFMASK].u32; D3D11_DEPTH_STENCIL_DESC depth_stencil_desc; xe_zero_struct(&depth_stencil_desc, sizeof(depth_stencil_desc)); // A2XX_RB_DEPTHCONTROL_BACKFACE_ENABLE @@ -727,22 +486,22 @@ int D3D11GraphicsDriver::UpdateState(uint32_t state_overrides) { // alpha testing -- ALPHAREF, ALPHAFUNC, ALPHATESTENABLE // Not in D3D11! // http://msdn.microsoft.com/en-us/library/windows/desktop/bb205120(v=vs.85).aspx - uint32_t color_control = rf.values[XE_GPU_REG_RB_COLORCONTROL].u32; + uint32_t color_control = register_file_[XE_GPU_REG_RB_COLORCONTROL].u32; // Blend state. - uint32_t color_mask = rf.values[XE_GPU_REG_RB_COLOR_MASK].u32; + uint32_t color_mask = register_file_[XE_GPU_REG_RB_COLOR_MASK].u32; uint32_t sample_mask = 0xFFFFFFFF; // ? float blend_factor[4] = { - rf.values[XE_GPU_REG_RB_BLEND_RED].f32, - rf.values[XE_GPU_REG_RB_BLEND_GREEN].f32, - rf.values[XE_GPU_REG_RB_BLEND_BLUE].f32, - rf.values[XE_GPU_REG_RB_BLEND_ALPHA].f32, + register_file_[XE_GPU_REG_RB_BLEND_RED].f32, + register_file_[XE_GPU_REG_RB_BLEND_GREEN].f32, + register_file_[XE_GPU_REG_RB_BLEND_BLUE].f32, + register_file_[XE_GPU_REG_RB_BLEND_ALPHA].f32, }; uint32_t blend_control[4] = { - rf.values[XE_GPU_REG_RB_BLENDCONTROL_0].u32, - rf.values[XE_GPU_REG_RB_BLENDCONTROL_1].u32, - rf.values[XE_GPU_REG_RB_BLENDCONTROL_2].u32, - rf.values[XE_GPU_REG_RB_BLENDCONTROL_3].u32, + register_file_[XE_GPU_REG_RB_BLENDCONTROL_0].u32, + register_file_[XE_GPU_REG_RB_BLENDCONTROL_1].u32, + register_file_[XE_GPU_REG_RB_BLENDCONTROL_2].u32, + register_file_[XE_GPU_REG_RB_BLENDCONTROL_3].u32, }; D3D11_BLEND_DESC blend_desc; xe_zero_struct(&blend_desc, sizeof(blend_desc)); @@ -782,60 +541,43 @@ int D3D11GraphicsDriver::UpdateState(uint32_t state_overrides) { return 0; } -int D3D11GraphicsDriver::UpdateConstantBuffers() { +int D3D11GraphicsDriver::SetupConstantBuffers(const DrawCommand& command) { SCOPE_profile_cpu_f("gpu"); - RegisterFile& rf = register_file_; - D3D11_MAPPED_SUBRESOURCE res; context_->Map( state_.constant_buffers.float_constants, 0, D3D11_MAP_WRITE_DISCARD, 0, &res); memcpy(res.pData, - &rf.values[XE_GPU_REG_SHADER_CONSTANT_000_X], - (512 * 4) * sizeof(float)); + command.float4_constants.values, + command.float4_constants.count * 4 * sizeof(float)); context_->Unmap(state_.constant_buffers.float_constants, 0); context_->Map( state_.constant_buffers.loop_constants, 0, D3D11_MAP_WRITE_DISCARD, 0, &res); memcpy(res.pData, - &rf.values[XE_GPU_REG_SHADER_CONSTANT_LOOP_00], - (32) * sizeof(int)); + command.loop_constants.values, + command.loop_constants.count * sizeof(int)); context_->Unmap(state_.constant_buffers.loop_constants, 0); context_->Map( 
state_.constant_buffers.bool_constants, 0, D3D11_MAP_WRITE_DISCARD, 0, &res); memcpy(res.pData, - &rf.values[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031], - (8) * sizeof(int)); + command.bool_constants.values, + command.bool_constants.count * sizeof(int)); context_->Unmap(state_.constant_buffers.bool_constants, 0); return 0; } -int D3D11GraphicsDriver::BindShaders() { +int D3D11GraphicsDriver::SetupShaders(const DrawCommand& command) { SCOPE_profile_cpu_f("gpu"); - RegisterFile& rf = register_file_; - xe_gpu_program_cntl_t program_cntl; - program_cntl.dword_0 = rf.values[XE_GPU_REG_SQ_PROGRAM_CNTL].u32; - - // Vertex shader setup. - D3D11VertexShader* vs = state_.vertex_shader; - if (vs) { - if (!vs->is_prepared()) { - // Prepare for use. - if (vs->Prepare(&program_cntl)) { - XELOGGPU("D3D11: failed to prepare vertex shader"); - state_.vertex_shader = NULL; - return 1; - } - } - - // Bind. - context_->VSSetShader(vs->handle(), NULL, 0); + if (command.vertex_shader) { + context_->VSSetShader( + command.vertex_shader->handle_as(), nullptr, 0); // Set constant buffers. ID3D11Buffer* vs_constant_buffers[] = { @@ -844,31 +586,22 @@ int D3D11GraphicsDriver::BindShaders() { state_.constant_buffers.loop_constants, state_.constant_buffers.vs_consts, }; - context_->VSSetConstantBuffers( - 0, XECOUNT(vs_constant_buffers), vs_constant_buffers); + context_->VSSetConstantBuffers(0, XECOUNT(vs_constant_buffers), + vs_constant_buffers); // Setup input layout (as encoded in vertex shader). + auto vs = static_cast(command.vertex_shader); context_->IASetInputLayout(vs->input_layout()); } else { - context_->VSSetShader(NULL, NULL, 0); - context_->IASetInputLayout(NULL); + context_->VSSetShader(nullptr, nullptr, 0); + context_->IASetInputLayout(nullptr); return 1; } // Pixel shader setup. - D3D11PixelShader* ps = state_.pixel_shader; - if (ps) { - if (!ps->is_prepared()) { - // Prepare for use. - if (ps->Prepare(&program_cntl, vs)) { - XELOGGPU("D3D11: failed to prepare pixel shader"); - state_.pixel_shader = NULL; - return 1; - } - } - - // Bind. - context_->PSSetShader(ps->handle(), NULL, 0); + if (command.pixel_shader) { + context_->PSSetShader( + command.pixel_shader->handle_as(), nullptr, 0); // Set constant buffers. ID3D11Buffer* vs_constant_buffers[] = { @@ -876,232 +609,233 @@ int D3D11GraphicsDriver::BindShaders() { state_.constant_buffers.bool_constants, state_.constant_buffers.loop_constants, }; - context_->PSSetConstantBuffers( - 0, XECOUNT(vs_constant_buffers), vs_constant_buffers); + context_->PSSetConstantBuffers(0, XECOUNT(vs_constant_buffers), + vs_constant_buffers); } else { - context_->PSSetShader(NULL, NULL, 0); + context_->PSSetShader(nullptr, nullptr, 0); return 1; } return 0; } -int D3D11GraphicsDriver::PrepareFetchers() { +int D3D11GraphicsDriver::SetupInputAssembly(const DrawCommand& command) { SCOPE_profile_cpu_f("gpu"); - // Input assembly. - XEASSERTNOTNULL(state_.vertex_shader); - auto vtx_inputs = state_.vertex_shader->GetVertexBufferInputs(); - for (size_t n = 0; n < vtx_inputs->count; n++) { - auto input = vtx_inputs->descs[n]; - if (PrepareVertexBuffer(input)) { - XELOGE("D3D11: unable to prepare vertex buffer"); - return 1; - } - } - - // All texture inputs. - if (PrepareTextureFetchers()) { - XELOGE("D3D11: unable to prepare texture fetchers"); - return 1; - } - - // Vertex texture samplers. 
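
// --- Illustrative sketch (not part of the patch) ---
// The handle_as() calls above presumably take the concrete D3D interface as
// a template argument -- handle_as<ID3D11VertexShader>() for VSSetShader and
// handle_as<ID3D11PixelShader>() for PSSetShader -- casting the resource's
// opaque handle() pointer. A minimal form of such an accessor, assumed
// rather than taken from this patch:
struct ResourceSketch {
  void* handle_value;
  void* handle() const { return handle_value; }
  template <typename T>
  T* handle_as() const { return reinterpret_cast<T*>(handle()); }
};
// --- end sketch ---
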
- auto tex_inputs = state_.vertex_shader->GetTextureBufferInputs(); - for (size_t n = 0; n < tex_inputs->count; n++) { - auto input = tex_inputs->descs[n]; - if (PrepareTextureSampler(XE_GPU_SHADER_TYPE_VERTEX, input)) { - XELOGE("D3D11: unable to prepare texture buffer"); - return 1; - } - } - - // Pixel shader texture sampler. - XEASSERTNOTNULL(state_.pixel_shader); - tex_inputs = state_.pixel_shader->GetTextureBufferInputs(); - for (size_t n = 0; n < tex_inputs->count; n++) { - auto input = tex_inputs->descs[n]; - if (PrepareTextureSampler(XE_GPU_SHADER_TYPE_PIXEL, input)) { - XELOGE("D3D11: unable to prepare texture buffer"); - return 1; - } - } - - return 0; -} - -int D3D11GraphicsDriver::PrepareVertexBuffer(Shader::vtx_buffer_desc_t& desc) { - SCOPE_profile_cpu_f("gpu"); - - D3D11VertexShader* vs = state_.vertex_shader; + auto vs = static_cast(command.vertex_shader); if (!vs) { return 1; } - RegisterFile& rf = register_file_; - int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + (desc.fetch_slot / 3) * 6; - xe_gpu_fetch_group_t* group = (xe_gpu_fetch_group_t*)&rf.values[r]; - xe_gpu_vertex_fetch_t* fetch = NULL; - switch (desc.fetch_slot % 3) { - case 0: - fetch = &group->vertex_fetch_0; + // Switch primitive topology. + // Some are unsupported on D3D11 and must be emulated. + D3D11_PRIMITIVE_TOPOLOGY primitive_topology; + D3D11GeometryShader* geometry_shader = NULL; + switch (command.prim_type) { + case XE_GPU_PRIMITIVE_TYPE_POINT_LIST: + primitive_topology = D3D_PRIMITIVE_TOPOLOGY_POINTLIST; + if (vs->DemandGeometryShader( + D3D11VertexShaderResource::POINT_SPRITE_SHADER, &geometry_shader)) { + return 1; + } break; - case 1: - fetch = &group->vertex_fetch_1; + case XE_GPU_PRIMITIVE_TYPE_LINE_LIST: + primitive_topology = D3D_PRIMITIVE_TOPOLOGY_LINELIST; break; - case 2: - fetch = &group->vertex_fetch_2; + case XE_GPU_PRIMITIVE_TYPE_LINE_STRIP: + primitive_topology = D3D_PRIMITIVE_TOPOLOGY_LINESTRIP; + break; + case XE_GPU_PRIMITIVE_TYPE_TRIANGLE_LIST: + primitive_topology = D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST; + break; + case XE_GPU_PRIMITIVE_TYPE_TRIANGLE_STRIP: + primitive_topology = D3D_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP; + break; + case XE_GPU_PRIMITIVE_TYPE_RECTANGLE_LIST: + primitive_topology = D3D_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP; + if (vs->DemandGeometryShader( + D3D11VertexShaderResource::RECT_LIST_SHADER, &geometry_shader)) { + return 1; + } + break; + case XE_GPU_PRIMITIVE_TYPE_QUAD_LIST: + primitive_topology = D3D_PRIMITIVE_TOPOLOGY_LINELIST_ADJ; + if (vs->DemandGeometryShader( + D3D11VertexShaderResource::QUAD_LIST_SHADER, &geometry_shader)) { + return 1; + } + break; + default: + case XE_GPU_PRIMITIVE_TYPE_TRIANGLE_FAN: + case XE_GPU_PRIMITIVE_TYPE_UNKNOWN_07: + case XE_GPU_PRIMITIVE_TYPE_LINE_LOOP: + primitive_topology = D3D_PRIMITIVE_TOPOLOGY_POINTLIST; + XELOGE("D3D11: unsupported primitive type %d", command.prim_type); break; } - XEASSERTNOTNULL(fetch); - // If this assert doesn't hold, maybe we just abort? - XEASSERT(fetch->type == 0x3); - XEASSERTNOTZERO(fetch->size); + context_->IASetPrimitiveTopology(primitive_topology); - VertexBufferInfo info; - // TODO(benvanik): make these structs the same so we can share. 
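
// --- Illustrative note (not part of the patch) ---
// Why QUAD_LIST binds D3D_PRIMITIVE_TOPOLOGY_LINELIST_ADJ above: a line with
// adjacency delivers exactly four vertices per primitive to the geometry
// shader -- the same arity as one quad -- so the QUAD_LIST_SHADER can emit
// each quad as a four-vertex triangle strip. A sketch of that GS contract in
// HLSL, kept in comment form (the real shader lives in D3D11GeometryShader):
//   [maxvertexcount(4)]
//   void main(lineadj VSOut v[4], inout TriangleStream<VSOut> stream) {
//     stream.Append(v[0]);
//     stream.Append(v[1]);
//     stream.Append(v[3]);  // strip order 0,1,3,2 covers the quad
//     stream.Append(v[2]);
//     stream.RestartStrip();
//   }
// RECT_LIST similarly rides a strip topology plus RECT_LIST_SHADER, since the
// hardware rectangle primitive (which presumably supplies one corner
// implicitly) has no direct D3D11 equivalent.
// --- end note ---
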
- info.layout.stride_words = desc.stride_words; - info.layout.element_count = desc.element_count; - for (uint32_t i = 0; i < desc.element_count; ++i) { - const auto& src_el = desc.elements[i]; - auto& dest_el = info.layout.elements[i]; - dest_el.format = src_el.format; - dest_el.offset_words = src_el.offset_words; - dest_el.size_words = src_el.size_words; + // Set the geometry shader, if we are emulating a primitive type. + if (geometry_shader) { + context_->GSSetShader(geometry_shader->handle(), NULL, NULL); + context_->GSSetConstantBuffers(0, 1, &state_.constant_buffers.gs_consts); + } else { + context_->GSSetShader(NULL, NULL, NULL); } - uint32_t address = (fetch->address << 2) + address_translation_; - const uint8_t* src = reinterpret_cast( - memory_->Translate(address)); - - VertexBuffer* vertex_buffer = buffer_cache_->FetchVertexBuffer( - info, src, fetch->size * 4); - if (!vertex_buffer) { - XELOGE("D3D11: unable to create vertex fetch buffer"); - return 1; + // Index buffer, if any. May be auto draw. + if (command.index_buffer) { + DXGI_FORMAT format; + switch (command.index_buffer->info().format) { + case INDEX_FORMAT_16BIT: + format = DXGI_FORMAT_R16_UINT; + break; + case INDEX_FORMAT_32BIT: + format = DXGI_FORMAT_R32_UINT; + break; + } + context_->IASetIndexBuffer( + command.index_buffer->handle_as(), + format, 0); + } else { + context_->IASetIndexBuffer(nullptr, DXGI_FORMAT_UNKNOWN, 0); } - auto d3d_vb = static_cast(vertex_buffer); - // TODO(benvanik): always dword aligned? - uint32_t stride = desc.stride_words * 4; - uint32_t offset = 0; - int vb_slot = desc.input_index; - ID3D11Buffer* buffers[] = { d3d_vb->handle() }; - context_->IASetVertexBuffers(vb_slot, XECOUNT(buffers), buffers, - &stride, &offset); - - return 0; -} - -int D3D11GraphicsDriver::PrepareTextureFetchers() { - SCOPE_profile_cpu_f("gpu"); - - RegisterFile& rf = register_file_; - - for (int n = 0; n < XECOUNT(state_.texture_fetchers); n++) { - auto& fetcher = state_.texture_fetchers[n]; - - // TODO(benvanik): quick validate without refetching. - fetcher.enabled = false; - fetcher.view = NULL; - - int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + n * 6; - xe_gpu_fetch_group_t* group = (xe_gpu_fetch_group_t*)&rf.values[r]; - auto& fetch = group->texture_fetch; - if (fetch.type != 0x2) { - continue; - } - - // Stash a copy of the fetch register. - fetcher.fetch = fetch; - - // Fetch texture from the cache. - uint32_t address = (fetch.address << 12) + address_translation_; - auto texture_view = texture_cache_->FetchTexture(address, fetch); - if (!texture_view) { - XELOGW("D3D11: unable to fetch texture at %.8X", address); - continue; - } - if (texture_view->format == DXGI_FORMAT_UNKNOWN) { - XELOGW("D3D11: unknown texture format %d", fetch.format); - continue; - } - fetcher.view = static_cast(texture_view); - - // Only enable if we get all the way down here successfully. - fetcher.enabled = true; + // All vertex buffers. + for (auto i = 0; i < command.vertex_buffer_count; ++i) { + const auto& vb = command.vertex_buffers[i]; + auto buffer = vb.buffer->handle_as(); + auto stride = vb.stride; + auto offset = vb.offset; + context_->IASetVertexBuffers(vb.input_index, 1, &buffer, + &stride, &offset); } return 0; } -int D3D11GraphicsDriver::PrepareTextureSampler( - xenos::XE_GPU_SHADER_TYPE shader_type, Shader::tex_buffer_desc_t& desc) { +int D3D11GraphicsDriver::SetupSamplers(const DrawCommand& command) { SCOPE_profile_cpu_f("gpu"); - // If the fetcher is disabled or invalid, set some default textures. 
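
// --- Illustrative sketch (not part of the patch) ---
// The index-format switch above leaves `format` unset if info().format ever
// falls outside the two known values; a defensive variant, with the enum and
// DXGI names taken from the patch and everything else assumed:
static DXGI_FORMAT IndexFormatToDXGI(int format) {
  switch (format) {
    case INDEX_FORMAT_16BIT: return DXGI_FORMAT_R16_UINT;
    case INDEX_FORMAT_32BIT: return DXGI_FORMAT_R32_UINT;
    default:                 return DXGI_FORMAT_UNKNOWN;  // caller rejects draw
  }
}
// The vertex-buffer loop below it binds one IA slot per DrawCommand entry;
// IASetVertexBuffers takes stride/offset arrays by pointer, hence the locals.
// --- end sketch ---
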
- auto& fetcher = state_.texture_fetchers[desc.fetch_slot];
- if (!fetcher.enabled ||
- fetcher.view->format == DXGI_FORMAT_UNKNOWN) {
- XELOGW("D3D11: ignoring texture fetch: disabled or an unknown format");
- if (shader_type == XE_GPU_SHADER_TYPE_VERTEX) {
- context_->VSSetShaderResources(desc.input_index,
- 1, &invalid_texture_view_);
- context_->VSSetSamplers(desc.input_index,
- 1, &invalid_texture_sampler_state_);
+ for (auto i = 0; i < command.vertex_shader_sampler_count; ++i) {
+ const auto& input = command.vertex_shader_samplers[i];
+ if (input.texture) {
+ auto texture = input.texture->handle_as<ID3D11ShaderResourceView>();
+ context_->VSSetShaderResources(input.input_index, 1, &texture);
 } else {
- context_->PSSetShaderResources(desc.input_index,
- 1, &invalid_texture_view_);
- context_->PSSetSamplers(desc.input_index,
- 1, &invalid_texture_sampler_state_);
+ context_->VSSetShaderResources(input.input_index, 1, &invalid_texture_view_);
 }
+ if (input.sampler_state) {
+ auto sampler_state = input.sampler_state->handle_as<ID3D11SamplerState>();
+ context_->VSSetSamplers(input.input_index, 1, &sampler_state);
+ } else {
+ context_->VSSetSamplers(input.input_index, 1, &invalid_texture_sampler_state_);
+ }
+ }
+
+ for (auto i = 0; i < command.pixel_shader_sampler_count; ++i) {
+ const auto& input = command.pixel_shader_samplers[i];
+ if (input.texture) {
+ auto texture = input.texture->handle_as<ID3D11ShaderResourceView>();
+ context_->PSSetShaderResources(input.input_index, 1, &texture);
+ } else {
+ context_->PSSetShaderResources(input.input_index, 1, &invalid_texture_view_);
+ }
+ if (input.sampler_state) {
+ auto sampler_state = input.sampler_state->handle_as<ID3D11SamplerState>();
+ context_->PSSetSamplers(input.input_index, 1, &sampler_state);
+ } else {
+ context_->PSSetSamplers(input.input_index, 1, &invalid_texture_sampler_state_);
+ }
+ }
+
+ return 0;
+}
+
+int D3D11GraphicsDriver::RebuildRenderTargets(uint32_t width,
+ uint32_t height) {
+ if (width == render_targets_.width &&
+ height == render_targets_.height) {
+ // Cached copies are good.
 return 0;
 }
- // Get and set the real shader resource views/samplers.
- if (shader_type == XE_GPU_SHADER_TYPE_VERTEX) {
- context_->VSSetShaderResources(desc.input_index, 1, &fetcher.view->srv);
- } else {
- context_->PSSetShaderResources(desc.input_index, 1, &fetcher.view->srv);
- }
- ID3D11SamplerState* sampler_state = texture_cache_->GetSamplerState(
- fetcher.fetch, desc);
- if (!sampler_state) {
- XELOGW("D3D11: failed to set sampler state; ignoring texture");
- return 1;
- }
- if (shader_type == XE_GPU_SHADER_TYPE_VERTEX) {
- context_->VSSetSamplers(desc.input_index, 1, &sampler_state);
- } else {
- context_->PSSetSamplers(desc.input_index, 1, &sampler_state);
- }
-
- return 0;
-}
-
-int D3D11GraphicsDriver::PrepareIndexBuffer(
- bool index_32bit, uint32_t index_count,
- uint32_t index_base, uint32_t index_size, uint32_t endianness) {
 SCOPE_profile_cpu_f("gpu");

- RegisterFile& rf = register_file_;
+ // Remove old versions.
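
// --- Illustrative sketch (not part of the patch) ---
// One way InitializeInvalidTexture() (declared in the header below) could
// create the fallback SRV bound above when a fetch slot is empty. Purely
// hypothetical; the patch does not show its body:
static void CreateInvalidTexture(ID3D11Device* device,
                                 ID3D11ShaderResourceView** out_srv) {
  const uint32_t magenta = 0xFFFF00FF;  // easy to spot in captures
  D3D11_TEXTURE2D_DESC desc = {};
  desc.Width = desc.Height = 1;
  desc.MipLevels = desc.ArraySize = 1;
  desc.Format = DXGI_FORMAT_R8G8B8A8_UNORM;
  desc.SampleDesc.Count = 1;
  desc.Usage = D3D11_USAGE_IMMUTABLE;
  desc.BindFlags = D3D11_BIND_SHADER_RESOURCE;
  D3D11_SUBRESOURCE_DATA data = { &magenta, sizeof(magenta), 0 };
  ID3D11Texture2D* texture = nullptr;
  if (SUCCEEDED(device->CreateTexture2D(&desc, &data, &texture))) {
    device->CreateShaderResourceView(texture, nullptr, out_srv);
    texture->Release();  // the view keeps its own reference
  }
}
// --- end sketch ---
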
+ for (int n = 0; n < XECOUNT(render_targets_.color_buffers); n++) { + auto& cb = render_targets_.color_buffers[n]; + XESAFERELEASE(cb.buffer); + XESAFERELEASE(cb.color_view_8888); + } + XESAFERELEASE(render_targets_.depth_buffer); + XESAFERELEASE(render_targets_.depth_view_d28s8); + XESAFERELEASE(render_targets_.depth_view_d28fs8); - uint32_t address = index_base + address_translation_; + render_targets_.width = width; + render_targets_.height = height; - IndexBufferInfo info; - info.endianness = endianness; - info.index_32bit = index_32bit; - info.index_count = index_count; - info.index_size = index_size; - auto ib = static_cast(buffer_cache_->FetchIndexBuffer( - info, memory_->Translate(address), index_size)); - if (!ib) { - return 1; + if (!width || !height) { + // This should only happen when cleaning up. + return 0; } - DXGI_FORMAT format; - format = index_32bit ? DXGI_FORMAT_R32_UINT : DXGI_FORMAT_R16_UINT; - context_->IASetIndexBuffer(ib->handle(), format, 0); + for (int n = 0; n < XECOUNT(render_targets_.color_buffers); n++) { + auto& cb = render_targets_.color_buffers[n]; + D3D11_TEXTURE2D_DESC color_buffer_desc; + xe_zero_struct(&color_buffer_desc, sizeof(color_buffer_desc)); + color_buffer_desc.Width = width; + color_buffer_desc.Height = height; + color_buffer_desc.MipLevels = 1; + color_buffer_desc.ArraySize = 1; + color_buffer_desc.Format = DXGI_FORMAT_R8G8B8A8_UNORM; + color_buffer_desc.SampleDesc.Count = 1; + color_buffer_desc.SampleDesc.Quality = 0; + color_buffer_desc.Usage = D3D11_USAGE_DEFAULT; + color_buffer_desc.BindFlags = + D3D11_BIND_SHADER_RESOURCE | + D3D11_BIND_RENDER_TARGET; + color_buffer_desc.CPUAccessFlags = 0; + color_buffer_desc.MiscFlags = 0; + device_->CreateTexture2D( + &color_buffer_desc, NULL, &cb.buffer); + + D3D11_RENDER_TARGET_VIEW_DESC render_target_view_desc; + xe_zero_struct(&render_target_view_desc, sizeof(render_target_view_desc)); + render_target_view_desc.Format = DXGI_FORMAT_R8G8B8A8_UNORM; + render_target_view_desc.ViewDimension = D3D11_RTV_DIMENSION_TEXTURE2D; + // render_target_view_desc.Buffer ? 
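
// --- Illustrative sketch (not part of the patch) ---
// The Create* calls above and below discard their HRESULTs; a pattern for
// surfacing failures, shown here only as a sketch (XELOGE is the project's
// own logging macro; the wrapper macro is an assumption):
// #define XE_CHECK_HR(expr) \
//   do { \
//     HRESULT hr_ = (expr); \
//     if (FAILED(hr_)) { \
//       XELOGE("D3D11: %s failed (0x%.8X)", #expr, hr_); \
//       return 1; \
//     } \
//   } while (0)
// e.g. XE_CHECK_HR(device_->CreateTexture2D(&color_buffer_desc, NULL, &cb.buffer));
// --- end sketch ---
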
+ device_->CreateRenderTargetView( + cb.buffer, + &render_target_view_desc, + &cb.color_view_8888); + } + + D3D11_TEXTURE2D_DESC depth_stencil_desc; + xe_zero_struct(&depth_stencil_desc, sizeof(depth_stencil_desc)); + depth_stencil_desc.Width = width; + depth_stencil_desc.Height = height; + depth_stencil_desc.MipLevels = 1; + depth_stencil_desc.ArraySize = 1; + depth_stencil_desc.Format = DXGI_FORMAT_D24_UNORM_S8_UINT; + depth_stencil_desc.SampleDesc.Count = 1; + depth_stencil_desc.SampleDesc.Quality = 0; + depth_stencil_desc.Usage = D3D11_USAGE_DEFAULT; + depth_stencil_desc.BindFlags = D3D11_BIND_DEPTH_STENCIL; + depth_stencil_desc.CPUAccessFlags = 0; + depth_stencil_desc.MiscFlags = 0; + device_->CreateTexture2D( + &depth_stencil_desc, NULL, &render_targets_.depth_buffer); + + D3D11_DEPTH_STENCIL_VIEW_DESC depth_stencil_view_desc; + xe_zero_struct(&depth_stencil_view_desc, sizeof(depth_stencil_view_desc)); + depth_stencil_view_desc.Format = DXGI_FORMAT_D24_UNORM_S8_UINT; + depth_stencil_view_desc.ViewDimension = D3D11_DSV_DIMENSION_TEXTURE2D; + depth_stencil_view_desc.Flags = 0; + device_->CreateDepthStencilView( + render_targets_.depth_buffer, + &depth_stencil_view_desc, + &render_targets_.depth_view_d28s8); return 0; } diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_driver.h b/src/xenia/gpu/d3d11/d3d11_graphics_driver.h index 2f2316488..4faa493ee 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_driver.h +++ b/src/xenia/gpu/d3d11/d3d11_graphics_driver.h @@ -13,8 +13,8 @@ #include #include -#include #include +#include #include #include @@ -24,13 +24,6 @@ namespace xe { namespace gpu { namespace d3d11 { -class D3D11BufferCache; -class D3D11PixelShader; -class D3D11ShaderCache; -class D3D11TextureCache; -struct D3D11TextureView; -class D3D11VertexShader; - class D3D11GraphicsDriver : public GraphicsDriver { public: @@ -38,48 +31,32 @@ public: Memory* memory, IDXGISwapChain* swap_chain, ID3D11Device* device); virtual ~D3D11GraphicsDriver(); - virtual void Initialize(); + ResourceCache* resource_cache() const override { return resource_cache_; } - virtual void InvalidateState( - uint32_t mask); - virtual void SetShader( - xenos::XE_GPU_SHADER_TYPE type, - uint32_t address, - uint32_t start, - uint32_t length); - virtual void DrawIndexBuffer( - xenos::XE_GPU_PRIMITIVE_TYPE prim_type, - bool index_32bit, uint32_t index_count, - uint32_t index_base, uint32_t index_size, uint32_t endianness); - virtual void DrawIndexAuto( - xenos::XE_GPU_PRIMITIVE_TYPE prim_type, - uint32_t index_count); + int Initialize() override; + + int Draw(const DrawCommand& command) override; // TODO(benvanik): figure this out. 
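
// --- Illustrative sketch (not part of the patch) ---
// The header declares Draw(const DrawCommand&) above, but its body is not in
// this hunk. A plausible ordering, given the helpers' state dependencies
// (every helper named below exists in this header; the flow itself and the
// final dispatch are assumptions):
//
//   int D3D11GraphicsDriver::Draw(const DrawCommand& command) {
//     SCOPE_profile_cpu_f("gpu");
//     if (UpdateState(command) ||          // render targets, raster, blend
//         SetupConstantBuffers(command) || // float/loop/bool uploads
//         SetupShaders(command) ||         // VS/PS + input layout
//         SetupInputAssembly(command) ||   // topology, GS, IB/VBs
//         SetupSamplers(command)) {        // SRVs + sampler states
//       return 1;                          // helpers return nonzero on failure
//     }
//     // ...issue DrawIndexed()/Draw() depending on command...
//     return 0;
//   }
// --- end sketch ---
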
- virtual int Resolve(); + int Resolve() override; private: - int SetupDraw(xenos::XE_GPU_PRIMITIVE_TYPE prim_type); + void InitializeInvalidTexture(); + + int UpdateState(const DrawCommand& command); + int SetupConstantBuffers(const DrawCommand& command); + int SetupShaders(const DrawCommand& command); + int SetupInputAssembly(const DrawCommand& command); + int SetupSamplers(const DrawCommand& command); + int RebuildRenderTargets(uint32_t width, uint32_t height); - int UpdateState(uint32_t state_overrides = 0); - int UpdateConstantBuffers(); - int BindShaders(); - int PrepareFetchers(); - int PrepareVertexBuffer(Shader::vtx_buffer_desc_t& desc); - int PrepareTextureFetchers(); - int PrepareTextureSampler(xenos::XE_GPU_SHADER_TYPE shader_type, - Shader::tex_buffer_desc_t& desc); - int PrepareIndexBuffer( - bool index_32bit, uint32_t index_count, - uint32_t index_base, uint32_t index_size, uint32_t endianness); private: IDXGISwapChain* swap_chain_; ID3D11Device* device_; ID3D11DeviceContext* context_; - D3D11BufferCache* buffer_cache_; - D3D11ShaderCache* shader_cache_; - D3D11TextureCache* texture_cache_; + + D3D11ResourceCache* resource_cache_; ID3D11ShaderResourceView* invalid_texture_view_; ID3D11SamplerState* invalid_texture_sampler_state_; @@ -97,9 +74,6 @@ private: } render_targets_; struct { - D3D11VertexShader* vertex_shader; - D3D11PixelShader* pixel_shader; - struct { ID3D11Buffer* float_constants; ID3D11Buffer* bool_constants; @@ -107,17 +81,7 @@ private: ID3D11Buffer* vs_consts; ID3D11Buffer* gs_consts; } constant_buffers; - - struct { - bool enabled; - xenos::xe_gpu_texture_fetch_t fetch; - D3D11TextureView* view; - } texture_fetchers[32]; } state_; - - enum StateOverrides { - STATE_OVERRIDE_DISABLE_CULLING = (1 << 0), - }; }; diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_system.cc b/src/xenia/gpu/d3d11/d3d11_graphics_system.cc index 553ed8828..7258195d3 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_system.cc +++ b/src/xenia/gpu/d3d11/d3d11_graphics_system.cc @@ -146,12 +146,18 @@ void D3D11GraphicsSystem::Initialize() { XEASSERTNULL(driver_); driver_ = new D3D11GraphicsDriver( memory_, window_->swap_chain(), device_); + if (driver_->Initialize()) { + XELOGE("Unable to initialize D3D11 driver"); + return; + } // Initial vsync kick. DispatchInterruptCallback(0); } void D3D11GraphicsSystem::Pump() { + SCOPE_profile_cpu_f("gpu"); + if (swap_pending_) { swap_pending_ = false; diff --git a/src/xenia/gpu/d3d11/d3d11_resource_cache.cc b/src/xenia/gpu/d3d11/d3d11_resource_cache.cc new file mode 100644 index 000000000..145e3d395 --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_resource_cache.cc @@ -0,0 +1,71 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include + +#include +#include +#include +#include +#include + + +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::d3d11; + + +D3D11ResourceCache::D3D11ResourceCache(Memory* memory, + ID3D11Device* device, + ID3D11DeviceContext* context) + : ResourceCache(memory), + device_(device), context_(context) { + device_->AddRef(); + context_->AddRef(); +} + +D3D11ResourceCache::~D3D11ResourceCache() { + XESAFERELEASE(device_); + XESAFERELEASE(context_); +} + +VertexShaderResource* D3D11ResourceCache::CreateVertexShader( + const MemoryRange& memory_range, + const VertexShaderResource::Info& info) { + return new D3D11VertexShaderResource(this, memory_range, info); +} + +PixelShaderResource* D3D11ResourceCache::CreatePixelShader( + const MemoryRange& memory_range, + const PixelShaderResource::Info& info) { + return new D3D11PixelShaderResource(this, memory_range, info); +} + +TextureResource* D3D11ResourceCache::CreateTexture( + const MemoryRange& memory_range, + const TextureResource::Info& info) { + return new D3D11TextureResource(this, memory_range, info); +} + +SamplerStateResource* D3D11ResourceCache::CreateSamplerState( + const SamplerStateResource::Info& info) { + return new D3D11SamplerStateResource(this, info); +} + +IndexBufferResource* D3D11ResourceCache::CreateIndexBuffer( + const MemoryRange& memory_range, + const IndexBufferResource::Info& info) { + return new D3D11IndexBufferResource(this, memory_range, info); +} + +VertexBufferResource* D3D11ResourceCache::CreateVertexBuffer( + const MemoryRange& memory_range, + const VertexBufferResource::Info& info) { + return new D3D11VertexBufferResource(this, memory_range, info); +} diff --git a/src/xenia/gpu/d3d11/d3d11_resource_cache.h b/src/xenia/gpu/d3d11/d3d11_resource_cache.h new file mode 100644 index 000000000..27248eb9c --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_resource_cache.h @@ -0,0 +1,64 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_GPU_D3D11_D3D11_RESOURCE_CACHE_H_ +#define XENIA_GPU_D3D11_D3D11_RESOURCE_CACHE_H_ + +#include + +#include + +#include + + +namespace xe { +namespace gpu { +namespace d3d11 { + + +class D3D11ResourceCache : public ResourceCache { +public: + D3D11ResourceCache(Memory* memory, + ID3D11Device* device, ID3D11DeviceContext* context); + virtual ~D3D11ResourceCache(); + + ID3D11Device* device() const { return device_; } + ID3D11DeviceContext* context() const { return context_; } + +protected: + VertexShaderResource* CreateVertexShader( + const MemoryRange& memory_range, + const VertexShaderResource::Info& info) override; + PixelShaderResource* CreatePixelShader( + const MemoryRange& memory_range, + const PixelShaderResource::Info& info) override; + TextureResource* CreateTexture( + const MemoryRange& memory_range, + const TextureResource::Info& info) override; + SamplerStateResource* CreateSamplerState( + const SamplerStateResource::Info& info) override; + IndexBufferResource* CreateIndexBuffer( + const MemoryRange& memory_range, + const IndexBufferResource::Info& info) override; + VertexBufferResource* CreateVertexBuffer( + const MemoryRange& memory_range, + const VertexBufferResource::Info& info) override; + +private: + ID3D11Device* device_; + ID3D11DeviceContext* context_; +}; + + +} // namespace d3d11 +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_D3D11_D3D11_RESOURCE_CACHE_H_ diff --git a/src/xenia/gpu/d3d11/d3d11_texture_cache.cc b/src/xenia/gpu/d3d11/d3d11_sampler_state_resource.cc similarity index 51% rename from src/xenia/gpu/d3d11/d3d11_texture_cache.cc rename to src/xenia/gpu/d3d11/d3d11_sampler_state_resource.cc index eb3442bfc..7fb09858a 100644 --- a/src/xenia/gpu/d3d11/d3d11_texture_cache.cc +++ b/src/xenia/gpu/d3d11/d3d11_sampler_state_resource.cc @@ -7,53 +7,36 @@ ****************************************************************************** */ -#include +#include -#include +#include +using namespace std; using namespace xe; using namespace xe::gpu; using namespace xe::gpu::d3d11; +using namespace xe::gpu::xenos; -D3D11TextureCache::D3D11TextureCache( - Memory* memory, - ID3D11DeviceContext* context, ID3D11Device* device) - : TextureCache(memory), - context_(context), device_(device) { - context_->AddRef(); - device_->AddRef(); +D3D11SamplerStateResource::D3D11SamplerStateResource( + D3D11ResourceCache* resource_cache, const Info& info) + : SamplerStateResource(info), + resource_cache_(resource_cache), + handle_(nullptr) { } -D3D11TextureCache::~D3D11TextureCache() { - for (auto it = samplers_.begin(); it != samplers_.end(); ++it) { - auto& cached_state = it->second; - XESAFERELEASE(cached_state.state); +D3D11SamplerStateResource::~D3D11SamplerStateResource() { + XESAFERELEASE(handle_); +} + +int D3D11SamplerStateResource::Prepare() { + if (handle_) { + return 0; } - samplers_.clear(); - XESAFERELEASE(device_); - XESAFERELEASE(context_); -} - -Texture* D3D11TextureCache::CreateTexture( - uint32_t address, const uint8_t* host_address, - const xenos::xe_gpu_texture_fetch_t& fetch) { - return new D3D11Texture(this, address, host_address); -} - -ID3D11SamplerState* D3D11TextureCache::GetSamplerState( - const xenos::xe_gpu_texture_fetch_t& fetch, - const Shader::tex_buffer_desc_t& desc) { D3D11_SAMPLER_DESC sampler_desc; xe_zero_struct(&sampler_desc, sizeof(sampler_desc)); - uint32_t min_filter = desc.tex_fetch.min_filter == 3 ? 
- fetch.min_filter : desc.tex_fetch.min_filter; - uint32_t mag_filter = desc.tex_fetch.mag_filter == 3 ? - fetch.mag_filter : desc.tex_fetch.mag_filter; - uint32_t mip_filter = desc.tex_fetch.mip_filter == 3 ? - fetch.mip_filter : desc.tex_fetch.mip_filter; // MIN, MAG, MIP static const D3D11_FILTER filter_matrix[2][2][3] = { { @@ -87,7 +70,8 @@ ID3D11SamplerState* D3D11TextureCache::GetSamplerState( }, }, }; - sampler_desc.Filter = filter_matrix[min_filter][mag_filter][mip_filter]; + sampler_desc.Filter = + filter_matrix[info_.min_filter][info_.mag_filter][info_.mip_filter]; static const D3D11_TEXTURE_ADDRESS_MODE mode_map[] = { D3D11_TEXTURE_ADDRESS_WRAP, D3D11_TEXTURE_ADDRESS_MIRROR, @@ -98,9 +82,9 @@ ID3D11SamplerState* D3D11TextureCache::GetSamplerState( D3D11_TEXTURE_ADDRESS_BORDER, // ? D3D11_TEXTURE_ADDRESS_MIRROR, // ? }; - sampler_desc.AddressU = mode_map[fetch.clamp_x]; - sampler_desc.AddressV = mode_map[fetch.clamp_y]; - sampler_desc.AddressW = mode_map[fetch.clamp_z]; + sampler_desc.AddressU = mode_map[info_.clamp_u]; + sampler_desc.AddressV = mode_map[info_.clamp_v]; + sampler_desc.AddressW = mode_map[info_.clamp_w]; sampler_desc.MipLODBias; sampler_desc.MaxAnisotropy = 1; sampler_desc.ComparisonFunc = D3D11_COMPARISON_ALWAYS; @@ -111,29 +95,12 @@ ID3D11SamplerState* D3D11TextureCache::GetSamplerState( sampler_desc.MinLOD; sampler_desc.MaxLOD; - // TODO(benvanik): do this earlier without having to setup the whole struct? - size_t hash = hash_combine( - sampler_desc.Filter, - sampler_desc.AddressU, - sampler_desc.AddressV, - sampler_desc.AddressW); - auto range = samplers_.equal_range(hash); - for (auto it = range.first; it != range.second; ++it) { - const auto& cached_state = it->second; - // TODO(benvanik): faster compare? - if (memcmp(&sampler_desc, &cached_state.desc, sizeof(sampler_desc)) == 0) { - return cached_state.state; - } - } - - ID3D11SamplerState* sampler_state = NULL; - HRESULT hr = device_->CreateSamplerState(&sampler_desc, &sampler_state); + HRESULT hr = resource_cache_->device()->CreateSamplerState( + &sampler_desc, &handle_); if (FAILED(hr)) { XELOGE("D3D11: unable to create sampler state"); - return nullptr; + return 1; } - samplers_.insert({ hash, { sampler_desc, sampler_state } }); - - return sampler_state; + return 0; } diff --git a/src/xenia/gpu/d3d11/d3d11_sampler_state_resource.h b/src/xenia/gpu/d3d11/d3d11_sampler_state_resource.h new file mode 100644 index 000000000..6097339b4 --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_sampler_state_resource.h @@ -0,0 +1,48 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_GPU_D3D11_D3D11_SAMPLER_STATE_RESOURCE_H_ +#define XENIA_GPU_D3D11_D3D11_SAMPLER_STATE_RESOURCE_H_ + +#include +#include +#include + +#include + + +namespace xe { +namespace gpu { +namespace d3d11 { + +class D3D11ResourceCache; + + +class D3D11SamplerStateResource : public SamplerStateResource { +public: + D3D11SamplerStateResource(D3D11ResourceCache* resource_cache, + const Info& info); + ~D3D11SamplerStateResource() override; + + void* handle() const override { return handle_; } + + int Prepare() override; + +protected: + D3D11ResourceCache* resource_cache_; + ID3D11SamplerState* handle_; +}; + + +} // namespace d3d11 +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_D3D11_D3D11_SAMPLER_STATE_RESOURCE_H_ diff --git a/src/xenia/gpu/d3d11/d3d11_shader.cc b/src/xenia/gpu/d3d11/d3d11_shader.cc deleted file mode 100644 index 97e0cb295..000000000 --- a/src/xenia/gpu/d3d11/d3d11_shader.cc +++ /dev/null @@ -1,2059 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. * - ****************************************************************************** - */ - -#include - -#include -#include -#include - -#include - - -using namespace xe; -using namespace xe::gpu; -using namespace xe::gpu::d3d11; -using namespace xe::gpu::xenos; - - -namespace { - -const int OUTPUT_CAPACITY = 64 * 1024; - -int GetFormatComponentCount(uint32_t format) { - switch (format) { - case FMT_32: - case FMT_32_FLOAT: - return 1; - case FMT_16_16: - case FMT_16_16_FLOAT: - case FMT_32_32: - case FMT_32_32_FLOAT: - return 2; - case FMT_10_11_11: - case FMT_11_11_10: - case FMT_32_32_32_FLOAT: - return 3; - case FMT_8_8_8_8: - case FMT_2_10_10_10: - case FMT_16_16_16_16: - case FMT_16_16_16_16_FLOAT: - case FMT_32_32_32_32: - case FMT_32_32_32_32_FLOAT: - return 4; - default: - XELOGE("Unknown vertex format: %d", format); - XEASSERTALWAYS(); - return 4; - } -} - -const char* GetFormatTypeName( - uint32_t format, uint32_t format_comp_all, uint32_t num_format_all) { - switch (format) { - case FMT_32: - return format_comp_all ? "int" : "uint"; - case FMT_32_FLOAT: - return "float"; - case FMT_16_16: - case FMT_32_32: - if (!num_format_all) { - return format_comp_all ? "snorm float2" : "unorm float2"; - } else { - return format_comp_all ? "int2" : "uint2"; - } - case FMT_16_16_FLOAT: - case FMT_32_32_FLOAT: - return "float2"; - case FMT_10_11_11: - case FMT_11_11_10: - return "int3"; // ? - case FMT_32_32_32_FLOAT: - return "float3"; - case FMT_8_8_8_8: - case FMT_2_10_10_10: - case FMT_16_16_16_16: - case FMT_32_32_32_32: - if (!num_format_all) { - return format_comp_all ? "snorm float4" : "unorm float4"; - } else { - return format_comp_all ? "int4" : "uint4"; - } - case FMT_16_16_16_16_FLOAT: - case FMT_32_32_32_32_FLOAT: - return "float4"; - default: - XELOGE("Unknown vertex format: %d", format); - XEASSERTALWAYS(); - return "float4"; - } -} - -} // anonymous namespace - - -struct xe::gpu::d3d11::Output { - char buffer[OUTPUT_CAPACITY]; - size_t capacity; - size_t offset; - Output() : - capacity(OUTPUT_CAPACITY), - offset(0) { - buffer[0] = 0; - } - void append(const char* format, ...) 
{ - va_list args; - va_start(args, format); - int len = xevsnprintfa( - buffer + offset, capacity - offset, format, args); - va_end(args); - offset += len; - buffer[offset] = 0; - } -}; - - -D3D11Shader::D3D11Shader( - ID3D11Device* device, - XE_GPU_SHADER_TYPE type, - const uint8_t* src_ptr, size_t length, - uint64_t hash) : - translated_src_(NULL), - Shader(type, src_ptr, length, hash) { - device_ = device; - device_->AddRef(); -} - -D3D11Shader::~D3D11Shader() { - if (translated_src_) { - xe_free(translated_src_); - } - XESAFERELEASE(device_); -} - -void D3D11Shader::set_translated_src(char* value) { - if (translated_src_) { - xe_free(translated_src_); - } - translated_src_ = xestrdupa(value); -} - -ID3D10Blob* D3D11Shader::Compile(const char* shader_source) { - SCOPE_profile_cpu_f("gpu"); - - // TODO(benvanik): pick shared runtime mode defines. - D3D10_SHADER_MACRO defines[] = { - "TEST_DEFINE", "1", - 0, 0, - }; - - uint32_t flags1 = 0; - flags1 |= D3D10_SHADER_DEBUG; - flags1 |= D3D10_SHADER_ENABLE_STRICTNESS; - uint32_t flags2 = 0; - - // Create a name. - const char* base_path = ""; - if (FLAGS_dump_shaders.size()) { - base_path = FLAGS_dump_shaders.c_str(); - } - char file_name[XE_MAX_PATH]; - xesnprintfa(file_name, XECOUNT(file_name), - "%s/gen_%.16llX.%s", - base_path, - hash_, - type_ == XE_GPU_SHADER_TYPE_VERTEX ? "vs" : "ps"); - - if (FLAGS_dump_shaders.size()) { - FILE* f = fopen(file_name, "w"); - fprintf(f, shader_source); - fprintf(f, "\n\n"); - fprintf(f, "/*\n"); - fprintf(f, disasm_src_); - fprintf(f, " */\n"); - fclose(f); - } - - // Compile shader to bytecode blob. - ID3D10Blob* shader_blob = 0; - ID3D10Blob* error_blob = 0; - HRESULT hr = D3DCompile( - shader_source, strlen(shader_source), - file_name, - defines, NULL, - "main", - type_ == XE_GPU_SHADER_TYPE_VERTEX ? - "vs_5_0" : "ps_5_0", - flags1, flags2, - &shader_blob, &error_blob); - if (error_blob) { - char* msg = (char*)error_blob->GetBufferPointer(); - XELOGE("D3D11: shader compile failed with %s", msg); - } - XESAFERELEASE(error_blob); - if (FAILED(hr)) { - return NULL; - } - return shader_blob; -} - -void D3D11Shader::AppendTextureHeader(Output* output) { - bool fetch_setup[32] = { false }; - - // 1 texture per constant slot, 1 sampler per fetch. - for (uint32_t n = 0; n < tex_buffer_inputs_.count; n++) { - auto& input = tex_buffer_inputs_.descs[n]; - auto& fetch = input.tex_fetch; - - // Add texture, if needed. - if (!fetch_setup[fetch.const_idx]) { - fetch_setup[fetch.const_idx] = true; - const char* texture_type = NULL; - switch (fetch.dimension) { - case DIMENSION_1D: - texture_type = "Texture1D"; - break; - default: - case DIMENSION_2D: - texture_type = "Texture2D"; - break; - case DIMENSION_3D: - texture_type = "Texture3D"; - break; - case DIMENSION_CUBE: - texture_type = "TextureCube"; - break; - } - output->append("%s x_texture_%d;\n", texture_type, fetch.const_idx); - } - - // Add sampler. 
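
// --- Illustrative sketch (not part of the patch) ---
// The removed Output::append above trusts the xevsnprintfa return value; if
// the fixed 64KiB buffer ever fills, `len` reports the untruncated length
// (or a negative value on error), so `offset` can run past `capacity`. A
// clamped variant using standard vsnprintf, assuming equivalent semantics
// and the invariant offset < capacity (requires <cstdarg>, <cstdio>):
static void append_checked(char* buffer, size_t capacity, size_t& offset,
                           const char* format, ...) {
  va_list args;
  va_start(args, format);
  int len = vsnprintf(buffer + offset, capacity - offset, format, args);
  va_end(args);
  if (len > 0) {
    size_t remaining = capacity - offset - 1;  // leave room for the NUL
    offset += (size_t)len < remaining ? (size_t)len : remaining;
  }
  buffer[offset] = 0;
}
// --- end sketch ---
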
- output->append("SamplerState x_sampler_%d;\n", n); - } -} - - -D3D11VertexShader::D3D11VertexShader( - ID3D11Device* device, - const uint8_t* src_ptr, size_t length, - uint64_t hash) : - handle_(0), input_layout_(0), - D3D11Shader(device, XE_GPU_SHADER_TYPE_VERTEX, - src_ptr, length, hash) { - xe_zero_struct(geometry_shaders_, sizeof(geometry_shaders_)); -} - -D3D11VertexShader::~D3D11VertexShader() { - for (size_t n = 0; n < XECOUNT(geometry_shaders_); n++) { - delete geometry_shaders_[n]; - } - XESAFERELEASE(input_layout_); - XESAFERELEASE(handle_); -} - -int D3D11VertexShader::Prepare(xe_gpu_program_cntl_t* program_cntl) { - SCOPE_profile_cpu_f("gpu"); - if (handle_) { - return 0; - } - - // TODO(benvanik): look in file based on hash/etc. - void* byte_code = NULL; - size_t byte_code_length = 0; - - // Translate and compile source. - const char* shader_source = Translate(program_cntl); - if (!shader_source) { - return 1; - } - ID3D10Blob* shader_blob = Compile(shader_source); - if (!shader_blob) { - return 1; - } - byte_code_length = shader_blob->GetBufferSize(); - byte_code = xe_malloc(byte_code_length); - xe_copy_struct( - byte_code, shader_blob->GetBufferPointer(), byte_code_length); - XESAFERELEASE(shader_blob); - - // Create shader. - HRESULT hr = device_->CreateVertexShader( - byte_code, byte_code_length, - NULL, - &handle_); - if (FAILED(hr)) { - XELOGE("D3D11: failed to create vertex shader"); - xe_free(byte_code); - return 1; - } - - // Create input layout. - size_t element_count = 0; - for (uint32_t n = 0; n < vtx_buffer_inputs_.count; n++) { - element_count += vtx_buffer_inputs_.descs[n].element_count; - } - if (!element_count) { - XELOGW("D3D11: vertex shader with zero inputs -- retaining previous values?"); - input_layout_ = NULL; - return 0; - } - - D3D11_INPUT_ELEMENT_DESC* element_descs = - (D3D11_INPUT_ELEMENT_DESC*)xe_alloca( - sizeof(D3D11_INPUT_ELEMENT_DESC) * element_count); - uint32_t el_index = 0; - for (uint32_t n = 0; n < vtx_buffer_inputs_.count; n++) { - auto& input = vtx_buffer_inputs_.descs[n]; - for (uint32_t m = 0; m < input.element_count; m++) { - auto& el = input.elements[m]; - uint32_t vb_slot = input.input_index; - uint32_t num_format_all = el.vtx_fetch.num_format_all; - uint32_t format_comp_all = el.vtx_fetch.format_comp_all; - DXGI_FORMAT vtx_format; - switch (el.format) { - case FMT_8_8_8_8: - if (!num_format_all) { - vtx_format = format_comp_all ? - DXGI_FORMAT_R8G8B8A8_SNORM : DXGI_FORMAT_R8G8B8A8_UNORM; - } else { - vtx_format = format_comp_all ? - DXGI_FORMAT_R8G8B8A8_SINT : DXGI_FORMAT_R8G8B8A8_UINT; - } - break; - case FMT_2_10_10_10: - if (!num_format_all) { - vtx_format = DXGI_FORMAT_R10G10B10A2_UNORM; - } else { - vtx_format = DXGI_FORMAT_R10G10B10A2_UINT; - } - break; - // DXGI_FORMAT_R11G11B10_FLOAT? - case FMT_16_16: - if (!num_format_all) { - vtx_format = format_comp_all ? - DXGI_FORMAT_R16G16_SNORM : DXGI_FORMAT_R16G16_UNORM; - } else { - vtx_format = format_comp_all ? - DXGI_FORMAT_R16G16_SINT : DXGI_FORMAT_R16G16_UINT; - } - break; - case FMT_16_16_16_16: - if (!num_format_all) { - vtx_format = format_comp_all ? - DXGI_FORMAT_R16G16B16A16_SNORM : DXGI_FORMAT_R16G16B16A16_UNORM; - } else { - vtx_format = format_comp_all ? - DXGI_FORMAT_R16G16B16A16_SINT : DXGI_FORMAT_R16G16B16A16_UINT; - } - break; - case FMT_16_16_FLOAT: - vtx_format = DXGI_FORMAT_R16G16_FLOAT; - break; - case FMT_16_16_16_16_FLOAT: - vtx_format = DXGI_FORMAT_R16G16B16A16_FLOAT; - break; - case FMT_32: - vtx_format = format_comp_all ? 
- DXGI_FORMAT_R32_SINT : DXGI_FORMAT_R32_UINT; - break; - case FMT_32_32: - vtx_format = format_comp_all ? - DXGI_FORMAT_R32G32_SINT : DXGI_FORMAT_R32G32_UINT; - break; - case FMT_32_32_32_32: - vtx_format = format_comp_all ? - DXGI_FORMAT_R32G32B32A32_SINT : DXGI_FORMAT_R32G32B32A32_UINT; - break; - case FMT_32_FLOAT: - vtx_format = DXGI_FORMAT_R32_FLOAT; - break; - case FMT_32_32_FLOAT: - vtx_format = DXGI_FORMAT_R32G32_FLOAT; - break; - case FMT_32_32_32_FLOAT: - vtx_format = DXGI_FORMAT_R32G32B32_FLOAT; - break; - case FMT_32_32_32_32_FLOAT: - vtx_format = DXGI_FORMAT_R32G32B32A32_FLOAT; - break; - default: - XEASSERTALWAYS(); - break; - } - element_descs[el_index].SemanticName = "XE_VF"; - element_descs[el_index].SemanticIndex = el_index; - element_descs[el_index].Format = vtx_format; - element_descs[el_index].InputSlot = vb_slot; - element_descs[el_index].AlignedByteOffset = el.offset_words * 4; - element_descs[el_index].InputSlotClass = D3D11_INPUT_PER_VERTEX_DATA; - element_descs[el_index].InstanceDataStepRate = 0; - el_index++; - } - } - hr = device_->CreateInputLayout( - element_descs, - (UINT)element_count, - byte_code, byte_code_length, - &input_layout_); - if (FAILED(hr)) { - XELOGE("D3D11: failed to create vertex shader input layout"); - xe_free(byte_code); - return 1; - } - - xe_free(byte_code); - - is_prepared_ = true; - return 0; -} - -const char* D3D11VertexShader::Translate(xe_gpu_program_cntl_t* program_cntl) { - SCOPE_profile_cpu_f("gpu"); - - Output* output = new Output(); - xe_gpu_translate_ctx_t ctx; - ctx.output = output; - ctx.type = type_; - ctx.tex_fetch_index = 0; - - // Add constants buffers. - // We could optimize this by only including used buffers, but the compiler - // seems to do a good job of doing this for us. - // It also does read detection, so c[512] can end up c[4] in the asm - - // instead of doing this optimization ourselves we could maybe just query - // this from the compiler. - output->append( - "cbuffer float_consts : register(b0) {\n" - " float4 c[512];\n" - "};\n"); - // TODO(benvanik): add bool/loop constants. - - AppendTextureHeader(output); - - // Transform utilities. We adjust the output position in various ways - // as we can't do this via D3D11 APIs. - output->append( - "cbuffer vs_consts : register(b3) {\n" - " float4 window;\n" // x,y,w,h - " float4 viewport_z_enable;\n" // min,(max - min),?,enabled - " float4 viewport_size;\n" // x,y,w,h - "};" - "float4 applyViewport(float4 pos) {\n" - " if (viewport_z_enable.w) {\n" - //" pos.x = (pos.x + 1) * viewport_size.z * 0.5 + viewport_size.x;\n" - //" pos.y = (1 - pos.y) * viewport_size.w * 0.5 + viewport_size.y;\n" - //" pos.z = viewport_z_enable.x + pos.z * viewport_z_enable.y;\n" - // w? - " } else {\n" - " pos.xy = pos.xy / float2(window.z / 2.0, -window.w / 2.0) + float2(-1.0, 1.0);\n" - " pos.zw = float2(0.0, 1.0);\n" - " }\n" - " pos.xy += window.xy;\n" - " return pos;\n" - "}\n"); - - // Add vertex shader input. 
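
// --- Worked example (not part of the patch) ---
// The window branch of the removed applyViewport HLSL above, in plain C++.
// With the viewport transform disabled, clip-space XY is recovered from
// pixel-space XY by dividing by half the window size, flipping Y, and then
// applying the window offset (helper names are illustrative):
struct Float2 { float x, y; };
static inline Float2 PixelToClip(Float2 pos, float window_x, float window_y,
                                 float window_w, float window_h) {
  // Matches: pos.xy = pos.xy / float2(w/2, -h/2) + float2(-1, 1);
  //          pos.xy += window.xy;
  return { pos.x / (window_w * 0.5f) - 1.0f + window_x,
           pos.y / (-window_h * 0.5f) + 1.0f + window_y };
}
// e.g. a vertex at (640, 360) in a 1280x720 window with zero offset lands at
// clip (0, 0), the center of the screen.
// --- end example ---
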
- output->append( - "struct VS_INPUT {\n"); - uint32_t el_index = 0; - for (uint32_t n = 0; n < vtx_buffer_inputs_.count; n++) { - auto& input = vtx_buffer_inputs_.descs[n]; - for (uint32_t m = 0; m < input.element_count; m++) { - auto& el = input.elements[m]; - auto& vtx = el.vtx_fetch; - const char* type_name = GetFormatTypeName( - el.format, el.vtx_fetch.format_comp_all, el.vtx_fetch.num_format_all); - uint32_t fetch_slot = vtx.const_index * 3 + vtx.const_index_sel; - output->append( - " %s vf%u_%d : XE_VF%u;\n", - type_name, fetch_slot, vtx.offset, el_index); - el_index++; - } - } - output->append( - "};\n"); - - // Add vertex shader output (pixel shader input). - output->append( - "struct VS_OUTPUT {\n"); - if (alloc_counts_.positions) { - XEASSERT(alloc_counts_.positions == 1); - output->append( - " float4 oPos : SV_POSITION;\n"); - } - if (alloc_counts_.params) { - output->append( - " float4 o[%d] : XE_O;\n", - MAX_INTERPOLATORS); - } - if (alloc_counts_.point_size) { - output->append( - " float4 oPointSize : PSIZE;\n"); - } - output->append( - "};\n"); - - // Vertex shader main() header. - output->append( - "VS_OUTPUT main(VS_INPUT i) {\n" - " VS_OUTPUT o;\n"); - - // Always write position, as some shaders seem to only write certain values. - output->append( - " o.oPos = float4(0.0, 0.0, 0.0, 0.0);\n"); - if (alloc_counts_.point_size) { - output->append( - " o.oPointSize = float4(1.0, 0.0, 0.0, 0.0);\n"); - } - - // TODO(benvanik): remove this, if possible (though the compiler may be smart - // enough to do it for us). - if (alloc_counts_.params) { - for (uint32_t n = 0; n < MAX_INTERPOLATORS; n++) { - output->append( - " o.o[%d] = float4(0.0, 0.0, 0.0, 0.0);\n", n); - } - } - - // Add temporaries for any registers we may use. - uint32_t temp_regs = program_cntl->vs_regs + program_cntl->ps_regs; - for (uint32_t n = 0; n <= temp_regs; n++) { - output->append( - " float4 r%d = c[%d];\n", n, n); - } - output->append(" float4 t;\n"); - - // Execute blocks. - for (std::vector::iterator it = execs_.begin(); - it != execs_.end(); ++it) { - instr_cf_exec_t& cf = *it; - // TODO(benvanik): figure out how sequences/jmps/loops/etc work. - if (TranslateExec(ctx, cf)) { - delete output; - return NULL; - } - } - - // main footer. - output->append( - " o.oPos = applyViewport(o.oPos);\n" - " return o;\n" - "};\n"); - - set_translated_src(output->buffer); - delete output; - return translated_src_; -} - -int D3D11VertexShader::DemandGeometryShader(GeometryShaderType type, - D3D11GeometryShader** out_shader) { - if (geometry_shaders_[type]) { - *out_shader = geometry_shaders_[type]; - return 0; - } - - // Demand generate. 
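
// --- Illustrative note (not part of the patch) ---
// DemandGeometryShader above is a memoized factory: build the requested
// variant on first use, cache it in geometry_shaders_[type], and hand back
// the cached pointer on every later call. The same shape in miniature
// (names assumed):
template <typename T>
static T* Demand(T** slot, T* (*create)()) {
  if (!*slot) {
    *slot = create();  // stays NULL on failure; callers must check
  }
  return *slot;
}
// --- end note ---
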
- D3D11GeometryShader* shader = NULL; - switch (type) { - case POINT_SPRITE_SHADER: - shader = new D3D11PointSpriteGeometryShader(device_, hash_); - break; - case RECT_LIST_SHADER: - shader = new D3D11RectListGeometryShader(device_, hash_); - break; - case QUAD_LIST_SHADER: - shader = new D3D11QuadListGeometryShader(device_, hash_); - break; - default: - XEASSERTALWAYS(); - return 1; - } - if (!shader) { - return 1; - } - - if (shader->Prepare(this)) { - delete shader; - return 1; - } - - geometry_shaders_[type] = shader; - *out_shader = geometry_shaders_[type]; - return 0; -} - - -D3D11PixelShader::D3D11PixelShader( - ID3D11Device* device, - const uint8_t* src_ptr, size_t length, - uint64_t hash) : - handle_(0), - D3D11Shader(device, XE_GPU_SHADER_TYPE_PIXEL, - src_ptr, length, hash) { -} - -D3D11PixelShader::~D3D11PixelShader() { - XESAFERELEASE(handle_); -} - -int D3D11PixelShader::Prepare(xe_gpu_program_cntl_t* program_cntl, - D3D11VertexShader* input_shader) { - SCOPE_profile_cpu_f("gpu"); - if (handle_) { - return 0; - } - - // TODO(benvanik): look in file based on hash/etc. - void* byte_code = NULL; - size_t byte_code_length = 0; - - // Translate and compile source. - const char* shader_source = Translate(program_cntl, input_shader); - if (!shader_source) { - return 1; - } - ID3D10Blob* shader_blob = Compile(shader_source); - if (!shader_blob) { - return 1; - } - byte_code_length = shader_blob->GetBufferSize(); - byte_code = xe_malloc(byte_code_length); - xe_copy_struct( - byte_code, shader_blob->GetBufferPointer(), byte_code_length); - XESAFERELEASE(shader_blob); - - // Create shader. - HRESULT hr = device_->CreatePixelShader( - byte_code, byte_code_length, - NULL, - &handle_); - if (FAILED(hr)) { - XELOGE("D3D11: failed to create pixel shader"); - xe_free(byte_code); - return 1; - } - - xe_free(byte_code); - - is_prepared_ = true; - return 0; -} - -const char* D3D11PixelShader::Translate( - xe_gpu_program_cntl_t* program_cntl, D3D11VertexShader* input_shader) { - SCOPE_profile_cpu_f("gpu"); - Output* output = new Output(); - xe_gpu_translate_ctx_t ctx; - ctx.output = output; - ctx.type = type_; - ctx.tex_fetch_index = 0; - - // We need an input VS to make decisions here. - // TODO(benvanik): do we need to pair VS/PS up and store the combination? - // If the same PS is used with different VS that output different amounts - // (and less than the number of required registers), things may die. - XEASSERTNOTNULL(input_shader); - const Shader::alloc_counts_t& input_alloc_counts = - input_shader->alloc_counts(); - - // Add constants buffers. - // We could optimize this by only including used buffers, but the compiler - // seems to do a good job of doing this for us. - // It also does read detection, so c[512] can end up c[4] in the asm - - // instead of doing this optimization ourselves we could maybe just query - // this from the compiler. - output->append( - "cbuffer float_consts : register(b0) {\n" - " float4 c[512];\n" - "};\n"); - // TODO(benvanik): add bool/loop constants. - - AppendTextureHeader(output); - - // Add vertex shader output (pixel shader input). - output->append( - "struct VS_OUTPUT {\n"); - if (input_alloc_counts.positions) { - XEASSERT(input_alloc_counts.positions == 1); - output->append( - " float4 oPos : SV_POSITION;\n"); - } - if (input_alloc_counts.params) { - output->append( - " float4 o[%d] : XE_O;\n", - MAX_INTERPOLATORS); - } - output->append( - "};\n"); - - // Add pixel shader output. 
- output->append( - "struct PS_OUTPUT {\n"); - for (uint32_t n = 0; n < alloc_counts_.params; n++) { - output->append( - " float4 oC%d : SV_TARGET%d;\n", n, n); - if (program_cntl->ps_export_depth) { - // Is this per render-target? - output->append( - " float oD%d : SV_DEPTH%d;\n", n, n); - } - } - output->append( - "};\n"); - - // Pixel shader main() header. - output->append( - "PS_OUTPUT main(VS_OUTPUT i) {\n" - " PS_OUTPUT o;\n"); - - // Add temporary registers. - uint32_t temp_regs = program_cntl->vs_regs + program_cntl->ps_regs; - for (uint32_t n = 0; n <= MAX(15, temp_regs); n++) { - output->append( - " float4 r%d = c[%d];\n", n, n); - } - output->append(" float4 t;\n"); - - // Bring registers local. - if (input_alloc_counts.params) { - for (uint32_t n = 0; n < MAX_INTERPOLATORS; n++) { - output->append( - " r%d = i.o[%d];\n", n, n); - } - } - - // Execute blocks. - for (std::vector::iterator it = execs_.begin(); - it != execs_.end(); ++it) { - instr_cf_exec_t& cf = *it; - // TODO(benvanik): figure out how sequences/jmps/loops/etc work. - if (TranslateExec(ctx, cf)) { - delete output; - return NULL; - } - } - - // main footer. - output->append( - " return o;\n" - "}\n"); - - set_translated_src(output->buffer); - delete output; - return translated_src_; -} - - -namespace { - -static const char chan_names[] = { - 'x', 'y', 'z', 'w', - // these only apply to FETCH dst's, and we shouldn't be using them: - '0', '1', '?', '_', -}; - -void AppendSrcReg( - xe_gpu_translate_ctx_t& ctx, - uint32_t num, uint32_t type, - uint32_t swiz, uint32_t negate, uint32_t abs) { - if (negate) { - ctx.output->append("-"); - } - if (abs) { - ctx.output->append("abs("); - } - if (type) { - // Register. - ctx.output->append("r%u", num); - } else { - // Constant. - ctx.output->append("c[%u]", num); - } - if (swiz) { - ctx.output->append("."); - for (int i = 0; i < 4; i++) { - ctx.output->append("%c", chan_names[(swiz + i) & 0x3]); - swiz >>= 2; - } - } - if (abs) { - ctx.output->append(")"); - } -} - -void AppendDestRegName( - xe_gpu_translate_ctx_t& ctx, - uint32_t num, uint32_t dst_exp) { - if (!dst_exp) { - // Register. - ctx.output->append("r%u", num); - } else { - // Export. - switch (ctx.type) { - case XE_GPU_SHADER_TYPE_VERTEX: - switch (num) { - case 62: - ctx.output->append("o.oPos"); - break; - case 63: - ctx.output->append("o.oPointSize"); - break; - default: - // Varying. - ctx.output->append("o.o[%u]", num);; - break; - } - break; - case XE_GPU_SHADER_TYPE_PIXEL: - switch (num) { - case 0: - ctx.output->append("o.oC0"); - break; - default: - // TODO(benvanik): other render targets? - // TODO(benvanik): depth? - XEASSERTALWAYS(); - break; - } - break; - } - } -} - -void AppendDestReg( - xe_gpu_translate_ctx_t& ctx, - uint32_t num, uint32_t mask, uint32_t dst_exp) { - if (mask != 0xF) { - // If masking, store to a temporary variable and clean it up later. - ctx.output->append("t"); - } else { - // Store directly to output. - AppendDestRegName(ctx, num, dst_exp); - } -} - -void AppendDestRegPost( - xe_gpu_translate_ctx_t& ctx, - uint32_t num, uint32_t mask, uint32_t dst_exp) { - if (mask != 0xF) { - // Masking. - ctx.output->append(" "); - AppendDestRegName(ctx, num, dst_exp); - ctx.output->append(" = float4("); - for (int i = 0; i < 4; i++) { - // TODO(benvanik): mask out values? mix in old value as temp? - // ctx.output->append("%c", (mask & 0x1) ? 
chan_names[i] : 'w'); - if (!(mask & 0x1)) { - AppendDestRegName(ctx, num, dst_exp); - } else { - ctx.output->append("t"); - } - ctx.output->append(".%c", chan_names[i]); - mask >>= 1; - if (i < 3) { - ctx.output->append(", "); - } - } - ctx.output->append(");\n"); - } -} - -void print_srcreg( - Output* output, - uint32_t num, uint32_t type, - uint32_t swiz, uint32_t negate, uint32_t abs) { - if (negate) { - output->append("-"); - } - if (abs) { - output->append("|"); - } - output->append("%c%u", type ? 'R' : 'C', num); - if (swiz) { - output->append("."); - for (int i = 0; i < 4; i++) { - output->append("%c", chan_names[(swiz + i) & 0x3]); - swiz >>= 2; - } - } - if (abs) { - output->append("|"); - } -} - -void print_dstreg( - Output* output, uint32_t num, uint32_t mask, uint32_t dst_exp) { - output->append("%s%u", dst_exp ? "export" : "R", num); - if (mask != 0xf) { - output->append("."); - for (int i = 0; i < 4; i++) { - output->append("%c", (mask & 0x1) ? chan_names[i] : '_'); - mask >>= 1; - } - } -} - -void print_export_comment( - Output* output, uint32_t num, XE_GPU_SHADER_TYPE type) { - const char *name = NULL; - switch (type) { - case XE_GPU_SHADER_TYPE_VERTEX: - switch (num) { - case 62: name = "gl_Position"; break; - case 63: name = "gl_PointSize"; break; - } - break; - case XE_GPU_SHADER_TYPE_PIXEL: - switch (num) { - case 0: name = "gl_FragColor"; break; - } - break; - } - /* if we had a symbol table here, we could look - * up the name of the varying.. - */ - if (name) { - output->append("\t; %s", name); - } -} - -int TranslateALU_ADDv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.vector_clamp) { - ctx.output->append("saturate("); - } - ctx.output->append("("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(" + "); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(")"); - if (alu.vector_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - return 0; -} - -int TranslateALU_MULv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.vector_clamp) { - ctx.output->append("saturate("); - } - ctx.output->append("("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(" * "); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(")"); - if (alu.vector_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - return 0; -} - -int TranslateALU_MAXv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.vector_clamp) { - ctx.output->append("saturate("); - } - if (alu.src1_reg == alu.src2_reg && - alu.src1_sel == alu.src2_sel && - alu.src1_swiz == alu.src2_swiz && - alu.src1_reg_negate == alu.src2_reg_negate && - alu.src1_reg_abs == alu.src2_reg_abs) { - // This is a mov. 
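// (max(a, a) == a, so when every operand field matches, the max() collapses
//  to a plain assignment -- "r0 = r1.xyzw;" rather than
//  "r0 = max(r1.xyzw, r1.xyzw);". The 360 shader compiler appears to encode
//  MOV as a MAXv with identical sources.)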
- AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - } else { - ctx.output->append("max("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(", "); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(")"); - } - if (alu.vector_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - return 0; -} - -int TranslateALU_MINv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.vector_clamp) { - ctx.output->append("saturate("); - } - ctx.output->append("min("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(", "); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(")"); - if (alu.vector_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - return 0; -} - -int TranslateALU_SETXXv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu, const char* op) { - AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.vector_clamp) { - ctx.output->append("saturate("); - } - ctx.output->append("float4(("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(").x %s (", op); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(").x ? 1.0 : 0.0, ("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(").y %s (", op); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(").y ? 1.0 : 0.0, ("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(").z %s (", op); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(").z ? 1.0 : 0.0, ("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(").w %s (", op); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(").w ? 
1.0 : 0.0)"); - if (alu.vector_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - return 0; -} -int TranslateALU_SETEv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - return TranslateALU_SETXXv(ctx, alu, "=="); -} -int TranslateALU_SETGTv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - return TranslateALU_SETXXv(ctx, alu, ">"); -} -int TranslateALU_SETGTEv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - return TranslateALU_SETXXv(ctx, alu, ">="); -} -int TranslateALU_SETNEv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - return TranslateALU_SETXXv(ctx, alu, "!="); -} - -int TranslateALU_FRACv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.vector_clamp) { - ctx.output->append("saturate("); - } - ctx.output->append("frac("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(")"); - if (alu.vector_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - return 0; -} - -int TranslateALU_TRUNCv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.vector_clamp) { - ctx.output->append("saturate("); - } - ctx.output->append("trunc("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(")"); - if (alu.vector_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - return 0; -} - -int TranslateALU_FLOORv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.vector_clamp) { - ctx.output->append("saturate("); - } - ctx.output->append("floor("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(")"); - if (alu.vector_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - return 0; -} - -int TranslateALU_MULADDv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.vector_clamp) { - ctx.output->append("saturate("); - } - ctx.output->append("mad("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(", "); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(", "); - AppendSrcReg(ctx, alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(")"); - if (alu.vector_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - return 0; -} - -int TranslateALU_CNDXXv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu, const char* op) { - AppendDestReg(ctx, alu.vector_dest, 
alu.vector_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.vector_clamp) { - ctx.output->append("saturate("); - } - // TODO(benvanik): check argument order - could be 3 as compare and 1 and 2 as values. - ctx.output->append("float4(("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(").x %s 0.0 ? (", op); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(").x : ("); - AppendSrcReg(ctx, alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(").x, ("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(").y %s 0.0 ? (", op); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(").y : ("); - AppendSrcReg(ctx, alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(").y, ("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(").z %s 0.0 ? (", op); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(").z : ("); - AppendSrcReg(ctx, alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(").z, ("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(").w %s 0.0 ? (", op); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(").w : ("); - AppendSrcReg(ctx, alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(").w)"); - if (alu.vector_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - return 0; -} -int TranslateALU_CNDEv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - return TranslateALU_CNDXXv(ctx, alu, "=="); -} -int TranslateALU_CNDGTEv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - return TranslateALU_CNDXXv(ctx, alu, ">="); -} -int TranslateALU_CNDGTv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - return TranslateALU_CNDXXv(ctx, alu, ">"); -} - -int TranslateALU_DOT4v( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.vector_clamp) { - ctx.output->append("saturate("); - } - ctx.output->append("dot("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(", "); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(")"); - if (alu.vector_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - return 0; -} - -int TranslateALU_DOT3v( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.vector_clamp) { - ctx.output->append("saturate("); - } - ctx.output->append("dot(float4("); - AppendSrcReg(ctx, 
alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(").xyz, float4("); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(").xyz)"); - if (alu.vector_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - return 0; -} - -int TranslateALU_DOT2ADDv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.vector_clamp) { - ctx.output->append("saturate("); - } - ctx.output->append("dot(float4("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(").xy, float4("); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(").xy) + "); - AppendSrcReg(ctx, alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(".x"); - if (alu.vector_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - return 0; -} - -// CUBEv - -int TranslateALU_MAX4v( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.vector_clamp) { - ctx.output->append("saturate("); - } - ctx.output->append("max("); - ctx.output->append("max("); - ctx.output->append("max("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(".x, "); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(".y), "); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(".z), "); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(".w)"); - if (alu.vector_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - return 0; -} - -// ... - -int TranslateALU_MAXs( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.scalar_dest, alu.scalar_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.scalar_clamp) { - ctx.output->append("saturate("); - } - if ((alu.src3_swiz & 0x3) == (((alu.src3_swiz >> 2) + 1) & 0x3)) { - // This is a mov. 
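// (Source swizzles are 2-bit per-lane deltas: lane i reads source channel
//  (delta_i + i) & 3, as seen in chan_names[(swiz + i) & 0x3] above. The
//  test checks delta0 == (delta1 + 1) & 3, i.e. the .x and .y lanes of the
//  swizzled operand alias the same source channel, so max(a, a) is a move.)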
- AppendSrcReg(ctx, alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); - } else { - ctx.output->append("max("); - AppendSrcReg(ctx, alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(".x, "); - AppendSrcReg(ctx, alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(".y).xxxx"); - } - if (alu.scalar_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.scalar_dest, alu.scalar_write_mask, alu.export_data); - return 0; -} - -int TranslateALU_MINs( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.scalar_dest, alu.scalar_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.scalar_clamp) { - ctx.output->append("saturate("); - } - ctx.output->append("min("); - AppendSrcReg(ctx, alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(".x, "); - AppendSrcReg(ctx, alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(".y).xxxx"); - if (alu.scalar_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.scalar_dest, alu.scalar_write_mask, alu.export_data); - return 0; -} - -int TranslateALU_SETXXs( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu, const char* op) { - AppendDestReg(ctx, alu.scalar_dest, alu.scalar_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.scalar_clamp) { - ctx.output->append("saturate("); - } - ctx.output->append("(("); - AppendSrcReg(ctx, alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(".x %s 0.0) ? 
1.0 : 0.0).xxxx", op); - if (alu.scalar_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.scalar_dest, alu.scalar_write_mask, alu.export_data); - return 0; -} -int TranslateALU_SETEs( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - return TranslateALU_SETXXs(ctx, alu, "=="); -} -int TranslateALU_SETGTs( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - return TranslateALU_SETXXs(ctx, alu, ">"); -} -int TranslateALU_SETGTEs( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - return TranslateALU_SETXXs(ctx, alu, ">="); -} -int TranslateALU_SETNEs( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - return TranslateALU_SETXXs(ctx, alu, "!="); -} - -int TranslateALU_RECIP_IEEE( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.scalar_dest, alu.scalar_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.scalar_clamp) { - ctx.output->append("saturate("); - } - ctx.output->append("(1.0 / "); - AppendSrcReg(ctx, alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(")"); - if (alu.scalar_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.scalar_dest, alu.scalar_write_mask, alu.export_data); - return 0; -} - -int TranslateALU_MUL_CONST_0( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.scalar_dest, alu.scalar_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.scalar_clamp) { - ctx.output->append("saturate("); - } - uint32_t src3_swiz = alu.src3_swiz & ~0x3C; - uint32_t swiz_a = ((src3_swiz >> 6) - 1) & 0x3; - uint32_t swiz_b = (src3_swiz & 0x3); - uint32_t reg2 = (alu.scalar_opc & 1) | (alu.src3_swiz & 0x3C) | (alu.src3_sel << 1); - ctx.output->append("("); - AppendSrcReg(ctx, alu.src3_reg, 0, 0, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(".%c * ", chan_names[swiz_a]); - AppendSrcReg(ctx, reg2, 1, 0, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(".%c", chan_names[swiz_b]); - ctx.output->append(").xxxx"); - if (alu.scalar_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.scalar_dest, alu.scalar_write_mask, alu.export_data); - return 0; -} -int TranslateALU_MUL_CONST_1( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - return TranslateALU_MUL_CONST_0(ctx, alu); -} - -int TranslateALU_ADD_CONST_0( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.scalar_dest, alu.scalar_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.scalar_clamp) { - ctx.output->append("saturate("); - } - uint32_t src3_swiz = alu.src3_swiz & ~0x3C; - uint32_t swiz_a = ((src3_swiz >> 6) - 1) & 0x3; - uint32_t swiz_b = (src3_swiz & 0x3); - uint32_t reg2 = (alu.scalar_opc & 1) | (alu.src3_swiz & 0x3C) | (alu.src3_sel << 1); - ctx.output->append("("); - AppendSrcReg(ctx, alu.src3_reg, 0, 0, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(".%c + ", chan_names[swiz_a]); - AppendSrcReg(ctx, reg2, 1, 0, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(".%c", chan_names[swiz_b]); - ctx.output->append(").xxxx"); - if (alu.scalar_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.scalar_dest, alu.scalar_write_mask, alu.export_data); - return 0; -} -int TranslateALU_ADD_CONST_1( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - return 
TranslateALU_ADD_CONST_0(ctx, alu); -} - -int TranslateALU_SUB_CONST_0( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.scalar_dest, alu.scalar_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.scalar_clamp) { - ctx.output->append("saturate("); - } - uint32_t src3_swiz = alu.src3_swiz & ~0x3C; - uint32_t swiz_a = ((src3_swiz >> 6) - 1) & 0x3; - uint32_t swiz_b = (src3_swiz & 0x3); - uint32_t reg2 = (alu.scalar_opc & 1) | (alu.src3_swiz & 0x3C) | (alu.src3_sel << 1); - ctx.output->append("("); - AppendSrcReg(ctx, alu.src3_reg, 0, 0, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(".%c - ", chan_names[swiz_a]); - AppendSrcReg(ctx, reg2, 1, 0, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(".%c", chan_names[swiz_b]); - ctx.output->append(").xxxx"); - if (alu.scalar_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.scalar_dest, alu.scalar_write_mask, alu.export_data); - return 0; -} -int TranslateALU_SUB_CONST_1( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - return TranslateALU_SUB_CONST_0(ctx, alu); -} - -typedef int (*xe_gpu_translate_alu_fn)( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu); -typedef struct { - uint32_t num_srcs; - const char* name; - xe_gpu_translate_alu_fn fn; -} xe_gpu_translate_alu_info_t; -#define ALU_INSTR(opc, num_srcs) \ - { num_srcs, #opc, 0 } -#define ALU_INSTR_IMPL(opc, num_srcs) \ - { num_srcs, #opc, TranslateALU_##opc } -static xe_gpu_translate_alu_info_t vector_alu_instrs[0x20] = { - ALU_INSTR_IMPL(ADDv, 2), // 0 - ALU_INSTR_IMPL(MULv, 2), // 1 - ALU_INSTR_IMPL(MAXv, 2), // 2 - ALU_INSTR_IMPL(MINv, 2), // 3 - ALU_INSTR_IMPL(SETEv, 2), // 4 - ALU_INSTR_IMPL(SETGTv, 2), // 5 - ALU_INSTR_IMPL(SETGTEv, 2), // 6 - ALU_INSTR_IMPL(SETNEv, 2), // 7 - ALU_INSTR_IMPL(FRACv, 1), // 8 - ALU_INSTR_IMPL(TRUNCv, 1), // 9 - ALU_INSTR_IMPL(FLOORv, 1), // 10 - ALU_INSTR_IMPL(MULADDv, 3), // 11 - ALU_INSTR_IMPL(CNDEv, 3), // 12 - ALU_INSTR_IMPL(CNDGTEv, 3), // 13 - ALU_INSTR_IMPL(CNDGTv, 3), // 14 - ALU_INSTR_IMPL(DOT4v, 2), // 15 - ALU_INSTR_IMPL(DOT3v, 2), // 16 - ALU_INSTR_IMPL(DOT2ADDv, 3), // 17 -- ??? 
- ALU_INSTR(CUBEv, 2), // 18 - ALU_INSTR_IMPL(MAX4v, 1), // 19 - ALU_INSTR(PRED_SETE_PUSHv, 2), // 20 - ALU_INSTR(PRED_SETNE_PUSHv, 2), // 21 - ALU_INSTR(PRED_SETGT_PUSHv, 2), // 22 - ALU_INSTR(PRED_SETGTE_PUSHv, 2), // 23 - ALU_INSTR(KILLEv, 2), // 24 - ALU_INSTR(KILLGTv, 2), // 25 - ALU_INSTR(KILLGTEv, 2), // 26 - ALU_INSTR(KILLNEv, 2), // 27 - ALU_INSTR(DSTv, 2), // 28 - ALU_INSTR(MOVAv, 1), // 29 -}; -static xe_gpu_translate_alu_info_t scalar_alu_instrs[0x40] = { - ALU_INSTR(ADDs, 1), // 0 - ALU_INSTR(ADD_PREVs, 1), // 1 - ALU_INSTR(MULs, 1), // 2 - ALU_INSTR(MUL_PREVs, 1), // 3 - ALU_INSTR(MUL_PREV2s, 1), // 4 - ALU_INSTR_IMPL(MAXs, 1), // 5 - ALU_INSTR_IMPL(MINs, 1), // 6 - ALU_INSTR_IMPL(SETEs, 1), // 7 - ALU_INSTR_IMPL(SETGTs, 1), // 8 - ALU_INSTR_IMPL(SETGTEs, 1), // 9 - ALU_INSTR_IMPL(SETNEs, 1), // 10 - ALU_INSTR(FRACs, 1), // 11 - ALU_INSTR(TRUNCs, 1), // 12 - ALU_INSTR(FLOORs, 1), // 13 - ALU_INSTR(EXP_IEEE, 1), // 14 - ALU_INSTR(LOG_CLAMP, 1), // 15 - ALU_INSTR(LOG_IEEE, 1), // 16 - ALU_INSTR(RECIP_CLAMP, 1), // 17 - ALU_INSTR(RECIP_FF, 1), // 18 - ALU_INSTR_IMPL(RECIP_IEEE, 1), // 19 - ALU_INSTR(RECIPSQ_CLAMP, 1), // 20 - ALU_INSTR(RECIPSQ_FF, 1), // 21 - ALU_INSTR(RECIPSQ_IEEE, 1), // 22 - ALU_INSTR(MOVAs, 1), // 23 - ALU_INSTR(MOVA_FLOORs, 1), // 24 - ALU_INSTR(SUBs, 1), // 25 - ALU_INSTR(SUB_PREVs, 1), // 26 - ALU_INSTR(PRED_SETEs, 1), // 27 - ALU_INSTR(PRED_SETNEs, 1), // 28 - ALU_INSTR(PRED_SETGTs, 1), // 29 - ALU_INSTR(PRED_SETGTEs, 1), // 30 - ALU_INSTR(PRED_SET_INVs, 1), // 31 - ALU_INSTR(PRED_SET_POPs, 1), // 32 - ALU_INSTR(PRED_SET_CLRs, 1), // 33 - ALU_INSTR(PRED_SET_RESTOREs, 1), // 34 - ALU_INSTR(KILLEs, 1), // 35 - ALU_INSTR(KILLGTs, 1), // 36 - ALU_INSTR(KILLGTEs, 1), // 37 - ALU_INSTR(KILLNEs, 1), // 38 - ALU_INSTR(KILLONEs, 1), // 39 - ALU_INSTR(SQRT_IEEE, 1), // 40 - { 0, 0, false }, - ALU_INSTR_IMPL(MUL_CONST_0, 2), // 42 - ALU_INSTR_IMPL(MUL_CONST_1, 2), // 43 - ALU_INSTR_IMPL(ADD_CONST_0, 2), // 44 - ALU_INSTR_IMPL(ADD_CONST_1, 2), // 45 - ALU_INSTR_IMPL(SUB_CONST_0, 2), // 46 - ALU_INSTR_IMPL(SUB_CONST_1, 2), // 47 - ALU_INSTR(SIN, 1), // 48 - ALU_INSTR(COS, 1), // 49 - ALU_INSTR(RETAIN_PREV, 1), // 50 -}; -#undef ALU_INSTR - -int TranslateALU( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t* alu, int sync) { - Output* output = ctx.output; - - if (!alu->scalar_write_mask && !alu->vector_write_mask) { - output->append(" // \n"); - return 0; - } - - if (alu->vector_write_mask) { - // Disassemble vector op. - xe_gpu_translate_alu_info_t& iv = vector_alu_instrs[alu->vector_opc]; - output->append(" // %sALU:\t", sync ? "(S)" : " "); - output->append("%s", iv.name); - if (alu->pred_select & 0x2) { - // seems to work similar to conditional execution in ARM instruction - // set, so let's use a similar syntax for now: - output->append((alu->pred_select & 0x1) ? 
"EQ" : "NE"); - } - output->append("\t"); - print_dstreg(output, - alu->vector_dest, alu->vector_write_mask, alu->export_data); - output->append(" = "); - if (iv.num_srcs == 3) { - print_srcreg(output, - alu->src3_reg, alu->src3_sel, alu->src3_swiz, - alu->src3_reg_negate, alu->src3_reg_abs); - output->append(", "); - } - print_srcreg(output, - alu->src1_reg, alu->src1_sel, alu->src1_swiz, - alu->src1_reg_negate, alu->src1_reg_abs); - if (iv.num_srcs > 1) { - output->append(", "); - print_srcreg(output, - alu->src2_reg, alu->src2_sel, alu->src2_swiz, - alu->src2_reg_negate, alu->src2_reg_abs); - } - if (alu->vector_clamp) { - output->append(" CLAMP"); - } - if (alu->export_data) { - print_export_comment(output, alu->vector_dest, ctx.type); - } - output->append("\n"); - - // Translate vector op. - if (iv.fn) { - output->append(" "); - if (iv.fn(ctx, *alu)) { - return 1; - } - } else { - output->append(" // \n"); - } - } - - if (alu->scalar_write_mask || !alu->vector_write_mask) { - // 2nd optional scalar op: - - // Disassemble scalar op. - xe_gpu_translate_alu_info_t& is = scalar_alu_instrs[alu->scalar_opc]; - output->append(" // "); - output->append("\t"); - if (is.name) { - output->append("\t \t%s\t", is.name); - } else { - output->append("\t \tOP(%u)\t", alu->scalar_opc); - } - print_dstreg(output, - alu->scalar_dest, alu->scalar_write_mask, alu->export_data); - output->append(" = "); - if (is.num_srcs == 2) { - // ADD_CONST_0 dest, [const], [reg] - uint32_t src3_swiz = alu->src3_swiz & ~0x3C; - uint32_t swiz_a = ((src3_swiz >> 6) - 1) & 0x3; - uint32_t swiz_b = (src3_swiz & 0x3); - print_srcreg(output, - alu->src3_reg, 0, 0, - alu->src3_reg_negate, alu->src3_reg_abs); - output->append(".%c", chan_names[swiz_a]); - output->append(", "); - uint32_t reg2 = (alu->scalar_opc & 1) | (alu->src3_swiz & 0x3C) | (alu->src3_sel << 1); - print_srcreg(output, - reg2, 1, 0, - alu->src3_reg_negate, alu->src3_reg_abs); - output->append(".%c", chan_names[swiz_b]); - } else { - print_srcreg(output, - alu->src3_reg, alu->src3_sel, alu->src3_swiz, - alu->src3_reg_negate, alu->src3_reg_abs); - } - if (alu->scalar_clamp) { - output->append(" CLAMP"); - } - if (alu->export_data) { - print_export_comment(output, alu->scalar_dest, ctx.type); - } - output->append("\n"); - - // Translate scalar op. 
- if (is.fn) { - output->append(" "); - if (is.fn(ctx, *alu)) { - return 1; - } - } else { - output->append(" // \n"); - } - } - - return 0; -} - -struct { - const char *name; -} fetch_types[0xff] = { -#define TYPE(id) { #id } - TYPE(FMT_1_REVERSE), // 0 - {0}, - TYPE(FMT_8), // 2 - {0}, - {0}, - {0}, - TYPE(FMT_8_8_8_8), // 6 - TYPE(FMT_2_10_10_10), // 7 - {0}, - {0}, - TYPE(FMT_8_8), // 10 - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - TYPE(FMT_16), // 24 - TYPE(FMT_16_16), // 25 - TYPE(FMT_16_16_16_16), // 26 - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - TYPE(FMT_32), // 33 - TYPE(FMT_32_32), // 34 - TYPE(FMT_32_32_32_32), // 35 - TYPE(FMT_32_FLOAT), // 36 - TYPE(FMT_32_32_FLOAT), // 37 - TYPE(FMT_32_32_32_32_FLOAT), // 38 - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - TYPE(FMT_32_32_32_FLOAT), // 57 -#undef TYPE -}; - -void print_fetch_dst(Output* output, uint32_t dst_reg, uint32_t dst_swiz) { - output->append("\tR%u.", dst_reg); - for (int i = 0; i < 4; i++) { - output->append("%c", chan_names[dst_swiz & 0x7]); - dst_swiz >>= 3; - } -} - -void AppendFetchDest(Output* output, uint32_t dst_reg, uint32_t dst_swiz) { - output->append("r%u.", dst_reg); - for (int i = 0; i < 4; i++) { - output->append("%c", chan_names[dst_swiz & 0x7]); - dst_swiz >>= 3; - } -} - -int TranslateVertexFetch( - xe_gpu_translate_ctx_t& ctx, const instr_fetch_vtx_t* vtx, int sync) { - Output* output = ctx.output; - - // Disassemble. - output->append(" // %sFETCH:\t", sync ? "(S)" : " "); - if (vtx->pred_select) { - output->append(vtx->pred_condition ? "EQ" : "NE"); - } - print_fetch_dst(output, vtx->dst_reg, vtx->dst_swiz); - output->append(" = R%u.", vtx->src_reg); - output->append("%c", chan_names[vtx->src_swiz & 0x3]); - if (fetch_types[vtx->format].name) { - output->append(" %s", fetch_types[vtx->format].name); - } else { - output->append(" TYPE(0x%x)", vtx->format); - } - output->append(" %s", vtx->format_comp_all ? "SIGNED" : "UNSIGNED"); - if (!vtx->num_format_all) { - output->append(" NORMALIZED"); - } - output->append(" STRIDE(%u)", vtx->stride); - if (vtx->offset) { - output->append(" OFFSET(%u)", vtx->offset); - } - output->append(" CONST(%u, %u)", vtx->const_index, vtx->const_index_sel); - if (1) { - // XXX - output->append(" src_reg_am=%u", vtx->src_reg_am); - output->append(" dst_reg_am=%u", vtx->dst_reg_am); - output->append(" num_format_all=%u", vtx->num_format_all); - output->append(" signed_rf_mode_all=%u", vtx->signed_rf_mode_all); - output->append(" exp_adjust_all=%u", vtx->exp_adjust_all); - } - output->append("\n"); - - // Translate. - output->append(" "); - output->append("r%u.xyzw", vtx->dst_reg); - output->append(" = float4("); - uint32_t fetch_slot = vtx->const_index * 3 + vtx->const_index_sel; - // TODO(benvanik): detect xyzw = xyzw, etc. - // TODO(benvanik): detect and set as rN = float4(samp.xyz, 1.0); / etc - uint32_t component_count = GetFormatComponentCount(vtx->format); - uint32_t dst_swiz = vtx->dst_swiz; - for (int i = 0; i < 4; i++) { - if ((dst_swiz & 0x7) == 4) { - output->append("0.0"); - } else if ((dst_swiz & 0x7) == 5) { - output->append("1.0"); - } else if ((dst_swiz & 0x7) == 6) { - // ? 
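// (Fetch destination swizzles are 3 bits per lane: values 0-3 select a
//  fetched component, 4 writes constant 0.0, 5 writes 1.0, and 7 keeps the
//  register's previous value. 6 is unknown, hence the literal "?" emitted
//  below.)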
- output->append("?"); - } else if ((dst_swiz & 0x7) == 7) { - output->append("r%u.%c", vtx->dst_reg, chan_names[i]); - } else { - output->append("i.vf%u_%d.%c", - fetch_slot, vtx->offset, - chan_names[dst_swiz & 0x3]); - } - if (i < 3) { - output->append(", "); - } - dst_swiz >>= 3; - } - output->append(");\n"); - return 0; -} - -int TranslateTextureFetch( - xe_gpu_translate_ctx_t& ctx, const instr_fetch_tex_t* tex, int sync) { - Output* output = ctx.output; - - // Disassemble. - static const char *filter[] = { - "POINT", // TEX_FILTER_POINT - "LINEAR", // TEX_FILTER_LINEAR - "BASEMAP", // TEX_FILTER_BASEMAP - }; - static const char *aniso_filter[] = { - "DISABLED", // ANISO_FILTER_DISABLED - "MAX_1_1", // ANISO_FILTER_MAX_1_1 - "MAX_2_1", // ANISO_FILTER_MAX_2_1 - "MAX_4_1", // ANISO_FILTER_MAX_4_1 - "MAX_8_1", // ANISO_FILTER_MAX_8_1 - "MAX_16_1", // ANISO_FILTER_MAX_16_1 - }; - static const char *arbitrary_filter[] = { - "2x4_SYM", // ARBITRARY_FILTER_2X4_SYM - "2x4_ASYM", // ARBITRARY_FILTER_2X4_ASYM - "4x2_SYM", // ARBITRARY_FILTER_4X2_SYM - "4x2_ASYM", // ARBITRARY_FILTER_4X2_ASYM - "4x4_SYM", // ARBITRARY_FILTER_4X4_SYM - "4x4_ASYM", // ARBITRARY_FILTER_4X4_ASYM - }; - static const char *sample_loc[] = { - "CENTROID", // SAMPLE_CENTROID - "CENTER", // SAMPLE_CENTER - }; - uint32_t src_swiz = tex->src_swiz; - output->append(" // %sFETCH:\t", sync ? "(S)" : " "); - if (tex->pred_select) { - output->append(tex->pred_condition ? "EQ" : "NE"); - } - print_fetch_dst(output, tex->dst_reg, tex->dst_swiz); - output->append(" = R%u.", tex->src_reg); - for (int i = 0; i < 3; i++) { - output->append("%c", chan_names[src_swiz & 0x3]); - src_swiz >>= 2; - } - output->append(" CONST(%u)", tex->const_idx); - if (tex->fetch_valid_only) { - output->append(" VALID_ONLY"); - } - if (tex->tx_coord_denorm) { - output->append(" DENORM"); - } - if (tex->mag_filter != TEX_FILTER_USE_FETCH_CONST) { - output->append(" MAG(%s)", filter[tex->mag_filter]); - } - if (tex->min_filter != TEX_FILTER_USE_FETCH_CONST) { - output->append(" MIN(%s)", filter[tex->min_filter]); - } - if (tex->mip_filter != TEX_FILTER_USE_FETCH_CONST) { - output->append(" MIP(%s)", filter[tex->mip_filter]); - } - if (tex->aniso_filter != ANISO_FILTER_USE_FETCH_CONST) { - output->append(" ANISO(%s)", aniso_filter[tex->aniso_filter]); - } - if (tex->arbitrary_filter != ARBITRARY_FILTER_USE_FETCH_CONST) { - output->append(" ARBITRARY(%s)", arbitrary_filter[tex->arbitrary_filter]); - } - if (tex->vol_mag_filter != TEX_FILTER_USE_FETCH_CONST) { - output->append(" VOL_MAG(%s)", filter[tex->vol_mag_filter]); - } - if (tex->vol_min_filter != TEX_FILTER_USE_FETCH_CONST) { - output->append(" VOL_MIN(%s)", filter[tex->vol_min_filter]); - } - if (!tex->use_comp_lod) { - output->append(" LOD(%u)", tex->use_comp_lod); - output->append(" LOD_BIAS(%u)", tex->lod_bias); - } - if (tex->use_reg_lod) { - output->append(" REG_LOD(%u)", tex->use_reg_lod); - } - if (tex->use_reg_gradients) { - output->append(" USE_REG_GRADIENTS"); - } - output->append(" LOCATION(%s)", sample_loc[tex->sample_location]); - if (tex->offset_x || tex->offset_y || tex->offset_z) { - output->append(" OFFSET(%u,%u,%u)", tex->offset_x, tex->offset_y, tex->offset_z); - } - output->append("\n"); - - int src_component_count = 0; - switch (tex->dimension) { - case DIMENSION_1D: - src_component_count = 1; - break; - default: - case DIMENSION_2D: - src_component_count = 2; - break; - case DIMENSION_3D: - src_component_count = 3; - break; - case DIMENSION_CUBE: - src_component_count = 3; - 
break; - } - - // Translate. - output->append(" "); - output->append("r%u.xyzw", tex->dst_reg); - output->append(" = "); - output->append( - "x_texture_%d.Sample(x_sampler_%d, r%u.", - tex->const_idx, - ctx.tex_fetch_index++, // hacky way to line up to tex buffers - tex->src_reg); - src_swiz = tex->src_swiz; - for (int i = 0; i < src_component_count; i++) { - output->append("%c", chan_names[src_swiz & 0x3]); - src_swiz >>= 2; - } - output->append(")."); - - // Pass one over dest does xyzw and fakes the special values. - // TODO(benvanik): detect and set as rN = float4(samp.xyz, 1.0); / etc - uint32_t dst_swiz = tex->dst_swiz; - for (int i = 0; i < 4; i++) { - output->append("%c", chan_names[dst_swiz & 0x3]); - dst_swiz >>= 3; - } - output->append(";\n"); - // Do another pass to set constant values. - dst_swiz = tex->dst_swiz; - for (int i = 0; i < 4; i++) { - if ((dst_swiz & 0x7) == 4) { - output->append(" r%u.%c = 0.0;\n", tex->dst_reg, chan_names[i]); - } else if ((dst_swiz & 0x7) == 5) { - output->append(" r%u.%c = 1.0;\n", tex->dst_reg, chan_names[i]); - } - dst_swiz >>= 3; - } - return 0; -} - -struct { - const char *name; -} cf_instructions[] = { -#define INSTR(opc, fxn) { #opc } - INSTR(NOP, print_cf_nop), - INSTR(EXEC, print_cf_exec), - INSTR(EXEC_END, print_cf_exec), - INSTR(COND_EXEC, print_cf_exec), - INSTR(COND_EXEC_END, print_cf_exec), - INSTR(COND_PRED_EXEC, print_cf_exec), - INSTR(COND_PRED_EXEC_END, print_cf_exec), - INSTR(LOOP_START, print_cf_loop), - INSTR(LOOP_END, print_cf_loop), - INSTR(COND_CALL, print_cf_jmp_call), - INSTR(RETURN, print_cf_jmp_call), - INSTR(COND_JMP, print_cf_jmp_call), - INSTR(ALLOC, print_cf_alloc), - INSTR(COND_EXEC_PRED_CLEAN, print_cf_exec), - INSTR(COND_EXEC_PRED_CLEAN_END, print_cf_exec), - INSTR(MARK_VS_FETCH_DONE, print_cf_nop), // ?? 
-#undef INSTR -}; - -} // anonymous namespace - - -int D3D11Shader::TranslateExec(xe_gpu_translate_ctx_t& ctx, const instr_cf_exec_t& cf) { - Output* output = ctx.output; - - output->append( - " // %s ADDR(0x%x) CNT(0x%x)", - cf_instructions[cf.opc].name, cf.address, cf.count); - if (cf.yeild) { - output->append(" YIELD"); - } - uint8_t vc = cf.vc_hi | (cf.vc_lo << 2); - if (vc) { - output->append(" VC(0x%x)", vc); - } - if (cf.bool_addr) { - output->append(" BOOL_ADDR(0x%x)", cf.bool_addr); - } - if (cf.address_mode == ABSOLUTE_ADDR) { - output->append(" ABSOLUTE_ADDR"); - } - if (cf.is_cond_exec()) { - output->append(" COND(%d)", cf.condition); - } - output->append("\n"); - - uint32_t sequence = cf.serialize; - for (uint32_t i = 0; i < cf.count; i++) { - uint32_t alu_off = (cf.address + i); - int sync = sequence & 0x2; - if (sequence & 0x1) { - const instr_fetch_t* fetch = - (const instr_fetch_t*)(dwords_ + alu_off * 3); - switch (fetch->opc) { - case VTX_FETCH: - if (TranslateVertexFetch(ctx, &fetch->vtx, sync)) { - return 1; - } - break; - case TEX_FETCH: - if (TranslateTextureFetch(ctx, &fetch->tex, sync)) { - return 1; - } - break; - case TEX_GET_BORDER_COLOR_FRAC: - case TEX_GET_COMP_TEX_LOD: - case TEX_GET_GRADIENTS: - case TEX_GET_WEIGHTS: - case TEX_SET_TEX_LOD: - case TEX_SET_GRADIENTS_H: - case TEX_SET_GRADIENTS_V: - default: - XEASSERTALWAYS(); - break; - } - } else { - const instr_alu_t* alu = - (const instr_alu_t*)(dwords_ + alu_off * 3); - if (TranslateALU(ctx, alu, sync)) { - return 1; - } - } - sequence >>= 2; - } - - return 0; -} diff --git a/src/xenia/gpu/d3d11/d3d11_shader.h b/src/xenia/gpu/d3d11/d3d11_shader.h deleted file mode 100644 index 0b0bb492c..000000000 --- a/src/xenia/gpu/d3d11/d3d11_shader.h +++ /dev/null @@ -1,125 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. 
* - ****************************************************************************** - */ - -#ifndef XENIA_GPU_D3D11_D3D11_SHADER_H_ -#define XENIA_GPU_D3D11_D3D11_SHADER_H_ - -#include - -#include -#include - -#include - - -namespace xe { -namespace gpu { -namespace d3d11 { - -struct Output; - -typedef struct { - Output* output; - xenos::XE_GPU_SHADER_TYPE type; - uint32_t tex_fetch_index; -} xe_gpu_translate_ctx_t; - -class D3D11GeometryShader; - - -class D3D11Shader : public Shader { -public: - virtual ~D3D11Shader(); - - const static uint32_t MAX_INTERPOLATORS = 16; - -protected: - D3D11Shader( - ID3D11Device* device, - xenos::XE_GPU_SHADER_TYPE type, - const uint8_t* src_ptr, size_t length, - uint64_t hash); - - const char* translated_src() const { return translated_src_; } - void set_translated_src(char* value); - - void AppendTextureHeader(Output* output); - int TranslateExec( - xe_gpu_translate_ctx_t& ctx, const xenos::instr_cf_exec_t& cf); - - ID3D10Blob* Compile(const char* shader_source); - -protected: - ID3D11Device* device_; - - char* translated_src_; -}; - - -class D3D11VertexShader : public D3D11Shader { -public: - D3D11VertexShader( - ID3D11Device* device, - const uint8_t* src_ptr, size_t length, - uint64_t hash); - virtual ~D3D11VertexShader(); - - ID3D11VertexShader* handle() const { return handle_; } - ID3D11InputLayout* input_layout() const { return input_layout_; } - - int Prepare(xenos::xe_gpu_program_cntl_t* program_cntl); - - enum GeometryShaderType { - POINT_SPRITE_SHADER, - RECT_LIST_SHADER, - QUAD_LIST_SHADER, - - MAX_GEOMETRY_SHADER_TYPE, - }; - int DemandGeometryShader(GeometryShaderType type, - D3D11GeometryShader** out_shader); - -private: - const char* Translate(xenos::xe_gpu_program_cntl_t* program_cntl); - -private: - ID3D11VertexShader* handle_; - ID3D11InputLayout* input_layout_; - D3D11GeometryShader* geometry_shaders_[MAX_GEOMETRY_SHADER_TYPE]; -}; - - -class D3D11PixelShader : public D3D11Shader { -public: - D3D11PixelShader( - ID3D11Device* device, - const uint8_t* src_ptr, size_t length, - uint64_t hash); - virtual ~D3D11PixelShader(); - - ID3D11PixelShader* handle() const { return handle_; } - - int Prepare(xenos::xe_gpu_program_cntl_t* program_cntl, - D3D11VertexShader* input_shader); - -private: - const char* Translate(xenos::xe_gpu_program_cntl_t* program_cntl, - D3D11VertexShader* input_shader); - -private: - ID3D11PixelShader* handle_; -}; - - -} // namespace d3d11 -} // namespace gpu -} // namespace xe - - -#endif // XENIA_GPU_D3D11_D3D11_SHADER_H_ diff --git a/src/xenia/gpu/d3d11/d3d11_shader_cache.cc b/src/xenia/gpu/d3d11/d3d11_shader_cache.cc deleted file mode 100644 index 7f6a5a722..000000000 --- a/src/xenia/gpu/d3d11/d3d11_shader_cache.cc +++ /dev/null @@ -1,45 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. 
* - ****************************************************************************** - */ - -#include - -#include - - -using namespace xe; -using namespace xe::gpu; -using namespace xe::gpu::d3d11; -using namespace xe::gpu::xenos; - - -D3D11ShaderCache::D3D11ShaderCache(ID3D11Device* device) { - device_ = device; - device_->AddRef(); -} - -D3D11ShaderCache::~D3D11ShaderCache() { - device_->Release(); -} - -Shader* D3D11ShaderCache::CreateCore( - xenos::XE_GPU_SHADER_TYPE type, - const uint8_t* src_ptr, size_t length, - uint64_t hash) { - switch (type) { - case XE_GPU_SHADER_TYPE_VERTEX: - return new D3D11VertexShader( - device_, src_ptr, length, hash); - case XE_GPU_SHADER_TYPE_PIXEL: - return new D3D11PixelShader( - device_, src_ptr, length, hash); - default: - XEASSERTALWAYS(); - return NULL; - } -} \ No newline at end of file diff --git a/src/xenia/gpu/d3d11/d3d11_shader_cache.h b/src/xenia/gpu/d3d11/d3d11_shader_cache.h deleted file mode 100644 index 8c33523b4..000000000 --- a/src/xenia/gpu/d3d11/d3d11_shader_cache.h +++ /dev/null @@ -1,46 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. * - ****************************************************************************** - */ - -#ifndef XENIA_GPU_D3D11_D3D11_SHADER_CACHE_H_ -#define XENIA_GPU_D3D11_D3D11_SHADER_CACHE_H_ - -#include - -#include - -#include - - -namespace xe { -namespace gpu { -namespace d3d11 { - - -class D3D11ShaderCache : public ShaderCache { -public: - D3D11ShaderCache(ID3D11Device* device); - virtual ~D3D11ShaderCache(); - -protected: - Shader* CreateCore( - xenos::XE_GPU_SHADER_TYPE type, - const uint8_t* src_ptr, size_t length, - uint64_t hash) override; - -protected: - ID3D11Device* device_; -}; - - -} // namespace d3d11 -} // namespace gpu -} // namespace xe - - -#endif // XENIA_GPU_D3D11_D3D11_SHADER_CACHE_H_ diff --git a/src/xenia/gpu/d3d11/d3d11_shader_resource.cc b/src/xenia/gpu/d3d11/d3d11_shader_resource.cc new file mode 100644 index 000000000..e4be7e2cf --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_shader_resource.cc @@ -0,0 +1,381 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include +#include +#include +#include +#include + +#include + +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::d3d11; +using namespace xe::gpu::xenos; + + +namespace { + +ID3D10Blob* D3D11ShaderCompile(XE_GPU_SHADER_TYPE type, + const char* shader_source, + const char* disasm_source) { + SCOPE_profile_cpu_f("gpu"); + + // TODO(benvanik): pick shared runtime mode defines. + D3D10_SHADER_MACRO defines[] = { + "TEST_DEFINE", "1", + 0, 0, + }; + + uint32_t flags1 = 0; + flags1 |= D3D10_SHADER_DEBUG; + flags1 |= D3D10_SHADER_ENABLE_STRICTNESS; + uint32_t flags2 = 0; + + // Create a name. 
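// (The name is built from a 64-bit hash of the ucode disassembly, so
//  identical shaders land in the same dump file. It is also passed to
//  D3DCompile below as the source name, so compiler diagnostics reference
//  the dump path.)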
+ const char* base_path = ""; + if (FLAGS_dump_shaders.size()) { + base_path = FLAGS_dump_shaders.c_str(); + } + size_t hash = xe_hash64(disasm_source, xestrlena(disasm_source)); // ? + char file_name[XE_MAX_PATH]; + xesnprintfa(file_name, XECOUNT(file_name), + "%s/gen_%.16llX.%s", + base_path, + hash, + type == XE_GPU_SHADER_TYPE_VERTEX ? "vs" : "ps"); + + if (FLAGS_dump_shaders.size()) { + FILE* f = fopen(file_name, "w"); + fprintf(f, shader_source); + fprintf(f, "\n\n"); + fprintf(f, "/*\n"); + fprintf(f, disasm_source); + fprintf(f, " */\n"); + fclose(f); + } + + // Compile shader to bytecode blob. + ID3D10Blob* shader_blob = 0; + ID3D10Blob* error_blob = 0; + HRESULT hr = D3DCompile( + shader_source, strlen(shader_source), + file_name, + defines, nullptr, + "main", + type == XE_GPU_SHADER_TYPE_VERTEX ? "vs_5_0" : "ps_5_0", + flags1, flags2, + &shader_blob, &error_blob); + if (error_blob) { + char* msg = (char*)error_blob->GetBufferPointer(); + XELOGE("D3D11: shader compile failed with %s", msg); + } + XESAFERELEASE(error_blob); + if (FAILED(hr)) { + return nullptr; + } + return shader_blob; +} + +} // namespace + + +D3D11VertexShaderResource::D3D11VertexShaderResource( + D3D11ResourceCache* resource_cache, + const MemoryRange& memory_range, + const Info& info) + : VertexShaderResource(memory_range, info), + resource_cache_(resource_cache), + handle_(nullptr), + input_layout_(nullptr), + translated_src_(nullptr) { + xe_zero_struct(geometry_shaders_, sizeof(geometry_shaders_)); +} + +D3D11VertexShaderResource::~D3D11VertexShaderResource() { + XESAFERELEASE(handle_); + XESAFERELEASE(input_layout_); + for (int i = 0; i < XECOUNT(geometry_shaders_); ++i) { + delete geometry_shaders_[i]; + } + xe_free(translated_src_); +} + +int D3D11VertexShaderResource::Prepare( + const xe_gpu_program_cntl_t& program_cntl) { + SCOPE_profile_cpu_f("gpu"); + if (is_prepared_ || handle_) { + return 0; + } + + // TODO(benvanik): look in file based on hash/etc. + void* byte_code = NULL; + size_t byte_code_length = 0; + + // Translate and compile source. + D3D11ShaderTranslator translator; + int ret = translator.TranslateVertexShader(this, program_cntl); + if (ret) { + XELOGE("D3D11: failed to translate vertex shader"); + return ret; + } + translated_src_ = xestrdupa(translator.translated_src()); + + ID3D10Blob* shader_blob = D3D11ShaderCompile( + XE_GPU_SHADER_TYPE_VERTEX, translated_src_, disasm_src()); + if (!shader_blob) { + return 1; + } + byte_code_length = shader_blob->GetBufferSize(); + byte_code = xe_malloc(byte_code_length); + xe_copy_struct( + byte_code, shader_blob->GetBufferPointer(), byte_code_length); + XESAFERELEASE(shader_blob); + + // Create shader. + HRESULT hr = resource_cache_->device()->CreateVertexShader( + byte_code, byte_code_length, + nullptr, + &handle_); + if (FAILED(hr)) { + XELOGE("D3D11: failed to create vertex shader"); + xe_free(byte_code); + return 1; + } + + // Create input layout. 
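// CreateInputLayout below assigns every element the same semantic name
// "XE_VF" with a running index, which must match the "vfN_M : XE_VFn" fields
// the HLSL translator emits for VS_INPUT. An illustrative descriptor for a
// hypothetical float3 stream in slot 0, offset 0:
//   { "XE_VF", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 0,
//     D3D11_INPUT_PER_VERTEX_DATA, 0 }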
+ ret = CreateInputLayout(byte_code, byte_code_length); + xe_free(byte_code); + if (ret) { + return 1; + } + is_prepared_ = true; + return 0; +} + +int D3D11VertexShaderResource::CreateInputLayout(const void* byte_code, + size_t byte_code_length) { + size_t element_count = 0; + const auto& inputs = buffer_inputs(); + for (uint32_t n = 0; n < inputs.count; n++) { + element_count += inputs.descs[n].info.element_count; + } + if (!element_count) { + XELOGW("D3D11: vertex shader with zero inputs -- retaining previous values?"); + input_layout_ = NULL; + return 0; + } + + D3D11_INPUT_ELEMENT_DESC* element_descs = + (D3D11_INPUT_ELEMENT_DESC*)xe_alloca( + sizeof(D3D11_INPUT_ELEMENT_DESC) * element_count); + uint32_t el_index = 0; + for (uint32_t n = 0; n < inputs.count; n++) { + const auto& input = inputs.descs[n]; + for (uint32_t m = 0; m < input.info.element_count; m++) { + const auto& el = input.info.elements[m]; + uint32_t vb_slot = input.input_index; + DXGI_FORMAT vtx_format; + switch (el.format) { + case FMT_8_8_8_8: + if (el.is_normalized) { + vtx_format = el.is_signed ? + DXGI_FORMAT_R8G8B8A8_SNORM : DXGI_FORMAT_R8G8B8A8_UNORM; + } else { + vtx_format = el.is_signed ? + DXGI_FORMAT_R8G8B8A8_SINT : DXGI_FORMAT_R8G8B8A8_UINT; + } + break; + case FMT_2_10_10_10: + if (el.is_normalized) { + vtx_format = DXGI_FORMAT_R10G10B10A2_UNORM; + } else { + vtx_format = DXGI_FORMAT_R10G10B10A2_UINT; + } + break; + // DXGI_FORMAT_R11G11B10_FLOAT? + case FMT_16_16: + if (el.is_normalized) { + vtx_format = el.is_signed ? + DXGI_FORMAT_R16G16_SNORM : DXGI_FORMAT_R16G16_UNORM; + } else { + vtx_format = el.is_signed ? + DXGI_FORMAT_R16G16_SINT : DXGI_FORMAT_R16G16_UINT; + } + break; + case FMT_16_16_16_16: + if (el.is_normalized) { + vtx_format = el.is_signed ? + DXGI_FORMAT_R16G16B16A16_SNORM : DXGI_FORMAT_R16G16B16A16_UNORM; + } else { + vtx_format = el.is_signed ? + DXGI_FORMAT_R16G16B16A16_SINT : DXGI_FORMAT_R16G16B16A16_UINT; + } + break; + case FMT_16_16_FLOAT: + vtx_format = DXGI_FORMAT_R16G16_FLOAT; + break; + case FMT_16_16_16_16_FLOAT: + vtx_format = DXGI_FORMAT_R16G16B16A16_FLOAT; + break; + case FMT_32: + vtx_format = el.is_signed ? + DXGI_FORMAT_R32_SINT : DXGI_FORMAT_R32_UINT; + break; + case FMT_32_32: + vtx_format = el.is_signed ? + DXGI_FORMAT_R32G32_SINT : DXGI_FORMAT_R32G32_UINT; + break; + case FMT_32_32_32_32: + vtx_format = el.is_signed ? 
+ DXGI_FORMAT_R32G32B32A32_SINT : DXGI_FORMAT_R32G32B32A32_UINT; + break; + case FMT_32_FLOAT: + vtx_format = DXGI_FORMAT_R32_FLOAT; + break; + case FMT_32_32_FLOAT: + vtx_format = DXGI_FORMAT_R32G32_FLOAT; + break; + case FMT_32_32_32_FLOAT: + vtx_format = DXGI_FORMAT_R32G32B32_FLOAT; + break; + case FMT_32_32_32_32_FLOAT: + vtx_format = DXGI_FORMAT_R32G32B32A32_FLOAT; + break; + default: + XEASSERTALWAYS(); + break; + } + element_descs[el_index].SemanticName = "XE_VF"; + element_descs[el_index].SemanticIndex = el_index; + element_descs[el_index].Format = vtx_format; + element_descs[el_index].InputSlot = vb_slot; + element_descs[el_index].AlignedByteOffset = el.offset_words * 4; + element_descs[el_index].InputSlotClass = D3D11_INPUT_PER_VERTEX_DATA; + element_descs[el_index].InstanceDataStepRate = 0; + el_index++; + } + } + HRESULT hr = resource_cache_->device()->CreateInputLayout( + element_descs, + (UINT)element_count, + byte_code, byte_code_length, + &input_layout_); + if (FAILED(hr)) { + XELOGE("D3D11: failed to create vertex shader input layout"); + return 1; + } + + return 0; +} + +int D3D11VertexShaderResource::DemandGeometryShader( + GeometryShaderType type, D3D11GeometryShader** out_shader) { + if (geometry_shaders_[type]) { + *out_shader = geometry_shaders_[type]; + return 0; + } + + // Demand generate. + auto device = resource_cache_->device(); + D3D11GeometryShader* shader = nullptr; + switch (type) { + case POINT_SPRITE_SHADER: + shader = new D3D11PointSpriteGeometryShader(device); + break; + case RECT_LIST_SHADER: + shader = new D3D11RectListGeometryShader(device); + break; + case QUAD_LIST_SHADER: + shader = new D3D11QuadListGeometryShader(device); + break; + default: + XEASSERTALWAYS(); + return 1; + } + if (!shader) { + return 1; + } + + if (shader->Prepare(this)) { + delete shader; + return 1; + } + + geometry_shaders_[type] = shader; + *out_shader = geometry_shaders_[type]; + return 0; +} + +D3D11PixelShaderResource::D3D11PixelShaderResource( + D3D11ResourceCache* resource_cache, + const MemoryRange& memory_range, + const Info& info) + : PixelShaderResource(memory_range, info), + resource_cache_(resource_cache), + handle_(nullptr), + translated_src_(nullptr) { +} + +D3D11PixelShaderResource::~D3D11PixelShaderResource() { + XESAFERELEASE(handle_); + xe_free(translated_src_); +} + +int D3D11PixelShaderResource::Prepare(const xe_gpu_program_cntl_t& program_cntl, + VertexShaderResource* input_shader) { + SCOPE_profile_cpu_f("gpu"); + if (is_prepared_ || handle_) { + return 0; + } + + // TODO(benvanik): look in file based on hash/etc. + void* byte_code = NULL; + size_t byte_code_length = 0; + + // Translate and compile source. + D3D11ShaderTranslator translator; + int ret = translator.TranslatePixelShader(this, + program_cntl, + input_shader->alloc_counts()); + if (ret) { + XELOGE("D3D11: failed to translate pixel shader"); + return ret; + } + translated_src_ = xestrdupa(translator.translated_src()); + + ID3D10Blob* shader_blob = D3D11ShaderCompile( + XE_GPU_SHADER_TYPE_PIXEL, translated_src_, disasm_src()); + if (!shader_blob) { + return 1; + } + byte_code_length = shader_blob->GetBufferSize(); + byte_code = xe_malloc(byte_code_length); + xe_copy_struct( + byte_code, shader_blob->GetBufferPointer(), byte_code_length); + XESAFERELEASE(shader_blob); + + // Create shader. 
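// (Mirrors the vertex path: the compiled blob is copied into an xe_malloc
//  buffer and the blob released right away; the TODO above suggests the
//  buffer is eventually meant to come from an on-disk bytecode cache keyed
//  by hash.)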
+ HRESULT hr = resource_cache_->device()->CreatePixelShader( + byte_code, byte_code_length, + nullptr, + &handle_); + if (FAILED(hr)) { + XELOGE("D3D11: failed to create pixel shader"); + xe_free(byte_code); + return 1; + } + + xe_free(byte_code); + is_prepared_ = true; + return 0; +} diff --git a/src/xenia/gpu/d3d11/d3d11_shader_resource.h b/src/xenia/gpu/d3d11/d3d11_shader_resource.h new file mode 100644 index 000000000..5c0da8242 --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_shader_resource.h @@ -0,0 +1,91 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_D3D11_D3D11_SHADER_RESOURCE_H_ +#define XENIA_GPU_D3D11_D3D11_SHADER_RESOURCE_H_ + +#include +#include + +#include + + +namespace xe { +namespace gpu { +namespace d3d11 { + +class D3D11GeometryShader; +class D3D11ResourceCache; + +struct Output; +typedef struct { + Output* output; + xenos::XE_GPU_SHADER_TYPE type; + uint32_t tex_fetch_index; +} xe_gpu_translate_ctx_t; + +class D3D11VertexShaderResource : public VertexShaderResource { +public: + D3D11VertexShaderResource(D3D11ResourceCache* resource_cache, + const MemoryRange& memory_range, + const Info& info); + ~D3D11VertexShaderResource() override; + + void* handle() const override { return handle_; } + ID3D11InputLayout* input_layout() const { return input_layout_; } + const char* translated_src() const { return translated_src_; } + + int Prepare(const xenos::xe_gpu_program_cntl_t& program_cntl) override; + + enum GeometryShaderType { + POINT_SPRITE_SHADER, + RECT_LIST_SHADER, + QUAD_LIST_SHADER, + MAX_GEOMETRY_SHADER_TYPE, // keep at the end + }; + int DemandGeometryShader(GeometryShaderType type, + D3D11GeometryShader** out_shader); + +private: + int CreateInputLayout(const void* byte_code, size_t byte_code_length); + + D3D11ResourceCache* resource_cache_; + ID3D11VertexShader* handle_; + ID3D11InputLayout* input_layout_; + D3D11GeometryShader* geometry_shaders_[MAX_GEOMETRY_SHADER_TYPE]; + char* translated_src_; +}; + + +class D3D11PixelShaderResource : public PixelShaderResource { +public: + D3D11PixelShaderResource(D3D11ResourceCache* resource_cache, + const MemoryRange& memory_range, + const Info& info); + ~D3D11PixelShaderResource() override; + + void* handle() const override { return handle_; } + const char* translated_src() const { return translated_src_; } + + int Prepare(const xenos::xe_gpu_program_cntl_t& program_cntl, + VertexShaderResource* vertex_shader) override; + +private: + D3D11ResourceCache* resource_cache_; + ID3D11PixelShader* handle_; + char* translated_src_; +}; + + +} // namespace d3d11 +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_D3D11_D3D11_SHADER_RESOURCE_H_ diff --git a/src/xenia/gpu/d3d11/d3d11_shader_translator.cc b/src/xenia/gpu/d3d11/d3d11_shader_translator.cc new file mode 100644 index 000000000..dde024356 --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_shader_translator.cc @@ -0,0 +1,1625 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben 
Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include +#include +#include +#include + + +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::d3d11; +using namespace xe::gpu::xenos; + + +namespace { + +const char* GetFormatTypeName(const VertexBufferResource::DeclElement& el) { + switch (el.format) { + case FMT_32: + return el.is_signed ? "int" : "uint"; + case FMT_32_FLOAT: + return "float"; + case FMT_16_16: + case FMT_32_32: + if (el.is_normalized) { + return el.is_signed ? "snorm float2" : "unorm float2"; + } else { + return el.is_signed ? "int2" : "uint2"; + } + case FMT_16_16_FLOAT: + case FMT_32_32_FLOAT: + return "float2"; + case FMT_10_11_11: + case FMT_11_11_10: + return "int3"; // ? + case FMT_32_32_32_FLOAT: + return "float3"; + case FMT_8_8_8_8: + case FMT_2_10_10_10: + case FMT_16_16_16_16: + case FMT_32_32_32_32: + if (el.is_normalized) { + return el.is_signed ? "snorm float4" : "unorm float4"; + } else { + return el.is_signed ? "int4" : "uint4"; + } + case FMT_16_16_16_16_FLOAT: + case FMT_32_32_32_32_FLOAT: + return "float4"; + default: + XELOGE("Unknown vertex format: %d", el.format); + XEASSERTALWAYS(); + return "float4"; + } +} + +} // anonymous namespace + +D3D11ShaderTranslator::D3D11ShaderTranslator() + : capacity_(kCapacity), offset_(0) { + buffer_[0] = 0; +} + +int D3D11ShaderTranslator::TranslateVertexShader( + VertexShaderResource* vertex_shader, + const xe_gpu_program_cntl_t& program_cntl) { + SCOPE_profile_cpu_f("gpu"); + + type_ = XE_GPU_SHADER_TYPE_VERTEX; + tex_fetch_index_ = 0; + dwords_ = vertex_shader->dwords(); + + // Add constants buffers. + // We could optimize this by only including used buffers, but the compiler + // seems to do a good job of doing this for us. + // It also does read detection, so c[512] can end up c[4] in the asm - + // instead of doing this optimization ourselves we could maybe just query + // this from the compiler. + append( + "cbuffer float_consts : register(b0) {\n" + " float4 c[512];\n" + "};\n"); + // TODO(benvanik): add bool/loop constants. + + AppendTextureHeader(vertex_shader->sampler_inputs()); + + // Transform utilities. We adjust the output position in various ways + // as we can't do this via D3D11 APIs. + append( + "cbuffer vs_consts : register(b3) {\n" + " float4 window;\n" // x,y,w,h + " float4 viewport_z_enable;\n" // min,(max - min),?,enabled + " float4 viewport_size;\n" // x,y,w,h + "};" + "float4 applyViewport(float4 pos) {\n" + " if (viewport_z_enable.w) {\n" + //" pos.x = (pos.x + 1) * viewport_size.z * 0.5 + viewport_size.x;\n" + //" pos.y = (1 - pos.y) * viewport_size.w * 0.5 + viewport_size.y;\n" + //" pos.z = viewport_z_enable.x + pos.z * viewport_z_enable.y;\n" + // w? + " } else {\n" + " pos.xy = pos.xy / float2(window.z / 2.0, -window.w / 2.0) + float2(-1.0, 1.0);\n" + " pos.zw = float2(0.0, 1.0);\n" + " }\n" + " pos.xy += window.xy;\n" + " return pos;\n" + "}\n"); + + // Add vertex shader input. 
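+  // Each vertex fetch constant appears to pack three fetch slots, hence the
+  // fetch_slot = const_index * 3 + const_index_sel flattening used below and
+  // in TranslateVertexFetch. A signed 2x32 element at word offset 2 in slot
+  // 95 would come out as, illustratively:
+  //   int2 vf95_2 : XE_VF0;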
+ append( + "struct VS_INPUT {\n"); + uint32_t el_index = 0; + const auto& buffer_inputs = vertex_shader->buffer_inputs(); + for (uint32_t n = 0; n < buffer_inputs.count; n++) { + const auto& input = buffer_inputs.descs[n]; + for (uint32_t m = 0; m < input.info.element_count; m++) { + const auto& el = input.info.elements[m]; + const char* type_name = GetFormatTypeName(el); + const auto& fetch = el.vtx_fetch; + uint32_t fetch_slot = fetch.const_index * 3 + fetch.const_index_sel; + append( + " %s vf%u_%d : XE_VF%u;\n", + type_name, fetch_slot, fetch.offset, el_index); + el_index++; + } + } + append( + "};\n"); + + // Add vertex shader output (pixel shader input). + const auto& alloc_counts = vertex_shader->alloc_counts(); + append( + "struct VS_OUTPUT {\n"); + if (alloc_counts.positions) { + XEASSERT(alloc_counts.positions == 1); + append( + " float4 oPos : SV_POSITION;\n"); + } + if (alloc_counts.params) { + append( + " float4 o[%d] : XE_O;\n", + kMaxInterpolators); + } + if (alloc_counts.point_size) { + append( + " float4 oPointSize : PSIZE;\n"); + } + append( + "};\n"); + + // Vertex shader main() header. + append( + "VS_OUTPUT main(VS_INPUT i) {\n" + " VS_OUTPUT o;\n"); + + // Always write position, as some shaders seem to only write certain values. + append( + " o.oPos = float4(0.0, 0.0, 0.0, 0.0);\n"); + if (alloc_counts.point_size) { + append( + " o.oPointSize = float4(1.0, 0.0, 0.0, 0.0);\n"); + } + + // TODO(benvanik): remove this, if possible (though the compiler may be smart + // enough to do it for us). + if (alloc_counts.params) { + for (uint32_t n = 0; n < kMaxInterpolators; n++) { + append( + " o.o[%d] = float4(0.0, 0.0, 0.0, 0.0);\n", n); + } + } + + // Add temporaries for any registers we may use. + uint32_t temp_regs = program_cntl.vs_regs + program_cntl.ps_regs; + for (uint32_t n = 0; n <= temp_regs; n++) { + append( + " float4 r%d = c[%d];\n", n, n); + } + append(" float4 t;\n"); + + // Execute blocks. + const auto& execs = vertex_shader->execs(); + for (auto it = execs.begin(); it != execs.end(); ++it) { + const instr_cf_exec_t& cf = *it; + // TODO(benvanik): figure out how sequences/jmps/loops/etc work. + if (TranslateExec(cf)) { + return 1; + } + } + + // main footer. + append( + " o.oPos = applyViewport(o.oPos);\n" + " return o;\n" + "};\n"); + + return 0; +} + +int D3D11ShaderTranslator::TranslatePixelShader( + PixelShaderResource* pixel_shader, + const xe_gpu_program_cntl_t& program_cntl, + const VertexShaderResource::AllocCounts& alloc_counts) { + SCOPE_profile_cpu_f("gpu"); + + // We need an input VS to make decisions here. + // TODO(benvanik): do we need to pair VS/PS up and store the combination? + // If the same PS is used with different VS that output different amounts + // (and less than the number of required registers), things may die. + + type_ = XE_GPU_SHADER_TYPE_PIXEL; + tex_fetch_index_ = 0; + dwords_ = pixel_shader->dwords(); + + // Add constants buffers. + // We could optimize this by only including used buffers, but the compiler + // seems to do a good job of doing this for us. + // It also does read detection, so c[512] can end up c[4] in the asm - + // instead of doing this optimization ourselves we could maybe just query + // this from the compiler. + append( + "cbuffer float_consts : register(b0) {\n" + " float4 c[512];\n" + "};\n"); + // TODO(benvanik): add bool/loop constants. + + AppendTextureHeader(pixel_shader->sampler_inputs()); + + // Add vertex shader output (pixel shader input). 
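+  // This corresponds to the VS_OUTPUT emitted by TranslateVertexShader above
+  // (PSIZE is omitted since the pixel shader never reads it); D3D11 matches
+  // the stage signatures by semantic, so with all interpolators in use the
+  // pixel shader sees, for example:
+  //   struct VS_OUTPUT {
+  //     float4 oPos : SV_POSITION;
+  //     float4 o[16] : XE_O;
+  //   };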
+  append(
+      "struct VS_OUTPUT {\n");
+  if (alloc_counts.positions) {
+    XEASSERT(alloc_counts.positions == 1);
+    append(
+        "  float4 oPos : SV_POSITION;\n");
+  }
+  if (alloc_counts.params) {
+    append(
+        "  float4 o[%d] : XE_O;\n",
+        kMaxInterpolators);
+  }
+  append(
+      "};\n");
+
+  // Add pixel shader output.
+  append(
+      "struct PS_OUTPUT {\n");
+  for (uint32_t n = 0; n < alloc_counts.params; n++) {
+    append(
+        "  float4 oC%d : SV_TARGET%d;\n", n, n);
+    if (program_cntl.ps_export_depth) {
+      // Is this per render-target?
+      append(
+          "  float oD%d : SV_DEPTH%d;\n", n, n);
+    }
+  }
+  append(
+      "};\n");
+
+  // Pixel shader main() header.
+  append(
+      "PS_OUTPUT main(VS_OUTPUT i) {\n"
+      "  PS_OUTPUT o;\n");
+
+  // Add temporary registers.
+  uint32_t temp_regs = program_cntl.vs_regs + program_cntl.ps_regs;
+  for (uint32_t n = 0; n <= MAX(15, temp_regs); n++) {
+    append(
+        "  float4 r%d = c[%d];\n", n, n);
+  }
+  append("  float4 t;\n");
+
+  // Bring registers local.
+  if (alloc_counts.params) {
+    for (uint32_t n = 0; n < kMaxInterpolators; n++) {
+      append(
+          "  r%d = i.o[%d];\n", n, n);
+    }
+  }
+
+  // Execute blocks.
+  const auto& execs = pixel_shader->execs();
+  for (auto it = execs.begin(); it != execs.end(); ++it) {
+    const instr_cf_exec_t& cf = *it;
+    // TODO(benvanik): figure out how sequences/jmps/loops/etc work.
+    if (TranslateExec(cf)) {
+      return 1;
+    }
+  }
+
+  // main footer.
+  append(
+      "  return o;\n"
+      "}\n");
+
+  return 0;
+}
+
+void D3D11ShaderTranslator::AppendTextureHeader(
+    const ShaderResource::SamplerInputs& sampler_inputs) {
+  bool fetch_setup[32] = { false };
+
+  // 1 texture per constant slot, 1 sampler per fetch.
+  for (uint32_t n = 0; n < sampler_inputs.count; n++) {
+    const auto& input = sampler_inputs.descs[n];
+    const auto& fetch = input.tex_fetch;
+
+    // Add texture, if needed.
+    if (!fetch_setup[fetch.const_idx]) {
+      fetch_setup[fetch.const_idx] = true;
+      const char* texture_type = NULL;
+      switch (fetch.dimension) {
+      case DIMENSION_1D:
+        texture_type = "Texture1D";
+        break;
+      default:
+      case DIMENSION_2D:
+        texture_type = "Texture2D";
+        break;
+      case DIMENSION_3D:
+        texture_type = "Texture3D";
+        break;
+      case DIMENSION_CUBE:
+        texture_type = "TextureCube";
+        break;
+      }
+      append("%s x_texture_%d;\n", texture_type, fetch.const_idx);
+    }
+
+    // Add sampler.
+    append("SamplerState x_sampler_%d;\n", n);
+  }
+}
+
+namespace {
+
+static const char chan_names[] = {
+    'x', 'y', 'z', 'w',
+    // these only apply to FETCH dst's, and we shouldn't be using them:
+    '0', '1', '?', '_',
+};
+
+} // namespace
+
+void D3D11ShaderTranslator::AppendSrcReg(uint32_t num, uint32_t type,
+                                         uint32_t swiz, uint32_t negate,
+                                         uint32_t abs) {
+  if (negate) {
+    append("-");
+  }
+  if (abs) {
+    append("abs(");
+  }
+  if (type) {
+    // Register.
+    append("r%u", num);
+  } else {
+    // Constant.
+    append("c[%u]", num);
+  }
+  if (swiz) {
+    append(".");
+    for (int i = 0; i < 4; i++) {
+      append("%c", chan_names[(swiz + i) & 0x3]);
+      swiz >>= 2;
+    }
+  }
+  if (abs) {
+    append(")");
+  }
+}
+
+void D3D11ShaderTranslator::AppendDestRegName(uint32_t num, uint32_t dst_exp) {
+  if (!dst_exp) {
+    // Register.
+    append("r%u", num);
+  } else {
+    // Export.
+    switch (type_) {
+    case XE_GPU_SHADER_TYPE_VERTEX:
+      switch (num) {
+      case 62:
+        append("o.oPos");
+        break;
+      case 63:
+        append("o.oPointSize");
+        break;
+      default:
+        // Varying.
+        append("o.o[%u]", num);
+        break;
+      }
+      break;
+    case XE_GPU_SHADER_TYPE_PIXEL:
+      switch (num) {
+      case 0:
+        append("o.oC0");
+        break;
+      default:
+        // TODO(benvanik): other render targets?
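+        // (PS_OUTPUT already declares oC0..oCn : SV_TARGETn above, so
+        // exports 1..3 would presumably map to o.oC1..o.oC3 here.)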
+ // TODO(benvanik): depth? + XEASSERTALWAYS(); + break; + } + break; + } + } +} + +void D3D11ShaderTranslator::AppendDestReg(uint32_t num, uint32_t mask, + uint32_t dst_exp) { + if (mask != 0xF) { + // If masking, store to a temporary variable and clean it up later. + append("t"); + } else { + // Store directly to output. + AppendDestRegName(num, dst_exp); + } +} + +void D3D11ShaderTranslator::AppendDestRegPost(uint32_t num, uint32_t mask, + uint32_t dst_exp) { + if (mask != 0xF) { + // Masking. + append(" "); + AppendDestRegName(num, dst_exp); + append(" = float4("); + for (int i = 0; i < 4; i++) { + // TODO(benvanik): mask out values? mix in old value as temp? + // append("%c", (mask & 0x1) ? chan_names[i] : 'w'); + if (!(mask & 0x1)) { + AppendDestRegName(num, dst_exp); + } else { + append("t"); + } + append(".%c", chan_names[i]); + mask >>= 1; + if (i < 3) { + append(", "); + } + } + append(");\n"); + } +} + +void D3D11ShaderTranslator::PrintSrcReg(uint32_t num, uint32_t type, + uint32_t swiz, uint32_t negate, + uint32_t abs) { + if (negate) { + append("-"); + } + if (abs) { + append("|"); + } + append("%c%u", type ? 'R' : 'C', num); + if (swiz) { + append("."); + for (int i = 0; i < 4; i++) { + append("%c", chan_names[(swiz + i) & 0x3]); + swiz >>= 2; + } + } + if (abs) { + append("|"); + } +} + +void D3D11ShaderTranslator::PrintDstReg(uint32_t num, uint32_t mask, + uint32_t dst_exp) { + append("%s%u", dst_exp ? "export" : "R", num); + if (mask != 0xf) { + append("."); + for (int i = 0; i < 4; i++) { + append("%c", (mask & 0x1) ? chan_names[i] : '_'); + mask >>= 1; + } + } +} + +void D3D11ShaderTranslator::PrintExportComment(uint32_t num) { + const char *name = NULL; + switch (type_) { + case XE_GPU_SHADER_TYPE_VERTEX: + switch (num) { + case 62: name = "gl_Position"; break; + case 63: name = "gl_PointSize"; break; + } + break; + case XE_GPU_SHADER_TYPE_PIXEL: + switch (num) { + case 0: name = "gl_FragColor"; break; + } + break; + } + /* if we had a symbol table here, we could look + * up the name of the varying.. 
+ */ + if (name) { + append("\t; %s", name); + } +} + +int D3D11ShaderTranslator::TranslateALU_ADDv(const instr_alu_t& alu) { + AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); + append(" = "); + if (alu.vector_clamp) { + append("saturate("); + } + append("("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(" + "); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(")"); + if (alu.vector_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); + return 0; +} + +int D3D11ShaderTranslator::TranslateALU_MULv(const instr_alu_t& alu) { + AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); + append(" = "); + if (alu.vector_clamp) { + append("saturate("); + } + append("("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(" * "); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(")"); + if (alu.vector_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); + return 0; +} + +int D3D11ShaderTranslator::TranslateALU_MAXv(const instr_alu_t& alu) { + AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); + append(" = "); + if (alu.vector_clamp) { + append("saturate("); + } + if (alu.src1_reg == alu.src2_reg && + alu.src1_sel == alu.src2_sel && + alu.src1_swiz == alu.src2_swiz && + alu.src1_reg_negate == alu.src2_reg_negate && + alu.src1_reg_abs == alu.src2_reg_abs) { + // This is a mov. + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + } else { + append("max("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(", "); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(")"); + } + if (alu.vector_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); + return 0; +} + +int D3D11ShaderTranslator::TranslateALU_MINv(const instr_alu_t& alu) { + AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); + append(" = "); + if (alu.vector_clamp) { + append("saturate("); + } + append("min("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(", "); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(")"); + if (alu.vector_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); + return 0; +} + +int D3D11ShaderTranslator::TranslateALU_SETXXv(const instr_alu_t& alu, const char* op) { + AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); + append(" = "); + if (alu.vector_clamp) { + append("saturate("); + } + append("float4(("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(").x %s (", op); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(").x ? 
1.0 : 0.0, ("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(").y %s (", op); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(").y ? 1.0 : 0.0, ("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(").z %s (", op); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(").z ? 1.0 : 0.0, ("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(").w %s (", op); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(").w ? 1.0 : 0.0)"); + if (alu.vector_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); + return 0; +} +int D3D11ShaderTranslator::TranslateALU_SETEv(const instr_alu_t& alu) { + return TranslateALU_SETXXv(alu, "=="); +} +int D3D11ShaderTranslator::TranslateALU_SETGTv(const instr_alu_t& alu) { + return TranslateALU_SETXXv(alu, ">"); +} +int D3D11ShaderTranslator::TranslateALU_SETGTEv(const instr_alu_t& alu) { + return TranslateALU_SETXXv(alu, ">="); +} +int D3D11ShaderTranslator::TranslateALU_SETNEv(const instr_alu_t& alu) { + return TranslateALU_SETXXv(alu, "!="); +} + +int D3D11ShaderTranslator::TranslateALU_FRACv(const instr_alu_t& alu) { + AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); + append(" = "); + if (alu.vector_clamp) { + append("saturate("); + } + append("frac("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(")"); + if (alu.vector_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); + return 0; +} + +int D3D11ShaderTranslator::TranslateALU_TRUNCv(const instr_alu_t& alu) { + AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); + append(" = "); + if (alu.vector_clamp) { + append("saturate("); + } + append("trunc("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(")"); + if (alu.vector_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); + return 0; +} + +int D3D11ShaderTranslator::TranslateALU_FLOORv(const instr_alu_t& alu) { + AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); + append(" = "); + if (alu.vector_clamp) { + append("saturate("); + } + append("floor("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(")"); + if (alu.vector_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); + return 0; +} + +int D3D11ShaderTranslator::TranslateALU_MULADDv(const instr_alu_t& alu) { + AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); + append(" = "); + if (alu.vector_clamp) { + append("saturate("); + } + append("mad("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(", "); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(", "); + AppendSrcReg(alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); + append(")"); + if (alu.vector_clamp) { + append(")"); + } + 
append(";\n"); + AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); + return 0; +} + +int D3D11ShaderTranslator::TranslateALU_CNDXXv(const instr_alu_t& alu, const char* op) { + AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); + append(" = "); + if (alu.vector_clamp) { + append("saturate("); + } + // TODO(benvanik): check argument order - could be 3 as compare and 1 and 2 as values. + append("float4(("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(").x %s 0.0 ? (", op); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(").x : ("); + AppendSrcReg(alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); + append(").x, ("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(").y %s 0.0 ? (", op); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(").y : ("); + AppendSrcReg(alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); + append(").y, ("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(").z %s 0.0 ? (", op); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(").z : ("); + AppendSrcReg(alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); + append(").z, ("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(").w %s 0.0 ? (", op); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(").w : ("); + AppendSrcReg(alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); + append(").w)"); + if (alu.vector_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); + return 0; +} +int D3D11ShaderTranslator::TranslateALU_CNDEv(const instr_alu_t& alu) { + return TranslateALU_CNDXXv(alu, "=="); +} +int D3D11ShaderTranslator::TranslateALU_CNDGTEv(const instr_alu_t& alu) { + return TranslateALU_CNDXXv(alu, ">="); +} +int D3D11ShaderTranslator::TranslateALU_CNDGTv(const instr_alu_t& alu) { + return TranslateALU_CNDXXv(alu, ">"); +} + +int D3D11ShaderTranslator::TranslateALU_DOT4v(const instr_alu_t& alu) { + AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); + append(" = "); + if (alu.vector_clamp) { + append("saturate("); + } + append("dot("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(", "); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(")"); + if (alu.vector_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); + return 0; +} + +int D3D11ShaderTranslator::TranslateALU_DOT3v(const instr_alu_t& alu) { + AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); + append(" = "); + if (alu.vector_clamp) { + append("saturate("); + } + append("dot(float4("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(").xyz, float4("); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(").xyz)"); + if 
(alu.vector_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); + return 0; +} + +int D3D11ShaderTranslator::TranslateALU_DOT2ADDv(const instr_alu_t& alu) { + AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); + append(" = "); + if (alu.vector_clamp) { + append("saturate("); + } + append("dot(float4("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(").xy, float4("); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(").xy) + "); + AppendSrcReg(alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); + append(".x"); + if (alu.vector_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); + return 0; +} + +// CUBEv + +int D3D11ShaderTranslator::TranslateALU_MAX4v(const instr_alu_t& alu) { + AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); + append(" = "); + if (alu.vector_clamp) { + append("saturate("); + } + append("max("); + append("max("); + append("max("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(".x, "); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(".y), "); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(".z), "); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(".w)"); + if (alu.vector_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); + return 0; +} + +// ... + +int D3D11ShaderTranslator::TranslateALU_MAXs(const instr_alu_t& alu) { + AppendDestReg(alu.scalar_dest, alu.scalar_write_mask, alu.export_data); + append(" = "); + if (alu.scalar_clamp) { + append("saturate("); + } + if ((alu.src3_swiz & 0x3) == (((alu.src3_swiz >> 2) + 1) & 0x3)) { + // This is a mov. 
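+    // (Swizzle fields are stored relative to their channel: channel N reads
+    // component (field + N) & 3, as in AppendSrcReg. The test above fires
+    // when .x and .y resolve to the same component, and max(a.c, a.c) is
+    // just a move of that component.)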
+ AppendSrcReg(alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); + } else { + append("max("); + AppendSrcReg(alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); + append(".x, "); + AppendSrcReg(alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); + append(".y).xxxx"); + } + if (alu.scalar_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.scalar_dest, alu.scalar_write_mask, alu.export_data); + return 0; +} + +int D3D11ShaderTranslator::TranslateALU_MINs(const instr_alu_t& alu) { + AppendDestReg(alu.scalar_dest, alu.scalar_write_mask, alu.export_data); + append(" = "); + if (alu.scalar_clamp) { + append("saturate("); + } + append("min("); + AppendSrcReg(alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); + append(".x, "); + AppendSrcReg(alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); + append(".y).xxxx"); + if (alu.scalar_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.scalar_dest, alu.scalar_write_mask, alu.export_data); + return 0; +} + +int D3D11ShaderTranslator::TranslateALU_SETXXs(const instr_alu_t& alu, const char* op) { + AppendDestReg(alu.scalar_dest, alu.scalar_write_mask, alu.export_data); + append(" = "); + if (alu.scalar_clamp) { + append("saturate("); + } + append("(("); + AppendSrcReg(alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); + append(".x %s 0.0) ? 1.0 : 0.0).xxxx", op); + if (alu.scalar_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.scalar_dest, alu.scalar_write_mask, alu.export_data); + return 0; +} +int D3D11ShaderTranslator::TranslateALU_SETEs(const instr_alu_t& alu) { + return TranslateALU_SETXXs(alu, "=="); +} +int D3D11ShaderTranslator::TranslateALU_SETGTs(const instr_alu_t& alu) { + return TranslateALU_SETXXs(alu, ">"); +} +int D3D11ShaderTranslator::TranslateALU_SETGTEs(const instr_alu_t& alu) { + return TranslateALU_SETXXs(alu, ">="); +} +int D3D11ShaderTranslator::TranslateALU_SETNEs(const instr_alu_t& alu) { + return TranslateALU_SETXXs(alu, "!="); +} + +int D3D11ShaderTranslator::TranslateALU_RECIP_IEEE(const instr_alu_t& alu) { + AppendDestReg(alu.scalar_dest, alu.scalar_write_mask, alu.export_data); + append(" = "); + if (alu.scalar_clamp) { + append("saturate("); + } + append("(1.0 / "); + AppendSrcReg(alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); + append(")"); + if (alu.scalar_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.scalar_dest, alu.scalar_write_mask, alu.export_data); + return 0; +} + +int D3D11ShaderTranslator::TranslateALU_MUL_CONST_0(const instr_alu_t& alu) { + AppendDestReg(alu.scalar_dest, alu.scalar_write_mask, alu.export_data); + append(" = "); + if (alu.scalar_clamp) { + append("saturate("); + } + uint32_t src3_swiz = alu.src3_swiz & ~0x3C; + uint32_t swiz_a = ((src3_swiz >> 6) - 1) & 0x3; + uint32_t swiz_b = (src3_swiz & 0x3); + uint32_t reg2 = (alu.scalar_opc & 1) | (alu.src3_swiz & 0x3C) | (alu.src3_sel << 1); + append("("); + AppendSrcReg(alu.src3_reg, 0, 0, alu.src3_reg_negate, alu.src3_reg_abs); + append(".%c * ", chan_names[swiz_a]); + AppendSrcReg(reg2, 1, 0, alu.src3_reg_negate, alu.src3_reg_abs); + append(".%c", chan_names[swiz_b]); + append(").xxxx"); + if (alu.scalar_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.scalar_dest, alu.scalar_write_mask, alu.export_data); + return 0; +} +int 
D3D11ShaderTranslator::TranslateALU_MUL_CONST_1(const instr_alu_t& alu) { + return TranslateALU_MUL_CONST_0(alu); +} + +int D3D11ShaderTranslator::TranslateALU_ADD_CONST_0(const instr_alu_t& alu) { + AppendDestReg(alu.scalar_dest, alu.scalar_write_mask, alu.export_data); + append(" = "); + if (alu.scalar_clamp) { + append("saturate("); + } + uint32_t src3_swiz = alu.src3_swiz & ~0x3C; + uint32_t swiz_a = ((src3_swiz >> 6) - 1) & 0x3; + uint32_t swiz_b = (src3_swiz & 0x3); + uint32_t reg2 = (alu.scalar_opc & 1) | (alu.src3_swiz & 0x3C) | (alu.src3_sel << 1); + append("("); + AppendSrcReg(alu.src3_reg, 0, 0, alu.src3_reg_negate, alu.src3_reg_abs); + append(".%c + ", chan_names[swiz_a]); + AppendSrcReg(reg2, 1, 0, alu.src3_reg_negate, alu.src3_reg_abs); + append(".%c", chan_names[swiz_b]); + append(").xxxx"); + if (alu.scalar_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.scalar_dest, alu.scalar_write_mask, alu.export_data); + return 0; +} +int D3D11ShaderTranslator::TranslateALU_ADD_CONST_1(const instr_alu_t& alu) { + return TranslateALU_ADD_CONST_0(alu); +} + +int D3D11ShaderTranslator::TranslateALU_SUB_CONST_0(const instr_alu_t& alu) { + AppendDestReg(alu.scalar_dest, alu.scalar_write_mask, alu.export_data); + append(" = "); + if (alu.scalar_clamp) { + append("saturate("); + } + uint32_t src3_swiz = alu.src3_swiz & ~0x3C; + uint32_t swiz_a = ((src3_swiz >> 6) - 1) & 0x3; + uint32_t swiz_b = (src3_swiz & 0x3); + uint32_t reg2 = (alu.scalar_opc & 1) | (alu.src3_swiz & 0x3C) | (alu.src3_sel << 1); + append("("); + AppendSrcReg(alu.src3_reg, 0, 0, alu.src3_reg_negate, alu.src3_reg_abs); + append(".%c - ", chan_names[swiz_a]); + AppendSrcReg(reg2, 1, 0, alu.src3_reg_negate, alu.src3_reg_abs); + append(".%c", chan_names[swiz_b]); + append(").xxxx"); + if (alu.scalar_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.scalar_dest, alu.scalar_write_mask, alu.export_data); + return 0; +} +int D3D11ShaderTranslator::TranslateALU_SUB_CONST_1(const instr_alu_t& alu) { + return TranslateALU_SUB_CONST_0(alu); +} + +namespace { + +typedef int (D3D11ShaderTranslator::*TranslateFn)(const instr_alu_t& alu); +typedef struct { + uint32_t num_srcs; + const char* name; + TranslateFn fn; +} TranslateInfo; +#define ALU_INSTR(opc, num_srcs) \ + { num_srcs, #opc, nullptr } +#define ALU_INSTR_IMPL(opc, num_srcs) \ + { num_srcs, #opc, &D3D11ShaderTranslator::TranslateALU_##opc } + +} // namespace + +int D3D11ShaderTranslator::TranslateALU(const instr_alu_t* alu, int sync) { + static TranslateInfo vector_alu_instrs[0x20] = { + ALU_INSTR_IMPL(ADDv, 2), // 0 + ALU_INSTR_IMPL(MULv, 2), // 1 + ALU_INSTR_IMPL(MAXv, 2), // 2 + ALU_INSTR_IMPL(MINv, 2), // 3 + ALU_INSTR_IMPL(SETEv, 2), // 4 + ALU_INSTR_IMPL(SETGTv, 2), // 5 + ALU_INSTR_IMPL(SETGTEv, 2), // 6 + ALU_INSTR_IMPL(SETNEv, 2), // 7 + ALU_INSTR_IMPL(FRACv, 1), // 8 + ALU_INSTR_IMPL(TRUNCv, 1), // 9 + ALU_INSTR_IMPL(FLOORv, 1), // 10 + ALU_INSTR_IMPL(MULADDv, 3), // 11 + ALU_INSTR_IMPL(CNDEv, 3), // 12 + ALU_INSTR_IMPL(CNDGTEv, 3), // 13 + ALU_INSTR_IMPL(CNDGTv, 3), // 14 + ALU_INSTR_IMPL(DOT4v, 2), // 15 + ALU_INSTR_IMPL(DOT3v, 2), // 16 + ALU_INSTR_IMPL(DOT2ADDv, 3), // 17 -- ??? 
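+    // (the ??? above: TranslateALU_DOT2ADDv emits dot(src1.xy, src2.xy) +
+    //  src3.x, but the exact operand roles are unconfirmed)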
+ ALU_INSTR(CUBEv, 2), // 18 + ALU_INSTR_IMPL(MAX4v, 1), // 19 + ALU_INSTR(PRED_SETE_PUSHv, 2), // 20 + ALU_INSTR(PRED_SETNE_PUSHv, 2), // 21 + ALU_INSTR(PRED_SETGT_PUSHv, 2), // 22 + ALU_INSTR(PRED_SETGTE_PUSHv, 2), // 23 + ALU_INSTR(KILLEv, 2), // 24 + ALU_INSTR(KILLGTv, 2), // 25 + ALU_INSTR(KILLGTEv, 2), // 26 + ALU_INSTR(KILLNEv, 2), // 27 + ALU_INSTR(DSTv, 2), // 28 + ALU_INSTR(MOVAv, 1), // 29 + }; + static TranslateInfo scalar_alu_instrs[0x40] = { + ALU_INSTR(ADDs, 1), // 0 + ALU_INSTR(ADD_PREVs, 1), // 1 + ALU_INSTR(MULs, 1), // 2 + ALU_INSTR(MUL_PREVs, 1), // 3 + ALU_INSTR(MUL_PREV2s, 1), // 4 + ALU_INSTR_IMPL(MAXs, 1), // 5 + ALU_INSTR_IMPL(MINs, 1), // 6 + ALU_INSTR_IMPL(SETEs, 1), // 7 + ALU_INSTR_IMPL(SETGTs, 1), // 8 + ALU_INSTR_IMPL(SETGTEs, 1), // 9 + ALU_INSTR_IMPL(SETNEs, 1), // 10 + ALU_INSTR(FRACs, 1), // 11 + ALU_INSTR(TRUNCs, 1), // 12 + ALU_INSTR(FLOORs, 1), // 13 + ALU_INSTR(EXP_IEEE, 1), // 14 + ALU_INSTR(LOG_CLAMP, 1), // 15 + ALU_INSTR(LOG_IEEE, 1), // 16 + ALU_INSTR(RECIP_CLAMP, 1), // 17 + ALU_INSTR(RECIP_FF, 1), // 18 + ALU_INSTR_IMPL(RECIP_IEEE, 1), // 19 + ALU_INSTR(RECIPSQ_CLAMP, 1), // 20 + ALU_INSTR(RECIPSQ_FF, 1), // 21 + ALU_INSTR(RECIPSQ_IEEE, 1), // 22 + ALU_INSTR(MOVAs, 1), // 23 + ALU_INSTR(MOVA_FLOORs, 1), // 24 + ALU_INSTR(SUBs, 1), // 25 + ALU_INSTR(SUB_PREVs, 1), // 26 + ALU_INSTR(PRED_SETEs, 1), // 27 + ALU_INSTR(PRED_SETNEs, 1), // 28 + ALU_INSTR(PRED_SETGTs, 1), // 29 + ALU_INSTR(PRED_SETGTEs, 1), // 30 + ALU_INSTR(PRED_SET_INVs, 1), // 31 + ALU_INSTR(PRED_SET_POPs, 1), // 32 + ALU_INSTR(PRED_SET_CLRs, 1), // 33 + ALU_INSTR(PRED_SET_RESTOREs, 1), // 34 + ALU_INSTR(KILLEs, 1), // 35 + ALU_INSTR(KILLGTs, 1), // 36 + ALU_INSTR(KILLGTEs, 1), // 37 + ALU_INSTR(KILLNEs, 1), // 38 + ALU_INSTR(KILLONEs, 1), // 39 + ALU_INSTR(SQRT_IEEE, 1), // 40 + { 0, 0, false }, + ALU_INSTR_IMPL(MUL_CONST_0, 2), // 42 + ALU_INSTR_IMPL(MUL_CONST_1, 2), // 43 + ALU_INSTR_IMPL(ADD_CONST_0, 2), // 44 + ALU_INSTR_IMPL(ADD_CONST_1, 2), // 45 + ALU_INSTR_IMPL(SUB_CONST_0, 2), // 46 + ALU_INSTR_IMPL(SUB_CONST_1, 2), // 47 + ALU_INSTR(SIN, 1), // 48 + ALU_INSTR(COS, 1), // 49 + ALU_INSTR(RETAIN_PREV, 1), // 50 + }; +#undef ALU_INSTR +#undef ALU_INSTR_IMPL + + if (!alu->scalar_write_mask && !alu->vector_write_mask) { + append(" // \n"); + return 0; + } + + if (alu->vector_write_mask) { + // Disassemble vector op. + const auto& iv = vector_alu_instrs[alu->vector_opc]; + append(" // %sALU:\t", sync ? "(S)" : " "); + append("%s", iv.name); + if (alu->pred_select & 0x2) { + // seems to work similar to conditional execution in ARM instruction + // set, so let's use a similar syntax for now: + append((alu->pred_select & 0x1) ? "EQ" : "NE"); + } + append("\t"); + PrintDstReg(alu->vector_dest, alu->vector_write_mask, alu->export_data); + append(" = "); + if (iv.num_srcs == 3) { + PrintSrcReg(alu->src3_reg, alu->src3_sel, alu->src3_swiz, + alu->src3_reg_negate, alu->src3_reg_abs); + append(", "); + } + PrintSrcReg(alu->src1_reg, alu->src1_sel, alu->src1_swiz, + alu->src1_reg_negate, alu->src1_reg_abs); + if (iv.num_srcs > 1) { + append(", "); + PrintSrcReg(alu->src2_reg, alu->src2_sel, alu->src2_swiz, + alu->src2_reg_negate, alu->src2_reg_abs); + } + if (alu->vector_clamp) { + append(" CLAMP"); + } + if (alu->export_data) { + PrintExportComment(alu->vector_dest); + } + append("\n"); + + // Translate vector op. 
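+    // Dispatch through the member-function pointer registered in the table
+    // above; ALU_INSTR entries (without _IMPL) leave fn null and fall
+    // through to the comment-only branch below.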
+ if (iv.fn) { + append(" "); + if ((this->*iv.fn)(*alu)) { + return 1; + } + } else { + append(" // \n"); + } + } + + if (alu->scalar_write_mask || !alu->vector_write_mask) { + // 2nd optional scalar op: + + // Disassemble scalar op. + const auto& is = scalar_alu_instrs[alu->scalar_opc]; + append(" // "); + append("\t"); + if (is.name) { + append("\t \t%s\t", is.name); + } else { + append("\t \tOP(%u)\t", alu->scalar_opc); + } + PrintDstReg(alu->scalar_dest, alu->scalar_write_mask, alu->export_data); + append(" = "); + if (is.num_srcs == 2) { + // ADD_CONST_0 dest, [const], [reg] + uint32_t src3_swiz = alu->src3_swiz & ~0x3C; + uint32_t swiz_a = ((src3_swiz >> 6) - 1) & 0x3; + uint32_t swiz_b = (src3_swiz & 0x3); + PrintSrcReg(alu->src3_reg, 0, 0, + alu->src3_reg_negate, alu->src3_reg_abs); + append(".%c", chan_names[swiz_a]); + append(", "); + uint32_t reg2 = (alu->scalar_opc & 1) | (alu->src3_swiz & 0x3C) | (alu->src3_sel << 1); + PrintSrcReg(reg2, 1, 0, + alu->src3_reg_negate, alu->src3_reg_abs); + append(".%c", chan_names[swiz_b]); + } else { + PrintSrcReg(alu->src3_reg, alu->src3_sel, alu->src3_swiz, + alu->src3_reg_negate, alu->src3_reg_abs); + } + if (alu->scalar_clamp) { + append(" CLAMP"); + } + if (alu->export_data) { + PrintExportComment(alu->scalar_dest); + } + append("\n"); + + // Translate scalar op. + if (is.fn) { + append(" "); + if ((this->*is.fn)(*alu)) { + return 1; + } + } else { + append(" // \n"); + } + } + + return 0; +} + +void D3D11ShaderTranslator::PrintDestFecth(uint32_t dst_reg, + uint32_t dst_swiz) { + append("\tR%u.", dst_reg); + for (int i = 0; i < 4; i++) { + append("%c", chan_names[dst_swiz & 0x7]); + dst_swiz >>= 3; + } +} + +void D3D11ShaderTranslator::AppendFetchDest(uint32_t dst_reg, + uint32_t dst_swiz) { + append("r%u.", dst_reg); + for (int i = 0; i < 4; i++) { + append("%c", chan_names[dst_swiz & 0x7]); + dst_swiz >>= 3; + } +} + +int D3D11ShaderTranslator::GetFormatComponentCount(uint32_t format) { + switch (format) { + case FMT_32: + case FMT_32_FLOAT: + return 1; + case FMT_16_16: + case FMT_16_16_FLOAT: + case FMT_32_32: + case FMT_32_32_FLOAT: + return 2; + case FMT_10_11_11: + case FMT_11_11_10: + case FMT_32_32_32_FLOAT: + return 3; + case FMT_8_8_8_8: + case FMT_2_10_10_10: + case FMT_16_16_16_16: + case FMT_16_16_16_16_FLOAT: + case FMT_32_32_32_32: + case FMT_32_32_32_32_FLOAT: + return 4; + default: + XELOGE("Unknown vertex format: %d", format); + XEASSERTALWAYS(); + return 4; + } +} + +int D3D11ShaderTranslator::TranslateExec(const instr_cf_exec_t& cf) { + static const struct { + const char *name; + } cf_instructions[] = { + #define INSTR(opc, fxn) { #opc } + INSTR(NOP, print_cf_nop), + INSTR(EXEC, print_cf_exec), + INSTR(EXEC_END, print_cf_exec), + INSTR(COND_EXEC, print_cf_exec), + INSTR(COND_EXEC_END, print_cf_exec), + INSTR(COND_PRED_EXEC, print_cf_exec), + INSTR(COND_PRED_EXEC_END, print_cf_exec), + INSTR(LOOP_START, print_cf_loop), + INSTR(LOOP_END, print_cf_loop), + INSTR(COND_CALL, print_cf_jmp_call), + INSTR(RETURN, print_cf_jmp_call), + INSTR(COND_JMP, print_cf_jmp_call), + INSTR(ALLOC, print_cf_alloc), + INSTR(COND_EXEC_PRED_CLEAN, print_cf_exec), + INSTR(COND_EXEC_PRED_CLEAN_END, print_cf_exec), + INSTR(MARK_VS_FETCH_DONE, print_cf_nop), // ?? 
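+    // (this table is indexed directly by cf.opc below, so entry order must
+    //  track the control-flow opcode numbering)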
+ #undef INSTR + }; + + append( + " // %s ADDR(0x%x) CNT(0x%x)", + cf_instructions[cf.opc].name, cf.address, cf.count); + if (cf.yeild) { + append(" YIELD"); + } + uint8_t vc = cf.vc_hi | (cf.vc_lo << 2); + if (vc) { + append(" VC(0x%x)", vc); + } + if (cf.bool_addr) { + append(" BOOL_ADDR(0x%x)", cf.bool_addr); + } + if (cf.address_mode == ABSOLUTE_ADDR) { + append(" ABSOLUTE_ADDR"); + } + if (cf.is_cond_exec()) { + append(" COND(%d)", cf.condition); + } + append("\n"); + + uint32_t sequence = cf.serialize; + for (uint32_t i = 0; i < cf.count; i++) { + uint32_t alu_off = (cf.address + i); + int sync = sequence & 0x2; + if (sequence & 0x1) { + const instr_fetch_t* fetch = + (const instr_fetch_t*)(dwords_ + alu_off * 3); + switch (fetch->opc) { + case VTX_FETCH: + if (TranslateVertexFetch(&fetch->vtx, sync)) { + return 1; + } + break; + case TEX_FETCH: + if (TranslateTextureFetch(&fetch->tex, sync)) { + return 1; + } + break; + case TEX_GET_BORDER_COLOR_FRAC: + case TEX_GET_COMP_TEX_LOD: + case TEX_GET_GRADIENTS: + case TEX_GET_WEIGHTS: + case TEX_SET_TEX_LOD: + case TEX_SET_GRADIENTS_H: + case TEX_SET_GRADIENTS_V: + default: + XEASSERTALWAYS(); + break; + } + } else { + const instr_alu_t* alu = + (const instr_alu_t*)(dwords_ + alu_off * 3); + if (TranslateALU(alu, sync)) { + return 1; + } + } + sequence >>= 2; + } + + return 0; +} + +int D3D11ShaderTranslator::TranslateVertexFetch(const instr_fetch_vtx_t* vtx, + int sync) { + static const struct { + const char *name; + } fetch_types[0xff] = { + #define TYPE(id) { #id } + TYPE(FMT_1_REVERSE), // 0 + {0}, + TYPE(FMT_8), // 2 + {0}, + {0}, + {0}, + TYPE(FMT_8_8_8_8), // 6 + TYPE(FMT_2_10_10_10), // 7 + {0}, + {0}, + TYPE(FMT_8_8), // 10 + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + TYPE(FMT_16), // 24 + TYPE(FMT_16_16), // 25 + TYPE(FMT_16_16_16_16), // 26 + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + TYPE(FMT_32), // 33 + TYPE(FMT_32_32), // 34 + TYPE(FMT_32_32_32_32), // 35 + TYPE(FMT_32_FLOAT), // 36 + TYPE(FMT_32_32_FLOAT), // 37 + TYPE(FMT_32_32_32_32_FLOAT), // 38 + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + TYPE(FMT_32_32_32_FLOAT), // 57 + #undef TYPE + }; + + // Disassemble. + append(" // %sFETCH:\t", sync ? "(S)" : " "); + if (vtx->pred_select) { + append(vtx->pred_condition ? "EQ" : "NE"); + } + PrintDestFecth(vtx->dst_reg, vtx->dst_swiz); + append(" = R%u.", vtx->src_reg); + append("%c", chan_names[vtx->src_swiz & 0x3]); + if (fetch_types[vtx->format].name) { + append(" %s", fetch_types[vtx->format].name); + } else { + append(" TYPE(0x%x)", vtx->format); + } + append(" %s", vtx->format_comp_all ? "SIGNED" : "UNSIGNED"); + if (!vtx->num_format_all) { + append(" NORMALIZED"); + } + append(" STRIDE(%u)", vtx->stride); + if (vtx->offset) { + append(" OFFSET(%u)", vtx->offset); + } + append(" CONST(%u, %u)", vtx->const_index, vtx->const_index_sel); + if (1) { + // XXX + append(" src_reg_am=%u", vtx->src_reg_am); + append(" dst_reg_am=%u", vtx->dst_reg_am); + append(" num_format_all=%u", vtx->num_format_all); + append(" signed_rf_mode_all=%u", vtx->signed_rf_mode_all); + append(" exp_adjust_all=%u", vtx->exp_adjust_all); + } + append("\n"); + + // Translate. + append(" "); + append("r%u.xyzw", vtx->dst_reg); + append(" = float4("); + uint32_t fetch_slot = vtx->const_index * 3 + vtx->const_index_sel; + // TODO(benvanik): detect xyzw = xyzw, etc. 
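+  // Each 3-bit dst_swiz channel picks a source component (0-3) or a special
+  // value: 4 = 0.0, 5 = 1.0, 6 = undefined, 7 = keep the register's previous
+  // component. A two-component fetch swizzled xy01 expands to, illustratively:
+  //   r1.xyzw = float4(i.vf95_0.x, i.vf95_0.y, 0.0, 1.0);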
+ // TODO(benvanik): detect and set as rN = float4(samp.xyz, 1.0); / etc + uint32_t component_count = GetFormatComponentCount(vtx->format); + uint32_t dst_swiz = vtx->dst_swiz; + for (int i = 0; i < 4; i++) { + if ((dst_swiz & 0x7) == 4) { + append("0.0"); + } else if ((dst_swiz & 0x7) == 5) { + append("1.0"); + } else if ((dst_swiz & 0x7) == 6) { + // ? + append("?"); + } else if ((dst_swiz & 0x7) == 7) { + append("r%u.%c", vtx->dst_reg, chan_names[i]); + } else { + append("i.vf%u_%d.%c", + fetch_slot, vtx->offset, + chan_names[dst_swiz & 0x3]); + } + if (i < 3) { + append(", "); + } + dst_swiz >>= 3; + } + append(");\n"); + return 0; +} + +int D3D11ShaderTranslator::TranslateTextureFetch(const instr_fetch_tex_t* tex, + int sync) { + // Disassemble. + static const char *filter[] = { + "POINT", // TEX_FILTER_POINT + "LINEAR", // TEX_FILTER_LINEAR + "BASEMAP", // TEX_FILTER_BASEMAP + }; + static const char *aniso_filter[] = { + "DISABLED", // ANISO_FILTER_DISABLED + "MAX_1_1", // ANISO_FILTER_MAX_1_1 + "MAX_2_1", // ANISO_FILTER_MAX_2_1 + "MAX_4_1", // ANISO_FILTER_MAX_4_1 + "MAX_8_1", // ANISO_FILTER_MAX_8_1 + "MAX_16_1", // ANISO_FILTER_MAX_16_1 + }; + static const char *arbitrary_filter[] = { + "2x4_SYM", // ARBITRARY_FILTER_2X4_SYM + "2x4_ASYM", // ARBITRARY_FILTER_2X4_ASYM + "4x2_SYM", // ARBITRARY_FILTER_4X2_SYM + "4x2_ASYM", // ARBITRARY_FILTER_4X2_ASYM + "4x4_SYM", // ARBITRARY_FILTER_4X4_SYM + "4x4_ASYM", // ARBITRARY_FILTER_4X4_ASYM + }; + static const char *sample_loc[] = { + "CENTROID", // SAMPLE_CENTROID + "CENTER", // SAMPLE_CENTER + }; + uint32_t src_swiz = tex->src_swiz; + append(" // %sFETCH:\t", sync ? "(S)" : " "); + if (tex->pred_select) { + append(tex->pred_condition ? "EQ" : "NE"); + } + PrintDestFecth(tex->dst_reg, tex->dst_swiz); + append(" = R%u.", tex->src_reg); + for (int i = 0; i < 3; i++) { + append("%c", chan_names[src_swiz & 0x3]); + src_swiz >>= 2; + } + append(" CONST(%u)", tex->const_idx); + if (tex->fetch_valid_only) { + append(" VALID_ONLY"); + } + if (tex->tx_coord_denorm) { + append(" DENORM"); + } + if (tex->mag_filter != TEX_FILTER_USE_FETCH_CONST) { + append(" MAG(%s)", filter[tex->mag_filter]); + } + if (tex->min_filter != TEX_FILTER_USE_FETCH_CONST) { + append(" MIN(%s)", filter[tex->min_filter]); + } + if (tex->mip_filter != TEX_FILTER_USE_FETCH_CONST) { + append(" MIP(%s)", filter[tex->mip_filter]); + } + if (tex->aniso_filter != ANISO_FILTER_USE_FETCH_CONST) { + append(" ANISO(%s)", aniso_filter[tex->aniso_filter]); + } + if (tex->arbitrary_filter != ARBITRARY_FILTER_USE_FETCH_CONST) { + append(" ARBITRARY(%s)", arbitrary_filter[tex->arbitrary_filter]); + } + if (tex->vol_mag_filter != TEX_FILTER_USE_FETCH_CONST) { + append(" VOL_MAG(%s)", filter[tex->vol_mag_filter]); + } + if (tex->vol_min_filter != TEX_FILTER_USE_FETCH_CONST) { + append(" VOL_MIN(%s)", filter[tex->vol_min_filter]); + } + if (!tex->use_comp_lod) { + append(" LOD(%u)", tex->use_comp_lod); + append(" LOD_BIAS(%u)", tex->lod_bias); + } + if (tex->use_reg_lod) { + append(" REG_LOD(%u)", tex->use_reg_lod); + } + if (tex->use_reg_gradients) { + append(" USE_REG_GRADIENTS"); + } + append(" LOCATION(%s)", sample_loc[tex->sample_location]); + if (tex->offset_x || tex->offset_y || tex->offset_z) { + append(" OFFSET(%u,%u,%u)", tex->offset_x, tex->offset_y, tex->offset_z); + } + append("\n"); + + int src_component_count = 0; + switch (tex->dimension) { + case DIMENSION_1D: + src_component_count = 1; + break; + default: + case DIMENSION_2D: + src_component_count = 2; + break; + case 
DIMENSION_3D: + src_component_count = 3; + break; + case DIMENSION_CUBE: + src_component_count = 3; + break; + } + + // Translate. + append(" "); + append("r%u.xyzw", tex->dst_reg); + append(" = "); + append( + "x_texture_%d.Sample(x_sampler_%d, r%u.", + tex->const_idx, + tex_fetch_index_++, // hacky way to line up to tex buffers + tex->src_reg); + src_swiz = tex->src_swiz; + for (int i = 0; i < src_component_count; i++) { + append("%c", chan_names[src_swiz & 0x3]); + src_swiz >>= 2; + } + append(")."); + + // Pass one over dest does xyzw and fakes the special values. + // TODO(benvanik): detect and set as rN = float4(samp.xyz, 1.0); / etc + uint32_t dst_swiz = tex->dst_swiz; + for (int i = 0; i < 4; i++) { + append("%c", chan_names[dst_swiz & 0x3]); + dst_swiz >>= 3; + } + append(";\n"); + // Do another pass to set constant values. + dst_swiz = tex->dst_swiz; + for (int i = 0; i < 4; i++) { + if ((dst_swiz & 0x7) == 4) { + append(" r%u.%c = 0.0;\n", tex->dst_reg, chan_names[i]); + } else if ((dst_swiz & 0x7) == 5) { + append(" r%u.%c = 1.0;\n", tex->dst_reg, chan_names[i]); + } + dst_swiz >>= 3; + } + return 0; +} diff --git a/src/xenia/gpu/d3d11/d3d11_shader_translator.h b/src/xenia/gpu/d3d11/d3d11_shader_translator.h new file mode 100644 index 000000000..ad85c7775 --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_shader_translator.h @@ -0,0 +1,125 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_D3D11_D3D11_SHADER_TRANSLATOR_H_ +#define XENIA_GPU_D3D11_D3D11_SHADER_TRANSLATOR_H_ + +#include +#include + +#include + + +namespace xe { +namespace gpu { +namespace d3d11 { + + +class D3D11ShaderTranslator { +public: + const static uint32_t kMaxInterpolators = 16; + + D3D11ShaderTranslator(); + + int TranslateVertexShader(VertexShaderResource* vertex_shader, + const xenos::xe_gpu_program_cntl_t& program_cntl); + int TranslatePixelShader( + PixelShaderResource* pixel_shader, + const xenos::xe_gpu_program_cntl_t& program_cntl, + const VertexShaderResource::AllocCounts& alloc_counts); + + const char* translated_src() const { return buffer_; } + +private: + xenos::XE_GPU_SHADER_TYPE type_; + uint32_t tex_fetch_index_; + const uint32_t* dwords_; + + static const int kCapacity = 64 * 1024; + char buffer_[kCapacity]; + size_t capacity_; + size_t offset_; + void append(const char* format, ...) 
{ + va_list args; + va_start(args, format); + int len = xevsnprintfa(buffer_ + offset_, capacity_ - offset_, + format, args); + va_end(args); + offset_ += len; + buffer_[offset_] = 0; + } + + void AppendTextureHeader( + const ShaderResource::SamplerInputs& sampler_inputs); + + void AppendSrcReg(uint32_t num, uint32_t type, uint32_t swiz, uint32_t negate, + uint32_t abs); + void AppendDestRegName(uint32_t num, uint32_t dst_exp); + void AppendDestReg(uint32_t num, uint32_t mask, uint32_t dst_exp); + void AppendDestRegPost(uint32_t num, uint32_t mask, uint32_t dst_exp); + void PrintSrcReg(uint32_t num, uint32_t type, uint32_t swiz, uint32_t negate, + uint32_t abs); + void PrintDstReg(uint32_t num, uint32_t mask, uint32_t dst_exp); + void PrintExportComment(uint32_t num); + + int TranslateALU(const xenos::instr_alu_t* alu, int sync); + int TranslateALU_ADDv(const xenos::instr_alu_t& alu); + int TranslateALU_MULv(const xenos::instr_alu_t& alu); + int TranslateALU_MAXv(const xenos::instr_alu_t& alu); + int TranslateALU_MINv(const xenos::instr_alu_t& alu); + int TranslateALU_SETXXv(const xenos::instr_alu_t& alu, const char* op); + int TranslateALU_SETEv(const xenos::instr_alu_t& alu); + int TranslateALU_SETGTv(const xenos::instr_alu_t& alu); + int TranslateALU_SETGTEv(const xenos::instr_alu_t& alu); + int TranslateALU_SETNEv(const xenos::instr_alu_t& alu); + int TranslateALU_FRACv(const xenos::instr_alu_t& alu); + int TranslateALU_TRUNCv(const xenos::instr_alu_t& alu); + int TranslateALU_FLOORv(const xenos::instr_alu_t& alu); + int TranslateALU_MULADDv(const xenos::instr_alu_t& alu); + int TranslateALU_CNDXXv(const xenos::instr_alu_t& alu, const char* op); + int TranslateALU_CNDEv(const xenos::instr_alu_t& alu); + int TranslateALU_CNDGTEv(const xenos::instr_alu_t& alu); + int TranslateALU_CNDGTv(const xenos::instr_alu_t& alu); + int TranslateALU_DOT4v(const xenos::instr_alu_t& alu); + int TranslateALU_DOT3v(const xenos::instr_alu_t& alu); + int TranslateALU_DOT2ADDv(const xenos::instr_alu_t& alu); + // CUBEv + int TranslateALU_MAX4v(const xenos::instr_alu_t& alu); + // ... 
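+  // Scalar co-issue ops below read their operand from the src3 fields; the
+  // *_CONST_0/1 pairs additionally fold a second register operand into
+  // src3_swiz/src3_sel (see the .cc for the decoding).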
+ int TranslateALU_MAXs(const xenos::instr_alu_t& alu); + int TranslateALU_MINs(const xenos::instr_alu_t& alu); + int TranslateALU_SETXXs(const xenos::instr_alu_t& alu, const char* op); + int TranslateALU_SETEs(const xenos::instr_alu_t& alu); + int TranslateALU_SETGTs(const xenos::instr_alu_t& alu); + int TranslateALU_SETGTEs(const xenos::instr_alu_t& alu); + int TranslateALU_SETNEs(const xenos::instr_alu_t& alu); + int TranslateALU_RECIP_IEEE(const xenos::instr_alu_t& alu); + int TranslateALU_MUL_CONST_0(const xenos::instr_alu_t& alu); + int TranslateALU_MUL_CONST_1(const xenos::instr_alu_t& alu); + int TranslateALU_ADD_CONST_0(const xenos::instr_alu_t& alu); + int TranslateALU_ADD_CONST_1(const xenos::instr_alu_t& alu); + int TranslateALU_SUB_CONST_0(const xenos::instr_alu_t& alu); + int TranslateALU_SUB_CONST_1(const xenos::instr_alu_t& alu); + + void PrintDestFecth(uint32_t dst_reg, uint32_t dst_swiz); + void AppendFetchDest(uint32_t dst_reg, uint32_t dst_swiz); + int GetFormatComponentCount(uint32_t format); + + int TranslateExec(const xenos::instr_cf_exec_t& cf); + int TranslateVertexFetch(const xenos::instr_fetch_vtx_t* vtx, int sync); + int TranslateTextureFetch(const xenos::instr_fetch_tex_t* tex, int sync); +}; + + +} // namespace d3d11 +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_D3D11_D3D11_SHADER_TRANSLATOR_H_ diff --git a/src/xenia/gpu/d3d11/d3d11_texture.cc b/src/xenia/gpu/d3d11/d3d11_texture.cc deleted file mode 100644 index 809a971ac..000000000 --- a/src/xenia/gpu/d3d11/d3d11_texture.cc +++ /dev/null @@ -1,264 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2014 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. * - ****************************************************************************** - */ - -#include - -#include -#include -#include -#include - - -using namespace xe; -using namespace xe::gpu; -using namespace xe::gpu::d3d11; -using namespace xe::gpu::xenos; - - -D3D11Texture::D3D11Texture(D3D11TextureCache* cache, uint32_t address, - const uint8_t* host_address) - : Texture(address, host_address), - cache_(cache) { -} - -D3D11Texture::~D3D11Texture() { -} - -TextureView* D3D11Texture::FetchNew( - const xenos::xe_gpu_texture_fetch_t& fetch) { - D3D11TextureView* view = new D3D11TextureView(); - if (!FillViewInfo(view, fetch)) { - return nullptr; - } - - D3D11_SHADER_RESOURCE_VIEW_DESC srv_desc; - xe_zero_struct(&srv_desc, sizeof(srv_desc)); - // TODO(benvanik): this may need to be typed on the fetch instruction (float/int/etc?) 
- srv_desc.Format = view->format; - - D3D_SRV_DIMENSION dimension = D3D11_SRV_DIMENSION_UNKNOWN; - switch (view->dimensions) { - case DIMENSION_1D: - srv_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE1D; - srv_desc.Texture1D.MipLevels = 1; - srv_desc.Texture1D.MostDetailedMip = 0; - if (!CreateTexture1D(view, fetch)) { - XELOGE("D3D11: failed to fetch Texture1D"); - return nullptr; - } - break; - case DIMENSION_2D: - srv_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D; - srv_desc.Texture2D.MipLevels = 1; - srv_desc.Texture2D.MostDetailedMip = 0; - if (!CreateTexture2D(view, fetch)) { - XELOGE("D3D11: failed to fetch Texture2D"); - return nullptr; - } - break; - case DIMENSION_3D: - srv_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE3D; - srv_desc.Texture3D.MipLevels = 1; - srv_desc.Texture3D.MostDetailedMip = 0; - if (!CreateTexture3D(view, fetch)) { - XELOGE("D3D11: failed to fetch Texture3D"); - return nullptr; - } - break; - case DIMENSION_CUBE: - srv_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURECUBE; - srv_desc.TextureCube.MipLevels = 1; - srv_desc.TextureCube.MostDetailedMip = 0; - if (!CreateTextureCube(view, fetch)) { - XELOGE("D3D11: failed to fetch TextureCube"); - return nullptr; - } - break; - } - - HRESULT hr = cache_->device()->CreateShaderResourceView( - view->resource, &srv_desc, &view->srv); - if (FAILED(hr)) { - XELOGE("D3D11: unable to create texture resource view"); - return nullptr; - } - - return view; -} - -bool D3D11Texture::FetchDirty( - TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch) { - auto d3d_view = static_cast(view); - switch (view->dimensions) { - case DIMENSION_1D: - return FetchTexture1D(d3d_view, fetch); - case DIMENSION_2D: - return FetchTexture2D(d3d_view, fetch); - case DIMENSION_3D: - return FetchTexture3D(d3d_view, fetch); - case DIMENSION_CUBE: - return FetchTextureCube(d3d_view, fetch); - } - return false; -} - -bool D3D11Texture::CreateTexture1D( - D3D11TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch) { - uint32_t width = 1 + fetch.size_1d.width; - - D3D11_TEXTURE1D_DESC texture_desc; - xe_zero_struct(&texture_desc, sizeof(texture_desc)); - texture_desc.Width = width; - texture_desc.MipLevels = 1; - texture_desc.ArraySize = 1; - texture_desc.Format = view->format; - texture_desc.Usage = D3D11_USAGE_DYNAMIC; - texture_desc.BindFlags = D3D11_BIND_SHADER_RESOURCE; - texture_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; - texture_desc.MiscFlags = 0; // D3D11_RESOURCE_MISC_GENERATE_MIPS? - HRESULT hr = cache_->device()->CreateTexture1D( - &texture_desc, NULL, (ID3D11Texture1D**)&view->resource); - if (FAILED(hr)) { - return false; - } - - return FetchTexture1D(view, fetch); -} - -bool D3D11Texture::FetchTexture1D( - D3D11TextureView* view, const xe_gpu_texture_fetch_t& fetch) { - SCOPE_profile_cpu_f("gpu"); - - // TODO(benvanik): upload! 
- XELOGE("D3D11: FetchTexture1D not yet implemented"); - return false; -} - -bool D3D11Texture::CreateTexture2D( - D3D11TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch) { - XEASSERTTRUE(fetch.dimension == 1); - - D3D11_TEXTURE2D_DESC texture_desc; - xe_zero_struct(&texture_desc, sizeof(texture_desc)); - texture_desc.Width = view->sizes_2d.output_width; - texture_desc.Height = view->sizes_2d.output_height; - texture_desc.MipLevels = 1; - texture_desc.ArraySize = 1; - texture_desc.Format = view->format; - texture_desc.SampleDesc.Count = 1; - texture_desc.SampleDesc.Quality = 0; - texture_desc.Usage = D3D11_USAGE_DYNAMIC; - texture_desc.BindFlags = D3D11_BIND_SHADER_RESOURCE; - texture_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; - texture_desc.MiscFlags = 0; // D3D11_RESOURCE_MISC_GENERATE_MIPS? - HRESULT hr = cache_->device()->CreateTexture2D( - &texture_desc, NULL, (ID3D11Texture2D**)&view->resource); - if (FAILED(hr)) { - return false; - } - - return FetchTexture2D(view, fetch); -} - -bool D3D11Texture::FetchTexture2D( - D3D11TextureView* view, const xe_gpu_texture_fetch_t& fetch) { - SCOPE_profile_cpu_f("gpu"); - - XEASSERTTRUE(fetch.dimension == 1); - - auto sizes = GetTextureSizes2D(view); - - // TODO(benvanik): all mip levels. - D3D11_MAPPED_SUBRESOURCE res; - HRESULT hr = cache_->context()->Map(view->resource, 0, - D3D11_MAP_WRITE_DISCARD, 0, &res); - if (FAILED(hr)) { - XELOGE("D3D11: failed to map texture"); - return false; - } - - const uint8_t* src = cache_->memory()->Translate(address_); - uint8_t* dest = (uint8_t*)res.pData; - - //memset(dest, 0, output_pitch * (output_height / view->block_size)); // TODO(gibbed): remove me later - - uint32_t output_pitch = res.RowPitch; // (output_width / info.block_size) * info.texel_pitch; - if (!fetch.tiled) { - dest = (uint8_t*)res.pData; - for (uint32_t y = 0; y < sizes.block_height; y++) { - for (uint32_t x = 0; x < sizes.logical_pitch; x += view->texel_pitch) { - TextureSwap(dest + x, src + x, view->texel_pitch, (XE_GPU_ENDIAN)fetch.endianness); - } - src += sizes.input_pitch; - dest += output_pitch; - } - } else { - auto bpp = (view->texel_pitch >> 2) + ((view->texel_pitch >> 1) >> (view->texel_pitch >> 2)); - for (uint32_t y = 0, output_base_offset = 0; - y < sizes.block_height; - y++, output_base_offset += output_pitch) { - auto input_base_offset = TiledOffset2DOuter(y, (sizes.input_width / view->block_size), bpp); - for (uint32_t x = 0, output_offset = output_base_offset; - x < sizes.block_width; - x++, output_offset += view->texel_pitch) { - auto input_offset = TiledOffset2DInner(x, y, bpp, input_base_offset) >> bpp; - TextureSwap(dest + output_offset, - src + input_offset * view->texel_pitch, - view->texel_pitch, (XE_GPU_ENDIAN)fetch.endianness); - } - } - } - cache_->context()->Unmap(view->resource, 0); - return true; -} - -bool D3D11Texture::CreateTexture3D( - D3D11TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch) { - XELOGE("D3D11: CreateTexture3D not yet implemented"); - XEASSERTALWAYS(); - return false; -} - -bool D3D11Texture::FetchTexture3D( - D3D11TextureView* view, const xe_gpu_texture_fetch_t& fetch) { - SCOPE_profile_cpu_f("gpu"); - - XELOGE("D3D11: FetchTexture3D not yet implemented"); - XEASSERTALWAYS(); - return false; - //D3D11_TEXTURE3D_DESC texture_desc; - //xe_zero_struct(&texture_desc, sizeof(texture_desc)); - //texture_desc.Width; - //texture_desc.Height; - //texture_desc.Depth; - //texture_desc.MipLevels; - //texture_desc.Format; - //texture_desc.Usage; - //texture_desc.BindFlags; - 
//texture_desc.CPUAccessFlags; - //texture_desc.MiscFlags; - //hr = device_->CreateTexture3D( - // &texture_desc, &initial_data, (ID3D11Texture3D**)&view->resource); -} - -bool D3D11Texture::CreateTextureCube( - D3D11TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch) { - XELOGE("D3D11: CreateTextureCube not yet implemented"); - XEASSERTALWAYS(); - return false; -} - -bool D3D11Texture::FetchTextureCube( - D3D11TextureView* view, const xe_gpu_texture_fetch_t& fetch) { - SCOPE_profile_cpu_f("gpu"); - - XELOGE("D3D11: FetchTextureCube not yet implemented"); - XEASSERTALWAYS(); - return false; -} diff --git a/src/xenia/gpu/d3d11/d3d11_texture.h b/src/xenia/gpu/d3d11/d3d11_texture.h deleted file mode 100644 index a8ee91662..000000000 --- a/src/xenia/gpu/d3d11/d3d11_texture.h +++ /dev/null @@ -1,78 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2014 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. * - ****************************************************************************** - */ - -#ifndef XENIA_GPU_D3D11_D3D11_TEXTURE_H_ -#define XENIA_GPU_D3D11_D3D11_TEXTURE_H_ - -#include - -#include - -#include - - -namespace xe { -namespace gpu { -namespace d3d11 { - -class D3D11TextureCache; - - -struct D3D11TextureView : TextureView { - ID3D11Resource* resource; - ID3D11ShaderResourceView* srv; - - D3D11TextureView() - : resource(nullptr), srv(nullptr) {} - virtual ~D3D11TextureView() { - XESAFERELEASE(srv); - XESAFERELEASE(resource); - } -}; - - -class D3D11Texture : public Texture { -public: - D3D11Texture(D3D11TextureCache* cache, uint32_t address, - const uint8_t* host_address); - virtual ~D3D11Texture(); - -protected: - TextureView* FetchNew( - const xenos::xe_gpu_texture_fetch_t& fetch) override; - bool FetchDirty( - TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch) override; - - bool CreateTexture1D( - D3D11TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch); - bool FetchTexture1D( - D3D11TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch); - bool CreateTexture2D( - D3D11TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch); - bool FetchTexture2D( - D3D11TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch); - bool CreateTexture3D( - D3D11TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch); - bool FetchTexture3D( - D3D11TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch); - bool CreateTextureCube( - D3D11TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch); - bool FetchTextureCube( - D3D11TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch); - - D3D11TextureCache* cache_; -}; - - -} // namespace d3d11 -} // namespace gpu -} // namespace xe - - -#endif // XENIA_GPU_D3D11_D3D11_TEXTURE_H_ diff --git a/src/xenia/gpu/d3d11/d3d11_texture_cache.h b/src/xenia/gpu/d3d11/d3d11_texture_cache.h deleted file mode 100644 index 63f275d02..000000000 --- a/src/xenia/gpu/d3d11/d3d11_texture_cache.h +++ /dev/null @@ -1,61 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2014 Ben Vanik. All rights reserved. 
* - * Released under the BSD license - see LICENSE in the root for more details. * - ****************************************************************************** - */ - -#ifndef XENIA_GPU_D3D11_D3D11_TEXTURE_CACHE_H_ -#define XENIA_GPU_D3D11_D3D11_TEXTURE_CACHE_H_ - -#include - -#include -#include -#include - -#include - - -namespace xe { -namespace gpu { -namespace d3d11 { - - -class D3D11TextureCache : public TextureCache { -public: - D3D11TextureCache(Memory* memory, - ID3D11DeviceContext* context, ID3D11Device* device); - virtual ~D3D11TextureCache(); - - ID3D11DeviceContext* context() const { return context_; } - ID3D11Device* device() const { return device_; } - - ID3D11SamplerState* GetSamplerState( - const xenos::xe_gpu_texture_fetch_t& fetch, - const Shader::tex_buffer_desc_t& desc); - -protected: - Texture* CreateTexture(uint32_t address, const uint8_t* host_address, - const xenos::xe_gpu_texture_fetch_t& fetch) override; - -private: - ID3D11DeviceContext* context_; - ID3D11Device* device_; - - struct CachedSamplerState { - D3D11_SAMPLER_DESC desc; - ID3D11SamplerState* state; - }; - std::unordered_multimap samplers_; -}; - - -} // namespace d3d11 -} // namespace gpu -} // namespace xe - - -#endif // XENIA_GPU_D3D11_D3D11_TEXTURE_CACHE_H_ diff --git a/src/xenia/gpu/d3d11/d3d11_texture_resource.cc b/src/xenia/gpu/d3d11/d3d11_texture_resource.cc new file mode 100644 index 000000000..a90c60b0d --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_texture_resource.cc @@ -0,0 +1,219 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include +#include + + +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::d3d11; +using namespace xe::gpu::xenos; + + +D3D11TextureResource::D3D11TextureResource( + D3D11ResourceCache* resource_cache, + const MemoryRange& memory_range, + const Info& info) + : TextureResource(memory_range, info), + resource_cache_(resource_cache), + texture_(nullptr), + handle_(nullptr) { +} + +D3D11TextureResource::~D3D11TextureResource() { + XESAFERELEASE(texture_); + XESAFERELEASE(handle_); +} + +int D3D11TextureResource::CreateHandle() { + SCOPE_profile_cpu_f("gpu"); + + D3D11_SHADER_RESOURCE_VIEW_DESC srv_desc; + xe_zero_struct(&srv_desc, sizeof(srv_desc)); + // TODO(benvanik): this may need to be typed on the fetch instruction (float/int/etc?) 
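// (A fetch-typed variant would pick between sibling DXGI formats, for
// example DXGI_FORMAT_R16G16B16A16_FLOAT vs _UNORM vs _UINT for the same
// 64bpp layout; a sketch only, using a hypothetical fetch-derived flag:
//   srv_desc.Format = fetch_wants_float  // assumed flag, not a real field
//       ? DXGI_FORMAT_R16G16B16A16_FLOAT
//       : DXGI_FORMAT_R16G16B16A16_UINT;
// Until then the format picked by TextureResource::Info::Prepare is used
// as-is.)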
+ srv_desc.Format = info_.format; + + D3D_SRV_DIMENSION dimension = D3D11_SRV_DIMENSION_UNKNOWN; + switch (info_.dimension) { + case TEXTURE_DIMENSION_1D: + srv_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE1D; + srv_desc.Texture1D.MipLevels = 1; + srv_desc.Texture1D.MostDetailedMip = 0; + if (CreateHandle1D()) { + XELOGE("D3D11: failed to create Texture1D"); + return 1; + } + break; + case TEXTURE_DIMENSION_2D: + srv_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D; + srv_desc.Texture2D.MipLevels = 1; + srv_desc.Texture2D.MostDetailedMip = 0; + if (CreateHandle2D()) { + XELOGE("D3D11: failed to create Texture2D"); + return 1; + } + break; + case TEXTURE_DIMENSION_3D: + srv_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE3D; + srv_desc.Texture3D.MipLevels = 1; + srv_desc.Texture3D.MostDetailedMip = 0; + if (CreateHandle3D()) { + XELOGE("D3D11: failed to create Texture3D"); + return 1; + } + break; + case TEXTURE_DIMENSION_CUBE: + srv_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURECUBE; + srv_desc.TextureCube.MipLevels = 1; + srv_desc.TextureCube.MostDetailedMip = 0; + if (CreateHandleCube()) { + XELOGE("D3D11: failed to create TextureCube"); + return 1; + } + break; + } + + HRESULT hr = resource_cache_->device()->CreateShaderResourceView( + texture_, &srv_desc, &handle_); + if (FAILED(hr)) { + XELOGE("D3D11: unable to create texture resource view"); + return 1; + } + return 0; +} + +int D3D11TextureResource::CreateHandle1D() { + uint32_t width = 1 + info_.size_1d.width; + + D3D11_TEXTURE1D_DESC texture_desc; + xe_zero_struct(&texture_desc, sizeof(texture_desc)); + texture_desc.Width = width; + texture_desc.MipLevels = 1; + texture_desc.ArraySize = 1; + texture_desc.Format = info_.format; + texture_desc.Usage = D3D11_USAGE_DYNAMIC; + texture_desc.BindFlags = D3D11_BIND_SHADER_RESOURCE; + texture_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; + texture_desc.MiscFlags = 0; // D3D11_RESOURCE_MISC_GENERATE_MIPS? + HRESULT hr = resource_cache_->device()->CreateTexture1D( + &texture_desc, NULL, (ID3D11Texture1D**)&texture_); + if (FAILED(hr)) { + return 1; + } + return 0; +} + +int D3D11TextureResource::CreateHandle2D() { + D3D11_TEXTURE2D_DESC texture_desc; + xe_zero_struct(&texture_desc, sizeof(texture_desc)); + texture_desc.Width = info_.size_2d.output_width; + texture_desc.Height = info_.size_2d.output_height; + texture_desc.MipLevels = 1; + texture_desc.ArraySize = 1; + texture_desc.Format = info_.format; + texture_desc.SampleDesc.Count = 1; + texture_desc.SampleDesc.Quality = 0; + texture_desc.Usage = D3D11_USAGE_DYNAMIC; + texture_desc.BindFlags = D3D11_BIND_SHADER_RESOURCE; + texture_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; + texture_desc.MiscFlags = 0; // D3D11_RESOURCE_MISC_GENERATE_MIPS? 
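// D3D11_USAGE_DYNAMIC with D3D11_CPU_ACCESS_WRITE is what lets
// InvalidateRegion2D below stream texels in through Map(WRITE_DISCARD);
// the minimal shape of that upload, assuming a row-by-row copy:
//   D3D11_MAPPED_SUBRESOURCE res;
//   if (SUCCEEDED(context->Map(texture, 0, D3D11_MAP_WRITE_DISCARD, 0,
//                              &res))) {
//     for (uint32_t y = 0; y < rows; ++y) {
//       memcpy((uint8_t*)res.pData + y * res.RowPitch,
//              src + y * src_pitch, row_bytes);
//     }
//     context->Unmap(texture, 0);
//   }
// On the tiled path the source offset comes from TiledOffset2DOuter/Inner
// instead, and the bpp value fed to them,
//   (texel_pitch >> 2) + ((texel_pitch >> 1) >> (texel_pitch >> 2)),
// is a branchless log2 for the power-of-two pitches in use:
//   1 -> 0, 2 -> 1, 4 -> 2, 8 -> 3, 16 -> 4.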
+ HRESULT hr = resource_cache_->device()->CreateTexture2D( + &texture_desc, NULL, (ID3D11Texture2D**)&texture_); + if (FAILED(hr)) { + return 1; + } + return 0; +} + +int D3D11TextureResource::CreateHandle3D() { + XELOGE("D3D11: CreateTexture3D not yet implemented"); + XEASSERTALWAYS(); + return 1; +} + +int D3D11TextureResource::CreateHandleCube() { + XELOGE("D3D11: CreateTextureCube not yet implemented"); + XEASSERTALWAYS(); + return 1; +} + +int D3D11TextureResource::InvalidateRegion(const MemoryRange& memory_range) { + SCOPE_profile_cpu_f("gpu"); + + switch (info_.dimension) { + case TEXTURE_DIMENSION_1D: + return InvalidateRegion1D(memory_range); + case TEXTURE_DIMENSION_2D: + return InvalidateRegion2D(memory_range); + case TEXTURE_DIMENSION_3D: + return InvalidateRegion3D(memory_range); + case TEXTURE_DIMENSION_CUBE: + return InvalidateRegionCube(memory_range); + } + return 1; +} + +int D3D11TextureResource::InvalidateRegion1D(const MemoryRange& memory_range) { + return 1; +} + +int D3D11TextureResource::InvalidateRegion2D(const MemoryRange& memory_range) { + // TODO(benvanik): all mip levels. + D3D11_MAPPED_SUBRESOURCE res; + HRESULT hr = resource_cache_->context()->Map( + texture_, 0, D3D11_MAP_WRITE_DISCARD, 0, &res); + if (FAILED(hr)) { + XELOGE("D3D11: failed to map texture"); + return 1; + } + + const uint8_t* src = memory_range_.host_base; + uint8_t* dest = (uint8_t*)res.pData; + + uint32_t output_pitch = res.RowPitch; // (output_width / info.block_size) * info.texel_pitch; + if (!info_.is_tiled) { + dest = (uint8_t*)res.pData; + for (uint32_t y = 0; y < info_.size_2d.block_height; y++) { + for (uint32_t x = 0; x < info_.size_2d.logical_pitch; x += info_.texel_pitch) { + TextureSwap(dest + x, src + x, info_.texel_pitch); + } + src += info_.size_2d.input_pitch; + dest += output_pitch; + } + } else { + auto bpp = (info_.texel_pitch >> 2) + ((info_.texel_pitch >> 1) >> (info_.texel_pitch >> 2)); + for (uint32_t y = 0, output_base_offset = 0; + y < info_.size_2d.block_height; + y++, output_base_offset += output_pitch) { + auto input_base_offset = TiledOffset2DOuter(y, (info_.size_2d.input_width / info_.block_size), bpp); + for (uint32_t x = 0, output_offset = output_base_offset; + x < info_.size_2d.block_width; + x++, output_offset += info_.texel_pitch) { + auto input_offset = TiledOffset2DInner(x, y, bpp, input_base_offset) >> bpp; + TextureSwap(dest + output_offset, + src + input_offset * info_.texel_pitch, + info_.texel_pitch); + } + } + } + resource_cache_->context()->Unmap(texture_, 0); + return 0; +} + +int D3D11TextureResource::InvalidateRegion3D(const MemoryRange& memory_range) { + return 1; +} + +int D3D11TextureResource::InvalidateRegionCube( + const MemoryRange& memory_range) { + return 1; +} diff --git a/src/xenia/gpu/d3d11/d3d11_texture_resource.h b/src/xenia/gpu/d3d11/d3d11_texture_resource.h new file mode 100644 index 000000000..4e59662a4 --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_texture_resource.h @@ -0,0 +1,60 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_GPU_D3D11_D3D11_TEXTURE_RESOURCE_H_ +#define XENIA_GPU_D3D11_D3D11_TEXTURE_RESOURCE_H_ + +#include +#include + +#include + + +namespace xe { +namespace gpu { +namespace d3d11 { + +class D3D11ResourceCache; + + +class D3D11TextureResource : public TextureResource { +public: + D3D11TextureResource(D3D11ResourceCache* resource_cache, + const MemoryRange& memory_range, + const Info& info); + ~D3D11TextureResource() override; + + void* handle() const override { return handle_; } + +protected: + int CreateHandle() override; + int CreateHandle1D(); + int CreateHandle2D(); + int CreateHandle3D(); + int CreateHandleCube(); + + int InvalidateRegion(const MemoryRange& memory_range) override; + int InvalidateRegion1D(const MemoryRange& memory_range); + int InvalidateRegion2D(const MemoryRange& memory_range); + int InvalidateRegion3D(const MemoryRange& memory_range); + int InvalidateRegionCube(const MemoryRange& memory_range); + +private: + D3D11ResourceCache* resource_cache_; + ID3D11Resource* texture_; + ID3D11ShaderResourceView* handle_; +}; + + +} // namespace d3d11 +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_D3D11_D3D11_TEXTURE_RESOURCE_H_ diff --git a/src/xenia/gpu/d3d11/sources.gypi b/src/xenia/gpu/d3d11/sources.gypi index 6dc7ae242..b6b6d76c1 100644 --- a/src/xenia/gpu/d3d11/sources.gypi +++ b/src/xenia/gpu/d3d11/sources.gypi @@ -1,10 +1,8 @@ # Copyright 2013 Ben Vanik. All Rights Reserved. { 'sources': [ - 'd3d11_buffer.cc', - 'd3d11_buffer.h', - 'd3d11_buffer_cache.cc', - 'd3d11_buffer_cache.h', + 'd3d11_buffer_resource.cc', + 'd3d11_buffer_resource.h', 'd3d11_geometry_shader.cc', 'd3d11_geometry_shader.h', 'd3d11_gpu-private.h', @@ -16,14 +14,16 @@ 'd3d11_graphics_system.h', 'd3d11_profiler_display.cc', 'd3d11_profiler_display.h', - 'd3d11_shader.cc', - 'd3d11_shader.h', - 'd3d11_shader_cache.cc', - 'd3d11_shader_cache.h', - 'd3d11_texture.cc', - 'd3d11_texture.h', - 'd3d11_texture_cache.cc', - 'd3d11_texture_cache.h', + 'd3d11_resource_cache.cc', + 'd3d11_resource_cache.h', + 'd3d11_sampler_state_resource.cc', + 'd3d11_sampler_state_resource.h', + 'd3d11_shader_resource.cc', + 'd3d11_shader_resource.h', + 'd3d11_shader_translator.cc', + 'd3d11_shader_translator.h', + 'd3d11_texture_resource.cc', + 'd3d11_texture_resource.h', 'd3d11_window.cc', 'd3d11_window.h', ], diff --git a/src/xenia/gpu/xenos/registers.cc b/src/xenia/gpu/draw_command.cc similarity index 56% rename from src/xenia/gpu/xenos/registers.cc rename to src/xenia/gpu/draw_command.cc index 5d4e99106..468c4ed08 100644 --- a/src/xenia/gpu/xenos/registers.cc +++ b/src/xenia/gpu/draw_command.cc @@ -1,27 +1,17 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. 
* - ****************************************************************************** - */ - -#include - - -using namespace xe; -using namespace xe::gpu; -using namespace xe::gpu::xenos; - - -const char* xe::gpu::xenos::GetRegisterName(uint32_t index) { - switch (index) { -#define XE_GPU_REGISTER(index, type, name) \ - case index: return #name; -#include -#undef XE_GPU_REGISTER - default: - return NULL; - } -} +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + + +using namespace std; +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::xenos; + diff --git a/src/xenia/gpu/draw_command.h b/src/xenia/gpu/draw_command.h new file mode 100644 index 000000000..ac5b07fe6 --- /dev/null +++ b/src/xenia/gpu/draw_command.h @@ -0,0 +1,78 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_DRAW_COMMAND_H_ +#define XENIA_GPU_DRAW_COMMAND_H_ + +#include +#include +#include +#include +#include +#include + + +namespace xe { +namespace gpu { + + +// TODO(benvanik): move more of the enums in here? +struct DrawCommand { + xenos::XE_GPU_PRIMITIVE_TYPE prim_type; + uint32_t start_index; + uint32_t index_count; + uint32_t base_vertex; + + VertexShaderResource* vertex_shader; + PixelShaderResource* pixel_shader; + + // TODO(benvanik): dirty tracking/max ranges/etc. + struct { + float* values; + size_t count; + } float4_constants; + struct { + uint32_t* values; + size_t count; + } loop_constants; + struct { + uint32_t* values; + size_t count; + } bool_constants; + + // Index buffer, if present. If index_count > 0 then auto draw. + IndexBufferResource* index_buffer; + + // Vertex buffers. + struct { + uint32_t input_index; + VertexBufferResource* buffer; + uint32_t stride; + uint32_t offset; + } vertex_buffers[96]; + size_t vertex_buffer_count; + + // Texture samplers. 
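// (Filled in by PopulateSamplers(); input_index is the shader's sampler
// binding slot. A backend walks these roughly like the sketch below, where
// BindTexture stands in for whatever the backend's actual bind call is:
//   for (size_t i = 0; i < cmd.pixel_shader_sampler_count; ++i) {
//     const auto& s = cmd.pixel_shader_samplers[i];
//     if (s.texture) {
//       BindTexture(s.input_index, s.texture, s.sampler_state);
//     }
//   }
// )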
+ struct SamplerInput { + uint32_t input_index; + TextureResource* texture; + SamplerStateResource* sampler_state; + }; + SamplerInput vertex_shader_samplers[32]; + size_t vertex_shader_sampler_count; + SamplerInput pixel_shader_samplers[32]; + size_t pixel_shader_sampler_count; +}; + + +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_DRAW_COMMAND_H_ diff --git a/src/xenia/gpu/graphics_driver.cc b/src/xenia/gpu/graphics_driver.cc index 65dddea49..e398839b8 100644 --- a/src/xenia/gpu/graphics_driver.cc +++ b/src/xenia/gpu/graphics_driver.cc @@ -12,12 +12,300 @@ using namespace xe; using namespace xe::gpu; +using namespace xe::gpu::xenos; GraphicsDriver::GraphicsDriver(Memory* memory) : - memory_(memory), address_translation_(0) { - memset(®ister_file_, 0, sizeof(register_file_)); + memory_(memory), address_translation_(0) { } GraphicsDriver::~GraphicsDriver() { } + +int GraphicsDriver::LoadShader(XE_GPU_SHADER_TYPE type, + uint32_t address, uint32_t length, + uint32_t start) { + MemoryRange memory_range( + memory_->Translate(address), + address, length); + + ShaderResource* shader = nullptr; + if (type == XE_GPU_SHADER_TYPE_VERTEX) { + VertexShaderResource::Info info; + shader = vertex_shader_ = resource_cache()->FetchVertexShader(memory_range, + info); + if (!vertex_shader_) { + XELOGE("Unable to fetch vertex shader"); + return 1; + } + } else { + PixelShaderResource::Info info; + shader = pixel_shader_ = resource_cache()->FetchPixelShader(memory_range, + info); + if (!pixel_shader_) { + XELOGE("Unable to fetch pixel shader"); + return 1; + } + } + + if (!shader->is_prepared()) { + // Disassemble. + const char* source = shader->disasm_src(); + XELOGGPU("Set shader %d at %0.8X (%db):\n%s", + type, address, length, + source ? source : ""); + } + + return 0; +} + +int GraphicsDriver::PrepareDraw(DrawCommand& command) { + SCOPE_profile_cpu_f("gpu"); + + // Ignore copies for now. + uint32_t enable_mode = register_file_[XE_GPU_REG_RB_MODECONTROL].u32 & 0x7; + if (enable_mode != 4) { + XELOGW("GPU: ignoring draw with enable mode %d", enable_mode); + return 1; + } + + // Reset the things we don't modify so that we have clean state. + command.prim_type = XE_GPU_PRIMITIVE_TYPE_POINT_LIST; + command.index_count = 0; + command.index_buffer = nullptr; + + // Generic stuff. 
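// Each populate helper below returns nonzero on failure, so callers gate
// Draw() on PrepareDraw(); a sketch of the usual flow from the command
// processor, with argument names assumed:
//   DrawCommand cmd;
//   if (!driver->PrepareDraw(cmd)) {
//     if (has_index_buffer) {
//       driver->PrepareDrawIndexBuffer(cmd, addr, size, endianness, format);
//     }
//     driver->Draw(cmd);
//   }
// VGT_INDX_OFFSET supplies the hardware start index; base_vertex has no
// register source yet.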
+ command.start_index = register_file_[XE_GPU_REG_VGT_INDX_OFFSET].u32; + command.base_vertex = 0; + + int ret; + ret = PopulateState(command); + if (ret) { + XELOGE("Unable to prepare draw state"); + return ret; + } + ret = PopulateConstantBuffers(command); + if (ret) { + XELOGE("Unable to prepare draw constant buffers"); + return ret; + } + ret = PopulateShaders(command); + if (ret) { + XELOGE("Unable to prepare draw shaders"); + return ret; + } + ret = PopulateInputAssembly(command); + if (ret) { + XELOGE("Unable to prepare draw input assembly"); + return ret; + } + ret = PopulateSamplers(command); + if (ret) { + XELOGE("Unable to prepare draw samplers"); + return ret; + } + return 0; +} + +int GraphicsDriver::PrepareDrawIndexBuffer( + DrawCommand& command, + uint32_t address, uint32_t length, + xenos::XE_GPU_ENDIAN endianness, + IndexFormat format) { + SCOPE_profile_cpu_f("gpu"); + + address += address_translation_; + MemoryRange memory_range(memory_->Translate(address), address, length); + + IndexBufferResource::Info info; + info.endianness = endianness; + info.format = format; + + command.index_buffer = + resource_cache()->FetchIndexBuffer(memory_range, info); + if (!command.index_buffer) { + return 1; + } + return 0; +} + +int GraphicsDriver::PopulateState(DrawCommand& command) { + return 0; +} + +int GraphicsDriver::PopulateConstantBuffers(DrawCommand& command) { + command.float4_constants.count = 512; + command.float4_constants.values = + ®ister_file_[XE_GPU_REG_SHADER_CONSTANT_000_X].f32; + command.loop_constants.count = 32; + command.loop_constants.values = + ®ister_file_[XE_GPU_REG_SHADER_CONSTANT_LOOP_00].u32; + command.bool_constants.count = 8; + command.bool_constants.values = + ®ister_file_[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031].u32; + return 0; +} + +int GraphicsDriver::PopulateShaders(DrawCommand& command) { + SCOPE_profile_cpu_f("gpu"); + + if (!vertex_shader_) { + XELOGE("No vertex shader bound; ignoring"); + return 1; + } + if (!pixel_shader_) { + XELOGE("No pixel shader bound; ignoring"); + return 1; + } + + xe_gpu_program_cntl_t program_cntl; + program_cntl.dword_0 = register_file_[XE_GPU_REG_SQ_PROGRAM_CNTL].u32; + if (!vertex_shader_->is_prepared()) { + if (vertex_shader_->Prepare(program_cntl)) { + XELOGE("Unable to prepare vertex shader"); + return 1; + } + } + if (!pixel_shader_->is_prepared()) { + if (pixel_shader_->Prepare(program_cntl, vertex_shader_)) { + XELOGE("Unable to prepare pixel shader"); + return 1; + } + } + + command.vertex_shader = vertex_shader_; + command.pixel_shader = pixel_shader_; + + return 0; +} + +int GraphicsDriver::PopulateInputAssembly(DrawCommand& command) { + SCOPE_profile_cpu_f("gpu"); + + const auto& buffer_inputs = command.vertex_shader->buffer_inputs(); + command.vertex_buffer_count = buffer_inputs.count; + for (size_t n = 0; n < buffer_inputs.count; n++) { + const auto& desc = buffer_inputs.descs[n]; + + int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + (desc.fetch_slot / 3) * 6; + auto group = reinterpret_cast(®ister_file_.values[r]); + xe_gpu_vertex_fetch_t* fetch = nullptr; + switch (desc.fetch_slot % 3) { + case 0: + fetch = &group->vertex_fetch_0; + break; + case 1: + fetch = &group->vertex_fetch_1; + break; + case 2: + fetch = &group->vertex_fetch_2; + break; + } + XEASSERTNOTNULL(fetch); + // If this assert doesn't hold, maybe we just abort? 
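// (Fetch constants pack three 2-dword vertex fetches into each 6-dword
// group, hence the /3 and %3 above; e.g. fetch_slot 95 resolves to
// XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + 31 * 6 and sub-entry
// vertex_fetch_2. A type of 0x3 marks a valid vertex fetch constant;
// texture fetches carry 0x2, checked the same way in PopulateSamplerSet.)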
+ XEASSERT(fetch->type == 0x3); + XEASSERTNOTZERO(fetch->size); + + const auto& info = desc.info; + + MemoryRange memory_range; + memory_range.guest_base = (fetch->address << 2) + address_translation_; + memory_range.host_base = memory_->Translate(memory_range.guest_base); + memory_range.length = fetch->size * 4; + // TODO(benvanik): if the memory range is within the command buffer, we + // should use a cached transient buffer. + + auto buffer = resource_cache()->FetchVertexBuffer(memory_range, info); + if (!buffer) { + XELOGE("Unable to create vertex fetch buffer"); + return 1; + } + + command.vertex_buffers[n].input_index = desc.input_index; + command.vertex_buffers[n].buffer = buffer; + command.vertex_buffers[n].stride = desc.info.stride_words * 4; + command.vertex_buffers[n].offset = 0; + } + return 0; +} + +int GraphicsDriver::PopulateSamplers(DrawCommand& command) { + SCOPE_profile_cpu_f("gpu"); + + // Vertex texture samplers. + const auto& vertex_sampler_inputs = command.vertex_shader->sampler_inputs(); + command.vertex_shader_sampler_count = vertex_sampler_inputs.count; + for (size_t i = 0; i < command.vertex_shader_sampler_count; ++i) { + if (PopulateSamplerSet(vertex_sampler_inputs.descs[i], + command.vertex_shader_samplers[i])) { + return 1; + } + } + + // Pixel shader texture sampler. + const auto& pixel_sampler_inputs = command.pixel_shader->sampler_inputs(); + command.pixel_shader_sampler_count = pixel_sampler_inputs.count; + for (size_t i = 0; i < command.pixel_shader_sampler_count; ++i) { + if (PopulateSamplerSet(pixel_sampler_inputs.descs[i], + command.pixel_shader_samplers[i])) { + return 1; + } + } + + return 0; +} + +int GraphicsDriver::PopulateSamplerSet( + const ShaderResource::SamplerDesc& src_input, + DrawCommand::SamplerInput& dst_input) { + int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + src_input.fetch_slot * 6; + const auto group = (const xe_gpu_fetch_group_t*)®ister_file_.values[r]; + const xenos::xe_gpu_texture_fetch_t& fetch = group->texture_fetch; + if (fetch.type != 0x2) { + return 0; + } + + dst_input.input_index = src_input.input_index; + dst_input.texture = nullptr; + dst_input.sampler_state = nullptr; + + TextureResource::Info info; + if (!TextureResource::Info::Prepare(fetch, info)) { + XELOGE("D3D11: unable to parse texture fetcher info"); + return 0; // invalid texture used + } + if (info.format == DXGI_FORMAT_UNKNOWN) { + XELOGW("D3D11: unknown texture format %d", info.format); + return 0; // invalid texture used + } + + // TODO(benvanik): quick validate without refetching intraframe. + // Fetch texture from the cache. 
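// (Texture fetch addresses are stored in 4KB units, hence << 12; e.g.
// fetch.address 0x4010 is guest address 0x04010000. The vertex fetches
// above hold word addresses instead, hence << 2. Both then get
// address_translation_ applied on top.)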
+ MemoryRange memory_range; + memory_range.guest_base = (fetch.address << 12) + address_translation_; + memory_range.host_base = memory_->Translate(memory_range.guest_base); + memory_range.length = info.input_length; + + auto texture = resource_cache()->FetchTexture(memory_range, info); + if (!texture) { + XELOGW("D3D11: unable to fetch texture"); + return 0; // invalid texture used + } + + SamplerStateResource::Info sampler_info; + if (!SamplerStateResource::Info::Prepare(fetch, + src_input.tex_fetch, + sampler_info)) { + XELOGW("D3D11: unable to parse sampler info"); + return 0; // invalid texture used + } + auto sampler_state = resource_cache()->FetchSamplerState(sampler_info); + if (!sampler_state) { + XELOGW("D3D11: unable to fetch sampler"); + return 0; // invalid texture used + } + + dst_input.texture = texture; + dst_input.sampler_state = sampler_state; + return 0; +} diff --git a/src/xenia/gpu/graphics_driver.h b/src/xenia/gpu/graphics_driver.h index 675a5a7c2..23cb24972 100644 --- a/src/xenia/gpu/graphics_driver.h +++ b/src/xenia/gpu/graphics_driver.h @@ -11,7 +11,9 @@ #define XENIA_GPU_GRAPHICS_DRIVER_H_ #include -#include +#include +#include +#include #include @@ -24,38 +26,45 @@ public: virtual ~GraphicsDriver(); Memory* memory() const { return memory_; } - xenos::RegisterFile* register_file() { return ®ister_file_; }; + virtual ResourceCache* resource_cache() const = 0; + RegisterFile* register_file() { return ®ister_file_; }; void set_address_translation(uint32_t value) { address_translation_ = value; } - virtual void Initialize() = 0; + virtual int Initialize() = 0; - virtual void InvalidateState( - uint32_t mask) = 0; - virtual void SetShader( - xenos::XE_GPU_SHADER_TYPE type, - uint32_t address, - uint32_t start, - uint32_t length) = 0; - virtual void DrawIndexBuffer( - xenos::XE_GPU_PRIMITIVE_TYPE prim_type, - bool index_32bit, uint32_t index_count, - uint32_t index_base, uint32_t index_size, uint32_t endianness) = 0; - //virtual void DrawIndexImmediate(); - virtual void DrawIndexAuto( - xenos::XE_GPU_PRIMITIVE_TYPE prim_type, - uint32_t index_count) = 0; + int LoadShader(xenos::XE_GPU_SHADER_TYPE type, + uint32_t address, uint32_t length, + uint32_t start); + + int PrepareDraw(DrawCommand& command); + int PrepareDrawIndexBuffer(DrawCommand& command, + uint32_t address, uint32_t length, + xenos::XE_GPU_ENDIAN endianness, + IndexFormat format); + virtual int Draw(const DrawCommand& command) = 0; virtual int Resolve() = 0; +private: + int PopulateState(DrawCommand& command); + int PopulateConstantBuffers(DrawCommand& command); + int PopulateShaders(DrawCommand& command); + int PopulateInputAssembly(DrawCommand& command); + int PopulateSamplers(DrawCommand& command); + int PopulateSamplerSet(const ShaderResource::SamplerDesc& src_input, + DrawCommand::SamplerInput& dst_input); + protected: GraphicsDriver(Memory* memory); Memory* memory_; - - xenos::RegisterFile register_file_; + RegisterFile register_file_; uint32_t address_translation_; + + VertexShaderResource* vertex_shader_; + PixelShaderResource* pixel_shader_; }; diff --git a/src/xenia/gpu/graphics_system.cc b/src/xenia/gpu/graphics_system.cc index c0a614d35..be3e4e0de 100644 --- a/src/xenia/gpu/graphics_system.cc +++ b/src/xenia/gpu/graphics_system.cc @@ -11,9 +11,10 @@ #include #include +#include +#include #include -#include -#include +#include using namespace xe; @@ -24,10 +25,10 @@ using namespace xe::gpu::xenos; GraphicsSystem::GraphicsSystem(Emulator* emulator) : emulator_(emulator), memory_(emulator->memory()), 
- thread_(0), running_(false), driver_(0), worker_(0), + thread_(nullptr), running_(false), driver_(nullptr), + command_processor_(nullptr), interrupt_callback_(0), interrupt_callback_data_(0), - last_interrupt_time_(0), swap_pending_(false), - thread_wait_(NULL) { + last_interrupt_time_(0), swap_pending_(false), thread_wait_(nullptr) { // Create the run loop used for any windows/etc. // This must be done on the thread we create the driver. run_loop_ = xe_run_loop_create(); @@ -42,7 +43,7 @@ X_STATUS GraphicsSystem::Setup() { processor_ = emulator_->processor(); // Create worker. - worker_ = new RingBufferWorker(this, memory_); + command_processor_ = new CommandProcessor(this, memory_); // Let the processor know we want register access callbacks. emulator_->memory()->AddMappedRange( @@ -77,15 +78,18 @@ void GraphicsSystem::ThreadStart() { // Main run loop. while (running_) { // Peek main run loop. - if (xe_run_loop_pump(run_loop)) { - break; + { + SCOPE_profile_cpu_i("gpu", "GraphicsSystemRunLoopPump"); + if (xe_run_loop_pump(run_loop)) { + break; + } } if (!running_) { break; } // Pump worker. - worker_->Pump(); + command_processor_->Pump(); if (!running_) { break; @@ -107,7 +111,7 @@ void GraphicsSystem::Shutdown() { xe_thread_join(thread_); xe_thread_release(thread_); - delete worker_; + delete command_processor_; xe_run_loop_release(run_loop_); } @@ -125,17 +129,19 @@ void GraphicsSystem::InitializeRingBuffer(uint32_t ptr, uint32_t page_count) { Sleep(0); } XEASSERTNOTNULL(driver_); - worker_->Initialize(driver_, ptr, page_count); + command_processor_->Initialize(driver_, ptr, page_count); } void GraphicsSystem::EnableReadPointerWriteBack(uint32_t ptr, uint32_t block_size) { - worker_->EnableReadPointerWriteBack(ptr, block_size); + command_processor_->EnableReadPointerWriteBack(ptr, block_size); } uint64_t GraphicsSystem::ReadRegister(uint64_t addr) { uint32_t r = addr & 0xFFFF; - XELOGGPU("ReadRegister(%.4X)", r); + if (FLAGS_trace_ring_buffer) { + XELOGGPU("ReadRegister(%.4X)", r); + } RegisterFile* regs = driver_->register_file(); @@ -148,31 +154,33 @@ uint64_t GraphicsSystem::ReadRegister(uint64_t addr) { return 1; } - XEASSERT(r >= 0 && r < kXEGpuRegisterCount); + XEASSERT(r >= 0 && r < RegisterFile::kRegisterCount); return regs->values[r].u32; } void GraphicsSystem::WriteRegister(uint64_t addr, uint64_t value) { uint32_t r = addr & 0xFFFF; - XELOGGPU("WriteRegister(%.4X, %.8X)", r, value); + if (FLAGS_trace_ring_buffer) { + XELOGGPU("WriteRegister(%.4X, %.8X)", r, value); + } RegisterFile* regs = driver_->register_file(); switch (r) { case 0x0714: // CP_RB_WPTR - worker_->UpdateWritePointer((uint32_t)value); + command_processor_->UpdateWritePointer((uint32_t)value); break; default: XELOGW("Unknown GPU register %.4X write: %.8X", r, value); break; } - XEASSERT(r >= 0 && r < kXEGpuRegisterCount); + XEASSERT(r >= 0 && r < RegisterFile::kRegisterCount); regs->values[r].u32 = (uint32_t)value; } void GraphicsSystem::MarkVblank() { - worker_->increment_counter(); + command_processor_->increment_counter(); } void GraphicsSystem::DispatchInterruptCallback( diff --git a/src/xenia/gpu/graphics_system.h b/src/xenia/gpu/graphics_system.h index c7c72fea5..8c0a542c8 100644 --- a/src/xenia/gpu/graphics_system.h +++ b/src/xenia/gpu/graphics_system.h @@ -21,8 +21,8 @@ XEDECLARECLASS2(xe, cpu, Processor); namespace xe { namespace gpu { +class CommandProcessor; class GraphicsDriver; -class RingBufferWorker; class GraphicsSystem { @@ -78,7 +78,7 @@ protected: bool running_; GraphicsDriver* driver_; - 
RingBufferWorker* worker_; + CommandProcessor* command_processor_; uint32_t interrupt_callback_; uint32_t interrupt_callback_data_; diff --git a/src/xenia/gpu/nop/nop_graphics_driver.cc b/src/xenia/gpu/nop/nop_graphics_driver.cc index 69f88fa95..b710b85e4 100644 --- a/src/xenia/gpu/nop/nop_graphics_driver.cc +++ b/src/xenia/gpu/nop/nop_graphics_driver.cc @@ -10,7 +10,6 @@ #include #include -#include using namespace xe; @@ -19,69 +18,19 @@ using namespace xe::gpu::nop; using namespace xe::gpu::xenos; -NopGraphicsDriver::NopGraphicsDriver(Memory* memory) : - GraphicsDriver(memory) { - shader_cache_ = new ShaderCache(); +NopGraphicsDriver::NopGraphicsDriver(Memory* memory) + : GraphicsDriver(memory), resource_cache_(nullptr) { } NopGraphicsDriver::~NopGraphicsDriver() { - delete shader_cache_; } -void NopGraphicsDriver::Initialize() { +int NopGraphicsDriver::Initialize() { + return 0; } -void NopGraphicsDriver::InvalidateState( - uint32_t mask) { - if (mask == XE_GPU_INVALIDATE_MASK_ALL) { - XELOGGPU("NOP: (invalidate all)"); - } - if (mask & XE_GPU_INVALIDATE_MASK_VERTEX_SHADER) { - XELOGGPU("NOP: invalidate vertex shader"); - } - if (mask & XE_GPU_INVALIDATE_MASK_PIXEL_SHADER) { - XELOGGPU("NOP: invalidate pixel shader"); - } -} - -void NopGraphicsDriver::SetShader( - XE_GPU_SHADER_TYPE type, - uint32_t address, - uint32_t start, - uint32_t length) { - // Find or create shader in the cache. - uint8_t* p = memory_->Translate(address); - Shader* shader = shader_cache_->FindOrCreate( - type, p, length); - - // Disassemble. - const char* source = shader->disasm_src(); - if (!source) { - source = ""; - } - XELOGGPU("NOP: set shader %d at %0.8X (%db):\n%s", - type, address, length, source); -} - -void NopGraphicsDriver::DrawIndexBuffer( - XE_GPU_PRIMITIVE_TYPE prim_type, - bool index_32bit, uint32_t index_count, - uint32_t index_base, uint32_t index_size, uint32_t endianness) { - XELOGGPU("NOP: draw index buffer"); -} - -void NopGraphicsDriver::DrawIndexAuto( - XE_GPU_PRIMITIVE_TYPE prim_type, - uint32_t index_count) { - XELOGGPU("NOP: draw indexed %d (%d indicies)", - prim_type, index_count); - - // TODO(benvanik): - // program control - // context misc - // interpolator control - // shader constants / bools / integers - // fetch constants +int NopGraphicsDriver::Draw(const DrawCommand& command) { + return 0; } int NopGraphicsDriver::Resolve() { diff --git a/src/xenia/gpu/nop/nop_graphics_driver.h b/src/xenia/gpu/nop/nop_graphics_driver.h index d345c8159..9463a0cd5 100644 --- a/src/xenia/gpu/nop/nop_graphics_driver.h +++ b/src/xenia/gpu/nop/nop_graphics_driver.h @@ -19,9 +19,6 @@ namespace xe { namespace gpu { - -class ShaderCache; - namespace nop { @@ -30,27 +27,16 @@ public: NopGraphicsDriver(Memory* memory); virtual ~NopGraphicsDriver(); - virtual void Initialize(); + ResourceCache* resource_cache() const override { return resource_cache_; } - virtual void InvalidateState( - uint32_t mask); - virtual void SetShader( - xenos::XE_GPU_SHADER_TYPE type, - uint32_t address, - uint32_t start, - uint32_t length); - virtual void DrawIndexBuffer( - xenos::XE_GPU_PRIMITIVE_TYPE prim_type, - bool index_32bit, uint32_t index_count, - uint32_t index_base, uint32_t index_size, uint32_t endianness); - virtual void DrawIndexAuto( - xenos::XE_GPU_PRIMITIVE_TYPE prim_type, - uint32_t index_count); + int Initialize() override; - virtual int Resolve(); + int Draw(const DrawCommand& command) override; + + int Resolve() override; protected: - ShaderCache* shader_cache_; + ResourceCache* resource_cache_; }; diff 
--git a/src/xenia/gpu/register_file.cc b/src/xenia/gpu/register_file.cc index f6f119376..288881d58 100644 --- a/src/xenia/gpu/register_file.cc +++ b/src/xenia/gpu/register_file.cc @@ -10,8 +10,21 @@ #include -using namespace std; using namespace xe; using namespace xe::gpu; -using namespace xe::gpu::xenos; + +RegisterFile::RegisterFile() { + xe_zero_struct(values, sizeof(values)); +} + +const char* RegisterFile::GetRegisterName(uint32_t index) { + switch (index) { +#define XE_GPU_REGISTER(index, type, name) \ + case index: return #name; +#include +#undef XE_GPU_REGISTER + default: + return NULL; + } +} diff --git a/src/xenia/gpu/register_file.h b/src/xenia/gpu/register_file.h index 2a530995f..3ab23b4fa 100644 --- a/src/xenia/gpu/register_file.h +++ b/src/xenia/gpu/register_file.h @@ -11,15 +11,36 @@ #define XENIA_GPU_REGISTER_FILE_H_ #include -#include namespace xe { namespace gpu { +enum Register { +#define XE_GPU_REGISTER(index, type, name) \ + XE_GPU_REG_##name = index, +#include +#undef XE_GPU_REGISTER +}; + + class RegisterFile { public: + RegisterFile(); + + const char* GetRegisterName(uint32_t index); + + static const size_t kRegisterCount = 0x5003; + union RegisterValue { + uint32_t u32; + float f32; + }; + RegisterValue values[kRegisterCount]; + + RegisterValue& operator[](Register reg) { + return values[reg]; + } }; diff --git a/src/xenia/gpu/resource.cc b/src/xenia/gpu/resource.cc index 88966aac5..35ef82bb6 100644 --- a/src/xenia/gpu/resource.cc +++ b/src/xenia/gpu/resource.cc @@ -15,3 +15,23 @@ using namespace xe; using namespace xe::gpu; using namespace xe::gpu::xenos; + +HashedResource::HashedResource(const MemoryRange& memory_range) + : memory_range_(memory_range) { +} + +HashedResource::~HashedResource() = default; + +PagedResource::PagedResource(const MemoryRange& memory_range) + : memory_range_(memory_range), dirtied_(true) { +} + +PagedResource::~PagedResource() = default; + +void PagedResource::MarkDirty(uint32_t lo_address, uint32_t hi_address) { + dirtied_ = true; +} + +StaticResource::StaticResource() = default; + +StaticResource::~StaticResource() = default; diff --git a/src/xenia/gpu/resource.h b/src/xenia/gpu/resource.h index e9a0be7fa..1fb56b3d8 100644 --- a/src/xenia/gpu/resource.h +++ b/src/xenia/gpu/resource.h @@ -18,8 +18,82 @@ namespace xe { namespace gpu { +struct MemoryRange { + uint8_t* host_base; + uint32_t guest_base; + uint32_t length; + + MemoryRange() : host_base(nullptr), guest_base(0), length(0) {} + MemoryRange(const MemoryRange& other) + : host_base(other.host_base), guest_base(other.guest_base), + length(other.length) {} + MemoryRange(uint8_t* _host_base, uint32_t _guest_base, uint32_t _length) + : host_base(_host_base), guest_base(_guest_base), length(_length) {} +}; + + class Resource { public: + virtual ~Resource() = default; + + virtual void* handle() const = 0; + + template + T* handle_as() { + return reinterpret_cast(handle()); + } + +protected: + Resource() = default; + + // last use/LRU stuff +}; + + +class HashedResource : public Resource { +public: + ~HashedResource() override; + + const MemoryRange& memory_range() const { return memory_range_; } + +protected: + HashedResource(const MemoryRange& memory_range); + + MemoryRange memory_range_; + // key +}; + + +class PagedResource : public Resource { +public: + ~PagedResource() override; + + const MemoryRange& memory_range() const { return memory_range_; } + + template + bool Equals(const T& info) { + return Equals(&info, sizeof(info)); + } + virtual bool Equals(const void* info_ptr, 
size_t info_length) = 0; + + bool is_dirty() const { return dirtied_; } + void MarkDirty(uint32_t lo_address, uint32_t hi_address); + +protected: + PagedResource(const MemoryRange& memory_range); + + MemoryRange memory_range_; + bool dirtied_; + // dirtied pages list +}; + + +class StaticResource : public Resource { +public: + ~StaticResource() override; + +protected: + StaticResource(); }; diff --git a/src/xenia/gpu/resource_cache.cc b/src/xenia/gpu/resource_cache.cc index 7a9a1c24d..c317a12be 100644 --- a/src/xenia/gpu/resource_cache.cc +++ b/src/xenia/gpu/resource_cache.cc @@ -15,3 +15,140 @@ using namespace xe; using namespace xe::gpu; using namespace xe::gpu::xenos; + +ResourceCache::ResourceCache(Memory* memory) + : memory_(memory) { +} + +ResourceCache::~ResourceCache() { + for (auto it = resources_.begin(); it != resources_.end(); ++it) { + Resource* resource = *it; + delete resource; + } + resources_.clear(); +} + +VertexShaderResource* ResourceCache::FetchVertexShader( + const MemoryRange& memory_range, + const VertexShaderResource::Info& info) { + return FetchHashedResource( + memory_range, info, &ResourceCache::CreateVertexShader); +} + +PixelShaderResource* ResourceCache::FetchPixelShader( + const MemoryRange& memory_range, + const PixelShaderResource::Info& info) { + return FetchHashedResource( + memory_range, info, &ResourceCache::CreatePixelShader); +} + +TextureResource* ResourceCache::FetchTexture( + const MemoryRange& memory_range, + const TextureResource::Info& info) { + auto resource = FetchPagedResource( + memory_range, info, &ResourceCache::CreateTexture); + if (!resource) { + return nullptr; + } + if (resource->Prepare()) { + XELOGE("Unable to prepare texture"); + return nullptr; + } + return resource; +} + +SamplerStateResource* ResourceCache::FetchSamplerState( + const SamplerStateResource::Info& info) { + auto key = info.hash(); + auto it = static_resources_.find(key); + if (it != static_resources_.end()) { + return static_cast(it->second); + } + auto resource = CreateSamplerState(info); + if (resource->Prepare()) { + XELOGE("Unable to prepare sampler state"); + return nullptr; + } + static_resources_.insert({ key, resource }); + resources_.push_back(resource); + return resource; +} + +IndexBufferResource* ResourceCache::FetchIndexBuffer( + const MemoryRange& memory_range, + const IndexBufferResource::Info& info) { + auto resource = FetchPagedResource( + memory_range, info, &ResourceCache::CreateIndexBuffer); + if (!resource) { + return nullptr; + } + if (resource->Prepare()) { + XELOGE("Unable to prepare index buffer"); + return nullptr; + } + return resource; +} + +VertexBufferResource* ResourceCache::FetchVertexBuffer( + const MemoryRange& memory_range, + const VertexBufferResource::Info& info) { + auto resource = FetchPagedResource( + memory_range, info, &ResourceCache::CreateVertexBuffer); + if (!resource) { + return nullptr; + } + if (resource->Prepare()) { + XELOGE("Unable to prepare vertex buffer"); + return nullptr; + } + return resource; +} + +uint64_t ResourceCache::HashRange(const MemoryRange& memory_range) { + // We could do something smarter here to potentially early exit. + return xe_hash64(memory_range.host_base, memory_range.length); +} + +void ResourceCache::SyncRange(uint32_t address, int length) { + // Scan the page table in sync with our resource list. This means + // we have O(n) complexity for updates, though we could definitely + // make this faster/cleaner. + // TODO(benvanik): actually do this right. 
+ // For now we assume the page table in the range of our resources + // will not be changing, which allows us to do a foreach(res) and reload + // and then clear the table. + + // total bytes = (512 * 1024 * 1024) / (16 * 1024) = 32768 + // each byte = 1 page + // Walk as qwords so we can clear things up faster. + uint64_t* page_table = reinterpret_cast( + memory_->Translate(memory_->page_table())); + int page_size = 16 * 1024; // 16KB pages + + uint32_t lo_address = address % 0x20000000; + uint32_t hi_address = lo_address + length; + hi_address = (hi_address / page_size) * page_size + page_size; + int start_page = lo_address / page_size; + int end_page = hi_address / page_size; + + auto it = paged_resources_.upper_bound(lo_address); + auto end_it = paged_resources_.lower_bound(hi_address); + while (it != end_it) { + const auto& memory_range = it->second->memory_range(); + int lo_page = (memory_range.guest_base % 0x20000000) / page_size; + int hi_page = lo_page + (memory_range.length / page_size); + for (int i = lo_page / 8; i <= hi_page / 8; ++i) { + uint64_t page_flags = page_table[i]; + if (page_flags) { + // Dirty! + it->second->MarkDirty(i * 8 * page_size, (i * 8 + 7) * page_size); + } + } + ++it; + } + + // Reset page table. + for (auto i = start_page / 8; i <= end_page / 8; ++i) { + page_table[i] = 0; + } +} diff --git a/src/xenia/gpu/resource_cache.h b/src/xenia/gpu/resource_cache.h index 7caaad51f..be95f0861 100644 --- a/src/xenia/gpu/resource_cache.h +++ b/src/xenia/gpu/resource_cache.h @@ -10,7 +10,14 @@ #ifndef XENIA_GPU_RESOURCE_CACHE_H_ #define XENIA_GPU_RESOURCE_CACHE_H_ +#include + #include +#include +#include +#include +#include +#include #include @@ -20,6 +27,96 @@ namespace gpu { class ResourceCache { public: + virtual ~ResourceCache(); + + VertexShaderResource* FetchVertexShader( + const MemoryRange& memory_range, + const VertexShaderResource::Info& info); + PixelShaderResource* FetchPixelShader( + const MemoryRange& memory_range, + const PixelShaderResource::Info& info); + + TextureResource* FetchTexture( + const MemoryRange& memory_range, + const TextureResource::Info& info); + SamplerStateResource* FetchSamplerState( + const SamplerStateResource::Info& info); + + IndexBufferResource* FetchIndexBuffer( + const MemoryRange& memory_range, + const IndexBufferResource::Info& info); + VertexBufferResource* FetchVertexBuffer( + const MemoryRange& memory_range, + const VertexBufferResource::Info& info); + + uint64_t HashRange(const MemoryRange& memory_range); + + void SyncRange(uint32_t address, int length); + +protected: + ResourceCache(Memory* memory); + + template + T* FetchHashedResource(const MemoryRange& memory_range, + const typename T::Info& info, + const V& factory) { + // TODO(benvanik): if there's no way it's changed and it's been checked, + // just lookup. This way we don't rehash 100x a frame. 
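// Shaders are hashed resources: the key is a content hash of the guest
// range, so any modified byte yields a new cache entry. Textures and
// buffers are paged resources keyed by guest address and revalidated
// through the dirty page table in SyncRange rather than rehashed. Key
// derivation, per HashRange above:
//   auto key = xe_hash64(memory_range.host_base, memory_range.length);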
+ auto key = HashRange(memory_range); + auto it = hashed_resources_.find(key); + if (it != hashed_resources_.end()) { + return static_cast(it->second); + } + auto resource = (this->*factory)(memory_range, info); + hashed_resources_.insert({ key, resource }); + resources_.push_back(resource); + return resource; + } + + template + T* FetchPagedResource(const MemoryRange& memory_range, + const typename T::Info& info, + const V& factory) { + uint32_t lo_address = memory_range.guest_base % 0x20000000; + auto key = uint64_t(lo_address); + auto range = paged_resources_.equal_range(key); + for (auto it = range.first; it != range.second; ++it) { + if (it->second->memory_range().length == memory_range.length && + it->second->Equals(info)) { + return static_cast(it->second); + } + } + auto resource = (this->*factory)(memory_range, info); + paged_resources_.insert({ key, resource }); + resources_.push_back(resource); + return resource; + } + + virtual VertexShaderResource* CreateVertexShader( + const MemoryRange& memory_range, + const VertexShaderResource::Info& info) = 0; + virtual PixelShaderResource* CreatePixelShader( + const MemoryRange& memory_range, + const PixelShaderResource::Info& info) = 0; + virtual TextureResource* CreateTexture( + const MemoryRange& memory_range, + const TextureResource::Info& info) = 0; + virtual SamplerStateResource* CreateSamplerState( + const SamplerStateResource::Info& info) = 0; + virtual IndexBufferResource* CreateIndexBuffer( + const MemoryRange& memory_range, + const IndexBufferResource::Info& info) = 0; + virtual VertexBufferResource* CreateVertexBuffer( + const MemoryRange& memory_range, + const VertexBufferResource::Info& info) = 0; + +private: + Memory* memory_; + + std::vector resources_; + std::unordered_map hashed_resources_; + std::unordered_map static_resources_; + std::multimap paged_resources_; }; diff --git a/src/xenia/gpu/ring_buffer_worker.cc b/src/xenia/gpu/ring_buffer_worker.cc deleted file mode 100644 index 9999601bb..000000000 --- a/src/xenia/gpu/ring_buffer_worker.cc +++ /dev/null @@ -1,741 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. * - ****************************************************************************** - */ - -#include - -#include -#include -#include -#include -#include - - -using namespace xe; -using namespace xe::gpu; -using namespace xe::gpu::xenos; - - -#define XETRACERB(fmt, ...) 
if (FLAGS_trace_ring_buffer) XELOGGPU(fmt, ##__VA_ARGS__) - - -RingBufferWorker::RingBufferWorker( - GraphicsSystem* graphics_system, Memory* memory) : - graphics_system_(graphics_system), memory_(memory), driver_(0) { - write_ptr_index_event_ = CreateEvent( - NULL, FALSE, FALSE, NULL); - - primary_buffer_ptr_ = 0; - primary_buffer_size_ = 0; - read_ptr_index_ = 0; - read_ptr_update_freq_ = 0; - read_ptr_writeback_ptr_ = 0; - write_ptr_index_ = 0; - write_ptr_max_index_ = 0; - - LARGE_INTEGER perf_counter; - QueryPerformanceCounter(&perf_counter); - time_base_ = perf_counter.QuadPart; - counter_ = 0; -} - -RingBufferWorker::~RingBufferWorker() { - SetEvent(write_ptr_index_event_); - CloseHandle(write_ptr_index_event_); -} - -uint64_t RingBufferWorker::QueryTime() { - LARGE_INTEGER perf_counter; - QueryPerformanceCounter(&perf_counter); - return perf_counter.QuadPart - time_base_; -} - -void RingBufferWorker::Initialize(GraphicsDriver* driver, - uint32_t ptr, uint32_t page_count) { - driver_ = driver; - primary_buffer_ptr_ = ptr; - // Not sure this is correct, but it's a way to take the page_count back to - // the number of bytes allocated by the physical alloc. - uint32_t original_size = 1 << (0x1C - page_count - 1); - primary_buffer_size_ = original_size; - read_ptr_index_ = 0; - - // Tell the driver what to use for translation. - driver_->set_address_translation(primary_buffer_ptr_ & ~0x1FFFFFFF); -} - -void RingBufferWorker::EnableReadPointerWriteBack(uint32_t ptr, - uint32_t block_size) { - // CP_RB_RPTR_ADDR Ring Buffer Read Pointer Address 0x70C - // ptr = RB_RPTR_ADDR, pointer to write back the address to. - read_ptr_writeback_ptr_ = (primary_buffer_ptr_ & ~0x1FFFFFFF) + ptr; - // CP_RB_CNTL Ring Buffer Control 0x704 - // block_size = RB_BLKSZ, number of quadwords read between updates of the - // read pointer. - read_ptr_update_freq_ = (uint32_t)pow(2.0, (double)block_size) / 4; -} - -void RingBufferWorker::UpdateWritePointer(uint32_t value) { - write_ptr_max_index_ = MAX(write_ptr_max_index_, value); - write_ptr_index_ = value; - SetEvent(write_ptr_index_event_); -} - -void RingBufferWorker::Pump() { - uint8_t* p = memory_->membase(); - - if (write_ptr_index_ == 0xBAADF00D || - read_ptr_index_ == write_ptr_index_) { - // Check if the pointer has moved. - // We wait a short bit here to yield time. Since we are also running the - // main window display we don't want to pause too long, though. - const int wait_time_ms = 1; - if (WaitForSingleObject(write_ptr_index_event_, - wait_time_ms) == WAIT_TIMEOUT) { - return; - } - } - - // Bring local so we don't have to worry about them changing out from under - // us. - uint32_t write_ptr_index = write_ptr_index_; - uint32_t write_ptr_max_index = write_ptr_max_index_; - if (read_ptr_index_ == write_ptr_index) { - return; - } - - // Process the new commands. - XETRACERB("Ring buffer thread work"); - - // Execute. Note that we handle wraparound transparently. - ExecutePrimaryBuffer(read_ptr_index_, write_ptr_index); - read_ptr_index_ = write_ptr_index; - - // TODO(benvanik): use read_ptr_update_freq_ and only issue after moving - // that many indices. - if (read_ptr_writeback_ptr_) { - XESETUINT32BE(p + read_ptr_writeback_ptr_, read_ptr_index_); - } -} - -void RingBufferWorker::ExecutePrimaryBuffer( - uint32_t start_index, uint32_t end_index) { - SCOPE_profile_cpu_f("gpu"); - - // Adjust pointer base. 
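// primary_buffer_ptr_ carries the host-visible high bits while ring
// indices address a 512MB physical window, so the low 29 bits get
// recombined after the index math; e.g. with primary_buffer_ptr_ =
// 0xC0100000 and start_index = 4:
//   ptr = 0xC0100000 + 4 * 4 = 0xC0100010
//   ptr = (0xC0100000 & ~0x1FFFFFFF) | (0xC0100010 & 0x1FFFFFFF)
//       = 0xC0000000 | 0x00100010
//       = 0xC0100010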
- uint32_t ptr = primary_buffer_ptr_ + start_index * 4; - ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (ptr & 0x1FFFFFFF); - uint32_t end_ptr = primary_buffer_ptr_ + end_index * 4; - end_ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (end_ptr & 0x1FFFFFFF); - - XETRACERB("[%.8X] ExecutePrimaryBuffer(%dw -> %dw)", - ptr, start_index, end_index); - - // Execute commands! - PacketArgs args; - args.ptr = ptr; - args.base_ptr = primary_buffer_ptr_; - args.max_address = primary_buffer_ptr_ + primary_buffer_size_ * 4; - args.ptr_mask = (primary_buffer_size_ / 4) - 1; - uint32_t n = 0; - while (args.ptr != end_ptr) { - n += ExecutePacket(args); - } - if (end_index > start_index) { - XEASSERT(n == (end_index - start_index)); - } - - XETRACERB(" ExecutePrimaryBuffer End"); -} - -void RingBufferWorker::ExecuteIndirectBuffer(uint32_t ptr, uint32_t length) { - XETRACERB("[%.8X] ExecuteIndirectBuffer(%dw)", ptr, length); - - // Execute commands! - PacketArgs args; - args.ptr = ptr; - args.base_ptr = ptr; - args.max_address = ptr + length * 4; - args.ptr_mask = 0; - for (uint32_t n = 0; n < length;) { - n += ExecutePacket(args); - XEASSERT(n <= length); - } - - XETRACERB(" ExecuteIndirectBuffer End"); -} - -#define LOG_DATA(count) \ - for (uint32_t __m = 0; __m < count; __m++) { \ - XETRACERB("[%.8X] %.8X", \ - packet_ptr + (1 + __m) * 4, \ - XEGETUINT32BE(packet_base + 1 * 4 + __m * 4)); \ - } - -void RingBufferWorker::AdvancePtr(PacketArgs& args, uint32_t n) { - args.ptr = args.ptr + n * 4; - if (args.ptr_mask) { - args.ptr = - args.base_ptr + (((args.ptr - args.base_ptr) / 4) & args.ptr_mask) * 4; - } -} -#define ADVANCE_PTR(n) AdvancePtr(args, n) -#define PEEK_PTR() \ - XEGETUINT32BE(p + args.ptr) -#define READ_PTR() \ - XEGETUINT32BE(p + args.ptr); ADVANCE_PTR(1); - -uint32_t RingBufferWorker::ExecutePacket(PacketArgs& args) { - uint8_t* p = memory_->membase(); - RegisterFile* regs = driver_->register_file(); - - uint32_t packet_ptr = args.ptr; - const uint8_t* packet_base = p + packet_ptr; - const uint32_t packet = PEEK_PTR(); - ADVANCE_PTR(1); - const uint32_t packet_type = packet >> 30; - if (packet == 0) { - XETRACERB("[%.8X] Packet(%.8X): 0?", - packet_ptr, packet); - return 1; - } - - switch (packet_type) { - case 0x00: - { - // Type-0 packet. - // Write count registers in sequence to the registers starting at - // (base_index << 2). - XETRACERB("[%.8X] Packet(%.8X): set registers:", - packet_ptr, packet); - uint32_t count = ((packet >> 16) & 0x3FFF) + 1; - uint32_t base_index = (packet & 0x7FFF); - uint32_t write_one_reg = (packet >> 15) & 0x1; - for (uint32_t m = 0; m < count; m++) { - uint32_t reg_data = PEEK_PTR(); - uint32_t target_index = write_one_reg ? base_index : base_index + m; - const char* reg_name = xenos::GetRegisterName(target_index); - XETRACERB("[%.8X] %.8X -> %.4X %s", - args.ptr, - reg_data, target_index, reg_name ? reg_name : ""); - ADVANCE_PTR(1); - WriteRegister(packet_ptr, target_index, reg_data); - } - return 1 + count; - } - break; - case 0x01: - { - // Type-1 packet. - // Contains two registers of data. Type-0 should be more common. 
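// (Type-1 header layout: bits 31:30 = 01, bits 10:0 = first register
// index, bits 21:11 = second; e.g. packet 0x4000A881 decodes to
// reg_index_1 = 0x081 and reg_index_2 = 0x015.)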
- XETRACERB("[%.8X] Packet(%.8X): set registers:", - packet_ptr, packet); - uint32_t reg_index_1 = packet & 0x7FF; - uint32_t reg_index_2 = (packet >> 11) & 0x7FF; - uint32_t reg_ptr_1 = args.ptr; - uint32_t reg_data_1 = READ_PTR(); - uint32_t reg_ptr_2 = args.ptr; - uint32_t reg_data_2 = READ_PTR(); - const char* reg_name_1 = xenos::GetRegisterName(reg_index_1); - const char* reg_name_2 = xenos::GetRegisterName(reg_index_2); - XETRACERB("[%.8X] %.8X -> %.4X %s", - reg_ptr_1, - reg_data_1, reg_index_1, reg_name_1 ? reg_name_1 : ""); - XETRACERB("[%.8X] %.8X -> %.4X %s", - reg_ptr_2, - reg_data_2, reg_index_2, reg_name_2 ? reg_name_2 : ""); - WriteRegister(packet_ptr, reg_index_1, reg_data_1); - WriteRegister(packet_ptr, reg_index_2, reg_data_2); - return 1 + 2; - } - break; - case 0x02: - // Type-2 packet. - // No-op. Do nothing. - XETRACERB("[%.8X] Packet(%.8X): padding", - packet_ptr, packet); - return 1; - case 0x03: - { - // Type-3 packet. - uint32_t count = ((packet >> 16) & 0x3FFF) + 1; - uint32_t opcode = (packet >> 8) & 0x7F; - // & 1 == predicate, maybe? - - switch (opcode) { - case PM4_ME_INIT: - // initialize CP's micro-engine - XETRACERB("[%.8X] Packet(%.8X): PM4_ME_INIT", - packet_ptr, packet); - LOG_DATA(count); - ADVANCE_PTR(count); - break; - - case PM4_NOP: - // skip N 32-bit words to get to the next packet - // No-op, ignore some data. - XETRACERB("[%.8X] Packet(%.8X): PM4_NOP", - packet_ptr, packet); - LOG_DATA(count); - ADVANCE_PTR(count); - break; - - case PM4_INTERRUPT: - // generate interrupt from the command stream - { - XETRACERB("[%.8X] Packet(%.8X): PM4_INTERRUPT", - packet_ptr, packet); - LOG_DATA(count); - uint32_t cpu_mask = READ_PTR(); - for (int n = 0; n < 6; n++) { - if (cpu_mask & (1 << n)) { - graphics_system_->DispatchInterruptCallback(1, n); - } - } - } - break; - - case PM4_INDIRECT_BUFFER: - // indirect buffer dispatch - { - uint32_t list_ptr = READ_PTR(); - uint32_t list_length = READ_PTR(); - XETRACERB("[%.8X] Packet(%.8X): PM4_INDIRECT_BUFFER %.8X (%dw)", - packet_ptr, packet, list_ptr, list_length); - ExecuteIndirectBuffer(GpuToCpu(list_ptr), list_length); - } - break; - - case PM4_WAIT_REG_MEM: - // wait until a register or memory location is a specific value - { - XETRACERB("[%.8X] Packet(%.8X): PM4_WAIT_REG_MEM", - packet_ptr, packet); - LOG_DATA(count); - uint32_t wait_info = READ_PTR(); - uint32_t poll_reg_addr = READ_PTR(); - uint32_t ref = READ_PTR(); - uint32_t mask = READ_PTR(); - uint32_t wait = READ_PTR(); - bool matched = false; - do { - uint32_t value; - if (wait_info & 0x10) { - // Memory. - XE_GPU_ENDIAN endianness = (XE_GPU_ENDIAN)(poll_reg_addr & 0x3); - poll_reg_addr &= ~0x3; - value = XEGETUINT32LE(p + GpuToCpu(packet_ptr, poll_reg_addr)); - value = GpuSwap(value, endianness); - } else { - // Register. - XEASSERT(poll_reg_addr < kXEGpuRegisterCount); - value = regs->values[poll_reg_addr].u32; - } - switch (wait_info & 0x7) { - case 0x0: // Never. - matched = false; - break; - case 0x1: // Less than reference. - matched = (value & mask) < ref; - break; - case 0x2: // Less than or equal to reference. - matched = (value & mask) <= ref; - break; - case 0x3: // Equal to reference. - matched = (value & mask) == ref; - break; - case 0x4: // Not equal to reference. - matched = (value & mask) != ref; - break; - case 0x5: // Greater than or equal to reference. - matched = (value & mask) >= ref; - break; - case 0x6: // Greater than reference. 
- matched = (value & mask) > ref; - break; - case 0x7: // Always - matched = true; - break; - } - if (!matched) { - // Wait. - if (wait >= 0x100) { - Sleep(wait / 0x100); - } else { - SwitchToThread(); - } - } - } while (!matched); - } - break; - - case PM4_REG_RMW: - // register read/modify/write - // ? (used during shader upload and edram setup) - { - XETRACERB("[%.8X] Packet(%.8X): PM4_REG_RMW", - packet_ptr, packet); - LOG_DATA(count); - uint32_t rmw_info = READ_PTR(); - uint32_t and_mask = READ_PTR(); - uint32_t or_mask = READ_PTR(); - uint32_t value = regs->values[rmw_info & 0x1FFF].u32; - if ((rmw_info >> 30) & 0x1) { - // | reg - value |= regs->values[or_mask & 0x1FFF].u32; - } else { - // | imm - value |= or_mask; - } - if ((rmw_info >> 31) & 0x1) { - // & reg - value &= regs->values[and_mask & 0x1FFF].u32; - } else { - // & imm - value &= and_mask; - } - WriteRegister(packet_ptr, rmw_info & 0x1FFF, value); - } - break; - - case PM4_COND_WRITE: - // conditional write to memory or register - { - XETRACERB("[%.8X] Packet(%.8X): PM4_COND_WRITE", - packet_ptr, packet); - LOG_DATA(count); - uint32_t wait_info = READ_PTR(); - uint32_t poll_reg_addr = READ_PTR(); - uint32_t ref = READ_PTR(); - uint32_t mask = READ_PTR(); - uint32_t write_reg_addr = READ_PTR(); - uint32_t write_data = READ_PTR(); - uint32_t value; - if (wait_info & 0x10) { - // Memory. - XE_GPU_ENDIAN endianness = (XE_GPU_ENDIAN)(poll_reg_addr & 0x3); - poll_reg_addr &= ~0x3; - value = XEGETUINT32LE(p + GpuToCpu(packet_ptr, poll_reg_addr)); - value = GpuSwap(value, endianness); - } else { - // Register. - XEASSERT(poll_reg_addr < kXEGpuRegisterCount); - value = regs->values[poll_reg_addr].u32; - } - bool matched = false; - switch (wait_info & 0x7) { - case 0x0: // Never. - matched = false; - break; - case 0x1: // Less than reference. - matched = (value & mask) < ref; - break; - case 0x2: // Less than or equal to reference. - matched = (value & mask) <= ref; - break; - case 0x3: // Equal to reference. - matched = (value & mask) == ref; - break; - case 0x4: // Not equal to reference. - matched = (value & mask) != ref; - break; - case 0x5: // Greater than or equal to reference. - matched = (value & mask) >= ref; - break; - case 0x6: // Greater than reference. - matched = (value & mask) > ref; - break; - case 0x7: // Always - matched = true; - break; - } - if (matched) { - // Write. - if (wait_info & 0x100) { - // Memory. - XE_GPU_ENDIAN endianness = (XE_GPU_ENDIAN)(write_reg_addr & 0x3); - write_reg_addr &= ~0x3; - write_data = GpuSwap(write_data, endianness); - XESETUINT32LE(p + GpuToCpu(packet_ptr, write_reg_addr), - write_data); - } else { - // Register. - WriteRegister(packet_ptr, write_reg_addr, write_data); - } - } - } - break; - - case PM4_EVENT_WRITE: - // generate an event that creates a write to memory when completed - { - XETRACERB("[%.8X] Packet(%.8X): PM4_EVENT_WRITE (unimplemented!)", - packet_ptr, packet); - LOG_DATA(count); - uint32_t initiator = READ_PTR(); - if (count == 1) { - // Just an event flag? Where does this write? - } else { - // Write to an address. - XEASSERTALWAYS(); - ADVANCE_PTR(count - 1); - } - } - break; - case PM4_EVENT_WRITE_SHD: - // generate a VS|PS_done event - { - XETRACERB("[%.8X] Packet(%.8X): PM4_EVENT_WRITE_SHD", - packet_ptr, packet); - LOG_DATA(count); - uint32_t initiator = READ_PTR(); - uint32_t address = READ_PTR(); - uint32_t value = READ_PTR(); - // Writeback initiator. 
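
PM4_WAIT_REG_MEM and PM4_COND_WRITE above evaluate the same 3-bit compare selector; a sketch of that shared logic factored into one helper (the function name is hypothetical):

// Shared compare used by PM4_WAIT_REG_MEM and PM4_COND_WRITE; 'func' is the
// low 3 bits of wait_info.
inline bool EvalPM4Compare(uint32_t func, uint32_t value, uint32_t mask,
                           uint32_t ref) {
  value &= mask;
  switch (func & 0x7) {
    case 0x0: return false;         // never
    case 0x1: return value < ref;   // less than
    case 0x2: return value <= ref;  // less than or equal
    case 0x3: return value == ref;  // equal
    case 0x4: return value != ref;  // not equal
    case 0x5: return value >= ref;  // greater than or equal
    case 0x6: return value > ref;   // greater than
    default:  return true;          // 0x7: always
  }
}
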
- WriteRegister(packet_ptr, XE_GPU_REG_VGT_EVENT_INITIATOR, - initiator & 0x1F); - uint32_t data_value; - if ((initiator >> 31) & 0x1) { - // Write counter (GPU vblank counter?). - data_value = counter_; - } else { - // Write value. - data_value = value; - } - XE_GPU_ENDIAN endianness = (XE_GPU_ENDIAN)(address & 0x3); - address &= ~0x3; - data_value = GpuSwap(data_value, endianness); - XESETUINT32LE(p + GpuToCpu(address), data_value); - } - break; - - case PM4_DRAW_INDX: - // initiate fetch of index buffer and draw - { - XETRACERB("[%.8X] Packet(%.8X): PM4_DRAW_INDX", - packet_ptr, packet); - LOG_DATA(count); - // d0 = viz query info - uint32_t d0 = READ_PTR(); - uint32_t d1 = READ_PTR(); - uint32_t index_count = d1 >> 16; - uint32_t prim_type = d1 & 0x3F; - uint32_t src_sel = (d1 >> 6) & 0x3; - if (src_sel == 0x0) { - uint32_t index_base = READ_PTR(); - uint32_t index_size = READ_PTR(); - uint32_t endianness = index_size >> 29; - index_size &= 0x00FFFFFF; - bool index_32bit = (d1 >> 11) & 0x1; - index_size *= index_32bit ? 4 : 2; - driver_->DrawIndexBuffer( - (XE_GPU_PRIMITIVE_TYPE)prim_type, - index_32bit, index_count, index_base, index_size, endianness); - } else if (src_sel == 0x2) { - driver_->DrawIndexAuto( - (XE_GPU_PRIMITIVE_TYPE)prim_type, - index_count); - } else { - // Unknown source select. - XEASSERTALWAYS(); - } - } - break; - case PM4_DRAW_INDX_2: - // draw using supplied indices in packet - { - XETRACERB("[%.8X] Packet(%.8X): PM4_DRAW_INDX_2", - packet_ptr, packet); - LOG_DATA(count); - uint32_t d0 = READ_PTR(); - uint32_t index_count = d0 >> 16; - uint32_t prim_type = d0 & 0x3F; - uint32_t src_sel = (d0 >> 6) & 0x3; - XEASSERT(src_sel == 0x2); // 'SrcSel=AutoIndex' - driver_->DrawIndexAuto( - (XE_GPU_PRIMITIVE_TYPE)prim_type, - index_count); - } - break; - - case PM4_SET_CONSTANT: - // load constant into chip and to memory - { - XETRACERB("[%.8X] Packet(%.8X): PM4_SET_CONSTANT", - packet_ptr, packet); - // PM4_REG(reg) ((0x4 << 16) | (GSL_HAL_SUBBLOCK_OFFSET(reg))) - // reg - 0x2000 - uint32_t offset_type = READ_PTR(); - uint32_t index = offset_type & 0x7FF; - uint32_t type = (offset_type >> 16) & 0xFF; - switch (type) { - case 0x4: // REGISTER - index += 0x2000; // registers - for (uint32_t n = 0; n < count - 1; n++, index++) { - uint32_t data = READ_PTR(); - const char* reg_name = xenos::GetRegisterName(index); - XETRACERB("[%.8X] %.8X -> %.4X %s", - packet_ptr + (1 + n) * 4, - data, index, reg_name ? reg_name : ""); - WriteRegister(packet_ptr, index, data); - } - break; - default: - XEASSERTALWAYS(); - break; - } - } - break; - case PM4_LOAD_ALU_CONSTANT: - // load constants from memory - { - XETRACERB("[%.8X] Packet(%.8X): PM4_LOAD_ALU_CONSTANT", - packet_ptr, packet); - uint32_t address = READ_PTR(); - address &= 0x3FFFFFFF; - uint32_t offset_type = READ_PTR(); - uint32_t index = offset_type & 0x7FF; - uint32_t size = READ_PTR(); - size &= 0xFFF; - index += 0x4000; // alu constants - for (uint32_t n = 0; n < size; n++, index++) { - uint32_t data = XEGETUINT32BE( - p + GpuToCpu(packet_ptr, address + n * 4)); - const char* reg_name = xenos::GetRegisterName(index); - XETRACERB("[%.8X] %.8X -> %.4X %s", - packet_ptr, - data, index, reg_name ? 
reg_name : ""); - WriteRegister(packet_ptr, index, data); - } - } - break; - - case PM4_IM_LOAD: - // load sequencer instruction memory (pointer-based) - { - XETRACERB("[%.8X] Packet(%.8X): PM4_IM_LOAD", - packet_ptr, packet); - LOG_DATA(count); - uint32_t addr_type = READ_PTR(); - uint32_t type = addr_type & 0x3; - uint32_t addr = addr_type & ~0x3; - uint32_t start_size = READ_PTR(); - uint32_t start = start_size >> 16; - uint32_t size = start_size & 0xFFFF; // dwords - XEASSERT(start == 0); - driver_->SetShader( - (XE_GPU_SHADER_TYPE)type, - GpuToCpu(packet_ptr, addr), - start, - size * 4); - } - break; - case PM4_IM_LOAD_IMMEDIATE: - // load sequencer instruction memory (code embedded in packet) - { - XETRACERB("[%.8X] Packet(%.8X): PM4_IM_LOAD_IMMEDIATE", - packet_ptr, packet); - LOG_DATA(count); - uint32_t type = READ_PTR(); - uint32_t start_size = READ_PTR(); - uint32_t start = start_size >> 16; - uint32_t size = start_size & 0xFFFF; // dwords - XEASSERT(start == 0); - // TODO(benvanik): figure out if this could wrap. - XEASSERT(args.ptr + size * 4 < args.max_address); - driver_->SetShader( - (XE_GPU_SHADER_TYPE)type, - args.ptr, - start, - size * 4); - ADVANCE_PTR(size); - } - break; - - case PM4_INVALIDATE_STATE: - // selective invalidation of state pointers - { - XETRACERB("[%.8X] Packet(%.8X): PM4_INVALIDATE_STATE", - packet_ptr, packet); - LOG_DATA(count); - uint32_t mask = READ_PTR(); - driver_->InvalidateState(mask); - } - break; - - case PM4_SET_BIN_MASK_LO: - { - uint32_t value = READ_PTR(); - XETRACERB("[%.8X] Packet(%.8X): PM4_SET_BIN_MASK_LO = %.8X", - packet_ptr, packet, value); - } - break; - case PM4_SET_BIN_MASK_HI: - { - uint32_t value = READ_PTR(); - XETRACERB("[%.8X] Packet(%.8X): PM4_SET_BIN_MASK_HI = %.8X", - packet_ptr, packet, value); - } - break; - case PM4_SET_BIN_SELECT_LO: - { - uint32_t value = READ_PTR(); - XETRACERB("[%.8X] Packet(%.8X): PM4_SET_BIN_SELECT_LO = %.8X", - packet_ptr, packet, value); - } - break; - case PM4_SET_BIN_SELECT_HI: - { - uint32_t value = READ_PTR(); - XETRACERB("[%.8X] Packet(%.8X): PM4_SET_BIN_SELECT_HI = %.8X", - packet_ptr, packet, value); - } - break; - - // Ignored packets - useful if breaking on the default handler below. - case 0x50: // 0xC0015000 usually 2 words, 0xFFFFFFFF / 0x00000000 - XETRACERB("[%.8X] Packet(%.8X): unknown!", - packet_ptr, packet); - LOG_DATA(count); - ADVANCE_PTR(count); - break; - - default: - XETRACERB("[%.8X] Packet(%.8X): unknown!", - packet_ptr, packet); - LOG_DATA(count); - ADVANCE_PTR(count); - break; - } - - return 1 + count; - } - break; - } - - return 0; -} - -void RingBufferWorker::WriteRegister( - uint32_t packet_ptr, uint32_t index, uint32_t value) { - RegisterFile* regs = driver_->register_file(); - XEASSERT(index < kXEGpuRegisterCount); - regs->values[index].u32 = value; - - // Scratch register writeback. - if (index >= XE_GPU_REG_SCRATCH_REG0 && index <= XE_GPU_REG_SCRATCH_REG7) { - uint32_t scratch_reg = index - XE_GPU_REG_SCRATCH_REG0; - if ((1 << scratch_reg) & regs->values[XE_GPU_REG_SCRATCH_UMSK].u32) { - // Enabled - write to address. 
- uint8_t* p = memory_->membase(); - uint32_t scratch_addr = regs->values[XE_GPU_REG_SCRATCH_ADDR].u32; - uint32_t mem_addr = scratch_addr + (scratch_reg * 4); - XESETUINT32BE(p + GpuToCpu(primary_buffer_ptr_, mem_addr), value); - } - } -} diff --git a/src/xenia/gpu/ring_buffer_worker.h b/src/xenia/gpu/ring_buffer_worker.h deleted file mode 100644 index 889625d68..000000000 --- a/src/xenia/gpu/ring_buffer_worker.h +++ /dev/null @@ -1,81 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. * - ****************************************************************************** - */ - -#ifndef XENIA_GPU_RING_BUFFER_WORKER_H_ -#define XENIA_GPU_RING_BUFFER_WORKER_H_ - -#include - -#include - - -namespace xe { -namespace gpu { - -class GraphicsDriver; -class GraphicsSystem; - -class RingBufferWorker { -public: - RingBufferWorker(GraphicsSystem* graphics_system, Memory* memory); - virtual ~RingBufferWorker(); - - Memory* memory() const { return memory_; } - - uint64_t QueryTime(); - uint32_t counter() const { return counter_; } - void increment_counter() { counter_++; } - - void Initialize(GraphicsDriver* driver, - uint32_t ptr, uint32_t page_count); - void EnableReadPointerWriteBack(uint32_t ptr, uint32_t block_size); - - void UpdateWritePointer(uint32_t value); - - void Pump(); - -private: - typedef struct { - uint32_t ptr; - uint32_t base_ptr; - uint32_t max_address; - uint32_t ptr_mask; - } PacketArgs; - void AdvancePtr(PacketArgs& args, uint32_t n); - void ExecutePrimaryBuffer(uint32_t start_index, uint32_t end_index); - void ExecuteIndirectBuffer(uint32_t ptr, uint32_t length); - uint32_t ExecutePacket(PacketArgs& args); - void WriteRegister(uint32_t packet_ptr, uint32_t index, uint32_t value); - -protected: - Memory* memory_; - GraphicsSystem* graphics_system_; - GraphicsDriver* driver_; - - uint64_t time_base_; - uint32_t counter_; - - uint32_t primary_buffer_ptr_; - uint32_t primary_buffer_size_; - - uint32_t read_ptr_index_; - uint32_t read_ptr_update_freq_; - uint32_t read_ptr_writeback_ptr_; - - HANDLE write_ptr_index_event_; - volatile uint32_t write_ptr_index_; - volatile uint32_t write_ptr_max_index_; -}; - - -} // namespace gpu -} // namespace xe - - -#endif // XENIA_GPU_RING_BUFFER_WORKER_H_ diff --git a/src/xenia/gpu/sampler_state_resource.cc b/src/xenia/gpu/sampler_state_resource.cc new file mode 100644 index 000000000..5865a6920 --- /dev/null +++ b/src/xenia/gpu/sampler_state_resource.cc @@ -0,0 +1,32 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + + +using namespace std; +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::xenos; + + +bool SamplerStateResource::Info::Prepare( + const xe_gpu_texture_fetch_t& fetch, const instr_fetch_tex_t& fetch_instr, + Info& out_info) { + out_info.min_filter = static_cast( + fetch_instr.min_filter == 3 ? 
fetch.min_filter : fetch_instr.min_filter); + out_info.mag_filter = static_cast( + fetch_instr.mag_filter == 3 ? fetch.mag_filter : fetch_instr.mag_filter); + out_info.mip_filter = static_cast( + fetch_instr.mip_filter == 3 ? fetch.mip_filter : fetch_instr.mip_filter); + out_info.clamp_u = fetch.clamp_x; + out_info.clamp_v = fetch.clamp_y; + out_info.clamp_w = fetch.clamp_z; + return true; +} diff --git a/src/xenia/gpu/sampler_state_resource.h b/src/xenia/gpu/sampler_state_resource.h new file mode 100644 index 000000000..c0a3c4ab3 --- /dev/null +++ b/src/xenia/gpu/sampler_state_resource.h @@ -0,0 +1,67 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_SAMPLER_STATE_RESOURCE_H_ +#define XENIA_GPU_SAMPLER_STATE_RESOURCE_H_ + +#include +#include +#include + + +namespace xe { +namespace gpu { + + +class SamplerStateResource : public StaticResource { +public: + struct Info { + xenos::instr_tex_filter_t min_filter; + xenos::instr_tex_filter_t mag_filter; + xenos::instr_tex_filter_t mip_filter; + uint32_t clamp_u; + uint32_t clamp_v; + uint32_t clamp_w; + + uint64_t hash() const { + return hash_combine(0, + min_filter, mag_filter, mip_filter, + clamp_u, clamp_v, clamp_w); + } + bool Equals(const Info& other) const { + return min_filter == other.min_filter && + mag_filter == other.mag_filter && + mip_filter == other.mip_filter && + clamp_u == other.clamp_u && + clamp_v == other.clamp_v && + clamp_w == other.clamp_w; + } + + static bool Prepare(const xenos::xe_gpu_texture_fetch_t& fetch, + const xenos::instr_fetch_tex_t& fetch_instr, + Info& out_info); + }; + + SamplerStateResource(const Info& info) : info_(info) {} + virtual ~SamplerStateResource() = default; + + const Info& info() const { return info_; } + + virtual int Prepare() = 0; + +protected: + Info info_; +}; + + +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_SAMPLER_STATE_RESOURCE_H_ diff --git a/src/xenia/gpu/shader.cc b/src/xenia/gpu/shader.cc deleted file mode 100644 index 69b083a60..000000000 --- a/src/xenia/gpu/shader.cc +++ /dev/null @@ -1,266 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. * - ****************************************************************************** - */ - -#include - -#include - - -using namespace xe; -using namespace xe::gpu; -using namespace xe::gpu::xenos; - - -Shader::Shader( - XE_GPU_SHADER_TYPE type, - const uint8_t* src_ptr, size_t length, - uint64_t hash) : - type_(type), hash_(hash), is_prepared_(false), disasm_src_(NULL) { - xe_zero_struct(&alloc_counts_, sizeof(alloc_counts_)); - xe_zero_struct(&vtx_buffer_inputs_, sizeof(vtx_buffer_inputs_)); - xe_zero_struct(&tex_buffer_inputs_, sizeof(tex_buffer_inputs_)); - - // Verify. - dword_count_ = length / 4; - XEASSERT(dword_count_ <= 512); - - // Copy bytes and swap. 
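
Back in SamplerStateResource::Info::Prepare above, the literal 3 in each ternary is the per-instruction "use fetch constant" sentinel: instruction-level filters win only when explicitly set. A sketch of the selection as a helper; the cast target follows from the Info field type, and the enum constant name is an assumption from the a2xx ucode headers:

// Pick the effective filter: an instruction value of 3
// (TEX_FILTER_USE_FETCH_CONST, assumed) defers to the fetch constant.
inline xenos::instr_tex_filter_t PickFilter(uint32_t instr_filter,
                                            uint32_t fetch_filter) {
  return static_cast<xenos::instr_tex_filter_t>(
      instr_filter == 3 ? fetch_filter : instr_filter);
}
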
- size_t byte_size = dword_count_ * sizeof(uint32_t); - dwords_ = (uint32_t*)xe_malloc(byte_size); - for (uint32_t n = 0; n < dword_count_; n++) { - dwords_[n] = XEGETUINT32BE(src_ptr + n * 4); - } - - // Gather input/output registers/etc. - GatherIO(); - - // Disassemble, for debugging. - disasm_src_ = DisassembleShader(type_, dwords_, dword_count_); -} - -Shader::~Shader() { - if (disasm_src_) { - xe_free(disasm_src_); - } - xe_free(dwords_); -} - -void Shader::GatherIO() { - // Process all execution blocks. - instr_cf_t cfa; - instr_cf_t cfb; - for (int idx = 0; idx < dword_count_; idx += 3) { - uint32_t dword_0 = dwords_[idx + 0]; - uint32_t dword_1 = dwords_[idx + 1]; - uint32_t dword_2 = dwords_[idx + 2]; - cfa.dword_0 = dword_0; - cfa.dword_1 = dword_1 & 0xFFFF; - cfb.dword_0 = (dword_1 >> 16) | (dword_2 << 16); - cfb.dword_1 = dword_2 >> 16; - if (cfa.opc == ALLOC) { - GatherAlloc(&cfa.alloc); - } else if (cfa.is_exec()) { - GatherExec(&cfa.exec); - } - if (cfb.opc == ALLOC) { - GatherAlloc(&cfb.alloc); - } else if (cfb.is_exec()) { - GatherExec(&cfb.exec); - } - if (cfa.opc == EXEC_END || cfb.opc == EXEC_END) { - break; - } - } -} - -void Shader::GatherAlloc(const instr_cf_alloc_t* cf) { - allocs_.push_back(*cf); - - switch (cf->buffer_select) { - case SQ_POSITION: - // Position (SV_POSITION). - alloc_counts_.positions += cf->size + 1; - break; - case SQ_PARAMETER_PIXEL: - // Output to PS (if VS), or frag output (if PS). - alloc_counts_.params += cf->size + 1; - break; - case SQ_MEMORY: - // MEMEXPORT? - alloc_counts_.memories += cf->size + 1; - break; - } -} - -void Shader::GatherExec(const instr_cf_exec_t* cf) { - execs_.push_back(*cf); - - uint32_t sequence = cf->serialize; - for (uint32_t i = 0; i < cf->count; i++) { - uint32_t alu_off = (cf->address + i); - int sync = sequence & 0x2; - if (sequence & 0x1) { - const instr_fetch_t* fetch = - (const instr_fetch_t*)(dwords_ + alu_off * 3); - switch (fetch->opc) { - case VTX_FETCH: - GatherVertexFetch(&fetch->vtx); - break; - case TEX_FETCH: - GatherTextureFetch(&fetch->tex); - break; - case TEX_GET_BORDER_COLOR_FRAC: - case TEX_GET_COMP_TEX_LOD: - case TEX_GET_GRADIENTS: - case TEX_GET_WEIGHTS: - case TEX_SET_TEX_LOD: - case TEX_SET_GRADIENTS_H: - case TEX_SET_GRADIENTS_V: - default: - XEASSERTALWAYS(); - break; - } - } else { - // TODO(benvanik): gather registers used, predicate bits used, etc. - const instr_alu_t* alu = - (const instr_alu_t*)(dwords_ + alu_off * 3); - if (alu->vector_write_mask) { - if (alu->export_data && alu->vector_dest == 63) { - alloc_counts_.point_size = true; - } - } - if (alu->scalar_write_mask || !alu->vector_write_mask) { - if (alu->export_data && alu->scalar_dest == 63) { - alloc_counts_.point_size = true; - } - } - } - sequence >>= 2; - } -} - -void Shader::GatherVertexFetch(const instr_fetch_vtx_t* vtx) { - // dst_reg/dst_swiz - // src_reg/src_swiz - // format = a2xx_sq_surfaceformat - // format_comp_all ? signed : unsigned - // num_format_all ? normalized - // stride - // offset - // const_index/const_index_sel -- fetch constant register - // num_format_all ? integer : fraction - // exp_adjust_all - [-32,31] - (2^exp_adjust_all)*fetch - 0 = default - - // Sometimes games have fetches that just produce constants. We can - // ignore those. - uint32_t dst_swiz = vtx->dst_swiz; - bool fetches_any_data = false; - for (int i = 0; i < 4; i++) { - if ((dst_swiz & 0x7) == 4) { - // 0.0 - } else if ((dst_swiz & 0x7) == 5) { - // 1.0 - } else if ((dst_swiz & 0x7) == 6) { - // ? 
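
A note on GatherIO above: Xenos control-flow instructions are 48 bits wide and packed two per three dwords, which is what the shift/mask pairs reconstruct:

// 96-bit cf pair split across three 32-bit words:
//   dword_0        -> cfa bits [31:0]
//   dword_1[15:0]  -> cfa bits [47:32]
//   dword_1[31:16] -> cfb bits [15:0]
//   dword_2        -> cfb bits [47:16]
cfa.dword_0 = dword_0;
cfa.dword_1 = dword_1 & 0xFFFF;
cfb.dword_0 = (dword_1 >> 16) | (dword_2 << 16);
cfb.dword_1 = dword_2 >> 16;
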
- } else if ((dst_swiz & 0x7) == 7) { - // Previous register value. - } else { - fetches_any_data = true; - break; - } - dst_swiz >>= 3; - } - if (!fetches_any_data) { - return; - } - - uint32_t fetch_slot = vtx->const_index * 3 + vtx->const_index_sel; - auto& inputs = vtx_buffer_inputs_; - vtx_buffer_element_t* el = NULL; - for (size_t n = 0; n < inputs.count; n++) { - auto& input = inputs.descs[n]; - if (input.fetch_slot == fetch_slot) { - XEASSERT(input.element_count + 1 < XECOUNT(input.elements)); - // It may not hold that all strides are equal, but I hope it does. - XEASSERT(!vtx->stride || input.stride_words == vtx->stride); - el = &input.elements[input.element_count++]; - break; - } - } - if (!el) { - XEASSERTNOTZERO(vtx->stride); - XEASSERT(inputs.count + 1 < XECOUNT(inputs.descs)); - auto& input = inputs.descs[inputs.count++]; - input.input_index = inputs.count - 1; - input.fetch_slot = fetch_slot; - input.stride_words = vtx->stride; - el = &input.elements[input.element_count++]; - } - - el->vtx_fetch = *vtx; - el->format = vtx->format; - el->offset_words = vtx->offset; - el->size_words = 0; - switch (el->format) { - case FMT_8_8_8_8: - case FMT_2_10_10_10: - case FMT_10_11_11: - case FMT_11_11_10: - el->size_words = 1; - break; - case FMT_16_16: - case FMT_16_16_FLOAT: - el->size_words = 1; - break; - case FMT_16_16_16_16: - case FMT_16_16_16_16_FLOAT: - el->size_words = 2; - break; - case FMT_32: - case FMT_32_FLOAT: - el->size_words = 1; - break; - case FMT_32_32: - case FMT_32_32_FLOAT: - el->size_words = 2; - break; - case FMT_32_32_32_FLOAT: - el->size_words = 3; - break; - case FMT_32_32_32_32: - case FMT_32_32_32_32_FLOAT: - el->size_words = 4; - break; - default: - XELOGE("Unknown vertex format: %d", el->format); - XEASSERTALWAYS(); - break; - } -} - -const Shader::vtx_buffer_inputs_t* Shader::GetVertexBufferInputs() { - return &vtx_buffer_inputs_; -} - -void Shader::GatherTextureFetch(const xenos::instr_fetch_tex_t* tex) { - // TODO(benvanik): check dest_swiz to see if we are writing anything. - - auto& inputs = tex_buffer_inputs_; - XEASSERT(inputs.count + 1 < XECOUNT(inputs.descs)); - auto& input = inputs.descs[inputs.count++]; - input.input_index = inputs.count - 1; - input.fetch_slot = tex->const_idx & 0xF; // ? - input.tex_fetch = *tex; - - // Format mangling, size estimation, etc. -} - -const Shader::tex_buffer_inputs_t* Shader::GetTextureBufferInputs() { - return &tex_buffer_inputs_; -} diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h deleted file mode 100644 index 1dd26b2b4..000000000 --- a/src/xenia/gpu/shader.h +++ /dev/null @@ -1,104 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. 
* - ****************************************************************************** - */ - -#ifndef XENIA_GPU_SHADER_H_ -#define XENIA_GPU_SHADER_H_ - -#include -#include -#include - - -namespace xe { -namespace gpu { - - -class Shader { -public: - Shader(xenos::XE_GPU_SHADER_TYPE type, - const uint8_t* src_ptr, size_t length, - uint64_t hash); - virtual ~Shader(); - - xenos::XE_GPU_SHADER_TYPE type() const { return type_; } - const uint32_t* dwords() const { return dwords_; } - size_t dword_count() const { return dword_count_; } - uint64_t hash() const { return hash_; } - bool is_prepared() const { return is_prepared_; } - - const char* disasm_src() const { return disasm_src_; } - - typedef struct { - xenos::instr_fetch_vtx_t vtx_fetch; - uint32_t format; - uint32_t offset_words; - uint32_t size_words; - } vtx_buffer_element_t; - typedef struct { - uint32_t input_index; - uint32_t fetch_slot; - uint32_t stride_words; - uint32_t element_count; - vtx_buffer_element_t elements[16]; - } vtx_buffer_desc_t; - typedef struct { - uint32_t count; - vtx_buffer_desc_t descs[16]; - } vtx_buffer_inputs_t; - const vtx_buffer_inputs_t* GetVertexBufferInputs(); - - typedef struct { - uint32_t input_index; - uint32_t fetch_slot; - xenos::instr_fetch_tex_t tex_fetch; - uint32_t format; - } tex_buffer_desc_t; - typedef struct { - uint32_t count; - tex_buffer_desc_t descs[32]; - } tex_buffer_inputs_t; - const tex_buffer_inputs_t* GetTextureBufferInputs(); - - typedef struct { - uint32_t positions; - uint32_t params; - uint32_t memories; - bool point_size; - } alloc_counts_t; - const alloc_counts_t& alloc_counts() const { return alloc_counts_; } - -private: - void GatherIO(); - void GatherAlloc(const xenos::instr_cf_alloc_t* cf); - void GatherExec(const xenos::instr_cf_exec_t* cf); - void GatherVertexFetch(const xenos::instr_fetch_vtx_t* vtx); - void GatherTextureFetch(const xenos::instr_fetch_tex_t* tex); - -protected: - xenos::XE_GPU_SHADER_TYPE type_; - uint32_t* dwords_; - size_t dword_count_; - uint64_t hash_; - bool is_prepared_; - - char* disasm_src_; - - alloc_counts_t alloc_counts_; - std::vector execs_; - std::vector allocs_; - vtx_buffer_inputs_t vtx_buffer_inputs_; - tex_buffer_inputs_t tex_buffer_inputs_; -}; - - -} // namespace gpu -} // namespace xe - - -#endif // XENIA_GPU_SHADER_H_ diff --git a/src/xenia/gpu/shader_cache.cc b/src/xenia/gpu/shader_cache.cc deleted file mode 100644 index 2c5e84294..000000000 --- a/src/xenia/gpu/shader_cache.cc +++ /dev/null @@ -1,80 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. 
* - ****************************************************************************** - */ - -#include - -#include - - -using namespace std; -using namespace xe; -using namespace xe::gpu; -using namespace xe::gpu::xenos; - - -ShaderCache::ShaderCache() { -} - -ShaderCache::~ShaderCache() { - Clear(); -} - -Shader* ShaderCache::Create( - XE_GPU_SHADER_TYPE type, - const uint8_t* src_ptr, size_t length) { - uint64_t hash = Hash(src_ptr, length); - Shader* shader = CreateCore(type, src_ptr, length, hash); - map_.insert({ hash, shader }); - return shader; -} - -Shader* ShaderCache::CreateCore( - XE_GPU_SHADER_TYPE type, - const uint8_t* src_ptr, size_t length, - uint64_t hash) { - return new Shader(type, src_ptr, length, hash); -} - -Shader* ShaderCache::Find( - XE_GPU_SHADER_TYPE type, - const uint8_t* src_ptr, size_t length) { - uint64_t hash = Hash(src_ptr, length); - auto it = map_.find(hash); - if (it != map_.end()) { - return it->second; - } - return NULL; -} - -Shader* ShaderCache::FindOrCreate( - XE_GPU_SHADER_TYPE type, - const uint8_t* src_ptr, size_t length) { - SCOPE_profile_cpu_f("gpu"); - - uint64_t hash = Hash(src_ptr, length); - auto it = map_.find(hash); - if (it != map_.end()) { - return it->second; - } - Shader* shader = CreateCore(type, src_ptr, length, hash); - map_.insert({ hash, shader }); - return shader; -} - -void ShaderCache::Clear() { - for (auto it = map_.begin(); it != map_.end(); ++it) { - Shader* shader = it->second; - delete shader; - } - map_.clear(); -} - -uint64_t ShaderCache::Hash(const uint8_t* src_ptr, size_t length) { - return xe_hash64(src_ptr, length, 0); -} diff --git a/src/xenia/gpu/shader_cache.h b/src/xenia/gpu/shader_cache.h deleted file mode 100644 index 97edc382f..000000000 --- a/src/xenia/gpu/shader_cache.h +++ /dev/null @@ -1,56 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. 
* - ****************************************************************************** - */ - -#ifndef XENIA_GPU_SHADER_CACHE_H_ -#define XENIA_GPU_SHADER_CACHE_H_ - -#include -#include -#include - - -namespace xe { -namespace gpu { - - -class ShaderCache { -public: - ShaderCache(); - virtual ~ShaderCache(); - - Shader* Create( - xenos::XE_GPU_SHADER_TYPE type, - const uint8_t* src_ptr, size_t length); - Shader* Find( - xenos::XE_GPU_SHADER_TYPE type, - const uint8_t* src_ptr, size_t length); - Shader* FindOrCreate( - xenos::XE_GPU_SHADER_TYPE type, - const uint8_t* src_ptr, size_t length); - - void Clear(); - -private: - uint64_t Hash(const uint8_t* src_ptr, size_t length); - - std::unordered_map map_; - -protected: - virtual Shader* CreateCore( - xenos::XE_GPU_SHADER_TYPE type, - const uint8_t* src_ptr, size_t length, - uint64_t hash); -}; - - -} // namespace gpu -} // namespace xe - - -#endif // XENIA_GPU_SHADER_CACHE_H_ diff --git a/src/xenia/gpu/shader_resource.cc b/src/xenia/gpu/shader_resource.cc index e2520db62..9fbcbf2bb 100644 --- a/src/xenia/gpu/shader_resource.cc +++ b/src/xenia/gpu/shader_resource.cc @@ -9,9 +9,267 @@ #include +#include + using namespace std; using namespace xe; using namespace xe::gpu; using namespace xe::gpu::xenos; + +ShaderResource::ShaderResource(const MemoryRange& memory_range, + const Info& info, + xenos::XE_GPU_SHADER_TYPE type) + : HashedResource(memory_range), + info_(info), type_(type), is_prepared_(false), disasm_src_(nullptr) { + xe_zero_struct(&alloc_counts_, sizeof(alloc_counts_)); + xe_zero_struct(&buffer_inputs_, sizeof(buffer_inputs_)); + xe_zero_struct(&sampler_inputs_, sizeof(sampler_inputs_)); + + // Verify. + dword_count_ = memory_range.length / 4; + XEASSERT(dword_count_ <= 512); + + // Copy bytes and swap. + size_t byte_size = dword_count_ * sizeof(uint32_t); + dwords_ = (uint32_t*)xe_malloc(byte_size); + for (uint32_t n = 0; n < dword_count_; n++) { + dwords_[n] = XEGETUINT32BE(memory_range.host_base + n * 4); + } + + // Disassemble, for debugging. + disasm_src_ = DisassembleShader(type_, dwords_, dword_count_); + + // Gather input/output registers/etc. + GatherIO(); +} + +ShaderResource::~ShaderResource() { + xe_free(disasm_src_); + xe_free(dwords_); +} + +void ShaderResource::GatherIO() { + // Process all execution blocks. + instr_cf_t cfa; + instr_cf_t cfb; + for (int idx = 0; idx < dword_count_; idx += 3) { + uint32_t dword_0 = dwords_[idx + 0]; + uint32_t dword_1 = dwords_[idx + 1]; + uint32_t dword_2 = dwords_[idx + 2]; + cfa.dword_0 = dword_0; + cfa.dword_1 = dword_1 & 0xFFFF; + cfb.dword_0 = (dword_1 >> 16) | (dword_2 << 16); + cfb.dword_1 = dword_2 >> 16; + if (cfa.opc == ALLOC) { + GatherAlloc(&cfa.alloc); + } else if (cfa.is_exec()) { + GatherExec(&cfa.exec); + } + if (cfb.opc == ALLOC) { + GatherAlloc(&cfb.alloc); + } else if (cfb.is_exec()) { + GatherExec(&cfb.exec); + } + if (cfa.opc == EXEC_END || cfb.opc == EXEC_END) { + break; + } + } +} + +void ShaderResource::GatherAlloc(const instr_cf_alloc_t* cf) { + allocs_.push_back(*cf); + + switch (cf->buffer_select) { + case SQ_POSITION: + // Position (SV_POSITION). + alloc_counts_.positions += cf->size + 1; + break; + case SQ_PARAMETER_PIXEL: + // Output to PS (if VS), or frag output (if PS). + alloc_counts_.params += cf->size + 1; + break; + case SQ_MEMORY: + // MEMEXPORT? 
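
The deleted ShaderCache above keys shaders purely by a 64-bit content hash of the microcode, so identical programs collapse to one object. A self-contained sketch of the same pattern; the key/value types follow from Hash() returning uint64_t and Create() returning Shader*:

// Hash-consing cache for shader microcode. Note that keying on the full
// hash alone means a 64-bit collision would silently alias two shaders.
struct MicrocodeCache {
  std::unordered_map<uint64_t, Shader*> map_;
  Shader* FindOrCreate(XE_GPU_SHADER_TYPE type, const uint8_t* src,
                       size_t length) {
    uint64_t hash = xe_hash64(src, length, 0);
    auto it = map_.find(hash);
    if (it != map_.end()) return it->second;  // seen this microcode before
    Shader* shader = new Shader(type, src, length, hash);
    map_.insert({hash, shader});
    return shader;
  }
};
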
+ alloc_counts_.memories += cf->size + 1; + break; + } +} + +void ShaderResource::GatherExec(const instr_cf_exec_t* cf) { + execs_.push_back(*cf); + + uint32_t sequence = cf->serialize; + for (uint32_t i = 0; i < cf->count; i++) { + uint32_t alu_off = (cf->address + i); + int sync = sequence & 0x2; + if (sequence & 0x1) { + const instr_fetch_t* fetch = + (const instr_fetch_t*)(dwords_ + alu_off * 3); + switch (fetch->opc) { + case VTX_FETCH: + GatherVertexFetch(&fetch->vtx); + break; + case TEX_FETCH: + GatherTextureFetch(&fetch->tex); + break; + case TEX_GET_BORDER_COLOR_FRAC: + case TEX_GET_COMP_TEX_LOD: + case TEX_GET_GRADIENTS: + case TEX_GET_WEIGHTS: + case TEX_SET_TEX_LOD: + case TEX_SET_GRADIENTS_H: + case TEX_SET_GRADIENTS_V: + default: + XEASSERTALWAYS(); + break; + } + } else { + // TODO(benvanik): gather registers used, predicate bits used, etc. + const instr_alu_t* alu = + (const instr_alu_t*)(dwords_ + alu_off * 3); + if (alu->vector_write_mask) { + if (alu->export_data && alu->vector_dest == 63) { + alloc_counts_.point_size = true; + } + } + if (alu->scalar_write_mask || !alu->vector_write_mask) { + if (alu->export_data && alu->scalar_dest == 63) { + alloc_counts_.point_size = true; + } + } + } + sequence >>= 2; + } +} + +void ShaderResource::GatherVertexFetch(const instr_fetch_vtx_t* vtx) { + XEASSERT(type_ == XE_GPU_SHADER_TYPE_VERTEX); + + // dst_reg/dst_swiz + // src_reg/src_swiz + // format = a2xx_sq_surfaceformat + // format_comp_all ? signed : unsigned + // num_format_all ? normalized + // stride + // offset + // const_index/const_index_sel -- fetch constant register + // num_format_all ? integer : fraction + // exp_adjust_all - [-32,31] - (2^exp_adjust_all)*fetch - 0 = default + + // Sometimes games have fetches that just produce constants. We can + // ignore those. + uint32_t dst_swiz = vtx->dst_swiz; + bool fetches_any_data = false; + for (int i = 0; i < 4; i++) { + if ((dst_swiz & 0x7) == 4) { + // 0.0 + } else if ((dst_swiz & 0x7) == 5) { + // 1.0 + } else if ((dst_swiz & 0x7) == 6) { + // ? + } else if ((dst_swiz & 0x7) == 7) { + // Previous register value. + } else { + fetches_any_data = true; + break; + } + dst_swiz >>= 3; + } + if (!fetches_any_data) { + return; + } + + uint32_t fetch_slot = vtx->const_index * 3 + vtx->const_index_sel; + auto& inputs = buffer_inputs_; + VertexBufferResource::DeclElement* el = nullptr; + for (size_t n = 0; n < inputs.count; n++) { + auto& desc = inputs.descs[n]; + auto& info = desc.info; + if (desc.fetch_slot == fetch_slot) { + XEASSERT(info.element_count + 1 < XECOUNT(info.elements)); + // It may not hold that all strides are equal, but I hope it does. 
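
On GatherExec above: each exec clause carries a packed serialize field, two bits per instruction slot, which is what drives the fetch/ALU split. A sketch of the decode (bit 1 is captured as sync above but otherwise unused):

// cf->serialize, two bits per slot:
//   bit 0: slot is a fetch instruction (vs. ALU)
//   bit 1: sync before issuing the slot
uint32_t sequence = cf->serialize;
for (uint32_t i = 0; i < cf->count; i++, sequence >>= 2) {
  bool is_fetch = (sequence & 0x1) != 0;
  bool needs_sync = (sequence & 0x2) != 0;
  // ...dispatch on is_fetch as GatherExec does...
}
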
+ XEASSERT(!vtx->stride || info.stride_words == vtx->stride); + el = &info.elements[info.element_count++]; + break; + } + } + if (!el) { + XEASSERTNOTZERO(vtx->stride); + XEASSERT(inputs.count + 1 < XECOUNT(inputs.descs)); + auto& desc = inputs.descs[inputs.count++]; + desc.input_index = inputs.count - 1; + desc.fetch_slot = fetch_slot; + desc.info.stride_words = vtx->stride; + el = &desc.info.elements[desc.info.element_count++]; + } + + el->vtx_fetch = *vtx; + el->format = vtx->format; + el->is_normalized = vtx->num_format_all == 0; + el->is_signed = vtx->format_comp_all == 1; + el->offset_words = vtx->offset; + el->size_words = 0; + switch (el->format) { + case FMT_8_8_8_8: + case FMT_2_10_10_10: + case FMT_10_11_11: + case FMT_11_11_10: + el->size_words = 1; + break; + case FMT_16_16: + case FMT_16_16_FLOAT: + el->size_words = 1; + break; + case FMT_16_16_16_16: + case FMT_16_16_16_16_FLOAT: + el->size_words = 2; + break; + case FMT_32: + case FMT_32_FLOAT: + el->size_words = 1; + break; + case FMT_32_32: + case FMT_32_32_FLOAT: + el->size_words = 2; + break; + case FMT_32_32_32_FLOAT: + el->size_words = 3; + break; + case FMT_32_32_32_32: + case FMT_32_32_32_32_FLOAT: + el->size_words = 4; + break; + default: + XELOGE("Unknown vertex format: %d", el->format); + XEASSERTALWAYS(); + break; + } +} + +void ShaderResource::GatherTextureFetch(const xenos::instr_fetch_tex_t* tex) { + // TODO(benvanik): check dest_swiz to see if we are writing anything. + + XEASSERT(sampler_inputs_.count + 1 < XECOUNT(sampler_inputs_.descs)); + auto& input = sampler_inputs_.descs[sampler_inputs_.count++]; + input.input_index = sampler_inputs_.count - 1; + input.fetch_slot = tex->const_idx & 0xF; // ? + input.tex_fetch = *tex; + + // Format mangling, size estimation, etc. +} + +VertexShaderResource::VertexShaderResource( + const MemoryRange& memory_range, const Info& info) + : ShaderResource(memory_range, info, XE_GPU_SHADER_TYPE_VERTEX) { +} + +VertexShaderResource::~VertexShaderResource() = default; + +PixelShaderResource::PixelShaderResource( + const MemoryRange& memory_range, const Info& info) + : ShaderResource(memory_range, info, XE_GPU_SHADER_TYPE_PIXEL) { +} + +PixelShaderResource::~PixelShaderResource() = default; diff --git a/src/xenia/gpu/shader_resource.h b/src/xenia/gpu/shader_resource.h index 24b787ec4..b591bfaf2 100644 --- a/src/xenia/gpu/shader_resource.h +++ b/src/xenia/gpu/shader_resource.h @@ -10,7 +10,9 @@ #ifndef XENIA_GPU_SHADER_RESOURCE_H_ #define XENIA_GPU_SHADER_RESOURCE_H_ -#include +#include +#include +#include #include @@ -18,8 +20,104 @@ namespace xe { namespace gpu { -class ShaderResource : public Resource { +class ShaderResource : public HashedResource { public: + struct Info { + // type, etc? 
+  };
+
+  ~ShaderResource() override;
+
+  const Info& info() const { return info_; }
+  xenos::XE_GPU_SHADER_TYPE type() const { return type_; }
+  const uint32_t* dwords() const { return dwords_; }
+  const size_t dword_count() const { return dword_count_; }
+
+  bool is_prepared() const { return is_prepared_; }
+  const char* disasm_src() const { return disasm_src_; }
+
+  struct BufferDesc {
+    uint32_t input_index;
+    uint32_t fetch_slot;
+    VertexBufferResource::Info info;
+    // xenos::instr_fetch_vtx_t vtx_fetch; for each el
+  };
+  struct BufferInputs {
+    uint32_t count;
+    BufferDesc descs[32];
+  };
+  const BufferInputs& buffer_inputs() { return buffer_inputs_; }
+
+  struct SamplerDesc {
+    uint32_t input_index;
+    uint32_t fetch_slot;
+    uint32_t format;
+    xenos::instr_fetch_tex_t tex_fetch;
+  };
+  struct SamplerInputs {
+    uint32_t count;
+    SamplerDesc descs[32];
+  };
+  const SamplerInputs& sampler_inputs() { return sampler_inputs_; }
+
+  struct AllocCounts {
+    uint32_t positions;
+    uint32_t params;
+    uint32_t memories;
+    bool point_size;
+  };
+  const AllocCounts& alloc_counts() const { return alloc_counts_; }
+  const std::vector<xenos::instr_cf_exec_t>& execs() const { return execs_; }
+  const std::vector<xenos::instr_cf_alloc_t>& allocs() const { return allocs_; }
+
+private:
+  void GatherIO();
+  void GatherAlloc(const xenos::instr_cf_alloc_t* cf);
+  void GatherExec(const xenos::instr_cf_exec_t* cf);
+  void GatherVertexFetch(const xenos::instr_fetch_vtx_t* vtx);
+  void GatherTextureFetch(const xenos::instr_fetch_tex_t* tex);
+
+protected:
+  ShaderResource(const MemoryRange& memory_range,
+                 const Info& info,
+                 xenos::XE_GPU_SHADER_TYPE type);
+
+  Info info_;
+  xenos::XE_GPU_SHADER_TYPE type_;
+  size_t dword_count_;
+  uint32_t* dwords_;
+  char* disasm_src_;
+
+  AllocCounts alloc_counts_;
+  std::vector<xenos::instr_cf_exec_t> execs_;
+  std::vector<xenos::instr_cf_alloc_t> allocs_;
+  BufferInputs buffer_inputs_;
+  SamplerInputs sampler_inputs_;
+
+  bool is_prepared_;
+};
+
+
+class VertexShaderResource : public ShaderResource {
+public:
+  VertexShaderResource(const MemoryRange& memory_range,
+                       const Info& info);
+  ~VertexShaderResource() override;
+
+  // buffer_inputs() matching VertexBufferResource::Info
+
+  virtual int Prepare(const xenos::xe_gpu_program_cntl_t& program_cntl) = 0;
+};
+
+
+class PixelShaderResource : public ShaderResource {
+public:
+  PixelShaderResource(const MemoryRange& memory_range,
+                      const Info& info);
+  ~PixelShaderResource() override;
+
+  virtual int Prepare(const xenos::xe_gpu_program_cntl_t& program_cntl,
+                      VertexShaderResource* vertex_shader) = 0;
+};
diff --git a/src/xenia/gpu/sources.gypi b/src/xenia/gpu/sources.gypi
index 3d4462fd1..b01f7a33b 100644
--- a/src/xenia/gpu/sources.gypi
+++ b/src/xenia/gpu/sources.gypi
@@ -5,6 +5,8 @@
     'buffer_resource.h',
     'command_processor.cc',
     'command_processor.h',
+    'draw_command.cc',
+    'draw_command.h',
     'gpu-private.h',
     'gpu.cc',
     'gpu.h',
@@ -18,6 +20,8 @@
     'resource.h',
     'resource_cache.cc',
     'resource_cache.h',
+    'sampler_state_resource.cc',
+    'sampler_state_resource.h',
     'shader_resource.cc',
     'shader_resource.h',
     'texture_resource.cc',
diff --git a/src/xenia/gpu/texture.cc b/src/xenia/gpu/texture.cc
deleted file mode 100644
index d624d82ce..000000000
--- a/src/xenia/gpu/texture.cc
+++ /dev/null
@@ -1,369 +0,0 @@
-/**
- ******************************************************************************
- * Xenia : Xbox 360 Emulator Research Project *
- ******************************************************************************
- * Copyright 2014 Ben Vanik. All rights reserved.
* - * Released under the BSD license - see LICENSE in the root for more details. * - ****************************************************************************** - */ - -#include - -#include -#include - -// TODO(benvanik): replace DXGI constants with xenia constants. -#include - - -using namespace xe; -using namespace xe::gpu; -using namespace xe::gpu::xenos; - - -Texture::Texture(uint32_t address, const uint8_t* host_address) - : address_(address), host_address_(host_address) { -} - -Texture::~Texture() { - for (auto it = views_.begin(); it != views_.end(); ++it) { - auto view = *it; - delete view; - } - views_.clear(); -} - -TextureView* Texture::Fetch( - const xenos::xe_gpu_texture_fetch_t& fetch) { - // TODO(benvanik): compute length for hash check. - size_t length = 0; - switch (fetch.dimension) { - case DIMENSION_1D: - break; - case DIMENSION_2D: - break; - case DIMENSION_3D: - break; - case DIMENSION_CUBE: - break; - } - uint64_t hash = xe_hash64(host_address_, length); - - for (auto it = views_.begin(); it != views_.end(); ++it) { - auto view = *it; - if (memcmp(&view->fetch, &fetch, sizeof(fetch))) { - continue; - } - bool dirty = hash != view->hash; - if (dirty) { - return FetchDirty(view, fetch) ? view : nullptr; - } else { - return view; - } - } - - auto new_view = FetchNew(fetch); - if (!new_view) { - return nullptr; - } - new_view->hash = hash; - views_.push_back(new_view); - return new_view; -} - -bool Texture::FillViewInfo(TextureView* view, - const xenos::xe_gpu_texture_fetch_t& fetch) { - // http://msdn.microsoft.com/en-us/library/windows/desktop/cc308051(v=vs.85).aspx - // a2xx_sq_surfaceformat - - view->texture = this; - view->fetch = fetch; - - view->dimensions = fetch.dimension; - switch (fetch.dimension) { - case DIMENSION_1D: - view->width = fetch.size_1d.width; - break; - case DIMENSION_2D: - view->width = fetch.size_2d.width; - view->height = fetch.size_2d.height; - break; - case DIMENSION_3D: - view->width = fetch.size_3d.width; - view->height = fetch.size_3d.height; - view->depth = fetch.size_3d.depth; - break; - case DIMENSION_CUBE: - view->width = fetch.size_stack.width; - view->height = fetch.size_stack.height; - view->depth = fetch.size_stack.depth; - break; - } - view->format = DXGI_FORMAT_UNKNOWN; - view->block_size = 0; - view->texel_pitch = 0; - view->is_compressed = false; - switch (fetch.format) { - case FMT_8: - switch (fetch.swizzle) { - case XE_GPU_SWIZZLE_RRR1: - view->format = DXGI_FORMAT_R8_UNORM; - break; - case XE_GPU_SWIZZLE_000R: - view->format = DXGI_FORMAT_A8_UNORM; - break; - default: - XELOGW("D3D11: unhandled swizzle for FMT_8"); - view->format = DXGI_FORMAT_A8_UNORM; - break; - } - view->block_size = 1; - view->texel_pitch = 1; - break; - case FMT_1_5_5_5: - switch (fetch.swizzle) { - case XE_GPU_SWIZZLE_BGRA: - view->format = DXGI_FORMAT_B5G5R5A1_UNORM; - break; - default: - XELOGW("D3D11: unhandled swizzle for FMT_1_5_5_5"); - view->format = DXGI_FORMAT_B5G5R5A1_UNORM; - break; - } - view->block_size = 1; - view->texel_pitch = 2; - break; - case FMT_8_8_8_8: - switch (fetch.swizzle) { - case XE_GPU_SWIZZLE_RGBA: - view->format = DXGI_FORMAT_R8G8B8A8_UNORM; - break; - case XE_GPU_SWIZZLE_BGRA: - view->format = DXGI_FORMAT_B8G8R8A8_UNORM; - break; - case XE_GPU_SWIZZLE_RGB1: - view->format = DXGI_FORMAT_R8G8B8A8_UNORM; // ? 
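
Texture::Fetch above reuses a cached view only when its fetch constant matches byte-for-byte and the guest memory still hashes to the same value; a sketch of that test (note the deleted code's length computation was still TODO, so the hash covered an empty range):

// Reuse test applied to each cached view; 'hash' is xe_hash64 over the
// guest texture bytes.
bool CanReuseView(const TextureView* view,
                  const xenos::xe_gpu_texture_fetch_t& fetch, uint64_t hash) {
  return memcmp(&view->fetch, &fetch, sizeof(fetch)) == 0 &&
         view->hash == hash;  // mismatch -> FetchDirty() re-upload
}
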
- break; - case XE_GPU_SWIZZLE_BGR1: - view->format = DXGI_FORMAT_B8G8R8X8_UNORM; - break; - default: - XELOGW("D3D11: unhandled swizzle for FMT_8_8_8_8"); - view->format = DXGI_FORMAT_R8G8B8A8_UNORM; - break; - } - view->block_size = 1; - view->texel_pitch = 4; - break; - case FMT_4_4_4_4: - switch (fetch.swizzle) { - case XE_GPU_SWIZZLE_BGRA: - view->format = DXGI_FORMAT_B4G4R4A4_UNORM; // only supported on Windows 8+ - break; - default: - XELOGW("D3D11: unhandled swizzle for FMT_4_4_4_4"); - view->format = DXGI_FORMAT_B4G4R4A4_UNORM; // only supported on Windows 8+ - break; - } - view->block_size = 1; - view->texel_pitch = 2; - break; - case FMT_16_16_16_16_FLOAT: - switch (fetch.swizzle) { - case XE_GPU_SWIZZLE_RGBA: - view->format = DXGI_FORMAT_R16G16B16A16_FLOAT; - break; - default: - XELOGW("D3D11: unhandled swizzle for FMT_16_16_16_16_FLOAT"); - view->format = DXGI_FORMAT_R16G16B16A16_FLOAT; - break; - } - view->block_size = 1; - view->texel_pitch = 8; - break; - case FMT_32_FLOAT: - switch (fetch.swizzle) { - case XE_GPU_SWIZZLE_R111: - view->format = DXGI_FORMAT_R32_FLOAT; - break; - default: - XELOGW("D3D11: unhandled swizzle for FMT_32_FLOAT"); - view->format = DXGI_FORMAT_R32_FLOAT; - break; - } - view->block_size = 1; - view->texel_pitch = 4; - break; - case FMT_DXT1: - view->format = DXGI_FORMAT_BC1_UNORM; - view->block_size = 4; - view->texel_pitch = 8; - view->is_compressed = true; - break; - case FMT_DXT2_3: - case FMT_DXT4_5: - view->format = (fetch.format == FMT_DXT4_5 ? DXGI_FORMAT_BC3_UNORM : DXGI_FORMAT_BC2_UNORM); - view->block_size = 4; - view->texel_pitch = 16; - view->is_compressed = true; - break; - case FMT_1_REVERSE: - case FMT_1: - case FMT_5_6_5: - case FMT_6_5_5: - case FMT_2_10_10_10: - case FMT_8_A: - case FMT_8_B: - case FMT_8_8: - case FMT_Cr_Y1_Cb_Y0: - case FMT_Y1_Cr_Y0_Cb: - case FMT_5_5_5_1: - case FMT_8_8_8_8_A: - case FMT_10_11_11: - case FMT_11_11_10: - case FMT_24_8: - case FMT_24_8_FLOAT: - case FMT_16: - case FMT_16_16: - case FMT_16_16_16_16: - case FMT_16_EXPAND: - case FMT_16_16_EXPAND: - case FMT_16_16_16_16_EXPAND: - case FMT_16_FLOAT: - case FMT_16_16_FLOAT: - case FMT_32: - case FMT_32_32: - case FMT_32_32_32_32: - case FMT_32_32_FLOAT: - case FMT_32_32_32_32_FLOAT: - case FMT_32_AS_8: - case FMT_32_AS_8_8: - case FMT_16_MPEG: - case FMT_16_16_MPEG: - case FMT_8_INTERLACED: - case FMT_32_AS_8_INTERLACED: - case FMT_32_AS_8_8_INTERLACED: - case FMT_16_INTERLACED: - case FMT_16_MPEG_INTERLACED: - case FMT_16_16_MPEG_INTERLACED: - case FMT_DXN: - case FMT_8_8_8_8_AS_16_16_16_16: - case FMT_DXT1_AS_16_16_16_16: - case FMT_DXT2_3_AS_16_16_16_16: - case FMT_DXT4_5_AS_16_16_16_16: - case FMT_2_10_10_10_AS_16_16_16_16: - case FMT_10_11_11_AS_16_16_16_16: - case FMT_11_11_10_AS_16_16_16_16: - case FMT_32_32_32_FLOAT: - case FMT_DXT3A: - case FMT_DXT5A: - case FMT_CTX1: - case FMT_DXT3A_AS_1_1_1_1: - view->format = DXGI_FORMAT_UNKNOWN; - break; - } - - if (view->format == DXGI_FORMAT_UNKNOWN) { - return false; - } - - switch (fetch.dimension) { - case DIMENSION_1D: - break; - case DIMENSION_2D: - view->sizes_2d = GetTextureSizes2D(view); - break; - case DIMENSION_3D: - break; - case DIMENSION_CUBE: - break; - } - return true; -} - -const TextureSizes2D Texture::GetTextureSizes2D(TextureView* view) { - TextureSizes2D sizes; - - sizes.logical_width = 1 + view->fetch.size_2d.width; - sizes.logical_height = 1 + view->fetch.size_2d.height; - - sizes.block_width = sizes.logical_width / view->block_size; - sizes.block_height = sizes.logical_height / 
view->block_size; - - if (!view->is_compressed) { - // must be 32x32, but also must have a pitch that is a multiple of 256 bytes - uint32_t bytes_per_block = view->block_size * view->block_size * - view->texel_pitch; - uint32_t width_multiple = 32; - if (bytes_per_block) { - uint32_t minimum_multiple = 256 / bytes_per_block; - if (width_multiple < minimum_multiple) { - width_multiple = minimum_multiple; - } - } - sizes.input_width = XEROUNDUP(sizes.logical_width, width_multiple); - sizes.input_height = XEROUNDUP(sizes.logical_height, 32); - sizes.output_width = sizes.logical_width; - sizes.output_height = sizes.logical_height; - } else { - // must be 128x128 - sizes.input_width = XEROUNDUP(sizes.logical_width, 128); - sizes.input_height = XEROUNDUP(sizes.logical_height, 128); - sizes.output_width = XENEXTPOW2(sizes.logical_width); - sizes.output_height = XENEXTPOW2(sizes.logical_height); - } - - sizes.logical_pitch = - (sizes.logical_width / view->block_size) * view->texel_pitch; - sizes.input_pitch = - (sizes.input_width / view->block_size) * view->texel_pitch; - - return sizes; -} - -void Texture::TextureSwap(uint8_t* dest, const uint8_t* src, uint32_t pitch, - XE_GPU_ENDIAN endianness) { - switch (endianness) { - case XE_GPU_ENDIAN_8IN16: - for (uint32_t i = 0; i < pitch; i += 2, src += 2, dest += 2) { - *(uint16_t*)dest = XESWAP16(*(uint16_t*)src); - } - break; - case XE_GPU_ENDIAN_8IN32: // Swap bytes. - for (uint32_t i = 0; i < pitch; i += 4, src += 4, dest += 4) { - *(uint32_t*)dest = XESWAP32(*(uint32_t*)src); - } - break; - case XE_GPU_ENDIAN_16IN32: // Swap half words. - for (uint32_t i = 0; i < pitch; i += 4, src += 4, dest += 4) { - uint32_t value = *(uint32_t*)src; - *(uint32_t*)dest = ((value >> 16) & 0xFFFF) | (value << 16); - } - break; - default: - case XE_GPU_ENDIAN_NONE: - memcpy(dest, src, pitch); - break; - } -} - -// https://code.google.com/p/crunch/source/browse/trunk/inc/crn_decomp.h#4104 -uint32_t Texture::TiledOffset2DOuter(uint32_t y, uint32_t width, - uint32_t log_bpp) { - uint32_t macro = ((y >> 5) * (width >> 5)) << (log_bpp + 7); - uint32_t micro = ((y & 6) << 2) << log_bpp; - return macro + - ((micro & ~15) << 1) + - (micro & 15) + - ((y & 8) << (3 + log_bpp)) + - ((y & 1) << 4); -} - -uint32_t Texture::TiledOffset2DInner(uint32_t x, uint32_t y, uint32_t bpp, - uint32_t base_offset) { - uint32_t macro = (x >> 5) << (bpp + 7); - uint32_t micro = (x & 7) << bpp; - uint32_t offset = base_offset + (macro + ((micro & ~15) << 1) + (micro & 15)); - return ((offset & ~511) << 3) + ((offset & 448) << 2) + (offset & 63) + - ((y & 16) << 7) + (((((y & 8) >> 2) + (x >> 3)) & 3) << 6); -} diff --git a/src/xenia/gpu/texture.h b/src/xenia/gpu/texture.h deleted file mode 100644 index 9b919a5d9..000000000 --- a/src/xenia/gpu/texture.h +++ /dev/null @@ -1,110 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2014 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. * - ****************************************************************************** - */ - -#ifndef XENIA_GPU_TEXTURE_H_ -#define XENIA_GPU_TEXTURE_H_ - -#include -#include - -// TODO(benvanik): replace DXGI constants with xenia constants. 
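
As a worked example of GetTextureSizes2D above, take a hypothetical uncompressed 100x100 texture with block_size = 1 and texel_pitch = 4:

// bytes_per_block  = 1 * 1 * 4 = 4
// minimum_multiple = 256 / 4 = 64, so width_multiple rises from 32 to 64
// input_width      = XEROUNDUP(100, 64) = 128
// input_height     = XEROUNDUP(100, 32) = 128
// logical_pitch    = (100 / 1) * 4 = 400 bytes
// input_pitch      = (128 / 1) * 4 = 512 bytes, a multiple of 256 as required
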
-#include - - -namespace xe { -namespace gpu { - - -class Texture; - -struct TextureSizes1D {}; -struct TextureSizes2D { - uint32_t logical_width; - uint32_t logical_height; - uint32_t block_width; - uint32_t block_height; - uint32_t input_width; - uint32_t input_height; - uint32_t output_width; - uint32_t output_height; - uint32_t logical_pitch; - uint32_t input_pitch; -}; -struct TextureSizes3D {}; -struct TextureSizesCube {}; - -struct TextureView { - Texture* texture; - xenos::xe_gpu_texture_fetch_t fetch; - uint64_t hash; - - union { - TextureSizes1D sizes_1d; - TextureSizes2D sizes_2d; - TextureSizes3D sizes_3d; - TextureSizesCube sizes_cube; - }; - - int dimensions; - uint32_t width; - uint32_t height; - uint32_t depth; - uint32_t block_size; - uint32_t texel_pitch; - bool is_compressed; - DXGI_FORMAT format; - - TextureView() - : texture(nullptr), - dimensions(0), - width(0), height(0), depth(0), - block_size(0), texel_pitch(0), - is_compressed(false), format(DXGI_FORMAT_UNKNOWN) {} -}; - - -class Texture { -public: - Texture(uint32_t address, const uint8_t* host_address); - virtual ~Texture(); - - TextureView* Fetch( - const xenos::xe_gpu_texture_fetch_t& fetch); - -protected: - bool FillViewInfo(TextureView* view, - const xenos::xe_gpu_texture_fetch_t& fetch); - - virtual TextureView* FetchNew( - const xenos::xe_gpu_texture_fetch_t& fetch) = 0; - virtual bool FetchDirty( - TextureView* view, const xenos::xe_gpu_texture_fetch_t& fetch) = 0; - - const TextureSizes2D GetTextureSizes2D(TextureView* view); - - static void TextureSwap(uint8_t* dest, const uint8_t* src, uint32_t pitch, - xenos::XE_GPU_ENDIAN endianness); - static uint32_t TiledOffset2DOuter(uint32_t y, uint32_t width, - uint32_t log_bpp); - static uint32_t TiledOffset2DInner(uint32_t x, uint32_t y, uint32_t bpp, - uint32_t base_offset); - - uint32_t address_; - const uint8_t* host_address_; - - // TODO(benvanik): replace with LRU keyed list. - std::vector views_; -}; - - -} // namespace gpu -} // namespace xe - - -#endif // XENIA_GPU_TEXTURE_H_ diff --git a/src/xenia/gpu/texture_cache.cc b/src/xenia/gpu/texture_cache.cc deleted file mode 100644 index 1f0a4a5ac..000000000 --- a/src/xenia/gpu/texture_cache.cc +++ /dev/null @@ -1,50 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2014 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. * - ****************************************************************************** - */ - -#include - -#include - - -using namespace xe; -using namespace xe::gpu; -using namespace xe::gpu::xenos; - - -// https://github.com/ivmai/bdwgc/blob/master/os_dep.c - -TextureCache::TextureCache(Memory* memory) - : memory_(memory) { -} - -TextureCache::~TextureCache() { - for (auto it = textures_.begin(); it != textures_.end(); ++it) { - auto texture = it->second; - delete texture; - } - textures_.clear(); -} - -TextureView* TextureCache::FetchTexture( - uint32_t address, const xenos::xe_gpu_texture_fetch_t& fetch) { - auto it = textures_.find(address); - if (it == textures_.end()) { - // Texture not found. 
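
The texture cache below is a two-level lookup: a guest address resolves to a Texture, and the Texture resolves a fetch constant plus content hash to a concrete view. The outer map's shape (types follow from the Translate/insert calls):

// Keyed by guest address; each Texture owns its TextureView list.
std::unordered_map<uint32_t, Texture*> textures_;
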
- const uint8_t* host_address = memory_->Translate(address); - auto texture = CreateTexture(address, host_address, fetch); - if (!texture) { - return nullptr; - } - textures_.insert({ address, texture }); - return texture->Fetch(fetch); - } else { - // Texture found. - return it->second->Fetch(fetch); - } -} diff --git a/src/xenia/gpu/texture_cache.h b/src/xenia/gpu/texture_cache.h deleted file mode 100644 index 285ffe1d7..000000000 --- a/src/xenia/gpu/texture_cache.h +++ /dev/null @@ -1,50 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2014 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. * - ****************************************************************************** - */ - -#ifndef XENIA_GPU_TEXTURE_CACHE_H_ -#define XENIA_GPU_TEXTURE_CACHE_H_ - -#include -#include -#include - - -namespace xe { -namespace gpu { - - -// TODO(benvanik): overlapping textures. -// TODO(benvanik): multiple textures (differing formats/etc) per address. -class TextureCache { -public: - TextureCache(Memory* memory); - virtual ~TextureCache(); - - Memory* memory() const { return memory_; } - - TextureView* FetchTexture( - uint32_t address, const xenos::xe_gpu_texture_fetch_t& fetch); - -protected: - virtual Texture* CreateTexture( - uint32_t address, const uint8_t* host_address, - const xenos::xe_gpu_texture_fetch_t& fetch) = 0; - - Memory* memory_; - - // Mapped by guest address. - std::unordered_map textures_; -}; - - -} // namespace gpu -} // namespace xe - - -#endif // XENIA_GPU_TEXTURE_CACHE_H_ diff --git a/src/xenia/gpu/texture_resource.cc b/src/xenia/gpu/texture_resource.cc index 5875e76f3..d063acc56 100644 --- a/src/xenia/gpu/texture_resource.cc +++ b/src/xenia/gpu/texture_resource.cc @@ -9,9 +9,342 @@ #include +#include +#include + using namespace std; using namespace xe; using namespace xe::gpu; using namespace xe::gpu::xenos; + +bool TextureResource::Info::Prepare(const xe_gpu_texture_fetch_t& fetch, + Info& info) { + // http://msdn.microsoft.com/en-us/library/windows/desktop/cc308051(v=vs.85).aspx + // a2xx_sq_surfaceformat + + info.dimension = (TextureDimension)fetch.dimension; + switch (info.dimension) { + case TEXTURE_DIMENSION_1D: + info.width = fetch.size_1d.width; + break; + case TEXTURE_DIMENSION_2D: + info.width = fetch.size_2d.width; + info.height = fetch.size_2d.height; + break; + case TEXTURE_DIMENSION_3D: + case TEXTURE_DIMENSION_CUBE: + info.width = fetch.size_3d.width; + info.height = fetch.size_3d.height; + info.depth = fetch.size_3d.depth; + break; + } + info.block_size = 0; + info.texel_pitch = 0; + info.endianness = (XE_GPU_ENDIAN)fetch.endianness; + info.is_tiled = fetch.tiled; + info.is_compressed = false; + info.input_length = 0; + info.format = DXGI_FORMAT_UNKNOWN; + switch (fetch.format) { + case FMT_8: + switch (fetch.swizzle) { + case XE_GPU_SWIZZLE_RRR1: + info.format = DXGI_FORMAT_R8_UNORM; + break; + case XE_GPU_SWIZZLE_000R: + info.format = DXGI_FORMAT_A8_UNORM; + break; + default: + XELOGW("D3D11: unhandled swizzle for FMT_8"); + info.format = DXGI_FORMAT_A8_UNORM; + break; + } + info.block_size = 1; + info.texel_pitch = 1; + break; + case FMT_1_5_5_5: + switch (fetch.swizzle) { + case XE_GPU_SWIZZLE_BGRA: + info.format = DXGI_FORMAT_B5G5R5A1_UNORM; + break; + default: + XELOGW("D3D11: unhandled swizzle for 
FMT_1_5_5_5"); + info.format = DXGI_FORMAT_B5G5R5A1_UNORM; + break; + } + info.block_size = 1; + info.texel_pitch = 2; + break; + case FMT_8_8_8_8: + switch (fetch.swizzle) { + case XE_GPU_SWIZZLE_RGBA: + info.format = DXGI_FORMAT_R8G8B8A8_UNORM; + break; + case XE_GPU_SWIZZLE_BGRA: + info.format = DXGI_FORMAT_B8G8R8A8_UNORM; + break; + case XE_GPU_SWIZZLE_RGB1: + info.format = DXGI_FORMAT_R8G8B8A8_UNORM; // ? + break; + case XE_GPU_SWIZZLE_BGR1: + info.format = DXGI_FORMAT_B8G8R8X8_UNORM; + break; + default: + XELOGW("D3D11: unhandled swizzle for FMT_8_8_8_8"); + info.format = DXGI_FORMAT_R8G8B8A8_UNORM; + break; + } + info.block_size = 1; + info.texel_pitch = 4; + break; + case FMT_4_4_4_4: + switch (fetch.swizzle) { + case XE_GPU_SWIZZLE_BGRA: + info.format = DXGI_FORMAT_B4G4R4A4_UNORM; // only supported on Windows 8+ + break; + default: + XELOGW("D3D11: unhandled swizzle for FMT_4_4_4_4"); + info.format = DXGI_FORMAT_B4G4R4A4_UNORM; // only supported on Windows 8+ + break; + } + info.block_size = 1; + info.texel_pitch = 2; + break; + case FMT_16_16_16_16_FLOAT: + switch (fetch.swizzle) { + case XE_GPU_SWIZZLE_RGBA: + info.format = DXGI_FORMAT_R16G16B16A16_FLOAT; + break; + default: + XELOGW("D3D11: unhandled swizzle for FMT_16_16_16_16_FLOAT"); + info.format = DXGI_FORMAT_R16G16B16A16_FLOAT; + break; + } + info.block_size = 1; + info.texel_pitch = 8; + break; + case FMT_32_FLOAT: + switch (fetch.swizzle) { + case XE_GPU_SWIZZLE_R111: + info.format = DXGI_FORMAT_R32_FLOAT; + break; + default: + XELOGW("D3D11: unhandled swizzle for FMT_32_FLOAT"); + info.format = DXGI_FORMAT_R32_FLOAT; + break; + } + info.block_size = 1; + info.texel_pitch = 4; + break; + case FMT_DXT1: + info.format = DXGI_FORMAT_BC1_UNORM; + info.block_size = 4; + info.texel_pitch = 8; + info.is_compressed = true; + break; + case FMT_DXT2_3: + case FMT_DXT4_5: + info.format = (fetch.format == FMT_DXT4_5 ? DXGI_FORMAT_BC3_UNORM : DXGI_FORMAT_BC2_UNORM); + info.block_size = 4; + info.texel_pitch = 16; + info.is_compressed = true; + break; + case FMT_1_REVERSE: + case FMT_1: + case FMT_5_6_5: + case FMT_6_5_5: + case FMT_2_10_10_10: + case FMT_8_A: + case FMT_8_B: + case FMT_8_8: + case FMT_Cr_Y1_Cb_Y0: + case FMT_Y1_Cr_Y0_Cb: + case FMT_5_5_5_1: + case FMT_8_8_8_8_A: + case FMT_10_11_11: + case FMT_11_11_10: + case FMT_24_8: + case FMT_24_8_FLOAT: + case FMT_16: + case FMT_16_16: + case FMT_16_16_16_16: + case FMT_16_EXPAND: + case FMT_16_16_EXPAND: + case FMT_16_16_16_16_EXPAND: + case FMT_16_FLOAT: + case FMT_16_16_FLOAT: + case FMT_32: + case FMT_32_32: + case FMT_32_32_32_32: + case FMT_32_32_FLOAT: + case FMT_32_32_32_32_FLOAT: + case FMT_32_AS_8: + case FMT_32_AS_8_8: + case FMT_16_MPEG: + case FMT_16_16_MPEG: + case FMT_8_INTERLACED: + case FMT_32_AS_8_INTERLACED: + case FMT_32_AS_8_8_INTERLACED: + case FMT_16_INTERLACED: + case FMT_16_MPEG_INTERLACED: + case FMT_16_16_MPEG_INTERLACED: + case FMT_DXN: + case FMT_8_8_8_8_AS_16_16_16_16: + case FMT_DXT1_AS_16_16_16_16: + case FMT_DXT2_3_AS_16_16_16_16: + case FMT_DXT4_5_AS_16_16_16_16: + case FMT_2_10_10_10_AS_16_16_16_16: + case FMT_10_11_11_AS_16_16_16_16: + case FMT_11_11_10_AS_16_16_16_16: + case FMT_32_32_32_FLOAT: + case FMT_DXT3A: + case FMT_DXT5A: + case FMT_CTX1: + case FMT_DXT3A_AS_1_1_1_1: + info.format = DXGI_FORMAT_UNKNOWN; + break; + } + + if (info.format == DXGI_FORMAT_UNKNOWN) { + return false; + } + + // Must be called here when we know the format. 
+ switch (info.dimension) { + case TEXTURE_DIMENSION_1D: + info.CalculateTextureSizes1D(fetch); + break; + case TEXTURE_DIMENSION_2D: + info.CalculateTextureSizes2D(fetch); + break; + case TEXTURE_DIMENSION_3D: + // TODO(benvanik): calculate size. + return false; + case TEXTURE_DIMENSION_CUBE: + // TODO(benvanik): calculate size. + return false; + } + return true; +} + +void TextureResource::Info::CalculateTextureSizes1D( + const xe_gpu_texture_fetch_t& fetch) { + // ? + size_1d.width = fetch.size_1d.width; +} + +void TextureResource::Info::CalculateTextureSizes2D( + const xe_gpu_texture_fetch_t& fetch) { + size_2d.logical_width = 1 + fetch.size_2d.width; + size_2d.logical_height = 1 + fetch.size_2d.height; + + size_2d.block_width = size_2d.logical_width / block_size; + size_2d.block_height = size_2d.logical_height / block_size; + + if (!is_compressed) { + // must be 32x32 but also must have a pitch that is a multiple of 256 bytes + uint32_t bytes_per_block = block_size * block_size * texel_pitch; + uint32_t width_multiple = 32; + if (bytes_per_block) { + uint32_t minimum_multiple = 256 / bytes_per_block; + if (width_multiple < minimum_multiple) { + width_multiple = minimum_multiple; + } + } + size_2d.input_width = XEROUNDUP(size_2d.logical_width, width_multiple); + size_2d.input_height = XEROUNDUP(size_2d.logical_height, 32); + size_2d.output_width = size_2d.logical_width; + size_2d.output_height = size_2d.logical_height; + } else { + // must be 128x128 + size_2d.input_width = XEROUNDUP(size_2d.logical_width, 128); + size_2d.input_height = XEROUNDUP(size_2d.logical_height, 128); + size_2d.output_width = XENEXTPOW2(size_2d.logical_width); + size_2d.output_height = XENEXTPOW2(size_2d.logical_height); + } + + size_2d.logical_pitch = (size_2d.logical_width / block_size) * texel_pitch; + size_2d.input_pitch = (size_2d.input_width / block_size) * texel_pitch; + + if (!is_tiled) { + input_length = size_2d.block_height * size_2d.logical_pitch; + } else { + input_length = size_2d.block_height * size_2d.logical_pitch; // ? + } +} + +TextureResource::TextureResource(const MemoryRange& memory_range, + const Info& info) + : PagedResource(memory_range), + info_(info) { +} + +TextureResource::~TextureResource() { +} + +int TextureResource::Prepare() { + if (!handle()) { + if (CreateHandle()) { + XELOGE("Unable to create texture handle"); + return 1; + } + } + + if (!dirtied_) { + return 0; + } + dirtied_ = false; + + // pass dirty regions? + return InvalidateRegion(memory_range_); +} + +void TextureResource::TextureSwap(uint8_t* dest, const uint8_t* src, + uint32_t pitch) const { + // TODO(benvanik): optimize swapping paths. + switch (info_.endianness) { + case XE_GPU_ENDIAN_8IN16: + for (uint32_t i = 0; i < pitch; i += 2, src += 2, dest += 2) { + *(uint16_t*)dest = XESWAP16(*(uint16_t*)src); + } + break; + case XE_GPU_ENDIAN_8IN32: // Swap bytes. + for (uint32_t i = 0; i < pitch; i += 4, src += 4, dest += 4) { + *(uint32_t*)dest = XESWAP32(*(uint32_t*)src); + } + break; + case XE_GPU_ENDIAN_16IN32: // Swap half words. 
+ for (uint32_t i = 0; i < pitch; i += 4, src += 4, dest += 4) { + uint32_t value = *(uint32_t*)src; + *(uint32_t*)dest = ((value >> 16) & 0xFFFF) | (value << 16); + } + break; + default: + case XE_GPU_ENDIAN_NONE: + memcpy(dest, src, pitch); + break; + } +} + +// https://code.google.com/p/crunch/source/browse/trunk/inc/crn_decomp.h#4104 +uint32_t TextureResource::TiledOffset2DOuter(uint32_t y, uint32_t width, + uint32_t log_bpp) const { + uint32_t macro = ((y >> 5) * (width >> 5)) << (log_bpp + 7); + uint32_t micro = ((y & 6) << 2) << log_bpp; + return macro + + ((micro & ~15) << 1) + + (micro & 15) + + ((y & 8) << (3 + log_bpp)) + + ((y & 1) << 4); +} + +uint32_t TextureResource::TiledOffset2DInner(uint32_t x, uint32_t y, uint32_t bpp, + uint32_t base_offset) const { + uint32_t macro = (x >> 5) << (bpp + 7); + uint32_t micro = (x & 7) << bpp; + uint32_t offset = base_offset + (macro + ((micro & ~15) << 1) + (micro & 15)); + return ((offset & ~511) << 3) + ((offset & 448) << 2) + (offset & 63) + + ((y & 16) << 7) + (((((y & 8) >> 2) + (x >> 3)) & 3) << 6); +} diff --git a/src/xenia/gpu/texture_resource.h b/src/xenia/gpu/texture_resource.h index 35f83bcda..57dc63422 100644 --- a/src/xenia/gpu/texture_resource.h +++ b/src/xenia/gpu/texture_resource.h @@ -10,7 +10,7 @@ #ifndef XENIA_GPU_TEXTURE_RESOURCE_H_ #define XENIA_GPU_TEXTURE_RESOURCE_H_ -#include +#include #include // TODO(benvanik): replace DXGI constants with xenia constants. @@ -21,8 +21,85 @@ namespace xe { namespace gpu { -class TextureResource : public Resource { +enum TextureDimension { + TEXTURE_DIMENSION_1D = 0, + TEXTURE_DIMENSION_2D = 1, + TEXTURE_DIMENSION_3D = 2, + TEXTURE_DIMENSION_CUBE = 3, +}; + + +class TextureResource : public PagedResource { public: + struct Info { + TextureDimension dimension; + uint32_t width; + uint32_t height; + uint32_t depth; + uint32_t block_size; + uint32_t texel_pitch; + xenos::XE_GPU_ENDIAN endianness; + bool is_tiled; + bool is_compressed; + uint32_t input_length; + + // TODO(benvanik): replace with our own constants. 
+ DXGI_FORMAT format; + + union { + struct { + uint32_t width; + } size_1d; + struct { + uint32_t logical_width; + uint32_t logical_height; + uint32_t block_width; + uint32_t block_height; + uint32_t input_width; + uint32_t input_height; + uint32_t output_width; + uint32_t output_height; + uint32_t logical_pitch; + uint32_t input_pitch; + } size_2d; + struct { + } size_3d; + struct { + } size_cube; + }; + + static bool Prepare(const xenos::xe_gpu_texture_fetch_t& fetch, + Info& out_info); + + private: + void CalculateTextureSizes1D(const xenos::xe_gpu_texture_fetch_t& fetch); + void CalculateTextureSizes2D(const xenos::xe_gpu_texture_fetch_t& fetch); + }; + + TextureResource(const MemoryRange& memory_range, + const Info& info); + ~TextureResource() override; + + const Info& info() const { return info_; } + + bool Equals(const void* info_ptr, size_t info_length) override { + return info_length == sizeof(Info) && + memcmp(info_ptr, &info_, info_length) == 0; + } + + virtual int Prepare(); + +protected: + virtual int CreateHandle() = 0; + virtual int InvalidateRegion(const MemoryRange& memory_range) = 0; + + void TextureSwap(uint8_t* dest, const uint8_t* src, uint32_t pitch) const; + uint32_t TiledOffset2DOuter(uint32_t y, uint32_t width, + uint32_t log_bpp) const; + uint32_t TiledOffset2DInner(uint32_t x, uint32_t y, uint32_t bpp, + uint32_t base_offset) const; + + Info info_; }; diff --git a/src/xenia/gpu/xenos/registers.h b/src/xenia/gpu/xenos/registers.h deleted file mode 100644 index 39a0d43db..000000000 --- a/src/xenia/gpu/xenos/registers.h +++ /dev/null @@ -1,51 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. * - ****************************************************************************** - */ - -#ifndef XENIA_GPU_XENOS_REGISTERS_H_ -#define XENIA_GPU_XENOS_REGISTERS_H_ - -#include - - -namespace xe { -namespace gpu { -namespace xenos { - - -static const uint32_t kXEGpuRegisterCount = 0x5003; - - -enum Registers { -#define XE_GPU_REGISTER(index, type, name) \ - XE_GPU_REG_##name = index, -#include -#undef XE_GPU_REGISTER -}; - - -const char* GetRegisterName(uint32_t index); - - -union RegisterValue { - uint32_t u32; - float f32; -}; - - -struct RegisterFile { - RegisterValue values[kXEGpuRegisterCount]; -}; - - -} // namespace xenos -} // namespace gpu -} // namespace xe - - -#endif // XENIA_GPU_XENOS_REGISTERS_H_ diff --git a/src/xenia/gpu/xenos/sources.gypi b/src/xenia/gpu/xenos/sources.gypi index c1f677682..998444938 100644 --- a/src/xenia/gpu/xenos/sources.gypi +++ b/src/xenia/gpu/xenos/sources.gypi @@ -3,8 +3,6 @@ 'sources': [ 'packets.h', 'register_table.inc', - 'registers.cc', - 'registers.h', 'ucode.h', 'ucode_disassembler.cc', 'ucode_disassembler.h', From 6514eaa78084be9c992cd7d04e27d6b855114e78 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 8 Jun 2014 11:25:10 -0700 Subject: [PATCH 151/184] Disabling paged resource caching for now, as it's broken. 
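For context on what is being disabled here: the dirtied_ fast path depends on a
write-watch table kept by the memory system, one byte per 16 KB guest page
(32768 entries covering the 512 MB space, per the constants in
resource_cache.cc below). A minimal sketch of the intended check, with
hypothetical names:

bool AnyPageDirty(const uint8_t* page_table, uint32_t guest_base,
                  uint32_t length) {
  const uint32_t kPageSize = 16 * 1024;  // matches resource_cache.cc
  uint32_t lo_address = guest_base % 0x20000000;
  uint32_t start_page = lo_address / kPageSize;
  uint32_t end_page = (lo_address + length) / kPageSize;
  for (uint32_t page = start_page; page <= end_page; ++page) {
    if (page_table[page]) {
      return true;  // the CPU wrote this page since the last sync
    }
  }
  return false;
}

With the early-outs commented away, every Prepare() re-uploads its whole
range, trading speed for correctness until the tracking is fixed.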
--- src/xenia/gpu/buffer_resource.cc | 7 ++++--- src/xenia/gpu/resource_cache.cc | 3 +++ src/xenia/gpu/texture_resource.cc | 7 ++++--- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/xenia/gpu/buffer_resource.cc b/src/xenia/gpu/buffer_resource.cc index 9f9accb9b..949bfe02b 100644 --- a/src/xenia/gpu/buffer_resource.cc +++ b/src/xenia/gpu/buffer_resource.cc @@ -30,9 +30,10 @@ int BufferResource::Prepare() { } } - if (!dirtied_) { - return 0; - } + // DISABLED + //if (!dirtied_) { + // return 0; + //} dirtied_ = false; // pass dirty regions? diff --git a/src/xenia/gpu/resource_cache.cc b/src/xenia/gpu/resource_cache.cc index c317a12be..46eec8f0b 100644 --- a/src/xenia/gpu/resource_cache.cc +++ b/src/xenia/gpu/resource_cache.cc @@ -118,6 +118,9 @@ void ResourceCache::SyncRange(uint32_t address, int length) { // will not be changing, which allows us to do a foreach(res) and reload // and then clear the table. + // DISABLED + return; + // total bytes = (512 * 1024 * 1024) / (16 * 1024) = 32768 // each byte = 1 page // Walk as qwords so we can clear things up faster. diff --git a/src/xenia/gpu/texture_resource.cc b/src/xenia/gpu/texture_resource.cc index d063acc56..e7cfdee0d 100644 --- a/src/xenia/gpu/texture_resource.cc +++ b/src/xenia/gpu/texture_resource.cc @@ -292,9 +292,10 @@ int TextureResource::Prepare() { } } - if (!dirtied_) { - return 0; - } + // DISABLED + //if (!dirtied_) { + // return 0; + //} dirtied_ = false; // pass dirty regions? From daa8a241736cdd42c834e6681de70c8bdb7f54c0 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 8 Jun 2014 11:51:53 -0700 Subject: [PATCH 152/184] Caching states. --- src/xenia/gpu/d3d11/d3d11_graphics_driver.cc | 336 +++++++++++-------- src/xenia/gpu/d3d11/d3d11_graphics_driver.h | 7 + 2 files changed, 205 insertions(+), 138 deletions(-) diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc b/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc index a671b4626..4ea0b6210 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc +++ b/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc @@ -71,6 +71,18 @@ D3D11GraphicsDriver::~D3D11GraphicsDriver() { XESAFERELEASE(state_.constant_buffers.loop_constants); XESAFERELEASE(state_.constant_buffers.vs_consts); XESAFERELEASE(state_.constant_buffers.gs_consts); + for (auto it = rasterizer_state_cache_.begin(); + it != rasterizer_state_cache_.end(); ++it) { + XESAFERELEASE(it->second); + } + for (auto it = blend_state_cache_.begin(); + it != blend_state_cache_.end(); ++it) { + XESAFERELEASE(it->second); + } + for (auto it = depth_stencil_state_cache_.begin(); + it != depth_stencil_state_cache_.end(); ++it) { + XESAFERELEASE(it->second); + } XESAFERELEASE(invalid_texture_view_); XESAFERELEASE(invalid_texture_sampler_state_); delete resource_cache_; @@ -271,39 +283,6 @@ int D3D11GraphicsDriver::UpdateState(const DrawCommand& command) { // TODO(benvanik): only enable the number of valid render targets. context_->OMSetRenderTargets(4, render_target_views, depth_stencil_view); - // General rasterizer state. 
- uint32_t mode_control = register_file_[XE_GPU_REG_PA_SU_SC_MODE_CNTL].u32; - D3D11_RASTERIZER_DESC rasterizer_desc; - xe_zero_struct(&rasterizer_desc, sizeof(rasterizer_desc)); - rasterizer_desc.FillMode = D3D11_FILL_SOLID; // D3D11_FILL_WIREFRAME; - switch (mode_control & 0x3) { - case 0: - rasterizer_desc.CullMode = D3D11_CULL_NONE; - break; - case 1: - rasterizer_desc.CullMode = D3D11_CULL_FRONT; - break; - case 2: - rasterizer_desc.CullMode = D3D11_CULL_BACK; - break; - } - if (command.prim_type == XE_GPU_PRIMITIVE_TYPE_RECTANGLE_LIST) { - // Rect lists aren't culled. There may be other things they skip too. - rasterizer_desc.CullMode = D3D11_CULL_NONE; - } - rasterizer_desc.FrontCounterClockwise = (mode_control & 0x4) == 0; - rasterizer_desc.DepthBias = 0; - rasterizer_desc.DepthBiasClamp = 0; - rasterizer_desc.SlopeScaledDepthBias = 0; - rasterizer_desc.DepthClipEnable = false; // ? - rasterizer_desc.ScissorEnable = false; - rasterizer_desc.MultisampleEnable = false; - rasterizer_desc.AntialiasedLineEnable = false; - ID3D11RasterizerState* rasterizer_state = 0; - device_->CreateRasterizerState(&rasterizer_desc, &rasterizer_state); - context_->RSSetState(rasterizer_state); - XESAFERELEASE(rasterizer_state); - // Viewport. // If we have resized the window we will want to change this. uint32_t window_offset = register_file_[XE_GPU_REG_PA_SC_WINDOW_OFFSET].u32; @@ -391,71 +370,67 @@ int D3D11GraphicsDriver::UpdateState(const DrawCommand& command) { context_->RSSetScissorRects(0, NULL); } - static const D3D11_COMPARISON_FUNC compare_func_map[] = { - /* 0 */ D3D11_COMPARISON_NEVER, - /* 1 */ D3D11_COMPARISON_LESS, - /* 2 */ D3D11_COMPARISON_EQUAL, - /* 3 */ D3D11_COMPARISON_LESS_EQUAL, - /* 4 */ D3D11_COMPARISON_GREATER, - /* 5 */ D3D11_COMPARISON_NOT_EQUAL, - /* 6 */ D3D11_COMPARISON_GREATER_EQUAL, - /* 7 */ D3D11_COMPARISON_ALWAYS, - }; - static const D3D11_STENCIL_OP stencil_op_map[] = { - /* 0 */ D3D11_STENCIL_OP_KEEP, - /* 1 */ D3D11_STENCIL_OP_ZERO, - /* 2 */ D3D11_STENCIL_OP_REPLACE, - /* 3 */ D3D11_STENCIL_OP_INCR_SAT, - /* 4 */ D3D11_STENCIL_OP_DECR_SAT, - /* 5 */ D3D11_STENCIL_OP_INVERT, - /* 6 */ D3D11_STENCIL_OP_INCR, - /* 7 */ D3D11_STENCIL_OP_DECR, - }; + if (SetupRasterizerState(command)) { + XELOGE("Unable to setup rasterizer state"); + return 1; + } + if (SetupBlendState(command)) { + XELOGE("Unable to setup blend state"); + return 1; + } + if (SetupDepthStencilState(command)) { + XELOGE("Unable to setup depth/stencil state"); + return 1; + } - // Depth-stencil state. - uint32_t depth_control = register_file_[XE_GPU_REG_RB_DEPTHCONTROL].u32; - uint32_t stencil_ref_mask = register_file_[XE_GPU_REG_RB_STENCILREFMASK].u32; - D3D11_DEPTH_STENCIL_DESC depth_stencil_desc; - xe_zero_struct(&depth_stencil_desc, sizeof(depth_stencil_desc)); - // A2XX_RB_DEPTHCONTROL_BACKFACE_ENABLE - // ? - // A2XX_RB_DEPTHCONTROL_Z_ENABLE - depth_stencil_desc.DepthEnable = (depth_control & 0x00000002) != 0; - // A2XX_RB_DEPTHCONTROL_Z_WRITE_ENABLE - depth_stencil_desc.DepthWriteMask = (depth_control & 0x00000004) ? D3D11_DEPTH_WRITE_MASK_ALL : D3D11_DEPTH_WRITE_MASK_ZERO; - // A2XX_RB_DEPTHCONTROL_EARLY_Z_ENABLE - // ? 
- // A2XX_RB_DEPTHCONTROL_ZFUNC - depth_stencil_desc.DepthFunc = compare_func_map[(depth_control & 0x00000070) >> 4]; - // A2XX_RB_DEPTHCONTROL_STENCIL_ENABLE - depth_stencil_desc.StencilEnable = (depth_control & 0x00000001) != 0; - // RB_STENCILREFMASK_STENCILMASK - depth_stencil_desc.StencilReadMask = (stencil_ref_mask & 0x0000FF00) >> 8; - // RB_STENCILREFMASK_STENCILWRITEMASK - depth_stencil_desc.StencilWriteMask = (stencil_ref_mask & 0x00FF0000) >> 16; - // A2XX_RB_DEPTHCONTROL_STENCILFUNC - depth_stencil_desc.FrontFace.StencilFunc = compare_func_map[(depth_control & 0x00000700) >> 8]; - // A2XX_RB_DEPTHCONTROL_STENCILFAIL - depth_stencil_desc.FrontFace.StencilFailOp = stencil_op_map[(depth_control & 0x00003800) >> 11]; - // A2XX_RB_DEPTHCONTROL_STENCILZPASS - depth_stencil_desc.FrontFace.StencilPassOp = stencil_op_map[(depth_control & 0x0001C000) >> 14]; - // A2XX_RB_DEPTHCONTROL_STENCILZFAIL - depth_stencil_desc.FrontFace.StencilDepthFailOp = stencil_op_map[(depth_control & 0x000E0000) >> 17]; - // A2XX_RB_DEPTHCONTROL_STENCILFUNC_BF - depth_stencil_desc.BackFace.StencilFunc = compare_func_map[(depth_control & 0x00700000) >> 20]; - // A2XX_RB_DEPTHCONTROL_STENCILFAIL_BF - depth_stencil_desc.BackFace.StencilFailOp = stencil_op_map[(depth_control & 0x03800000) >> 23]; - // A2XX_RB_DEPTHCONTROL_STENCILZPASS_BF - depth_stencil_desc.BackFace.StencilPassOp = stencil_op_map[(depth_control & 0x1C000000) >> 26]; - // A2XX_RB_DEPTHCONTROL_STENCILZFAIL_BF - depth_stencil_desc.BackFace.StencilDepthFailOp = stencil_op_map[(depth_control & 0xE0000000) >> 29]; - // RB_STENCILREFMASK_STENCILREF - uint32_t stencil_ref = (stencil_ref_mask & 0x000000FF); - ID3D11DepthStencilState* depth_stencil_state = 0; - device_->CreateDepthStencilState(&depth_stencil_desc, &depth_stencil_state); - context_->OMSetDepthStencilState(depth_stencil_state, stencil_ref); - XESAFERELEASE(depth_stencil_state); + return 0; +} +int D3D11GraphicsDriver::SetupRasterizerState(const DrawCommand& command) { + uint32_t mode_control = register_file_[XE_GPU_REG_PA_SU_SC_MODE_CNTL].u32; + + // Check cache. + uint64_t key = hash_combine(mode_control); + ID3D11RasterizerState* rasterizer_state = nullptr; + auto it = rasterizer_state_cache_.find(key); + if (it == rasterizer_state_cache_.end()) { + D3D11_RASTERIZER_DESC rasterizer_desc; + xe_zero_struct(&rasterizer_desc, sizeof(rasterizer_desc)); + rasterizer_desc.FillMode = D3D11_FILL_SOLID; // D3D11_FILL_WIREFRAME; + switch (mode_control & 0x3) { + case 0: + rasterizer_desc.CullMode = D3D11_CULL_NONE; + break; + case 1: + rasterizer_desc.CullMode = D3D11_CULL_FRONT; + break; + case 2: + rasterizer_desc.CullMode = D3D11_CULL_BACK; + break; + } + if (command.prim_type == XE_GPU_PRIMITIVE_TYPE_RECTANGLE_LIST) { + // Rect lists aren't culled. There may be other things they skip too. + rasterizer_desc.CullMode = D3D11_CULL_NONE; + } + rasterizer_desc.FrontCounterClockwise = (mode_control & 0x4) == 0; + rasterizer_desc.DepthBias = 0; + rasterizer_desc.DepthBiasClamp = 0; + rasterizer_desc.SlopeScaledDepthBias = 0; + rasterizer_desc.DepthClipEnable = false; // ? 
+ rasterizer_desc.ScissorEnable = false; + rasterizer_desc.MultisampleEnable = false; + rasterizer_desc.AntialiasedLineEnable = false; + device_->CreateRasterizerState(&rasterizer_desc, &rasterizer_state); + rasterizer_state_cache_.insert({ key, rasterizer_state }); + } else { + rasterizer_state = it->second; + } + + context_->RSSetState(rasterizer_state); + return 0; +} + +int D3D11GraphicsDriver::SetupBlendState(const DrawCommand& command) { static const D3D11_BLEND blend_map[] = { /* 0 */ D3D11_BLEND_ZERO, /* 1 */ D3D11_BLEND_ONE, @@ -488,56 +463,141 @@ int D3D11GraphicsDriver::UpdateState(const DrawCommand& command) { // http://msdn.microsoft.com/en-us/library/windows/desktop/bb205120(v=vs.85).aspx uint32_t color_control = register_file_[XE_GPU_REG_RB_COLORCONTROL].u32; - // Blend state. uint32_t color_mask = register_file_[XE_GPU_REG_RB_COLOR_MASK].u32; - uint32_t sample_mask = 0xFFFFFFFF; // ? - float blend_factor[4] = { - register_file_[XE_GPU_REG_RB_BLEND_RED].f32, - register_file_[XE_GPU_REG_RB_BLEND_GREEN].f32, - register_file_[XE_GPU_REG_RB_BLEND_BLUE].f32, - register_file_[XE_GPU_REG_RB_BLEND_ALPHA].f32, - }; uint32_t blend_control[4] = { register_file_[XE_GPU_REG_RB_BLENDCONTROL_0].u32, register_file_[XE_GPU_REG_RB_BLENDCONTROL_1].u32, register_file_[XE_GPU_REG_RB_BLENDCONTROL_2].u32, register_file_[XE_GPU_REG_RB_BLENDCONTROL_3].u32, }; - D3D11_BLEND_DESC blend_desc; - xe_zero_struct(&blend_desc, sizeof(blend_desc)); - //blend_desc.AlphaToCoverageEnable = false; - // ? - blend_desc.IndependentBlendEnable = true; - for (int n = 0; n < XECOUNT(blend_control); n++) { - // A2XX_RB_BLEND_CONTROL_COLOR_SRCBLEND - blend_desc.RenderTarget[n].SrcBlend = blend_map[(blend_control[n] & 0x0000001F) >> 0]; - // A2XX_RB_BLEND_CONTROL_COLOR_DESTBLEND - blend_desc.RenderTarget[n].DestBlend = blend_map[(blend_control[n] & 0x00001F00) >> 8]; - // A2XX_RB_BLEND_CONTROL_COLOR_COMB_FCN - blend_desc.RenderTarget[n].BlendOp = blend_op_map[(blend_control[n] & 0x000000E0) >> 5]; - // A2XX_RB_BLEND_CONTROL_ALPHA_SRCBLEND - blend_desc.RenderTarget[n].SrcBlendAlpha = blend_map[(blend_control[n] & 0x001F0000) >> 16]; - // A2XX_RB_BLEND_CONTROL_ALPHA_DESTBLEND - blend_desc.RenderTarget[n].DestBlendAlpha = blend_map[(blend_control[n] & 0x1F000000) >> 24]; - // A2XX_RB_BLEND_CONTROL_ALPHA_COMB_FCN - blend_desc.RenderTarget[n].BlendOpAlpha = blend_op_map[(blend_control[n] & 0x00E00000) >> 21]; - // A2XX_RB_COLOR_MASK_WRITE_* - blend_desc.RenderTarget[n].RenderTargetWriteMask = (color_mask >> (n * 4)) & 0xF; - // A2XX_RB_COLORCONTROL_BLEND_DISABLE ?? Can't find this! - // Just guess based on actions. - blend_desc.RenderTarget[n].BlendEnable = !( - (blend_desc.RenderTarget[n].SrcBlend == D3D11_BLEND_ONE) && - (blend_desc.RenderTarget[n].DestBlend == D3D11_BLEND_ZERO) && - (blend_desc.RenderTarget[n].BlendOp == D3D11_BLEND_OP_ADD) && - (blend_desc.RenderTarget[n].SrcBlendAlpha == D3D11_BLEND_ONE) && - (blend_desc.RenderTarget[n].DestBlendAlpha == D3D11_BLEND_ZERO) && - (blend_desc.RenderTarget[n].BlendOpAlpha == D3D11_BLEND_OP_ADD)); - } - ID3D11BlendState* blend_state = 0; - device_->CreateBlendState(&blend_desc, &blend_state); - context_->OMSetBlendState(blend_state, blend_factor, sample_mask); - XESAFERELEASE(blend_state); + // Check cache. 
+ uint64_t key = hash_combine(color_mask, + blend_control[0], blend_control[1], + blend_control[2], blend_control[3]); + ID3D11BlendState* blend_state = nullptr; + auto it = blend_state_cache_.find(key); + if (it == blend_state_cache_.end()) { + D3D11_BLEND_DESC blend_desc; + xe_zero_struct(&blend_desc, sizeof(blend_desc)); + //blend_desc.AlphaToCoverageEnable = false; + // ? + blend_desc.IndependentBlendEnable = true; + for (int n = 0; n < XECOUNT(blend_control); n++) { + // A2XX_RB_BLEND_CONTROL_COLOR_SRCBLEND + blend_desc.RenderTarget[n].SrcBlend = blend_map[(blend_control[n] & 0x0000001F) >> 0]; + // A2XX_RB_BLEND_CONTROL_COLOR_DESTBLEND + blend_desc.RenderTarget[n].DestBlend = blend_map[(blend_control[n] & 0x00001F00) >> 8]; + // A2XX_RB_BLEND_CONTROL_COLOR_COMB_FCN + blend_desc.RenderTarget[n].BlendOp = blend_op_map[(blend_control[n] & 0x000000E0) >> 5]; + // A2XX_RB_BLEND_CONTROL_ALPHA_SRCBLEND + blend_desc.RenderTarget[n].SrcBlendAlpha = blend_map[(blend_control[n] & 0x001F0000) >> 16]; + // A2XX_RB_BLEND_CONTROL_ALPHA_DESTBLEND + blend_desc.RenderTarget[n].DestBlendAlpha = blend_map[(blend_control[n] & 0x1F000000) >> 24]; + // A2XX_RB_BLEND_CONTROL_ALPHA_COMB_FCN + blend_desc.RenderTarget[n].BlendOpAlpha = blend_op_map[(blend_control[n] & 0x00E00000) >> 21]; + // A2XX_RB_COLOR_MASK_WRITE_* + blend_desc.RenderTarget[n].RenderTargetWriteMask = (color_mask >> (n * 4)) & 0xF; + // A2XX_RB_COLORCONTROL_BLEND_DISABLE ?? Can't find this! + // Just guess based on actions. + blend_desc.RenderTarget[n].BlendEnable = !( + (blend_desc.RenderTarget[n].SrcBlend == D3D11_BLEND_ONE) && + (blend_desc.RenderTarget[n].DestBlend == D3D11_BLEND_ZERO) && + (blend_desc.RenderTarget[n].BlendOp == D3D11_BLEND_OP_ADD) && + (blend_desc.RenderTarget[n].SrcBlendAlpha == D3D11_BLEND_ONE) && + (blend_desc.RenderTarget[n].DestBlendAlpha == D3D11_BLEND_ZERO) && + (blend_desc.RenderTarget[n].BlendOpAlpha == D3D11_BLEND_OP_ADD)); + } + device_->CreateBlendState(&blend_desc, &blend_state); + blend_state_cache_.insert({ key, blend_state }); + } else { + blend_state = it->second; + } + + float blend_factor[4] = { + register_file_[XE_GPU_REG_RB_BLEND_RED].f32, + register_file_[XE_GPU_REG_RB_BLEND_GREEN].f32, + register_file_[XE_GPU_REG_RB_BLEND_BLUE].f32, + register_file_[XE_GPU_REG_RB_BLEND_ALPHA].f32, + }; + uint32_t sample_mask = 0xFFFFFFFF; // ? + context_->OMSetBlendState(blend_state, blend_factor, sample_mask); + return 0; +} + +int D3D11GraphicsDriver::SetupDepthStencilState(const DrawCommand& command) { + static const D3D11_COMPARISON_FUNC compare_func_map[] = { + /* 0 */ D3D11_COMPARISON_NEVER, + /* 1 */ D3D11_COMPARISON_LESS, + /* 2 */ D3D11_COMPARISON_EQUAL, + /* 3 */ D3D11_COMPARISON_LESS_EQUAL, + /* 4 */ D3D11_COMPARISON_GREATER, + /* 5 */ D3D11_COMPARISON_NOT_EQUAL, + /* 6 */ D3D11_COMPARISON_GREATER_EQUAL, + /* 7 */ D3D11_COMPARISON_ALWAYS, + }; + static const D3D11_STENCIL_OP stencil_op_map[] = { + /* 0 */ D3D11_STENCIL_OP_KEEP, + /* 1 */ D3D11_STENCIL_OP_ZERO, + /* 2 */ D3D11_STENCIL_OP_REPLACE, + /* 3 */ D3D11_STENCIL_OP_INCR_SAT, + /* 4 */ D3D11_STENCIL_OP_DECR_SAT, + /* 5 */ D3D11_STENCIL_OP_INVERT, + /* 6 */ D3D11_STENCIL_OP_INCR, + /* 7 */ D3D11_STENCIL_OP_DECR, + }; + + uint32_t depth_control = register_file_[XE_GPU_REG_RB_DEPTHCONTROL].u32; + uint32_t stencil_ref_mask = register_file_[XE_GPU_REG_RB_STENCILREFMASK].u32; + + // Check cache. 
+ uint64_t key = (uint64_t(depth_control) << 32) | stencil_ref_mask; + ID3D11DepthStencilState* depth_stencil_state = nullptr; + auto it = depth_stencil_state_cache_.find(key); + if (it == depth_stencil_state_cache_.end()) { + D3D11_DEPTH_STENCIL_DESC depth_stencil_desc; + xe_zero_struct(&depth_stencil_desc, sizeof(depth_stencil_desc)); + // A2XX_RB_DEPTHCONTROL_BACKFACE_ENABLE + // ? + // A2XX_RB_DEPTHCONTROL_Z_ENABLE + depth_stencil_desc.DepthEnable = (depth_control & 0x00000002) != 0; + // A2XX_RB_DEPTHCONTROL_Z_WRITE_ENABLE + depth_stencil_desc.DepthWriteMask = (depth_control & 0x00000004) ? D3D11_DEPTH_WRITE_MASK_ALL : D3D11_DEPTH_WRITE_MASK_ZERO; + // A2XX_RB_DEPTHCONTROL_EARLY_Z_ENABLE + // ? + // A2XX_RB_DEPTHCONTROL_ZFUNC + depth_stencil_desc.DepthFunc = compare_func_map[(depth_control & 0x00000070) >> 4]; + // A2XX_RB_DEPTHCONTROL_STENCIL_ENABLE + depth_stencil_desc.StencilEnable = (depth_control & 0x00000001) != 0; + // RB_STENCILREFMASK_STENCILMASK + depth_stencil_desc.StencilReadMask = (stencil_ref_mask & 0x0000FF00) >> 8; + // RB_STENCILREFMASK_STENCILWRITEMASK + depth_stencil_desc.StencilWriteMask = (stencil_ref_mask & 0x00FF0000) >> 16; + // A2XX_RB_DEPTHCONTROL_STENCILFUNC + depth_stencil_desc.FrontFace.StencilFunc = compare_func_map[(depth_control & 0x00000700) >> 8]; + // A2XX_RB_DEPTHCONTROL_STENCILFAIL + depth_stencil_desc.FrontFace.StencilFailOp = stencil_op_map[(depth_control & 0x00003800) >> 11]; + // A2XX_RB_DEPTHCONTROL_STENCILZPASS + depth_stencil_desc.FrontFace.StencilPassOp = stencil_op_map[(depth_control & 0x0001C000) >> 14]; + // A2XX_RB_DEPTHCONTROL_STENCILZFAIL + depth_stencil_desc.FrontFace.StencilDepthFailOp = stencil_op_map[(depth_control & 0x000E0000) >> 17]; + // A2XX_RB_DEPTHCONTROL_STENCILFUNC_BF + depth_stencil_desc.BackFace.StencilFunc = compare_func_map[(depth_control & 0x00700000) >> 20]; + // A2XX_RB_DEPTHCONTROL_STENCILFAIL_BF + depth_stencil_desc.BackFace.StencilFailOp = stencil_op_map[(depth_control & 0x03800000) >> 23]; + // A2XX_RB_DEPTHCONTROL_STENCILZPASS_BF + depth_stencil_desc.BackFace.StencilPassOp = stencil_op_map[(depth_control & 0x1C000000) >> 26]; + // A2XX_RB_DEPTHCONTROL_STENCILZFAIL_BF + depth_stencil_desc.BackFace.StencilDepthFailOp = stencil_op_map[(depth_control & 0xE0000000) >> 29]; + device_->CreateDepthStencilState(&depth_stencil_desc, &depth_stencil_state); + depth_stencil_state_cache_.insert({ key, depth_stencil_state }); + } else { + depth_stencil_state = it->second; + } + + // RB_STENCILREFMASK_STENCILREF + uint32_t stencil_ref = (stencil_ref_mask & 0x000000FF); + context_->OMSetDepthStencilState(depth_stencil_state, stencil_ref); return 0; } diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_driver.h b/src/xenia/gpu/d3d11/d3d11_graphics_driver.h index 4faa493ee..2d23b142f 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_driver.h +++ b/src/xenia/gpu/d3d11/d3d11_graphics_driver.h @@ -44,6 +44,9 @@ private: void InitializeInvalidTexture(); int UpdateState(const DrawCommand& command); + int SetupRasterizerState(const DrawCommand& command); + int SetupBlendState(const DrawCommand& command); + int SetupDepthStencilState(const DrawCommand& command); int SetupConstantBuffers(const DrawCommand& command); int SetupShaders(const DrawCommand& command); int SetupInputAssembly(const DrawCommand& command); @@ -61,6 +64,10 @@ private: ID3D11ShaderResourceView* invalid_texture_view_; ID3D11SamplerState* invalid_texture_sampler_state_; + std::unordered_map rasterizer_state_cache_; + std::unordered_map blend_state_cache_; + 
std::unordered_map depth_stencil_state_cache_; + struct { uint32_t width; uint32_t height; From 83378205000eecc60b3fda60f6b25bc75a645528 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 8 Jun 2014 11:59:57 -0700 Subject: [PATCH 153/184] Fiddling with interrupt triggering - still not right. --- src/xenia/gpu/d3d11/d3d11_graphics_system.cc | 51 +++++++++++--------- src/xenia/gpu/d3d11/d3d11_graphics_system.h | 4 ++ 2 files changed, 32 insertions(+), 23 deletions(-) diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_system.cc b/src/xenia/gpu/d3d11/d3d11_graphics_system.cc index 7258195d3..ba20e5797 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_system.cc +++ b/src/xenia/gpu/d3d11/d3d11_graphics_system.cc @@ -20,28 +20,11 @@ using namespace xe::gpu; using namespace xe::gpu::d3d11; -namespace { - -void __stdcall D3D11GraphicsSystemVsyncCallback( - D3D11GraphicsSystem* gs, BOOLEAN) { - static bool thread_name_set = false; - if (!thread_name_set) { - thread_name_set = true; - Profiler::ThreadEnter("VsyncTimer"); - } - SCOPE_profile_cpu_f("gpu"); - - gs->MarkVblank(); - gs->DispatchInterruptCallback(0); -} - -} - - -D3D11GraphicsSystem::D3D11GraphicsSystem(Emulator* emulator) : - window_(0), dxgi_factory_(0), device_(0), - timer_queue_(NULL), vsync_timer_(NULL), - GraphicsSystem(emulator) { +D3D11GraphicsSystem::D3D11GraphicsSystem(Emulator* emulator) + : GraphicsSystem(emulator), + window_(nullptr), dxgi_factory_(nullptr), device_(nullptr), + timer_queue_(nullptr), vsync_timer_(nullptr), + interrupt_pending_(true) { } D3D11GraphicsSystem::~D3D11GraphicsSystem() { @@ -57,7 +40,7 @@ void D3D11GraphicsSystem::Initialize() { CreateTimerQueueTimer( &vsync_timer_, timer_queue_, - (WAITORTIMERCALLBACK)D3D11GraphicsSystemVsyncCallback, + (WAITORTIMERCALLBACK)VsyncCallback, this, 16, 16, @@ -169,6 +152,10 @@ void D3D11GraphicsSystem::Pump() { window_->Swap(); DispatchInterruptCallback(0); + interrupt_pending_ = false; + } else if (interrupt_pending_) { + DispatchInterruptCallback(0); + interrupt_pending_ = false; } else { double time_since_last_interrupt = xe_pal_now() - last_interrupt_time_; if (time_since_last_interrupt > 0.5) { @@ -184,6 +171,24 @@ void D3D11GraphicsSystem::Pump() { } } +void __stdcall D3D11GraphicsSystem::VsyncCallback(D3D11GraphicsSystem* gs, + BOOLEAN) { + static bool thread_name_set = false; + if (!thread_name_set) { + thread_name_set = true; + Profiler::ThreadEnter("VsyncTimer"); + } + SCOPE_profile_cpu_f("gpu"); + + gs->MarkVblank(); + + // TODO(benvanik): we shouldn't need to do the dispatch here, but there's + // something wrong and the CP will block waiting for code that + // needs to be run in the interrupt. + // gs->interrupt_pending_ = true; + gs->DispatchInterruptCallback(0); +} + void D3D11GraphicsSystem::Shutdown() { GraphicsSystem::Shutdown(); diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_system.h b/src/xenia/gpu/d3d11/d3d11_graphics_system.h index 0414d1bb5..00ca43e76 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_system.h +++ b/src/xenia/gpu/d3d11/d3d11_graphics_system.h @@ -40,12 +40,16 @@ protected: virtual void Pump(); private: + static void __stdcall VsyncCallback(D3D11GraphicsSystem* gs, BOOLEAN); + IDXGIFactory1* dxgi_factory_; ID3D11Device* device_; D3D11Window* window_; HANDLE timer_queue_; HANDLE vsync_timer_; + + bool interrupt_pending_; }; From 6e76c169d6fd2300116ed1b197b1a237185c812f Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 8 Jun 2014 21:24:29 -0700 Subject: [PATCH 154/184] Proper(ish) VdSwap - fixes a bunch of things. 
Caching is working a bit better, now. --- src/xenia/gpu/buffer_resource.cc | 7 ++- src/xenia/gpu/command_processor.cc | 38 ++++++++++----- src/xenia/gpu/d3d11/d3d11_graphics_system.cc | 45 +++++++---------- src/xenia/gpu/d3d11/d3d11_graphics_system.h | 4 +- src/xenia/gpu/graphics_system.cc | 2 +- src/xenia/gpu/graphics_system.h | 4 +- src/xenia/gpu/nop/nop_graphics_system.h | 2 + src/xenia/gpu/resource_cache.cc | 51 +++++++++++++------- src/xenia/gpu/texture_resource.cc | 9 ++-- src/xenia/gpu/xenos/packets.h | 2 + src/xenia/kernel/xboxkrnl_video.cc | 20 ++++---- 11 files changed, 101 insertions(+), 83 deletions(-) diff --git a/src/xenia/gpu/buffer_resource.cc b/src/xenia/gpu/buffer_resource.cc index 949bfe02b..9f9accb9b 100644 --- a/src/xenia/gpu/buffer_resource.cc +++ b/src/xenia/gpu/buffer_resource.cc @@ -30,10 +30,9 @@ int BufferResource::Prepare() { } } - // DISABLED - //if (!dirtied_) { - // return 0; - //} + if (!dirtied_) { + return 0; + } dirtied_ = false; // pass dirty regions? diff --git a/src/xenia/gpu/command_processor.cc b/src/xenia/gpu/command_processor.cc index c7a6a166b..23c27c5a9 100644 --- a/src/xenia/gpu/command_processor.cc +++ b/src/xenia/gpu/command_processor.cc @@ -300,6 +300,16 @@ uint32_t CommandProcessor::ExecutePacket(PacketArgs& args) { } break; + case PM4_XE_SWAP: + // Xenia-specific VdSwap hook. + // VdSwap will post this to tell us we need to swap the screen/fire an interrupt. + XETRACECP("[%.8X] Packet(%.8X): PM4_XE_SWAP", + packet_ptr, packet); + LOG_DATA(count); + ADVANCE_PTR(count); + graphics_system_->Swap(); + break; + case PM4_INDIRECT_BUFFER: // indirect buffer dispatch { @@ -334,14 +344,11 @@ uint32_t CommandProcessor::ExecutePacket(PacketArgs& args) { } else { // Register. XEASSERT(poll_reg_addr < RegisterFile::kRegisterCount); - - if (poll_reg_addr == XE_GPU_REG_COHER_STATUS_HOST) { - // Waiting for coherency. We should have all the info we need - // now (base+size+mode), so kick it off. - MakeCoherent(); - } - value = regs->values[poll_reg_addr].u32; + if (poll_reg_addr == XE_GPU_REG_COHER_STATUS_HOST) { + MakeCoherent(); + value = regs->values[poll_reg_addr].u32; + } } switch (wait_info & 0x7) { case 0x0: // Never. @@ -768,16 +775,23 @@ void CommandProcessor::WriteRegister( } void CommandProcessor::MakeCoherent() { - RegisterFile* regs = driver_->register_file(); - auto status_host = regs->values[XE_GPU_REG_COHER_STATUS_HOST].u32; - auto base_host = regs->values[XE_GPU_REG_COHER_BASE_HOST].u32; - auto size_host = regs->values[XE_GPU_REG_COHER_SIZE_HOST].u32; - // Status host often has 0x01000000 or 0x03000000. // This is likely toggling VC (vertex cache) or TC (texture cache). // Or, it also has a direction in here maybe - there is probably // some way to check for dest coherency (what all the COHER_DEST_BASE_* // registers are for). + // Best docs I've found on this are here: + // http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2013/10/R6xx_R7xx_3D.pdf + // http://cgit.freedesktop.org/xorg/driver/xf86-video-radeonhd/tree/src/r6xx_accel.c?id=3f8b6eccd9dba116cc4801e7f80ce21a879c67d2#n454 + + RegisterFile* regs = driver_->register_file(); + auto status_host = regs->values[XE_GPU_REG_COHER_STATUS_HOST].u32; + auto base_host = regs->values[XE_GPU_REG_COHER_BASE_HOST].u32; + auto size_host = regs->values[XE_GPU_REG_COHER_SIZE_HOST].u32; + + if (!(status_host & 0x80000000ul)) { + return; + } // TODO(benvanik): notify resource cache of base->size and type. 
XETRACECP("Make %.8X -> %.8X (%db) coherent", diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_system.cc b/src/xenia/gpu/d3d11/d3d11_graphics_system.cc index ba20e5797..8e6fc5a7e 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_system.cc +++ b/src/xenia/gpu/d3d11/d3d11_graphics_system.cc @@ -24,7 +24,7 @@ D3D11GraphicsSystem::D3D11GraphicsSystem(Emulator* emulator) : GraphicsSystem(emulator), window_(nullptr), dxgi_factory_(nullptr), device_(nullptr), timer_queue_(nullptr), vsync_timer_(nullptr), - interrupt_pending_(true) { + last_swap_time_(0.0) { } D3D11GraphicsSystem::~D3D11GraphicsSystem() { @@ -141,36 +141,26 @@ void D3D11GraphicsSystem::Initialize() { void D3D11GraphicsSystem::Pump() { SCOPE_profile_cpu_f("gpu"); - if (swap_pending_) { - swap_pending_ = false; - - // TODO(benvanik): remove this when commands are understood. - driver_->Resolve(); - - // Swap window. - // If we are set to vsync this will block. - window_->Swap(); - - DispatchInterruptCallback(0); - interrupt_pending_ = false; - } else if (interrupt_pending_) { - DispatchInterruptCallback(0); - interrupt_pending_ = false; - } else { - double time_since_last_interrupt = xe_pal_now() - last_interrupt_time_; - if (time_since_last_interrupt > 0.5) { - // If we have gone too long without an interrupt, fire one. - DispatchInterruptCallback(0); - } - if (time_since_last_interrupt > 0.3) { - // Force a swap when profiling. - if (Profiler::is_enabled()) { - window_->Swap(); - } + double time_since_last_swap = xe_pal_now() - last_swap_time_; + if (time_since_last_swap > 1.0) { + // Force a swap when profiling. + if (Profiler::is_enabled()) { + window_->Swap(); } } } +void D3D11GraphicsSystem::Swap() { + // TODO(benvanik): remove this when commands are understood. + driver_->Resolve(); + + // Swap window. + // If we are set to vsync this will block. + window_->Swap(); + + last_swap_time_ = xe_pal_now(); +} + void __stdcall D3D11GraphicsSystem::VsyncCallback(D3D11GraphicsSystem* gs, BOOLEAN) { static bool thread_name_set = false; @@ -185,7 +175,6 @@ void __stdcall D3D11GraphicsSystem::VsyncCallback(D3D11GraphicsSystem* gs, // TODO(benvanik): we shouldn't need to do the dispatch here, but there's // something wrong and the CP will block waiting for code that // needs to be run in the interrupt. - // gs->interrupt_pending_ = true; gs->DispatchInterruptCallback(0); } diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_system.h b/src/xenia/gpu/d3d11/d3d11_graphics_system.h index 00ca43e76..7bd641667 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_system.h +++ b/src/xenia/gpu/d3d11/d3d11_graphics_system.h @@ -35,6 +35,8 @@ public: virtual void Shutdown(); + void Swap() override; + protected: virtual void Initialize(); virtual void Pump(); @@ -49,7 +51,7 @@ private: HANDLE timer_queue_; HANDLE vsync_timer_; - bool interrupt_pending_; + double last_swap_time_; }; diff --git a/src/xenia/gpu/graphics_system.cc b/src/xenia/gpu/graphics_system.cc index be3e4e0de..212074168 100644 --- a/src/xenia/gpu/graphics_system.cc +++ b/src/xenia/gpu/graphics_system.cc @@ -28,7 +28,7 @@ GraphicsSystem::GraphicsSystem(Emulator* emulator) : thread_(nullptr), running_(false), driver_(nullptr), command_processor_(nullptr), interrupt_callback_(0), interrupt_callback_data_(0), - last_interrupt_time_(0), swap_pending_(false), thread_wait_(nullptr) { + last_interrupt_time_(0), thread_wait_(nullptr) { // Create the run loop used for any windows/etc. // This must be done on the thread we create the driver. 
run_loop_ = xe_run_loop_create(); diff --git a/src/xenia/gpu/graphics_system.h b/src/xenia/gpu/graphics_system.h index 8c0a542c8..3b8fdabb1 100644 --- a/src/xenia/gpu/graphics_system.h +++ b/src/xenia/gpu/graphics_system.h @@ -45,8 +45,7 @@ public: void MarkVblank(); void DispatchInterruptCallback(uint32_t source, uint32_t cpu = 0xFFFFFFFF); - bool swap_pending() const { return swap_pending_; } - void set_swap_pending(bool value) { swap_pending_ = value; } + virtual void Swap() = 0; protected: virtual void Initialize(); @@ -83,7 +82,6 @@ protected: uint32_t interrupt_callback_; uint32_t interrupt_callback_data_; double last_interrupt_time_; - bool swap_pending_; HANDLE thread_wait_; }; diff --git a/src/xenia/gpu/nop/nop_graphics_system.h b/src/xenia/gpu/nop/nop_graphics_system.h index 54f77e04e..cf5f43b8a 100644 --- a/src/xenia/gpu/nop/nop_graphics_system.h +++ b/src/xenia/gpu/nop/nop_graphics_system.h @@ -28,6 +28,8 @@ public: virtual void Shutdown(); + void Swap() override {} + protected: virtual void Initialize(); virtual void Pump(); diff --git a/src/xenia/gpu/resource_cache.cc b/src/xenia/gpu/resource_cache.cc index 46eec8f0b..5641c8318 100644 --- a/src/xenia/gpu/resource_cache.cc +++ b/src/xenia/gpu/resource_cache.cc @@ -9,6 +9,8 @@ #include +#include + using namespace std; using namespace xe; @@ -110,6 +112,8 @@ uint64_t ResourceCache::HashRange(const MemoryRange& memory_range) { } void ResourceCache::SyncRange(uint32_t address, int length) { + SCOPE_profile_cpu_f("gpu"); + // Scan the page table in sync with our resource list. This means // we have O(n) complexity for updates, though we could definitely // make this faster/cleaner. @@ -118,15 +122,12 @@ void ResourceCache::SyncRange(uint32_t address, int length) { // will not be changing, which allows us to do a foreach(res) and reload // and then clear the table. - // DISABLED - return; - // total bytes = (512 * 1024 * 1024) / (16 * 1024) = 32768 // each byte = 1 page // Walk as qwords so we can clear things up faster. uint64_t* page_table = reinterpret_cast( memory_->Translate(memory_->page_table())); - int page_size = 16 * 1024; // 16KB pages + uint32_t page_size = 16 * 1024; // 16KB pages uint32_t lo_address = address % 0x20000000; uint32_t hi_address = lo_address + length; @@ -134,24 +135,38 @@ void ResourceCache::SyncRange(uint32_t address, int length) { int start_page = lo_address / page_size; int end_page = hi_address / page_size; - auto it = paged_resources_.upper_bound(lo_address); - auto end_it = paged_resources_.lower_bound(hi_address); - while (it != end_it) { - const auto& memory_range = it->second->memory_range(); - int lo_page = (memory_range.guest_base % 0x20000000) / page_size; - int hi_page = lo_page + (memory_range.length / page_size); - for (int i = lo_page / 8; i <= hi_page / 8; ++i) { - uint64_t page_flags = page_table[i]; - if (page_flags) { - // Dirty! - it->second->MarkDirty(i * 8 * page_size, (i * 8 + 7) * page_size); + { + SCOPE_profile_cpu_i("gpu", "SyncRange:mark"); + auto it = lo_address > page_size ? 
+ paged_resources_.upper_bound(lo_address - page_size) : + paged_resources_.begin(); + auto end_it = paged_resources_.lower_bound(hi_address + page_size); + while (it != end_it) { + const auto& memory_range = it->second->memory_range(); + int lo_page = (memory_range.guest_base % 0x20000000) / page_size; + int hi_page = lo_page + (memory_range.length / page_size); + lo_page = std::max(lo_page, start_page); + hi_page = std::min(hi_page, end_page); + if (lo_page > hi_page) { + ++it; + continue; } + for (int i = lo_page / 8; i <= hi_page / 8; ++i) { + uint64_t page_flags = page_table[i]; + if (page_flags) { + // Dirty! + it->second->MarkDirty(i * 8 * page_size, (i * 8 + 7) * page_size); + } + } + ++it; } - ++it; } // Reset page table. - for (auto i = start_page / 8; i <= end_page / 8; ++i) { - page_table[i] = 0; + { + SCOPE_profile_cpu_i("gpu", "SyncRange:reset"); + for (auto i = start_page / 8; i <= end_page / 8; ++i) { + page_table[i] = 0; + } } } diff --git a/src/xenia/gpu/texture_resource.cc b/src/xenia/gpu/texture_resource.cc index e7cfdee0d..531796c11 100644 --- a/src/xenia/gpu/texture_resource.cc +++ b/src/xenia/gpu/texture_resource.cc @@ -291,11 +291,10 @@ int TextureResource::Prepare() { return 1; } } - - // DISABLED - //if (!dirtied_) { - // return 0; - //} + + if (!dirtied_) { + return 0; + } dirtied_ = false; // pass dirty regions? diff --git a/src/xenia/gpu/xenos/packets.h b/src/xenia/gpu/xenos/packets.h index 4b7124310..459ab7e6e 100644 --- a/src/xenia/gpu/xenos/packets.h +++ b/src/xenia/gpu/xenos/packets.h @@ -70,6 +70,8 @@ enum Type3Opcode { PM4_CONTEXT_UPDATE = 0x5e, // updates the current context, if needed PM4_INTERRUPT = 0x54, // generate interrupt from the command stream + PM4_XE_SWAP = 0x55, // Xenia only: VdSwap uses this to trigger a swap. + PM4_IM_STORE = 0x2c, // copy sequencer instruction memory to system memory // Tiled rendering: diff --git a/src/xenia/kernel/xboxkrnl_video.cc b/src/xenia/kernel/xboxkrnl_video.cc index 951606bde..6519a067c 100644 --- a/src/xenia/kernel/xboxkrnl_video.cc +++ b/src/xenia/kernel/xboxkrnl_video.cc @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -422,19 +423,16 @@ SHIM_CALL VdSwap_shim( unk6, unk7); - KernelState* kernel_state = shared_kernel_state_; - XEASSERTNOTNULL(kernel_state); - GraphicsSystem* gs = kernel_state->emulator()->graphics_system(); - if (!gs) { - return; - } - - gs->set_swap_pending(true); - // The caller seems to reserve 64 words (256b) in the primary ringbuffer - // for this method to do what it needs. We just zero them out. We could - // encode the parameters in the stream for the ringbuffer, if needed. + // for this method to do what it needs. We just zero them out and send a + // token value. It'd be nice to figure out what this is really doing so + // that we could simulate it, though due to TCR I bet all games need to + // use this method. xe_zero_struct(SHIM_MEM_ADDR(unk0), 64 * 4); + auto dwords = reinterpret_cast(SHIM_MEM_ADDR(unk0)); + dwords[0] = XESWAP32((0x03 << 30) | + ((1 - 1) << 16) | + (xenos::PM4_XE_SWAP << 8)); SHIM_SET_RETURN_64(0); } From 264fc7cc91f0715eb0507458307670a0fde120a2 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 8 Jun 2014 22:21:05 -0700 Subject: [PATCH 155/184] Fix assert. 
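Context for the relaxed assert: the XAudio render driver "pointer" is a tagged
token rather than a guest pointer, with ASCII 'A' (0x41) and 'U' (0x55) in the
high word and a client-specific value in the low word. A sketch of the
pairing; only the high-word tag is confirmed by the check below, the low-word
packing is an assumption:

uint32_t MakeAudioDriverToken(uint16_t client_index) {
  return 0x41550000u | client_index;  // 'A''U' tag plus client index
}
bool IsAudioDriverToken(uint32_t driver_ptr) {
  return (driver_ptr & 0xFFFF0000u) == 0x41550000u;  // mirrors the XEASSERT
}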
--- src/xenia/kernel/xboxkrnl_audio.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/xenia/kernel/xboxkrnl_audio.cc b/src/xenia/kernel/xboxkrnl_audio.cc index c1ba83ed5..e048dad1a 100644 --- a/src/xenia/kernel/xboxkrnl_audio.cc +++ b/src/xenia/kernel/xboxkrnl_audio.cc @@ -75,7 +75,9 @@ SHIM_CALL XAudioGetVoiceCategoryVolumeChangeMask_shim( "XAudioGetVoiceCategoryVolumeChangeMask(%.8X, %.8X)", driver_ptr, out_ptr); - XEASSERT(driver_ptr == 0xAADD1100); + XEASSERT((driver_ptr & 0xFFFF0000) == 0x41550000); + + auto audio_system = state->emulator()->audio_system(); // Checking these bits to see if any voice volume changed. // I think. From 43a3b4f3c05917756f75c95b90c2dd9f4f9cfd3d Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 8 Jun 2014 22:21:23 -0700 Subject: [PATCH 156/184] Support constant value movs to mmio memory. --- src/xenia/cpu/xenon_memory.cc | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/xenia/cpu/xenon_memory.cc b/src/xenia/cpu/xenon_memory.cc index 22f928022..10dcf5f47 100644 --- a/src/xenia/cpu/xenon_memory.cc +++ b/src/xenia/cpu/xenon_memory.cc @@ -193,11 +193,16 @@ LONG CALLBACK CheckMMIOHandler(PEXCEPTION_POINTERS ex_info) { ex_info->ContextRecord->Rip += len; return EXCEPTION_CONTINUE_EXECUTION; } else if (action == 1) { - XEASSERT((disasm.Argument2.ArgType & BE::REGISTER_TYPE) == - BE::REGISTER_TYPE); - uint64_t* reg_ptr = GetContextRegPtr(disasm.Argument2.ArgType, - ex_info->ContextRecord); - uint64_t value = *reg_ptr; + uint64_t value; + if ((disasm.Argument2.ArgType & BE::REGISTER_TYPE) == BE::REGISTER_TYPE) { + uint64_t* reg_ptr = GetContextRegPtr(disasm.Argument2.ArgType, + ex_info->ContextRecord); + value = *reg_ptr; + } else if ((disasm.Argument2.ArgType & BE::CONSTANT_TYPE) == BE::CONSTANT_TYPE) { + value = disasm.Instruction.Immediat; + } else { + XEASSERTALWAYS(); + } switch (disasm.Argument2.ArgSize) { case 8: value = static_cast(value); From 052dca5010cd6c855b453b3789bb2e99afb8cc83 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 8 Jun 2014 22:21:38 -0700 Subject: [PATCH 157/184] Const src1 permute. --- src/alloy/backend/x64/x64_sequences.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index 50981f1cb..a1bc716df 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -4497,7 +4497,12 @@ EMITTER(PERMUTE_V128, MATCH(I, V128<>, V128<>, V128<>>)) e.vpxor(i.dest, i.dest); } else { // Control mask needs to be shuffled. - e.vpshufb(e.xmm0, i.src1, e.GetXmmConstPtr(XMMByteSwapMask)); + if (i.src1.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src1.constant()); + e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMByteSwapMask)); + } else { + e.vpshufb(e.xmm0, i.src1, e.GetXmmConstPtr(XMMByteSwapMask)); + } if (i.src2.is_constant) { e.LoadConstantXmm(i.dest, i.src2.constant()); e.vpshufb(i.dest, i.dest, e.xmm0); From 95508273f6875b421662af9fe9eb69e89411d86b Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 8 Jun 2014 23:11:09 -0700 Subject: [PATCH 158/184] Misc experimentation. 
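The RtlRaiseException hunk below is the receiving side of the well-known MSVC
SetThreadName convention, in which a thread raises exception code 0x406D1388
carrying a THREADNAME_INFO record and a debugger plucks the name out instead
of unwinding. For reference, the canonical raising-side definitions (Win32
types; in the guest the record fields are 32-bit big-endian):

const DWORD MS_VC_EXCEPTION = 0x406D1388;
#pragma pack(push, 8)
typedef struct tagTHREADNAME_INFO {
  DWORD dwType;      // must be 0x1000
  LPCSTR szName;     // pointer to the name string
  DWORD dwThreadID;  // thread id, or -1 for the calling thread
  DWORD dwFlags;     // reserved, zero
} THREADNAME_INFO;
#pragma pack(pop)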
--- src/alloy/backend/x64/x64_sequences.cc | 1 + src/xenia/kernel/xboxkrnl_debug.cc | 4 ++++ src/xenia/kernel/xboxkrnl_io.cc | 7 +++++++ 3 files changed, 12 insertions(+) diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index a1bc716df..8c8a2ed00 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -3439,6 +3439,7 @@ EMITTER_OPCODE_TABLE( // ============================================================================ // TODO(benvanik): use approx here: // http://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html +// TODO(benvanik): this emulated fn destroys all xmm registers! don't do it! EMITTER(LOG2_F32, MATCH(I, F32<>>)) { static __m128 EmulateLog2(__m128 src) { float result = log2(src.m128_f32[0]); diff --git a/src/xenia/kernel/xboxkrnl_debug.cc b/src/xenia/kernel/xboxkrnl_debug.cc index e6cc023c3..c0a9e7771 100644 --- a/src/xenia/kernel/xboxkrnl_debug.cc +++ b/src/xenia/kernel/xboxkrnl_debug.cc @@ -288,9 +288,13 @@ SHIM_CALL RtlRaiseException_shim( } if (thread) { + XELOGD("SetThreadName(%d, %s)", thread->thread_id(), name); thread->set_name(name); thread->Release(); } + + // TODO(benvanik): unwinding required here? + return; } // TODO(benvanik): unwinding. diff --git a/src/xenia/kernel/xboxkrnl_io.cc b/src/xenia/kernel/xboxkrnl_io.cc index b93e3a104..cb3f4f2ca 100644 --- a/src/xenia/kernel/xboxkrnl_io.cc +++ b/src/xenia/kernel/xboxkrnl_io.cc @@ -392,6 +392,13 @@ SHIM_CALL NtQueryInformationFile_shim( if (XSUCCEEDED(result)) { result = X_STATUS_SUCCESS; switch (file_info_class) { + case XFileInternalInformation: + // Internal unique file pointer. Not sure why anyone would want this. + XEASSERT(length == 8); + info = 8; + // TODO(benvanik): use pointer to fs:: entry? + SHIM_SET_MEM_64(file_info_ptr, hash_combine(0, file->absolute_path())); + break; case XFilePositionInformation: // struct FILE_POSITION_INFORMATION { // LARGE_INTEGER CurrentByteOffset; From 5343cab8517335300f638b2080e2b38fc4f8d8f1 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Mon, 9 Jun 2014 21:15:13 -0700 Subject: [PATCH 159/184] Support constant CALL_INDIRECT. --- .../compiler/passes/constant_propagation_pass.cc | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/alloy/compiler/passes/constant_propagation_pass.cc b/src/alloy/compiler/passes/constant_propagation_pass.cc index 140d0bf9c..13001a904 100644 --- a/src/alloy/compiler/passes/constant_propagation_pass.cc +++ b/src/alloy/compiler/passes/constant_propagation_pass.cc @@ -9,6 +9,9 @@ #include +#include +#include + using namespace alloy; using namespace alloy::compiler; using namespace alloy::compiler::passes; @@ -89,6 +92,17 @@ int ConstantPropagationPass::Run(HIRBuilder* builder) { } } break; + case OPCODE_CALL_INDIRECT: + if (i->src1.value->IsConstant()) { + runtime::FunctionInfo* symbol_info; + if (runtime_->LookupFunctionInfo( + (uint32_t)i->src1.value->constant.i32, &symbol_info)) { + break; + } + i->Replace(&OPCODE_CALL_info, i->flags); + i->src1.symbol_info = symbol_info; + } + break; case OPCODE_CALL_INDIRECT_TRUE: if (i->src1.value->IsConstant()) { if (i->src1.value->IsConstantTrue()) { From 289075d0521f916d4b776eea6fd7434b401ccc45 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Mon, 9 Jun 2014 21:29:15 -0700 Subject: [PATCH 160/184] Constant D3DCOLOR UNPACK. Could optimize earlier. 
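For reference, the scalar semantics of UNPACK(D3DCOLOR), matching the lane
comments in the emitter below (packed A8R8G8B8, each byte scaled into [0, 1]):

void UnpackD3DColor(uint32_t src, float dest[4]) {
  const float kInv255 = 1.0f / 255.0f;
  dest[0] = ((src >> 16) & 0xFF) * kInv255;  // R
  dest[1] = ((src >> 8) & 0xFF) * kInv255;   // G
  dest[2] = (src & 0xFF) * kInv255;          // B
  dest[3] = ((src >> 24) & 0xFF) * kInv255;  // A
}

The new constant path simply zeroes the destination, which is only exact for a
constant-zero source; presumably that is the case reaching it today, hence
"could optimize earlier".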
--- src/alloy/backend/x64/x64_sequences.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index 8c8a2ed00..143257b6e 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -4716,6 +4716,10 @@ EMITTER(UNPACK, MATCH(I, V128<>>)) { // dest.f4[1] = (float)((src >> 8) & 0xFF) * (1.0f / 255.0f); // dest.f4[2] = (float)(src & 0xFF) * (1.0f / 255.0f); // dest.f4[3] = (float)((src >> 24) & 0xFF) * (1.0f / 255.0f); + if (i.src1.is_constant) { + e.vpxor(i.dest, i.dest); + return; + } // src = ZZYYXXWW // unpack to 000000ZZ,000000YY,000000XX,000000WW From ff8c03046ac1b1ab3066d65680c36539d5de5340 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Mon, 9 Jun 2014 21:29:35 -0700 Subject: [PATCH 161/184] VECTOR_COMPARE_UGT/UGE (probably). --- src/alloy/backend/x64/x64_emitter.cc | 4 + src/alloy/backend/x64/x64_emitter.h | 4 + src/alloy/backend/x64/x64_sequences.cc | 124 +++++++++++++++++++++---- 3 files changed, 116 insertions(+), 16 deletions(-) diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index 8674459bf..ace7964a6 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -505,6 +505,10 @@ Address X64Emitter::GetXmmConstPtr(XmmConst id) { /* XMMShiftByteMask */ vec128i(0x000000FFu, 0x000000FFu, 0x000000FFu, 0x000000FFu), /* XMMUnsignedDwordMax */ vec128i(0xFFFFFFFFu, 0x00000000u, 0xFFFFFFFFu, 0x00000000u), /* XMM255 */ vec128f(255.0f, 255.0f, 255.0f, 255.0f), + /* XMMSignMaskI8 */ vec128i(0x80808080u, 0x80808080u, 0x80808080u, 0x80808080u), + /* XMMSignMaskI16 */ vec128i(0x80008000u, 0x80008000u, 0x80008000u, 0x80008000u), + /* XMMSignMaskI32 */ vec128i(0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u), + /* XMMSignMaskF32 */ vec128i(0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u), }; // TODO(benvanik): cache base pointer somewhere? stack? It'd be nice to // prevent this move. diff --git a/src/alloy/backend/x64/x64_emitter.h b/src/alloy/backend/x64/x64_emitter.h index 785ff5ac7..2a56411c8 100644 --- a/src/alloy/backend/x64/x64_emitter.h +++ b/src/alloy/backend/x64/x64_emitter.h @@ -57,6 +57,10 @@ enum XmmConst { XMMShiftByteMask, XMMUnsignedDwordMax, XMM255, + XMMSignMaskI8, + XMMSignMaskI16, + XMMSignMaskI32, + XMMSignMaskF32, }; // Unfortunately due to the design of xbyak we have to pass this to the ctor. 
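The sign-mask constants added above exist because SSE/AVX has no unsigned
integer compares: the sequences in the next diff XOR both operands with the
per-lane sign bit and then use the signed vpcmpgt* (UGE additionally ORs in
the equality mask). The scalar form of the trick, for a 32-bit lane:

bool UnsignedGreaterThan32(uint32_t a, uint32_t b) {
  // XOR with the sign bit maps [0, 2^32) onto [INT32_MIN, INT32_MAX]
  // while preserving order, so a signed compare yields the unsigned result.
  int32_t sa = (int32_t)(a ^ 0x80000000u);
  int32_t sb = (int32_t)(b ^ 0x80000000u);
  return sa > sb;  // equals (a > b) on the original unsigned values
}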
diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc
index 143257b6e..4f7f55a18 100644
--- a/src/alloy/backend/x64/x64_sequences.cc
+++ b/src/alloy/backend/x64/x64_sequences.cc
@@ -2337,25 +2337,117 @@ EMITTER_OPCODE_TABLE(
 // ============================================================================
 // OPCODE_VECTOR_COMPARE_UGT
 // ============================================================================
-//EMITTER(VECTOR_COMPARE_UGT_V128, MATCH(I<OPCODE_VECTOR_COMPARE_UGT, V128<>, V128<>, V128<>>)) {
-//  static void Emit(X64Emitter& e, const EmitArgType& i) {
-//  }
-//};
-//EMITTER_OPCODE_TABLE(
-//    OPCODE_VECTOR_COMPARE_UGT,
-//    VECTOR_COMPARE_UGT_V128);
+EMITTER(VECTOR_COMPARE_UGT_V128, MATCH(I<OPCODE_VECTOR_COMPARE_UGT, V128<>, V128<>, V128<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    Xbyak::Address sign_addr = e.ptr[e.rax]; // dummy
+    switch (i.instr->flags) {
+    case INT8_TYPE:
+      sign_addr = e.GetXmmConstPtr(XMMSignMaskI8);
+      break;
+    case INT16_TYPE:
+      sign_addr = e.GetXmmConstPtr(XMMSignMaskI16);
+      break;
+    case INT32_TYPE:
+      sign_addr = e.GetXmmConstPtr(XMMSignMaskI32);
+      break;
+    case FLOAT32_TYPE:
+      sign_addr = e.GetXmmConstPtr(XMMSignMaskF32);
+      break;
+    }
+    if (i.src1.is_constant) {
+      // TODO(benvanik): make this constant.
+      e.LoadConstantXmm(e.xmm0, i.src1.constant());
+      e.vpxor(e.xmm0, sign_addr);
+    } else {
+      e.vpxor(e.xmm0, i.src1, sign_addr);
+    }
+    if (i.src2.is_constant) {
+      // TODO(benvanik): make this constant.
+      e.LoadConstantXmm(e.xmm1, i.src2.constant());
+      e.vpxor(e.xmm1, sign_addr);
+    } else {
+      e.vpxor(e.xmm1, i.src2, sign_addr);
+    }
+    switch (i.instr->flags) {
+    case INT8_TYPE:
+      e.vpcmpgtb(i.dest, e.xmm0, e.xmm1);
+      break;
+    case INT16_TYPE:
+      e.vpcmpgtw(i.dest, e.xmm0, e.xmm1);
+      break;
+    case INT32_TYPE:
+      e.vpcmpgtd(i.dest, e.xmm0, e.xmm1);
+      break;
+    case FLOAT32_TYPE:
+      e.vcmpgtps(i.dest, e.xmm0, e.xmm1);
+      break;
+    }
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_VECTOR_COMPARE_UGT,
+    VECTOR_COMPARE_UGT_V128);
 
 
 // ============================================================================
 // OPCODE_VECTOR_COMPARE_UGE
 // ============================================================================
-//EMITTER(VECTOR_COMPARE_UGE_V128, MATCH(I<OPCODE_VECTOR_COMPARE_UGE, V128<>, V128<>, V128<>>)) {
-//  static void Emit(X64Emitter& e, const EmitArgType& i) {
-//  }
-//};
-//EMITTER_OPCODE_TABLE(
-//    OPCODE_VECTOR_COMPARE_UGE,
-//    VECTOR_COMPARE_UGE_V128);
+EMITTER(VECTOR_COMPARE_UGE_V128, MATCH(I<OPCODE_VECTOR_COMPARE_UGE, V128<>, V128<>, V128<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    Xbyak::Address sign_addr = e.ptr[e.rax]; // dummy
+    switch (i.instr->flags) {
+    case INT8_TYPE:
+      sign_addr = e.GetXmmConstPtr(XMMSignMaskI8);
+      break;
+    case INT16_TYPE:
+      sign_addr = e.GetXmmConstPtr(XMMSignMaskI16);
+      break;
+    case INT32_TYPE:
+      sign_addr = e.GetXmmConstPtr(XMMSignMaskI32);
+      break;
+    case FLOAT32_TYPE:
+      sign_addr = e.GetXmmConstPtr(XMMSignMaskF32);
+      break;
+    }
+    if (i.src1.is_constant) {
+      // TODO(benvanik): make this constant.
+      e.LoadConstantXmm(e.xmm0, i.src1.constant());
+      e.vpxor(e.xmm0, sign_addr);
+    } else {
+      e.vpxor(e.xmm0, i.src1, sign_addr);
+    }
+    if (i.src2.is_constant) {
+      // TODO(benvanik): make this constant.
+      e.LoadConstantXmm(e.xmm1, i.src2.constant());
+      e.vpxor(e.xmm1, sign_addr);
+    } else {
+      e.vpxor(e.xmm1, i.src2, sign_addr);
+    }
+    switch (i.instr->flags) {
+    case INT8_TYPE:
+      e.vpcmpeqb(e.xmm2, e.xmm0, e.xmm1);
+      e.vpcmpgtb(i.dest, e.xmm0, e.xmm1);
+      e.vpor(i.dest, e.xmm2);
+      break;
+    case INT16_TYPE:
+      e.vpcmpeqw(e.xmm2, e.xmm0, e.xmm1);
+      e.vpcmpgtw(i.dest, e.xmm0, e.xmm1);
+      e.vpor(i.dest, e.xmm2);
+      break;
+    case INT32_TYPE:
+      e.vpcmpeqd(e.xmm2, e.xmm0, e.xmm1);
+      e.vpcmpgtd(i.dest, e.xmm0, e.xmm1);
+      e.vpor(i.dest, e.xmm2);
+      break;
+    case FLOAT32_TYPE:
+      e.vcmpgeps(i.dest, e.xmm0, e.xmm1);
+      break;
+    }
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_VECTOR_COMPARE_UGE,
+    VECTOR_COMPARE_UGE_V128);
 
 
 // ============================================================================
@@ -4968,8 +5060,8 @@ void alloy::backend::x64::RegisterSequences() {
   REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_EQ);
   REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_SGT);
   REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_SGE);
-  //REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_UGT);
-  //REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_UGE);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_UGT);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_UGE);
   REGISTER_EMITTER_OPCODE_TABLE(OPCODE_ADD);
   REGISTER_EMITTER_OPCODE_TABLE(OPCODE_ADD_CARRY);
   REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ADD);

From 829b4f67bfb15ae6cc8e6287a23c909cad2bbe3d Mon Sep 17 00:00:00 2001
From: Ben Vanik
Date: Tue, 10 Jun 2014 20:40:11 -0700
Subject: [PATCH 162/184] ReadFile EOF.

---
 src/xenia/kernel/fs/devices/disc_image_file.cc | 3 +++
 src/xenia/kernel/fs/devices/host_path_file.cc | 2 +-
 src/xenia/kernel/fs/devices/stfs_container_file.cc | 3 +++
 src/xenia/xbox.h | 1 +
 4 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/xenia/kernel/fs/devices/disc_image_file.cc b/src/xenia/kernel/fs/devices/disc_image_file.cc
index d98919c62..c094dbef7 100644
--- a/src/xenia/kernel/fs/devices/disc_image_file.cc
+++ b/src/xenia/kernel/fs/devices/disc_image_file.cc
@@ -63,6 +63,9 @@ X_STATUS DiscImageFile::ReadSync(
     size_t* out_bytes_read) {
   GDFXEntry* gdfx_entry = entry_->gdfx_entry();
   xe_mmap_ref mmap = entry_->mmap();
+  if (byte_offset >= gdfx_entry->size) {
+    return X_STATUS_END_OF_FILE;
+  }
   size_t real_offset = gdfx_entry->offset + byte_offset;
   size_t real_length = MIN(buffer_length, gdfx_entry->size - byte_offset);
   xe_copy_memory(
diff --git a/src/xenia/kernel/fs/devices/host_path_file.cc b/src/xenia/kernel/fs/devices/host_path_file.cc
index cf75e69e9..b36f9f890 100644
--- a/src/xenia/kernel/fs/devices/host_path_file.cc
+++ b/src/xenia/kernel/fs/devices/host_path_file.cc
@@ -71,6 +71,6 @@ X_STATUS HostPathFile::ReadSync(
     *out_bytes_read = bytes_read;
     return X_STATUS_SUCCESS;
   } else {
-    return X_STATUS_UNSUCCESSFUL;
+    return X_STATUS_END_OF_FILE;
   }
 }
diff --git a/src/xenia/kernel/fs/devices/stfs_container_file.cc b/src/xenia/kernel/fs/devices/stfs_container_file.cc
index 05b1a21a8..4f9f25a53 100644
--- a/src/xenia/kernel/fs/devices/stfs_container_file.cc
+++ b/src/xenia/kernel/fs/devices/stfs_container_file.cc
@@ -64,6 +64,9 @@ X_STATUS STFSContainerFile::ReadSync(
   STFSEntry* stfs_entry = entry_->stfs_entry();
   xe_mmap_ref mmap = entry_->mmap();
   uint8_t* map_ptr = xe_mmap_get_addr(mmap);
+  if (byte_offset >= stfs_entry->size) {
+    return X_STATUS_END_OF_FILE;
+  }
   // Each block is 4096.
   // Blocks may not be sequential, so we need to read by blocks and handle the
diff --git a/src/xenia/xbox.h b/src/xenia/xbox.h
index ebf2f816f..aab667584 100644
--- a/src/xenia/xbox.h
+++ b/src/xenia/xbox.h
@@ -43,6 +43,7 @@ typedef uint32_t X_STATUS;
 #define X_STATUS_INVALID_HANDLE ((X_STATUS)0xC0000008L)
 #define X_STATUS_INVALID_PARAMETER ((X_STATUS)0xC000000DL)
 #define X_STATUS_NO_SUCH_FILE ((X_STATUS)0xC000000FL)
+#define X_STATUS_END_OF_FILE ((X_STATUS)0xC0000011L)
 #define X_STATUS_NO_MEMORY ((X_STATUS)0xC0000017L)
 #define X_STATUS_ALREADY_COMMITTED ((X_STATUS)0xC0000021L)
 #define X_STATUS_ACCESS_DENIED ((X_STATUS)0xC0000022L)

From 38454372762482f59b6601fbce700804d405972f Mon Sep 17 00:00:00 2001
From: Ben Vanik
Date: Tue, 10 Jun 2014 21:06:36 -0700
Subject: [PATCH 163/184] Fix assert to allow 16 fetchers.

---
 src/xenia/gpu/shader_resource.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/xenia/gpu/shader_resource.cc b/src/xenia/gpu/shader_resource.cc
index 9fbcbf2bb..07b64efbe 100644
--- a/src/xenia/gpu/shader_resource.cc
+++ b/src/xenia/gpu/shader_resource.cc
@@ -188,7 +188,7 @@ void ShaderResource::GatherVertexFetch(const instr_fetch_vtx_t* vtx) {
     auto& desc = inputs.descs[n];
     auto& info = desc.info;
     if (desc.fetch_slot == fetch_slot) {
-      XEASSERT(info.element_count + 1 < XECOUNT(info.elements));
+      XEASSERT(info.element_count < XECOUNT(info.elements));
       // It may not hold that all strides are equal, but I hope it does.
       XEASSERT(!vtx->stride || info.stride_words == vtx->stride);
       el = &info.elements[info.element_count++];

From 574a04a853e79547ab36af9662f6da5390879e83 Mon Sep 17 00:00:00 2001
From: Ben Vanik
Date: Tue, 10 Jun 2014 21:33:58 -0700
Subject: [PATCH 164/184] Preventing emulated instructions from stomping the xmm registers.
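CallNative clobbers whatever the register allocator had in the ABI-volatile
xmm0-xmm5: the Win64 calling convention only requires callees to preserve
xmm6-xmm15. CallNativeSafe instead routes through the backend's guest-to-host
thunk. A conceptual sketch of such a thunk in xbyak terms (illustrative only;
the real thunk is whatever the backend emits):

    // rcx = context, rdx = host function, r8/r9 = args.
    sub(rsp, 0x78);                 // shadow space + spills; keeps rsp aligned
    movaps(ptr[rsp + 0x20], xmm1);  // save volatile xmm regs (xmm0 carries
    movaps(ptr[rsp + 0x30], xmm2);  // the return value, so it isn't preserved)
    movaps(ptr[rsp + 0x40], xmm3);
    movaps(ptr[rsp + 0x50], xmm4);
    movaps(ptr[rsp + 0x60], xmm5);
    call(rdx);
    movaps(xmm1, ptr[rsp + 0x20]);  // restore after the host call
    movaps(xmm2, ptr[rsp + 0x30]);
    movaps(xmm3, ptr[rsp + 0x40]);
    movaps(xmm4, ptr[rsp + 0x50]);
    movaps(xmm5, ptr[rsp + 0x60]);
    add(rsp, 0x78);
    ret();                          // rax/xmm0 hold the host return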
--- src/alloy/backend/x64/x64_emitter.cc | 14 ++++++++++++++ src/alloy/backend/x64/x64_emitter.h | 1 + src/alloy/backend/x64/x64_sequences.cc | 13 ++++++------- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index ace7964a6..6daba0195 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -414,6 +414,20 @@ void X64Emitter::CallNative(uint64_t(*fn)(void* raw_context, uint64_t arg0), uin ReloadEDX(); } +void X64Emitter::CallNativeSafe(void* fn) { + // rcx = context + // rdx = target host function + // r8 = arg0 + // r9 = arg1 + mov(rdx, reinterpret_cast(fn)); + auto thunk = backend()->guest_to_host_thunk(); + mov(rax, reinterpret_cast(thunk)); + call(rax); + ReloadECX(); + ReloadEDX(); + // rax = host return +} + void X64Emitter::SetReturnAddress(uint64_t value) { mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], value); } diff --git a/src/alloy/backend/x64/x64_emitter.h b/src/alloy/backend/x64/x64_emitter.h index 2a56411c8..12c8c0310 100644 --- a/src/alloy/backend/x64/x64_emitter.h +++ b/src/alloy/backend/x64/x64_emitter.h @@ -127,6 +127,7 @@ public: void CallNative(uint64_t(*fn)(void* raw_context)); void CallNative(uint64_t(*fn)(void* raw_context, uint64_t arg0)); void CallNative(uint64_t(*fn)(void* raw_context, uint64_t arg0), uint64_t arg0); + void CallNativeSafe(void* fn); void SetReturnAddress(uint64_t value); void ReloadECX(); void ReloadEDX(); diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index 4f7f55a18..f7fbf6997 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -3489,7 +3489,7 @@ EMITTER(POW2_F32, MATCH(I, F32<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { XEASSERTALWAYS(); e.lea(e.r8, e.StashXmm(i.src1)); - e.CallNative(EmulatePow2); + e.CallNativeSafe(EmulatePow2); e.vmovaps(i.dest, e.xmm0); } }; @@ -3501,7 +3501,7 @@ EMITTER(POW2_F64, MATCH(I, F64<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { XEASSERTALWAYS(); e.lea(e.r8, e.StashXmm(i.src1)); - e.CallNative(EmulatePow2); + e.CallNativeSafe(EmulatePow2); e.vmovaps(i.dest, e.xmm0); } }; @@ -3515,7 +3515,7 @@ EMITTER(POW2_V128, MATCH(I, V128<>>)) { } static void Emit(X64Emitter& e, const EmitArgType& i) { e.lea(e.r8, e.StashXmm(i.src1)); - e.CallNative(EmulatePow2); + e.CallNativeSafe(EmulatePow2); e.vmovaps(i.dest, e.xmm0); } }; @@ -3540,7 +3540,7 @@ EMITTER(LOG2_F32, MATCH(I, F32<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { XEASSERTALWAYS(); e.lea(e.r8, e.StashXmm(i.src1)); - e.CallNative(EmulateLog2); + e.CallNativeSafe(EmulateLog2); e.vmovaps(i.dest, e.xmm0); } }; @@ -3552,7 +3552,7 @@ EMITTER(LOG2_F64, MATCH(I, F64<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { XEASSERTALWAYS(); e.lea(e.r8, e.StashXmm(i.src1)); - e.CallNative(EmulateLog2); + e.CallNativeSafe(EmulateLog2); e.vmovaps(i.dest, e.xmm0); } }; @@ -3565,9 +3565,8 @@ EMITTER(LOG2_V128, MATCH(I, V128<>>)) { return result; } static void Emit(X64Emitter& e, const EmitArgType& i) { - XEASSERTALWAYS(); e.lea(e.r8, e.StashXmm(i.src1)); - e.CallNative(EmulateLog2); + e.CallNativeSafe(EmulateLog2); e.vmovaps(i.dest, e.xmm0); } }; From 713cfcb4295a61fac181faff1fa8749a3295fbc3 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 10 Jun 2014 21:35:50 -0700 Subject: [PATCH 165/184] Possibly correct FLOAT16 unpack. 
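vcvtph2ps (F16C) replaces the round-trip through DirectXMath's
XMConvertHalfToFloat. The FLOAT16_2 result wants [x, y, 0.0, 1.0] per the old
comments; a standalone intrinsics sketch of the same computation
(illustrative; which half lands in x vs. y still depends on guest byte order,
hence the "possibly"):

    #include <immintrin.h>  // F16C + SSE4.1, as the emitter already assumes

    static __m128 unpack_float16_2(uint32_t packed) {
      // Two halves in the low 32 bits; the zero upper halves convert to
      // 0.0f, so this already yields [x, y, 0, 0].
      __m128 v = _mm_cvtph_ps(_mm_cvtsi32_si128((int)packed));
      // Then force W to 1.0f: [x, y, 0, 1].
      return _mm_insert_ps(v, _mm_set_ss(1.0f), 0x30);
    }

FLOAT16_4 is the same conversion on four halves with no W fixup.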
--- src/alloy/backend/x64/x64_emitter.cc | 1 + src/alloy/backend/x64/x64_emitter.h | 1 + src/alloy/backend/x64/x64_sequences.cc | 21 +++++---------------- 3 files changed, 7 insertions(+), 16 deletions(-) diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index 6daba0195..0e2d55860 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -505,6 +505,7 @@ Address X64Emitter::GetXmmConstPtr(XmmConst id) { /* XMMFlipX16Y16 */ vec128i(0x00008000u, 0x00000000u, 0x00000000u, 0x00000000u), /* XMMFixX16Y16 */ vec128f(-32768.0f, 0.0f, 0.0f, 0.0f), /* XMMNormalizeX16Y16 */ vec128f(1.0f / 32767.0f, 1.0f / (32767.0f * 65536.0f), 0.0f, 0.0f), + /* XMM0001 */ vec128f(0.0f, 0.0f, 0.0f, 1.0f), /* XMM3301 */ vec128f(3.0f, 3.0f, 0.0f, 1.0f), /* XMMSignMaskPS */ vec128i(0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u), /* XMMSignMaskPD */ vec128i(0x00000000u, 0x80000000u, 0x00000000u, 0x80000000u), diff --git a/src/alloy/backend/x64/x64_emitter.h b/src/alloy/backend/x64/x64_emitter.h index 12c8c0310..e6ea7b7b5 100644 --- a/src/alloy/backend/x64/x64_emitter.h +++ b/src/alloy/backend/x64/x64_emitter.h @@ -43,6 +43,7 @@ enum XmmConst { XMMFlipX16Y16, XMMFixX16Y16, XMMNormalizeX16Y16, + XMM0001, XMM3301, XMMSignMaskPS, XMMSignMaskPD, diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index f7fbf6997..59153e40b 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -29,9 +29,6 @@ #include #include -// TODO(benvanik): reimplement packing functions -#include - using namespace alloy; using namespace alloy::backend; using namespace alloy::backend::x64; @@ -4820,13 +4817,6 @@ EMITTER(UNPACK, MATCH(I, V128<>>)) { // mult by 1/255 e.vmulps(i.dest, e.GetXmmConstPtr(XMMOneOver255)); } - static void Unpack_FLOAT16_2(void* raw_context, __m128& v) { - uint32_t src = v.m128_i32[3]; - v.m128_f32[0] = DirectX::PackedVector::XMConvertHalfToFloat((uint16_t)src); - v.m128_f32[1] = DirectX::PackedVector::XMConvertHalfToFloat((uint16_t)(src >> 16)); - v.m128_f32[2] = 0.0f; - v.m128_f32[3] = 1.0f; - } static void EmitFLOAT16_2(X64Emitter& e, const EmitArgType& i) { // 1 bit sign, 5 bit exponent, 10 bit mantissa // D3D10 half float format @@ -4844,14 +4834,13 @@ EMITTER(UNPACK, MATCH(I, V128<>>)) { // XMConvertHalfToFloat(sy), // 0.0, // 1.0 }; - auto addr = e.StashXmm(i.src1); - e.lea(e.rdx, addr); - e.CallNative(Unpack_FLOAT16_2); - e.vmovaps(i.dest, addr); + e.vcvtph2ps(i.dest, i.src1); + e.vpshufd(i.dest, i.dest, B10100100); + e.vpor(i.dest, e.GetXmmConstPtr(XMM0001)); } static void EmitFLOAT16_4(X64Emitter& e, const EmitArgType& i) { - // Could be shared with FLOAT16_2. - XEASSERTALWAYS(); + // src = [(dest.x | dest.y), (dest.z | dest.w), 0, 0] + e.vcvtph2ps(i.dest, i.src1); } static void EmitSHORT_2(X64Emitter& e, const EmitArgType& i) { // (VD.x) = 3.0 + (VB.x>>16)*2^-22 From 4089f405af9c89a1848d577054dce428af395db2 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 10 Jun 2014 21:36:13 -0700 Subject: [PATCH 166/184] PACK FLOAT16 seems correct. 
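The e.db(0xCC) bytes removed here were raw int3 software breakpoints, left in
so the first hit of an untested sequence would trap under a debugger;
removing them marks the PACK paths as trusted. For reference, the inverse of
the unpack with intrinsics (a sketch; _MM_FROUND_TO_ZERO is the same
truncating imm of B00000011 the emitter passes to vcvtps2ph):

    #include <immintrin.h>  // F16C

    static uint32_t pack_float16_2(float x, float y) {
      __m128 v = _mm_setr_ps(x, y, 0.0f, 0.0f);
      __m128i h = _mm_cvtps_ph(v, _MM_FROUND_TO_ZERO);
      return (uint32_t)_mm_cvtsi128_si32(h);  // two halves in the low 32 bits
    }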
--- src/alloy/backend/x64/x64_sequences.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index 59153e40b..dace0ac17 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -4730,14 +4730,12 @@ EMITTER(PACK, MATCH(I, V128<>>)) { static void EmitFLOAT16_2(X64Emitter& e, const EmitArgType& i) { // http://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx // dest = [(src1.x | src1.y), 0, 0, 0] - e.db(0xCC); e.vcvtps2ph(e.xmm0, i.src1, B00000011); e.vxorps(i.dest, i.dest); e.vpblendw(i.dest, e.xmm0, B00000011); } static void EmitFLOAT16_4(X64Emitter& e, const EmitArgType& i) { // dest = [(src1.x | src1.y), (src1.z | src1.w), 0, 0] - e.db(0xCC); e.vcvtps2ph(e.xmm0, i.src1, B00000011); e.vxorps(i.dest, i.dest); e.vpblendw(i.dest, e.xmm0, B00001111); From 02709e0d65c86d600831b3314089e5af1d9e9942 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 10 Jun 2014 21:38:17 -0700 Subject: [PATCH 167/184] Optimize EXTRACT_I32(0). --- src/alloy/backend/x64/x64_sequences.cc | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index dace0ac17..7f90b09e2 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -47,8 +47,8 @@ static std::unordered_multimap sequence_table; // Selects the right byte/word/etc from a vector. We need to flip logical // indices (0,1,2,3,4,5,6,7,...) = (3,2,1,0,7,6,5,4,...) -#define VEC128_B(n) ((n) & 0xC) | ((~(n)) & 0x3) -#define VEC128_W(n) ((n) & 0x6) | ((~(n)) & 0x1) +#define VEC128_B(n) ((n) ^ 0x3) +#define VEC128_W(n) ((n) ^ 0x1) #define VEC128_D(n) (n) #define VEC128_F(n) (n) @@ -4413,7 +4413,11 @@ EMITTER(EXTRACT_I32, MATCH(I, V128<>, I8<>>)) { vec128b(15, 14, 13, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), }; if (i.src2.is_constant) { - e.vpextrd(i.dest, i.src1, VEC128_D(i.src2.constant())); + if (i.src2.constant() == 0) { + e.vmovd(i.dest, i.src1); + } else { + e.vpextrd(i.dest, i.src1, VEC128_D(i.src2.constant())); + } } else { // TODO(benvanik): try out hlide's version: // e.mov(e.eax, 3); From 45e149d00b0b908173d44010b2e81f8494c2dbc4 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 10 Jun 2014 21:38:33 -0700 Subject: [PATCH 168/184] Possibly correct variable EXTRACT_I16. 
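The constant path already works via vpextrw; this adds the variable path by
building a vpshufb control mask in eax at runtime. For reference, what the
extraction has to compute (a scalar sketch; the ^1 is the VEC128_W endian
flip from the previous commit):

    // Guest lane index 0..7 -> 16-bit value, given host-order vector bytes.
    static uint16_t extract_i16(const uint8_t* vec_bytes, uint8_t lane) {
      const unsigned w = lane ^ 1u;  // host word index
      return (uint16_t)(vec_bytes[w * 2] | (vec_bytes[w * 2 + 1] << 8));
    }

Note that vpshufb selects bytes, so word w needs byte indices 2*w and 2*w+1;
the mask built below uses w and w+1 without the doubling that hlide's version
(the imul by 0x0202 in the removed comment) performed - presumably the source
of the "possibly" in the title.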
--- src/alloy/backend/x64/x64_sequences.cc | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index 7f90b09e2..235c3444b 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -4393,14 +4393,13 @@ EMITTER(EXTRACT_I16, MATCH(I, V128<>, I8<>>)) { e.vpextrw(i.dest.reg().cvt32(), i.src1, VEC128_W(i.src2.constant())); } else { // TODO(benvanik): try out hlide's version: - // e.mov(e.eax, 7); - // e.and(e.al, i.src2); // eax = [i&7, 0, 0, 0] - // e.imul(e.eax, 0x00000202); // [(i&7)*2, (i&7)*2, 0, 0] - // e.xor(e.eax, 0x80800203); // [((i&7)*2)^3, ((i&7)*2)^2, 0x80, 0x80] - // e.vmovd(e.xmm0, e.eax); - // e.vpshufb(e.xmm0, i.src1, e.xmm0); - // e.vmovd(i.dest.reg().cvt32(), e.xmm0); - XEASSERTALWAYS(); + e.mov(e.al, i.src2); + e.xor(e.al, 0x1); + e.mov(e.ah, e.al); + e.add(e.ah, 1); + e.vmovd(e.xmm0, e.eax); + e.vpshufb(e.xmm0, i.src1, e.xmm0); + e.vmovd(i.dest.reg().cvt32(), e.xmm0); } } }; From 7936fa2dbdf554817cab60fcab82222cf16a0113 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Fri, 20 Jun 2014 20:44:39 -0700 Subject: [PATCH 169/184] Fix constants in pixel shaders. --- src/xenia/gpu/d3d11/d3d11_shader_translator.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/xenia/gpu/d3d11/d3d11_shader_translator.cc b/src/xenia/gpu/d3d11/d3d11_shader_translator.cc index dde024356..ef8b75650 100644 --- a/src/xenia/gpu/d3d11/d3d11_shader_translator.cc +++ b/src/xenia/gpu/d3d11/d3d11_shader_translator.cc @@ -275,7 +275,7 @@ int D3D11ShaderTranslator::TranslatePixelShader( uint32_t temp_regs = program_cntl.vs_regs + program_cntl.ps_regs; for (uint32_t n = 0; n <= MAX(15, temp_regs); n++) { append( - " float4 r%d = c[%d];\n", n, n); + " float4 r%d = c[%d];\n", n, n + 256); } append(" float4 t;\n"); @@ -365,7 +365,7 @@ void D3D11ShaderTranslator::AppendSrcReg(uint32_t num, uint32_t type, append("r%u", num); } else { // Constant. - append("c[%u]", num); + append("c[%u]", type_ == XE_GPU_SHADER_TYPE_PIXEL ? num + 256 : num); } if (swiz) { append("."); From da590bd9ccd942963f8e1836cbd415c83729c81b Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Fri, 20 Jun 2014 20:44:53 -0700 Subject: [PATCH 170/184] Disable logging for TlsGetValue. --- src/xenia/kernel/xboxkrnl_threading.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/xenia/kernel/xboxkrnl_threading.cc b/src/xenia/kernel/xboxkrnl_threading.cc index 533a05d2c..c7f31aee0 100644 --- a/src/xenia/kernel/xboxkrnl_threading.cc +++ b/src/xenia/kernel/xboxkrnl_threading.cc @@ -550,9 +550,10 @@ SHIM_CALL KeTlsGetValue_shim( PPCContext* ppc_state, KernelState* state) { uint32_t tls_index = SHIM_GET_ARG_32(0); - XELOGD( - "KeTlsGetValue(%.8X)", - tls_index); + // Logging disabled, as some games spam this. + //XELOGD( + // "KeTlsGetValue(%.8X)", + // tls_index); uint64_t result = xeKeTlsGetValue(tls_index); SHIM_SET_RETURN_64(result); From a5448b1296e55946aa393ee4326c1488f19fb800 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Fri, 20 Jun 2014 20:48:02 -0700 Subject: [PATCH 171/184] Fixing pack float16. 
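vcvtps2ph leaves the four halves in the low quadword in little-endian word
order, but the guest-visible vec128 wants them byte-swapped into the high
quadword; the vpshufd/vpshufhw pair rearranges 0|0|0|0|W|Z|Y|X into
X|Y|Z|W|0|0|0|0 before the blend. A host-side sanity check of that shuffle
pair with intrinsics (a sketch only):

    #include <immintrin.h>
    #include <cstdint>
    #include <cstdio>

    int main() {
      __m128 v = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);   // X, Y, Z, W
      __m128i h = _mm_cvtps_ph(v, _MM_FROUND_TO_ZERO);  // words X Y Z W 0 0 0 0
      h = _mm_shuffle_epi32(h, 0x1B);     // vpshufd  B00011011
      h = _mm_shufflehi_epi16(h, 0xB1);   // vpshufhw B10110001
      alignas(16) uint16_t w[8];
      _mm_store_si128((__m128i*)w, h);
      // Expect half(4.0), half(3.0), half(2.0), half(1.0) in w[4..7] - the
      // X|Y|Z|W order the comments describe, reading from the high word down.
      printf("%04x %04x %04x %04x\n", w[4], w[5], w[6], w[7]);
      return 0;
    }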
--- src/alloy/backend/x64/x64_sequences.cc | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index 235c3444b..689c8b3b3 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -4733,15 +4733,27 @@ EMITTER(PACK, MATCH(I, V128<>>)) { static void EmitFLOAT16_2(X64Emitter& e, const EmitArgType& i) { // http://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx // dest = [(src1.x | src1.y), 0, 0, 0] + // 0|0|0|0|W|Z|Y|X e.vcvtps2ph(e.xmm0, i.src1, B00000011); + // Y|X|W|Z|0|0|0|0 + e.vpshufd(e.xmm0, e.xmm0, B00011011); + // Shuffle to X|Y|Z|W|0|0|0|0 + e.vpshufhw(e.xmm0, e.xmm0, B10110001); + // Select just X|Y e.vxorps(i.dest, i.dest); - e.vpblendw(i.dest, e.xmm0, B00000011); + e.vpblendw(i.dest, e.xmm0, B11000000); } static void EmitFLOAT16_4(X64Emitter& e, const EmitArgType& i) { // dest = [(src1.x | src1.y), (src1.z | src1.w), 0, 0] + // 0|0|0|0|W|Z|Y|X e.vcvtps2ph(e.xmm0, i.src1, B00000011); + // Y|X|W|Z|0|0|0|0 + e.vpshufd(e.xmm0, e.xmm0, B00011011); + // Shuffle to X|Y|Z|W|0|0|0|0 + e.vpshufhw(e.xmm0, e.xmm0, B10110001); + // Select just X|Y|Z|W e.vxorps(i.dest, i.dest); - e.vpblendw(i.dest, e.xmm0, B00001111); + e.vpblendw(i.dest, e.xmm0, B11110000); } static void EmitSHORT_2(X64Emitter& e, const EmitArgType& i) { XEASSERTALWAYS(); From 71eb408d67280e1d89b8cf9564bdb994f9a18799 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 22 Jun 2014 19:41:26 -0700 Subject: [PATCH 172/184] Various fiddlings. --- .../gpu/d3d11/d3d11_shader_translator.cc | 11 ++++++-- src/xenia/kernel/kernel_state.cc | 28 +++++++++++++++++++ src/xenia/kernel/kernel_state.h | 6 ++++ src/xenia/kernel/objects/xnotify_listener.cc | 8 ++++++ src/xenia/kernel/objects/xnotify_listener.h | 3 -- src/xenia/kernel/util/xex2.cc | 5 +++- src/xenia/kernel/xam_user.cc | 18 ++++++++++++ src/xenia/kernel/xboxkrnl_threading.cc | 2 +- src/xenia/xbox.h | 4 +++ 9 files changed, 77 insertions(+), 8 deletions(-) diff --git a/src/xenia/gpu/d3d11/d3d11_shader_translator.cc b/src/xenia/gpu/d3d11/d3d11_shader_translator.cc index ef8b75650..549f0c72d 100644 --- a/src/xenia/gpu/d3d11/d3d11_shader_translator.cc +++ b/src/xenia/gpu/d3d11/d3d11_shader_translator.cc @@ -163,8 +163,10 @@ int D3D11ShaderTranslator::TranslateVertexShader( " VS_OUTPUT o;\n"); // Always write position, as some shaders seem to only write certain values. - append( - " o.oPos = float4(0.0, 0.0, 0.0, 0.0);\n"); + if (alloc_counts.positions) { + append( + " o.oPos = float4(0.0, 0.0, 0.0, 0.0);\n"); + } if (alloc_counts.point_size) { append( " o.oPointSize = float4(1.0, 0.0, 0.0, 0.0);\n"); @@ -198,8 +200,11 @@ int D3D11ShaderTranslator::TranslateVertexShader( } // main footer. 
+ if (alloc_counts.positions) { + append( + " o.oPos = applyViewport(o.oPos);\n"); + } append( - " o.oPos = applyViewport(o.oPos);\n" " return o;\n" "};\n"); diff --git a/src/xenia/kernel/kernel_state.cc b/src/xenia/kernel/kernel_state.cc index 5cd6110b7..bc10751d7 100644 --- a/src/xenia/kernel/kernel_state.cc +++ b/src/xenia/kernel/kernel_state.cc @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -135,3 +136,30 @@ XThread* KernelState::GetThreadByID(uint32_t thread_id) { xe_mutex_unlock(object_mutex_); return thread; } + +void KernelState::RegisterNotifyListener(XNotifyListener* listener) { + xe_mutex_lock(object_mutex_); + notify_listeners_.push_back(listener); + xe_mutex_unlock(object_mutex_); +} + +void KernelState::UnregisterNotifyListener(XNotifyListener* listener) { + xe_mutex_lock(object_mutex_); + for (auto it = notify_listeners_.begin(); it != notify_listeners_.end(); + ++it) { + if (*it == listener) { + notify_listeners_.erase(it); + break; + } + } + xe_mutex_unlock(object_mutex_); +} + +void KernelState::BroadcastNotification(XNotificationID id, uint32_t data) { + xe_mutex_lock(object_mutex_); + for (auto it = notify_listeners_.begin(); it != notify_listeners_.end(); + ++it) { + (*it)->EnqueueNotification(id, data); + } + xe_mutex_unlock(object_mutex_); +} diff --git a/src/xenia/kernel/kernel_state.h b/src/xenia/kernel/kernel_state.h index 944169918..f2fb3f47e 100644 --- a/src/xenia/kernel/kernel_state.h +++ b/src/xenia/kernel/kernel_state.h @@ -23,6 +23,7 @@ XEDECLARECLASS1(xe, Emulator); XEDECLARECLASS2(xe, cpu, Processor); XEDECLARECLASS2(xe, kernel, Dispatcher); XEDECLARECLASS2(xe, kernel, XModule); +XEDECLARECLASS2(xe, kernel, XNotifyListener); XEDECLARECLASS2(xe, kernel, XThread); XEDECLARECLASS2(xe, kernel, XUserModule); XEDECLARECLASS3(xe, kernel, fs, FileSystem); @@ -56,6 +57,10 @@ public: void UnregisterThread(XThread* thread); XThread* GetThreadByID(uint32_t thread_id); + void RegisterNotifyListener(XNotifyListener* listener); + void UnregisterNotifyListener(XNotifyListener* listener); + void BroadcastNotification(XNotificationID id, uint32_t data); + private: Emulator* emulator_; Memory* memory_; @@ -67,6 +72,7 @@ private: ObjectTable* object_table_; xe_mutex_t* object_mutex_; std::unordered_map threads_by_id_; + std::vector notify_listeners_; XUserModule* executable_module_; diff --git a/src/xenia/kernel/objects/xnotify_listener.cc b/src/xenia/kernel/objects/xnotify_listener.cc index b9d45dafb..7e9ffb704 100644 --- a/src/xenia/kernel/objects/xnotify_listener.cc +++ b/src/xenia/kernel/objects/xnotify_listener.cc @@ -20,6 +20,7 @@ XNotifyListener::XNotifyListener(KernelState* kernel_state) : } XNotifyListener::~XNotifyListener() { + kernel_state_->UnregisterNotifyListener(this); xe_mutex_free(lock_); if (wait_handle_) { CloseHandle(wait_handle_); @@ -32,9 +33,16 @@ void XNotifyListener::Initialize(uint64_t mask) { lock_ = xe_mutex_alloc(); wait_handle_ = CreateEvent(NULL, TRUE, FALSE, NULL); mask_ = mask; + + kernel_state_->RegisterNotifyListener(this); } void XNotifyListener::EnqueueNotification(XNotificationID id, uint32_t data) { + // Ignore if the notification doesn't match our mask. 
+ if ((mask_ & uint64_t(1 << ((id >> 25) + 1))) == 0) { + return; + } + xe_mutex_lock(lock_); auto existing = notifications_.find(id); if (existing != notifications_.end()) { diff --git a/src/xenia/kernel/objects/xnotify_listener.h b/src/xenia/kernel/objects/xnotify_listener.h index a7aa16eee..436c4434a 100644 --- a/src/xenia/kernel/objects/xnotify_listener.h +++ b/src/xenia/kernel/objects/xnotify_listener.h @@ -21,9 +21,6 @@ namespace xe { namespace kernel { -// Values seem to be all over the place - GUIDs? -typedef uint32_t XNotificationID; - class XNotifyListener : public XObject { public: diff --git a/src/xenia/kernel/util/xex2.cc b/src/xenia/kernel/util/xex2.cc index af71eb182..1aca590e3 100644 --- a/src/xenia/kernel/util/xex2.cc +++ b/src/xenia/kernel/util/xex2.cc @@ -11,6 +11,7 @@ #include +#include #include #include #include @@ -20,6 +21,8 @@ using namespace alloy; +DEFINE_bool(xex_dev_key, false, "Use the devkit key."); + typedef struct xe_xex2 { xe_ref_t ref; @@ -434,7 +437,7 @@ int xe_xex2_decrypt_key(xe_xex2_header_t *header) { // Guess key based on file info. // TODO: better way to finding out which key to use? const uint8_t *xexkey; - if (header->execution_info.title_id) { + if (header->execution_info.title_id && !FLAGS_xex_dev_key) { xexkey = xe_xex2_retail_key; } else { xexkey = xe_xex2_devkit_key; diff --git a/src/xenia/kernel/xam_user.cc b/src/xenia/kernel/xam_user.cc index ebe48c2cb..005475812 100644 --- a/src/xenia/kernel/xam_user.cc +++ b/src/xenia/kernel/xam_user.cc @@ -174,6 +174,23 @@ SHIM_CALL XamUserReadProfileSettings_shim( } +SHIM_CALL XamShowSigninUI_shim( + PPCContext* ppc_state, KernelState* state) { + uint32_t unk_0 = SHIM_GET_ARG_32(0); + uint32_t unk_mask = SHIM_GET_ARG_32(1); + + XELOGD( + "XamShowSigninUI(%d, %.8X)", + unk_0, unk_mask); + + // Mask values vary. Probably matching user types? Local/remote? + // Games seem to sit and loop until we trigger this notification. + state->BroadcastNotification(0x00000009, 0); + + SHIM_SET_RETURN_32(X_ERROR_SUCCESS); +} + + } // namespace kernel } // namespace xe @@ -185,4 +202,5 @@ void xe::kernel::xam::RegisterUserExports( SHIM_SET_MAPPING("xam.xex", XamUserGetSigninInfo, state); SHIM_SET_MAPPING("xam.xex", XamUserGetName, state); SHIM_SET_MAPPING("xam.xex", XamUserReadProfileSettings, state); + SHIM_SET_MAPPING("xam.xex", XamShowSigninUI, state); } diff --git a/src/xenia/kernel/xboxkrnl_threading.cc b/src/xenia/kernel/xboxkrnl_threading.cc index c7f31aee0..0e119491b 100644 --- a/src/xenia/kernel/xboxkrnl_threading.cc +++ b/src/xenia/kernel/xboxkrnl_threading.cc @@ -1129,7 +1129,7 @@ SHIM_CALL NtWaitForSingleObjectEx_shim( uint64_t timeout = timeout_ptr ? SHIM_MEM_64(timeout_ptr) : 0; result = object->Wait( 3, wait_mode, alertable, - timeout_ptr ? &timeout : NULL); + timeout_ptr ? &timeout : NULL); object->Release(); } diff --git a/src/xenia/xbox.h b/src/xenia/xbox.h index aab667584..9f3a6b071 100644 --- a/src/xenia/xbox.h +++ b/src/xenia/xbox.h @@ -254,6 +254,10 @@ public: }; +// Values seem to be all over the place - GUIDs? 
+typedef uint32_t XNotificationID; + + typedef enum _X_INPUT_FLAG { X_INPUT_FLAG_GAMEPAD = 0x00000001, } X_INPUT_FLAG; From 7b98c748fa48d2ce7b4e39a8c9181406d460a387 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 22 Jun 2014 21:03:41 -0700 Subject: [PATCH 173/184] Fixing double allocation of memory x_x --- src/alloy/runtime/runtime.cc | 7 +------ src/xenia/xbox.h | 1 + 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/alloy/runtime/runtime.cc b/src/alloy/runtime/runtime.cc index db5e52d7d..7be29dce1 100644 --- a/src/alloy/runtime/runtime.cc +++ b/src/alloy/runtime/runtime.cc @@ -55,11 +55,6 @@ int Runtime::Initialize(Frontend* frontend, Backend* backend) { // Must be initialized by subclass before calling into this. XEASSERTNOTNULL(memory_); - int result = memory_->Initialize(); - if (result) { - return result; - } - // Create debugger first. Other types hook up to it. debugger_ = new Debugger(this); @@ -102,7 +97,7 @@ int Runtime::Initialize(Frontend* frontend, Backend* backend) { backend_ = backend; frontend_ = frontend; - result = backend_->Initialize(); + int result = backend_->Initialize(); if (result) { return result; } diff --git a/src/xenia/xbox.h b/src/xenia/xbox.h index 9f3a6b071..85ad57a41 100644 --- a/src/xenia/xbox.h +++ b/src/xenia/xbox.h @@ -63,6 +63,7 @@ typedef uint32_t X_RESULT; #define X_HRESULT_FROM_WIN32(x) ((X_RESULT)(x) <= 0 ? ((X_RESULT)(x)) : ((X_RESULT) (((x) & 0x0000FFFF) | (X_FACILITY_WIN32 << 16) | 0x80000000))) #define X_ERROR_SUCCESS X_HRESULT_FROM_WIN32(0x00000000L) #define X_ERROR_ACCESS_DENIED X_HRESULT_FROM_WIN32(0x00000005L) +#define X_ERROR_NO_MORE_FILES X_HRESULT_FROM_WIN32(0x00000018L) #define X_ERROR_INSUFFICIENT_BUFFER X_HRESULT_FROM_WIN32(0x0000007AL) #define X_ERROR_BAD_ARGUMENTS X_HRESULT_FROM_WIN32(0x000000A0L) #define X_ERROR_BUSY X_HRESULT_FROM_WIN32(0x000000AAL) From 9905380bd00a1db1181db5c4de46bf6803508e5a Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 22 Jun 2014 21:09:33 -0700 Subject: [PATCH 174/184] Steal one page of physical heap - this helps catch virt underflows. --- src/xenia/cpu/xenon_memory.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xenia/cpu/xenon_memory.cc b/src/xenia/cpu/xenon_memory.cc index 10dcf5f47..8e3bf6dc5 100644 --- a/src/xenia/cpu/xenon_memory.cc +++ b/src/xenia/cpu/xenon_memory.cc @@ -316,7 +316,7 @@ int XenonMemory::Initialize() { virtual_heap_->Initialize( XENON_MEMORY_VIRTUAL_HEAP_LOW, XENON_MEMORY_VIRTUAL_HEAP_HIGH); physical_heap_->Initialize( - XENON_MEMORY_PHYSICAL_HEAP_LOW, XENON_MEMORY_PHYSICAL_HEAP_HIGH); + XENON_MEMORY_PHYSICAL_HEAP_LOW, XENON_MEMORY_PHYSICAL_HEAP_HIGH - 0x1000); // GPU writeback. // 0xC... is physical, 0x7F... is virtual. We may need to overlay these. From 4009eb1cd8edff47e0fbbcadc5a68778336b8b3a Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 22 Jun 2014 21:19:48 -0700 Subject: [PATCH 175/184] Fail out of XamContentCreateEnumerator. 
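Rather than pretending a content device exists, hand back an invalid handle
and X_ERROR_NO_MORE_FILES so enumeration loops terminate immediately. The
guest-side shape being satisfied is roughly (a sketch; the six leading
argument meanings are still unknown - the shim just decodes seven words, the
last being the out handle):

    uint32_t handle = 0;
    uint32_t result = XamContentCreateEnumerator(a0, a1, a2, a3, a4, a5,
                                                 &handle);
    if (result != X_ERROR_SUCCESS) {
      // result == X_ERROR_NO_MORE_FILES, handle == X_INVALID_HANDLE_VALUE:
      // nothing to enumerate, so the title moves on instead of spinning.
    }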
--- src/xenia/kernel/xam_content.cc | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/xenia/kernel/xam_content.cc b/src/xenia/kernel/xam_content.cc index 97469d0ca..5f53bc4ea 100644 --- a/src/xenia/kernel/xam_content.cc +++ b/src/xenia/kernel/xam_content.cc @@ -44,6 +44,7 @@ SHIM_CALL XamContentGetLicenseMask_shim( } +// http://gameservice.googlecode.com/svn-history/r14/trunk/ContentManager.cpp SHIM_CALL XamContentCreateEnumerator_shim( PPCContext* ppc_state, KernelState* state) { uint32_t arg0 = SHIM_GET_ARG_32(0); @@ -52,12 +53,15 @@ SHIM_CALL XamContentCreateEnumerator_shim( uint32_t arg3 = SHIM_GET_ARG_32(3); uint32_t arg4 = SHIM_GET_ARG_32(4); uint32_t arg5 = SHIM_GET_ARG_32(5); - uint32_t arg6 = SHIM_GET_ARG_32(6); + uint32_t handle_ptr = SHIM_GET_ARG_32(6); XELOGD( "XamContentCreateEnumerator(%.8X, %.8X, %.8X, %.8X, %.8X, %.8X, %.8X)", - arg0, arg1, arg2, arg3, arg4, arg5, arg6); - SHIM_SET_RETURN_32(X_ERROR_DEVICE_NOT_CONNECTED); + arg0, arg1, arg2, arg3, arg4, arg5, handle_ptr); + + SHIM_SET_MEM_32(handle_ptr, X_INVALID_HANDLE_VALUE); + + SHIM_SET_RETURN_32(X_ERROR_NO_MORE_FILES); } From 13b3fa6f8448cb3c0347fd80a35f58d3306cf790 Mon Sep 17 00:00:00 2001 From: tj Date: Tue, 24 Jun 2014 00:38:14 -0400 Subject: [PATCH 176/184] Fixed typo in NetDll_setsockopt logging. --- src/xenia/kernel/xam_net.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xenia/kernel/xam_net.cc b/src/xenia/kernel/xam_net.cc index 4ca580cce..1a8470c48 100644 --- a/src/xenia/kernel/xam_net.cc +++ b/src/xenia/kernel/xam_net.cc @@ -113,7 +113,7 @@ SHIM_CALL NetDll_setsockopt_shim( uint32_t optval_ptr = SHIM_GET_ARG_32(4); uint32_t optlen = SHIM_GET_ARG_32(5); XELOGD( - "NetDll_send(%d, %.8X, %d, %d, %.8X, %d)", + "NetDll_setsockopt(%d, %.8X, %d, %d, %.8X, %d)", arg0, socket_ptr, level, From 15be1061d36649c1faf2eda571aaae434cde273b Mon Sep 17 00:00:00 2001 From: Fire30 Date: Wed, 2 Jul 2014 18:17:09 -0400 Subject: [PATCH 177/184] Fixed arguments for NetDll_send. --- src/xenia/kernel/xam_net.cc | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/xenia/kernel/xam_net.cc b/src/xenia/kernel/xam_net.cc index 1a8470c48..dcf72f074 100644 --- a/src/xenia/kernel/xam_net.cc +++ b/src/xenia/kernel/xam_net.cc @@ -176,12 +176,14 @@ SHIM_CALL NetDll_recvfrom_shim( SHIM_CALL NetDll_send_shim( PPCContext* ppc_state, KernelState* state) { - uint32_t socket_ptr = SHIM_GET_ARG_32(0); - uint32_t buf_ptr = SHIM_GET_ARG_32(1); - uint32_t len = SHIM_GET_ARG_32(2); - uint32_t flags = SHIM_GET_ARG_32(3); + uint32_t arg0 = SHIM_GET_ARG_32(0); + uint32_t socket_ptr = SHIM_GET_ARG_32(1); + uint32_t buf_ptr = SHIM_GET_ARG_32(2); + uint32_t len = SHIM_GET_ARG_32(3); + uint32_t flags = SHIM_GET_ARG_32(4); XELOGD( - "NetDll_send(%.8X, %.8X, %d, %d)", + "NetDll_send(%d,%.8X, %.8X, %d, %d)", + arg0, socket_ptr, buf_ptr, len, From d5f27dbf34cc588cb2129760990657c799b39172 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Wed, 2 Jul 2014 15:39:30 -0700 Subject: [PATCH 178/184] Better arg passing for Execute and experimenting with APCs. 
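Two things here. Execute now takes a uint64_t args[] and maps it onto the PPC
calling convention (args[0] -> r3, args[1] -> r4, ..., asserted to at most
five), replacing the old one/two-arg overloads. And APC delivery walks the
guest KAPC structure by raw byte offsets; reconstructed as a struct, the
fields DeliverAPCs/RundownAPCs touch are roughly (a sketch inferred from the
offsets in the diff - the first 8 bytes are an untouched header and the
NT-style field names are assumptions):

    struct X_KAPC {             // guest-endian, 32-bit guest pointers
      uint8_t  header[8];       // +0   type/size etc., untouched here
      uint32_t list_flink;      // +8   LIST_ENTRY (apc_address = entry - 8)
      uint32_t list_blink;      // +12
      uint32_t kernel_routine;  // +16
      uint32_t rundown_routine; // +20
      uint32_t normal_routine;  // +24
      uint32_t normal_context;  // +28
      uint32_t system_arg1;     // +32
      uint32_t system_arg2;     // +36
      uint32_t flags;           // +40  0xFF00 holds the "inserted" bits
    };

The kernel routine receives guest-visible pointers to normal_routine/context
and the two system args (round-tripped through the new per-thread scratch
block), so it can rewrite or cancel the normal routine before it runs.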
--- src/xenia/apu/audio_system.cc | 3 +- src/xenia/cpu/processor.cc | 24 ++---- src/xenia/cpu/processor.h | 8 +- src/xenia/gpu/graphics_system.cc | 3 +- src/xenia/kernel/native_list.cc | 4 + src/xenia/kernel/native_list.h | 1 + src/xenia/kernel/objects/xthread.cc | 114 +++++++++++++++++++++++++++- src/xenia/kernel/objects/xthread.h | 5 ++ 8 files changed, 135 insertions(+), 27 deletions(-) diff --git a/src/xenia/apu/audio_system.cc b/src/xenia/apu/audio_system.cc index 897e9bc03..68b9ccbee 100644 --- a/src/xenia/apu/audio_system.cc +++ b/src/xenia/apu/audio_system.cc @@ -95,7 +95,8 @@ void AudioSystem::ThreadStart() { uint32_t client_callback_arg = clients_[index].wrapped_callback_arg; xe_mutex_unlock(lock_); if (client_callback) { - processor->Execute(thread_state_, client_callback, client_callback_arg, 0); + uint64_t args[] = { client_callback_arg }; + processor->Execute(thread_state_, client_callback, args, XECOUNT(args)); } pumped++; index++; diff --git a/src/xenia/cpu/processor.cc b/src/xenia/cpu/processor.cc index 3d3a76e72..969907d96 100644 --- a/src/xenia/cpu/processor.cc +++ b/src/xenia/cpu/processor.cc @@ -167,25 +167,15 @@ int Processor::Execute(XenonThreadState* thread_state, uint64_t address) { } uint64_t Processor::Execute( - XenonThreadState* thread_state, uint64_t address, uint64_t arg0) { + XenonThreadState* thread_state, uint64_t address, uint64_t args[], + size_t arg_count) { SCOPE_profile_cpu_f("cpu"); PPCContext* context = thread_state->context(); - context->r[3] = arg0; - if (Execute(thread_state, address)) { - return 0xDEADBABE; + XEASSERT(arg_count <= 5); + for (size_t i = 0; i < arg_count; ++i) { + context->r[3 + i] = args[i]; } - return context->r[3]; -} - -uint64_t Processor::Execute( - XenonThreadState* thread_state, uint64_t address, uint64_t arg0, - uint64_t arg1) { - SCOPE_profile_cpu_f("cpu"); - - PPCContext* context = thread_state->context(); - context->r[3] = arg0; - context->r[4] = arg1; if (Execute(thread_state, address)) { return 0xDEADBABE; } @@ -193,7 +183,7 @@ uint64_t Processor::Execute( } uint64_t Processor::ExecuteInterrupt( - uint32_t cpu, uint64_t address, uint64_t arg0, uint64_t arg1) { + uint32_t cpu, uint64_t address, uint64_t args[], size_t arg_count) { SCOPE_profile_cpu_f("cpu"); // Acquire lock on interrupt thread (we can only dispatch one at a time). @@ -204,7 +194,7 @@ uint64_t Processor::ExecuteInterrupt( XESETUINT8BE(p + interrupt_thread_block_ + 0x10C, cpu); // Execute interrupt. 
- uint64_t result = Execute(interrupt_thread_state_, address, arg0, arg1); + uint64_t result = Execute(interrupt_thread_state_, address, args, arg_count); xe_mutex_unlock(interrupt_thread_lock_); return result; diff --git a/src/xenia/cpu/processor.h b/src/xenia/cpu/processor.h index 25b367ff3..3ad8217db 100644 --- a/src/xenia/cpu/processor.h +++ b/src/xenia/cpu/processor.h @@ -42,13 +42,11 @@ public: int Execute( XenonThreadState* thread_state, uint64_t address); uint64_t Execute( - XenonThreadState* thread_state, uint64_t address, uint64_t arg0); - uint64_t Execute( - XenonThreadState* thread_state, uint64_t address, uint64_t arg0, - uint64_t arg1); + XenonThreadState* thread_state, uint64_t address, uint64_t args[], + size_t arg_count); uint64_t ExecuteInterrupt( - uint32_t cpu, uint64_t address, uint64_t arg0, uint64_t arg1); + uint32_t cpu, uint64_t address, uint64_t args[], size_t arg_count); virtual void OnDebugClientConnected(uint32_t client_id); virtual void OnDebugClientDisconnected(uint32_t client_id); diff --git a/src/xenia/gpu/graphics_system.cc b/src/xenia/gpu/graphics_system.cc index 212074168..86905cc48 100644 --- a/src/xenia/gpu/graphics_system.cc +++ b/src/xenia/gpu/graphics_system.cc @@ -195,6 +195,7 @@ void GraphicsSystem::DispatchInterruptCallback( if (!interrupt_callback_) { return; } + uint64_t args[] = { source, interrupt_callback_data_ }; processor_->ExecuteInterrupt( - cpu, interrupt_callback_, source, interrupt_callback_data_); + cpu, interrupt_callback_, args, XECOUNT(args)); } diff --git a/src/xenia/kernel/native_list.cc b/src/xenia/kernel/native_list.cc index fbf792124..bc922d31d 100644 --- a/src/xenia/kernel/native_list.cc +++ b/src/xenia/kernel/native_list.cc @@ -66,3 +66,7 @@ uint32_t NativeList::Shift() { Remove(ptr); return ptr; } + +bool NativeList::HasPending() { + return head_ != kInvalidPointer; +} diff --git a/src/xenia/kernel/native_list.h b/src/xenia/kernel/native_list.h index e242d822f..d521ea937 100644 --- a/src/xenia/kernel/native_list.h +++ b/src/xenia/kernel/native_list.h @@ -38,6 +38,7 @@ public: bool IsQueued(uint32_t list_entry_ptr); void Remove(uint32_t list_entry_ptr); uint32_t Shift(); + bool HasPending(); private: const uint32_t kInvalidPointer = 0xE0FE0FFF; diff --git a/src/xenia/kernel/objects/xthread.cc b/src/xenia/kernel/objects/xthread.cc index 7a4c8f5d3..d8e4bac85 100644 --- a/src/xenia/kernel/objects/xthread.cc +++ b/src/xenia/kernel/objects/xthread.cc @@ -78,6 +78,9 @@ XThread::~XThread() { if (thread_state_) { delete thread_state_; } + if (scratch_address_) { + kernel_state()->memory()->HeapFree(scratch_address_, 0); + } if (tls_address_) { kernel_state()->memory()->HeapFree(tls_address_, 0); } @@ -194,6 +197,12 @@ X_STATUS XThread::Create() { XUserModule* module = kernel_state()->GetExecutableModule(); + // Allocate thread scratch. + // This is used by interrupts/APCs/etc so we can round-trip pointers through. + scratch_size_ = 4 * 16; + scratch_address_ = (uint32_t)memory()->HeapAlloc( + 0, scratch_size_, MEMORY_FLAG_ZERO); + // Allocate TLS block. const xe_xex2_header_t* header = module->xex_header(); uint32_t tls_size = header->tls_info.slot_count * header->tls_info.data_size; @@ -244,6 +253,7 @@ X_STATUS XThread::Exit(int exit_code) { // TODO(benvanik); dispatch events? waiters? etc? event_->Set(0, false); + RundownAPCs(); // NOTE: unless PlatformExit fails, expect it to never return! 
X_STATUS return_code = PlatformExit(exit_code); @@ -365,15 +375,21 @@ void XThread::Execute() { // If a XapiThreadStartup value is present, we use that as a trampoline. // Otherwise, we are a raw thread. if (creation_params_.xapi_thread_startup) { + uint64_t args[] = { + creation_params_.start_address, + creation_params_.start_context + }; kernel_state()->processor()->Execute( thread_state_, - creation_params_.xapi_thread_startup, - creation_params_.start_address, creation_params_.start_context); + creation_params_.xapi_thread_startup, args, XECOUNT(args)); } else { // Run user code. + uint64_t args[] = { + creation_params_.start_context + }; int exit_code = (int)kernel_state()->processor()->Execute( thread_state_, - creation_params_.start_address, creation_params_.start_context); + creation_params_.start_address, args, XECOUNT(args)); // If we got here it means the execute completed without an exit being called. // Treat the return code as an implicit exit code. Exit(exit_code); @@ -402,7 +418,99 @@ void XThread::LockApc() { } void XThread::UnlockApc() { + bool needs_apc = apc_list_->HasPending(); xe_mutex_unlock(apc_lock_); + if (needs_apc) { + QueueUserAPC(reinterpret_cast(DeliverAPCs), + thread_handle_, + reinterpret_cast(this)); + } +} + +void XThread::DeliverAPCs(void* data) { + // http://www.drdobbs.com/inside-nts-asynchronous-procedure-call/184416590?pgno=1 + // http://www.drdobbs.com/inside-nts-asynchronous-procedure-call/184416590?pgno=7 + XThread* thread = reinterpret_cast(data); + auto membase = thread->memory()->membase(); + auto processor = thread->kernel_state()->processor(); + auto apc_list = thread->apc_list(); + thread->LockApc(); + while (apc_list->HasPending()) { + // Get APC entry (offset for LIST_ENTRY offset) and cache what we need. + // Calling the routine may delete the memory/overwrite it. + uint32_t apc_address = apc_list->Shift() - 8; + uint8_t* apc_ptr = membase + apc_address; + uint32_t kernel_routine = XEGETUINT32BE(apc_ptr + 16); + uint32_t normal_routine = XEGETUINT32BE(apc_ptr + 24); + uint32_t normal_context = XEGETUINT32BE(apc_ptr + 28); + uint32_t system_arg1 = XEGETUINT32BE(apc_ptr + 32); + uint32_t system_arg2 = XEGETUINT32BE(apc_ptr + 36); + + // Mark as uninserted so that it can be reinserted again by the routine. + uint32_t old_flags = XEGETUINT32BE(apc_ptr + 40); + XESETUINT32BE(apc_ptr + 40, old_flags & ~0xFF00); + + // Call kernel routine. + // The routine can modify all of its arguments before passing it on. + // Since we need to give guest accessible pointers over, we copy things + // into and out of scratch. + uint8_t* scratch_ptr = membase + thread->scratch_address_; + XESETUINT32BE(scratch_ptr + 0, normal_routine); + XESETUINT32BE(scratch_ptr + 4, normal_context); + XESETUINT32BE(scratch_ptr + 8, system_arg1); + XESETUINT32BE(scratch_ptr + 12, system_arg2); + // kernel_routine(apc_address, &normal_routine, &normal_context, &system_arg1, &system_arg2) + uint64_t kernel_args[] = { + apc_address, + thread->scratch_address_ + 0, + thread->scratch_address_ + 4, + thread->scratch_address_ + 8, + thread->scratch_address_ + 12, + }; + processor->ExecuteInterrupt( + 0, kernel_routine, kernel_args, XECOUNT(kernel_args)); + normal_routine = XEGETUINT32BE(scratch_ptr + 0); + normal_context = XEGETUINT32BE(scratch_ptr + 4); + system_arg1 = XEGETUINT32BE(scratch_ptr + 8); + system_arg2 = XEGETUINT32BE(scratch_ptr + 12); + + // Call the normal routine. Note that it may have been killed by the kernel + // routine. 
+ if (normal_routine) { + thread->UnlockApc(); + // normal_routine(normal_context, system_arg1, system_arg2) + uint64_t normal_args[] = { normal_context, system_arg1, system_arg2 }; + processor->ExecuteInterrupt( + 0, normal_routine, normal_args, XECOUNT(normal_args)); + thread->LockApc(); + } + } + thread->UnlockApc(); +} + +void XThread::RundownAPCs() { + auto membase = memory()->membase(); + LockApc(); + while (apc_list_->HasPending()) { + // Get APC entry (offset for LIST_ENTRY offset) and cache what we need. + // Calling the routine may delete the memory/overwrite it. + uint32_t apc_address = apc_list_->Shift() - 8; + uint8_t* apc_ptr = membase + apc_address; + uint32_t rundown_routine = XEGETUINT32BE(apc_ptr + 20); + + // Mark as uninserted so that it can be reinserted again by the routine. + uint32_t old_flags = XEGETUINT32BE(apc_ptr + 40); + XESETUINT32BE(apc_ptr + 40, old_flags & ~0xFF00); + + // Call the rundown routine. + if (rundown_routine) { + // rundown_routine(apc) + uint64_t args[] = { apc_address }; + kernel_state()->processor()->ExecuteInterrupt( + 0, rundown_routine, args, XECOUNT(args)); + } + } + UnlockApc(); } int32_t XThread::QueryPriority() { diff --git a/src/xenia/kernel/objects/xthread.h b/src/xenia/kernel/objects/xthread.h index a11bad9ea..8b403429b 100644 --- a/src/xenia/kernel/objects/xthread.h +++ b/src/xenia/kernel/objects/xthread.h @@ -73,6 +73,9 @@ private: void PlatformDestroy(); X_STATUS PlatformExit(int exit_code); + static void DeliverAPCs(void* data); + void RundownAPCs(); + struct { uint32_t stack_size; uint32_t xapi_thread_startup; @@ -83,6 +86,8 @@ private: uint32_t thread_id_; void* thread_handle_; + uint32_t scratch_address_; + uint32_t scratch_size_; uint32_t tls_address_; uint32_t thread_state_address_; cpu::XenonThreadState* thread_state_; From a0cb3416627c8a96d049d3600c2ab0e6689d7281 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Wed, 2 Jul 2014 21:57:21 -0700 Subject: [PATCH 179/184] Trying really hard to keep games from spinning up networking. --- src/xenia/kernel/xam_net.cc | 34 ++++++++++++++++++++++++---------- src/xenia/xbox.h | 5 +++++ 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/src/xenia/kernel/xam_net.cc b/src/xenia/kernel/xam_net.cc index dcf72f074..3133b4311 100644 --- a/src/xenia/kernel/xam_net.cc +++ b/src/xenia/kernel/xam_net.cc @@ -35,7 +35,8 @@ SHIM_CALL NetDll_XNetStartup_shim( arg0, params_ptr); - SHIM_SET_RETURN_64(0); + // Fail for now. + SHIM_SET_RETURN_64(1); } SHIM_CALL NetDll_WSAStartup_shim( @@ -60,13 +61,25 @@ SHIM_CALL NetDll_WSAStartup_shim( SHIM_SET_MEM_32(data_ptr + 0x190, 0); } - SHIM_SET_RETURN_64(0); + // Fail for now. This prevents games from actually trying to use this stuff. 
+ SHIM_SET_RETURN_64(1); } SHIM_CALL NetDll_WSAGetLastError_shim( PPCContext* ppc_state, KernelState* state) { XELOGD("NetDll_WSAGetLastError()"); - SHIM_SET_RETURN_32(WSAENETDOWN); + SHIM_SET_RETURN_32(10093L); // WSANOTINITIALISED +} + +SHIM_CALL NetDll_XNetGetTitleXnAddr_shim( + PPCContext* ppc_state, KernelState* state) { + uint32_t arg0 = SHIM_GET_ARG_32(0); + uint32_t arg1 = SHIM_GET_ARG_32(1); + XELOGD( + "NetDll_XNetGetTitleXnAddr(%d, %.8X)", + arg0, + arg1); + SHIM_SET_RETURN_32(0x00000001); } SHIM_CALL NetDll_XNetGetEthernetLinkStatus_shim( @@ -86,7 +99,7 @@ SHIM_CALL NetDll_inet_addr_shim( XELOGD( "NetDll_inet_addr(%.8X)", cp_ptr); - SHIM_SET_RETURN_32(INADDR_NONE); + SHIM_SET_RETURN_32(0xFFFFFFFF); // X_INADDR_NONE } SHIM_CALL NetDll_socket_shim( @@ -101,7 +114,7 @@ SHIM_CALL NetDll_socket_shim( af, type, protocol); - SHIM_SET_RETURN_32(SOCKET_ERROR); + SHIM_SET_RETURN_32(X_SOCKET_ERROR); } SHIM_CALL NetDll_setsockopt_shim( @@ -120,7 +133,7 @@ SHIM_CALL NetDll_setsockopt_shim( optname, optval_ptr, optlen); - SHIM_SET_RETURN_32(SOCKET_ERROR); + SHIM_SET_RETURN_32(X_SOCKET_ERROR); } SHIM_CALL NetDll_connect_shim( @@ -133,7 +146,7 @@ SHIM_CALL NetDll_connect_shim( socket_ptr, sockaddr_ptr, namelen); - SHIM_SET_RETURN_32(SOCKET_ERROR); + SHIM_SET_RETURN_32(X_SOCKET_ERROR); } SHIM_CALL NetDll_recv_shim( @@ -150,7 +163,7 @@ SHIM_CALL NetDll_recv_shim( buf_ptr, len, flags); - SHIM_SET_RETURN_32(SOCKET_ERROR); + SHIM_SET_RETURN_32(X_SOCKET_ERROR); } SHIM_CALL NetDll_recvfrom_shim( @@ -171,7 +184,7 @@ SHIM_CALL NetDll_recvfrom_shim( flags, from_ptr, fromlen_ptr); - SHIM_SET_RETURN_32(SOCKET_ERROR); + SHIM_SET_RETURN_32(X_SOCKET_ERROR); } SHIM_CALL NetDll_send_shim( @@ -188,7 +201,7 @@ SHIM_CALL NetDll_send_shim( buf_ptr, len, flags); - SHIM_SET_RETURN_32(SOCKET_ERROR); + SHIM_SET_RETURN_32(X_SOCKET_ERROR); } @@ -201,6 +214,7 @@ void xe::kernel::xam::RegisterNetExports( SHIM_SET_MAPPING("xam.xex", NetDll_XNetStartup, state); SHIM_SET_MAPPING("xam.xex", NetDll_WSAStartup, state); SHIM_SET_MAPPING("xam.xex", NetDll_WSAGetLastError, state); + SHIM_SET_MAPPING("xam.xex", NetDll_XNetGetTitleXnAddr, state); SHIM_SET_MAPPING("xam.xex", NetDll_XNetGetEthernetLinkStatus, state); SHIM_SET_MAPPING("xam.xex", NetDll_inet_addr, state); SHIM_SET_MAPPING("xam.xex", NetDll_socket, state); diff --git a/src/xenia/xbox.h b/src/xenia/xbox.h index 85ad57a41..fcf079751 100644 --- a/src/xenia/xbox.h +++ b/src/xenia/xbox.h @@ -116,6 +116,11 @@ typedef uint32_t X_RESULT; #define X_PROCTYPE_SYSTEM 2 +// Sockets/networking. +#define X_INVALID_SOCKET (uint32_t)(~0) +#define X_SOCKET_ERROR (uint32_t)(-1) + + // Thread enums. #define X_CREATE_SUSPENDED 0x00000004 From 3b853f6201d0ede7d0c087c730956cf8ac789efa Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Thu, 3 Jul 2014 12:58:27 -0700 Subject: [PATCH 180/184] WSAStartup tweak. --- src/xenia/kernel/xam_net.cc | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/xenia/kernel/xam_net.cc b/src/xenia/kernel/xam_net.cc index 3133b4311..18094a2ed 100644 --- a/src/xenia/kernel/xam_net.cc +++ b/src/xenia/kernel/xam_net.cc @@ -35,8 +35,7 @@ SHIM_CALL NetDll_XNetStartup_shim( arg0, params_ptr); - // Fail for now. 
- SHIM_SET_RETURN_64(1); + SHIM_SET_RETURN_64(0); } SHIM_CALL NetDll_WSAStartup_shim( @@ -53,16 +52,18 @@ SHIM_CALL NetDll_WSAStartup_shim( if (data_ptr) { SHIM_SET_MEM_16(data_ptr + 0x000, version); - SHIM_SET_MEM_16(data_ptr + 0x002, 0); + SHIM_SET_MEM_16(data_ptr + 0x002, version); SHIM_SET_MEM_32(data_ptr + 0x004, 0); SHIM_SET_MEM_32(data_ptr + 0x105, 0); SHIM_SET_MEM_16(data_ptr + 0x186, 0); SHIM_SET_MEM_16(data_ptr + 0x188, 0); - SHIM_SET_MEM_32(data_ptr + 0x190, 0); + // Some games (PoG) want this value round-tripped - they'll compare if it + // changes and bugcheck if it does. + uint32_t vendor_ptr = SHIM_MEM_32(data_ptr + 0x190); + SHIM_SET_MEM_32(data_ptr + 0x190, vendor_ptr); } - // Fail for now. This prevents games from actually trying to use this stuff. - SHIM_SET_RETURN_64(1); + SHIM_SET_RETURN_64(0); } SHIM_CALL NetDll_WSAGetLastError_shim( From 824d3c246fea4a038f24004abad1ee1790a85795 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Thu, 3 Jul 2014 12:58:56 -0700 Subject: [PATCH 181/184] Prevent null/broken shaders from dying in d3dcompiler. --- src/xenia/gpu/d3d11/d3d11_shader_translator.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/xenia/gpu/d3d11/d3d11_shader_translator.cc b/src/xenia/gpu/d3d11/d3d11_shader_translator.cc index 549f0c72d..5bb28c6e6 100644 --- a/src/xenia/gpu/d3d11/d3d11_shader_translator.cc +++ b/src/xenia/gpu/d3d11/d3d11_shader_translator.cc @@ -274,7 +274,8 @@ int D3D11ShaderTranslator::TranslatePixelShader( // Pixel shader main() header. append( "PS_OUTPUT main(VS_OUTPUT i) {\n" - " PS_OUTPUT o;\n"); + " PS_OUTPUT o;\n" + " o.oC0 = float4(1.0, 0.0, 0.0, 1.0);\n"); // Add temporary registers. uint32_t temp_regs = program_cntl.vs_regs + program_cntl.ps_regs; From a5627ee25aa4d71aa3eeafea09e4fd61573ec03a Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Thu, 3 Jul 2014 12:59:08 -0700 Subject: [PATCH 182/184] Hide lock logging. 
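Critical-section enter/leave and KeGetCurrentProcessType sit on extremely hot
paths - even formatting the message shows up in profiles - so commenting the
XELOGD calls out is the blunt fix. A less blunt alternative would be a
compile-time gate (a sketch; XE_LOG_HOT_PATHS and XELOGD_HOT are hypothetical,
not existing macros):

    #if XE_LOG_HOT_PATHS
    #define XELOGD_HOT(...) XELOGD(__VA_ARGS__)
    #else
    #define XELOGD_HOT(...) do {} while (0)
    #endif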
--- src/xenia/kernel/xboxkrnl_rtl.cc | 4 ++-- src/xenia/kernel/xboxkrnl_threading.cc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/xenia/kernel/xboxkrnl_rtl.cc b/src/xenia/kernel/xboxkrnl_rtl.cc index 9fb0bcdbd..f5d00f19a 100644 --- a/src/xenia/kernel/xboxkrnl_rtl.cc +++ b/src/xenia/kernel/xboxkrnl_rtl.cc @@ -639,7 +639,7 @@ SHIM_CALL RtlEnterCriticalSection_shim( uint32_t cs_ptr = SHIM_GET_ARG_32(0); - XELOGD("RtlEnterCriticalSection(%.8X)", cs_ptr); + // XELOGD("RtlEnterCriticalSection(%.8X)", cs_ptr); const uint8_t* thread_state_block = ppc_state->membase + ppc_state->r[13]; uint32_t thread_id = XThread::GetCurrentThreadId(thread_state_block); @@ -717,7 +717,7 @@ SHIM_CALL RtlLeaveCriticalSection_shim( PPCContext* ppc_state, KernelState* state) { uint32_t cs_ptr = SHIM_GET_ARG_32(0); - XELOGD("RtlLeaveCriticalSection(%.8X)", cs_ptr); + // XELOGD("RtlLeaveCriticalSection(%.8X)", cs_ptr); xeRtlLeaveCriticalSection(cs_ptr); } diff --git a/src/xenia/kernel/xboxkrnl_threading.cc b/src/xenia/kernel/xboxkrnl_threading.cc index 0e119491b..064f34d7b 100644 --- a/src/xenia/kernel/xboxkrnl_threading.cc +++ b/src/xenia/kernel/xboxkrnl_threading.cc @@ -368,8 +368,8 @@ uint32_t xeKeGetCurrentProcessType() { SHIM_CALL KeGetCurrentProcessType_shim( PPCContext* ppc_state, KernelState* state) { - XELOGD( - "KeGetCurrentProcessType()"); + // XELOGD( + // "KeGetCurrentProcessType()"); int result = xeKeGetCurrentProcessType(); SHIM_SET_RETURN_64(result); From de8da767e9bf371277229cb2a7a282f2e5946ca2 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Fri, 4 Jul 2014 09:38:00 -0700 Subject: [PATCH 183/184] Update README.md --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index e51170837..1b4468c3e 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,6 @@ that there are some major work areas still untouched: * Write an [OpenGL driver](https://github.com/benvanik/xenia/issues/59) * Add input drivers for [OSX](https://github.com/benvanik/xenia/issues/61) and [PS4 controllers](https://github.com/benvanik/xenia/issues/60) (or anything else) * Start [hacking on audio](https://github.com/benvanik/xenia/issues/62) -* Support [loading of PIRS files](https://github.com/benvanik/xenia/issues/63) * Build a [virtual LIVE service](https://github.com/benvanik/xenia/issues/64) See more projects [good for contributors](https://github.com/benvanik/xenia/issues?labels=good+for+contributors&page=1&state=open). It's a good idea to ask on IRC/the bugs before beginning work From ef75042f380cd9b146443ecdcb02646ed28b202a Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Fri, 4 Jul 2014 09:59:16 -0700 Subject: [PATCH 184/184] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 1b4468c3e..e0460e25c 100644 --- a/README.md +++ b/README.md @@ -63,6 +63,7 @@ doing. Fixes and optimizations are always welcome (please!), but in addition to that there are some major work areas still untouched: +* Help work through missing functionality/bugs in game [compat](https://github.com/benvanik/xenia/issues?labels=compat) * Write an [OpenGL driver](https://github.com/benvanik/xenia/issues/59) * Add input drivers for [OSX](https://github.com/benvanik/xenia/issues/61) and [PS4 controllers](https://github.com/benvanik/xenia/issues/60) (or anything else) * Start [hacking on audio](https://github.com/benvanik/xenia/issues/62)