diff --git a/src/xenia/base/memory.h b/src/xenia/base/memory.h
index 14fb65968..01cf40f87 100644
--- a/src/xenia/base/memory.h
+++ b/src/xenia/base/memory.h
@@ -466,6 +466,59 @@ constexpr inline fourcc_t make_fourcc(const std::string_view fourcc) {
   }
   return make_fourcc(fourcc[0], fourcc[1], fourcc[2], fourcc[3]);
 }
+
+// chrispy::todo: use for command stream vector, resize happens a ton and has
+// to call memset
+//
+// Byte buffer backed by one fixed-size allocation of `sz` bytes, reserved and
+// committed in the constructor via AllocFixed and released in the destructor.
+// resize()/reserve() only check the request against that fixed capacity; they
+// never reallocate and never write to the storage.
+template <size_t sz>
+class fixed_vmem_vector {
+  static_assert((sz & 65535) == 0,
+                "Always give fixed_vmem_vector a size divisible by 65536 to "
+                "avoid wasting memory on windows");
+
+  uint8_t* data_;
+  size_t nbytes_;
+
+ public:
+  fixed_vmem_vector()
+      : data_((uint8_t*)AllocFixed(nullptr, sz, AllocationType::kReserveCommit,
+                                   PageAccess::kReadWrite)),
+        nbytes_(0) {}
+  ~fixed_vmem_vector() {
+    if (data_) {
+      DeallocFixed(data_, sz, DeallocationType::kRelease);
+      data_ = nullptr;
+    }
+    nbytes_ = 0;
+  }
+
+  // The allocation is released in the destructor, so a copy would release
+  // the same region twice. Copying is therefore disabled.
+  fixed_vmem_vector(const fixed_vmem_vector&) = delete;
+  fixed_vmem_vector& operator=(const fixed_vmem_vector&) = delete;
+
+  uint8_t* data() const { return data_; }
+  size_t size() const { return nbytes_; }
+
+  // Sets the logical size in bytes; the storage itself is left untouched.
+  void resize(size_t newsize) {
+    // Check before storing. newsize == sz still fits in the allocation.
+    xenia_assert(newsize <= sz);
+    nbytes_ = newsize;
+  }
+  // Fixed capacity in bytes.
+  size_t alloc() const { return sz; }
+
+  void clear() {
+    resize(0);  // todo:maybe zero out
+  }
+  // Capacity is fixed; this only checks that the request fits.
+  void reserve(size_t size) { xenia_assert(size <= sz); }
+};
 
 }  // namespace xe
 
diff --git a/src/xenia/base/mutex.h b/src/xenia/base/mutex.h
index e93f71e1b..36351377b 100644
--- a/src/xenia/base/mutex.h
+++ b/src/xenia/base/mutex.h
@@ -12,7 +12,7 @@
 #include <mutex>
 #include "platform.h"
 
-//#define XE_ENABLE_FAST_WIN32_MUTEX 1
+#define XE_ENABLE_FAST_WIN32_MUTEX 1
 namespace xe {
 
 #if XE_PLATFORM_WIN32 == 1 && XE_ENABLE_FAST_WIN32_MUTEX==1
diff --git a/src/xenia/gpu/command_processor.cc b/src/xenia/gpu/command_processor.cc
index dfc993dee..4205016cd 100644
--- a/src/xenia/gpu/command_processor.cc
+++ b/src/xenia/gpu/command_processor.cc
@@ -493,13 +493,18 @@ void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
     // very unlikely.
these ORS here are meant to be bitwise ors, so that we do // not do branching evaluation of the conditions (we will almost always take // all of the branches) - if (XE_UNLIKELY( - (index - XE_GPU_REG_SCRATCH_REG0 < 8) | - (index == XE_GPU_REG_COHER_STATUS_HOST) | - ((index - XE_GPU_REG_DC_LUT_RW_INDEX) <= - (XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX)))) { + + unsigned expr = (index - XE_GPU_REG_SCRATCH_REG0 < 8) | + (index == XE_GPU_REG_COHER_STATUS_HOST) | + ((index - XE_GPU_REG_DC_LUT_RW_INDEX) <= + (XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX)); + //chrispy: reordered for msvc branch probability (assumes if is taken and else is not) + if (XE_LIKELY(expr == 0)) { + + } else { HandleSpecialRegisterWrite(index, value); } + } void CommandProcessor::MakeCoherent() { diff --git a/src/xenia/gpu/command_processor.h b/src/xenia/gpu/command_processor.h index 412e8833d..c9245773b 100644 --- a/src/xenia/gpu/command_processor.h +++ b/src/xenia/gpu/command_processor.h @@ -153,6 +153,7 @@ class CommandProcessor { // rarely needed, most register writes have no special logic here XE_NOINLINE void HandleSpecialRegisterWrite(uint32_t index, uint32_t value); + XE_FORCEINLINE virtual void WriteRegister(uint32_t index, uint32_t value); const reg::DC_LUT_30_COLOR* gamma_ramp_256_entry_table() const { diff --git a/src/xenia/gpu/d3d12/deferred_command_list.h b/src/xenia/gpu/d3d12/deferred_command_list.h index 22b4fc5da..a1b063558 100644 --- a/src/xenia/gpu/d3d12/deferred_command_list.h +++ b/src/xenia/gpu/d3d12/deferred_command_list.h @@ -30,8 +30,11 @@ class D3D12CommandProcessor; class DeferredCommandList { public: + /* + chrispy: upped from 1_MiB to 4_MiB, m:durandal hits frequent resizes in large open maps + */ DeferredCommandList(const D3D12CommandProcessor& command_processor, - size_t initial_size_bytes = 1_MiB); + size_t initial_size_bytes = 4_MiB); void Reset(); void Execute(ID3D12GraphicsCommandList* command_list,