diff --git a/src/xenia/gpu/command_processor.cc b/src/xenia/gpu/command_processor.cc index 0fe9d7cbf..e6422bc73 100644 --- a/src/xenia/gpu/command_processor.cc +++ b/src/xenia/gpu/command_processor.cc @@ -12,6 +12,7 @@ #include #include #include +#include #include "third_party/fmt/include/fmt/format.h" #include "xenia/base/byte_stream.h" @@ -49,22 +50,23 @@ CommandProcessor::~CommandProcessor() = default; bool CommandProcessor::Initialize() { // Initialize the gamma ramps to their default (linear) values - taken from - // what games set when starting. + // what games set when starting with the sRGB (return value 1) + // VdGetCurrentDisplayGamma. for (uint32_t i = 0; i < 256; ++i) { - uint32_t value = i * 1023 / 255; - gamma_ramp_.table[i].value = value | (value << 10) | (value << 20); + uint32_t value = i * 0x3FF / 0xFF; + reg::DC_LUT_30_COLOR& gamma_ramp_entry = gamma_ramp_256_entry_table_[i]; + gamma_ramp_entry.color_10_blue = value; + gamma_ramp_entry.color_10_green = value; + gamma_ramp_entry.color_10_red = value; } for (uint32_t i = 0; i < 128; ++i) { - uint32_t value = (i * 65535 / 127) & ~63; - if (i < 127) { - value |= 0x200 << 16; - } + reg::DC_LUT_PWL_DATA gamma_ramp_entry = {}; + gamma_ramp_entry.base = (i * 0xFFFF / 0x7F) & ~UINT32_C(0x3F); + gamma_ramp_entry.delta = i < 0x7F ? 
0x200 : 0; for (uint32_t j = 0; j < 3; ++j) { - gamma_ramp_.pwl[i].values[j].value = value; + gamma_ramp_pwl_rgb_[i][j] = gamma_ramp_entry; } } - dirty_gamma_ramp_table_ = true; - dirty_gamma_ramp_pwl_ = true; worker_running_ = true; worker_thread_ = kernel::object_ref( @@ -128,6 +130,46 @@ void CommandProcessor::EndTracing() { trace_writer_.Close(); } +void CommandProcessor::RestoreRegisters(uint32_t first_register, + const uint32_t* register_values, + uint32_t register_count, + bool execute_callbacks) { + if (first_register > RegisterFile::kRegisterCount || + RegisterFile::kRegisterCount - first_register < register_count) { + XELOGW( + "CommandProcessor::RestoreRegisters out of bounds (0x{:X} registers " + "starting with 0x{:X}, while a total of 0x{:X} registers are stored)", + register_count, first_register, RegisterFile::kRegisterCount); + if (first_register > RegisterFile::kRegisterCount) { + return; + } + register_count = + std::min(uint32_t(RegisterFile::kRegisterCount) - first_register, + register_count); + } + if (execute_callbacks) { + for (uint32_t i = 0; i < register_count; ++i) { + WriteRegister(first_register + i, register_values[i]); + } + } else { + std::memcpy(register_file_->values + first_register, register_values, + sizeof(uint32_t) * register_count); + } +} + +void CommandProcessor::RestoreGammaRamp( + const reg::DC_LUT_30_COLOR* new_gamma_ramp_256_entry_table, + const reg::DC_LUT_PWL_DATA* new_gamma_ramp_pwl_rgb, + uint32_t new_gamma_ramp_rw_component) { + std::memcpy(gamma_ramp_256_entry_table_, new_gamma_ramp_256_entry_table, + sizeof(reg::DC_LUT_30_COLOR) * 256); + std::memcpy(gamma_ramp_pwl_rgb_, new_gamma_ramp_pwl_rgb, + sizeof(reg::DC_LUT_PWL_DATA) * 3 * 128); + gamma_ramp_rw_component_ = new_gamma_ramp_rw_component; + OnGammaRamp256EntryTableValueWritten(); + OnGammaRampPWLValueWritten(); +} + void CommandProcessor::CallInThread(std::function fn) { if (pending_fns_.empty() && kernel::XThread::IsInThread(worker_thread_.get())) { @@ 
-286,68 +328,141 @@ void CommandProcessor::UpdateWritePointer(uint32_t value) { } void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { - RegisterFile* regs = register_file_; + RegisterFile& regs = *register_file_; if (index >= RegisterFile::kRegisterCount) { XELOGW("CommandProcessor::WriteRegister index out of bounds: {}", index); return; } - regs->values[index].u32 = value; - if (!regs->GetRegisterInfo(index)) { + regs.values[index].u32 = value; + if (!regs.GetRegisterInfo(index)) { XELOGW("GPU: Write to unknown register ({:04X} = {:08X})", index, value); } - // If this is a COHER register, set the dirty flag. - // This will block the command processor the next time it WAIT_MEM_REGs and - // allow us to synchronize the memory. - if (index == XE_GPU_REG_COHER_STATUS_HOST) { - regs->values[index].u32 |= 0x80000000ul; - } - // Scratch register writeback. if (index >= XE_GPU_REG_SCRATCH_REG0 && index <= XE_GPU_REG_SCRATCH_REG7) { uint32_t scratch_reg = index - XE_GPU_REG_SCRATCH_REG0; - if ((1 << scratch_reg) & regs->values[XE_GPU_REG_SCRATCH_UMSK].u32) { + if ((1 << scratch_reg) & regs.values[XE_GPU_REG_SCRATCH_UMSK].u32) { // Enabled - write to address. - uint32_t scratch_addr = regs->values[XE_GPU_REG_SCRATCH_ADDR].u32; + uint32_t scratch_addr = regs.values[XE_GPU_REG_SCRATCH_ADDR].u32; uint32_t mem_addr = scratch_addr + (scratch_reg * 4); xe::store_and_swap(memory_->TranslatePhysical(mem_addr), value); } - } -} + } else { + switch (index) { + // If this is a COHER register, set the dirty flag. + // This will block the command processor the next time it WAIT_MEM_REGs + // and allow us to synchronize the memory. 
+ case XE_GPU_REG_COHER_STATUS_HOST: { + regs.values[index].u32 |= UINT32_C(0x80000000); + } break; -void CommandProcessor::UpdateGammaRampValue(GammaRampType type, - uint32_t value) { - RegisterFile* regs = register_file_; + case XE_GPU_REG_DC_LUT_RW_INDEX: { + // Reset the sequential read / write component index (see the M56 + // DC_LUT_SEQ_COLOR documentation). + gamma_ramp_rw_component_ = 0; + } break; - auto index = regs->values[XE_GPU_REG_DC_LUT_RW_INDEX].u32; + case XE_GPU_REG_DC_LUT_SEQ_COLOR: { + // Should be in the 256-entry table writing mode. + assert_zero(regs[XE_GPU_REG_DC_LUT_RW_MODE].u32 & 0b1); + auto& gamma_ramp_rw_index = regs.Get(); + // DC_LUT_SEQ_COLOR is in the red, green, blue order, but the write + // enable mask is blue, green, red. + bool write_gamma_ramp_component = + (regs[XE_GPU_REG_DC_LUT_WRITE_EN_MASK].u32 & + (UINT32_C(1) << (2 - gamma_ramp_rw_component_))) != 0; + if (write_gamma_ramp_component) { + reg::DC_LUT_30_COLOR& gamma_ramp_entry = + gamma_ramp_256_entry_table_[gamma_ramp_rw_index.rw_index]; + // Bits 0:5 are hardwired to zero. + uint32_t gamma_ramp_seq_color = + regs.Get().seq_color >> 6; + switch (gamma_ramp_rw_component_) { + case 0: + gamma_ramp_entry.color_10_red = gamma_ramp_seq_color; + break; + case 1: + gamma_ramp_entry.color_10_green = gamma_ramp_seq_color; + break; + case 2: + gamma_ramp_entry.color_10_blue = gamma_ramp_seq_color; + break; + } + } + if (++gamma_ramp_rw_component_ >= 3) { + gamma_ramp_rw_component_ = 0; + ++gamma_ramp_rw_index.rw_index; + } + if (write_gamma_ramp_component) { + OnGammaRamp256EntryTableValueWritten(); + } + } break; - auto mask = regs->values[XE_GPU_REG_DC_LUT_WRITE_EN_MASK].u32; - auto mask_lo = (mask >> 0) & 0x7; - auto mask_hi = (mask >> 3) & 0x7; + case XE_GPU_REG_DC_LUT_PWL_DATA: { + // Should be in the PWL writing mode. + assert_not_zero(regs[XE_GPU_REG_DC_LUT_RW_MODE].u32 & 0b1); + auto& gamma_ramp_rw_index = regs.Get(); + // Bit 7 of the index is ignored for PWL. 
+ uint32_t gamma_ramp_rw_index_pwl = gamma_ramp_rw_index.rw_index & 0x7F; + // DC_LUT_RW_INDEX is likely in the red, green, blue order because + // DC_LUT_SEQ_COLOR is, but the write enable mask is blue, green, red. + bool write_gamma_ramp_component = + (regs[XE_GPU_REG_DC_LUT_WRITE_EN_MASK].u32 & + (UINT32_C(1) << (2 - gamma_ramp_rw_component_))) != 0; + if (write_gamma_ramp_component) { + reg::DC_LUT_PWL_DATA& gamma_ramp_entry = + gamma_ramp_pwl_rgb_[gamma_ramp_rw_index_pwl] + [gamma_ramp_rw_component_]; + auto gamma_ramp_value = regs.Get(); + // Bits 0:5 are hardwired to zero. + gamma_ramp_entry.base = gamma_ramp_value.base & ~UINT32_C(0x3F); + gamma_ramp_entry.delta = gamma_ramp_value.delta & ~UINT32_C(0x3F); + } + if (++gamma_ramp_rw_component_ >= 3) { + gamma_ramp_rw_component_ = 0; + // TODO(Triang3l): Should this increase beyond 7 bits for PWL? + // Direct3D 9 explicitly sets rw_index to 0x80 after writing the last + // PWL entry. However, the DC_LUT_RW_INDEX documentation says that for + // PWL, the bit 7 is ignored. + gamma_ramp_rw_index.rw_index = + (gamma_ramp_rw_index.rw_index & ~UINT32_C(0x7F)) | + ((gamma_ramp_rw_index_pwl + 1) & 0x7F); + } + if (write_gamma_ramp_component) { + OnGammaRampPWLValueWritten(); + } + } break; - // If games update individual components we're going to have a problem. - assert_true(mask_lo == 0 || mask_lo == 7); - assert_true(mask_hi == 0); - - if (mask_lo) { - switch (type) { - case GammaRampType::kTable: - assert_true(regs->values[XE_GPU_REG_DC_LUT_RW_MODE].u32 == 0); - gamma_ramp_.table[index].value = value; - dirty_gamma_ramp_table_ = true; - break; - case GammaRampType::kPWL: - assert_true(regs->values[XE_GPU_REG_DC_LUT_RW_MODE].u32 == 1); - // The lower 6 bits are hardwired to 0. 
- // https://developer.amd.com/wordpress/media/2012/10/RRG-216M56-03oOEM.pdf - gamma_ramp_.pwl[index].values[gamma_ramp_rw_subindex_].value = - value & ~(uint32_t(63) | (uint32_t(63) << 16)); - gamma_ramp_rw_subindex_ = (gamma_ramp_rw_subindex_ + 1) % 3; - dirty_gamma_ramp_pwl_ = true; - break; - default: - assert_unhandled_case(type); + case XE_GPU_REG_DC_LUT_30_COLOR: { + // Should be in the 256-entry table writing mode. + assert_zero(regs[XE_GPU_REG_DC_LUT_RW_MODE].u32 & 0b1); + auto& gamma_ramp_rw_index = regs.Get(); + uint32_t gamma_ramp_write_enable_mask = + regs[XE_GPU_REG_DC_LUT_WRITE_EN_MASK].u32 & 0b111; + if (gamma_ramp_write_enable_mask) { + reg::DC_LUT_30_COLOR& gamma_ramp_entry = + gamma_ramp_256_entry_table_[gamma_ramp_rw_index.rw_index]; + auto gamma_ramp_value = regs.Get(); + if (gamma_ramp_write_enable_mask & 0b001) { + gamma_ramp_entry.color_10_blue = gamma_ramp_value.color_10_blue; + } + if (gamma_ramp_write_enable_mask & 0b010) { + gamma_ramp_entry.color_10_green = gamma_ramp_value.color_10_green; + } + if (gamma_ramp_write_enable_mask & 0b100) { + gamma_ramp_entry.color_10_red = gamma_ramp_value.color_10_red; + } + } + ++gamma_ramp_rw_index.rw_index; + // TODO(Triang3l): Should this reset the component write index? If this + // increase is assumed to behave like a full DC_LUT_RW_INDEX write, it + // probably should. + gamma_ramp_rw_component_ = 0; + if (gamma_ramp_write_enable_mask) { + OnGammaRamp256EntryTableValueWritten(); + } + } break; } } } @@ -1493,5 +1608,17 @@ bool CommandProcessor::ExecutePacketType3_VIZ_QUERY(RingBuffer* reader, return true; } +void CommandProcessor::InitializeTrace() { + // Write the initial register values, to be loaded directly into the + // RegisterFile since all registers, including those that may have side + // effects on setting, will be saved. 
+ trace_writer_.WriteRegisters( + 0, reinterpret_cast(register_file_->values), + RegisterFile::kRegisterCount, false); + + trace_writer_.WriteGammaRamp(gamma_ramp_256_entry_table(), + gamma_ramp_pwl_rgb(), gamma_ramp_rw_component_); +} + } // namespace gpu } // namespace xe diff --git a/src/xenia/gpu/command_processor.h b/src/xenia/gpu/command_processor.h index caa49e300..367ed9ee2 100644 --- a/src/xenia/gpu/command_processor.h +++ b/src/xenia/gpu/command_processor.h @@ -22,6 +22,7 @@ #include "xenia/base/ring_buffer.h" #include "xenia/base/threading.h" #include "xenia/gpu/register_file.h" +#include "xenia/gpu/registers.h" #include "xenia/gpu/trace_writer.h" #include "xenia/gpu/xenos.h" #include "xenia/kernel/xthread.h" @@ -64,61 +65,6 @@ enum class GammaRampType { kPWL, }; -struct GammaRamp { - // A lot of gamma ramp (DC_LUT) documentation: - // https://developer.amd.com/wordpress/media/2012/10/RRG-216M56-03oOEM.pdf - // The ramps entries are BGR, not RGB. - // For the 256-entry table (used by Direct3D 9 for a 8bpc front buffer), - // 535107D4 has in-game settings allowing separate configuration. - // The component order of the PWL table is untested, however, it's likely BGR - // too, since DC_LUTA/B registers have values for blue first, and for red - // last. - struct TableEntry { - union { - uint32_t value; - struct { - uint32_t b : 10; - uint32_t g : 10; - uint32_t r : 10; - uint32_t : 2; - }; - }; - }; - - struct PWLValue { - union { - uint32_t value; - struct { - // The lower 6 bits are always zero (these are 10-bit in the upper bits - // thus, not fully 16-bit). 
- // See DC_LUTA/B_CONTROL for information about the way they should be - // interpreted (`output = base + (multiplier * delta) / 2^increment`, - // where the increment is the value specified in DC_LUTA/B_CONTROL for - // the specific color channel, the base is 7 bits of the front buffer - // value above `increment` bits, the multiplier is the lower `increment` - // bits of it; the increment is nonzero, otherwise the 256-entry table - // should be used instead). - uint16_t base; - uint16_t delta; - }; - }; - }; - - struct PWLEntry { - union { - PWLValue values[3]; - struct { - PWLValue b; - PWLValue g; - PWLValue r; - }; - }; - }; - - TableEntry table[256]; - PWLEntry pwl[128]; -}; - class CommandProcessor { public: enum class SwapPostEffect { @@ -170,6 +116,13 @@ class CommandProcessor { virtual void TracePlaybackWroteMemory(uint32_t base_ptr, uint32_t length) = 0; + void RestoreRegisters(uint32_t first_register, + const uint32_t* register_values, + uint32_t register_count, bool execute_callbacks); + void RestoreGammaRamp( + const reg::DC_LUT_30_COLOR* new_gamma_ramp_256_entry_table, + const reg::DC_LUT_PWL_DATA* new_gamma_ramp_pwl_rgb, + uint32_t new_gamma_ramp_rw_component); virtual void RestoreEdramSnapshot(const void* snapshot) = 0; void InitializeRingBuffer(uint32_t ptr, uint32_t size_log2); @@ -201,7 +154,14 @@ class CommandProcessor { virtual void WriteRegister(uint32_t index, uint32_t value); - void UpdateGammaRampValue(GammaRampType type, uint32_t value); + const reg::DC_LUT_30_COLOR* gamma_ramp_256_entry_table() const { + return gamma_ramp_256_entry_table_; + } + const reg::DC_LUT_PWL_DATA* gamma_ramp_pwl_rgb() const { + return gamma_ramp_pwl_rgb_[0]; + } + virtual void OnGammaRamp256EntryTableValueWritten() {} + virtual void OnGammaRampPWLValueWritten() {} virtual void MakeCoherent(); virtual void PrepareForWait(); @@ -285,9 +245,7 @@ class CommandProcessor { return swap_post_effect_actual_; } - // TODO(Triang3l): Write the gamma ramp (including the 
display controller - // write pointers) in the common code. - virtual void InitializeTrace() = 0; + virtual void InitializeTrace(); Memory* memory_ = nullptr; kernel::KernelState* kernel_state_ = nullptr; @@ -334,15 +292,15 @@ class CommandProcessor { bool paused_ = false; - GammaRamp gamma_ramp_ = {}; - int gamma_ramp_rw_subindex_ = 0; - bool dirty_gamma_ramp_table_ = true; - bool dirty_gamma_ramp_pwl_ = true; - // By default (such as for tools), post-processing is disabled. // "Desired" is for the external thread managing the post-processing effect. SwapPostEffect swap_post_effect_desired_ = SwapPostEffect::kNone; SwapPostEffect swap_post_effect_actual_ = SwapPostEffect::kNone; + + private: + reg::DC_LUT_30_COLOR gamma_ramp_256_entry_table_[256] = {}; + reg::DC_LUT_PWL_DATA gamma_ramp_pwl_rgb_[128][3] = {}; + uint32_t gamma_ramp_rw_component_ = 0; }; } // namespace gpu diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 9e4bdba44..ebfbbe986 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -13,6 +13,7 @@ #include #include "xenia/base/assert.h" +#include "xenia/base/byte_order.h" #include "xenia/base/cvar.h" #include "xenia/base/logging.h" #include "xenia/base/math.h" @@ -1161,8 +1162,8 @@ bool D3D12CommandProcessor::SetupContext() { provider.GetHeapFlagCreateNotZeroed(); // Create gamma ramp resources. 
- dirty_gamma_ramp_table_ = true; - dirty_gamma_ramp_pwl_ = true; + gamma_ramp_256_entry_table_up_to_date_ = false; + gamma_ramp_pwl_up_to_date_ = false; D3D12_RESOURCE_DESC gamma_ramp_buffer_desc; ui::d3d12::util::FillBufferResourceDesc( gamma_ramp_buffer_desc, (256 + 128 * 3) * 4, D3D12_RESOURCE_FLAG_NONE); @@ -1699,15 +1700,17 @@ void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { texture_cache_->TextureFetchConstantWritten( (index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6); } - } else if (index == XE_GPU_REG_DC_LUT_PWL_DATA) { - UpdateGammaRampValue(GammaRampType::kPWL, value); - } else if (index == XE_GPU_REG_DC_LUT_30_COLOR) { - UpdateGammaRampValue(GammaRampType::kTable, value); - } else if (index == XE_GPU_REG_DC_LUT_RW_MODE) { - gamma_ramp_rw_subindex_ = 0; } } +void D3D12CommandProcessor::OnGammaRamp256EntryTableValueWritten() { + gamma_ramp_256_entry_table_up_to_date_ = false; +} + +void D3D12CommandProcessor::OnGammaRampPWLValueWritten() { + gamma_ramp_pwl_up_to_date_ = false; +} + void D3D12CommandProcessor::IssueSwap(uint32_t frontbuffer_ptr, uint32_t frontbuffer_width, uint32_t frontbuffer_height) { @@ -1801,6 +1804,9 @@ void D3D12CommandProcessor::IssueSwap(uint32_t frontbuffer_ptr, // This is according to D3D::InitializePresentationParameters from a // game executable, which initializes the 256-entry table gamma ramp for // 8_8_8_8 output and the PWL gamma ramp for 2_10_10_10. + // TODO(Triang3l): Choose between the table and PWL based on + // DC_LUTA_CONTROL, support both for all formats (and also different + // increments for PWL). bool use_pwl_gamma_ramp = frontbuffer_format == xenos::TextureFormat::k_2_10_10_10 || frontbuffer_format == @@ -1811,20 +1817,43 @@ void D3D12CommandProcessor::IssueSwap(uint32_t frontbuffer_ptr, // Upload the new gamma ramp, using the upload buffer for the current // frame (will close the frame after this anyway, so can't write // multiple times per frame). - if (use_pwl_gamma_ramp ? 
dirty_gamma_ramp_pwl_ - : dirty_gamma_ramp_table_) { + if (!(use_pwl_gamma_ramp ? gamma_ramp_pwl_up_to_date_ + : gamma_ramp_256_entry_table_up_to_date_)) { uint32_t gamma_ramp_offset_bytes = use_pwl_gamma_ramp ? 256 * 4 : 0; uint32_t gamma_ramp_upload_offset_bytes = uint32_t(frame_current_ % kQueueFrames) * ((256 + 128 * 3) * 4) + gamma_ramp_offset_bytes; uint32_t gamma_ramp_size_bytes = (use_pwl_gamma_ramp ? 128 * 3 : 256) * 4; - std::memcpy(gamma_ramp_upload_buffer_mapping_ + - gamma_ramp_upload_offset_bytes, - use_pwl_gamma_ramp - ? static_cast(gamma_ramp_.pwl) - : static_cast(gamma_ramp_.table), - gamma_ramp_size_bytes); + if (std::endian::native != std::endian::little && + use_pwl_gamma_ramp) { + // R16G16 is first R16, where the shader expects the base, and + // second G16, where the delta should be, but gamma_ramp_pwl_rgb() + // is an array of 32-bit DC_LUT_PWL_DATA registers - swap 16 bits in + // each 32. + auto gamma_ramp_pwl_upload_buffer = + reinterpret_cast( + gamma_ramp_upload_buffer_mapping_ + + gamma_ramp_upload_offset_bytes); + const reg::DC_LUT_PWL_DATA* gamma_ramp_pwl = gamma_ramp_pwl_rgb(); + for (size_t i = 0; i < 128 * 3; ++i) { + reg::DC_LUT_PWL_DATA& gamma_ramp_pwl_upload_buffer_entry = + gamma_ramp_pwl_upload_buffer[i]; + reg::DC_LUT_PWL_DATA gamma_ramp_pwl_entry = gamma_ramp_pwl[i]; + gamma_ramp_pwl_upload_buffer_entry.base = + gamma_ramp_pwl_entry.delta; + gamma_ramp_pwl_upload_buffer_entry.delta = + gamma_ramp_pwl_entry.base; + } + } else { + std::memcpy( + gamma_ramp_upload_buffer_mapping_ + + gamma_ramp_upload_offset_bytes, + use_pwl_gamma_ramp + ? 
static_cast(gamma_ramp_pwl_rgb()) + : static_cast(gamma_ramp_256_entry_table()), + gamma_ramp_size_bytes); + } PushTransitionBarrier(gamma_ramp_buffer_.Get(), gamma_ramp_buffer_state_, D3D12_RESOURCE_STATE_COPY_DEST); @@ -1834,8 +1863,8 @@ void D3D12CommandProcessor::IssueSwap(uint32_t frontbuffer_ptr, gamma_ramp_buffer_.Get(), gamma_ramp_offset_bytes, gamma_ramp_upload_buffer_.Get(), gamma_ramp_upload_offset_bytes, gamma_ramp_size_bytes); - (use_pwl_gamma_ramp ? dirty_gamma_ramp_pwl_ - : dirty_gamma_ramp_table_) = false; + (use_pwl_gamma_ramp ? gamma_ramp_pwl_up_to_date_ + : gamma_ramp_256_entry_table_up_to_date_) = true; } // Destination, source, and if bindful, gamma ramp. @@ -2589,6 +2618,8 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, } void D3D12CommandProcessor::InitializeTrace() { + CommandProcessor::InitializeTrace(); + if (!BeginSubmission(false)) { return; } diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index a3b9f8577..2bb7a1c84 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -209,6 +209,9 @@ class D3D12CommandProcessor : public CommandProcessor { void WriteRegister(uint32_t index, uint32_t value) override; + void OnGammaRamp256EntryTableValueWritten() override; + void OnGammaRampPWLValueWritten() override; + void IssueSwap(uint32_t frontbuffer_ptr, uint32_t frontbuffer_width, uint32_t frontbuffer_height) override; @@ -496,17 +499,18 @@ class D3D12CommandProcessor : public CommandProcessor { std::unique_ptr texture_cache_; - // Bytes 0x0...0x3FF - 256-entry R10G10B10X2 gamma ramp (red and blue must be - // read as swapped - 535107D4 has settings allowing separate configuration). + // Bytes 0x0...0x3FF - 256-entry gamma ramp table with B10G10R10X2 data (read + // as R10G10B10X2 with swizzle). 
// Bytes 0x400...0x9FF - 128-entry PWL R16G16 gamma ramp (R - base, G - delta, // low 6 bits of each are zero, 3 elements per entry). - // https://www.x.org/docs/AMD/old/42590_m76_rrg_1.01o.pdf Microsoft::WRL::ComPtr gamma_ramp_buffer_; D3D12_RESOURCE_STATES gamma_ramp_buffer_state_; // Upload buffer for an image that is the same as gamma_ramp_, but with // kQueueFrames array layers. Microsoft::WRL::ComPtr gamma_ramp_upload_buffer_; uint8_t* gamma_ramp_upload_buffer_mapping_ = nullptr; + bool gamma_ramp_256_entry_table_up_to_date_ = false; + bool gamma_ramp_pwl_up_to_date_ = false; struct ApplyGammaConstants { uint32_t size[2]; diff --git a/src/xenia/gpu/register_table.inc b/src/xenia/gpu/register_table.inc index 8da898a5a..aa22558eb 100644 --- a/src/xenia/gpu/register_table.inc +++ b/src/xenia/gpu/register_table.inc @@ -275,14 +275,183 @@ XE_GPU_REGISTER(0x1844, kDword, D1GRPH_PRIMARY_SURFACE_ADDRESS) XE_GPU_REGISTER(0x1852, kDword, D1GRPH_FLIP_CONTROL) -XE_GPU_REGISTER(0x1921, kDword, DC_LUT_RW_MODE) -XE_GPU_REGISTER(0x1922, kDword, DC_LUT_RW_INDEX) +// In 4B4F07FE, the 256-entry gamma ramp for the 8bpc framebuffer is set to +// different values in multiple places in the game. For VdGetCurrentDisplayGamma +// returning 1 (sRGB), it's set up in the beginning as: +// DC_LUTA_CONTROL = 0x00000000 (256-entry unsigned fixed-point) +// DC_LUT_RW_MODE = 0x00000000 +// DC_LUT_RW_INDEX = 0x00000000 +// DC_LUT_WRITE_EN_MASK = 0x00000007 +// DC_LUT_30_COLOR = 0x00000000 +// DC_LUT_RW_INDEX = 0x00000001 +// DC_LUT_30_COLOR = 0x04812048 +// DC_LUT_RW_INDEX = 0x00000002 +// DC_LUT_30_COLOR = 0x05916459 +// DC_LUT_RW_INDEX = 0x00000003 +// DC_LUT_30_COLOR = 0x06519465 +// ... 
+// DC_LUT_RW_INDEX = 0x000000FE +// DC_LUT_30_COLOR = 0x3FBFEFFB +// DC_LUT_RW_INDEX = 0x000000FF +// DC_LUT_30_COLOR = 0x3FFFFFFF +// DC_LUT_RW_INDEX = 0x00000100 +// +// Another possible setup in 4B4F07FE is: +// DC_LUTA_CONTROL = 0x00000000 (256-entry unsigned fixed-point) +// DC_LUT_RW_MODE = 0x00000000 +// DC_LUT_RW_INDEX = 0x00000000 +// DC_LUT_WRITE_EN_MASK = 0x00000007 +// DC_LUT_30_COLOR = 0x00000000 +// DC_LUT_RW_INDEX = 0x00000001 +// DC_LUT_30_COLOR = 0x01A0681A +// DC_LUT_RW_INDEX = 0x00000002 +// DC_LUT_30_COLOR = 0x02709C27 +// ... +// DC_LUT_RW_INDEX = 0x000000FE +// DC_LUT_30_COLOR = 0x3FBFEFFB +// DC_LUT_RW_INDEX = 0x000000FF +// DC_LUT_30_COLOR = 0x3FFFFFFF +// DC_LUT_RW_INDEX = 0x00000100 +// +// In 4D5307E6, the 128-entry PWL gamma ramp for the 10bpc framebuffer, for +// VdGetCurrentDisplayGamma returning 1 (sRGB), is set up right after launching +// the game as: +// DC_LUTA_CONTROL = 0x00000003 (8-increment unsigned fixed-point) +// DC_LUT_RW_MODE = 0x00000001 +// DC_LUT_RW_INDEX = 0x00000000 +// DC_LUT_WRITE_EN_MASK = 0x00000007 +// DC_LUT_PWL_DATA = 0x02000000 +// DC_LUT_PWL_DATA = 0x02000000 +// DC_LUT_PWL_DATA = 0x02000000 +// DC_LUT_RW_INDEX = 0x00000001 +// DC_LUT_PWL_DATA = 0x02000200 +// DC_LUT_PWL_DATA = 0x02000200 +// DC_LUT_PWL_DATA = 0x02000200 +// DC_LUT_RW_INDEX = 0x00000002 +// DC_LUT_PWL_DATA = 0x02000400 +// DC_LUT_PWL_DATA = 0x02000400 +// DC_LUT_PWL_DATA = 0x02000400 +// ... 
+// DC_LUT_RW_INDEX = 0x0000007D +// DC_LUT_PWL_DATA = 0x0200FBC0 +// DC_LUT_PWL_DATA = 0x0200FBC0 +// DC_LUT_PWL_DATA = 0x0200FBC0 +// DC_LUT_RW_INDEX = 0x0000007E +// DC_LUT_PWL_DATA = 0x0200FDC0 +// DC_LUT_PWL_DATA = 0x0200FDC0 +// DC_LUT_PWL_DATA = 0x0200FDC0 +// DC_LUT_RW_INDEX = 0x0000007F +// DC_LUT_PWL_DATA = 0x0000FFC0 +// DC_LUT_PWL_DATA = 0x0000FFC0 +// DC_LUT_PWL_DATA = 0x0000FFC0 +// DC_LUT_RW_INDEX = 0x00000080 +// +// Later in 4D5307E6, for the game itself (apparently for conversion of the bit +// representation of 7e3 floating-point data in the front buffer to 10-bit fixed +// point, as the game draws the final passes to a 7e3 framebuffer), with +// VdGetCurrentDisplayGamma returning 1 (sRGB) and the normal brightness in the +// game settings, it's: +// DC_LUTA_CONTROL = 0x00000003 (8-increment unsigned fixed-point) +// DC_LUT_RW_MODE = 0x00000001 +// DC_LUT_RW_INDEX = 0x00000000 +// DC_LUT_WRITE_EN_MASK = 0x00000007 +// DC_LUT_PWL_DATA = 0x05000000 +// DC_LUT_PWL_DATA = 0x05000000 +// DC_LUT_PWL_DATA = 0x05000000 +// DC_LUT_RW_INDEX = 0x00000001 +// DC_LUT_PWL_DATA = 0x02000500 +// DC_LUT_PWL_DATA = 0x02000500 +// DC_LUT_PWL_DATA = 0x02000500 +// DC_LUT_RW_INDEX = 0x00000002 +// DC_LUT_PWL_DATA = 0x01800740 +// DC_LUT_PWL_DATA = 0x01800740 +// DC_LUT_PWL_DATA = 0x01800740 +// ... +// DC_LUT_RW_INDEX = 0x0000007D +// DC_LUT_PWL_DATA = 0x0440F340 +// DC_LUT_PWL_DATA = 0x0440F340 +// DC_LUT_PWL_DATA = 0x0440F340 +// DC_LUT_RW_INDEX = 0x0000007E +// DC_LUT_PWL_DATA = 0x0400F780 +// DC_LUT_PWL_DATA = 0x0400F780 +// DC_LUT_PWL_DATA = 0x0400F780 +// DC_LUT_RW_INDEX = 0x0000007F +// DC_LUT_PWL_DATA = 0x0400FBC0 +// DC_LUT_PWL_DATA = 0x0400FBC0 +// DC_LUT_PWL_DATA = 0x0400FBC0 +// DC_LUT_RW_INDEX = 0x00000080 +// +// In 535107D4, the 256-entry gamma ramp for the 8bpc framebuffer is +// configurable from the game's settings menu for each channel independently. 
+// For VdGetCurrentDisplayGamma returning 1 (sRGB), when in the settings, the +// red gamma is at the maximum of 5.56, green is at 1.00, and blue is at the +// minimum of 0.17, the setup is done as: +// DC_LUT_RW_MODE = 0x00000000 +// DC_LUT_RW_INDEX = 0x00000000 +// DC_LUT_WRITE_EN_MASK = 0x00000007 +// DC_LUT_30_COLOR = 0x00000000 +// DC_LUT_RW_INDEX = 0x00000001 +// DC_LUT_30_COLOR = 0x17901000 +// DC_LUT_RW_INDEX = 0x00000002 +// DC_LUT_30_COLOR = 0x1AB02000 +// ... +// DC_LUT_RW_INDEX = 0x000000FE +// DC_LUT_30_COLOR = 0x3FEFE3D2 +// DC_LUT_RW_INDEX = 0x000000FF +// DC_LUT_30_COLOR = 0x3FFFF3E9 +// DC_LUT_RW_INDEX = 0x00000100 +// Read / write mode in bit 0: 0 - 256-entry table, 1 - PWL. +// Default: 0x00000000. +XE_GPU_REGISTER(0x1921, kDword, DC_LUT_RW_MODE) +// Read / write index. No lower and upper halves on the Xenos apparently, for +// the 256-entry table, the bits 0:7 are the index directly (unlike on the M56, +// not split into the index in 1:7 and the lower or upper 10 bits selection in +// 0:0, instead, on the Xenos, the index in 0:7 is just increased +// monotonically). For some reason though Direct3D 9 writes an index that +// overflows by one (0x100 for the 256-entry table, 0x80 for the 128-entry PWL +// gamma ramp) after setting up all the values. However, the index is 8-bit, and +// for PWL, according to the M56 documentation, the bit 7 is not used. +// Default: 0x00000000. +XE_GPU_REGISTER(0x1922, kDword, DC_LUT_RW_INDEX) +// Sequential 10-bit R, G, B host read / write for the 256-entry table. After +// reset or writing DC_LUT_RW_INDEX, the first access is for the red component, +// the second is for green, the third is for blue, and after blue is accessed, +// the LUT index is increased by 1 (without having to explicitly change +// DC_LUT_RW_INDEX). Bits 0:5 are hardwired to zero. +// Default: 0x00000000. +XE_GPU_REGISTER(0x1923, kDword, DC_LUT_SEQ_COLOR) +// Read / write, 0:15 - base, 16:31 - delta. 
Bits 0:5 of both the base and the +// delta are hardwired to zero. The LUT index is increased by 1 when +// DC_LUT_PWL_DATA is accessed, though three DC_LUT_PWL_DATA writes are done for +// one entry (the order is likely R, G, B, similar to DC_LUT_SEQ_COLOR, but this +// hasn't been verified yet as no games using the PWL gamma ramp with separate +// settings for each channel have been found yet). +// Default: 0x00000000. XE_GPU_REGISTER(0x1924, kDword, DC_LUT_PWL_DATA) +// Read / write, 0:9 - blue, 10:19 - green, 20:29 - red. The LUT index is +// increased by 1 when DC_LUT_30_COLOR is accessed. +// Default: 0x00000000. XE_GPU_REGISTER(0x1925, kDword, DC_LUT_30_COLOR) +// Only LUT pipe 1 on the Xenos apparently (Direct3D 9 sets DC_LUT_WRITE_EN_MASK +// to 0b111 before writing the gamma ramp), 3 bits set, rather than 6 on the +// M56. +// Bit 0 - blue write enable mask. +// Bit 1 - green write enable mask. +// Bit 2 - red write enable mask. +// Default: 0x00000007 (though 0x0000003F on the M56 where there are two pipes). XE_GPU_REGISTER(0x1927, kDword, DC_LUT_WRITE_EN_MASK) +// Single set of parameters for all channels apparently unlike on the M56 +// (4D5307E6 sets DC_LUTA_CONTROL to 0x00000003 for the data increment of 8 in +// the 128-entry PWL gamma ramp for a 10bpc framebuffer). Also set not only +// during setup, but also apparently during every swap by Direct3D 9, though not +// directly in all games (happens in 4B4F07FE and 4D5307E6 even without proper +// VdSwap emulation, but in 535107D4, with a fake VdSwap packet rather than the +// real ones, the register is not set at all, though the expected behavior is +// that of the value of 0x00000000). +// Default: 0x00000000. 
XE_GPU_REGISTER(0x1930, kDword, DC_LUTA_CONTROL) XE_GPU_REGISTER(0x1961, kDword, AVIVO_D1MODE_VIEWPORT_SIZE) diff --git a/src/xenia/gpu/registers.h b/src/xenia/gpu/registers.h index 7c1020bd9..2d89d541b 100644 --- a/src/xenia/gpu/registers.h +++ b/src/xenia/gpu/registers.h @@ -825,6 +825,68 @@ union alignas(uint32_t) RB_COPY_DEST_PITCH { }; static_assert_size(RB_COPY_DEST_PITCH, sizeof(uint32_t)); +/******************************************************************************* + ___ ___ ___ ___ _ ___ __ + | \_ _/ __| _ \ | /_\ \ / / + | |) | |\__ \ _/ |__ / _ \ V / + |___/___|___/_| |____/_/ \_\_| + + ___ ___ _ _ _____ ___ ___ _ _ ___ ___ + / __/ _ \| \| |_ _| _ \/ _ \| | | | | __| _ \ + | (_| (_) | .` | | | | / (_) | |__| |__| _|| / + \___\___/|_|\_| |_| |_|_\\___/|____|____|___|_|_\ + +*******************************************************************************/ + +union alignas(uint32_t) DC_LUT_RW_INDEX { + uint32_t value; + struct { + // Unlike in the M56 documentation, for the 256-table entry, this is the + // absolute index, without the lower or upper 10 bits selection in the + // bit 0. For PWL, the bit 7 is ignored. 
+ uint32_t rw_index : 8; // +0 + }; + static constexpr Register register_index = XE_GPU_REG_DC_LUT_RW_INDEX; +}; +static_assert_size(DC_LUT_RW_INDEX, sizeof(uint32_t)); + +union alignas(uint32_t) DC_LUT_SEQ_COLOR { + uint32_t value; + struct { + uint32_t seq_color : 16; // +0, bits 0:5 are hardwired to zero + }; + static constexpr Register register_index = XE_GPU_REG_DC_LUT_SEQ_COLOR; +}; +static_assert_size(DC_LUT_SEQ_COLOR, sizeof(uint32_t)); + +union alignas(uint32_t) DC_LUT_PWL_DATA { + uint32_t value; + struct { + // See the M56 DC_LUTA_CONTROL for information about the way these should be + // interpreted (`output = base + (multiplier * delta) / 2^increment`, where + // the increment is the value specified in DC_LUTA_CONTROL for the specific + // color channel, the base is 7 bits of the front buffer value above + // `increment` bits, the multiplier is the lower `increment` bits of it; the + // increment is nonzero, otherwise the 256-entry table should be used + // instead). + uint32_t base : 16; // +0, bits 0:5 are hardwired to zero + uint32_t delta : 16; // +16, bits 0:5 are hardwired to zero + }; + static constexpr Register register_index = XE_GPU_REG_DC_LUT_PWL_DATA; +}; +static_assert_size(DC_LUT_PWL_DATA, sizeof(uint32_t)); + +union alignas(uint32_t) DC_LUT_30_COLOR { + uint32_t value; + struct { + uint32_t color_10_blue : 10; // +0 + uint32_t color_10_green : 10; // +10 + uint32_t color_10_red : 10; // +20 + }; + static constexpr Register register_index = XE_GPU_REG_DC_LUT_30_COLOR; +}; +static_assert_size(DC_LUT_30_COLOR, sizeof(uint32_t)); + } // namespace reg } // namespace gpu diff --git a/src/xenia/gpu/shaders/apply_gamma_pwl.hlsli b/src/xenia/gpu/shaders/apply_gamma_pwl.hlsli index a77c85460..7cae461e4 100644 --- a/src/xenia/gpu/shaders/apply_gamma_pwl.hlsli +++ b/src/xenia/gpu/shaders/apply_gamma_pwl.hlsli @@ -32,10 +32,9 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) { } // UNORM conversion according to the Direct3D 10+ rules. 
uint3 input = uint3(xe_apply_gamma_source[xe_thread_id.xy] * 1023.0f + 0.5f); - // The ramp is BGR, not RGB. - float3 output = float3(XeApplyPWLGamma(input.r, 2u), + float3 output = float3(XeApplyPWLGamma(input.r, 0u), XeApplyPWLGamma(input.g, 1u), - XeApplyPWLGamma(input.b, 0u)); + XeApplyPWLGamma(input.b, 2u)); xe_apply_gamma_dest[xe_thread_id.xy] = float4(output, XeApplyGammaGetAlpha(output)); } diff --git a/src/xenia/gpu/shaders/apply_gamma_table.hlsli b/src/xenia/gpu/shaders/apply_gamma_table.hlsli index 7f43a9567..c3786ee47 100644 --- a/src/xenia/gpu/shaders/apply_gamma_table.hlsli +++ b/src/xenia/gpu/shaders/apply_gamma_table.hlsli @@ -14,7 +14,8 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) { } // UNORM conversion according to the Direct3D 10+ rules. uint3 input = uint3(xe_apply_gamma_source[xe_thread_id.xy] * 255.0f + 0.5f); - // The ramp is BGR, not RGB. + // The ramp has blue in bits 0:9, green in 10:19, red in 20:29 - BGR passed as + // an R10G10B10A2 buffer. float3 output = float3(xe_apply_gamma_ramp[input.r].b, xe_apply_gamma_ramp[input.g].g, xe_apply_gamma_ramp[input.b].r); diff --git a/src/xenia/gpu/shaders/bytecode/d3d12_5_1/apply_gamma_pwl_cs.h b/src/xenia/gpu/shaders/bytecode/d3d12_5_1/apply_gamma_pwl_cs.h index 727d15384..01cb7de40 100644 --- a/src/xenia/gpu/shaders/bytecode/d3d12_5_1/apply_gamma_pwl_cs.h +++ b/src/xenia/gpu/shaders/bytecode/d3d12_5_1/apply_gamma_pwl_cs.h @@ -55,17 +55,17 @@ ld r0.xyz, r0.xyzw, T0[0].xyzw mad r0.xyz, r0.xyzx, l(1023.000000, 1023.000000, 1023.000000, 0.000000), l(0.500000, 0.500000, 0.500000, 0.000000) ftou r0.xyz, r0.xyzx ushr r1.xyz, r0.xyzx, l(3, 3, 3, 0) -imul null, r0.w, r1.z, l(3) -imad r1.xy, r1.xyxx, l(3, 3, 0, 0), l(2, 1, 0, 0) -ld r1.xz, r1.xxxx, T1[1].xzyw -utof r1.x, r1.x +imul null, r0.w, r1.x, l(3) +ld r1.xw, r0.wwww, T1[1].xzwy +utof r0.w, r1.x and r0.xyz, r0.xyzx, l(7, 7, 7, 0) -imul null, r0.x, r1.z, r0.x +imul null, r0.x, r1.w, r0.x utof r0.x, r0.x -mad r0.x, r0.x, l(0.125000), r1.x 
+mad r0.x, r0.x, l(0.125000), r0.w mul r0.x, r0.x, l(0.000015) min r2.x, r0.x, l(1.000000) -ld r1.xy, r1.yyyy, T1[1].xyzw +imad r0.xw, r1.yyyz, l(3, 0, 0, 3), l(1, 0, 0, 2) +ld r1.xy, r0.xxxx, T1[1].xyzw utof r0.x, r1.x imul null, r0.y, r0.y, r1.y utof r0.y, r0.y @@ -86,10 +86,10 @@ ret const BYTE apply_gamma_pwl_cs[] = { - 68, 88, 66, 67, 180, 180, - 222, 28, 4, 138, 188, 113, - 52, 97, 214, 88, 116, 106, - 105, 240, 1, 0, 0, 0, + 68, 88, 66, 67, 134, 193, + 189, 188, 150, 246, 151, 78, + 29, 10, 33, 117, 212, 145, + 204, 130, 1, 0, 0, 0, 128, 7, 0, 0, 5, 0, 0, 0, 52, 0, 0, 0, 24, 2, 0, 0, 40, 2, @@ -257,26 +257,16 @@ const BYTE apply_gamma_pwl_cs[] = 0, 0, 0, 0, 38, 0, 0, 8, 0, 208, 0, 0, 130, 0, 16, 0, 0, 0, - 0, 0, 42, 0, 16, 0, + 0, 0, 10, 0, 16, 0, 1, 0, 0, 0, 1, 64, 0, 0, 3, 0, 0, 0, - 35, 0, 0, 15, 50, 0, + 45, 0, 0, 8, 146, 0, 16, 0, 1, 0, 0, 0, - 70, 0, 16, 0, 1, 0, - 0, 0, 2, 64, 0, 0, - 3, 0, 0, 0, 3, 0, - 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 2, 64, - 0, 0, 2, 0, 0, 0, - 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, - 45, 0, 0, 8, 82, 0, - 16, 0, 1, 0, 0, 0, - 6, 0, 16, 0, 1, 0, - 0, 0, 134, 125, 32, 0, + 246, 15, 16, 0, 0, 0, + 0, 0, 134, 119, 32, 0, 1, 0, 0, 0, 1, 0, 0, 0, 86, 0, 0, 5, - 18, 0, 16, 0, 1, 0, + 130, 0, 16, 0, 0, 0, 0, 0, 10, 0, 16, 0, 1, 0, 0, 0, 1, 0, 0, 10, 114, 0, 16, 0, @@ -288,7 +278,7 @@ const BYTE apply_gamma_pwl_cs[] = 0, 0, 38, 0, 0, 8, 0, 208, 0, 0, 18, 0, 16, 0, 0, 0, 0, 0, - 42, 0, 16, 0, 1, 0, + 58, 0, 16, 0, 1, 0, 0, 0, 10, 0, 16, 0, 0, 0, 0, 0, 86, 0, 0, 5, 18, 0, 16, 0, @@ -298,8 +288,8 @@ const BYTE apply_gamma_pwl_cs[] = 16, 0, 0, 0, 0, 0, 10, 0, 16, 0, 0, 0, 0, 0, 1, 64, 0, 0, - 0, 0, 0, 62, 10, 0, - 16, 0, 1, 0, 0, 0, + 0, 0, 0, 62, 58, 0, + 16, 0, 0, 0, 0, 0, 56, 0, 0, 7, 18, 0, 16, 0, 0, 0, 0, 0, 10, 0, 16, 0, 0, 0, @@ -309,10 +299,20 @@ const BYTE apply_gamma_pwl_cs[] = 2, 0, 0, 0, 10, 0, 16, 0, 0, 0, 0, 0, 1, 64, 0, 0, 0, 0, - 128, 63, 45, 0, 0, 8, + 128, 63, 35, 0, 0, 15, + 146, 0, 16, 0, 0, 0, + 0, 0, 86, 9, 16, 0, + 1, 0, 
0, 0, 2, 64, + 0, 0, 3, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 3, 0, 0, 0, + 2, 64, 0, 0, 1, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 2, 0, + 0, 0, 45, 0, 0, 8, 50, 0, 16, 0, 1, 0, - 0, 0, 86, 5, 16, 0, - 1, 0, 0, 0, 70, 126, + 0, 0, 6, 0, 16, 0, + 0, 0, 0, 0, 70, 126, 32, 0, 1, 0, 0, 0, 1, 0, 0, 0, 86, 0, 0, 5, 18, 0, 16, 0, diff --git a/src/xenia/gpu/shaders/bytecode/d3d12_5_1/apply_gamma_pwl_fxaa_luma_cs.h b/src/xenia/gpu/shaders/bytecode/d3d12_5_1/apply_gamma_pwl_fxaa_luma_cs.h index ed86b4d4a..a4dd510d2 100644 --- a/src/xenia/gpu/shaders/bytecode/d3d12_5_1/apply_gamma_pwl_fxaa_luma_cs.h +++ b/src/xenia/gpu/shaders/bytecode/d3d12_5_1/apply_gamma_pwl_fxaa_luma_cs.h @@ -55,17 +55,17 @@ ld r0.xyz, r0.xyzw, T0[0].xyzw mad r0.xyz, r0.xyzx, l(1023.000000, 1023.000000, 1023.000000, 0.000000), l(0.500000, 0.500000, 0.500000, 0.000000) ftou r0.xyz, r0.xyzx ushr r1.xyz, r0.xyzx, l(3, 3, 3, 0) -imul null, r0.w, r1.z, l(3) -imad r1.xy, r1.xyxx, l(3, 3, 0, 0), l(2, 1, 0, 0) -ld r1.xz, r1.xxxx, T1[1].xzyw -utof r1.x, r1.x +imul null, r0.w, r1.x, l(3) +ld r1.xw, r0.wwww, T1[1].xzwy +utof r0.w, r1.x and r0.xyz, r0.xyzx, l(7, 7, 7, 0) -imul null, r0.x, r1.z, r0.x +imul null, r0.x, r1.w, r0.x utof r0.x, r0.x -mad r0.x, r0.x, l(0.125000), r1.x +mad r0.x, r0.x, l(0.125000), r0.w mul r0.x, r0.x, l(0.000015) min r2.x, r0.x, l(1.000000) -ld r1.xy, r1.yyyy, T1[1].xyzw +imad r0.xw, r1.yyyz, l(3, 0, 0, 3), l(1, 0, 0, 2) +ld r1.xy, r0.xxxx, T1[1].xyzw utof r0.x, r1.x imul null, r0.y, r0.y, r1.y utof r0.y, r0.y @@ -86,10 +86,10 @@ ret const BYTE apply_gamma_pwl_fxaa_luma_cs[] = { - 68, 88, 66, 67, 165, 122, - 242, 36, 160, 218, 193, 67, - 37, 43, 138, 45, 109, 219, - 226, 109, 1, 0, 0, 0, + 68, 88, 66, 67, 115, 68, + 69, 234, 116, 212, 118, 193, + 71, 10, 44, 165, 244, 209, + 63, 198, 1, 0, 0, 0, 148, 7, 0, 0, 5, 0, 0, 0, 52, 0, 0, 0, 24, 2, 0, 0, 40, 2, @@ -257,26 +257,16 @@ const BYTE apply_gamma_pwl_fxaa_luma_cs[] = 0, 0, 0, 0, 38, 0, 0, 8, 0, 208, 0, 0, 130, 0, 16, 0, 0, 0, - 0, 0, 42, 0, 
16, 0, + 0, 0, 10, 0, 16, 0, 1, 0, 0, 0, 1, 64, 0, 0, 3, 0, 0, 0, - 35, 0, 0, 15, 50, 0, + 45, 0, 0, 8, 146, 0, 16, 0, 1, 0, 0, 0, - 70, 0, 16, 0, 1, 0, - 0, 0, 2, 64, 0, 0, - 3, 0, 0, 0, 3, 0, - 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 2, 64, - 0, 0, 2, 0, 0, 0, - 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, - 45, 0, 0, 8, 82, 0, - 16, 0, 1, 0, 0, 0, - 6, 0, 16, 0, 1, 0, - 0, 0, 134, 125, 32, 0, + 246, 15, 16, 0, 0, 0, + 0, 0, 134, 119, 32, 0, 1, 0, 0, 0, 1, 0, 0, 0, 86, 0, 0, 5, - 18, 0, 16, 0, 1, 0, + 130, 0, 16, 0, 0, 0, 0, 0, 10, 0, 16, 0, 1, 0, 0, 0, 1, 0, 0, 10, 114, 0, 16, 0, @@ -288,7 +278,7 @@ const BYTE apply_gamma_pwl_fxaa_luma_cs[] = 0, 0, 38, 0, 0, 8, 0, 208, 0, 0, 18, 0, 16, 0, 0, 0, 0, 0, - 42, 0, 16, 0, 1, 0, + 58, 0, 16, 0, 1, 0, 0, 0, 10, 0, 16, 0, 0, 0, 0, 0, 86, 0, 0, 5, 18, 0, 16, 0, @@ -298,8 +288,8 @@ const BYTE apply_gamma_pwl_fxaa_luma_cs[] = 16, 0, 0, 0, 0, 0, 10, 0, 16, 0, 0, 0, 0, 0, 1, 64, 0, 0, - 0, 0, 0, 62, 10, 0, - 16, 0, 1, 0, 0, 0, + 0, 0, 0, 62, 58, 0, + 16, 0, 0, 0, 0, 0, 56, 0, 0, 7, 18, 0, 16, 0, 0, 0, 0, 0, 10, 0, 16, 0, 0, 0, @@ -309,10 +299,20 @@ const BYTE apply_gamma_pwl_fxaa_luma_cs[] = 2, 0, 0, 0, 10, 0, 16, 0, 0, 0, 0, 0, 1, 64, 0, 0, 0, 0, - 128, 63, 45, 0, 0, 8, + 128, 63, 35, 0, 0, 15, + 146, 0, 16, 0, 0, 0, + 0, 0, 86, 9, 16, 0, + 1, 0, 0, 0, 2, 64, + 0, 0, 3, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 3, 0, 0, 0, + 2, 64, 0, 0, 1, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 2, 0, + 0, 0, 45, 0, 0, 8, 50, 0, 16, 0, 1, 0, - 0, 0, 86, 5, 16, 0, - 1, 0, 0, 0, 70, 126, + 0, 0, 6, 0, 16, 0, + 0, 0, 0, 0, 70, 126, 32, 0, 1, 0, 0, 0, 1, 0, 0, 0, 86, 0, 0, 5, 18, 0, 16, 0, diff --git a/src/xenia/gpu/trace_player.cc b/src/xenia/gpu/trace_player.cc index a141c9b8d..b1aa8f615 100644 --- a/src/xenia/gpu/trace_player.cc +++ b/src/xenia/gpu/trace_player.cc @@ -9,8 +9,11 @@ #include "xenia/gpu/trace_player.h" +#include + #include "xenia/gpu/command_processor.h" #include "xenia/gpu/graphics_system.h" +#include "xenia/gpu/registers.h" #include 
"xenia/gpu/xenos.h" #include "xenia/memory.h" @@ -33,8 +36,6 @@ TracePlayer::TracePlayer(GraphicsSystem* graphics_system) assert_not_null(playback_event_); } -TracePlayer::~TracePlayer() { delete[] edram_snapshot_; } - const TraceReader::Frame* TracePlayer::current_frame() const { if (current_frame_index_ >= frame_count()) { return nullptr; @@ -197,13 +198,12 @@ void TracePlayer::PlayTraceOnThread(const uint8_t* trace_data, case TraceCommandType::kEdramSnapshot: { auto cmd = reinterpret_cast(trace_ptr); trace_ptr += sizeof(*cmd); - if (!edram_snapshot_) { - edram_snapshot_ = new uint8_t[xenos::kEdramSizeBytes]; - } + std::unique_ptr edram_snapshot( + new uint8_t[xenos::kEdramSizeBytes]); DecompressMemory(cmd->encoding_format, trace_ptr, cmd->encoded_length, - edram_snapshot_, xenos::kEdramSizeBytes); + edram_snapshot.get(), xenos::kEdramSizeBytes); trace_ptr += cmd->encoded_length; - command_processor->RestoreEdramSnapshot(edram_snapshot_); + command_processor->RestoreEdramSnapshot(edram_snapshot.get()); break; } case TraceCommandType::kEvent: { @@ -219,6 +219,34 @@ void TracePlayer::PlayTraceOnThread(const uint8_t* trace_data, } break; } + case TraceCommandType::kRegisters: { + auto cmd = reinterpret_cast(trace_ptr); + trace_ptr += sizeof(*cmd); + std::unique_ptr register_values( + new uint32_t[cmd->register_count]); + DecompressMemory(cmd->encoding_format, trace_ptr, cmd->encoded_length, + register_values.get(), + sizeof(uint32_t) * cmd->register_count); + trace_ptr += cmd->encoded_length; + command_processor->RestoreRegisters( + cmd->first_register, register_values.get(), cmd->register_count, + cmd->execute_callbacks); + break; + } + case TraceCommandType::kGammaRamp: { + auto cmd = reinterpret_cast(trace_ptr); + trace_ptr += sizeof(*cmd); + std::unique_ptr gamma_ramps(new uint32_t[256 + 3 * 128]); + DecompressMemory(cmd->encoding_format, trace_ptr, cmd->encoded_length, + gamma_ramps.get(), sizeof(uint32_t) * (256 + 3 * 128)); + trace_ptr += cmd->encoded_length; 
+ command_processor->RestoreGammaRamp( + reinterpret_cast(gamma_ramps.get()), + reinterpret_cast(gamma_ramps.get() + + 256), + cmd->rw_component); + break; + } } } diff --git a/src/xenia/gpu/trace_player.h b/src/xenia/gpu/trace_player.h index cfc2702a1..4bb5fdd2c 100644 --- a/src/xenia/gpu/trace_player.h +++ b/src/xenia/gpu/trace_player.h @@ -30,7 +30,6 @@ enum class TracePlaybackMode { class TracePlayer : public TraceReader { public: TracePlayer(GraphicsSystem* graphics_system); - ~TracePlayer() override; GraphicsSystem* graphics_system() const { return graphics_system_; } void SetPresentLastCopy(bool present_last_copy) { @@ -66,7 +65,6 @@ class TracePlayer : public TraceReader { bool playing_trace_ = false; std::atomic playback_percent_ = {0}; std::unique_ptr playback_event_; - uint8_t* edram_snapshot_ = nullptr; }; } // namespace gpu diff --git a/src/xenia/gpu/trace_protocol.h b/src/xenia/gpu/trace_protocol.h index b29ecfb7e..be881ef2b 100644 --- a/src/xenia/gpu/trace_protocol.h +++ b/src/xenia/gpu/trace_protocol.h @@ -53,6 +53,8 @@ enum class TraceCommandType : uint32_t { kMemoryWrite, kEdramSnapshot, kEvent, + kRegisters, + kGammaRamp, }; struct PrimaryBufferStartCommand { @@ -134,6 +136,40 @@ struct EventCommand { Type event_type; }; +// Represents a range of registers. +struct RegistersCommand { + TraceCommandType type; + + uint32_t first_register; + uint32_t register_count; + // Whether to set the registers via WriteRegister, which may have side + // effects, rather than by copying them directly to the register file. + bool execute_callbacks; + + // Encoding format of the values in the trace file. + MemoryEncodingFormat encoding_format; + // Number of bytes the values occupy in the trace file in their encoded form. + // If no encoding is used, this will be sizeof(uint32_t) * register_count. + uint32_t encoded_length; +}; + +// Represents a gamma ramp - encoded 256 DC_LUT_30_COLOR values and 128 +// interleaved RGB DC_LUT_PWL_DATA values. 
+// Assuming that all other gamma ramp state is saved as plain registers. +struct GammaRampCommand { + TraceCommandType type; + + // The component index (0 = red, 1 = green, 2 = blue) for the next + // DC_LUT_SEQ_COLOR or DC_LUT_PWL_DATA read or write. + uint8_t rw_component; + + // Encoding format of the ramps in the trace file. + MemoryEncodingFormat encoding_format; + // Number of bytes the ramps occupy in the trace file in their encoded form. + // If no encoding is used, this will be sizeof(uint32_t) * (256 + 3 * 128). + uint32_t encoded_length; +}; + } // namespace gpu } // namespace xe diff --git a/src/xenia/gpu/trace_reader.cc b/src/xenia/gpu/trace_reader.cc index 0f25b710a..6c20c79ec 100644 --- a/src/xenia/gpu/trace_reader.cc +++ b/src/xenia/gpu/trace_reader.cc @@ -205,6 +205,16 @@ void TraceReader::ParseTrace() { } break; } + case TraceCommandType::kRegisters: { + auto cmd = reinterpret_cast(trace_ptr); + trace_ptr += sizeof(*cmd) + cmd->encoded_length; + break; + } + case TraceCommandType::kGammaRamp: { + auto cmd = reinterpret_cast(trace_ptr); + trace_ptr += sizeof(*cmd) + cmd->encoded_length; + break; + } default: // Broken trace file? 
assert_unhandled_case(type); @@ -218,8 +228,8 @@ void TraceReader::ParseTrace() { } bool TraceReader::DecompressMemory(MemoryEncodingFormat encoding_format, - const uint8_t* src, size_t src_size, - uint8_t* dest, size_t dest_size) { + const void* src, size_t src_size, void* dest, + size_t dest_size) { switch (encoding_format) { case MemoryEncodingFormat::kNone: assert_true(src_size == dest_size); diff --git a/src/xenia/gpu/trace_reader.h b/src/xenia/gpu/trace_reader.h index 7480d3e4a..d1b51e4cd 100644 --- a/src/xenia/gpu/trace_reader.h +++ b/src/xenia/gpu/trace_reader.h @@ -135,9 +135,8 @@ class TraceReader { protected: void ParseTrace(); - bool DecompressMemory(MemoryEncodingFormat encoding_format, - const uint8_t* src, size_t src_size, uint8_t* dest, - size_t dest_size); + bool DecompressMemory(MemoryEncodingFormat encoding_format, const void* src, + size_t src_size, void* dest, size_t dest_size); std::unique_ptr mmap_; const uint8_t* trace_data_ = nullptr; diff --git a/src/xenia/gpu/trace_writer.cc b/src/xenia/gpu/trace_writer.cc index 96c287318..b83e21868 100644 --- a/src/xenia/gpu/trace_writer.cc +++ b/src/xenia/gpu/trace_writer.cc @@ -10,6 +10,7 @@ #include "xenia/gpu/trace_writer.h" #include +#include #include "third_party/snappy/snappy-sinksource.h" #include "third_party/snappy/snappy.h" @@ -19,6 +20,7 @@ #include "xenia/base/filesystem.h" #include "xenia/base/logging.h" #include "xenia/base/string.h" +#include "xenia/gpu/registers.h" #include "xenia/gpu/xenos.h" namespace xe { @@ -194,7 +196,7 @@ class SnappySink : public snappy::Sink { void TraceWriter::WriteMemoryCommand(TraceCommandType type, uint32_t base_ptr, size_t length, const void* host_ptr) { - MemoryCommand cmd; + MemoryCommand cmd = {}; cmd.type = type; cmd.base_ptr = base_ptr; cmd.encoding_format = MemoryEncodingFormat::kNone; @@ -232,8 +234,9 @@ void TraceWriter::WriteMemoryCommand(TraceCommandType type, uint32_t base_ptr, } void TraceWriter::WriteEdramSnapshot(const void* snapshot) { - 
EdramSnapshotCommand cmd; + EdramSnapshotCommand cmd = {}; cmd.type = TraceCommandType::kEdramSnapshot; + if (compress_output_) { // Write the header now so we reserve space in the buffer. long header_position = std::ftell(file_); @@ -272,5 +275,93 @@ void TraceWriter::WriteEvent(EventCommand::Type event_type) { fwrite(&cmd, 1, sizeof(cmd), file_); } +void TraceWriter::WriteRegisters(uint32_t first_register, + const uint32_t* register_values, + uint32_t register_count, + bool execute_callbacks_on_play) { + RegistersCommand cmd = {}; + cmd.type = TraceCommandType::kRegisters; + cmd.first_register = first_register; + cmd.register_count = register_count; + cmd.execute_callbacks = execute_callbacks_on_play; + + uint32_t uncompressed_length = uint32_t(sizeof(uint32_t) * register_count); + if (compress_output_) { + // Write the header now so we reserve space in the buffer. + long header_position = std::ftell(file_); + cmd.encoding_format = MemoryEncodingFormat::kSnappy; + fwrite(&cmd, 1, sizeof(cmd), file_); + + // Stream the content right to the buffer. + snappy::ByteArraySource snappy_source( + reinterpret_cast(register_values), uncompressed_length); + SnappySink snappy_sink(file_); + cmd.encoded_length = + static_cast(snappy::Compress(&snappy_source, &snappy_sink)); + + // Seek back and overwrite the header with our final size. + std::fseek(file_, header_position, SEEK_SET); + fwrite(&cmd, 1, sizeof(cmd), file_); + std::fseek(file_, header_position + sizeof(cmd) + cmd.encoded_length, + SEEK_SET); + } else { + // Uncompressed - write the values directly to the file. 
+ cmd.encoding_format = MemoryEncodingFormat::kNone; + cmd.encoded_length = uncompressed_length; + fwrite(&cmd, 1, sizeof(cmd), file_); + fwrite(register_values, 1, uncompressed_length, file_); + } +} + +void TraceWriter::WriteGammaRamp( + const reg::DC_LUT_30_COLOR* gamma_ramp_256_entry_table, + const reg::DC_LUT_PWL_DATA* gamma_ramp_pwl_rgb, + uint32_t gamma_ramp_rw_component) { + GammaRampCommand cmd = {}; + cmd.type = TraceCommandType::kGammaRamp; + cmd.rw_component = uint8_t(gamma_ramp_rw_component); + + constexpr uint32_t k256EntryTableUncompressedLength = + sizeof(reg::DC_LUT_30_COLOR) * 256; + constexpr uint32_t kPWLUncompressedLength = + sizeof(reg::DC_LUT_PWL_DATA) * 3 * 128; + constexpr uint32_t kUncompressedLength = + k256EntryTableUncompressedLength + kPWLUncompressedLength; + if (compress_output_) { + // Write the header now so we reserve space in the buffer. + long header_position = std::ftell(file_); + cmd.encoding_format = MemoryEncodingFormat::kSnappy; + fwrite(&cmd, 1, sizeof(cmd), file_); + + // Stream the content right to the buffer. + { + std::unique_ptr gamma_ramps(new char[kUncompressedLength]); + std::memcpy(gamma_ramps.get(), gamma_ramp_256_entry_table, + k256EntryTableUncompressedLength); + std::memcpy(gamma_ramps.get() + k256EntryTableUncompressedLength, + gamma_ramp_pwl_rgb, kPWLUncompressedLength); + snappy::ByteArraySource snappy_source(gamma_ramps.get(), + kUncompressedLength); + SnappySink snappy_sink(file_); + cmd.encoded_length = + static_cast(snappy::Compress(&snappy_source, &snappy_sink)); + } + + // Seek back and overwrite the header with our final size. + std::fseek(file_, header_position, SEEK_SET); + fwrite(&cmd, 1, sizeof(cmd), file_); + std::fseek(file_, header_position + sizeof(cmd) + cmd.encoded_length, + SEEK_SET); + } else { + // Uncompressed - write the values directly to the file. 
+ cmd.encoding_format = MemoryEncodingFormat::kNone; + cmd.encoded_length = kUncompressedLength; + fwrite(&cmd, 1, sizeof(cmd), file_); + fwrite(gamma_ramp_256_entry_table, 1, k256EntryTableUncompressedLength, + file_); + fwrite(gamma_ramp_pwl_rgb, 1, kPWLUncompressedLength, file_); + } +} + } // namespace gpu } // namespace xe diff --git a/src/xenia/gpu/trace_writer.h b/src/xenia/gpu/trace_writer.h index 03e30185a..407166068 100644 --- a/src/xenia/gpu/trace_writer.h +++ b/src/xenia/gpu/trace_writer.h @@ -14,6 +14,7 @@ #include #include +#include "xenia/gpu/registers.h" #include "xenia/gpu/trace_protocol.h" namespace xe { @@ -44,6 +45,11 @@ class TraceWriter { const void* host_ptr = nullptr); void WriteEdramSnapshot(const void* snapshot); void WriteEvent(EventCommand::Type event_type); + void WriteRegisters(uint32_t first_register, const uint32_t* register_values, + uint32_t register_count, bool execute_callbacks_on_play); + void WriteGammaRamp(const reg::DC_LUT_30_COLOR* gamma_ramp_256_entry_table, + const reg::DC_LUT_PWL_DATA* gamma_ramp_pwl_rgb, + uint32_t gamma_ramp_rw_component); private: void WriteMemoryCommand(TraceCommandType type, uint32_t base_ptr, diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index 14cce000e..9421e4481 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -191,20 +191,6 @@ void VulkanCommandProcessor::WriteRegister(uint32_t index, uint32_t value) { offset ^= 0x1F; dirty_loop_constants_ |= (1 << offset); - } else if (index == XE_GPU_REG_DC_LUT_PWL_DATA) { - UpdateGammaRampValue(GammaRampType::kPWL, value); - } else if (index == XE_GPU_REG_DC_LUT_30_COLOR) { - UpdateGammaRampValue(GammaRampType::kTable, value); - } else if (index >= XE_GPU_REG_DC_LUT_RW_MODE && - index <= XE_GPU_REG_DC_LUTA_CONTROL) { - uint32_t offset = index - XE_GPU_REG_DC_LUT_RW_MODE; - offset ^= 0x05; - - dirty_gamma_constants_ |= (1 << 
offset); - - if (index == XE_GPU_REG_DC_LUT_RW_INDEX) { - gamma_ramp_rw_subindex_ = 0; - } } } @@ -1400,8 +1386,6 @@ bool VulkanCommandProcessor::IssueCopy() { return true; } -void VulkanCommandProcessor::InitializeTrace() {} - } // namespace vulkan } // namespace gpu } // namespace xe diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.h b/src/xenia/gpu/vulkan/vulkan_command_processor.h index 2c2738440..f67570587 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.h +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.h @@ -98,8 +98,6 @@ class VulkanCommandProcessor : public CommandProcessor { VulkanShader* pixel_shader); bool IssueCopy() override; - void InitializeTrace() override; - uint64_t dirty_float_constants_ = 0; // Dirty float constants in blocks of 4 uint8_t dirty_bool_constants_ = 0; uint32_t dirty_loop_constants_ = 0;