mirror of
https://github.com/xenia-project/xenia.git
synced 2026-01-19 23:20:43 +01:00
add xe::clear_lowest_bit, use it in place of shift-andnot in some bit iteration code make is_allocated_ and is_enabled_ volatile in xma_context preallocate avpacket buffer in XMAContext::Setup, the reallocations of the buffer in ffmpeg were showing up on profiles check is_enabled and is_allocated BEFORE locking an xmacontext. XMA worker was spending most of its time locking and unlocking contexts Removed XeDMAC, dma:: namespace. It was a bad idea and I couldn't make it work in the end. Kept vastcpy and moved it to the memory namespace instead Made the rest of global_critical_region's members static. They never needed an instance. Removed ifdef'ed out code from ring_buffer.h Added EventInfo struct to threading, added Event::Query to aid with implementing NtQueryEvent. Removed vector from WaitMultiple, instead use a fixed array of 64 handles that we populate. WaitForMultipleObjects cannot handle more than 64 objects. Remove XE_MSVC_OPTIMIZE_SMALL() use in x64_sequences, x64 backend is now always size optimized because of premake Make global_critical_region_ static constexpr in shared_memory.h to get rid of wasteage of 8 bytes (empty class=1byte, +alignment for next member=8) Move trace-related data to the tail of SharedMemory to keep more important data together In IssueDraw build an array of fetch constant addresses/sizes, then pre-lock the global lock before doing requestrange for each instead of individually locking within requestrange for each of them Consistent access specifier protected for pm4_command_processor_declare Devirtualize WriteOneRegisterFromRing. Move ExecutePacket and ExecutePrimaryBuffer to pm4_command_buffer_x Remove many redundant header inclusions access xenia-gpu Minor microoptimization of ExecutePacketType0 Add TextureCache::RequestTextures for batch invocation of LoadTexturesData Add TextureCache::LoadTexturesData for reducing the number of times we release and reacquire the global lock. 
Ideally you should hold the global lock for as little time as possible, but if you are constantly acquiring and releasing it you are actually more likely to have contention Add already_locked param to ObjectTable::LookupObject to help with reducing lock acquire/release pairs Add missing checks to XAudioRegisterRenderDriverClient_entry. this is unlikely to fix anything, it was just an easy thing to do Add NtQueryEvent system call implementation. I don't actually know of any games that need it. Instead of using std::vector + push_back in KeWaitForMultipleObjects and xeNtWaitForMultipleObjectsEx use a fixed size array of 64 and track the count. More than 64 objects is not permitted by the kernel. The repeated reallocations from push_back were appearing unusually high on the profiler, but were masked until now by waitformultipleobjects natural overhead Pre-lock the global lock before looking up each handle for xeNtWaitForMultipleObjectsEx and KeWaitForMultipleObjects. Pre-lock before looking up the signal and waiter in NtSignalAndWaitForSingleObjectEx add missing checks to NtWaitForMultipleObjectsEx Support pre-locking in XObject::GetNativeObject
632 lines
23 KiB
C++
632 lines
23 KiB
C++
/**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
 * Copyright 2022 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */
|
|
|
|
#include "xenia/gpu/command_processor.h"
|
|
|
|
#include <cinttypes>
|
|
|
|
#include "third_party/fmt/include/fmt/format.h"
|
|
#include "xenia/base/byte_stream.h"
|
|
#include "xenia/base/logging.h"
|
|
#include "xenia/base/profiling.h"
|
|
#include "xenia/gpu/gpu_flags.h"
|
|
#include "xenia/gpu/graphics_system.h"
|
|
#include "xenia/gpu/sampler_info.h"
|
|
#include "xenia/gpu/texture_info.h"
|
|
#include "xenia/kernel/kernel_state.h"
|
|
#include "xenia/kernel/user_module.h"
|
|
|
|
namespace xe {
|
|
namespace gpu {
|
|
|
|
using namespace xe::gpu::xenos;
|
|
|
|
// Wires the command processor up to the rest of the emulator. reader_ starts
// empty (nullptr, 0) and is repointed at guest memory when command buffers are
// executed. The worker thread itself is created later, in Initialize().
CommandProcessor::CommandProcessor(GraphicsSystem* graphics_system,
                                   kernel::KernelState* kernel_state)
    : reader_(nullptr, 0),
      memory_(graphics_system->memory()),
      kernel_state_(kernel_state),
      graphics_system_(graphics_system),
      register_file_(graphics_system_->register_file()),
      trace_writer_(graphics_system->memory()->physical_membase()),
      worker_running_(true),
      // Auto-reset event used to wake the worker when the write pointer moves.
      write_ptr_index_event_(xe::threading::Event::CreateAutoResetEvent(false)),
      write_ptr_index_(0) {
  assert_not_null(write_ptr_index_event_);
}
|
|
|
|
// Defaulted: members (worker thread ref, event, trace writer) clean up via
// their own destructors; Shutdown() is expected to have been called first.
CommandProcessor::~CommandProcessor() = default;
|
|
|
|
// Seeds the gamma ramp tables with identity (linear) curves and spawns the
// worker thread that drains the ring buffer. Returns true on success (thread
// creation is not checked here; failures surface in WorkerThreadMain).
bool CommandProcessor::Initialize() {
  // Initialize the gamma ramps to their default (linear) values - taken from
  // what games set when starting with the sRGB (return value 1)
  // VdGetCurrentDisplayGamma.
  for (uint32_t i = 0; i < 256; ++i) {
    // Expand 8-bit index to the 10-bit color range (0..0x3FF).
    uint32_t value = i * 0x3FF / 0xFF;
    reg::DC_LUT_30_COLOR& gamma_ramp_entry = gamma_ramp_256_entry_table_[i];
    gamma_ramp_entry.color_10_blue = value;
    gamma_ramp_entry.color_10_green = value;
    gamma_ramp_entry.color_10_red = value;
  }
  for (uint32_t i = 0; i < 128; ++i) {
    reg::DC_LUT_PWL_DATA gamma_ramp_entry = {};
    // Base spans 0..0xFFFF linearly; bits 0:5 are hardwired to zero (see the
    // masking applied on guest writes in HandleSpecialRegisterWrite).
    gamma_ramp_entry.base = (i * 0xFFFF / 0x7F) & ~UINT32_C(0x3F);
    // Constant slope between entries; the last entry has no successor.
    gamma_ramp_entry.delta = i < 0x7F ? 0x200 : 0;
    for (uint32_t j = 0; j < 3; ++j) {
      gamma_ramp_pwl_rgb_[i][j] = gamma_ramp_entry;
    }
  }

  worker_running_ = true;
  // Host thread visible to the guest kernel; 128 KiB stack.
  worker_thread_ = kernel::object_ref<kernel::XHostThread>(
      new kernel::XHostThread(kernel_state_, 128 * 1024, 0, [this]() {
        WorkerThreadMain();
        return 0;
      }));
  worker_thread_->set_name("GPU Commands");
  worker_thread_->Create();

  return true;
}
|
|
|
|
// Stops tracing, then asks the worker loop to exit and joins it. The event is
// signaled after clearing worker_running_ so a worker parked in the low-power
// wait wakes up and observes the flag.
void CommandProcessor::Shutdown() {
  EndTracing();

  worker_running_ = false;
  write_ptr_index_event_->Set();
  // Blocking wait for the worker thread to finish, then drop our reference.
  worker_thread_->Wait(0, 0, 0, nullptr);
  worker_thread_.reset();
}
|
|
|
|
// No-op in the base class; backends override this to warm their pipeline /
// shader caches for the given title.
void CommandProcessor::InitializeShaderStorage(
    const std::filesystem::path& cache_root, uint32_t title_id, bool blocking) {
}
|
|
|
|
void CommandProcessor::RequestFrameTrace(
|
|
const std::filesystem::path& root_path) {
|
|
if (trace_state_ == TraceState::kStreaming) {
|
|
XELOGE("Streaming trace; cannot also trace frame.");
|
|
return;
|
|
}
|
|
if (trace_state_ == TraceState::kSingleFrame) {
|
|
XELOGE("Frame trace already pending; ignoring.");
|
|
return;
|
|
}
|
|
trace_state_ = TraceState::kSingleFrame;
|
|
trace_frame_path_ = root_path;
|
|
}
|
|
|
|
// Arms a streaming trace rooted at root_path. Rejected when streaming is
// already active or a single-frame trace is pending.
void CommandProcessor::BeginTracing(const std::filesystem::path& root_path) {
  switch (trace_state_) {
    case TraceState::kStreaming:
      XELOGE("Streaming already active; ignoring request.");
      return;
    case TraceState::kSingleFrame:
      XELOGE("Frame trace pending; ignoring streaming request.");
      return;
    default:
      break;
  }
  // Streaming starts on the next primary buffer execute.
  trace_state_ = TraceState::kStreaming;
  trace_stream_path_ = root_path;
}
|
|
|
|
// Closes an active streaming trace. Safe to call when no trace file is open
// (e.g. from Shutdown); only an open writer implies the streaming state.
void CommandProcessor::EndTracing() {
  if (trace_writer_.is_open()) {
    assert_true(trace_state_ == TraceState::kStreaming);
    trace_state_ = TraceState::kDisabled;
    trace_writer_.Close();
  }
}
|
|
|
|
// Restores a contiguous span of register values (used by save states / trace
// playback). Out-of-range requests are clamped with a warning rather than
// rejected outright. When execute_callbacks is set, each value goes through
// WriteRegister so register side effects (scratch writeback, gamma ramp
// updates, ...) are replayed; otherwise the raw values are memcpy'd.
void CommandProcessor::RestoreRegisters(uint32_t first_register,
                                        const uint32_t* register_values,
                                        uint32_t register_count,
                                        bool execute_callbacks) {
  // Note the subtraction is safe: the first comparison guarantees
  // first_register <= kRegisterCount before the second is evaluated.
  if (first_register > RegisterFile::kRegisterCount ||
      RegisterFile::kRegisterCount - first_register < register_count) {
    XELOGW(
        "CommandProcessor::RestoreRegisters out of bounds (0x{:X} registers "
        "starting with 0x{:X}, while a total of 0x{:X} registers are stored)",
        register_count, first_register, RegisterFile::kRegisterCount);
    if (first_register > RegisterFile::kRegisterCount) {
      // Nothing in range at all.
      return;
    }
    // Clamp to the portion that fits.
    register_count =
        std::min(uint32_t(RegisterFile::kRegisterCount) - first_register,
                 register_count);
  }
  if (execute_callbacks) {
    for (uint32_t i = 0; i < register_count; ++i) {
      WriteRegister(first_register + i, register_values[i]);
    }
  } else {
    std::memcpy(register_file_->values + first_register, register_values,
                sizeof(uint32_t) * register_count);
  }
}
|
|
|
|
void CommandProcessor::RestoreGammaRamp(
|
|
const reg::DC_LUT_30_COLOR* new_gamma_ramp_256_entry_table,
|
|
const reg::DC_LUT_PWL_DATA* new_gamma_ramp_pwl_rgb,
|
|
uint32_t new_gamma_ramp_rw_component) {
|
|
std::memcpy(gamma_ramp_256_entry_table_, new_gamma_ramp_256_entry_table,
|
|
sizeof(reg::DC_LUT_30_COLOR) * 256);
|
|
std::memcpy(gamma_ramp_pwl_rgb_, new_gamma_ramp_pwl_rgb,
|
|
sizeof(reg::DC_LUT_PWL_DATA) * 3 * 128);
|
|
gamma_ramp_rw_component_ = new_gamma_ramp_rw_component;
|
|
OnGammaRamp256EntryTableValueWritten();
|
|
OnGammaRampPWLValueWritten();
|
|
}
|
|
|
|
// Runs fn on the worker thread. If we are already on the worker thread and no
// earlier calls are queued (ordering must be preserved), it runs inline;
// otherwise it is queued and drained at the top of the worker loop.
void CommandProcessor::CallInThread(std::function<void()> fn) {
  if (pending_fns_.empty() &&
      kernel::XThread::IsInThread(worker_thread_.get())) {
    fn();
  } else {
    pending_fns_.push(std::move(fn));
  }
}
|
|
|
|
// No-op in the base class; backends override to drop cached GPU objects.
void CommandProcessor::ClearCaches() {}
|
|
|
|
// Records the requested swap post-processing effect and forwards it to the
// worker thread, which owns swap_post_effect_actual_. Redundant requests are
// ignored so no queue traffic is generated.
void CommandProcessor::SetDesiredSwapPostEffect(
    SwapPostEffect swap_post_effect) {
  if (swap_post_effect_desired_ != swap_post_effect) {
    swap_post_effect_desired_ = swap_post_effect;
    // The actual value is only ever touched on the worker thread.
    CallInThread([this, swap_post_effect]() {
      swap_post_effect_actual_ = swap_post_effect;
    });
  }
}
|
|
|
|
// Main loop of the GPU command thread: drains queued CallInThread functions,
// then executes ring buffer contents between read_ptr_index_ and the guest's
// write pointer. When there is no work it spins briefly, then falls back to a
// 2 ms event wait to avoid burning a core.
void CommandProcessor::WorkerThreadMain() {
  if (!SetupContext()) {
    xe::FatalError("Unable to setup command processor internal state");
    return;
  }

  while (worker_running_) {
    // Drain cross-thread calls first so they observe a consistent state.
    while (!pending_fns_.empty()) {
      auto fn = std::move(pending_fns_.front());
      pending_fns_.pop();
      fn();
    }

    // 0xBAADF00D appears to be the "not yet written" sentinel for the write
    // pointer (set before the guest initializes it) — TODO confirm.
    uint32_t write_ptr_index = write_ptr_index_.load();
    if (write_ptr_index == 0xBAADF00D || read_ptr_index_ == write_ptr_index) {
      SCOPE_profile_cpu_i("gpu", "xe::gpu::CommandProcessor::Stall");
      // We've run out of commands to execute.
      // We spin here waiting for new ones, as the overhead of waiting on our
      // event is too high.
      PrepareForWait();
      uint32_t loop_count = 0;
      do {
        // If we spin around too much, revert to a "low-power" state.
        if (loop_count > 500) {
          const int wait_time_ms = 2;
          xe::threading::Wait(write_ptr_index_event_.get(), true,
                              std::chrono::milliseconds(wait_time_ms));
        } else {
          xe::threading::MaybeYield();
        }
        loop_count++;
        write_ptr_index = write_ptr_index_.load();
      } while (worker_running_ && pending_fns_.empty() &&
               (write_ptr_index == 0xBAADF00D ||
                read_ptr_index_ == write_ptr_index));
      ReturnFromWait();
      // Loop back around to re-check shutdown / drain pending functions.
      if (!worker_running_ || !pending_fns_.empty()) {
        continue;
      }
    }
    assert_true(read_ptr_index_ != write_ptr_index);

    // Execute. Note that we handle wraparound transparently.
    read_ptr_index_ = ExecutePrimaryBuffer(read_ptr_index_, write_ptr_index);

    // TODO(benvanik): use reader->Read_update_freq_ and only issue after moving
    // that many indices.
    // Publish the updated read pointer back to guest memory (big-endian) if
    // the guest enabled read pointer writeback.
    if (read_ptr_writeback_ptr_) {
      xe::store_and_swap<uint32_t>(
          memory_->TranslatePhysical(read_ptr_writeback_ptr_), read_ptr_index_);
    }

    // FIXME: We're supposed to process the WAIT_UNTIL register at this point,
    // but no games seem to actually use it.
  }

  ShutdownContext();
}
|
|
|
|
// Suspends the worker thread at a safe point. The queued lambda signals the
// fence *before* suspending itself, so Pause() returns only once the worker
// has reached the suspension point (Suspend() stops the calling thread).
void CommandProcessor::Pause() {
  if (paused_) {
    return;
  }
  paused_ = true;

  threading::Fence fence;
  CallInThread([&fence]() {
    fence.Signal();
    threading::Thread::GetCurrentThread()->Suspend();
  });

  fence.Wait();
}
|
|
|
|
// Resumes a worker thread previously parked by Pause(); no-op when not paused.
void CommandProcessor::Resume() {
  if (paused_) {
    paused_ = false;
    worker_thread_->thread()->Resume();
  }
}
|
|
|
|
// Serializes ring buffer bookkeeping for save states. The worker must be
// paused so the values are stable. Field order here is the wire format and
// must match Restore() exactly.
bool CommandProcessor::Save(ByteStream* stream) {
  assert_true(paused_);

  stream->Write<uint32_t>(primary_buffer_ptr_);
  stream->Write<uint32_t>(primary_buffer_size_);
  stream->Write<uint32_t>(read_ptr_index_);
  stream->Write<uint32_t>(read_ptr_update_freq_);
  stream->Write<uint32_t>(read_ptr_writeback_ptr_);
  stream->Write<uint32_t>(write_ptr_index_.load());

  return true;
}
|
|
|
|
// Counterpart of Save(): reads fields back in the identical order. Requires
// the worker to be paused so nothing races the restored pointers.
bool CommandProcessor::Restore(ByteStream* stream) {
  assert_true(paused_);

  primary_buffer_ptr_ = stream->Read<uint32_t>();
  primary_buffer_size_ = stream->Read<uint32_t>();
  read_ptr_index_ = stream->Read<uint32_t>();
  read_ptr_update_freq_ = stream->Read<uint32_t>();
  read_ptr_writeback_ptr_ = stream->Read<uint32_t>();
  write_ptr_index_.store(stream->Read<uint32_t>());

  return true;
}
|
|
|
|
// Base implementation has no backend context to create; always succeeds.
bool CommandProcessor::SetupContext() { return true; }
|
|
|
|
// Base implementation has no backend context to tear down.
void CommandProcessor::ShutdownContext() {}
|
|
|
|
// Points the command processor at the guest's primary ring buffer and resets
// the read cursor. The buffer is zeroed so stale data is never executed.
// NOTE(review): size_log2 + 3 implies the guest-provided value counts 8-byte
// units (quadwords) and primary_buffer_size_ is in bytes — TODO confirm
// against the CP_RB_CNTL documentation.
void CommandProcessor::InitializeRingBuffer(uint32_t ptr, uint32_t size_log2) {
  read_ptr_index_ = 0;
  primary_buffer_ptr_ = ptr;
  primary_buffer_size_ = uint32_t(1) << (size_log2 + 3);

  std::memset(kernel_state_->memory()->TranslatePhysical(primary_buffer_ptr_),
              0, primary_buffer_size_);
}
|
|
|
|
// Enables writing the current read pointer back to guest memory so the guest
// can track consumption (see the writeback in WorkerThreadMain).
void CommandProcessor::EnableReadPointerWriteBack(uint32_t ptr,
                                                  uint32_t block_size_log2) {
  // CP_RB_RPTR_ADDR Ring Buffer Read Pointer Address 0x70C
  // ptr = RB_RPTR_ADDR, pointer to write back the address to.
  read_ptr_writeback_ptr_ = ptr;
  // CP_RB_CNTL Ring Buffer Control 0x704
  // block_size = RB_BLKSZ, log2 of number of quadwords read between updates of
  // the read pointer.
  // >> 2 converts from quadwords to the dword-indexed units that
  // read_ptr_index_ uses — presumably; verify against RB_BLKSZ semantics.
  read_ptr_update_freq_ = uint32_t(1) << block_size_log2 >> 2;
}
|
|
|
|
// Called from the guest side when it advances the ring write pointer; wakes
// the worker (with a priority boost) in case it is parked in the event wait.
void CommandProcessor::UpdateWritePointer(uint32_t value) {
  write_ptr_index_ = value;
  write_ptr_index_event_->SetBoostPriority();
}
|
|
// Slow path for register writes with side effects, reached from
// WriteRegister() only for the handful of special indices: scratch registers
// (guest-visible writeback), COHER_STATUS_HOST, and the DC_LUT gamma ramp
// registers. The register value itself has already been stored by the caller.
void CommandProcessor::HandleSpecialRegisterWrite(uint32_t index,
                                                  uint32_t value) {
  RegisterFile& regs = *register_file_;
  // Scratch register writeback.
  if (index >= XE_GPU_REG_SCRATCH_REG0 && index <= XE_GPU_REG_SCRATCH_REG7) {
    uint32_t scratch_reg = index - XE_GPU_REG_SCRATCH_REG0;
    if ((1 << scratch_reg) & regs.values[XE_GPU_REG_SCRATCH_UMSK].u32) {
      // Enabled - write to address.
      uint32_t scratch_addr = regs.values[XE_GPU_REG_SCRATCH_ADDR].u32;
      uint32_t mem_addr = scratch_addr + (scratch_reg * 4);
      // Guest memory is big-endian.
      xe::store_and_swap<uint32_t>(memory_->TranslatePhysical(mem_addr), value);
    }
  } else {
    switch (index) {
      // If this is a COHER register, set the dirty flag.
      // This will block the command processor the next time it WAIT_MEM_REGs
      // and allow us to synchronize the memory.
      case XE_GPU_REG_COHER_STATUS_HOST: {
        regs.values[index].u32 |= UINT32_C(0x80000000);
      } break;

      case XE_GPU_REG_DC_LUT_RW_INDEX: {
        // Reset the sequential read / write component index (see the M56
        // DC_LUT_SEQ_COLOR documentation).
        gamma_ramp_rw_component_ = 0;
      } break;

      case XE_GPU_REG_DC_LUT_SEQ_COLOR: {
        // Sequential write of one color component of the 256-entry table;
        // components advance R, G, B and then the entry index increments.
        // Should be in the 256-entry table writing mode.
        assert_zero(regs[XE_GPU_REG_DC_LUT_RW_MODE].u32 & 0b1);
        auto& gamma_ramp_rw_index = regs.Get<reg::DC_LUT_RW_INDEX>();
        // DC_LUT_SEQ_COLOR is in the red, green, blue order, but the write
        // enable mask is blue, green, red.
        bool write_gamma_ramp_component =
            (regs[XE_GPU_REG_DC_LUT_WRITE_EN_MASK].u32 &
             (UINT32_C(1) << (2 - gamma_ramp_rw_component_))) != 0;
        if (write_gamma_ramp_component) {
          reg::DC_LUT_30_COLOR& gamma_ramp_entry =
              gamma_ramp_256_entry_table_[gamma_ramp_rw_index.rw_index];
          // Bits 0:5 are hardwired to zero.
          uint32_t gamma_ramp_seq_color =
              regs.Get<reg::DC_LUT_SEQ_COLOR>().seq_color >> 6;
          switch (gamma_ramp_rw_component_) {
            case 0:
              gamma_ramp_entry.color_10_red = gamma_ramp_seq_color;
              break;
            case 1:
              gamma_ramp_entry.color_10_green = gamma_ramp_seq_color;
              break;
            case 2:
              gamma_ramp_entry.color_10_blue = gamma_ramp_seq_color;
              break;
          }
        }
        if (++gamma_ramp_rw_component_ >= 3) {
          gamma_ramp_rw_component_ = 0;
          ++gamma_ramp_rw_index.rw_index;
        }
        if (write_gamma_ramp_component) {
          OnGammaRamp256EntryTableValueWritten();
        }
      } break;

      case XE_GPU_REG_DC_LUT_PWL_DATA: {
        // Sequential write of one component of the piecewise-linear table.
        // Should be in the PWL writing mode.
        assert_not_zero(regs[XE_GPU_REG_DC_LUT_RW_MODE].u32 & 0b1);
        auto& gamma_ramp_rw_index = regs.Get<reg::DC_LUT_RW_INDEX>();
        // Bit 7 of the index is ignored for PWL.
        uint32_t gamma_ramp_rw_index_pwl = gamma_ramp_rw_index.rw_index & 0x7F;
        // DC_LUT_PWL_DATA is likely in the red, green, blue order because
        // DC_LUT_SEQ_COLOR is, but the write enable mask is blue, green, red.
        bool write_gamma_ramp_component =
            (regs[XE_GPU_REG_DC_LUT_WRITE_EN_MASK].u32 &
             (UINT32_C(1) << (2 - gamma_ramp_rw_component_))) != 0;
        if (write_gamma_ramp_component) {
          reg::DC_LUT_PWL_DATA& gamma_ramp_entry =
              gamma_ramp_pwl_rgb_[gamma_ramp_rw_index_pwl]
                                 [gamma_ramp_rw_component_];
          auto gamma_ramp_value = regs.Get<reg::DC_LUT_PWL_DATA>();
          // Bits 0:5 are hardwired to zero.
          gamma_ramp_entry.base = gamma_ramp_value.base & ~UINT32_C(0x3F);
          gamma_ramp_entry.delta = gamma_ramp_value.delta & ~UINT32_C(0x3F);
        }
        if (++gamma_ramp_rw_component_ >= 3) {
          gamma_ramp_rw_component_ = 0;
          // TODO(Triang3l): Should this increase beyond 7 bits for PWL?
          // Direct3D 9 explicitly sets rw_index to 0x80 after writing the last
          // PWL entry. However, the DC_LUT_RW_INDEX documentation says that for
          // PWL, the bit 7 is ignored.
          gamma_ramp_rw_index.rw_index =
              (gamma_ramp_rw_index.rw_index & ~UINT32_C(0x7F)) |
              ((gamma_ramp_rw_index_pwl + 1) & 0x7F);
        }
        if (write_gamma_ramp_component) {
          OnGammaRampPWLValueWritten();
        }
      } break;

      case XE_GPU_REG_DC_LUT_30_COLOR: {
        // Whole-entry write of the 256-entry table; all three components are
        // delivered at once, gated per-component by the write enable mask.
        // Should be in the 256-entry table writing mode.
        assert_zero(regs[XE_GPU_REG_DC_LUT_RW_MODE].u32 & 0b1);
        auto& gamma_ramp_rw_index = regs.Get<reg::DC_LUT_RW_INDEX>();
        uint32_t gamma_ramp_write_enable_mask =
            regs[XE_GPU_REG_DC_LUT_WRITE_EN_MASK].u32 & 0b111;
        if (gamma_ramp_write_enable_mask) {
          reg::DC_LUT_30_COLOR& gamma_ramp_entry =
              gamma_ramp_256_entry_table_[gamma_ramp_rw_index.rw_index];
          auto gamma_ramp_value = regs.Get<reg::DC_LUT_30_COLOR>();
          if (gamma_ramp_write_enable_mask & 0b001) {
            gamma_ramp_entry.color_10_blue = gamma_ramp_value.color_10_blue;
          }
          if (gamma_ramp_write_enable_mask & 0b010) {
            gamma_ramp_entry.color_10_green = gamma_ramp_value.color_10_green;
          }
          if (gamma_ramp_write_enable_mask & 0b100) {
            gamma_ramp_entry.color_10_red = gamma_ramp_value.color_10_red;
          }
        }
        ++gamma_ramp_rw_index.rw_index;
        // TODO(Triang3l): Should this reset the component write index? If this
        // increase is assumed to behave like a full DC_LUT_RW_INDEX write, it
        // probably should.
        gamma_ramp_rw_component_ = 0;
        if (gamma_ramp_write_enable_mask) {
          OnGammaRamp256EntryTableValueWritten();
        }
      } break;
    }
  }
}
|
|
// Stores a value into the register file and dispatches to
// HandleSpecialRegisterWrite for the few indices with side effects. The
// special-index test is intentionally branchless (bitwise |) because the
// common case is "plain register"; do not "fix" it to short-circuit ||.
void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
  // chrispy: rearrange check order, place set after checks

  if (XE_LIKELY(index < RegisterFile::kRegisterCount)) {
    register_file_->values[index].u32 = value;

    // quick pre-test
    // todo: figure out just how unlikely this is. if very (it ought to be,
    // theres a ton of registers other than these) make this predicate
    // branchless and mark with unlikely, then make HandleSpecialRegisterWrite
    // noinline yep, its very unlikely. these ORS here are meant to be bitwise
    // ors, so that we do not do branching evaluation of the conditions (we will
    // almost always take all of the branches)

    // Unsigned-wrap range checks: (index - LO) <= (HI - LO) covers [LO, HI].
    unsigned expr = (index - XE_GPU_REG_SCRATCH_REG0 < 8) |
                    (index == XE_GPU_REG_COHER_STATUS_HOST) |
                    ((index - XE_GPU_REG_DC_LUT_RW_INDEX) <=
                     (XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX));
    // chrispy: reordered for msvc branch probability (assumes if is taken and
    // else is not)
    if (XE_LIKELY(expr == 0)) {
      XE_MSVC_REORDER_BARRIER();

    } else {
      HandleSpecialRegisterWrite(index, value);
    }
  } else {
    XELOGW("CommandProcessor::WriteRegister index out of bounds: {}", index);
    return;
  }
}
|
|
void CommandProcessor::WriteRegistersFromMem(uint32_t start_index,
|
|
uint32_t* base,
|
|
uint32_t num_registers) {
|
|
for (uint32_t i = 0; i < num_registers; ++i) {
|
|
uint32_t data = xe::load_and_swap<uint32_t>(base + i);
|
|
this->WriteRegister(start_index + i, data);
|
|
}
|
|
}
|
|
|
|
// Writes num_registers consecutive registers starting at base, consuming one
// big-endian dword from the ring buffer per register.
void CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring,
                                                  uint32_t base,
                                                  uint32_t num_registers) {
  for (uint32_t reg_offset = 0; reg_offset < num_registers; ++reg_offset) {
    const uint32_t reg_value = ring->ReadAndSwap<uint32_t>();
    WriteRegister(base + reg_offset, reg_value);
  }
}
|
|
|
|
// ALU constants start at offset 0x4000 in the register file.
void CommandProcessor::WriteALURangeFromRing(xe::RingBuffer* ring,
                                             uint32_t base,
                                             uint32_t num_times) {
  constexpr uint32_t kAluConstantsBase = 0x4000;
  WriteRegisterRangeFromRing(ring, base + kAluConstantsBase, num_times);
}
|
|
|
|
// Fetch constants start at offset 0x4800 in the register file.
void CommandProcessor::WriteFetchRangeFromRing(xe::RingBuffer* ring,
                                               uint32_t base,
                                               uint32_t num_times) {
  constexpr uint32_t kFetchConstantsBase = 0x4800;
  WriteRegisterRangeFromRing(ring, base + kFetchConstantsBase, num_times);
}
|
|
|
|
// Boolean constants start at offset 0x4900 in the register file.
void CommandProcessor::WriteBoolRangeFromRing(xe::RingBuffer* ring,
                                              uint32_t base,
                                              uint32_t num_times) {
  constexpr uint32_t kBoolConstantsBase = 0x4900;
  WriteRegisterRangeFromRing(ring, base + kBoolConstantsBase, num_times);
}
|
|
|
|
// Loop constants start at offset 0x4908 in the register file.
void CommandProcessor::WriteLoopRangeFromRing(xe::RingBuffer* ring,
                                              uint32_t base,
                                              uint32_t num_times) {
  constexpr uint32_t kLoopConstantsBase = 0x4908;
  WriteRegisterRangeFromRing(ring, base + kLoopConstantsBase, num_times);
}
|
|
|
|
// The REGISTERS block starts at offset 0x2000 in the register file.
void CommandProcessor::WriteREGISTERSRangeFromRing(xe::RingBuffer* ring,
                                                   uint32_t base,
                                                   uint32_t num_times) {
  constexpr uint32_t kRegistersBase = 0x2000;
  WriteRegisterRangeFromRing(ring, base + kRegistersBase, num_times);
}
|
|
|
|
// Memory-sourced variant of WriteALURangeFromRing (ALU block at 0x4000).
void CommandProcessor::WriteALURangeFromMem(uint32_t start_index,
                                            uint32_t* base,
                                            uint32_t num_registers) {
  constexpr uint32_t kAluConstantsBase = 0x4000;
  WriteRegistersFromMem(start_index + kAluConstantsBase, base, num_registers);
}
|
|
|
|
// Memory-sourced variant of WriteFetchRangeFromRing (fetch block at 0x4800).
void CommandProcessor::WriteFetchRangeFromMem(uint32_t start_index,
                                              uint32_t* base,
                                              uint32_t num_registers) {
  constexpr uint32_t kFetchConstantsBase = 0x4800;
  WriteRegistersFromMem(start_index + kFetchConstantsBase, base,
                        num_registers);
}
|
|
|
|
// Memory-sourced variant of WriteBoolRangeFromRing (bool block at 0x4900).
void CommandProcessor::WriteBoolRangeFromMem(uint32_t start_index,
                                             uint32_t* base,
                                             uint32_t num_registers) {
  constexpr uint32_t kBoolConstantsBase = 0x4900;
  WriteRegistersFromMem(start_index + kBoolConstantsBase, base, num_registers);
}
|
|
|
|
// Memory-sourced variant of WriteLoopRangeFromRing (loop block at 0x4908).
void CommandProcessor::WriteLoopRangeFromMem(uint32_t start_index,
                                             uint32_t* base,
                                             uint32_t num_registers) {
  constexpr uint32_t kLoopConstantsBase = 0x4908;
  WriteRegistersFromMem(start_index + kLoopConstantsBase, base, num_registers);
}
|
|
|
|
// Memory-sourced variant of WriteREGISTERSRangeFromRing (block at 0x2000).
void CommandProcessor::WriteREGISTERSRangeFromMem(uint32_t start_index,
                                                  uint32_t* base,
                                                  uint32_t num_registers) {
  constexpr uint32_t kRegistersBase = 0x2000;
  WriteRegistersFromMem(start_index + kRegistersBase, base, num_registers);
}
|
|
XE_NOINLINE
// Writes the SAME register num_times in a row, consuming one big-endian dword
// from the ring per write (used for registers that act as data ports).
void CommandProcessor::WriteOneRegisterFromRing(uint32_t base,
                                                uint32_t num_times) {
  for (uint32_t write_index = 0; write_index < num_times; ++write_index) {
    WriteRegister(base, reader_.ReadAndSwap<uint32_t>());
  }
}
|
|
// Services a guest coherency request: reads the COHER_* registers, logs which
// caches (vertex/texture) the guest wants flushed, and clears the status bits
// so a pending WAIT_REG_MEM on them can proceed. The base class does not
// actually invalidate anything; backends hook the range via the registers.
void CommandProcessor::MakeCoherent() {
  SCOPE_profile_cpu_f("gpu");

  // Status host often has 0x01000000 or 0x03000000.
  // This is likely toggling VC (vertex cache) or TC (texture cache).
  // Or, it also has a direction in here maybe - there is probably
  // some way to check for dest coherency (what all the COHER_DEST_BASE_*
  // registers are for).
  // Best docs I've found on this are here:
  // https://web.archive.org/web/20160711162346/https://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2013/10/R6xx_R7xx_3D.pdf
  // https://cgit.freedesktop.org/xorg/driver/xf86-video-radeonhd/tree/src/r6xx_accel.c?id=3f8b6eccd9dba116cc4801e7f80ce21a879c67d2#n454

  RegisterFile* regs = register_file_;
  auto& status_host = regs->Get<reg::COHER_STATUS_HOST>();
  auto base_host = regs->values[XE_GPU_REG_COHER_BASE_HOST].u32;
  auto size_host = regs->values[XE_GPU_REG_COHER_SIZE_HOST].u32;

  // No request pending - nothing to do.
  if (!status_host.status) {
    return;
  }

  // Describe the requested action for the log line below.
  const char* action = "N/A";
  if (status_host.vc_action_ena && status_host.tc_action_ena) {
    action = "VC | TC";
  } else if (status_host.tc_action_ena) {
    action = "TC";
  } else if (status_host.vc_action_ena) {
    action = "VC";
  }

  // TODO(benvanik): notify resource cache of base->size and type.
  XELOGD("Make {:08X} -> {:08X} ({}b) coherent, action = {}", base_host,
         base_host + size_host, size_host, action);

  // Mark coherent.
  status_host.status = 0;
}
|
|
|
|
// Called before the worker parks in its idle spin/wait; flushes buffered
// trace data so it is not held indefinitely while stalled.
void CommandProcessor::PrepareForWait() { trace_writer_.Flush(); }
|
|
|
|
// Counterpart of PrepareForWait; no-op in the base class, backends override.
void CommandProcessor::ReturnFromWait() {}
|
|
|
|
|
|
// Writes the baseline state (full register file and gamma ramps) at the start
// of a trace so playback can reconstruct the initial GPU state.
void CommandProcessor::InitializeTrace() {
  // Write the initial register values, to be loaded directly into the
  // RegisterFile since all registers, including those that may have side
  // effects on setting, will be saved.
  trace_writer_.WriteRegisters(
      0, reinterpret_cast<const uint32_t*>(register_file_->values),
      RegisterFile::kRegisterCount, false);

  trace_writer_.WriteGammaRamp(gamma_ramp_256_entry_table(),
                               gamma_ramp_pwl_rgb(), gamma_ramp_rw_component_);
}
|
|
#define COMMAND_PROCESSOR CommandProcessor
|
|
#include "pm4_command_processor_implement.h"
|
|
} // namespace gpu
|
|
} // namespace xe
|