// xenia/src/xenia/gpu/command_processor.cc

/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2022 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#include "xenia/gpu/command_processor.h"

#include <algorithm>
#include <cinttypes>

#include "third_party/fmt/include/fmt/format.h"
#include "xenia/base/byte_stream.h"
#include "xenia/base/cvar.h"
#include "xenia/base/logging.h"
#include "xenia/base/profiling.h"
#include "xenia/gpu/gpu_flags.h"
#include "xenia/gpu/graphics_system.h"
#include "xenia/gpu/packet_disassembler.h"
#include "xenia/gpu/sampler_info.h"
#include "xenia/gpu/texture_info.h"
#include "xenia/kernel/kernel_state.h"
#include "xenia/kernel/user_module.h"
#if !defined(NDEBUG)
#define XE_ENABLE_GPU_REG_WRITE_LOGGING 1
#endif
// Debug-only: log every guest-driven GPU register write (see
// CommandProcessor::LogRegisterSet / LogRegisterSets).
DEFINE_bool(
    log_guest_driven_gpu_register_written_values, false,
    "Only does anything in debug builds, if set will log every write to a gpu "
    "register done by a guest. Does not log writes that are done by the CP on "
    "its own, just ones the guest makes or instructs it to make.",
    "GPU");
// Debug-only: PM4 packet disassembly. NOTE(review): not referenced in this
// file; presumably consumed by the PM4 implementation included at the bottom.
DEFINE_bool(disassemble_pm4, false,
            "Only does anything in debug builds, if set will disassemble and "
            "log all PM4 packets sent to the CP.",
            "GPU");
// Debug-only: log a guest pseudo-stacktrace each time the ring buffer write
// pointer is updated (see LogKickoffInitator).
DEFINE_bool(
    log_ringbuffer_kickoff_initiator_bts, false,
    "Only does anything in debug builds, if set will log the pseudo-stacktrace "
    "of the guest thread that wrote the new read position.",
    "GPU");
// Game-specific workaround; toggled at runtime via CommonSaveGPUSetting.
DEFINE_bool(clear_memory_page_state, false,
            "Refresh state of memory pages to enable gpu written data. (Use "
            "for 'Team Ninja' Games to fix missing character models)",
            "GPU");
namespace xe {
namespace gpu {
// Applies a common GPU setting change by overriding the backing cvar.
void CommonSaveGPUSetting(CommonGPUSetting setting, uint64_t value) {
  if (setting == CommonGPUSetting::ClearMemoryPageState) {
    OVERRIDE_bool(clear_memory_page_state, (bool)value);
  }
}
using namespace xe::gpu::xenos;
// Binds the command processor to its owning graphics system and kernel state.
// NOTE(review): reader_ starts empty (nullptr, 0); presumably rebound before
// command data is parsed — confirm in the PM4 implementation.
// write_ptr_index_event_ is auto-reset, so each Set() wakes the worker once.
CommandProcessor::CommandProcessor(GraphicsSystem* graphics_system,
                                   kernel::KernelState* kernel_state)
    : reader_(nullptr, 0),
      memory_(graphics_system->memory()),
      kernel_state_(kernel_state),
      graphics_system_(graphics_system),
      register_file_(graphics_system_->register_file()),
      trace_writer_(graphics_system->memory()->physical_membase()),
      worker_running_(true),
      write_ptr_index_event_(xe::threading::Event::CreateAutoResetEvent(false)),
      write_ptr_index_(0) {
  assert_not_null(write_ptr_index_event_);
}
CommandProcessor::~CommandProcessor() = default;
// Sets up the default (linear) gamma ramps and spawns the worker thread that
// consumes the ring buffer. Always returns true.
bool CommandProcessor::Initialize() {
  // Initialize the gamma ramps to their default (linear) values - taken from
  // what games set when starting with the sRGB (return value 1)
  // VdGetCurrentDisplayGamma.
  for (uint32_t i = 0; i < 256; ++i) {
    // Scale the 8-bit index to the 10-bit color range for a linear ramp.
    uint32_t value = i * 0x3FF / 0xFF;
    reg::DC_LUT_30_COLOR& gamma_ramp_entry = gamma_ramp_256_entry_table_[i];
    gamma_ramp_entry.color_10_blue = value;
    gamma_ramp_entry.color_10_green = value;
    gamma_ramp_entry.color_10_red = value;
  }
  for (uint32_t i = 0; i < 128; ++i) {
    reg::DC_LUT_PWL_DATA gamma_ramp_entry = {};
    // Low 6 bits are masked off, matching the "bits 0:5 are hardwired to
    // zero" behavior in HandleSpecialRegisterWrite.
    gamma_ramp_entry.base = (i * 0xFFFF / 0x7F) & ~UINT32_C(0x3F);
    // Constant slope between entries; the final entry has no successor.
    gamma_ramp_entry.delta = i < 0x7F ? 0x200 : 0;
    // The same ramp is used for all three of R, G and B.
    for (uint32_t j = 0; j < 3; ++j) {
      gamma_ramp_pwl_rgb_[i][j] = gamma_ramp_entry;
    }
  }
  worker_running_ = true;
  worker_thread_ =
      kernel::object_ref<kernel::XHostThread>(new kernel::XHostThread(
          kernel_state_, 128 * 1024, 0,
          [this]() {
            WorkerThreadMain();
            return 0;
          },
          kernel_state_->GetIdleProcess()));
  worker_thread_->set_name("GPU Commands");
  worker_thread_->Create();
  return true;
}
// Stops tracing, signals the worker loop to exit, and joins the worker
// thread. Safe to call more than once (or if Initialize was never called):
// without the null check, a second call would dereference the reset
// worker_thread_.
void CommandProcessor::Shutdown() {
  EndTracing();
  worker_running_ = false;
  // Wake the worker in case it is sleeping on the write pointer event.
  write_ptr_index_event_->Set();
  if (worker_thread_) {
    worker_thread_->Wait(0, 0, 0, nullptr);
    worker_thread_.reset();
  }
}
// Backend hook for attaching a persistent shader cache; the base
// implementation maintains no shader storage.
void CommandProcessor::InitializeShaderStorage(
    const std::filesystem::path& cache_root, uint32_t title_id, bool blocking) {
}
// Queues a single-frame trace capture. Rejected while any other capture
// (streaming or a pending frame trace) is active.
void CommandProcessor::RequestFrameTrace(
    const std::filesystem::path& root_path) {
  switch (trace_state_) {
    case TraceState::kStreaming:
      XELOGE("Streaming trace; cannot also trace frame.");
      return;
    case TraceState::kSingleFrame:
      XELOGE("Frame trace already pending; ignoring.");
      return;
    default:
      break;
  }
  trace_state_ = TraceState::kSingleFrame;
  trace_frame_path_ = root_path;
}
// Requests a streaming trace. Rejected while any capture is already active.
void CommandProcessor::BeginTracing(const std::filesystem::path& root_path) {
  switch (trace_state_) {
    case TraceState::kStreaming:
      XELOGE("Streaming already active; ignoring request.");
      return;
    case TraceState::kSingleFrame:
      XELOGE("Frame trace pending; ignoring streaming request.");
      return;
    default:
      break;
  }
  // Streaming starts on the next primary buffer execute.
  trace_state_ = TraceState::kStreaming;
  trace_stream_path_ = root_path;
}
// Finalizes a streaming capture. Only a streaming trace leaves the writer
// open, so a closed writer means there is nothing to do.
void CommandProcessor::EndTracing() {
  if (trace_writer_.is_open()) {
    assert_true(trace_state_ == TraceState::kStreaming);
    trace_state_ = TraceState::kDisabled;
    trace_writer_.Close();
  }
}
// Loads saved register values back into the register file, either through
// WriteRegister (so register side effects fire) or as a raw copy. Requests
// that fall outside the register file are warned about and clamped.
void CommandProcessor::RestoreRegisters(uint32_t first_register,
                                        const uint32_t* register_values,
                                        uint32_t register_count,
                                        bool execute_callbacks) {
  const uint32_t total_registers = uint32_t(RegisterFile::kRegisterCount);
  const bool start_out_of_range = first_register > total_registers;
  if (start_out_of_range ||
      register_count > total_registers - first_register) {
    XELOGW(
        "CommandProcessor::RestoreRegisters out of bounds (0x{:X} registers "
        "starting with 0x{:X}, while a total of 0x{:X} registers are stored)",
        register_count, first_register, total_registers);
    if (start_out_of_range) {
      // The starting index itself is invalid - nothing restorable.
      return;
    }
    // Clamp to the registers that actually exist.
    register_count = total_registers - first_register;
  }
  if (!execute_callbacks) {
    std::memcpy(register_file_->values + first_register, register_values,
                sizeof(uint32_t) * register_count);
    return;
  }
  for (uint32_t i = 0; i < register_count; ++i) {
    WriteRegister(first_register + i, register_values[i]);
  }
}
// Restores both gamma ramp LUTs (256-entry 10:10:10 table and 128-entry PWL
// table per RGB channel) plus the sequential write component index, then
// notifies the backend so it can re-upload the ramps.
void CommandProcessor::RestoreGammaRamp(
    const reg::DC_LUT_30_COLOR* new_gamma_ramp_256_entry_table,
    const reg::DC_LUT_PWL_DATA* new_gamma_ramp_pwl_rgb,
    uint32_t new_gamma_ramp_rw_component) {
  constexpr size_t k256TableBytes = sizeof(reg::DC_LUT_30_COLOR) * 256;
  constexpr size_t kPwlTableBytes = sizeof(reg::DC_LUT_PWL_DATA) * 3 * 128;
  std::memcpy(gamma_ramp_256_entry_table_, new_gamma_ramp_256_entry_table,
              k256TableBytes);
  std::memcpy(gamma_ramp_pwl_rgb_, new_gamma_ramp_pwl_rgb, kPwlTableBytes);
  gamma_ramp_rw_component_ = new_gamma_ramp_rw_component;
  OnGammaRamp256EntryTableValueWritten();
  OnGammaRampPWLValueWritten();
}
// Runs fn on the worker thread. If we are already on the worker thread and no
// queued functions would be reordered ahead of it, it runs inline; otherwise
// it is queued for the worker loop to pick up.
void CommandProcessor::CallInThread(std::function<void()> fn) {
  const bool can_run_inline =
      pending_fns_.empty() && kernel::XThread::IsInThread(worker_thread_.get());
  if (can_run_inline) {
    fn();
    return;
  }
  pending_fns_.push(std::move(fn));
}
void CommandProcessor::ClearCaches() {}
// Records the requested swap post-processing effect and forwards it to the
// worker thread, which owns the "actual" copy of the value.
void CommandProcessor::SetDesiredSwapPostEffect(
    SwapPostEffect swap_post_effect) {
  if (swap_post_effect_desired_ != swap_post_effect) {
    swap_post_effect_desired_ = swap_post_effect;
    CallInThread([this, swap_post_effect]() {
      swap_post_effect_actual_ = swap_post_effect;
    });
  }
}
// Worker thread entry point: drains deferred functions, waits for the guest
// to advance the write pointer, and executes the primary ring buffer until
// Shutdown() clears worker_running_.
void CommandProcessor::WorkerThreadMain() {
  if (!SetupContext()) {
    xe::FatalError("Unable to setup command processor internal state");
    return;
  }
  while (worker_running_) {
    // Run functions queued via CallInThread first, in FIFO order.
    while (!pending_fns_.empty()) {
      auto fn = std::move(pending_fns_.front());
      pending_fns_.pop();
      fn();
    }
    uint32_t write_ptr_index = write_ptr_index_.load();
    // NOTE(review): 0xBAADF00D appears to be a "write pointer not yet valid"
    // marker, treated the same as an empty ring buffer.
    if (write_ptr_index == 0xBAADF00D || read_ptr_index_ == write_ptr_index) {
      SCOPE_profile_cpu_i("gpu", "xe::gpu::CommandProcessor::Stall");
      // We've run out of commands to execute.
      // We spin here waiting for new ones, as the overhead of waiting on our
      // event is too high.
      PrepareForWait();
      uint32_t loop_count = 0;
      do {
        // If we spin around too much, revert to a "low-power" state.
        if (loop_count > 500) {
          const int wait_time_ms = 2;
          xe::threading::Wait(write_ptr_index_event_.get(), true,
                              std::chrono::milliseconds(wait_time_ms));
        } else {
          xe::threading::MaybeYield();
        }
        loop_count++;
        write_ptr_index = write_ptr_index_.load();
      } while (worker_running_ && pending_fns_.empty() &&
               (write_ptr_index == 0xBAADF00D ||
                read_ptr_index_ == write_ptr_index));
      ReturnFromWait();
      // Re-run the loop so shutdown or queued functions are handled before
      // any command execution.
      if (!worker_running_ || !pending_fns_.empty()) {
        continue;
      }
    }
    assert_true(read_ptr_index_ != write_ptr_index);
    // Execute. Note that we handle wraparound transparently.
    read_ptr_index_ = ExecutePrimaryBuffer(read_ptr_index_, write_ptr_index);
    // TODO(benvanik): use reader->Read_update_freq_ and only issue after moving
    // that many indices.
    // Keep in mind that the gpu also updates the cpu-side copy if the write
    // pointer and read pointer would be equal
    if (read_ptr_writeback_ptr_) {
      // Publish the new read pointer to guest memory (big-endian).
      xe::store_and_swap<uint32_t>(
          memory_->TranslatePhysical(read_ptr_writeback_ptr_), read_ptr_index_);
    }
    // FIXME: We're supposed to process the WAIT_UNTIL register at this point,
    // but no games seem to actually use it.
  }
  ShutdownContext();
}
// Asks the worker thread to suspend itself and blocks until the request has
// been picked up. The fence is signaled just before the worker suspends, so
// the worker may still be finishing its Suspend() call when this returns.
void CommandProcessor::Pause() {
  if (paused_) {
    return;
  }
  paused_ = true;
  threading::Fence fence;
  CallInThread([&fence]() {
    fence.Signal();
    threading::Thread::GetCurrentThread()->Suspend();
  });
  fence.Wait();
}
// Resumes the worker thread previously suspended by Pause(); a no-op when not
// paused.
void CommandProcessor::Resume() {
  if (paused_) {
    paused_ = false;
    worker_thread_->thread()->Resume();
  }
}
// Serializes the CP execution state for a savestate. The worker must be
// paused so the values are stable. Field order must match Restore() exactly.
bool CommandProcessor::Save(ByteStream* stream) {
  assert_true(paused_);
  stream->Write<uint32_t>(primary_buffer_ptr_);
  stream->Write<uint32_t>(primary_buffer_size_);
  stream->Write<uint32_t>(read_ptr_index_);
  stream->Write<uint32_t>(read_ptr_update_freq_);
  stream->Write<uint32_t>(read_ptr_writeback_ptr_);
  stream->Write<uint32_t>(write_ptr_index_.load());
  return true;
}
// Restores the CP execution state from a savestate. The worker must be
// paused. Field order must match Save() exactly; the data is trusted as-is.
bool CommandProcessor::Restore(ByteStream* stream) {
  assert_true(paused_);
  primary_buffer_ptr_ = stream->Read<uint32_t>();
  primary_buffer_size_ = stream->Read<uint32_t>();
  read_ptr_index_ = stream->Read<uint32_t>();
  read_ptr_update_freq_ = stream->Read<uint32_t>();
  read_ptr_writeback_ptr_ = stream->Read<uint32_t>();
  write_ptr_index_.store(stream->Read<uint32_t>());
  return true;
}
// Backend hook run on the worker thread before the main loop; the base
// implementation has nothing to set up.
bool CommandProcessor::SetupContext() { return true; }
// Backend hook run when the worker loop exits; counterpart of SetupContext.
void CommandProcessor::ShutdownContext() {}
// Points the CP at a new primary ring buffer and zeroes its contents.
// size_log2 is in quadwords (8 bytes), hence the +3 shift for the byte size.
void CommandProcessor::InitializeRingBuffer(uint32_t ptr, uint32_t size_log2) {
  read_ptr_index_ = 0;
  primary_buffer_ptr_ = ptr;
  primary_buffer_size_ = uint32_t(1) << (size_log2 + 3);
  auto buffer_base =
      kernel_state_->memory()->TranslatePhysical(primary_buffer_ptr_);
  std::memset(buffer_base, 0, primary_buffer_size_);
}
// Configures periodic write-back of the CP read pointer to guest memory so
// the guest can track ring buffer consumption.
void CommandProcessor::EnableReadPointerWriteBack(uint32_t ptr,
                                                  uint32_t block_size_log2) {
  // CP_RB_RPTR_ADDR Ring Buffer Read Pointer Address 0x70C
  // ptr = RB_RPTR_ADDR, pointer to write back the address to.
  read_ptr_writeback_ptr_ = ptr;
  // CP_RB_CNTL Ring Buffer Control 0x704
  // block_size = RB_BLKSZ, log2 of number of quadwords read between updates of
  // the read pointer.
  // NOTE(review): the >> 2 presumably converts quadwords to the index units
  // used by read_ptr_index_ - confirm against ExecutePrimaryBuffer.
  read_ptr_update_freq_ = uint32_t(1) << block_size_log2 >> 2;
}
// Logs a guest pseudo-stacktrace identifying the guest code that kicked off
// the ring buffer (i.e. wrote the new write pointer). Cold debug-only path
// split out of UpdateWritePointer; 'value' is the new write pointer index.
XE_NOINLINE XE_COLD void CommandProcessor::LogKickoffInitator(uint32_t value) {
  cpu::backend::GuestPseudoStackTrace st;
  if (logging::internal::ShouldLog(LogLevel::Debug) &&
      kernel_state_->processor()->backend()->PopulatePseudoStacktrace(&st)) {
    logging::LoggerBatch<LogLevel::Debug> log_initiator{};
    log_initiator("Updating read ptr to {}, initiator stacktrace below\n",
                  value);
    for (uint32_t i = 0; i < st.count; ++i) {
      log_initiator("\t{:08X}\n", st.return_addrs[i]);
    }
    if (st.truncated_flag) {
      log_initiator("\t(Truncated stacktrace to {} entries)\n",
                    cpu::backend::MAX_GUEST_PSEUDO_STACKTRACE_ENTRIES);
    }
    // 'd' matches the tag used for debug lines elsewhere in this file.
    log_initiator.submit('d');
  }
}
// Publishes the guest's new write pointer and wakes the worker thread,
// boosting its priority so the new commands are picked up promptly.
void CommandProcessor::UpdateWritePointer(uint32_t value) {
  // Cold debug path: identify the guest code performing the kickoff.
  XE_UNLIKELY_IF(cvars::log_ringbuffer_kickoff_initiator_bts) {
    LogKickoffInitator(value);
  }
  write_ptr_index_ = value;
  write_ptr_index_event_->SetBoostPriority();
}
// Debug-only: logs a single guest-driven register write, using the register's
// symbolic name when one is known.
void CommandProcessor::LogRegisterSet(uint32_t register_index, uint32_t value) {
#if XE_ENABLE_GPU_REG_WRITE_LOGGING == 1
  if (!cvars::log_guest_driven_gpu_register_written_values ||
      !logging::internal::ShouldLog(LogLevel::Debug)) {
    return;
  }
  const RegisterInfo* reginfo = RegisterFile::GetRegisterInfo(register_index);
  if (reginfo) {
    XELOGD("{} <- {:08X}\n", reginfo->name, value);
  } else {
    XELOGD("Unknown_Reg{:04X} <- {:08X}\n", register_index, value);
  }
#endif
}
// Debug-only: logs a batch of register writes sourced from big-endian guest
// memory as one debug log line, using symbolic register names when known.
void CommandProcessor::LogRegisterSets(uint32_t base_register_index,
                                       const uint32_t* values,
                                       uint32_t n_values) {
#if XE_ENABLE_GPU_REG_WRITE_LOGGING == 1
  if (cvars::log_guest_driven_gpu_register_written_values &&
      logging::internal::ShouldLog(LogLevel::Debug)) {
    auto target = logging::internal::GetThreadBuffer();
    auto target_ptr = target.first;
    size_t total_size = 0;
    size_t rem_size = target.second;
    for (uint32_t i = 0; i < n_values; ++i) {
      uint32_t register_index = base_register_index + i;
      // Values are stored big-endian in guest memory.
      uint32_t value = xe::load_and_swap<uint32_t>(&values[i]);
      const RegisterInfo* reginfo =
          RegisterFile::GetRegisterInfo(register_index);
      size_t formatted_size;
      if (!reginfo) {
        auto tmpres = fmt::format_to_n(target_ptr, rem_size,
                                       "Unknown_Reg{:04X} <- {:08X}\n",
                                       register_index, value);
        target_ptr = tmpres.out;
        formatted_size = static_cast<size_t>(tmpres.size);
      } else {
        auto tmpres = fmt::format_to_n(target_ptr, rem_size, "{} <- {:08X}\n",
                                       reginfo->name, value);
        target_ptr = tmpres.out;
        formatted_size = static_cast<size_t>(tmpres.size);
      }
      // format_to_n reports the untruncated size; clamp to the space that was
      // actually available. Previously `rem_size -= tmpres.size` underflowed
      // when the thread buffer filled up, wrapping rem_size to a huge value
      // and letting later iterations write past the buffer (and overcounting
      // total_size).
      size_t written = std::min(formatted_size, rem_size);
      rem_size -= written;
      total_size += written;
    }
    logging::internal::AppendLogLine(LogLevel::Debug, 'd', total_size);
  }
#endif
}
// Applies side effects for register writes that the CP must react to beyond
// just storing the value: scratch register writeback to guest memory, the
// coherency status dirty bit, and the display controller gamma ramp LUT
// access registers (sequential 256-entry and PWL modes).
void CommandProcessor::HandleSpecialRegisterWrite(uint32_t index,
                                                  uint32_t value) {
  RegisterFile& regs = *register_file_;
  // Scratch register writeback.
  if (index >= XE_GPU_REG_SCRATCH_REG0 && index <= XE_GPU_REG_SCRATCH_REG7) {
    uint32_t scratch_reg = index - XE_GPU_REG_SCRATCH_REG0;
    // SCRATCH_UMSK holds a per-register enable bitmask for the writeback.
    if ((1 << scratch_reg) & regs.values[XE_GPU_REG_SCRATCH_UMSK]) {
      // Enabled - write to address.
      uint32_t scratch_addr = regs.values[XE_GPU_REG_SCRATCH_ADDR];
      uint32_t mem_addr = scratch_addr + (scratch_reg * 4);
      xe::store_and_swap<uint32_t>(memory_->TranslatePhysical(mem_addr), value);
    }
  } else {
    switch (index) {
      // If this is a COHER register, set the dirty flag.
      // This will block the command processor the next time it WAIT_MEM_REGs
      // and allow us to synchronize the memory.
      case XE_GPU_REG_COHER_STATUS_HOST: {
        regs.values[index] |= UINT32_C(0x80000000);
      } break;
      case XE_GPU_REG_DC_LUT_RW_INDEX: {
        // Reset the sequential read / write component index (see the M56
        // DC_LUT_SEQ_COLOR documentation).
        gamma_ramp_rw_component_ = 0;
      } break;
      case XE_GPU_REG_DC_LUT_SEQ_COLOR: {
        // Should be in the 256-entry table writing mode.
        assert_zero(regs[XE_GPU_REG_DC_LUT_RW_MODE] & 0b1);
        auto gamma_ramp_rw_index = regs.Get<reg::DC_LUT_RW_INDEX>();
        // DC_LUT_SEQ_COLOR is in the red, green, blue order, but the write
        // enable mask is blue, green, red.
        bool write_gamma_ramp_component =
            (regs[XE_GPU_REG_DC_LUT_WRITE_EN_MASK] &
             (UINT32_C(1) << (2 - gamma_ramp_rw_component_))) != 0;
        if (write_gamma_ramp_component) {
          reg::DC_LUT_30_COLOR& gamma_ramp_entry =
              gamma_ramp_256_entry_table_[gamma_ramp_rw_index.rw_index];
          // Bits 0:5 are hardwired to zero.
          uint32_t gamma_ramp_seq_color =
              regs.Get<reg::DC_LUT_SEQ_COLOR>().seq_color >> 6;
          switch (gamma_ramp_rw_component_) {
            case 0:
              gamma_ramp_entry.color_10_red = gamma_ramp_seq_color;
              break;
            case 1:
              gamma_ramp_entry.color_10_green = gamma_ramp_seq_color;
              break;
            case 2:
              gamma_ramp_entry.color_10_blue = gamma_ramp_seq_color;
              break;
          }
        }
        // After all three components, advance to the next LUT entry through
        // WriteRegister so the RW_INDEX side effect (component reset) fires.
        if (++gamma_ramp_rw_component_ >= 3) {
          gamma_ramp_rw_component_ = 0;
          reg::DC_LUT_RW_INDEX new_gamma_ramp_rw_index = gamma_ramp_rw_index;
          ++new_gamma_ramp_rw_index.rw_index;
          WriteRegister(
              XE_GPU_REG_DC_LUT_RW_INDEX,
              xe::memory::Reinterpret<uint32_t>(new_gamma_ramp_rw_index));
        }
        if (write_gamma_ramp_component) {
          OnGammaRamp256EntryTableValueWritten();
        }
      } break;
      case XE_GPU_REG_DC_LUT_PWL_DATA: {
        // Should be in the PWL writing mode.
        assert_not_zero(regs[XE_GPU_REG_DC_LUT_RW_MODE] & 0b1);
        auto gamma_ramp_rw_index = regs.Get<reg::DC_LUT_RW_INDEX>();
        // Bit 7 of the index is ignored for PWL.
        uint32_t gamma_ramp_rw_index_pwl = gamma_ramp_rw_index.rw_index & 0x7F;
        // DC_LUT_PWL_DATA is likely in the red, green, blue order because
        // DC_LUT_SEQ_COLOR is, but the write enable mask is blue, green, red.
        bool write_gamma_ramp_component =
            (regs[XE_GPU_REG_DC_LUT_WRITE_EN_MASK] &
             (UINT32_C(1) << (2 - gamma_ramp_rw_component_))) != 0;
        if (write_gamma_ramp_component) {
          reg::DC_LUT_PWL_DATA& gamma_ramp_entry =
              gamma_ramp_pwl_rgb_[gamma_ramp_rw_index_pwl]
                                 [gamma_ramp_rw_component_];
          auto gamma_ramp_value = regs.Get<reg::DC_LUT_PWL_DATA>();
          // Bits 0:5 are hardwired to zero.
          gamma_ramp_entry.base = gamma_ramp_value.base & ~UINT32_C(0x3F);
          gamma_ramp_entry.delta = gamma_ramp_value.delta & ~UINT32_C(0x3F);
        }
        if (++gamma_ramp_rw_component_ >= 3) {
          gamma_ramp_rw_component_ = 0;
          reg::DC_LUT_RW_INDEX new_gamma_ramp_rw_index = gamma_ramp_rw_index;
          // TODO(Triang3l): Should this increase beyond 7 bits for PWL?
          // Direct3D 9 explicitly sets rw_index to 0x80 after writing the last
          // PWL entry. However, the DC_LUT_RW_INDEX documentation says that for
          // PWL, the bit 7 is ignored.
          new_gamma_ramp_rw_index.rw_index =
              (gamma_ramp_rw_index.rw_index & ~UINT32_C(0x7F)) |
              ((gamma_ramp_rw_index_pwl + 1) & 0x7F);
          WriteRegister(
              XE_GPU_REG_DC_LUT_RW_INDEX,
              xe::memory::Reinterpret<uint32_t>(new_gamma_ramp_rw_index));
        }
        if (write_gamma_ramp_component) {
          OnGammaRampPWLValueWritten();
        }
      } break;
      case XE_GPU_REG_DC_LUT_30_COLOR: {
        // Should be in the 256-entry table writing mode.
        assert_zero(regs[XE_GPU_REG_DC_LUT_RW_MODE] & 0b1);
        auto gamma_ramp_rw_index = regs.Get<reg::DC_LUT_RW_INDEX>();
        // Write enable mask is in blue, green, red bit order.
        uint32_t gamma_ramp_write_enable_mask =
            regs[XE_GPU_REG_DC_LUT_WRITE_EN_MASK] & 0b111;
        if (gamma_ramp_write_enable_mask) {
          reg::DC_LUT_30_COLOR& gamma_ramp_entry =
              gamma_ramp_256_entry_table_[gamma_ramp_rw_index.rw_index];
          auto gamma_ramp_value = regs.Get<reg::DC_LUT_30_COLOR>();
          if (gamma_ramp_write_enable_mask & 0b001) {
            gamma_ramp_entry.color_10_blue = gamma_ramp_value.color_10_blue;
          }
          if (gamma_ramp_write_enable_mask & 0b010) {
            gamma_ramp_entry.color_10_green = gamma_ramp_value.color_10_green;
          }
          if (gamma_ramp_write_enable_mask & 0b100) {
            gamma_ramp_entry.color_10_red = gamma_ramp_value.color_10_red;
          }
        }
        // TODO(Triang3l): Should this reset the component write index? If this
        // increase is assumed to behave like a full DC_LUT_RW_INDEX write, it
        // probably should. Currently this also calls WriteRegister for
        // DC_LUT_RW_INDEX, which resets gamma_ramp_rw_component_ as well.
        gamma_ramp_rw_component_ = 0;
        reg::DC_LUT_RW_INDEX new_gamma_ramp_rw_index = gamma_ramp_rw_index;
        ++new_gamma_ramp_rw_index.rw_index;
        WriteRegister(
            XE_GPU_REG_DC_LUT_RW_INDEX,
            xe::memory::Reinterpret<uint32_t>(new_gamma_ramp_rw_index));
        if (gamma_ramp_write_enable_mask) {
          OnGammaRamp256EntryTableValueWritten();
        }
      } break;
    }
  }
}
// Stores a value into the register file and dispatches to
// HandleSpecialRegisterWrite for the few ranges with side effects (scratch
// registers, COHER_STATUS_HOST and the DC_LUT gamma ramp registers).
void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
  // chrispy: rearrange check order, place set after checks
  if (XE_LIKELY(index < RegisterFile::kRegisterCount)) {
    register_file_->values[index] = value;
    // quick pre-test
    // todo: figure out just how unlikely this is. if very (it ought to be,
    // theres a ton of registers other than these) make this predicate
    // branchless and mark with unlikely, then make HandleSpecialRegisterWrite
    // noinline yep, its very unlikely. these ORS here are meant to be bitwise
    // ors, so that we do not do branching evaluation of the conditions (we will
    // almost always take all of the branches)
    // The unsigned subtractions wrap for indices below each range's start, so
    // every range test is a single compare.
    unsigned expr = (index - XE_GPU_REG_SCRATCH_REG0 < 8) |
                    (index == XE_GPU_REG_COHER_STATUS_HOST) |
                    ((index - XE_GPU_REG_DC_LUT_RW_INDEX) <=
                     (XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX));
    // chrispy: reordered for msvc branch probability (assumes if is taken and
    // else is not)
    if (XE_LIKELY(expr == 0)) {
      XE_MSVC_REORDER_BARRIER();
    } else {
      HandleSpecialRegisterWrite(index, value);
    }
  } else {
    XELOGW("CommandProcessor::WriteRegister index out of bounds: {}", index);
    return;
  }
}
void CommandProcessor::WriteRegistersFromMem(uint32_t start_index,
uint32_t* base,
uint32_t num_registers) {
for (uint32_t i = 0; i < num_registers; ++i) {
uint32_t data = xe::load_and_swap<uint32_t>(base + i);
this->WriteRegister(start_index + i, data);
}
}
// Reads num_registers big-endian values from the ring buffer and writes them
// to consecutive registers starting at base.
void CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring,
                                                  uint32_t base,
                                                  uint32_t num_registers) {
  uint32_t register_index = base;
  uint32_t remaining = num_registers;
  while (remaining) {
    WriteRegister(register_index, ring->ReadAndSwap<uint32_t>());
    ++register_index;
    --remaining;
  }
}
// ALU constants live at register offset 0x4000.
void CommandProcessor::WriteALURangeFromRing(xe::RingBuffer* ring,
                                             uint32_t base,
                                             uint32_t num_times) {
  WriteRegisterRangeFromRing(ring, base + 0x4000, num_times);
}
// Fetch constants live at register offset 0x4800.
void CommandProcessor::WriteFetchRangeFromRing(xe::RingBuffer* ring,
                                               uint32_t base,
                                               uint32_t num_times) {
  WriteRegisterRangeFromRing(ring, base + 0x4800, num_times);
}
// Boolean constants live at register offset 0x4900.
void CommandProcessor::WriteBoolRangeFromRing(xe::RingBuffer* ring,
                                              uint32_t base,
                                              uint32_t num_times) {
  WriteRegisterRangeFromRing(ring, base + 0x4900, num_times);
}
// Loop constants live at register offset 0x4908.
void CommandProcessor::WriteLoopRangeFromRing(xe::RingBuffer* ring,
                                              uint32_t base,
                                              uint32_t num_times) {
  WriteRegisterRangeFromRing(ring, base + 0x4908, num_times);
}
// The REGISTERS space starts at offset 0x2000.
void CommandProcessor::WriteREGISTERSRangeFromRing(xe::RingBuffer* ring,
                                                   uint32_t base,
                                                   uint32_t num_times) {
  WriteRegisterRangeFromRing(ring, base + 0x2000, num_times);
}
// Memory-sourced variants of the bank writers above: same fixed register
// offsets, with values loaded (and byte-swapped) from guest memory.
void CommandProcessor::WriteALURangeFromMem(uint32_t start_index,
                                            uint32_t* base,
                                            uint32_t num_registers) {
  WriteRegistersFromMem(start_index + 0x4000, base, num_registers);
}
void CommandProcessor::WriteFetchRangeFromMem(uint32_t start_index,
                                              uint32_t* base,
                                              uint32_t num_registers) {
  WriteRegistersFromMem(start_index + 0x4800, base, num_registers);
}
void CommandProcessor::WriteBoolRangeFromMem(uint32_t start_index,
                                             uint32_t* base,
                                             uint32_t num_registers) {
  WriteRegistersFromMem(start_index + 0x4900, base, num_registers);
}
void CommandProcessor::WriteLoopRangeFromMem(uint32_t start_index,
                                             uint32_t* base,
                                             uint32_t num_registers) {
  WriteRegistersFromMem(start_index + 0x4908, base, num_registers);
}
void CommandProcessor::WriteREGISTERSRangeFromMem(uint32_t start_index,
                                                  uint32_t* base,
                                                  uint32_t num_registers) {
  WriteRegistersFromMem(start_index + 0x2000, base, num_registers);
}
// Writes num_times successive values from the primary reader to the same
// register index.
XE_NOINLINE
void CommandProcessor::WriteOneRegisterFromRing(uint32_t base,
                                                uint32_t num_times) {
  for (uint32_t remaining = num_times; remaining != 0; --remaining) {
    uint32_t swapped_value = reader_.ReadAndSwap<uint32_t>();
    WriteRegister(base, swapped_value);
  }
}
// Performs the cache coherency action described by the COHER_* registers,
// then clears COHER_STATUS_HOST to signal completion. (The dirty bit
// 0x80000000 is set when the guest writes the register - see
// HandleSpecialRegisterWrite.)
void CommandProcessor::MakeCoherent() {
  SCOPE_profile_cpu_f("gpu");
  // Status host often has 0x01000000 or 0x03000000.
  // This is likely toggling VC (vertex cache) or TC (texture cache).
  // Or, it also has a direction in here maybe - there is probably
  // some way to check for dest coherency (what all the COHER_DEST_BASE_*
  // registers are for).
  // Best docs I've found on this are here:
  // https://web.archive.org/web/20160711162346/https://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2013/10/R6xx_R7xx_3D.pdf
  // https://cgit.freedesktop.org/xorg/driver/xf86-video-radeonhd/tree/src/r6xx_accel.c?id=3f8b6eccd9dba116cc4801e7f80ce21a879c67d2#n454
  // Accesses go through a volatile pointer so each register is re-read here
  // rather than the compiler reusing earlier cached loads.
  volatile uint32_t* regs_volatile = register_file_->values;
  auto status_host = xe::memory::Reinterpret<reg::COHER_STATUS_HOST>(
      uint32_t(regs_volatile[XE_GPU_REG_COHER_STATUS_HOST]));
  uint32_t base_host = regs_volatile[XE_GPU_REG_COHER_BASE_HOST];
  uint32_t size_host = regs_volatile[XE_GPU_REG_COHER_SIZE_HOST];
  if (!status_host.status) {
    return;
  }
  const char* action = "N/A";
  if (status_host.vc_action_ena && status_host.tc_action_ena) {
    action = "VC | TC";
  } else if (status_host.tc_action_ena) {
    action = "TC";
  } else if (status_host.vc_action_ena) {
    action = "VC";
  }
  // TODO(benvanik): notify resource cache of base->size and type.
  XELOGD("Make {:08X} -> {:08X} ({}b) coherent, action = {}", base_host,
         base_host + size_host, size_host, action);
  // Mark coherent.
  regs_volatile[XE_GPU_REG_COHER_STATUS_HOST] = 0;
}
// Called before the worker spins/sleeps waiting for work; flushes buffered
// trace output so it is not held up by the stall.
void CommandProcessor::PrepareForWait() { trace_writer_.Flush(); }
// Counterpart of PrepareForWait; nothing to do in the base implementation.
void CommandProcessor::ReturnFromWait() {}
// Writes the full current GPU state (all registers and both gamma ramps) at
// the start of a trace so playback can restore it.
void CommandProcessor::InitializeTrace() {
  // Write the initial register values, to be loaded directly into the
  // RegisterFile since all registers, including those that may have side
  // effects on setting, will be saved.
  trace_writer_.WriteRegisters(
      0, reinterpret_cast<const uint32_t*>(register_file_->values),
      RegisterFile::kRegisterCount, false);
  trace_writer_.WriteGammaRamp(gamma_ramp_256_entry_table(),
                               gamma_ramp_pwl_rgb(), gamma_ramp_rw_component_);
}
#define COMMAND_PROCESSOR CommandProcessor
#include "pm4_command_processor_implement.h"
} // namespace gpu
} // namespace xe