mirror of
https://github.com/xenia-project/xenia.git
synced 2026-03-04 12:34:09 +01:00
The "close window" keyboard hotkey (Guide-B) now toggles between loglevel -1 and the loglevel set in your config. Added LoggerBatch class, which accumulates strings into the thread's scratch buffer. This is only intended to be used for very high-frequency debug logging; if it exhausts the thread buffer, it just silently stops. Cleaned nearly 8 years of dust off of the PM4 packet disassembler code; it now supports all packets that the command processor supports. Added extremely verbose logging for GPU register writes. This is not compiled in outside of debug builds, and requires LogLevel::Debug and log_guest_driven_gpu_register_written_values = true. Added full logging of all PM4 packets in the CP. This is not compiled in outside of debug builds, and requires LogLevel::Debug and disassemble_pm4. Piggybacked an implementation of guest callstack backtraces using the stackpoints from enable_host_guest_stack_synchronization. If enable_host_guest_stack_synchronization = false, no backtraces can be obtained. Added log_ringbuffer_kickoff_initiator_bts: when a thread updates the CP's read pointer, it dumps the backtrace of that thread. Changed the names of the GPU registers CALLBACK_ADDRESS and CALLBACK_CONTEXT to the correct names. Added a note about CP_PROG_COUNTER. Added CP_RB_WPTR to the GPU register table. Added notes about CP_RB_CNTL and CP_RB_RPTR_ADDR; neither is necessary for HLE. Changed the name of the UNKNOWN_0E00 GPU register to TC_CNTL_STATUS. Games only seem to write 1 to it (L2 invalidate).
740 lines
27 KiB
C++
740 lines
27 KiB
C++
/**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
 * Copyright 2022 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */
|
|
|
|
#include "xenia/gpu/command_processor.h"

#include <algorithm>
#include <cinttypes>

#include "third_party/fmt/include/fmt/format.h"
#include "xenia/base/byte_stream.h"
#include "xenia/base/cvar.h"
#include "xenia/base/logging.h"
#include "xenia/base/profiling.h"
#include "xenia/gpu/gpu_flags.h"
#include "xenia/gpu/graphics_system.h"
#include "xenia/gpu/packet_disassembler.h"
#include "xenia/gpu/sampler_info.h"
#include "xenia/gpu/texture_info.h"
#include "xenia/kernel/kernel_state.h"
#include "xenia/kernel/user_module.h"

#if !defined(NDEBUG)
#define XE_ENABLE_GPU_REG_WRITE_LOGGING 1
#endif
|
|
// Debug-only diagnostics toggles (see XE_ENABLE_GPU_REG_WRITE_LOGGING above).
DEFINE_bool(
    log_guest_driven_gpu_register_written_values, false,
    "Only does anything in debug builds, if set will log every write to a gpu "
    "register done by a guest. Does not log writes that are done by the CP on "
    "its own, just ones the guest makes or instructs it to make.",
    "GPU");

DEFINE_bool(disassemble_pm4, false,
            "Only does anything in debug builds, if set will disassemble and "
            "log all PM4 packets sent to the CP.",
            "GPU");

DEFINE_bool(
    log_ringbuffer_kickoff_initiator_bts, false,
    "Only does anything in debug builds, if set will log the pseudo-stacktrace "
    "of the guest thread that wrote the new read position.",
    "GPU");
|
|
|
namespace xe {
|
|
namespace gpu {
|
|
|
|
using namespace xe::gpu::xenos;
|
|
|
|
// Binds the command processor to the graphics system's memory, register file
// and trace writer. The worker thread is not started here; see Initialize().
// write_ptr_index_ starts at 0; the sentinel 0xBAADF00D (checked in
// WorkerThreadMain) means "not yet set by the guest".
CommandProcessor::CommandProcessor(GraphicsSystem* graphics_system,
                                   kernel::KernelState* kernel_state)
    : reader_(nullptr, 0),
      memory_(graphics_system->memory()),
      kernel_state_(kernel_state),
      graphics_system_(graphics_system),
      register_file_(graphics_system_->register_file()),
      trace_writer_(graphics_system->memory()->physical_membase()),
      worker_running_(true),
      write_ptr_index_event_(xe::threading::Event::CreateAutoResetEvent(false)),
      write_ptr_index_(0) {
  assert_not_null(write_ptr_index_event_);
}

CommandProcessor::~CommandProcessor() = default;
|
|
|
// Seeds both gamma ramp tables with identity (linear) curves and spawns the
// "GPU Commands" worker thread that runs WorkerThreadMain(). Returns true on
// success (thread creation is not checked here).
bool CommandProcessor::Initialize() {
  // Initialize the gamma ramps to their default (linear) values - taken from
  // what games set when starting with the sRGB (return value 1)
  // VdGetCurrentDisplayGamma.
  for (uint32_t i = 0; i < 256; ++i) {
    // Expand the 8-bit index to the 10-bit color range (0..0x3FF).
    uint32_t value = i * 0x3FF / 0xFF;
    reg::DC_LUT_30_COLOR& gamma_ramp_entry = gamma_ramp_256_entry_table_[i];
    gamma_ramp_entry.color_10_blue = value;
    gamma_ramp_entry.color_10_green = value;
    gamma_ramp_entry.color_10_red = value;
  }
  for (uint32_t i = 0; i < 128; ++i) {
    reg::DC_LUT_PWL_DATA gamma_ramp_entry = {};
    // Linear base, low 6 bits hardwired to zero (matches the DC_LUT_PWL_DATA
    // write path in HandleSpecialRegisterWrite).
    gamma_ramp_entry.base = (i * 0xFFFF / 0x7F) & ~UINT32_C(0x3F);
    // Constant slope between points; the last entry has no successor.
    gamma_ramp_entry.delta = i < 0x7F ? 0x200 : 0;
    // Same curve for all three of R, G, B.
    for (uint32_t j = 0; j < 3; ++j) {
      gamma_ramp_pwl_rgb_[i][j] = gamma_ramp_entry;
    }
  }

  worker_running_ = true;
  // 128 KiB of stack for the host thread that drains the ring buffer.
  worker_thread_ = kernel::object_ref<kernel::XHostThread>(
      new kernel::XHostThread(kernel_state_, 128 * 1024, 0, [this]() {
        WorkerThreadMain();
        return 0;
      }));
  worker_thread_->set_name("GPU Commands");
  worker_thread_->Create();

  return true;
}
|
|
|
|
// Stops the worker thread: close any streaming trace, clear the running flag,
// wake the thread from a possible event wait, then join and release it.
// Order matters: the flag must be cleared before Set() so the loop observes it.
void CommandProcessor::Shutdown() {
  EndTracing();

  worker_running_ = false;
  write_ptr_index_event_->Set();
  worker_thread_->Wait(0, 0, 0, nullptr);
  worker_thread_.reset();
}
|
|
|
|
// Base implementation is a no-op; backends override this to set up an
// on-disk shader cache for |title_id| under |cache_root|.
void CommandProcessor::InitializeShaderStorage(
    const std::filesystem::path& cache_root, uint32_t title_id, bool blocking) {
}
|
|
|
|
void CommandProcessor::RequestFrameTrace(
|
|
const std::filesystem::path& root_path) {
|
|
if (trace_state_ == TraceState::kStreaming) {
|
|
XELOGE("Streaming trace; cannot also trace frame.");
|
|
return;
|
|
}
|
|
if (trace_state_ == TraceState::kSingleFrame) {
|
|
XELOGE("Frame trace already pending; ignoring.");
|
|
return;
|
|
}
|
|
trace_state_ = TraceState::kSingleFrame;
|
|
trace_frame_path_ = root_path;
|
|
}
|
|
|
|
void CommandProcessor::BeginTracing(const std::filesystem::path& root_path) {
|
|
if (trace_state_ == TraceState::kStreaming) {
|
|
XELOGE("Streaming already active; ignoring request.");
|
|
return;
|
|
}
|
|
if (trace_state_ == TraceState::kSingleFrame) {
|
|
XELOGE("Frame trace pending; ignoring streaming request.");
|
|
return;
|
|
}
|
|
// Streaming starts on the next primary buffer execute.
|
|
trace_state_ = TraceState::kStreaming;
|
|
trace_stream_path_ = root_path;
|
|
}
|
|
|
|
// Closes a streaming trace. A no-op when the writer is not open; only the
// streaming state is expected here (frame traces close themselves).
void CommandProcessor::EndTracing() {
  if (!trace_writer_.is_open()) {
    return;
  }
  assert_true(trace_state_ == TraceState::kStreaming);
  trace_state_ = TraceState::kDisabled;
  trace_writer_.Close();
}
|
|
|
|
// Restores a contiguous range of register values (savestate/trace playback).
// Out-of-range requests are clamped with a warning instead of faulting.
// |execute_callbacks| selects WriteRegister (register side effects run) vs. a
// raw memcpy into the register file.
void CommandProcessor::RestoreRegisters(uint32_t first_register,
                                        const uint32_t* register_values,
                                        uint32_t register_count,
                                        bool execute_callbacks) {
  // Subtraction form avoids overflow when first_register is large.
  if (first_register > RegisterFile::kRegisterCount ||
      RegisterFile::kRegisterCount - first_register < register_count) {
    XELOGW(
        "CommandProcessor::RestoreRegisters out of bounds (0x{:X} registers "
        "starting with 0x{:X}, while a total of 0x{:X} registers are stored)",
        register_count, first_register, RegisterFile::kRegisterCount);
    if (first_register > RegisterFile::kRegisterCount) {
      return;
    }
    // Write the prefix that does fit.
    register_count =
        std::min(uint32_t(RegisterFile::kRegisterCount) - first_register,
                 register_count);
  }
  if (execute_callbacks) {
    for (uint32_t i = 0; i < register_count; ++i) {
      WriteRegister(first_register + i, register_values[i]);
    }
  } else {
    std::memcpy(register_file_->values + first_register, register_values,
                sizeof(uint32_t) * register_count);
  }
}
|
|
|
|
// Replaces both gamma ramp tables (256-entry and 128-entry-per-channel PWL)
// plus the sequential read/write component index, then notifies the backend
// so it can re-upload the ramps.
void CommandProcessor::RestoreGammaRamp(
    const reg::DC_LUT_30_COLOR* new_gamma_ramp_256_entry_table,
    const reg::DC_LUT_PWL_DATA* new_gamma_ramp_pwl_rgb,
    uint32_t new_gamma_ramp_rw_component) {
  std::memcpy(gamma_ramp_256_entry_table_, new_gamma_ramp_256_entry_table,
              sizeof(reg::DC_LUT_30_COLOR) * 256);
  std::memcpy(gamma_ramp_pwl_rgb_, new_gamma_ramp_pwl_rgb,
              sizeof(reg::DC_LUT_PWL_DATA) * 3 * 128);
  gamma_ramp_rw_component_ = new_gamma_ramp_rw_component;
  OnGammaRamp256EntryTableValueWritten();
  OnGammaRampPWLValueWritten();
}
|
|
|
|
// Runs |fn| on the worker thread. If we are already on the worker thread and
// no earlier callbacks are queued (ordering must be preserved), run it
// inline; otherwise enqueue it for WorkerThreadMain to drain.
// NOTE(review): pending_fns_ is accessed here and in the worker loop with no
// visible lock — confirm callers provide external synchronization.
void CommandProcessor::CallInThread(std::function<void()> fn) {
  if (pending_fns_.empty() &&
      kernel::XThread::IsInThread(worker_thread_.get())) {
    fn();
  } else {
    pending_fns_.push(std::move(fn));
  }
}
|
|
|
|
// Base implementation has no caches; backends override to drop cached state.
void CommandProcessor::ClearCaches() {}
|
|
|
// Records the requested presentation post-effect and forwards it to the
// worker thread, which owns swap_post_effect_actual_.
void CommandProcessor::SetDesiredSwapPostEffect(
    SwapPostEffect swap_post_effect) {
  if (swap_post_effect_desired_ == swap_post_effect) {
    return;
  }
  swap_post_effect_desired_ = swap_post_effect;
  // Capture by value: the lambda may run after this call returns.
  CallInThread([this, swap_post_effect]() {
    swap_post_effect_actual_ = swap_post_effect;
  });
}
|
|
|
|
// Main loop of the "GPU Commands" worker thread: drains CallInThread
// callbacks, spins (then waits) until the write pointer moves past the read
// pointer, executes the primary ring buffer, and optionally writes the new
// read pointer back to guest memory.
void CommandProcessor::WorkerThreadMain() {
  if (!SetupContext()) {
    xe::FatalError("Unable to setup command processor internal state");
    return;
  }

  while (worker_running_) {
    // Run queued callbacks before touching the ring.
    while (!pending_fns_.empty()) {
      auto fn = std::move(pending_fns_.front());
      pending_fns_.pop();
      fn();
    }

    uint32_t write_ptr_index = write_ptr_index_.load();
    // 0xBAADF00D: the guest has not published a write pointer yet.
    if (write_ptr_index == 0xBAADF00D || read_ptr_index_ == write_ptr_index) {
      SCOPE_profile_cpu_i("gpu", "xe::gpu::CommandProcessor::Stall");
      // We've run out of commands to execute.
      // We spin here waiting for new ones, as the overhead of waiting on our
      // event is too high.
      PrepareForWait();
      uint32_t loop_count = 0;
      do {
        // If we spin around too much, revert to a "low-power" state.
        if (loop_count > 500) {
          const int wait_time_ms = 2;
          xe::threading::Wait(write_ptr_index_event_.get(), true,
                              std::chrono::milliseconds(wait_time_ms));
        } else {
          xe::threading::MaybeYield();
        }
        loop_count++;
        write_ptr_index = write_ptr_index_.load();
      } while (worker_running_ && pending_fns_.empty() &&
               (write_ptr_index == 0xBAADF00D ||
                read_ptr_index_ == write_ptr_index));
      ReturnFromWait();
      // Re-enter the loop to handle shutdown or queued callbacks first.
      if (!worker_running_ || !pending_fns_.empty()) {
        continue;
      }
    }
    assert_true(read_ptr_index_ != write_ptr_index);

    // Execute. Note that we handle wraparound transparently.
    read_ptr_index_ = ExecutePrimaryBuffer(read_ptr_index_, write_ptr_index);

    // TODO(benvanik): use reader->Read_update_freq_ and only issue after
    // moving that many indices.
    // Keep in mind that the gpu also updates the cpu-side copy if the write
    // pointer and read pointer would be equal
    if (read_ptr_writeback_ptr_) {
      xe::store_and_swap<uint32_t>(
          memory_->TranslatePhysical(read_ptr_writeback_ptr_), read_ptr_index_);
    }

    // FIXME: We're supposed to process the WAIT_UNTIL register at this point,
    // but no games seem to actually use it.
  }

  ShutdownContext();
}
|
|
|
|
// Suspends the worker thread and blocks until it is guaranteed to suspend.
// The fence is signaled by the worker immediately before it suspends itself;
// the lambda must not touch |fence| after Signal() because the fence lives on
// this stack frame.
void CommandProcessor::Pause() {
  if (paused_) {
    return;
  }
  paused_ = true;

  threading::Fence fence;
  CallInThread([&fence]() {
    fence.Signal();
    threading::Thread::GetCurrentThread()->Suspend();
  });

  fence.Wait();
}
|
|
|
|
// Resumes the worker thread previously suspended by Pause(). No-op when the
// command processor is not paused.
void CommandProcessor::Resume() {
  if (paused_) {
    paused_ = false;
    worker_thread_->thread()->Resume();
  }
}
|
|
|
|
// Serializes ring buffer state for savestates. Must be called while paused so
// the worker cannot mutate these fields concurrently. The field order is the
// wire format and must match Restore() exactly.
bool CommandProcessor::Save(ByteStream* stream) {
  assert_true(paused_);

  stream->Write<uint32_t>(primary_buffer_ptr_);
  stream->Write<uint32_t>(primary_buffer_size_);
  stream->Write<uint32_t>(read_ptr_index_);
  stream->Write<uint32_t>(read_ptr_update_freq_);
  stream->Write<uint32_t>(read_ptr_writeback_ptr_);
  stream->Write<uint32_t>(write_ptr_index_.load());

  return true;
}
|
|
|
|
// Deserializes ring buffer state saved by Save(); field order must match.
// Must be called while paused.
bool CommandProcessor::Restore(ByteStream* stream) {
  assert_true(paused_);

  primary_buffer_ptr_ = stream->Read<uint32_t>();
  primary_buffer_size_ = stream->Read<uint32_t>();
  read_ptr_index_ = stream->Read<uint32_t>();
  read_ptr_update_freq_ = stream->Read<uint32_t>();
  read_ptr_writeback_ptr_ = stream->Read<uint32_t>();
  write_ptr_index_.store(stream->Read<uint32_t>());

  return true;
}
|
|
|
|
// Backend hook run on the worker thread before command processing begins.
bool CommandProcessor::SetupContext() { return true; }

// Backend hook run on the worker thread after the main loop exits.
void CommandProcessor::ShutdownContext() {}
|
|
|
// Points the CP at the guest-allocated primary ring buffer and resets the
// read position. NOTE(review): the buffer byte size is 1 << (size_log2 + 3) —
// presumably |size_log2| counts 8-byte quadwords; confirm against CP_RB_CNTL
// documentation.
void CommandProcessor::InitializeRingBuffer(uint32_t ptr, uint32_t size_log2) {
  read_ptr_index_ = 0;
  primary_buffer_ptr_ = ptr;
  primary_buffer_size_ = uint32_t(1) << (size_log2 + 3);

  // Zero the ring so stale memory is never interpreted as packets.
  std::memset(kernel_state_->memory()->TranslatePhysical(primary_buffer_ptr_),
              0, primary_buffer_size_);
}
|
|
|
|
// Configures periodic write-back of the CP read pointer into guest memory.
void CommandProcessor::EnableReadPointerWriteBack(uint32_t ptr,
                                                  uint32_t block_size_log2) {
  // CP_RB_RPTR_ADDR Ring Buffer Read Pointer Address 0x70C
  // ptr = RB_RPTR_ADDR, pointer to write back the address to.
  read_ptr_writeback_ptr_ = ptr;
  // CP_RB_CNTL Ring Buffer Control 0x704
  // block_size = RB_BLKSZ, log2 of number of quadwords read between updates of
  // the read pointer.
  const uint32_t block_size_quadwords = uint32_t(1) << block_size_log2;
  read_ptr_update_freq_ = block_size_quadwords / 4;
}
|
|
|
|
// Logs the guest pseudo-stacktrace of the thread that moved the ring buffer
// read pointer to |value|. Requires the CPU backend to be able to populate a
// pseudo-stacktrace (stackpoints); logs nothing otherwise. Cold/noinline so
// this diagnostic path stays out of UpdateWritePointer's hot path.
XE_NOINLINE XE_COLD void CommandProcessor::LogKickoffInitator(uint32_t value) {
  cpu::backend::GuestPseudoStackTrace st;

  if (logging::internal::ShouldLog(LogLevel::Debug) &&
      kernel_state_->processor()->backend()->PopulatePseudoStacktrace(&st)) {
    // Batch all lines into the thread's scratch buffer, submitted as one
    // log entry below.
    logging::LoggerBatch<LogLevel::Debug> log_initiator{};

    log_initiator("Updating read ptr to {}, initiator stacktrace below\n",
                  value);

    for (uint32_t i = 0; i < st.count; ++i) {
      log_initiator("\t{:08X}\n", st.return_addrs[i]);
    }

    if (st.truncated_flag) {
      log_initiator("\t(Truncated stacktrace to {} entries)\n",
                    cpu::backend::MAX_GUEST_PSEUDO_STACKTRACE_ENTRIES);
    }
    // 'd' — same prefix character used for other Debug-level log lines here.
    log_initiator.submit('d');
  }
}
|
|
|
|
// Called when the guest publishes a new write pointer; wakes the worker
// thread (with a priority boost) so it starts consuming promptly.
void CommandProcessor::UpdateWritePointer(uint32_t value) {
  // Cold diagnostic path: dump the backtrace of the initiating guest thread.
  XE_UNLIKELY_IF (cvars::log_ringbuffer_kickoff_initiator_bts) {
    LogKickoffInitator(value);
  }
  write_ptr_index_ = value;
  write_ptr_index_event_->SetBoostPriority();
}
|
|
|
|
// Logs a single guest-driven register write. Compiled in only for debug
// builds; additionally gated on the
// log_guest_driven_gpu_register_written_values cvar and LogLevel::Debug.
void CommandProcessor::LogRegisterSet(uint32_t register_index, uint32_t value) {
#if XE_ENABLE_GPU_REG_WRITE_LOGGING == 1
  if (cvars::log_guest_driven_gpu_register_written_values &&
      logging::internal::ShouldLog(LogLevel::Debug)) {
    const RegisterInfo* reginfo = RegisterFile::GetRegisterInfo(register_index);

    if (!reginfo) {
      XELOGD("Unknown_Reg{:04X} <- {:08X}\n", register_index, value);
    } else {
      XELOGD("{} <- {:08X}\n", reginfo->name, value);
    }
  }
#endif
}
|
|
|
|
// Logs a contiguous run of guest-driven register writes (values big-endian in
// guest memory) as a single batched log entry built in the thread's scratch
// buffer. Debug builds only; gated on the
// log_guest_driven_gpu_register_written_values cvar and LogLevel::Debug.
// Once the scratch buffer is exhausted, further output is silently dropped.
void CommandProcessor::LogRegisterSets(uint32_t base_register_index,
                                       const uint32_t* values,
                                       uint32_t n_values) {
#if XE_ENABLE_GPU_REG_WRITE_LOGGING == 1
  if (cvars::log_guest_driven_gpu_register_written_values &&
      logging::internal::ShouldLog(LogLevel::Debug)) {
    auto target = logging::internal::GetThreadBuffer();
    auto target_ptr = target.first;
    // Characters actually emitted into the buffer so far.
    size_t total_size = 0;
    // Remaining capacity of the thread-local scratch buffer.
    size_t rem_size = target.second;
    // fmt::format_to_n's result.size is the size the output *would* have had
    // without truncation. The original code subtracted it from rem_size
    // directly, so once the buffer filled up rem_size (size_t) underflowed to
    // a huge value and total_size overcounted; clamp to what actually fit so
    // truncation degrades to "silently stop" as intended.
    auto advance = [&](auto format_result) {
      size_t written =
          std::min(static_cast<size_t>(format_result.size), rem_size);
      target_ptr = format_result.out;
      rem_size -= written;
      total_size += written;
    };

    for (uint32_t i = 0; i < n_values; ++i) {
      uint32_t register_index = base_register_index + i;

      uint32_t value = xe::load_and_swap<uint32_t>(&values[i]);

      const RegisterInfo* reginfo =
          RegisterFile::GetRegisterInfo(register_index);

      if (!reginfo) {
        advance(fmt::format_to_n(target_ptr, rem_size,
                                 "Unknown_Reg{:04X} <- {:08X}\n",
                                 register_index, value));
      } else {
        advance(fmt::format_to_n(target_ptr, rem_size, "{} <- {:08X}\n",
                                 reginfo->name, value));
      }
    }
    logging::internal::AppendLogLine(LogLevel::Debug, 'd', total_size);
  }
#endif
}
|
|
|
|
// Handles the few register writes with side effects beyond storing the value:
// scratch register write-back to guest memory, COHER status latching, and the
// DC LUT (gamma ramp) access registers. Called by WriteRegister only for the
// indices matched by its pre-test predicate.
void CommandProcessor::HandleSpecialRegisterWrite(uint32_t index,
                                                  uint32_t value) {
  RegisterFile& regs = *register_file_;
  // Scratch register writeback.
  if (index >= XE_GPU_REG_SCRATCH_REG0 && index <= XE_GPU_REG_SCRATCH_REG7) {
    uint32_t scratch_reg = index - XE_GPU_REG_SCRATCH_REG0;
    // SCRATCH_UMSK holds a per-register enable bitmask.
    if ((1 << scratch_reg) & regs.values[XE_GPU_REG_SCRATCH_UMSK].u32) {
      // Enabled - write to address.
      uint32_t scratch_addr = regs.values[XE_GPU_REG_SCRATCH_ADDR].u32;
      uint32_t mem_addr = scratch_addr + (scratch_reg * 4);
      xe::store_and_swap<uint32_t>(memory_->TranslatePhysical(mem_addr), value);
    }
  } else {
    switch (index) {
      // If this is a COHER register, set the dirty flag.
      // This will block the command processor the next time it WAIT_MEM_REGs
      // and allow us to synchronize the memory.
      case XE_GPU_REG_COHER_STATUS_HOST: {
        regs.values[index].u32 |= UINT32_C(0x80000000);
      } break;

      case XE_GPU_REG_DC_LUT_RW_INDEX: {
        // Reset the sequential read / write component index (see the M56
        // DC_LUT_SEQ_COLOR documentation).
        gamma_ramp_rw_component_ = 0;
      } break;

      case XE_GPU_REG_DC_LUT_SEQ_COLOR: {
        // Should be in the 256-entry table writing mode.
        assert_zero(regs[XE_GPU_REG_DC_LUT_RW_MODE].u32 & 0b1);
        auto& gamma_ramp_rw_index = regs.Get<reg::DC_LUT_RW_INDEX>();
        // DC_LUT_SEQ_COLOR is in the red, green, blue order, but the write
        // enable mask is blue, green, red.
        bool write_gamma_ramp_component =
            (regs[XE_GPU_REG_DC_LUT_WRITE_EN_MASK].u32 &
             (UINT32_C(1) << (2 - gamma_ramp_rw_component_))) != 0;
        if (write_gamma_ramp_component) {
          reg::DC_LUT_30_COLOR& gamma_ramp_entry =
              gamma_ramp_256_entry_table_[gamma_ramp_rw_index.rw_index];
          // Bits 0:5 are hardwired to zero.
          uint32_t gamma_ramp_seq_color =
              regs.Get<reg::DC_LUT_SEQ_COLOR>().seq_color >> 6;
          switch (gamma_ramp_rw_component_) {
            case 0:
              gamma_ramp_entry.color_10_red = gamma_ramp_seq_color;
              break;
            case 1:
              gamma_ramp_entry.color_10_green = gamma_ramp_seq_color;
              break;
            case 2:
              gamma_ramp_entry.color_10_blue = gamma_ramp_seq_color;
              break;
          }
        }
        // After the third component, advance to the next table entry.
        if (++gamma_ramp_rw_component_ >= 3) {
          gamma_ramp_rw_component_ = 0;
          ++gamma_ramp_rw_index.rw_index;
        }
        if (write_gamma_ramp_component) {
          OnGammaRamp256EntryTableValueWritten();
        }
      } break;

      case XE_GPU_REG_DC_LUT_PWL_DATA: {
        // Should be in the PWL writing mode.
        assert_not_zero(regs[XE_GPU_REG_DC_LUT_RW_MODE].u32 & 0b1);
        auto& gamma_ramp_rw_index = regs.Get<reg::DC_LUT_RW_INDEX>();
        // Bit 7 of the index is ignored for PWL.
        uint32_t gamma_ramp_rw_index_pwl = gamma_ramp_rw_index.rw_index & 0x7F;
        // DC_LUT_PWL_DATA is likely in the red, green, blue order because
        // DC_LUT_SEQ_COLOR is, but the write enable mask is blue, green, red.
        bool write_gamma_ramp_component =
            (regs[XE_GPU_REG_DC_LUT_WRITE_EN_MASK].u32 &
             (UINT32_C(1) << (2 - gamma_ramp_rw_component_))) != 0;
        if (write_gamma_ramp_component) {
          reg::DC_LUT_PWL_DATA& gamma_ramp_entry =
              gamma_ramp_pwl_rgb_[gamma_ramp_rw_index_pwl]
                                 [gamma_ramp_rw_component_];
          auto gamma_ramp_value = regs.Get<reg::DC_LUT_PWL_DATA>();
          // Bits 0:5 are hardwired to zero.
          gamma_ramp_entry.base = gamma_ramp_value.base & ~UINT32_C(0x3F);
          gamma_ramp_entry.delta = gamma_ramp_value.delta & ~UINT32_C(0x3F);
        }
        if (++gamma_ramp_rw_component_ >= 3) {
          gamma_ramp_rw_component_ = 0;
          // TODO(Triang3l): Should this increase beyond 7 bits for PWL?
          // Direct3D 9 explicitly sets rw_index to 0x80 after writing the last
          // PWL entry. However, the DC_LUT_RW_INDEX documentation says that for
          // PWL, the bit 7 is ignored.
          gamma_ramp_rw_index.rw_index =
              (gamma_ramp_rw_index.rw_index & ~UINT32_C(0x7F)) |
              ((gamma_ramp_rw_index_pwl + 1) & 0x7F);
        }
        if (write_gamma_ramp_component) {
          OnGammaRampPWLValueWritten();
        }
      } break;

      case XE_GPU_REG_DC_LUT_30_COLOR: {
        // Should be in the 256-entry table writing mode.
        assert_zero(regs[XE_GPU_REG_DC_LUT_RW_MODE].u32 & 0b1);
        auto& gamma_ramp_rw_index = regs.Get<reg::DC_LUT_RW_INDEX>();
        uint32_t gamma_ramp_write_enable_mask =
            regs[XE_GPU_REG_DC_LUT_WRITE_EN_MASK].u32 & 0b111;
        if (gamma_ramp_write_enable_mask) {
          reg::DC_LUT_30_COLOR& gamma_ramp_entry =
              gamma_ramp_256_entry_table_[gamma_ramp_rw_index.rw_index];
          auto gamma_ramp_value = regs.Get<reg::DC_LUT_30_COLOR>();
          if (gamma_ramp_write_enable_mask & 0b001) {
            gamma_ramp_entry.color_10_blue = gamma_ramp_value.color_10_blue;
          }
          if (gamma_ramp_write_enable_mask & 0b010) {
            gamma_ramp_entry.color_10_green = gamma_ramp_value.color_10_green;
          }
          if (gamma_ramp_write_enable_mask & 0b100) {
            gamma_ramp_entry.color_10_red = gamma_ramp_value.color_10_red;
          }
        }
        ++gamma_ramp_rw_index.rw_index;
        // TODO(Triang3l): Should this reset the component write index? If this
        // increase is assumed to behave like a full DC_LUT_RW_INDEX write, it
        // probably should.
        gamma_ramp_rw_component_ = 0;
        if (gamma_ramp_write_enable_mask) {
          OnGammaRamp256EntryTableValueWritten();
        }
      } break;
    }
  }
}
|
|
// Stores |value| into GPU register |index|, then dispatches to
// HandleSpecialRegisterWrite for the few registers with side effects
// (scratch writeback, COHER status, DC LUT access). Out-of-bounds indices
// are logged and ignored.
void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
  // chrispy: rearrange check order, place set after checks

  if (XE_LIKELY(index < RegisterFile::kRegisterCount)) {
    register_file_->values[index].u32 = value;

    // quick pre-test
    // todo: figure out just how unlikely this is. if very (it ought to be,
    // theres a ton of registers other than these) make this predicate
    // branchless and mark with unlikely, then make HandleSpecialRegisterWrite
    // noinline yep, its very unlikely. these ORS here are meant to be bitwise
    // ors, so that we do not do branching evaluation of the conditions (we will
    // almost always take all of the branches)

    // Note: "index - XE_GPU_REG_SCRATCH_REG0 < 8" relies on unsigned
    // wraparound to also reject indices below SCRATCH_REG0 in one compare;
    // likewise for the DC_LUT range check.
    unsigned expr = (index - XE_GPU_REG_SCRATCH_REG0 < 8) |
                    (index == XE_GPU_REG_COHER_STATUS_HOST) |
                    ((index - XE_GPU_REG_DC_LUT_RW_INDEX) <=
                     (XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX));
    // chrispy: reordered for msvc branch probability (assumes if is taken and
    // else is not)
    if (XE_LIKELY(expr == 0)) {
      XE_MSVC_REORDER_BARRIER();

    } else {
      HandleSpecialRegisterWrite(index, value);
    }
  } else {
    XELOGW("CommandProcessor::WriteRegister index out of bounds: {}", index);
    return;
  }
}
|
|
void CommandProcessor::WriteRegistersFromMem(uint32_t start_index,
|
|
uint32_t* base,
|
|
uint32_t num_registers) {
|
|
for (uint32_t i = 0; i < num_registers; ++i) {
|
|
uint32_t data = xe::load_and_swap<uint32_t>(base + i);
|
|
this->WriteRegister(start_index + i, data);
|
|
}
|
|
}
|
|
|
|
// Writes |num_registers| consecutive registers starting at |base|, consuming
// one big-endian value per register from |ring|.
void CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring,
                                                  uint32_t base,
                                                  uint32_t num_registers) {
  for (uint32_t reg_offset = 0; reg_offset < num_registers; ++reg_offset) {
    WriteRegister(base + reg_offset, ring->ReadAndSwap<uint32_t>());
  }
}
|
|
|
|
// Thin wrappers that remap a space-relative base into the absolute register
// index space before streaming values from the ring:
// ALU constants +0x4000, fetch constants +0x4800, bool constants +0x4900,
// loop constants +0x4908, raw registers +0x2000.
void CommandProcessor::WriteALURangeFromRing(xe::RingBuffer* ring,
                                             uint32_t base,
                                             uint32_t num_times) {
  WriteRegisterRangeFromRing(ring, base + 0x4000, num_times);
}

void CommandProcessor::WriteFetchRangeFromRing(xe::RingBuffer* ring,
                                               uint32_t base,
                                               uint32_t num_times) {
  WriteRegisterRangeFromRing(ring, base + 0x4800, num_times);
}

void CommandProcessor::WriteBoolRangeFromRing(xe::RingBuffer* ring,
                                              uint32_t base,
                                              uint32_t num_times) {
  WriteRegisterRangeFromRing(ring, base + 0x4900, num_times);
}

void CommandProcessor::WriteLoopRangeFromRing(xe::RingBuffer* ring,
                                              uint32_t base,
                                              uint32_t num_times) {
  WriteRegisterRangeFromRing(ring, base + 0x4908, num_times);
}

void CommandProcessor::WriteREGISTERSRangeFromRing(xe::RingBuffer* ring,
                                                   uint32_t base,
                                                   uint32_t num_times) {
  WriteRegisterRangeFromRing(ring, base + 0x2000, num_times);
}
|
|
|
|
// Memory-sourced counterparts of the ring wrappers above: same base offsets
// (+0x4000 ALU, +0x4800 fetch, +0x4900 bool, +0x4908 loop, +0x2000 raw), with
// values read big-endian from guest memory at |base|.
void CommandProcessor::WriteALURangeFromMem(uint32_t start_index,
                                            uint32_t* base,
                                            uint32_t num_registers) {
  WriteRegistersFromMem(start_index + 0x4000, base, num_registers);
}

void CommandProcessor::WriteFetchRangeFromMem(uint32_t start_index,
                                              uint32_t* base,
                                              uint32_t num_registers) {
  WriteRegistersFromMem(start_index + 0x4800, base, num_registers);
}

void CommandProcessor::WriteBoolRangeFromMem(uint32_t start_index,
                                             uint32_t* base,
                                             uint32_t num_registers) {
  WriteRegistersFromMem(start_index + 0x4900, base, num_registers);
}

void CommandProcessor::WriteLoopRangeFromMem(uint32_t start_index,
                                             uint32_t* base,
                                             uint32_t num_registers) {
  WriteRegistersFromMem(start_index + 0x4908, base, num_registers);
}

void CommandProcessor::WriteREGISTERSRangeFromMem(uint32_t start_index,
                                                  uint32_t* base,
                                                  uint32_t num_registers) {
  WriteRegistersFromMem(start_index + 0x2000, base, num_registers);
}
|
|
XE_NOINLINE
// Writes |num_times| successive ring values into the single register |base|
// (each value overwrites the previous, and any side effects fire per write).
void CommandProcessor::WriteOneRegisterFromRing(uint32_t base,
                                                uint32_t num_times) {
  for (uint32_t i = 0; i < num_times; ++i) {
    uint32_t swapped_value = reader_.ReadAndSwap<uint32_t>();
    WriteRegister(base, swapped_value);
  }
}
|
|
// Services a coherency request latched in COHER_STATUS_HOST (see
// HandleSpecialRegisterWrite): logs the requested range/action and clears the
// status bit to acknowledge. No host-side caches are invalidated here.
void CommandProcessor::MakeCoherent() {
  SCOPE_profile_cpu_f("gpu");

  // Status host often has 0x01000000 or 0x03000000.
  // This is likely toggling VC (vertex cache) or TC (texture cache).
  // Or, it also has a direction in here maybe - there is probably
  // some way to check for dest coherency (what all the COHER_DEST_BASE_*
  // registers are for).
  // Best docs I've found on this are here:
  // https://web.archive.org/web/20160711162346/https://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2013/10/R6xx_R7xx_3D.pdf
  // https://cgit.freedesktop.org/xorg/driver/xf86-video-radeonhd/tree/src/r6xx_accel.c?id=3f8b6eccd9dba116cc4801e7f80ce21a879c67d2#n454

  RegisterFile* regs = register_file_;
  auto& status_host = regs->Get<reg::COHER_STATUS_HOST>();
  auto base_host = regs->values[XE_GPU_REG_COHER_BASE_HOST].u32;
  auto size_host = regs->values[XE_GPU_REG_COHER_SIZE_HOST].u32;

  // Nothing pending.
  if (!status_host.status) {
    return;
  }

  const char* action = "N/A";
  if (status_host.vc_action_ena && status_host.tc_action_ena) {
    action = "VC | TC";
  } else if (status_host.tc_action_ena) {
    action = "TC";
  } else if (status_host.vc_action_ena) {
    action = "VC";
  }

  // TODO(benvanik): notify resource cache of base->size and type.
  XELOGD("Make {:08X} -> {:08X} ({}b) coherent, action = {}", base_host,
         base_host + size_host, size_host, action);

  // Mark coherent.
  status_host.status = 0;
}
|
|
|
|
// Called before the worker thread enters its stall loop; flushes the trace
// writer so trace data is not held back during a long idle.
void CommandProcessor::PrepareForWait() { trace_writer_.Flush(); }

// Called when the worker thread leaves its stall loop; backends may override.
void CommandProcessor::ReturnFromWait() {}
|
|
|
|
|
// Writes the trace prologue: a full register file snapshot plus the current
// gamma ramp state, so playback can reconstruct GPU state exactly.
void CommandProcessor::InitializeTrace() {
  // Write the initial register values, to be loaded directly into the
  // RegisterFile since all registers, including those that may have side
  // effects on setting, will be saved.
  trace_writer_.WriteRegisters(
      0, reinterpret_cast<const uint32_t*>(register_file_->values),
      RegisterFile::kRegisterCount, false);

  trace_writer_.WriteGammaRamp(gamma_ramp_256_entry_table(),
                               gamma_ramp_pwl_rgb(), gamma_ramp_rw_component_);
}
|
|
#define COMMAND_PROCESSOR CommandProcessor
|
|
#include "pm4_command_processor_implement.h"
|
|
} // namespace gpu
|
|
} // namespace xe
|