/**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
 * Copyright 2020 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */

#include "xenia/gpu/d3d12/pipeline_cache.h"
|
|
|
|
#include <algorithm>
|
|
#include <atomic>
|
|
#include <cinttypes>
|
|
#include <cmath>
|
|
#include <cstring>
|
|
#include <deque>
|
|
#include <mutex>
|
|
#include <set>
|
|
#include <utility>
|
|
#include <vector>
|
|
|
|
#include "third_party/fmt/include/fmt/format.h"
|
|
#include "xenia/base/assert.h"
|
|
#include "xenia/base/byte_order.h"
|
|
#include "xenia/base/clock.h"
|
|
#include "xenia/base/cvar.h"
|
|
#include "xenia/base/filesystem.h"
|
|
#include "xenia/base/logging.h"
|
|
#include "xenia/base/math.h"
|
|
#include "xenia/base/profiling.h"
|
|
#include "xenia/base/string.h"
|
|
#include "xenia/base/string_buffer.h"
|
|
#include "xenia/base/xxhash.h"
|
|
#include "xenia/gpu/d3d12/d3d12_command_processor.h"
|
|
#include "xenia/gpu/d3d12/d3d12_render_target_cache.h"
|
|
#include "xenia/gpu/draw_util.h"
|
|
#include "xenia/gpu/gpu_flags.h"
|
|
#include "xenia/gpu/xenos.h"
|
|
#include "xenia/ui/d3d12/d3d12_util.h"
|
|
|
|
DEFINE_bool(d3d12_dxbc_disasm, false,
            "Disassemble DXBC shaders after generation.", "D3D12");
DEFINE_bool(
    d3d12_dxbc_disasm_dxilconv, false,
    "Disassemble DXBC shaders after conversion to DXIL, if DXIL shaders are "
    "supported by the OS, and DirectX Shader Compiler DLLs available at "
    "https://github.com/microsoft/DirectXShaderCompiler/releases are present.",
    "D3D12");
DEFINE_int32(
    d3d12_pipeline_creation_threads, -1,
    "Number of threads used for graphics pipeline creation. -1 to calculate "
    "automatically (75% of logical CPU cores), a positive number to specify "
    "the number of threads explicitly (up to the number of logical CPU cores), "
    "0 to disable multithreaded pipeline creation.",
    "D3D12");
DEFINE_bool(d3d12_tessellation_wireframe, false,
            "Display tessellated surfaces as wireframe for debugging.",
            "D3D12");

namespace xe {
namespace gpu {
namespace d3d12 {

// Generated with `xb buildshaders`.
namespace shaders {
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/adaptive_quad_hs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/adaptive_triangle_hs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/continuous_quad_hs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/continuous_triangle_hs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/discrete_quad_hs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/discrete_triangle_hs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/float24_round_ps.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/float24_truncate_ps.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/primitive_point_list_gs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/primitive_quad_list_gs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/primitive_rectangle_list_gs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/tessellation_adaptive_vs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/tessellation_indexed_vs.h"
}  // namespace shaders

PipelineCache::PipelineCache(D3D12CommandProcessor& command_processor,
                             const RegisterFile& register_file,
                             const D3D12RenderTargetCache& render_target_cache,
                             bool bindless_resources_used)
    : command_processor_(command_processor),
      register_file_(register_file),
      render_target_cache_(render_target_cache),
      bindless_resources_used_(bindless_resources_used) {
  auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();

  bool edram_rov_used = render_target_cache.GetPath() ==
                        RenderTargetCache::Path::kPixelShaderInterlock;

  shader_translator_ = std::make_unique<DxbcShaderTranslator>(
      provider.GetAdapterVendorID(), bindless_resources_used_, edram_rov_used,
      render_target_cache_.gamma_render_target_as_srgb(),
      render_target_cache_.msaa_2x_supported(),
      render_target_cache_.GetResolutionScaleX(),
      render_target_cache_.GetResolutionScaleY(),
      provider.GetGraphicsAnalysis() != nullptr);

  if (edram_rov_used) {
    depth_only_pixel_shader_ =
        std::move(shader_translator_->CreateDepthOnlyPixelShader());
  }
}

PipelineCache::~PipelineCache() { Shutdown(); }

bool PipelineCache::Initialize() {
  auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();

  // Initialize the command processor thread DXIL objects.
  dxbc_converter_ = nullptr;
  dxc_utils_ = nullptr;
  dxc_compiler_ = nullptr;
  if (cvars::d3d12_dxbc_disasm_dxilconv) {
    if (FAILED(provider.DxbcConverterCreateInstance(
            CLSID_DxbcConverter, IID_PPV_ARGS(&dxbc_converter_)))) {
      XELOGE(
          "Failed to create DxbcConverter, converted DXIL disassembly for "
          "debugging will be unavailable");
    }
    if (FAILED(provider.DxcCreateInstance(CLSID_DxcUtils,
                                          IID_PPV_ARGS(&dxc_utils_)))) {
      XELOGE(
          "Failed to create DxcUtils, converted DXIL disassembly for debugging "
          "will be unavailable");
    }
    if (FAILED(provider.DxcCreateInstance(CLSID_DxcCompiler,
                                          IID_PPV_ARGS(&dxc_compiler_)))) {
      XELOGE(
          "Failed to create DxcCompiler, converted DXIL disassembly for "
          "debugging will be unavailable");
    }
  }

  uint32_t logical_processor_count = xe::threading::logical_processor_count();
  if (!logical_processor_count) {
    // Pick some reasonable amount if couldn't determine the number of cores.
    logical_processor_count = 6;
  }
  // Initialize creation thread synchronization data even if not using creation
  // threads because they may be used anyway to create pipelines from the
  // storage.
  creation_threads_busy_ = 0;
  creation_completion_event_ =
      xe::threading::Event::CreateManualResetEvent(true);
  creation_completion_set_event_ = false;
  creation_threads_shutdown_from_ = SIZE_MAX;
  if (cvars::d3d12_pipeline_creation_threads != 0) {
    size_t creation_thread_count;
    if (cvars::d3d12_pipeline_creation_threads < 0) {
      creation_thread_count =
          std::max(logical_processor_count * 3 / 4, uint32_t(1));
    } else {
      creation_thread_count =
          std::min(uint32_t(cvars::d3d12_pipeline_creation_threads),
                   logical_processor_count);
    }
    for (size_t i = 0; i < creation_thread_count; ++i) {
      std::unique_ptr<xe::threading::Thread> creation_thread =
          xe::threading::Thread::Create({}, [this, i]() { CreationThread(i); });
      creation_thread->set_name("D3D12 Pipelines");
      creation_threads_.push_back(std::move(creation_thread));
    }
  }
  return true;
}

void PipelineCache::Shutdown() {
  ClearCache(true);

  // Shut down all threads.
  if (!creation_threads_.empty()) {
    {
      std::lock_guard<std::mutex> lock(creation_request_lock_);
      creation_threads_shutdown_from_ = 0;
    }
    creation_request_cond_.notify_all();
    for (size_t i = 0; i < creation_threads_.size(); ++i) {
      xe::threading::Wait(creation_threads_[i].get(), false);
    }
    creation_threads_.clear();
  }
  creation_completion_event_.reset();

  ui::d3d12::util::ReleaseAndNull(dxc_compiler_);
  ui::d3d12::util::ReleaseAndNull(dxc_utils_);
  ui::d3d12::util::ReleaseAndNull(dxbc_converter_);
}

void PipelineCache::ClearCache(bool shutting_down) {
  bool reinitialize_shader_storage =
      !shutting_down && storage_write_thread_ != nullptr;
  std::filesystem::path shader_storage_cache_root;
  uint32_t shader_storage_title_id = shader_storage_title_id_;
  if (reinitialize_shader_storage) {
    shader_storage_cache_root = shader_storage_cache_root_;
  }
  ShutdownShaderStorage();

  // Remove references to the current pipeline.
  current_pipeline_ = nullptr;

  if (!creation_threads_.empty()) {
    // Empty the pipeline creation queue and make sure there are no threads
    // currently creating pipelines because pipelines are going to be deleted.
    bool await_creation_completion_event = false;
    {
      std::lock_guard<std::mutex> lock(creation_request_lock_);
      creation_queue_.clear();
      await_creation_completion_event = creation_threads_busy_ != 0;
      if (await_creation_completion_event) {
        creation_completion_event_->Reset();
        creation_completion_set_event_ = true;
      }
    }
    if (await_creation_completion_event) {
      creation_request_cond_.notify_one();
      xe::threading::Wait(creation_completion_event_.get(), false);
    }
  }

  // Destroy all pipelines.
  for (auto it : pipelines_) {
    it.second->state->Release();
    delete it.second;
  }
  pipelines_.clear();
  COUNT_profile_set("gpu/pipeline_cache/pipelines", 0);

  // Destroy all shaders.
  command_processor_.NotifyShaderBindingsLayoutUIDsInvalidated();
  if (bindless_resources_used_) {
    bindless_sampler_layout_map_.clear();
    bindless_sampler_layouts_.clear();
  }
  texture_binding_layout_map_.clear();
  texture_binding_layouts_.clear();
  for (auto it : shaders_) {
    delete it.second;
  }
  shaders_.clear();
  shader_storage_index_ = 0;

  if (reinitialize_shader_storage) {
    InitializeShaderStorage(shader_storage_cache_root, shader_storage_title_id,
                            false);
  }
}

void PipelineCache::InitializeShaderStorage(
    const std::filesystem::path& cache_root, uint32_t title_id, bool blocking) {
  ShutdownShaderStorage();

  auto shader_storage_root = cache_root / "shaders";
  // For files that can be moved between different hosts.
  // Host PSO blobs - if ever added - should be stored in shaders/local/ (they
  // currently aren't used because they may not be very practical - they would
  // likely need to be invalidated on every commit, and add I/O cost - though
  // D3D's internal validation would possibly be enough to ensure they are up
  // to date).
  auto shader_storage_shareable_root = shader_storage_root / "shareable";
  if (!std::filesystem::exists(shader_storage_shareable_root)) {
    if (!std::filesystem::create_directories(shader_storage_shareable_root)) {
      XELOGE(
          "Failed to create the shareable shader storage directory, persistent "
          "shader storage will be disabled: {}",
          xe::path_to_utf8(shader_storage_shareable_root));
      return;
    }
  }
|
|
|
|
bool edram_rov_used = render_target_cache_.GetPath() ==
|
|
RenderTargetCache::Path::kPixelShaderInterlock;
|
|
|
|
// Initialize the pipeline storage stream - read pipeline descriptions and
|
|
// collect used shader modifications to translate.
|
|
std::vector<PipelineStoredDescription> pipeline_stored_descriptions;
|
|
// <Shader hash, modification bits>.
|
|
std::set<std::pair<uint64_t, uint64_t>> shader_translations_needed;
|
|
auto pipeline_storage_file_path =
|
|
shader_storage_shareable_root /
|
|
fmt::format("{:08X}.{}.d3d12.xpso", title_id,
|
|
edram_rov_used ? "rov" : "rtv");
|
|
pipeline_storage_file_ =
|
|
xe::filesystem::OpenFile(pipeline_storage_file_path, "a+b");
|
|
if (!pipeline_storage_file_) {
|
|
XELOGE(
|
|
"Failed to open the Direct3D 12 pipeline description storage file for "
|
|
"writing, persistent shader storage will be disabled: {}",
|
|
xe::path_to_utf8(pipeline_storage_file_path));
|
|
return;
|
|
}
|
|
pipeline_storage_file_flush_needed_ = false;
|
|
// 'XEPS'.
|
|
const uint32_t pipeline_storage_magic = 0x53504558;
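  // (The magic values are the ASCII tags read as little-endian uint32_t.)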
|
|
// 'DXRO' or 'DXRT'.
|
|
const uint32_t pipeline_storage_magic_api =
|
|
edram_rov_used ? 0x4F525844 : 0x54525844;
|
|
const uint32_t pipeline_storage_version_swapped =
|
|
xe::byte_swap(std::max(PipelineDescription::kVersion,
|
|
DxbcShaderTranslator::Modification::kVersion));
|
|
struct {
|
|
uint32_t magic;
|
|
uint32_t magic_api;
|
|
uint32_t version_swapped;
|
|
} pipeline_storage_file_header;
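  // On-disk layout: this 12-byte header is followed by zero or more
  // PipelineStoredDescription records, each carrying an XXH3 hash of its
  // description for integrity checking.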
|
|
if (fread(&pipeline_storage_file_header, sizeof(pipeline_storage_file_header),
|
|
1, pipeline_storage_file_) &&
|
|
pipeline_storage_file_header.magic == pipeline_storage_magic &&
|
|
pipeline_storage_file_header.magic_api == pipeline_storage_magic_api &&
|
|
pipeline_storage_file_header.version_swapped ==
|
|
pipeline_storage_version_swapped) {
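    // Determine how many complete PipelineStoredDescription records follow the
    // header from the file size; a trailing partial record is ignored.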
|
|
xe::filesystem::Seek(pipeline_storage_file_, 0, SEEK_END);
|
|
int64_t pipeline_storage_told_end =
|
|
xe::filesystem::Tell(pipeline_storage_file_);
|
|
size_t pipeline_storage_told_count =
|
|
size_t(pipeline_storage_told_end >=
|
|
int64_t(sizeof(pipeline_storage_file_header))
|
|
? (uint64_t(pipeline_storage_told_end) -
|
|
sizeof(pipeline_storage_file_header)) /
|
|
sizeof(PipelineStoredDescription)
|
|
: 0);
|
|
if (pipeline_storage_told_count &&
|
|
xe::filesystem::Seek(pipeline_storage_file_,
|
|
int64_t(sizeof(pipeline_storage_file_header)),
|
|
SEEK_SET)) {
|
|
pipeline_stored_descriptions.resize(pipeline_storage_told_count);
|
|
pipeline_stored_descriptions.resize(
|
|
fread(pipeline_stored_descriptions.data(),
|
|
sizeof(PipelineStoredDescription), pipeline_storage_told_count,
|
|
pipeline_storage_file_));
|
|
size_t pipeline_storage_read_count = pipeline_stored_descriptions.size();
|
|
for (size_t i = 0; i < pipeline_storage_read_count; ++i) {
|
|
const PipelineStoredDescription& pipeline_stored_description =
|
|
pipeline_stored_descriptions[i];
|
|
// Validate file integrity, stop and truncate the stream if data is
|
|
// corrupted.
|
|
if (XXH3_64bits(&pipeline_stored_description.description,
|
|
sizeof(pipeline_stored_description.description)) !=
|
|
pipeline_stored_description.description_hash) {
|
|
pipeline_stored_descriptions.resize(i);
|
|
break;
|
|
}
|
|
// TODO(Triang3l): On Vulkan, skip pipelines requiring unsupported
|
|
// device features (to keep the cache files mostly shareable across
|
|
// devices).
|
|
// Mark the shader modifications as needed for translation.
|
|
shader_translations_needed.emplace(
|
|
pipeline_stored_description.description.vertex_shader_hash,
|
|
pipeline_stored_description.description.vertex_shader_modification);
|
|
if (pipeline_stored_description.description.pixel_shader_hash) {
|
|
shader_translations_needed.emplace(
|
|
pipeline_stored_description.description.pixel_shader_hash,
|
|
pipeline_stored_description.description
|
|
.pixel_shader_modification);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
size_t logical_processor_count = xe::threading::logical_processor_count();
|
|
if (!logical_processor_count) {
|
|
// Pick some reasonable amount if couldn't determine the number of cores.
|
|
logical_processor_count = 6;
|
|
}
|
|
|
|
// Initialize the Xenos shader storage stream.
|
|
uint64_t shader_storage_initialization_start =
|
|
xe::Clock::QueryHostTickCount();
|
|
auto shader_storage_file_path =
|
|
shader_storage_shareable_root / fmt::format("{:08X}.xsh", title_id);
|
|
shader_storage_file_ =
|
|
xe::filesystem::OpenFile(shader_storage_file_path, "a+b");
|
|
if (!shader_storage_file_) {
|
|
XELOGE(
|
|
"Failed to open the guest shader storage file for writing, persistent "
|
|
"shader storage will be disabled: {}",
|
|
xe::path_to_utf8(shader_storage_file_path));
|
|
fclose(pipeline_storage_file_);
|
|
pipeline_storage_file_ = nullptr;
|
|
return;
|
|
}
|
|
++shader_storage_index_;
|
|
shader_storage_file_flush_needed_ = false;
|
|
struct {
|
|
uint32_t magic;
|
|
uint32_t version_swapped;
|
|
} shader_storage_file_header;
|
|
// 'XESH'.
|
|
const uint32_t shader_storage_magic = 0x48534558;
|
|
if (fread(&shader_storage_file_header, sizeof(shader_storage_file_header), 1,
|
|
shader_storage_file_) &&
|
|
shader_storage_file_header.magic == shader_storage_magic &&
|
|
xe::byte_swap(shader_storage_file_header.version_swapped) ==
|
|
ShaderStoredHeader::kVersion) {
|
|
uint64_t shader_storage_valid_bytes = sizeof(shader_storage_file_header);
|
|
// Load and translate shaders written by previous Xenia executions until the
|
|
// end of the file or until a corrupted one is detected.
|
|
ShaderStoredHeader shader_header;
|
|
std::vector<uint32_t> ucode_dwords;
|
|
ucode_dwords.reserve(0xFFFF);
|
|
size_t shaders_translated = 0;
|
|
|
|
// Threads overlapping file reading.
|
|
std::mutex shaders_translation_thread_mutex;
|
|
std::condition_variable shaders_translation_thread_cond;
|
|
std::deque<D3D12Shader*> shaders_to_translate;
|
|
size_t shader_translation_threads_busy = 0;
|
|
bool shader_translation_threads_shutdown = false;
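    // The file-reading loop below is the single producer; translation threads
    // are spawned lazily as consumers when the queue grows faster than they
    // drain it.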
|
|
std::mutex shaders_failed_to_translate_mutex;
|
|
std::vector<D3D12Shader::D3D12Translation*> shaders_failed_to_translate;
|
|
auto shader_translation_thread_function = [&]() {
|
|
auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();
|
|
StringBuffer ucode_disasm_buffer;
|
|
DxbcShaderTranslator translator(
|
|
provider.GetAdapterVendorID(), bindless_resources_used_,
|
|
edram_rov_used, render_target_cache_.gamma_render_target_as_srgb(),
|
|
render_target_cache_.msaa_2x_supported(),
|
|
render_target_cache_.GetResolutionScaleX(),
|
|
render_target_cache_.GetResolutionScaleY(),
|
|
provider.GetGraphicsAnalysis() != nullptr);
|
|
// If needed and possible, create objects needed for DXIL conversion and
|
|
// disassembly on this thread.
|
|
IDxbcConverter* dxbc_converter = nullptr;
|
|
IDxcUtils* dxc_utils = nullptr;
|
|
IDxcCompiler* dxc_compiler = nullptr;
|
|
if (cvars::d3d12_dxbc_disasm_dxilconv && dxbc_converter_ && dxc_utils_ &&
|
|
dxc_compiler_) {
|
|
provider.DxbcConverterCreateInstance(CLSID_DxbcConverter,
|
|
IID_PPV_ARGS(&dxbc_converter));
|
|
provider.DxcCreateInstance(CLSID_DxcUtils, IID_PPV_ARGS(&dxc_utils));
|
|
provider.DxcCreateInstance(CLSID_DxcCompiler,
|
|
IID_PPV_ARGS(&dxc_compiler));
|
|
}
|
|
for (;;) {
|
|
D3D12Shader* shader_to_translate;
|
|
for (;;) {
|
|
std::unique_lock<std::mutex> lock(shaders_translation_thread_mutex);
|
|
if (shaders_to_translate.empty()) {
|
|
if (shader_translation_threads_shutdown) {
|
|
return;
|
|
}
|
|
shaders_translation_thread_cond.wait(lock);
|
|
continue;
|
|
}
|
|
shader_to_translate = shaders_to_translate.front();
|
|
shaders_to_translate.pop_front();
|
|
++shader_translation_threads_busy;
|
|
break;
|
|
}
|
|
shader_to_translate->AnalyzeUcode(ucode_disasm_buffer);
|
|
// Translate each needed modification on this thread after performing
|
|
// modification-independent analysis of the whole shader.
|
|
uint64_t ucode_data_hash = shader_to_translate->ucode_data_hash();
|
|
for (auto modification_it = shader_translations_needed.lower_bound(
|
|
std::make_pair(ucode_data_hash, uint64_t(0)));
|
|
modification_it != shader_translations_needed.end() &&
|
|
modification_it->first == ucode_data_hash;
|
|
++modification_it) {
|
|
D3D12Shader::D3D12Translation* translation =
|
|
static_cast<D3D12Shader::D3D12Translation*>(
|
|
shader_to_translate->GetOrCreateTranslation(
|
|
modification_it->second));
|
|
// Only try (and delete in case of failure) if it's a new translation.
|
|
// If it's a shader previously encountered in the game, translation of
|
|
// which has failed, and the shader storage is loaded later, keep it
|
|
// this way not to try to translate it again.
|
|
if (!translation->is_translated() &&
|
|
!TranslateAnalyzedShader(translator, *translation, dxbc_converter,
|
|
dxc_utils, dxc_compiler)) {
|
|
std::lock_guard<std::mutex> lock(shaders_failed_to_translate_mutex);
|
|
shaders_failed_to_translate.push_back(translation);
|
|
}
|
|
}
|
|
{
|
|
std::lock_guard<std::mutex> lock(shaders_translation_thread_mutex);
|
|
--shader_translation_threads_busy;
|
|
}
|
|
}
|
|
if (dxc_compiler) {
|
|
dxc_compiler->Release();
|
|
}
|
|
if (dxc_utils) {
|
|
dxc_utils->Release();
|
|
}
|
|
if (dxbc_converter) {
|
|
dxbc_converter->Release();
|
|
}
|
|
};
|
|
std::vector<std::unique_ptr<xe::threading::Thread>>
|
|
shader_translation_threads;
|
|
|
|
while (true) {
|
|
if (!fread(&shader_header, sizeof(shader_header), 1,
|
|
shader_storage_file_)) {
|
|
break;
|
|
}
|
|
size_t ucode_byte_count =
|
|
shader_header.ucode_dword_count * sizeof(uint32_t);
|
|
ucode_dwords.resize(shader_header.ucode_dword_count);
|
|
if (shader_header.ucode_dword_count &&
|
|
!fread(ucode_dwords.data(), ucode_byte_count, 1,
|
|
shader_storage_file_)) {
|
|
break;
|
|
}
|
|
uint64_t ucode_data_hash =
|
|
XXH3_64bits(ucode_dwords.data(), ucode_byte_count);
|
|
if (shader_header.ucode_data_hash != ucode_data_hash) {
|
|
// Validation failed.
|
|
break;
|
|
}
|
|
shader_storage_valid_bytes += sizeof(shader_header) + ucode_byte_count;
|
|
D3D12Shader* shader =
|
|
LoadShader(shader_header.type, ucode_dwords.data(),
|
|
shader_header.ucode_dword_count, ucode_data_hash);
|
|
if (shader->ucode_storage_index() == shader_storage_index_) {
|
|
// Appeared twice in this file for some reason - skip, otherwise race
|
|
// condition will be caused by translating twice in parallel.
|
|
continue;
|
|
}
|
|
// Loaded from the current storage - don't write again.
|
|
shader->set_ucode_storage_index(shader_storage_index_);
|
|
// Create new threads if the currently existing threads can't keep up
|
|
// with file reading, but not more than the number of logical processors
|
|
// minus one.
|
|
size_t shader_translation_threads_needed;
|
|
{
|
|
std::lock_guard<std::mutex> lock(shaders_translation_thread_mutex);
|
|
shader_translation_threads_needed =
|
|
std::min(shader_translation_threads_busy +
|
|
shaders_to_translate.size() + size_t(1),
|
|
logical_processor_count - size_t(1));
|
|
}
|
|
while (shader_translation_threads.size() <
|
|
shader_translation_threads_needed) {
|
|
shader_translation_threads.push_back(xe::threading::Thread::Create(
|
|
{}, shader_translation_thread_function));
|
|
shader_translation_threads.back()->set_name("Shader Translation");
|
|
}
|
|
// Request ucode information gathering and translation of all the needed
|
|
// shaders.
|
|
{
|
|
std::lock_guard<std::mutex> lock(shaders_translation_thread_mutex);
|
|
shaders_to_translate.push_back(shader);
|
|
}
|
|
shaders_translation_thread_cond.notify_one();
|
|
++shaders_translated;
|
|
}
|
|
if (!shader_translation_threads.empty()) {
|
|
{
|
|
std::lock_guard<std::mutex> lock(shaders_translation_thread_mutex);
|
|
shader_translation_threads_shutdown = true;
|
|
}
|
|
shaders_translation_thread_cond.notify_all();
|
|
for (auto& shader_translation_thread : shader_translation_threads) {
|
|
xe::threading::Wait(shader_translation_thread.get(), false);
|
|
}
|
|
shader_translation_threads.clear();
|
|
for (D3D12Shader::D3D12Translation* translation :
|
|
shaders_failed_to_translate) {
|
|
D3D12Shader* shader = static_cast<D3D12Shader*>(&translation->shader());
|
|
shader->DestroyTranslation(translation->modification());
|
|
if (shader->translations().empty()) {
|
|
shaders_.erase(shader->ucode_data_hash());
|
|
delete shader;
|
|
}
|
|
}
|
|
}
|
|
XELOGGPU("Translated {} shaders from the storage in {} milliseconds",
|
|
shaders_translated,
|
|
(xe::Clock::QueryHostTickCount() -
|
|
shader_storage_initialization_start) *
|
|
1000 / xe::Clock::QueryHostTickFrequency());
|
|
xe::filesystem::TruncateStdioFile(shader_storage_file_,
|
|
shader_storage_valid_bytes);
|
|
} else {
|
|
xe::filesystem::TruncateStdioFile(shader_storage_file_, 0);
|
|
shader_storage_file_header.magic = shader_storage_magic;
|
|
shader_storage_file_header.version_swapped =
|
|
xe::byte_swap(ShaderStoredHeader::kVersion);
|
|
fwrite(&shader_storage_file_header, sizeof(shader_storage_file_header), 1,
|
|
shader_storage_file_);
|
|
}
|
|
|
|
// Create the pipelines.
|
|
if (!pipeline_stored_descriptions.empty()) {
|
|
uint64_t pipeline_creation_start_ = xe::Clock::QueryHostTickCount();
|
|
|
|
// Launch additional creation threads to use all cores to create
|
|
// pipelines faster. Will also be using the main thread, so minus 1.
|
|
    size_t creation_thread_original_count = creation_threads_.size();
    size_t creation_thread_needed_count = std::max(
        std::min(pipeline_stored_descriptions.size(), logical_processor_count) -
            size_t(1),
        creation_thread_original_count);
    while (creation_threads_.size() < creation_thread_needed_count) {
      size_t creation_thread_index = creation_threads_.size();
      std::unique_ptr<xe::threading::Thread> creation_thread =
          xe::threading::Thread::Create({}, [this, creation_thread_index]() {
            CreationThread(creation_thread_index);
          });
      creation_thread->set_name("D3D12 Pipelines");
      creation_threads_.push_back(std::move(creation_thread));
    }
|
|
|
|
size_t pipelines_created = 0;
|
|
for (const PipelineStoredDescription& pipeline_stored_description :
|
|
pipeline_stored_descriptions) {
|
|
const PipelineDescription& pipeline_description =
|
|
pipeline_stored_description.description;
|
|
// TODO(Triang3l): On Vulkan, skip pipelines requiring unsupported device
|
|
// features (to keep the cache files mostly shareable across devices).
|
|
// Skip already known pipelines - those have already been enqueued.
|
|
auto found_range =
|
|
pipelines_.equal_range(pipeline_stored_description.description_hash);
|
|
bool pipeline_found = false;
|
|
for (auto it = found_range.first; it != found_range.second; ++it) {
|
|
Pipeline* found_pipeline = it->second;
|
|
if (!std::memcmp(&found_pipeline->description.description,
|
|
&pipeline_description, sizeof(pipeline_description))) {
|
|
pipeline_found = true;
|
|
break;
|
|
}
|
|
}
|
|
if (pipeline_found) {
|
|
continue;
|
|
}
|
|
|
|
PipelineRuntimeDescription pipeline_runtime_description;
|
|
auto vertex_shader_it =
|
|
shaders_.find(pipeline_description.vertex_shader_hash);
|
|
if (vertex_shader_it == shaders_.end()) {
|
|
continue;
|
|
}
|
|
D3D12Shader* vertex_shader = vertex_shader_it->second;
|
|
pipeline_runtime_description.vertex_shader =
|
|
static_cast<D3D12Shader::D3D12Translation*>(
|
|
vertex_shader->GetTranslation(
|
|
pipeline_description.vertex_shader_modification));
|
|
if (!pipeline_runtime_description.vertex_shader ||
|
|
!pipeline_runtime_description.vertex_shader->is_translated() ||
|
|
!pipeline_runtime_description.vertex_shader->is_valid()) {
|
|
continue;
|
|
}
|
|
D3D12Shader* pixel_shader;
|
|
if (pipeline_description.pixel_shader_hash) {
|
|
auto pixel_shader_it =
|
|
shaders_.find(pipeline_description.pixel_shader_hash);
|
|
if (pixel_shader_it == shaders_.end()) {
|
|
continue;
|
|
}
|
|
pixel_shader = pixel_shader_it->second;
|
|
pipeline_runtime_description.pixel_shader =
|
|
static_cast<D3D12Shader::D3D12Translation*>(
|
|
pixel_shader->GetTranslation(
|
|
pipeline_description.pixel_shader_modification));
|
|
if (!pipeline_runtime_description.pixel_shader ||
|
|
!pipeline_runtime_description.pixel_shader->is_translated() ||
|
|
!pipeline_runtime_description.pixel_shader->is_valid()) {
|
|
continue;
|
|
}
|
|
} else {
|
|
pixel_shader = nullptr;
|
|
pipeline_runtime_description.pixel_shader = nullptr;
|
|
}
|
|
pipeline_runtime_description.root_signature =
|
|
command_processor_.GetRootSignature(
|
|
vertex_shader, pixel_shader,
|
|
DxbcShaderTranslator::Modification(
|
|
pipeline_description.vertex_shader_modification)
|
|
.vertex.host_vertex_shader_type !=
|
|
Shader::HostVertexShaderType::kVertex);
|
|
if (!pipeline_runtime_description.root_signature) {
|
|
continue;
|
|
}
|
|
std::memcpy(&pipeline_runtime_description.description,
|
|
&pipeline_description, sizeof(pipeline_description));
|
|
|
|
Pipeline* new_pipeline = new Pipeline;
|
|
new_pipeline->state = nullptr;
|
|
std::memcpy(&new_pipeline->description, &pipeline_runtime_description,
|
|
sizeof(pipeline_runtime_description));
|
|
pipelines_.emplace(pipeline_stored_description.description_hash,
|
|
new_pipeline);
|
|
COUNT_profile_set("gpu/pipeline_cache/pipelines", pipelines_.size());
|
|
if (!creation_threads_.empty()) {
|
|
// Submit the pipeline for creation to any available thread.
|
|
{
|
|
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
|
creation_queue_.push_back(new_pipeline);
|
|
}
|
|
creation_request_cond_.notify_one();
|
|
} else {
|
|
new_pipeline->state = CreateD3D12Pipeline(pipeline_runtime_description);
|
|
}
|
|
++pipelines_created;
|
|
}
|
|
|
|
CreateQueuedPipelinesOnProcessorThread();
|
|
if (creation_threads_.size() > creation_thread_original_count) {
|
|
{
|
|
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
|
creation_threads_shutdown_from_ = creation_thread_original_count;
|
|
// Assuming the queue is empty because of
|
|
// CreateQueuedPipelinesOnProcessorThread.
|
|
}
|
|
creation_request_cond_.notify_all();
|
|
while (creation_threads_.size() > creation_thread_original_count) {
|
|
xe::threading::Wait(creation_threads_.back().get(), false);
|
|
creation_threads_.pop_back();
|
|
}
|
|
bool await_creation_completion_event;
|
|
{
|
|
// Cleanup so additional threads can be created later again.
|
|
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
|
creation_threads_shutdown_from_ = SIZE_MAX;
|
|
// If the invocation is blocking, all the shader storage initialization
|
|
// is expected to be done before proceeding, to avoid latency in the
|
|
// command processor after the invocation.
|
|
await_creation_completion_event =
|
|
blocking && creation_threads_busy_ != 0;
|
|
if (await_creation_completion_event) {
|
|
creation_completion_event_->Reset();
|
|
creation_completion_set_event_ = true;
|
|
}
|
|
}
|
|
if (await_creation_completion_event) {
|
|
creation_request_cond_.notify_one();
|
|
xe::threading::Wait(creation_completion_event_.get(), false);
|
|
}
|
|
}
|
|
|
|
XELOGGPU(
|
|
"Created {} graphics pipelines (not including reading the "
|
|
"descriptions) from the storage in {} milliseconds",
|
|
pipelines_created,
|
|
(xe::Clock::QueryHostTickCount() - pipeline_creation_start_) * 1000 /
|
|
xe::Clock::QueryHostTickFrequency());
|
|
// If any pipeline descriptions were corrupted (or the whole file has excess
|
|
// bytes in the end), truncate to the last valid pipeline description.
|
|
xe::filesystem::TruncateStdioFile(
|
|
pipeline_storage_file_,
|
|
uint64_t(sizeof(pipeline_storage_file_header) +
|
|
sizeof(PipelineStoredDescription) *
|
|
pipeline_stored_descriptions.size()));
|
|
} else {
|
|
xe::filesystem::TruncateStdioFile(pipeline_storage_file_, 0);
|
|
pipeline_storage_file_header.magic = pipeline_storage_magic;
|
|
pipeline_storage_file_header.magic_api = pipeline_storage_magic_api;
|
|
pipeline_storage_file_header.version_swapped =
|
|
pipeline_storage_version_swapped;
|
|
fwrite(&pipeline_storage_file_header, sizeof(pipeline_storage_file_header),
|
|
1, pipeline_storage_file_);
|
|
}
|
|
|
|
shader_storage_cache_root_ = cache_root;
|
|
shader_storage_title_id_ = title_id;
|
|
|
|
// Start the storage writing thread.
|
|
storage_write_flush_shaders_ = false;
|
|
storage_write_flush_pipelines_ = false;
|
|
storage_write_thread_shutdown_ = false;
|
|
storage_write_thread_ =
|
|
xe::threading::Thread::Create({}, [this]() { StorageWriteThread(); });
|
|
}
|
|
|
|
void PipelineCache::ShutdownShaderStorage() {
|
|
if (storage_write_thread_) {
|
|
{
|
|
std::lock_guard<std::mutex> lock(storage_write_request_lock_);
|
|
storage_write_thread_shutdown_ = true;
|
|
}
|
|
storage_write_request_cond_.notify_all();
|
|
xe::threading::Wait(storage_write_thread_.get(), false);
|
|
storage_write_thread_.reset();
|
|
}
|
|
storage_write_shader_queue_.clear();
|
|
storage_write_pipeline_queue_.clear();
|
|
|
|
if (pipeline_storage_file_) {
|
|
fclose(pipeline_storage_file_);
|
|
pipeline_storage_file_ = nullptr;
|
|
pipeline_storage_file_flush_needed_ = false;
|
|
}
|
|
|
|
if (shader_storage_file_) {
|
|
fclose(shader_storage_file_);
|
|
shader_storage_file_ = nullptr;
|
|
shader_storage_file_flush_needed_ = false;
|
|
}
|
|
|
|
shader_storage_cache_root_.clear();
|
|
shader_storage_title_id_ = 0;
|
|
}
|
|
|
|
void PipelineCache::EndSubmission() {
|
|
if (shader_storage_file_flush_needed_ ||
|
|
pipeline_storage_file_flush_needed_) {
|
|
{
|
|
std::lock_guard<std::mutex> lock(storage_write_request_lock_);
|
|
if (shader_storage_file_flush_needed_) {
|
|
storage_write_flush_shaders_ = true;
|
|
}
|
|
if (pipeline_storage_file_flush_needed_) {
|
|
storage_write_flush_pipelines_ = true;
|
|
}
|
|
}
|
|
storage_write_request_cond_.notify_one();
|
|
shader_storage_file_flush_needed_ = false;
|
|
pipeline_storage_file_flush_needed_ = false;
|
|
}
|
|
if (!creation_threads_.empty()) {
|
|
CreateQueuedPipelinesOnProcessorThread();
|
|
// Await creation of all queued pipelines.
|
|
bool await_creation_completion_event;
|
|
{
|
|
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
|
// Assuming the creation queue is already empty (because the processor
|
|
// thread also worked on creating the leftover pipelines), so only check
|
|
// if there are threads with pipelines currently being created.
|
|
await_creation_completion_event = creation_threads_busy_ != 0;
|
|
if (await_creation_completion_event) {
|
|
creation_completion_event_->Reset();
|
|
creation_completion_set_event_ = true;
|
|
}
|
|
}
|
|
if (await_creation_completion_event) {
|
|
creation_request_cond_.notify_one();
|
|
xe::threading::Wait(creation_completion_event_.get(), false);
|
|
}
|
|
}
|
|
}
|
|
|
|
bool PipelineCache::IsCreatingPipelines() {
|
|
if (creation_threads_.empty()) {
|
|
return false;
|
|
}
|
|
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
|
return !creation_queue_.empty() || creation_threads_busy_ != 0;
|
|
}
|
|
|
|
D3D12Shader* PipelineCache::LoadShader(xenos::ShaderType shader_type,
|
|
const uint32_t* host_address,
|
|
uint32_t dword_count) {
|
|
// Hash the input memory and lookup the shader.
|
|
return LoadShader(shader_type, host_address, dword_count,
|
|
XXH3_64bits(host_address, dword_count * sizeof(uint32_t)));
|
|
}
|
|
|
|
D3D12Shader* PipelineCache::LoadShader(xenos::ShaderType shader_type,
|
|
const uint32_t* host_address,
|
|
uint32_t dword_count,
|
|
uint64_t data_hash) {
|
|
auto it = shaders_.find(data_hash);
|
|
if (it != shaders_.end()) {
|
|
// Shader has been previously loaded.
|
|
return it->second;
|
|
}
|
|
// Always create the shader and stash it away.
|
|
// We need to track it even if it fails translation so we know not to try
|
|
// again.
|
|
D3D12Shader* shader =
|
|
new D3D12Shader(shader_type, data_hash, host_address, dword_count);
|
|
shaders_.emplace(data_hash, shader);
|
|
return shader;
|
|
}
|
|
|
|
DxbcShaderTranslator::Modification
|
|
PipelineCache::GetCurrentVertexShaderModification(
|
|
const Shader& shader,
|
|
Shader::HostVertexShaderType host_vertex_shader_type) const {
|
|
assert_true(shader.type() == xenos::ShaderType::kVertex);
|
|
assert_true(shader.is_ucode_analyzed());
|
|
const auto& regs = register_file_;
|
|
auto sq_program_cntl = regs.Get<reg::SQ_PROGRAM_CNTL>();
|
|
return DxbcShaderTranslator::Modification(
|
|
shader_translator_->GetDefaultVertexShaderModification(
|
|
shader.GetDynamicAddressableRegisterCount(sq_program_cntl.vs_num_reg),
|
|
host_vertex_shader_type));
|
|
}
|
|
|
|
DxbcShaderTranslator::Modification
|
|
PipelineCache::GetCurrentPixelShaderModification(const Shader& shader) const {
|
|
assert_true(shader.type() == xenos::ShaderType::kPixel);
|
|
assert_true(shader.is_ucode_analyzed());
|
|
const auto& regs = register_file_;
|
|
auto sq_program_cntl = regs.Get<reg::SQ_PROGRAM_CNTL>();
|
|
DxbcShaderTranslator::Modification modification(
|
|
shader_translator_->GetDefaultPixelShaderModification(
|
|
shader.GetDynamicAddressableRegisterCount(
|
|
sq_program_cntl.ps_num_reg)));
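  // With host render targets, pick the depth/stencil modification: convert to
  // float24 in the pixel shader when the guest writes D24FS8 depth with
  // on-output conversion, otherwise request the early depth/stencil hint when
  // the shader allows implicit early Z writes and coverage doesn't depend on
  // alpha.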
|
|
if (render_target_cache_.GetPath() ==
|
|
RenderTargetCache::Path::kHostRenderTargets) {
|
|
using DepthStencilMode =
|
|
DxbcShaderTranslator::Modification::DepthStencilMode;
|
|
RenderTargetCache::DepthFloat24Conversion depth_float24_conversion =
|
|
render_target_cache_.depth_float24_conversion();
|
|
if ((depth_float24_conversion ==
|
|
RenderTargetCache::DepthFloat24Conversion::kOnOutputTruncating ||
|
|
depth_float24_conversion ==
|
|
RenderTargetCache::DepthFloat24Conversion::kOnOutputRounding) &&
|
|
draw_util::GetDepthControlForCurrentEdramMode(regs).z_enable &&
|
|
regs.Get<reg::RB_DEPTH_INFO>().depth_format ==
|
|
xenos::DepthRenderTargetFormat::kD24FS8) {
|
|
modification.pixel.depth_stencil_mode =
|
|
depth_float24_conversion ==
|
|
RenderTargetCache::DepthFloat24Conversion::kOnOutputTruncating
|
|
? DepthStencilMode::kFloat24Truncating
|
|
: DepthStencilMode::kFloat24Rounding;
|
|
} else {
|
|
if (shader.implicit_early_z_write_allowed() &&
|
|
(!shader.writes_color_target(0) ||
|
|
!draw_util::DoesCoverageDependOnAlpha(
|
|
regs.Get<reg::RB_COLORCONTROL>()))) {
|
|
modification.pixel.depth_stencil_mode = DepthStencilMode::kEarlyHint;
|
|
} else {
|
|
modification.pixel.depth_stencil_mode = DepthStencilMode::kNoModifiers;
|
|
}
|
|
}
|
|
}
|
|
return modification;
|
|
}
|
|
|
|
bool PipelineCache::ConfigurePipeline(
|
|
D3D12Shader::D3D12Translation* vertex_shader,
|
|
D3D12Shader::D3D12Translation* pixel_shader,
|
|
const PrimitiveProcessor::ProcessingResult& primitive_processing_result,
|
|
uint32_t bound_depth_and_color_render_target_bits,
|
|
const uint32_t* bound_depth_and_color_render_target_formats,
|
|
void** pipeline_handle_out, ID3D12RootSignature** root_signature_out) {
|
|
#if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
|
|
SCOPE_profile_cpu_f("gpu");
|
|
#endif // XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
|
|
|
|
assert_not_null(pipeline_handle_out);
|
|
assert_not_null(root_signature_out);
|
|
|
|
// Ensure shaders are translated - needed now for GetCurrentStateDescription.
|
|
// Edge flags are not supported yet (because polygon primitives are not).
|
|
assert_true(register_file_.Get<reg::SQ_PROGRAM_CNTL>().vs_export_mode !=
|
|
xenos::VertexShaderExportMode::kPosition2VectorsEdge &&
|
|
register_file_.Get<reg::SQ_PROGRAM_CNTL>().vs_export_mode !=
|
|
xenos::VertexShaderExportMode::kPosition2VectorsEdgeKill);
|
|
assert_false(register_file_.Get<reg::SQ_PROGRAM_CNTL>().gen_index_vtx);
|
|
if (!vertex_shader->is_translated()) {
|
|
vertex_shader->shader().AnalyzeUcode(ucode_disasm_buffer_);
|
|
if (!TranslateAnalyzedShader(*shader_translator_, *vertex_shader,
|
|
dxbc_converter_, dxc_utils_, dxc_compiler_)) {
|
|
XELOGE("Failed to translate the vertex shader!");
|
|
return false;
|
|
}
|
|
if (shader_storage_file_ && vertex_shader->shader().ucode_storage_index() !=
|
|
shader_storage_index_) {
|
|
vertex_shader->shader().set_ucode_storage_index(shader_storage_index_);
|
|
assert_not_null(storage_write_thread_);
|
|
shader_storage_file_flush_needed_ = true;
|
|
{
|
|
std::lock_guard<std::mutex> lock(storage_write_request_lock_);
|
|
storage_write_shader_queue_.push_back(&vertex_shader->shader());
|
|
}
|
|
storage_write_request_cond_.notify_all();
|
|
}
|
|
}
|
|
if (!vertex_shader->is_valid()) {
|
|
// Translation attempted previously, but not valid.
|
|
return false;
|
|
}
|
|
if (pixel_shader != nullptr) {
|
|
if (!pixel_shader->is_translated()) {
|
|
pixel_shader->shader().AnalyzeUcode(ucode_disasm_buffer_);
|
|
if (!TranslateAnalyzedShader(*shader_translator_, *pixel_shader,
|
|
dxbc_converter_, dxc_utils_,
|
|
dxc_compiler_)) {
|
|
XELOGE("Failed to translate the pixel shader!");
|
|
return false;
|
|
}
|
|
if (shader_storage_file_ &&
|
|
pixel_shader->shader().ucode_storage_index() !=
|
|
shader_storage_index_) {
|
|
pixel_shader->shader().set_ucode_storage_index(shader_storage_index_);
|
|
assert_not_null(storage_write_thread_);
|
|
shader_storage_file_flush_needed_ = true;
|
|
{
|
|
std::lock_guard<std::mutex> lock(storage_write_request_lock_);
|
|
storage_write_shader_queue_.push_back(&pixel_shader->shader());
|
|
}
|
|
storage_write_request_cond_.notify_all();
|
|
}
|
|
}
|
|
if (!pixel_shader->is_valid()) {
|
|
// Translation attempted previously, but not valid.
|
|
return false;
|
|
}
|
|
}
|
|
|
|
PipelineRuntimeDescription runtime_description;
|
|
if (!GetCurrentStateDescription(
|
|
vertex_shader, pixel_shader, primitive_processing_result,
|
|
bound_depth_and_color_render_target_bits,
|
|
bound_depth_and_color_render_target_formats, runtime_description)) {
|
|
return false;
|
|
}
|
|
PipelineDescription& description = runtime_description.description;
|
|
|
|
if (current_pipeline_ != nullptr &&
|
|
!std::memcmp(¤t_pipeline_->description.description, &description,
|
|
sizeof(description))) {
|
|
*pipeline_handle_out = current_pipeline_;
|
|
*root_signature_out = runtime_description.root_signature;
|
|
return true;
|
|
}
|
|
|
|
// Find an existing pipeline in the cache.
|
|
uint64_t hash = XXH3_64bits(&description, sizeof(description));
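  // The description was fully zero-initialized before being filled, so hashing
  // and comparing it as raw bytes is deterministic.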
|
|
auto found_range = pipelines_.equal_range(hash);
|
|
for (auto it = found_range.first; it != found_range.second; ++it) {
|
|
Pipeline* found_pipeline = it->second;
|
|
if (!std::memcmp(&found_pipeline->description.description, &description,
|
|
sizeof(description))) {
|
|
current_pipeline_ = found_pipeline;
|
|
*pipeline_handle_out = found_pipeline;
|
|
*root_signature_out = found_pipeline->description.root_signature;
|
|
return true;
|
|
}
|
|
}
|
|
|
|
Pipeline* new_pipeline = new Pipeline;
|
|
new_pipeline->state = nullptr;
|
|
std::memcpy(&new_pipeline->description, &runtime_description,
|
|
sizeof(runtime_description));
|
|
pipelines_.emplace(hash, new_pipeline);
|
|
COUNT_profile_set("gpu/pipeline_cache/pipelines", pipelines_.size());
|
|
|
|
if (!creation_threads_.empty()) {
|
|
// Submit the pipeline for creation to any available thread.
|
|
{
|
|
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
|
creation_queue_.push_back(new_pipeline);
|
|
}
|
|
creation_request_cond_.notify_one();
|
|
} else {
|
|
new_pipeline->state = CreateD3D12Pipeline(runtime_description);
|
|
}
|
|
|
|
if (pipeline_storage_file_) {
|
|
assert_not_null(storage_write_thread_);
|
|
pipeline_storage_file_flush_needed_ = true;
|
|
{
|
|
std::lock_guard<std::mutex> lock(storage_write_request_lock_);
|
|
storage_write_pipeline_queue_.emplace_back();
|
|
PipelineStoredDescription& stored_description =
|
|
storage_write_pipeline_queue_.back();
|
|
stored_description.description_hash = hash;
|
|
std::memcpy(&stored_description.description, &description,
|
|
sizeof(description));
|
|
}
|
|
storage_write_request_cond_.notify_all();
|
|
}
|
|
|
|
current_pipeline_ = new_pipeline;
|
|
*pipeline_handle_out = new_pipeline;
|
|
*root_signature_out = runtime_description.root_signature;
|
|
return true;
|
|
}
|
|
|
|
bool PipelineCache::TranslateAnalyzedShader(
|
|
DxbcShaderTranslator& translator,
|
|
D3D12Shader::D3D12Translation& translation, IDxbcConverter* dxbc_converter,
|
|
IDxcUtils* dxc_utils, IDxcCompiler* dxc_compiler) {
|
|
D3D12Shader& shader = static_cast<D3D12Shader&>(translation.shader());
|
|
|
|
// Perform translation.
|
|
// If this fails the shader will be marked as invalid and ignored later.
|
|
if (!translator.TranslateAnalyzedShader(translation)) {
|
|
XELOGE("Shader {:016X} translation failed; marking as ignored",
|
|
shader.ucode_data_hash());
|
|
return false;
|
|
}
|
|
|
|
const char* host_shader_type;
|
|
if (shader.type() == xenos::ShaderType::kVertex) {
|
|
DxbcShaderTranslator::Modification modification(translation.modification());
|
|
switch (modification.vertex.host_vertex_shader_type) {
|
|
case Shader::HostVertexShaderType::kLineDomainCPIndexed:
|
|
host_shader_type = "control-point-indexed line domain";
|
|
break;
|
|
case Shader::HostVertexShaderType::kLineDomainPatchIndexed:
|
|
host_shader_type = "patch-indexed line domain";
|
|
break;
|
|
case Shader::HostVertexShaderType::kTriangleDomainCPIndexed:
|
|
host_shader_type = "control-point-indexed triangle domain";
|
|
break;
|
|
case Shader::HostVertexShaderType::kTriangleDomainPatchIndexed:
|
|
host_shader_type = "patch-indexed triangle domain";
|
|
break;
|
|
case Shader::HostVertexShaderType::kQuadDomainCPIndexed:
|
|
host_shader_type = "control-point-indexed quad domain";
|
|
break;
|
|
case Shader::HostVertexShaderType::kQuadDomainPatchIndexed:
|
|
host_shader_type = "patch-indexed quad domain";
|
|
break;
|
|
default:
|
|
host_shader_type = "vertex";
|
|
}
|
|
} else {
|
|
host_shader_type = "pixel";
|
|
}
|
|
XELOGGPU("Generated {} shader ({}b) - hash {:016X}:\n{}\n", host_shader_type,
|
|
shader.ucode_dword_count() * sizeof(uint32_t),
|
|
shader.ucode_data_hash(), shader.ucode_disassembly().c_str());
|
|
|
|
// Set up texture and sampler binding layouts.
|
|
if (shader.EnterBindingLayoutUserUIDSetup()) {
|
|
const std::vector<D3D12Shader::TextureBinding>& texture_bindings =
|
|
shader.GetTextureBindingsAfterTranslation();
|
|
uint32_t texture_binding_count = uint32_t(texture_bindings.size());
|
|
const std::vector<D3D12Shader::SamplerBinding>& sampler_bindings =
|
|
shader.GetSamplerBindingsAfterTranslation();
|
|
uint32_t sampler_binding_count = uint32_t(sampler_bindings.size());
|
|
assert_false(bindless_resources_used_ &&
|
|
texture_binding_count + sampler_binding_count >
|
|
D3D12_REQ_CONSTANT_BUFFER_ELEMENT_COUNT * 4);
|
|
size_t texture_binding_layout_bytes =
|
|
texture_binding_count * sizeof(*texture_bindings.data());
|
|
uint64_t texture_binding_layout_hash = 0;
|
|
if (texture_binding_count) {
|
|
texture_binding_layout_hash =
|
|
XXH3_64bits(texture_bindings.data(), texture_binding_layout_bytes);
|
|
}
|
|
uint32_t bindless_sampler_count =
|
|
bindless_resources_used_ ? sampler_binding_count : 0;
|
|
uint64_t bindless_sampler_layout_hash = 0;
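    // For bindless samplers, a layout is identified only by the sequence of
    // bindless descriptor indices, so only those participate in the hash.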
|
|
if (bindless_sampler_count) {
|
|
XXH3_state_t hash_state;
|
|
XXH3_64bits_reset(&hash_state);
|
|
for (uint32_t i = 0; i < bindless_sampler_count; ++i) {
|
|
XXH3_64bits_update(
|
|
&hash_state, &sampler_bindings[i].bindless_descriptor_index,
|
|
sizeof(sampler_bindings[i].bindless_descriptor_index));
|
|
}
|
|
bindless_sampler_layout_hash = XXH3_64bits_digest(&hash_state);
|
|
}
|
|
// Obtain the unique IDs of binding layouts if there are any texture
|
|
// bindings or bindless samplers, for invalidation in the command processor.
|
|
size_t texture_binding_layout_uid = kLayoutUIDEmpty;
|
|
// Use sampler count for the bindful case because it's the only thing that
|
|
// must be the same for layouts to be compatible in this case
|
|
// (instruction-specified parameters are used as overrides for actual
|
|
// samplers).
|
|
static_assert(
|
|
kLayoutUIDEmpty == 0,
|
|
"Empty layout UID is assumed to be 0 because for bindful samplers, the "
|
|
"UID is their count");
|
|
size_t sampler_binding_layout_uid = bindless_resources_used_
|
|
? kLayoutUIDEmpty
|
|
: size_t(sampler_binding_count);
|
|
if (texture_binding_count || bindless_sampler_count) {
|
|
      std::lock_guard<std::mutex> layouts_mutex_lock(layouts_mutex_);
|
|
if (texture_binding_count) {
|
|
auto found_range = texture_binding_layout_map_.equal_range(
|
|
texture_binding_layout_hash);
|
|
for (auto it = found_range.first; it != found_range.second; ++it) {
|
|
if (it->second.vector_span_length == texture_binding_count &&
|
|
!std::memcmp(texture_binding_layouts_.data() +
|
|
it->second.vector_span_offset,
|
|
texture_bindings.data(),
|
|
texture_binding_layout_bytes)) {
|
|
texture_binding_layout_uid = it->second.uid;
|
|
break;
|
|
}
|
|
}
|
|
if (texture_binding_layout_uid == kLayoutUIDEmpty) {
|
|
static_assert(
|
|
kLayoutUIDEmpty == 0,
|
|
"Layout UID is size + 1 because it's assumed that 0 is the UID "
|
|
"for an empty layout");
|
|
texture_binding_layout_uid = texture_binding_layout_map_.size() + 1;
|
|
LayoutUID new_uid;
|
|
new_uid.uid = texture_binding_layout_uid;
|
|
new_uid.vector_span_offset = texture_binding_layouts_.size();
|
|
new_uid.vector_span_length = texture_binding_count;
|
|
texture_binding_layouts_.resize(new_uid.vector_span_offset +
|
|
texture_binding_count);
|
|
std::memcpy(
|
|
texture_binding_layouts_.data() + new_uid.vector_span_offset,
|
|
texture_bindings.data(), texture_binding_layout_bytes);
|
|
texture_binding_layout_map_.emplace(texture_binding_layout_hash,
|
|
new_uid);
|
|
}
|
|
}
|
|
if (bindless_sampler_count) {
|
|
        auto found_range = bindless_sampler_layout_map_.equal_range(
            bindless_sampler_layout_hash);
|
|
for (auto it = found_range.first; it != found_range.second; ++it) {
|
|
if (it->second.vector_span_length != bindless_sampler_count) {
|
|
continue;
|
|
}
|
|
sampler_binding_layout_uid = it->second.uid;
|
|
const uint32_t* vector_bindless_sampler_layout =
|
|
bindless_sampler_layouts_.data() + it->second.vector_span_offset;
|
|
for (uint32_t i = 0; i < bindless_sampler_count; ++i) {
|
|
if (vector_bindless_sampler_layout[i] !=
|
|
sampler_bindings[i].bindless_descriptor_index) {
|
|
sampler_binding_layout_uid = kLayoutUIDEmpty;
|
|
break;
|
|
}
|
|
}
|
|
if (sampler_binding_layout_uid != kLayoutUIDEmpty) {
|
|
break;
|
|
}
|
|
}
|
|
        if (sampler_binding_layout_uid == kLayoutUIDEmpty) {
          sampler_binding_layout_uid = bindless_sampler_layout_map_.size() + 1;
          LayoutUID new_uid;
          static_assert(
              kLayoutUIDEmpty == 0,
              "Layout UID is size + 1 because it's assumed that 0 is the UID "
              "for an empty layout");
          new_uid.uid = sampler_binding_layout_uid;
|
|
new_uid.vector_span_offset = bindless_sampler_layouts_.size();
|
|
new_uid.vector_span_length = sampler_binding_count;
|
|
bindless_sampler_layouts_.resize(new_uid.vector_span_offset +
|
|
sampler_binding_count);
|
|
uint32_t* vector_bindless_sampler_layout =
|
|
bindless_sampler_layouts_.data() + new_uid.vector_span_offset;
|
|
for (uint32_t i = 0; i < bindless_sampler_count; ++i) {
|
|
vector_bindless_sampler_layout[i] =
|
|
sampler_bindings[i].bindless_descriptor_index;
|
|
}
|
|
bindless_sampler_layout_map_.emplace(bindless_sampler_layout_hash,
|
|
new_uid);
|
|
}
|
|
}
|
|
}
|
|
shader.SetTextureBindingLayoutUserUID(texture_binding_layout_uid);
|
|
shader.SetSamplerBindingLayoutUserUID(sampler_binding_layout_uid);
|
|
}
|
|
|
|
// Disassemble the shader for dumping.
|
|
auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();
|
|
if (cvars::d3d12_dxbc_disasm_dxilconv) {
|
|
translation.DisassembleDxbcAndDxil(provider, cvars::d3d12_dxbc_disasm,
|
|
dxbc_converter, dxc_utils, dxc_compiler);
|
|
} else {
|
|
translation.DisassembleDxbcAndDxil(provider, cvars::d3d12_dxbc_disasm);
|
|
}
|
|
|
|
// Dump shader files if desired.
|
|
if (!cvars::dump_shaders.empty()) {
|
|
bool edram_rov_used = render_target_cache_.GetPath() ==
|
|
RenderTargetCache::Path::kPixelShaderInterlock;
|
|
translation.Dump(cvars::dump_shaders,
|
|
(shader.type() == xenos::ShaderType::kPixel)
|
|
? (edram_rov_used ? "d3d12_rov" : "d3d12_rtv")
|
|
: "d3d12");
|
|
}
|
|
|
|
return translation.is_valid();
|
|
}
|
|
|
|
bool PipelineCache::GetCurrentStateDescription(
|
|
D3D12Shader::D3D12Translation* vertex_shader,
|
|
D3D12Shader::D3D12Translation* pixel_shader,
|
|
const PrimitiveProcessor::ProcessingResult& primitive_processing_result,
|
|
uint32_t bound_depth_and_color_render_target_bits,
|
|
const uint32_t* bound_depth_and_color_render_target_formats,
|
|
PipelineRuntimeDescription& runtime_description_out) {
|
|
// Translated shaders needed at least for the root signature.
|
|
assert_true(vertex_shader->is_translated() && vertex_shader->is_valid());
|
|
assert_true(!pixel_shader ||
|
|
(pixel_shader->is_translated() && pixel_shader->is_valid()));
|
|
|
|
PipelineDescription& description_out = runtime_description_out.description;
|
|
|
|
const auto& regs = register_file_;
|
|
auto pa_su_sc_mode_cntl = regs.Get<reg::PA_SU_SC_MODE_CNTL>();
|
|
|
|
// Initialize all unused fields to zero for comparison/hashing.
|
|
std::memset(&runtime_description_out, 0, sizeof(runtime_description_out));
|
|
|
|
assert_true(DxbcShaderTranslator::Modification(vertex_shader->modification())
|
|
.vertex.host_vertex_shader_type ==
|
|
primitive_processing_result.host_vertex_shader_type);
|
|
bool tessellated = primitive_processing_result.IsTessellated();
|
|
bool primitive_polygonal = draw_util::IsPrimitivePolygonal(regs);
|
|
bool rasterization_enabled =
|
|
draw_util::IsRasterizationPotentiallyDone(regs, primitive_polygonal);
|
|
// In Direct3D, rasterization (along with pixel counting) is disabled by
|
|
// disabling the pixel shader and depth / stencil. However, if rasterization
|
|
// should be disabled, the pixel shader must be disabled externally, to ensure
|
|
// things like texture binding layout is correct for the shader actually being
|
|
// used (don't replace anything here).
|
|
if (!rasterization_enabled) {
|
|
assert_null(pixel_shader);
|
|
if (pixel_shader) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
bool edram_rov_used = render_target_cache_.GetPath() ==
|
|
RenderTargetCache::Path::kPixelShaderInterlock;
|
|
|
|
// Root signature.
|
|
runtime_description_out.root_signature = command_processor_.GetRootSignature(
|
|
static_cast<const DxbcShader*>(&vertex_shader->shader()),
|
|
pixel_shader ? static_cast<const DxbcShader*>(&pixel_shader->shader())
|
|
: nullptr,
|
|
tessellated);
|
|
if (runtime_description_out.root_signature == nullptr) {
|
|
return false;
|
|
}
|
|
|
|
// Vertex shader.
|
|
runtime_description_out.vertex_shader = vertex_shader;
|
|
description_out.vertex_shader_hash =
|
|
vertex_shader->shader().ucode_data_hash();
|
|
description_out.vertex_shader_modification = vertex_shader->modification();
|
|
|
|
// Index buffer strip cut value.
|
|
if (primitive_processing_result.host_primitive_reset_enabled) {
|
|
description_out.strip_cut_index =
|
|
primitive_processing_result.host_index_format ==
|
|
xenos::IndexFormat::kInt16
|
|
? PipelineStripCutIndex::kFFFF
|
|
: PipelineStripCutIndex::kFFFFFFFF;
|
|
} else {
|
|
description_out.strip_cut_index = PipelineStripCutIndex::kNone;
|
|
}
|
|
|
|
// Host vertex shader type and primitive topology.
|
|
if (tessellated) {
|
|
description_out.primitive_topology_type_or_tessellation_mode =
|
|
uint32_t(primitive_processing_result.tessellation_mode);
|
|
} else {
|
|
switch (primitive_processing_result.host_primitive_type) {
|
|
case xenos::PrimitiveType::kPointList:
|
|
description_out.primitive_topology_type_or_tessellation_mode =
|
|
uint32_t(PipelinePrimitiveTopologyType::kPoint);
|
|
break;
|
|
case xenos::PrimitiveType::kLineList:
|
|
case xenos::PrimitiveType::kLineStrip:
|
|
// Quads are emulated as line lists with adjacency.
|
|
case xenos::PrimitiveType::kQuadList:
|
|
case xenos::PrimitiveType::k2DLineStrip:
|
|
description_out.primitive_topology_type_or_tessellation_mode =
|
|
uint32_t(PipelinePrimitiveTopologyType::kLine);
|
|
break;
|
|
default:
|
|
description_out.primitive_topology_type_or_tessellation_mode =
|
|
uint32_t(PipelinePrimitiveTopologyType::kTriangle);
|
|
break;
|
|
}
|
|
switch (primitive_processing_result.host_primitive_type) {
|
|
case xenos::PrimitiveType::kPointList:
|
|
description_out.geometry_shader = PipelineGeometryShader::kPointList;
|
|
break;
|
|
case xenos::PrimitiveType::kRectangleList:
|
|
description_out.geometry_shader =
|
|
PipelineGeometryShader::kRectangleList;
|
|
break;
|
|
case xenos::PrimitiveType::kQuadList:
|
|
description_out.geometry_shader = PipelineGeometryShader::kQuadList;
|
|
break;
|
|
default:
|
|
description_out.geometry_shader = PipelineGeometryShader::kNone;
|
|
break;
|
|
}
|
|
}

  // The rest doesn't matter when rasterization is disabled (nothing is written
  // from the post-geometry stages, and no samples are counted).
  if (!rasterization_enabled) {
    description_out.cull_mode = PipelineCullMode::kDisableRasterization;
    return true;
  }

  // Pixel shader.
  if (pixel_shader) {
    runtime_description_out.pixel_shader = pixel_shader;
    description_out.pixel_shader_hash =
        pixel_shader->shader().ucode_data_hash();
    description_out.pixel_shader_modification = pixel_shader->modification();
  }

  // Rasterizer state.
  // Because Direct3D 12 doesn't support per-side fill mode and depth bias, the
  // values to use depend on the current culling state.
  // If front faces are culled, use the ones for back faces.
  // If back faces are culled, it's the other way around.
  // If culling is not enabled, assume the developer wanted to draw things in a
  // more special way - so if one side is wireframe or has a depth bias, then
  // that's intentional (if both sides have a depth bias, the one for the front
  // faces is used, though it's unlikely that they will ever be different -
  // SetRenderState sets the same offset for both sides).
  // Points fill mode (0) also isn't supported in Direct3D 12, but assume the
  // developer didn't want to fill the whole primitive, and use wireframe
  // instead (like Xenos fill mode 1).
  // It's also assumed here that only one side is culled - if both sides are
  // culled, rasterization is disabled externally, or the draw call is dropped
  // early if the vertex shader doesn't export to memory.
  bool cull_front, cull_back;
  float poly_offset = 0.0f, poly_offset_scale = 0.0f;
  if (primitive_polygonal) {
    description_out.front_counter_clockwise = pa_su_sc_mode_cntl.face == 0;
    cull_front = pa_su_sc_mode_cntl.cull_front != 0;
    cull_back = pa_su_sc_mode_cntl.cull_back != 0;
    if (cull_front) {
      // The case when both faces are culled should be handled by disabling
      // rasterization.
      assert_false(cull_back);
      description_out.cull_mode = PipelineCullMode::kFront;
    } else if (cull_back) {
      description_out.cull_mode = PipelineCullMode::kBack;
    } else {
      description_out.cull_mode = PipelineCullMode::kNone;
    }
    // With ROV, the depth bias is applied in the pixel shader because
    // per-sample depth is needed for MSAA.
    if (!cull_front) {
      // Front faces aren't culled.
      // Direct3D 12, unfortunately, doesn't support point fill mode.
      if (pa_su_sc_mode_cntl.polymode_front_ptype !=
          xenos::PolygonType::kTriangles) {
        description_out.fill_mode_wireframe = 1;
      }
      if (!edram_rov_used && pa_su_sc_mode_cntl.poly_offset_front_enable) {
        poly_offset = regs[XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_OFFSET].f32;
        poly_offset_scale = regs[XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_SCALE].f32;
      }
    }
    if (!cull_back) {
      // Back faces aren't culled.
      if (pa_su_sc_mode_cntl.polymode_back_ptype !=
          xenos::PolygonType::kTriangles) {
        description_out.fill_mode_wireframe = 1;
      }
      // Prefer front depth bias because in general, front faces are the ones
      // that are rendered (except for shadow volumes).
      if (!edram_rov_used && pa_su_sc_mode_cntl.poly_offset_back_enable &&
          poly_offset == 0.0f && poly_offset_scale == 0.0f) {
        poly_offset = regs[XE_GPU_REG_PA_SU_POLY_OFFSET_BACK_OFFSET].f32;
        poly_offset_scale = regs[XE_GPU_REG_PA_SU_POLY_OFFSET_BACK_SCALE].f32;
      }
    }
    if (pa_su_sc_mode_cntl.poly_mode != xenos::PolygonModeEnable::kDualMode) {
      description_out.fill_mode_wireframe = 0;
    }
  } else {
    // Filled front faces only, without culling.
    cull_front = false;
    cull_back = false;
    if (!edram_rov_used && pa_su_sc_mode_cntl.poly_offset_para_enable) {
      poly_offset = regs[XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_OFFSET].f32;
      poly_offset_scale = regs[XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_SCALE].f32;
    }
  }
  if (!edram_rov_used) {
    float poly_offset_host_scale = draw_util::GetD3D10PolygonOffsetFactor(
        regs.Get<reg::RB_DEPTH_INFO>().depth_format, true);
    // Using ceil here just in case a game wants the offset but passes a value
    // that is too small - it's better to apply more offset than to make depth
    // fighting worse or to disable the offset completely (Direct3D 12 takes an
    // integer value).
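    // For example, if poly_offset * poly_offset_host_scale is -2.3, the bias
    // written below is -ceil(2.3) = -3, rather than the -2 that plain
    // truncation toward zero would give.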
    description_out.depth_bias =
        int32_t(std::ceil(std::abs(poly_offset * poly_offset_host_scale))) *
        (poly_offset < 0.0f ? -1 : 1);
    // "slope computed in subpixels ([...] 1/16)" - R5xx Acceleration.
    description_out.depth_bias_slope_scaled =
        poly_offset_scale * xenos::kPolygonOffsetScaleSubpixelUnit;
  }
  if (tessellated && cvars::d3d12_tessellation_wireframe) {
    description_out.fill_mode_wireframe = 1;
  }
  description_out.depth_clip = !regs.Get<reg::PA_CL_CLIP_CNTL>().clip_disable;
  bool depth_stencil_bound_and_used = false;
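  // With the ROV (pixel shader interlock) output path, depth / stencil and
  // blending are performed by the translated pixel shaders directly against
  // the EDRAM buffer, so the fixed-function depth / stencil, render target and
  // blend state gathered below is only needed for the non-ROV path.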
  if (!edram_rov_used) {
    // Depth/stencil. No stencil, an always-passing depth test and no depth
    // writing means depth is disabled.
    if (bound_depth_and_color_render_target_bits & 1) {
      auto rb_depthcontrol =
          draw_util::GetDepthControlForCurrentEdramMode(regs);
      if (rb_depthcontrol.z_enable) {
        description_out.depth_func = rb_depthcontrol.zfunc;
        description_out.depth_write = rb_depthcontrol.z_write_enable;
      } else {
        description_out.depth_func = xenos::CompareFunction::kAlways;
      }
      if (rb_depthcontrol.stencil_enable) {
        description_out.stencil_enable = 1;
        bool stencil_backface_enable =
            primitive_polygonal && rb_depthcontrol.backface_enable;
        // Per-face masks aren't supported by Direct3D 12 - use the back-face
        // ones only when drawing only back faces.
        Register stencil_ref_mask_reg;
        if (stencil_backface_enable && cull_front) {
          stencil_ref_mask_reg = XE_GPU_REG_RB_STENCILREFMASK_BF;
        } else {
          stencil_ref_mask_reg = XE_GPU_REG_RB_STENCILREFMASK;
        }
        auto stencil_ref_mask =
            regs.Get<reg::RB_STENCILREFMASK>(stencil_ref_mask_reg);
        description_out.stencil_read_mask = stencil_ref_mask.stencilmask;
        description_out.stencil_write_mask = stencil_ref_mask.stencilwritemask;
        description_out.stencil_front_fail_op = rb_depthcontrol.stencilfail;
        description_out.stencil_front_depth_fail_op =
            rb_depthcontrol.stencilzfail;
        description_out.stencil_front_pass_op = rb_depthcontrol.stencilzpass;
        description_out.stencil_front_func = rb_depthcontrol.stencilfunc;
        if (stencil_backface_enable) {
          description_out.stencil_back_fail_op = rb_depthcontrol.stencilfail_bf;
          description_out.stencil_back_depth_fail_op =
              rb_depthcontrol.stencilzfail_bf;
          description_out.stencil_back_pass_op =
              rb_depthcontrol.stencilzpass_bf;
          description_out.stencil_back_func = rb_depthcontrol.stencilfunc_bf;
        } else {
          description_out.stencil_back_fail_op =
              description_out.stencil_front_fail_op;
          description_out.stencil_back_depth_fail_op =
              description_out.stencil_front_depth_fail_op;
          description_out.stencil_back_pass_op =
              description_out.stencil_front_pass_op;
          description_out.stencil_back_func =
              description_out.stencil_front_func;
        }
      }
      // If not binding the DSV, ignore the format in the hash.
      if (description_out.depth_func != xenos::CompareFunction::kAlways ||
          description_out.depth_write || description_out.stencil_enable) {
        description_out.depth_format = xenos::DepthRenderTargetFormat(
            bound_depth_and_color_render_target_formats[0]);
        depth_stencil_bound_and_used = true;
      }
    } else {
      description_out.depth_func = xenos::CompareFunction::kAlways;
    }

    // Render targets and blending state. The blend factor tables have 32
    // entries because the guest value is masked with 0x1F - for safety, all
    // unknown factors are mapped to zero.
    uint32_t color_mask =
        pixel_shader ? command_processor_.GetCurrentColorMask(
                           pixel_shader->shader().writes_color_targets())
                     : 0;
    static const PipelineBlendFactor kBlendFactorMap[32] = {
        /* 0 */ PipelineBlendFactor::kZero,
        /* 1 */ PipelineBlendFactor::kOne,
        /* 2 */ PipelineBlendFactor::kZero,  // ?
        /* 3 */ PipelineBlendFactor::kZero,  // ?
        /* 4 */ PipelineBlendFactor::kSrcColor,
        /* 5 */ PipelineBlendFactor::kInvSrcColor,
        /* 6 */ PipelineBlendFactor::kSrcAlpha,
        /* 7 */ PipelineBlendFactor::kInvSrcAlpha,
        /* 8 */ PipelineBlendFactor::kDestColor,
        /* 9 */ PipelineBlendFactor::kInvDestColor,
        /* 10 */ PipelineBlendFactor::kDestAlpha,
        /* 11 */ PipelineBlendFactor::kInvDestAlpha,
        // CONSTANT_COLOR
        /* 12 */ PipelineBlendFactor::kBlendFactor,
        // ONE_MINUS_CONSTANT_COLOR
        /* 13 */ PipelineBlendFactor::kInvBlendFactor,
        // CONSTANT_ALPHA
        /* 14 */ PipelineBlendFactor::kBlendFactor,
        // ONE_MINUS_CONSTANT_ALPHA
        /* 15 */ PipelineBlendFactor::kInvBlendFactor,
        /* 16 */ PipelineBlendFactor::kSrcAlphaSat,
    };
    // Like kBlendFactorMap, but with color modes changed to alpha. Without
    // this, some pipelines in 545407E0 fail to be created because a color
    // factor is specified for alpha blending.
    static const PipelineBlendFactor kBlendFactorAlphaMap[32] = {
        /* 0 */ PipelineBlendFactor::kZero,
        /* 1 */ PipelineBlendFactor::kOne,
        /* 2 */ PipelineBlendFactor::kZero,  // ?
        /* 3 */ PipelineBlendFactor::kZero,  // ?
        /* 4 */ PipelineBlendFactor::kSrcAlpha,
        /* 5 */ PipelineBlendFactor::kInvSrcAlpha,
        /* 6 */ PipelineBlendFactor::kSrcAlpha,
        /* 7 */ PipelineBlendFactor::kInvSrcAlpha,
        /* 8 */ PipelineBlendFactor::kDestAlpha,
        /* 9 */ PipelineBlendFactor::kInvDestAlpha,
        /* 10 */ PipelineBlendFactor::kDestAlpha,
        /* 11 */ PipelineBlendFactor::kInvDestAlpha,
        /* 12 */ PipelineBlendFactor::kBlendFactor,
        // ONE_MINUS_CONSTANT_COLOR
        /* 13 */ PipelineBlendFactor::kInvBlendFactor,
        // CONSTANT_ALPHA
        /* 14 */ PipelineBlendFactor::kBlendFactor,
        // ONE_MINUS_CONSTANT_ALPHA
        /* 15 */ PipelineBlendFactor::kInvBlendFactor,
        /* 16 */ PipelineBlendFactor::kSrcAlphaSat,
    };
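    // Guest blend factor values 17-31 aren't defined; they fall into the
    // zero-initialized tail of these 32-entry arrays, so (assuming kZero is
    // the zero enumerator) they are treated as kZero, matching the
    // unknown-to-zero policy above.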
    // While it's okay to specify fewer render targets in the pipeline state
    // than are actually bound to the command list (even fewer than the shader
    // writes - though this kind of truncation may only happen at the end, as
    // DXGI_FORMAT_UNKNOWN *requires* a null RTV descriptor to be bound), that
    // isn't done here because the sample counts of all render targets bound
    // via OMSetRenderTargets, even those beyond NumRenderTargets, apparently
    // must match the count set in the pipeline - yet if NumRenderTargets is 0
    // and depth / stencil is disabled too, the sample count must be set to 1,
    // while the command list may still have multisampled render targets bound
    // (happens in the 4D5307E6 main menu).
    // TODO(Triang3l): Investigate interaction of OMSetRenderTargets with
    // non-null depth and DSVFormat DXGI_FORMAT_UNKNOWN in the same case.
    for (uint32_t i = 0; i < 4; ++i) {
      if (!(bound_depth_and_color_render_target_bits &
            (uint32_t(1) << (1 + i)))) {
        continue;
      }
      PipelineRenderTarget& rt = description_out.render_targets[i];
      rt.used = 1;
      auto color_info = regs.Get<reg::RB_COLOR_INFO>(
          reg::RB_COLOR_INFO::rt_register_indices[i]);
      rt.format = xenos::ColorRenderTargetFormat(
          bound_depth_and_color_render_target_formats[1 + i]);
      // TODO(Triang3l): Normalize unused bits of the color write mask.
      rt.write_mask = (color_mask >> (i * 4)) & 0xF;
      if (rt.write_mask) {
        auto blendcontrol = regs.Get<reg::RB_BLENDCONTROL>(
            reg::RB_BLENDCONTROL::rt_register_indices[i]);
        rt.src_blend = kBlendFactorMap[uint32_t(blendcontrol.color_srcblend)];
        rt.dest_blend = kBlendFactorMap[uint32_t(blendcontrol.color_destblend)];
        rt.blend_op = blendcontrol.color_comb_fcn;
        rt.src_blend_alpha =
            kBlendFactorAlphaMap[uint32_t(blendcontrol.alpha_srcblend)];
        rt.dest_blend_alpha =
            kBlendFactorAlphaMap[uint32_t(blendcontrol.alpha_destblend)];
        rt.blend_op_alpha = blendcontrol.alpha_comb_fcn;
      } else {
        rt.src_blend = PipelineBlendFactor::kOne;
        rt.dest_blend = PipelineBlendFactor::kZero;
        rt.blend_op = xenos::BlendOp::kAdd;
        rt.src_blend_alpha = PipelineBlendFactor::kOne;
        rt.dest_blend_alpha = PipelineBlendFactor::kZero;
        rt.blend_op_alpha = xenos::BlendOp::kAdd;
      }
    }
  }
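  // host_msaa_samples is the base-2 logarithm of the host sample count
  // (xenos::MsaaSamples encoding) - CreateD3D12Pipeline converts it back to a
  // count with 1 << host_msaa_samples.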
  xenos::MsaaSamples host_msaa_samples =
      regs.Get<reg::RB_SURFACE_INFO>().msaa_samples;
  if (edram_rov_used) {
    if (host_msaa_samples == xenos::MsaaSamples::k2X) {
      // 2 is not supported in ForcedSampleCount on Nvidia.
      host_msaa_samples = xenos::MsaaSamples::k4X;
    }
  } else {
    if (!(bound_depth_and_color_render_target_bits & ~uint32_t(1)) &&
        !depth_stencil_bound_and_used) {
      // Direct3D 12 requires the sample count to be 1 when no color or depth /
      // stencil render targets are bound.
      // FIXME(Triang3l): Use ForcedSampleCount or some other fallback for
      // sample counting when needed, though with 2x it will be as incorrect as
      // with 1x / 4x anyway; or bind a dummy depth / stencil buffer if really
      // needed.
      host_msaa_samples = xenos::MsaaSamples::k1X;
    }
    // TODO(Triang3l): 4x MSAA fallback when 2x isn't supported.
  }
  description_out.host_msaa_samples = host_msaa_samples;

  return true;
}

ID3D12PipelineState* PipelineCache::CreateD3D12Pipeline(
    const PipelineRuntimeDescription& runtime_description) {
  const PipelineDescription& description = runtime_description.description;

  if (runtime_description.pixel_shader != nullptr) {
    XELOGGPU("Creating graphics pipeline with VS {:016X}, PS {:016X}",
             runtime_description.vertex_shader->shader().ucode_data_hash(),
             runtime_description.pixel_shader->shader().ucode_data_hash());
  } else {
    XELOGGPU("Creating graphics pipeline with VS {:016X}",
             runtime_description.vertex_shader->shader().ucode_data_hash());
  }

  D3D12_GRAPHICS_PIPELINE_STATE_DESC state_desc;
  std::memset(&state_desc, 0, sizeof(state_desc));

  bool edram_rov_used = render_target_cache_.GetPath() ==
                        RenderTargetCache::Path::kPixelShaderInterlock;

  // Root signature.
  state_desc.pRootSignature = runtime_description.root_signature;

  // Index buffer strip cut value.
  switch (description.strip_cut_index) {
    case PipelineStripCutIndex::kFFFF:
      state_desc.IBStripCutValue = D3D12_INDEX_BUFFER_STRIP_CUT_VALUE_0xFFFF;
      break;
    case PipelineStripCutIndex::kFFFFFFFF:
      state_desc.IBStripCutValue =
          D3D12_INDEX_BUFFER_STRIP_CUT_VALUE_0xFFFFFFFF;
      break;
    default:
      state_desc.IBStripCutValue = D3D12_INDEX_BUFFER_STRIP_CUT_VALUE_DISABLED;
      break;
  }

  // Primitive topology, vertex, hull, domain and geometry shaders.
  if (!runtime_description.vertex_shader->is_translated()) {
    XELOGE("Vertex shader {:016X} not translated",
           runtime_description.vertex_shader->shader().ucode_data_hash());
    assert_always();
    return nullptr;
  }
  Shader::HostVertexShaderType host_vertex_shader_type =
      DxbcShaderTranslator::Modification(
          runtime_description.vertex_shader->modification())
          .vertex.host_vertex_shader_type;
  if (host_vertex_shader_type == Shader::HostVertexShaderType::kVertex) {
    state_desc.VS.pShaderBytecode =
        runtime_description.vertex_shader->translated_binary().data();
    state_desc.VS.BytecodeLength =
        runtime_description.vertex_shader->translated_binary().size();
    PipelinePrimitiveTopologyType primitive_topology_type =
        PipelinePrimitiveTopologyType(
            description.primitive_topology_type_or_tessellation_mode);
    switch (primitive_topology_type) {
      case PipelinePrimitiveTopologyType::kPoint:
        state_desc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_POINT;
        break;
      case PipelinePrimitiveTopologyType::kLine:
        state_desc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_LINE;
        break;
      case PipelinePrimitiveTopologyType::kTriangle:
        state_desc.PrimitiveTopologyType =
            D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE;
        break;
      default:
        assert_unhandled_case(primitive_topology_type);
        return nullptr;
    }
    switch (description.geometry_shader) {
      case PipelineGeometryShader::kPointList:
        state_desc.GS.pShaderBytecode = shaders::primitive_point_list_gs;
        state_desc.GS.BytecodeLength = sizeof(shaders::primitive_point_list_gs);
        break;
      case PipelineGeometryShader::kRectangleList:
        state_desc.GS.pShaderBytecode = shaders::primitive_rectangle_list_gs;
        state_desc.GS.BytecodeLength =
            sizeof(shaders::primitive_rectangle_list_gs);
        break;
      case PipelineGeometryShader::kQuadList:
        state_desc.GS.pShaderBytecode = shaders::primitive_quad_list_gs;
        state_desc.GS.BytecodeLength = sizeof(shaders::primitive_quad_list_gs);
        break;
      default:
        break;
    }
  } else {
    state_desc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_PATCH;
    xenos::TessellationMode tessellation_mode = xenos::TessellationMode(
        description.primitive_topology_type_or_tessellation_mode);
    if (tessellation_mode == xenos::TessellationMode::kAdaptive) {
      state_desc.VS.pShaderBytecode = shaders::tessellation_adaptive_vs;
      state_desc.VS.BytecodeLength = sizeof(shaders::tessellation_adaptive_vs);
    } else {
      state_desc.VS.pShaderBytecode = shaders::tessellation_indexed_vs;
      state_desc.VS.BytecodeLength = sizeof(shaders::tessellation_indexed_vs);
    }
    switch (tessellation_mode) {
      case xenos::TessellationMode::kDiscrete:
        switch (host_vertex_shader_type) {
          case Shader::HostVertexShaderType::kTriangleDomainCPIndexed:
          case Shader::HostVertexShaderType::kTriangleDomainPatchIndexed:
            state_desc.HS.pShaderBytecode = shaders::discrete_triangle_hs;
            state_desc.HS.BytecodeLength =
                sizeof(shaders::discrete_triangle_hs);
            break;
          case Shader::HostVertexShaderType::kQuadDomainCPIndexed:
          case Shader::HostVertexShaderType::kQuadDomainPatchIndexed:
            state_desc.HS.pShaderBytecode = shaders::discrete_quad_hs;
            state_desc.HS.BytecodeLength = sizeof(shaders::discrete_quad_hs);
            break;
          default:
            assert_unhandled_case(host_vertex_shader_type);
            return nullptr;
        }
        break;
      case xenos::TessellationMode::kContinuous:
        switch (host_vertex_shader_type) {
          case Shader::HostVertexShaderType::kTriangleDomainCPIndexed:
          case Shader::HostVertexShaderType::kTriangleDomainPatchIndexed:
            state_desc.HS.pShaderBytecode = shaders::continuous_triangle_hs;
            state_desc.HS.BytecodeLength =
                sizeof(shaders::continuous_triangle_hs);
            break;
          case Shader::HostVertexShaderType::kQuadDomainCPIndexed:
          case Shader::HostVertexShaderType::kQuadDomainPatchIndexed:
            state_desc.HS.pShaderBytecode = shaders::continuous_quad_hs;
            state_desc.HS.BytecodeLength = sizeof(shaders::continuous_quad_hs);
            break;
          default:
            assert_unhandled_case(host_vertex_shader_type);
            return nullptr;
        }
        break;
      case xenos::TessellationMode::kAdaptive:
        switch (host_vertex_shader_type) {
          case Shader::HostVertexShaderType::kTriangleDomainPatchIndexed:
            state_desc.HS.pShaderBytecode = shaders::adaptive_triangle_hs;
            state_desc.HS.BytecodeLength =
                sizeof(shaders::adaptive_triangle_hs);
            break;
          case Shader::HostVertexShaderType::kQuadDomainPatchIndexed:
            state_desc.HS.pShaderBytecode = shaders::adaptive_quad_hs;
            state_desc.HS.BytecodeLength = sizeof(shaders::adaptive_quad_hs);
            break;
          default:
            assert_unhandled_case(host_vertex_shader_type);
            return nullptr;
        }
        break;
      default:
        assert_unhandled_case(tessellation_mode);
        return nullptr;
    }
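    // When tessellating, the translated guest vertex shader runs as the host
    // domain shader; the host vertex shader bound above is essentially a
    // pass-through that feeds indices or tessellation factors to the
    // tessellation stages.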
    state_desc.DS.pShaderBytecode =
        runtime_description.vertex_shader->translated_binary().data();
    state_desc.DS.BytecodeLength =
        runtime_description.vertex_shader->translated_binary().size();
  }

  // Pixel shader.
  if (runtime_description.pixel_shader != nullptr) {
    if (!runtime_description.pixel_shader->is_translated()) {
      XELOGE("Pixel shader {:016X} not translated",
             runtime_description.pixel_shader->shader().ucode_data_hash());
      assert_always();
      return nullptr;
    }
    state_desc.PS.pShaderBytecode =
        runtime_description.pixel_shader->translated_binary().data();
    state_desc.PS.BytecodeLength =
        runtime_description.pixel_shader->translated_binary().size();
  } else if (edram_rov_used) {
    state_desc.PS.pShaderBytecode = depth_only_pixel_shader_.data();
    state_desc.PS.BytecodeLength = depth_only_pixel_shader_.size();
  } else {
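    // Depth-only draw without ROV - if the guest depth buffer is 20e4 float
    // (kD24FS8) and conversion is done on output, a small host pixel shader is
    // still needed to re-emit the depth truncated or rounded to float24
    // precision, so the stored values match what the guest expects.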
    if ((description.depth_func != xenos::CompareFunction::kAlways ||
         description.depth_write) &&
        description.depth_format == xenos::DepthRenderTargetFormat::kD24FS8) {
      switch (render_target_cache_.depth_float24_conversion()) {
        case RenderTargetCache::DepthFloat24Conversion::kOnOutputTruncating:
          state_desc.PS.pShaderBytecode = shaders::float24_truncate_ps;
          state_desc.PS.BytecodeLength = sizeof(shaders::float24_truncate_ps);
          break;
        case RenderTargetCache::DepthFloat24Conversion::kOnOutputRounding:
          state_desc.PS.pShaderBytecode = shaders::float24_round_ps;
          state_desc.PS.BytecodeLength = sizeof(shaders::float24_round_ps);
          break;
        default:
          break;
      }
    }
  }

  // Rasterizer state.
  state_desc.RasterizerState.FillMode = description.fill_mode_wireframe
                                            ? D3D12_FILL_MODE_WIREFRAME
                                            : D3D12_FILL_MODE_SOLID;
  switch (description.cull_mode) {
    case PipelineCullMode::kFront:
      state_desc.RasterizerState.CullMode = D3D12_CULL_MODE_FRONT;
      break;
    case PipelineCullMode::kBack:
      state_desc.RasterizerState.CullMode = D3D12_CULL_MODE_BACK;
      break;
    default:
      assert_true(description.cull_mode == PipelineCullMode::kNone ||
                  description.cull_mode ==
                      PipelineCullMode::kDisableRasterization);
      state_desc.RasterizerState.CullMode = D3D12_CULL_MODE_NONE;
      break;
  }
  state_desc.RasterizerState.FrontCounterClockwise =
      description.front_counter_clockwise ? TRUE : FALSE;
  state_desc.RasterizerState.DepthBias = description.depth_bias;
  state_desc.RasterizerState.DepthBiasClamp = 0.0f;
  // With non-square resolution scaling, the slope may be scaled along only one
  // axis, so take the maximum of the two scale factors to make sure the
  // worst-case impact is compensated. More bias is better than less bias,
  // because less bias makes Z fighting with the background more likely.
  state_desc.RasterizerState.SlopeScaledDepthBias =
      description.depth_bias_slope_scaled *
      float(std::max(render_target_cache_.GetResolutionScaleX(),
                     render_target_cache_.GetResolutionScaleY()));
  state_desc.RasterizerState.DepthClipEnable =
      description.depth_clip ? TRUE : FALSE;
  uint32_t msaa_sample_count = uint32_t(1)
                               << uint32_t(description.host_msaa_samples);
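  // With the ROV output path no color render targets are set in the pipeline -
  // rasterization runs at ForcedSampleCount with SampleDesc.Count of 1, and
  // the pixel shader itself writes the covered samples to the EDRAM buffer.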
  if (edram_rov_used) {
    // Only 1, 4, 8 and (not on all GPUs) 16 are allowed in ForcedSampleCount -
    // for 2x, sample 0 of 4x is used as 0 and sample 3 as 1 instead (not
    // exactly the same sample positions, but still top-left and bottom-right -
    // however, this can be adjusted with programmable sample positions).
    assert_true(msaa_sample_count == 1 || msaa_sample_count == 4);
    if (msaa_sample_count != 1 && msaa_sample_count != 4) {
      return nullptr;
    }
    state_desc.RasterizerState.ForcedSampleCount =
        uint32_t(1) << uint32_t(description.host_msaa_samples);
  }

  // Sample mask and description.
  state_desc.SampleMask = UINT_MAX;
  // TODO(Triang3l): 4x MSAA fallback when 2x isn't supported without ROV.
  if (edram_rov_used) {
    state_desc.SampleDesc.Count = 1;
  } else {
    assert_true(msaa_sample_count <= 4);
    if (msaa_sample_count > 4) {
      return nullptr;
    }
    if (msaa_sample_count == 2 && !render_target_cache_.msaa_2x_supported()) {
      // Using sample 0 as 0 and 3 as 1 for 2x instead (not exactly the same
      // sample positions, but still top-left and bottom-right - however, this
      // can be adjusted with programmable sample positions).
      state_desc.SampleMask = 0b1001;
      state_desc.SampleDesc.Count = 4;
    } else {
      state_desc.SampleDesc.Count = msaa_sample_count;
    }
  }

  if (!edram_rov_used) {
    // Depth/stencil.
    if (description.depth_func != xenos::CompareFunction::kAlways ||
        description.depth_write) {
      state_desc.DepthStencilState.DepthEnable = TRUE;
      state_desc.DepthStencilState.DepthWriteMask =
          description.depth_write ? D3D12_DEPTH_WRITE_MASK_ALL
                                  : D3D12_DEPTH_WRITE_MASK_ZERO;
      // Guest comparison functions are a 3-bit mask (bit 0 - less, bit 1 -
      // equal, bit 2 - greater); Direct3D 12 uses the same encoding, just
      // offset by one (D3D12_COMPARISON_FUNC_NEVER is 1).
      state_desc.DepthStencilState.DepthFunc =
          D3D12_COMPARISON_FUNC(uint32_t(D3D12_COMPARISON_FUNC_NEVER) +
                                uint32_t(description.depth_func));
    }
    if (description.stencil_enable) {
      state_desc.DepthStencilState.StencilEnable = TRUE;
      state_desc.DepthStencilState.StencilReadMask =
          description.stencil_read_mask;
      state_desc.DepthStencilState.StencilWriteMask =
          description.stencil_write_mask;
      // Stencil operations are the same in Direct3D 12 too but plus one.
      state_desc.DepthStencilState.FrontFace.StencilFailOp =
          D3D12_STENCIL_OP(uint32_t(D3D12_STENCIL_OP_KEEP) +
                           uint32_t(description.stencil_front_fail_op));
      state_desc.DepthStencilState.FrontFace.StencilDepthFailOp =
          D3D12_STENCIL_OP(uint32_t(D3D12_STENCIL_OP_KEEP) +
                           uint32_t(description.stencil_front_depth_fail_op));
      state_desc.DepthStencilState.FrontFace.StencilPassOp =
          D3D12_STENCIL_OP(uint32_t(D3D12_STENCIL_OP_KEEP) +
                           uint32_t(description.stencil_front_pass_op));
      state_desc.DepthStencilState.FrontFace.StencilFunc =
          D3D12_COMPARISON_FUNC(uint32_t(D3D12_COMPARISON_FUNC_NEVER) +
                                uint32_t(description.stencil_front_func));
      state_desc.DepthStencilState.BackFace.StencilFailOp =
          D3D12_STENCIL_OP(uint32_t(D3D12_STENCIL_OP_KEEP) +
                           uint32_t(description.stencil_back_fail_op));
      state_desc.DepthStencilState.BackFace.StencilDepthFailOp =
          D3D12_STENCIL_OP(uint32_t(D3D12_STENCIL_OP_KEEP) +
                           uint32_t(description.stencil_back_depth_fail_op));
      state_desc.DepthStencilState.BackFace.StencilPassOp =
          D3D12_STENCIL_OP(uint32_t(D3D12_STENCIL_OP_KEEP) +
                           uint32_t(description.stencil_back_pass_op));
      state_desc.DepthStencilState.BackFace.StencilFunc =
          D3D12_COMPARISON_FUNC(uint32_t(D3D12_COMPARISON_FUNC_NEVER) +
                                uint32_t(description.stencil_back_func));
    }
    if (state_desc.DepthStencilState.DepthEnable ||
        state_desc.DepthStencilState.StencilEnable) {
      state_desc.DSVFormat = D3D12RenderTargetCache::GetDepthDSVDXGIFormat(
          description.depth_format);
    }

    // Render targets and blending.
    state_desc.BlendState.IndependentBlendEnable = TRUE;
    static const D3D12_BLEND kBlendFactorMap[] = {
        D3D12_BLEND_ZERO,          D3D12_BLEND_ONE,
        D3D12_BLEND_SRC_COLOR,     D3D12_BLEND_INV_SRC_COLOR,
        D3D12_BLEND_SRC_ALPHA,     D3D12_BLEND_INV_SRC_ALPHA,
        D3D12_BLEND_DEST_COLOR,    D3D12_BLEND_INV_DEST_COLOR,
        D3D12_BLEND_DEST_ALPHA,    D3D12_BLEND_INV_DEST_ALPHA,
        D3D12_BLEND_BLEND_FACTOR,  D3D12_BLEND_INV_BLEND_FACTOR,
        D3D12_BLEND_SRC_ALPHA_SAT,
    };
    static const D3D12_BLEND_OP kBlendOpMap[] = {
        D3D12_BLEND_OP_ADD, D3D12_BLEND_OP_SUBTRACT, D3D12_BLEND_OP_MIN,
        D3D12_BLEND_OP_MAX, D3D12_BLEND_OP_REV_SUBTRACT,
    };
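    // kBlendFactorMap is indexed with the PipelineBlendFactor values already
    // normalized from the 5-bit guest factors in GetCurrentStateDescription;
    // kBlendOpMap follows the guest xenos::BlendOp encoding (add, subtract,
    // min, max, reverse subtract).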
    for (uint32_t i = 0; i < xenos::kMaxColorRenderTargets; ++i) {
      const PipelineRenderTarget& rt = description.render_targets[i];
      if (!rt.used) {
        // Null RTV descriptors can be used for slots with DXGI_FORMAT_UNKNOWN
        // in the pipeline state.
        state_desc.RTVFormats[i] = DXGI_FORMAT_UNKNOWN;
        continue;
      }
      state_desc.NumRenderTargets = i + 1;
      state_desc.RTVFormats[i] =
          render_target_cache_.GetColorDrawDXGIFormat(rt.format);
      if (state_desc.RTVFormats[i] == DXGI_FORMAT_UNKNOWN) {
        assert_always();
        return nullptr;
      }
      D3D12_RENDER_TARGET_BLEND_DESC& blend_desc =
          state_desc.BlendState.RenderTarget[i];
      // Treat 1 * src + 0 * dest as disabled blending (there are opaque
      // surfaces drawn with blending enabled but with 1 * src + 0 * dest
      // factors, for example in 415607E6) - GPU performance is better when not
      // blending.
      if (rt.src_blend != PipelineBlendFactor::kOne ||
          rt.dest_blend != PipelineBlendFactor::kZero ||
          rt.blend_op != xenos::BlendOp::kAdd ||
          rt.src_blend_alpha != PipelineBlendFactor::kOne ||
          rt.dest_blend_alpha != PipelineBlendFactor::kZero ||
          rt.blend_op_alpha != xenos::BlendOp::kAdd) {
        blend_desc.BlendEnable = TRUE;
        blend_desc.SrcBlend = kBlendFactorMap[uint32_t(rt.src_blend)];
        blend_desc.DestBlend = kBlendFactorMap[uint32_t(rt.dest_blend)];
        blend_desc.BlendOp = kBlendOpMap[uint32_t(rt.blend_op)];
        blend_desc.SrcBlendAlpha =
            kBlendFactorMap[uint32_t(rt.src_blend_alpha)];
        blend_desc.DestBlendAlpha =
            kBlendFactorMap[uint32_t(rt.dest_blend_alpha)];
        blend_desc.BlendOpAlpha = kBlendOpMap[uint32_t(rt.blend_op_alpha)];
      }
      blend_desc.RenderTargetWriteMask = rt.write_mask;
    }
  }

  // Disable rasterization if needed, the way it's done in Direct3D 12 by
  // design - by unbinding the pixel shader and disabling depth / stencil
  // (parameter combinations that make no difference when rasterization is
  // disabled have already been normalized in GetCurrentStateDescription).
  // TODO(Triang3l): When a combination of parameters (no host pixel shader and
  // no depth / stencil without ROV) would disable rasterization while it's
  // still needed (for occlusion query sample counting), ensure rasterization
  // happens (by binding an empty pixel shader, or maybe via ForcedSampleCount
  // when not using 2x MSAA - its requirements for OMSetRenderTargets need some
  // investigation though).
  if (description.cull_mode == PipelineCullMode::kDisableRasterization) {
    state_desc.PS.pShaderBytecode = nullptr;
    state_desc.PS.BytecodeLength = 0;
    state_desc.DepthStencilState.DepthEnable = FALSE;
    state_desc.DepthStencilState.StencilEnable = FALSE;
  }

  // Create the D3D12 pipeline state object.
  auto device =
      command_processor_.GetD3D12Context().GetD3D12Provider().GetDevice();
  ID3D12PipelineState* state;
  if (FAILED(device->CreateGraphicsPipelineState(&state_desc,
                                                 IID_PPV_ARGS(&state)))) {
    if (runtime_description.pixel_shader != nullptr) {
      XELOGE("Failed to create graphics pipeline with VS {:016X}, PS {:016X}",
             runtime_description.vertex_shader->shader().ucode_data_hash(),
             runtime_description.pixel_shader->shader().ucode_data_hash());
    } else {
      XELOGE("Failed to create graphics pipeline with VS {:016X}",
             runtime_description.vertex_shader->shader().ucode_data_hash());
    }
    return nullptr;
  }
  std::wstring name;
  if (runtime_description.pixel_shader != nullptr) {
    name = fmt::format(
        L"VS {:016X}, PS {:016X}",
        runtime_description.vertex_shader->shader().ucode_data_hash(),
        runtime_description.pixel_shader->shader().ucode_data_hash());
  } else {
    name = fmt::format(
        L"VS {:016X}",
        runtime_description.vertex_shader->shader().ucode_data_hash());
  }
  state->SetName(name.c_str());
  return state;
}

void PipelineCache::StorageWriteThread() {
  ShaderStoredHeader shader_header;
  // Don't leak anything in unused bits.
  std::memset(&shader_header, 0, sizeof(shader_header));

  std::vector<uint32_t> ucode_guest_endian;
  ucode_guest_endian.reserve(0xFFFF);

  bool flush_shaders = false;
  bool flush_pipelines = false;
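  // Flush requests are latched while holding the request lock below, but the
  // actual fflush calls are done at the top of the next loop iteration,
  // outside the lock, so producers aren't blocked by storage I/O.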

  while (true) {
    if (flush_shaders) {
      flush_shaders = false;
      assert_not_null(shader_storage_file_);
      fflush(shader_storage_file_);
    }
    if (flush_pipelines) {
      flush_pipelines = false;
      assert_not_null(pipeline_storage_file_);
      fflush(pipeline_storage_file_);
    }

    const Shader* shader = nullptr;
    PipelineStoredDescription pipeline_description;
    bool write_pipeline = false;
    {
      std::unique_lock<std::mutex> lock(storage_write_request_lock_);
      if (storage_write_thread_shutdown_) {
        return;
      }
      if (!storage_write_shader_queue_.empty()) {
        shader = storage_write_shader_queue_.front();
        storage_write_shader_queue_.pop_front();
      } else if (storage_write_flush_shaders_) {
        storage_write_flush_shaders_ = false;
        flush_shaders = true;
      }
      if (!storage_write_pipeline_queue_.empty()) {
        std::memcpy(&pipeline_description,
                    &storage_write_pipeline_queue_.front(),
                    sizeof(pipeline_description));
        storage_write_pipeline_queue_.pop_front();
        write_pipeline = true;
      } else if (storage_write_flush_pipelines_) {
        storage_write_flush_pipelines_ = false;
        flush_pipelines = true;
      }
      if (!shader && !write_pipeline) {
        storage_write_request_cond_.wait(lock);
        continue;
      }
    }

    if (shader) {
      shader_header.ucode_data_hash = shader->ucode_data_hash();
      shader_header.ucode_dword_count = shader->ucode_dword_count();
      shader_header.type = shader->type();
      assert_not_null(shader_storage_file_);
      fwrite(&shader_header, sizeof(shader_header), 1, shader_storage_file_);
      if (shader_header.ucode_dword_count) {
        ucode_guest_endian.resize(shader_header.ucode_dword_count);
        // Need to swap because the hash is calculated for the shader with
        // guest endianness.
        xe::copy_and_swap(ucode_guest_endian.data(), shader->ucode_dwords(),
                          shader_header.ucode_dword_count);
        fwrite(ucode_guest_endian.data(),
               shader_header.ucode_dword_count * sizeof(uint32_t), 1,
               shader_storage_file_);
      }
    }

    if (write_pipeline) {
      assert_not_null(pipeline_storage_file_);
      fwrite(&pipeline_description, sizeof(pipeline_description), 1,
             pipeline_storage_file_);
    }
  }
}

void PipelineCache::CreationThread(size_t thread_index) {
  while (true) {
    Pipeline* pipeline_to_create = nullptr;

    // Check whether to shut down or to set the completion event, and dequeue a
    // pipeline if there is one.
    {
      std::unique_lock<std::mutex> lock(creation_request_lock_);
      if (thread_index >= creation_threads_shutdown_from_ ||
          creation_queue_.empty()) {
        if (creation_completion_set_event_ && creation_threads_busy_ == 0) {
          // Last pipeline in the queue created - signal the event if
          // requested.
          creation_completion_set_event_ = false;
          creation_completion_event_->Set();
        }
        if (thread_index >= creation_threads_shutdown_from_) {
          return;
        }
        creation_request_cond_.wait(lock);
        continue;
      }
      // Take the pipeline from the queue and increment the busy thread count
      // until the pipeline is created - other threads must be able to dequeue
      // requests, but can't set the completion event until the pipelines are
      // fully created (rather than merely started).
      pipeline_to_create = creation_queue_.front();
      creation_queue_.pop_front();
      ++creation_threads_busy_;
    }

    // Create the D3D12 pipeline state object.
    pipeline_to_create->state =
        CreateD3D12Pipeline(pipeline_to_create->description);

    // Pipeline created - the thread is not busy anymore, and it's safe to set
    // the completion event if needed (at the next iteration, or in some other
    // thread).
    {
      std::lock_guard<std::mutex> lock(creation_request_lock_);
      --creation_threads_busy_;
    }
  }
}

void PipelineCache::CreateQueuedPipelinesOnProcessorThread() {
  assert_false(creation_threads_.empty());
  while (true) {
    Pipeline* pipeline_to_create;
    {
      std::lock_guard<std::mutex> lock(creation_request_lock_);
      if (creation_queue_.empty()) {
        break;
      }
      pipeline_to_create = creation_queue_.front();
      creation_queue_.pop_front();
    }
    pipeline_to_create->state =
        CreateD3D12Pipeline(pipeline_to_create->description);
  }
}

}  // namespace d3d12
}  // namespace gpu
}  // namespace xe