mirror of
https://github.com/xenia-project/xenia.git
synced 2025-12-06 07:12:03 +01:00
3374 lines
145 KiB
C++
3374 lines
145 KiB
C++
/**
|
|
******************************************************************************
|
|
* Xenia : Xbox 360 Emulator Research Project *
|
|
******************************************************************************
|
|
* Copyright 2022 Ben Vanik. All rights reserved. *
|
|
* Released under the BSD license - see LICENSE in the root for more details. *
|
|
******************************************************************************
|
|
*/
|
|
|
|
#include "xenia/gpu/d3d12/pipeline_cache.h"
|
|
|
|
#include <algorithm>
|
|
#include <atomic>
|
|
#include <cinttypes>
|
|
#include <cmath>
|
|
#include <cstring>
|
|
#include <deque>
|
|
#include <mutex>
|
|
#include <set>
|
|
#include <utility>
|
|
#include <vector>
|
|
|
|
#include "third_party/dxbc/DXBCChecksum.h"
|
|
#include "third_party/fmt/include/fmt/format.h"
|
|
#include "xenia/base/assert.h"
|
|
#include "xenia/base/byte_order.h"
|
|
#include "xenia/base/clock.h"
|
|
#include "xenia/base/cvar.h"
|
|
#include "xenia/base/filesystem.h"
|
|
#include "xenia/base/logging.h"
|
|
#include "xenia/base/math.h"
|
|
#include "xenia/base/profiling.h"
|
|
#include "xenia/base/string.h"
|
|
#include "xenia/base/string_buffer.h"
|
|
#include "xenia/base/xxhash.h"
|
|
#include "xenia/gpu/d3d12/d3d12_command_processor.h"
|
|
#include "xenia/gpu/d3d12/d3d12_render_target_cache.h"
|
|
#include "xenia/gpu/draw_util.h"
|
|
#include "xenia/gpu/dxbc.h"
|
|
#include "xenia/gpu/dxbc_shader_translator.h"
|
|
#include "xenia/gpu/gpu_flags.h"
|
|
#include "xenia/gpu/registers.h"
|
|
#include "xenia/gpu/xenos.h"
|
|
#include "xenia/ui/d3d12/d3d12_util.h"
|
|
|
|
// Configuration variables of the Direct3D 12 pipeline cache (all registered in
// the "D3D12" category).

// Off by default: disassembly of the generated DXBC shaders.
DEFINE_bool(d3d12_dxbc_disasm, false,
            "Disassemble DXBC shaders after generation.", "D3D12");

// Off by default: disassembly of the DXIL obtained by converting the generated
// DXBC. Initialize() only tries to create the converter and compiler objects
// when this is enabled.
DEFINE_bool(
    d3d12_dxbc_disasm_dxilconv, false,
    "Disassemble DXBC shaders after conversion to DXIL, if DXIL shaders are "
    "supported by the OS, and DirectX Shader Compiler DLLs available at "
    "https://github.com/microsoft/DirectXShaderCompiler/releases are present.",
    "D3D12");

// Number of pipeline creation threads launched by Initialize(): negative for
// automatic selection, zero for none, positive for an explicit count clamped
// to the logical processor count.
DEFINE_int32(
    d3d12_pipeline_creation_threads, -1,
    "Number of threads used for graphics pipeline creation. -1 to calculate "
    "automatically (75% of logical CPU cores), a positive number to specify "
    "the number of threads explicitly (up to the number of logical CPU cores), "
    "0 to disable multithreaded pipeline creation.",
    "D3D12");

// Off by default: wireframe display of tessellated surfaces.
DEFINE_bool(d3d12_tessellation_wireframe, false,
            "Display tessellated surfaces as wireframe for debugging.",
            "D3D12");
|
|
|
|
namespace xe {
|
|
namespace gpu {
|
|
namespace d3d12 {
|
|
|
|
// Generated with `xb buildshaders`.
|
|
namespace shaders {
|
|
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/adaptive_quad_hs.h"
|
|
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/adaptive_triangle_hs.h"
|
|
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/continuous_quad_1cp_hs.h"
|
|
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/continuous_quad_4cp_hs.h"
|
|
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/continuous_triangle_1cp_hs.h"
|
|
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/continuous_triangle_3cp_hs.h"
|
|
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/discrete_quad_1cp_hs.h"
|
|
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/discrete_quad_4cp_hs.h"
|
|
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/discrete_triangle_1cp_hs.h"
|
|
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/discrete_triangle_3cp_hs.h"
|
|
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/float24_round_ps.h"
|
|
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/float24_truncate_ps.h"
|
|
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/tessellation_adaptive_vs.h"
|
|
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/tessellation_indexed_vs.h"
|
|
} // namespace shaders
|
|
|
|
// Stores references to the owning command processor, the register file and the
// render target cache, creates the DXBC shader translator configured for the
// current device and render target path, and, on the pixel shader interlock
// path, pre-builds the depth-only pixel shader.
PipelineCache::PipelineCache(D3D12CommandProcessor& command_processor,
                             const RegisterFile& register_file,
                             const D3D12RenderTargetCache& render_target_cache,
                             bool bindless_resources_used)
    : command_processor_(command_processor),
      register_file_(register_file),
      render_target_cache_(render_target_cache),
      bindless_resources_used_(bindless_resources_used) {
  const ui::d3d12::D3D12Provider& d3d12_provider =
      command_processor_.GetD3D12Provider();

  // Whether render targets are implemented via pixel shader interlock rather
  // than host render targets.
  const bool uses_pixel_shader_interlock =
      render_target_cache.GetPath() ==
      RenderTargetCache::Path::kPixelShaderInterlock;

  shader_translator_ = std::make_unique<DxbcShaderTranslator>(
      d3d12_provider.GetAdapterVendorID(), bindless_resources_used_,
      uses_pixel_shader_interlock,
      render_target_cache_.gamma_render_target_as_srgb(),
      render_target_cache_.msaa_2x_supported(),
      render_target_cache_.draw_resolution_scale_x(),
      render_target_cache_.draw_resolution_scale_y(),
      d3d12_provider.GetGraphicsAnalysis() != nullptr);

  if (!uses_pixel_shader_interlock) {
    return;
  }
  // The depth-only pixel shader is only generated for the interlock path.
  depth_only_pixel_shader_ =
      std::move(shader_translator_->CreateDepthOnlyPixelShader());
}
|
|
|
|
// Releases everything owned by the cache by running the regular shutdown path.
PipelineCache::~PipelineCache() {
  Shutdown();
}
|
|
|
|
bool PipelineCache::Initialize() {
|
|
const ui::d3d12::D3D12Provider& provider =
|
|
command_processor_.GetD3D12Provider();
|
|
|
|
// Initialize the command processor thread DXIL objects.
|
|
dxbc_converter_ = nullptr;
|
|
dxc_utils_ = nullptr;
|
|
dxc_compiler_ = nullptr;
|
|
if (cvars::d3d12_dxbc_disasm_dxilconv) {
|
|
if (FAILED(provider.DxbcConverterCreateInstance(
|
|
CLSID_DxbcConverter, IID_PPV_ARGS(&dxbc_converter_)))) {
|
|
XELOGE(
|
|
"Failed to create DxbcConverter, converted DXIL disassembly for "
|
|
"debugging will be unavailable");
|
|
}
|
|
if (FAILED(provider.DxcCreateInstance(CLSID_DxcUtils,
|
|
IID_PPV_ARGS(&dxc_utils_)))) {
|
|
XELOGE(
|
|
"Failed to create DxcUtils, converted DXIL disassembly for debugging "
|
|
"will be unavailable");
|
|
}
|
|
if (FAILED(provider.DxcCreateInstance(CLSID_DxcCompiler,
|
|
IID_PPV_ARGS(&dxc_compiler_)))) {
|
|
XELOGE(
|
|
"Failed to create DxcCompiler, converted DXIL disassembly for "
|
|
"debugging will be unavailable");
|
|
}
|
|
}
|
|
|
|
uint32_t logical_processor_count = xe::threading::logical_processor_count();
|
|
if (!logical_processor_count) {
|
|
// Pick some reasonable amount if couldn't determine the number of cores.
|
|
logical_processor_count = 6;
|
|
}
|
|
// Initialize creation thread synchronization data even if not using creation
|
|
// threads because they may be used anyway to create pipelines from the
|
|
// storage.
|
|
creation_threads_busy_ = 0;
|
|
creation_completion_event_ =
|
|
xe::threading::Event::CreateManualResetEvent(true);
|
|
assert_not_null(creation_completion_event_);
|
|
creation_completion_set_event_ = false;
|
|
creation_threads_shutdown_from_ = SIZE_MAX;
|
|
if (cvars::d3d12_pipeline_creation_threads != 0) {
|
|
size_t creation_thread_count;
|
|
if (cvars::d3d12_pipeline_creation_threads < 0) {
|
|
creation_thread_count =
|
|
std::max(logical_processor_count * 3 / 4, uint32_t(1));
|
|
} else {
|
|
creation_thread_count =
|
|
std::min(uint32_t(cvars::d3d12_pipeline_creation_threads),
|
|
logical_processor_count);
|
|
}
|
|
for (size_t i = 0; i < creation_thread_count; ++i) {
|
|
std::unique_ptr<xe::threading::Thread> creation_thread =
|
|
xe::threading::Thread::Create({}, [this, i]() { CreationThread(i); });
|
|
assert_not_null(creation_thread);
|
|
creation_thread->set_name("D3D12 Pipelines");
|
|
creation_threads_.push_back(std::move(creation_thread));
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
void PipelineCache::Shutdown() {
|
|
// Shut down all threads, before destroying the pipelines since they may be
|
|
// creating them.
|
|
if (!creation_threads_.empty()) {
|
|
{
|
|
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
|
creation_threads_shutdown_from_ = 0;
|
|
}
|
|
creation_request_cond_.notify_all();
|
|
for (size_t i = 0; i < creation_threads_.size(); ++i) {
|
|
xe::threading::Wait(creation_threads_[i].get(), false);
|
|
}
|
|
creation_threads_.clear();
|
|
}
|
|
creation_completion_event_.reset();
|
|
|
|
// Shut down the persistent shader / pipeline storage.
|
|
ShutdownShaderStorage();
|
|
|
|
// Destroy all pipelines.
|
|
current_pipeline_ = nullptr;
|
|
for (auto it : pipelines_) {
|
|
it.second->state->Release();
|
|
delete it.second;
|
|
}
|
|
pipelines_.clear();
|
|
COUNT_profile_set("gpu/pipeline_cache/pipelines", 0);
|
|
|
|
// Destroy all shaders.
|
|
if (bindless_resources_used_) {
|
|
bindless_sampler_layout_map_.clear();
|
|
bindless_sampler_layouts_.clear();
|
|
}
|
|
texture_binding_layout_map_.clear();
|
|
texture_binding_layouts_.clear();
|
|
for (auto it : shaders_) {
|
|
delete it.second;
|
|
}
|
|
shaders_.clear();
|
|
shader_storage_index_ = 0;
|
|
|
|
// Shut down shader translation.
|
|
ui::d3d12::util::ReleaseAndNull(dxc_compiler_);
|
|
ui::d3d12::util::ReleaseAndNull(dxc_utils_);
|
|
ui::d3d12::util::ReleaseAndNull(dxbc_converter_);
|
|
}
|
|
|
|
void PipelineCache::InitializeShaderStorage(
|
|
const std::filesystem::path& cache_root, uint32_t title_id, bool blocking) {
|
|
ShutdownShaderStorage();
|
|
|
|
auto shader_storage_root = cache_root / "shaders";
|
|
// For files that can be moved between different hosts.
|
|
// Host PSO blobs - if ever added - should be stored in shaders/local/ (they
|
|
// currently aren't used because because they may be not very practical -
|
|
// would need to invalidate them every commit likely, and additional I/O
|
|
// cost - though D3D's internal validation would possibly be enough to ensure
|
|
// they are up to date).
|
|
auto shader_storage_shareable_root = shader_storage_root / "shareable";
|
|
if (!std::filesystem::exists(shader_storage_shareable_root)) {
|
|
if (!std::filesystem::create_directories(shader_storage_shareable_root)) {
|
|
XELOGE(
|
|
"Failed to create the shareable shader storage directory, persistent "
|
|
"shader storage will be disabled: {}",
|
|
xe::path_to_utf8(shader_storage_shareable_root));
|
|
return;
|
|
}
|
|
}
|
|
|
|
bool edram_rov_used = render_target_cache_.GetPath() ==
|
|
RenderTargetCache::Path::kPixelShaderInterlock;
|
|
|
|
// Initialize the pipeline storage stream - read pipeline descriptions and
|
|
// collect used shader modifications to translate.
|
|
std::vector<PipelineStoredDescription> pipeline_stored_descriptions;
|
|
// <Shader hash, modification bits>.
|
|
std::set<std::pair<uint64_t, uint64_t>> shader_translations_needed;
|
|
auto pipeline_storage_file_path =
|
|
shader_storage_shareable_root /
|
|
fmt::format("{:08X}.{}.d3d12.xpso", title_id,
|
|
edram_rov_used ? "rov" : "rtv");
|
|
pipeline_storage_file_ =
|
|
xe::filesystem::OpenFile(pipeline_storage_file_path, "a+b");
|
|
if (!pipeline_storage_file_) {
|
|
XELOGE(
|
|
"Failed to open the Direct3D 12 pipeline description storage file for "
|
|
"writing, persistent shader storage will be disabled: {}",
|
|
xe::path_to_utf8(pipeline_storage_file_path));
|
|
return;
|
|
}
|
|
pipeline_storage_file_flush_needed_ = false;
|
|
// 'XEPS'.
|
|
const uint32_t pipeline_storage_magic = 0x53504558;
|
|
// 'DXRO' or 'DXRT'.
|
|
const uint32_t pipeline_storage_magic_api =
|
|
edram_rov_used ? 0x4F525844 : 0x54525844;
|
|
const uint32_t pipeline_storage_version_swapped =
|
|
xe::byte_swap(std::max(PipelineDescription::kVersion,
|
|
DxbcShaderTranslator::Modification::kVersion));
|
|
struct {
|
|
uint32_t magic;
|
|
uint32_t magic_api;
|
|
uint32_t version_swapped;
|
|
} pipeline_storage_file_header;
|
|
if (fread(&pipeline_storage_file_header, sizeof(pipeline_storage_file_header),
|
|
1, pipeline_storage_file_) &&
|
|
pipeline_storage_file_header.magic == pipeline_storage_magic &&
|
|
pipeline_storage_file_header.magic_api == pipeline_storage_magic_api &&
|
|
pipeline_storage_file_header.version_swapped ==
|
|
pipeline_storage_version_swapped) {
|
|
xe::filesystem::Seek(pipeline_storage_file_, 0, SEEK_END);
|
|
int64_t pipeline_storage_told_end =
|
|
xe::filesystem::Tell(pipeline_storage_file_);
|
|
size_t pipeline_storage_told_count =
|
|
size_t(pipeline_storage_told_end >=
|
|
int64_t(sizeof(pipeline_storage_file_header))
|
|
? (uint64_t(pipeline_storage_told_end) -
|
|
sizeof(pipeline_storage_file_header)) /
|
|
sizeof(PipelineStoredDescription)
|
|
: 0);
|
|
if (pipeline_storage_told_count &&
|
|
xe::filesystem::Seek(pipeline_storage_file_,
|
|
int64_t(sizeof(pipeline_storage_file_header)),
|
|
SEEK_SET)) {
|
|
pipeline_stored_descriptions.resize(pipeline_storage_told_count);
|
|
pipeline_stored_descriptions.resize(
|
|
fread(pipeline_stored_descriptions.data(),
|
|
sizeof(PipelineStoredDescription), pipeline_storage_told_count,
|
|
pipeline_storage_file_));
|
|
size_t pipeline_storage_read_count = pipeline_stored_descriptions.size();
|
|
for (size_t i = 0; i < pipeline_storage_read_count; ++i) {
|
|
const PipelineStoredDescription& pipeline_stored_description =
|
|
pipeline_stored_descriptions[i];
|
|
// Validate file integrity, stop and truncate the stream if data is
|
|
// corrupted.
|
|
if (XXH3_64bits(&pipeline_stored_description.description,
|
|
sizeof(pipeline_stored_description.description)) !=
|
|
pipeline_stored_description.description_hash) {
|
|
pipeline_stored_descriptions.resize(i);
|
|
break;
|
|
}
|
|
// TODO(Triang3l): On Vulkan, skip pipelines requiring unsupported
|
|
// device features (to keep the cache files mostly shareable across
|
|
// devices).
|
|
// Mark the shader modifications as needed for translation.
|
|
shader_translations_needed.emplace(
|
|
pipeline_stored_description.description.vertex_shader_hash,
|
|
pipeline_stored_description.description.vertex_shader_modification);
|
|
if (pipeline_stored_description.description.pixel_shader_hash) {
|
|
shader_translations_needed.emplace(
|
|
pipeline_stored_description.description.pixel_shader_hash,
|
|
pipeline_stored_description.description
|
|
.pixel_shader_modification);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
size_t logical_processor_count = xe::threading::logical_processor_count();
|
|
if (!logical_processor_count) {
|
|
// Pick some reasonable amount if couldn't determine the number of cores.
|
|
logical_processor_count = 6;
|
|
}
|
|
|
|
// Initialize the Xenos shader storage stream.
|
|
uint64_t shader_storage_initialization_start =
|
|
xe::Clock::QueryHostTickCount();
|
|
auto shader_storage_file_path =
|
|
shader_storage_shareable_root / fmt::format("{:08X}.xsh", title_id);
|
|
shader_storage_file_ =
|
|
xe::filesystem::OpenFile(shader_storage_file_path, "a+b");
|
|
if (!shader_storage_file_) {
|
|
XELOGE(
|
|
"Failed to open the guest shader storage file for writing, persistent "
|
|
"shader storage will be disabled: {}",
|
|
xe::path_to_utf8(shader_storage_file_path));
|
|
fclose(pipeline_storage_file_);
|
|
pipeline_storage_file_ = nullptr;
|
|
return;
|
|
}
|
|
++shader_storage_index_;
|
|
shader_storage_file_flush_needed_ = false;
|
|
struct {
|
|
uint32_t magic;
|
|
uint32_t version_swapped;
|
|
} shader_storage_file_header;
|
|
// 'XESH'.
|
|
const uint32_t shader_storage_magic = 0x48534558;
|
|
if (fread(&shader_storage_file_header, sizeof(shader_storage_file_header), 1,
|
|
shader_storage_file_) &&
|
|
shader_storage_file_header.magic == shader_storage_magic &&
|
|
xe::byte_swap(shader_storage_file_header.version_swapped) ==
|
|
ShaderStoredHeader::kVersion) {
|
|
uint64_t shader_storage_valid_bytes = sizeof(shader_storage_file_header);
|
|
// Load and translate shaders written by previous Xenia executions until the
|
|
// end of the file or until a corrupted one is detected.
|
|
ShaderStoredHeader shader_header;
|
|
std::vector<uint32_t> ucode_dwords;
|
|
ucode_dwords.reserve(0xFFFF);
|
|
size_t shaders_translated = 0;
|
|
|
|
// Threads overlapping file reading.
|
|
std::mutex shaders_translation_thread_mutex;
|
|
std::condition_variable shaders_translation_thread_cond;
|
|
std::deque<D3D12Shader*> shaders_to_translate;
|
|
size_t shader_translation_threads_busy = 0;
|
|
bool shader_translation_threads_shutdown = false;
|
|
std::mutex shaders_failed_to_translate_mutex;
|
|
std::vector<D3D12Shader::D3D12Translation*> shaders_failed_to_translate;
|
|
auto shader_translation_thread_function = [&]() {
|
|
const ui::d3d12::D3D12Provider& provider =
|
|
command_processor_.GetD3D12Provider();
|
|
StringBuffer ucode_disasm_buffer;
|
|
DxbcShaderTranslator translator(
|
|
provider.GetAdapterVendorID(), bindless_resources_used_,
|
|
edram_rov_used, render_target_cache_.gamma_render_target_as_srgb(),
|
|
render_target_cache_.msaa_2x_supported(),
|
|
render_target_cache_.draw_resolution_scale_x(),
|
|
render_target_cache_.draw_resolution_scale_y(),
|
|
provider.GetGraphicsAnalysis() != nullptr);
|
|
// If needed and possible, create objects needed for DXIL conversion and
|
|
// disassembly on this thread.
|
|
IDxbcConverter* dxbc_converter = nullptr;
|
|
IDxcUtils* dxc_utils = nullptr;
|
|
IDxcCompiler* dxc_compiler = nullptr;
|
|
if (cvars::d3d12_dxbc_disasm_dxilconv && dxbc_converter_ && dxc_utils_ &&
|
|
dxc_compiler_) {
|
|
provider.DxbcConverterCreateInstance(CLSID_DxbcConverter,
|
|
IID_PPV_ARGS(&dxbc_converter));
|
|
provider.DxcCreateInstance(CLSID_DxcUtils, IID_PPV_ARGS(&dxc_utils));
|
|
provider.DxcCreateInstance(CLSID_DxcCompiler,
|
|
IID_PPV_ARGS(&dxc_compiler));
|
|
}
|
|
for (;;) {
|
|
D3D12Shader* shader_to_translate;
|
|
for (;;) {
|
|
std::unique_lock<std::mutex> lock(shaders_translation_thread_mutex);
|
|
if (shaders_to_translate.empty()) {
|
|
if (shader_translation_threads_shutdown) {
|
|
return;
|
|
}
|
|
shaders_translation_thread_cond.wait(lock);
|
|
continue;
|
|
}
|
|
shader_to_translate = shaders_to_translate.front();
|
|
shaders_to_translate.pop_front();
|
|
++shader_translation_threads_busy;
|
|
break;
|
|
}
|
|
if (!shader_to_translate->is_ucode_analyzed()) {
|
|
shader_to_translate->AnalyzeUcode(ucode_disasm_buffer);
|
|
}
|
|
// Translate each needed modification on this thread after performing
|
|
// modification-independent analysis of the whole shader.
|
|
uint64_t ucode_data_hash = shader_to_translate->ucode_data_hash();
|
|
for (auto modification_it = shader_translations_needed.lower_bound(
|
|
std::make_pair(ucode_data_hash, uint64_t(0)));
|
|
modification_it != shader_translations_needed.end() &&
|
|
modification_it->first == ucode_data_hash;
|
|
++modification_it) {
|
|
D3D12Shader::D3D12Translation* translation =
|
|
static_cast<D3D12Shader::D3D12Translation*>(
|
|
shader_to_translate->GetOrCreateTranslation(
|
|
modification_it->second));
|
|
// Only try (and delete in case of failure) if it's a new translation.
|
|
// If it's a shader previously encountered in the game, translation of
|
|
// which has failed, and the shader storage is loaded later, keep it
|
|
// this way not to try to translate it again.
|
|
if (!translation->is_translated() &&
|
|
!TranslateAnalyzedShader(translator, *translation, dxbc_converter,
|
|
dxc_utils, dxc_compiler)) {
|
|
std::lock_guard<std::mutex> lock(shaders_failed_to_translate_mutex);
|
|
shaders_failed_to_translate.push_back(translation);
|
|
}
|
|
}
|
|
{
|
|
std::lock_guard<std::mutex> lock(shaders_translation_thread_mutex);
|
|
--shader_translation_threads_busy;
|
|
}
|
|
}
|
|
if (dxc_compiler) {
|
|
dxc_compiler->Release();
|
|
}
|
|
if (dxc_utils) {
|
|
dxc_utils->Release();
|
|
}
|
|
if (dxbc_converter) {
|
|
dxbc_converter->Release();
|
|
}
|
|
};
|
|
std::vector<std::unique_ptr<xe::threading::Thread>>
|
|
shader_translation_threads;
|
|
|
|
while (true) {
|
|
if (!fread(&shader_header, sizeof(shader_header), 1,
|
|
shader_storage_file_)) {
|
|
break;
|
|
}
|
|
size_t ucode_byte_count =
|
|
shader_header.ucode_dword_count * sizeof(uint32_t);
|
|
ucode_dwords.resize(shader_header.ucode_dword_count);
|
|
if (shader_header.ucode_dword_count &&
|
|
!fread(ucode_dwords.data(), ucode_byte_count, 1,
|
|
shader_storage_file_)) {
|
|
break;
|
|
}
|
|
uint64_t ucode_data_hash =
|
|
XXH3_64bits(ucode_dwords.data(), ucode_byte_count);
|
|
if (shader_header.ucode_data_hash != ucode_data_hash) {
|
|
// Validation failed.
|
|
break;
|
|
}
|
|
shader_storage_valid_bytes += sizeof(shader_header) + ucode_byte_count;
|
|
D3D12Shader* shader =
|
|
LoadShader(shader_header.type, ucode_dwords.data(),
|
|
shader_header.ucode_dword_count, ucode_data_hash);
|
|
if (shader->ucode_storage_index() == shader_storage_index_) {
|
|
// Appeared twice in this file for some reason - skip, otherwise race
|
|
// condition will be caused by translating twice in parallel.
|
|
continue;
|
|
}
|
|
// Loaded from the current storage - don't write again.
|
|
shader->set_ucode_storage_index(shader_storage_index_);
|
|
// Create new threads if the currently existing threads can't keep up
|
|
// with file reading, but not more than the number of logical processors
|
|
// minus one.
|
|
size_t shader_translation_threads_needed;
|
|
{
|
|
std::lock_guard<std::mutex> lock(shaders_translation_thread_mutex);
|
|
shader_translation_threads_needed =
|
|
std::min(shader_translation_threads_busy +
|
|
shaders_to_translate.size() + size_t(1),
|
|
logical_processor_count - size_t(1));
|
|
}
|
|
while (shader_translation_threads.size() <
|
|
shader_translation_threads_needed) {
|
|
auto thread = xe::threading::Thread::Create(
|
|
{}, shader_translation_thread_function);
|
|
assert_not_null(thread);
|
|
thread->set_name("Shader Translation");
|
|
shader_translation_threads.push_back(std::move(thread));
|
|
}
|
|
// Request ucode information gathering and translation of all the needed
|
|
// shaders.
|
|
{
|
|
std::lock_guard<std::mutex> lock(shaders_translation_thread_mutex);
|
|
shaders_to_translate.push_back(shader);
|
|
}
|
|
shaders_translation_thread_cond.notify_one();
|
|
++shaders_translated;
|
|
}
|
|
if (!shader_translation_threads.empty()) {
|
|
{
|
|
std::lock_guard<std::mutex> lock(shaders_translation_thread_mutex);
|
|
shader_translation_threads_shutdown = true;
|
|
}
|
|
shaders_translation_thread_cond.notify_all();
|
|
for (auto& shader_translation_thread : shader_translation_threads) {
|
|
xe::threading::Wait(shader_translation_thread.get(), false);
|
|
}
|
|
shader_translation_threads.clear();
|
|
for (D3D12Shader::D3D12Translation* translation :
|
|
shaders_failed_to_translate) {
|
|
D3D12Shader* shader = static_cast<D3D12Shader*>(&translation->shader());
|
|
shader->DestroyTranslation(translation->modification());
|
|
if (shader->translations().empty()) {
|
|
shaders_.erase(shader->ucode_data_hash());
|
|
delete shader;
|
|
}
|
|
}
|
|
}
|
|
XELOGGPU("Translated {} shaders from the storage in {} milliseconds",
|
|
shaders_translated,
|
|
(xe::Clock::QueryHostTickCount() -
|
|
shader_storage_initialization_start) *
|
|
1000 / xe::Clock::QueryHostTickFrequency());
|
|
xe::filesystem::TruncateStdioFile(shader_storage_file_,
|
|
shader_storage_valid_bytes);
|
|
} else {
|
|
xe::filesystem::TruncateStdioFile(shader_storage_file_, 0);
|
|
shader_storage_file_header.magic = shader_storage_magic;
|
|
shader_storage_file_header.version_swapped =
|
|
xe::byte_swap(ShaderStoredHeader::kVersion);
|
|
fwrite(&shader_storage_file_header, sizeof(shader_storage_file_header), 1,
|
|
shader_storage_file_);
|
|
}
|
|
|
|
// Create the pipelines.
|
|
if (!pipeline_stored_descriptions.empty()) {
|
|
uint64_t pipeline_creation_start_ = xe::Clock::QueryHostTickCount();
|
|
|
|
// Launch additional creation threads to use all cores to create
|
|
// pipelines faster. Will also be using the main thread, so minus 1.
|
|
size_t creation_thread_original_count = creation_threads_.size();
|
|
size_t creation_thread_needed_count = std::max(
|
|
std::min(pipeline_stored_descriptions.size(), logical_processor_count) -
|
|
size_t(1),
|
|
creation_thread_original_count);
|
|
while (creation_threads_.size() < creation_thread_original_count) {
|
|
size_t creation_thread_index = creation_threads_.size();
|
|
std::unique_ptr<xe::threading::Thread> creation_thread =
|
|
xe::threading::Thread::Create({}, [this, creation_thread_index]() {
|
|
CreationThread(creation_thread_index);
|
|
});
|
|
assert_not_null(creation_thread);
|
|
creation_thread->set_name("D3D12 Pipelines");
|
|
creation_threads_.push_back(std::move(creation_thread));
|
|
}
|
|
|
|
size_t pipelines_created = 0;
|
|
for (const PipelineStoredDescription& pipeline_stored_description :
|
|
pipeline_stored_descriptions) {
|
|
const PipelineDescription& pipeline_description =
|
|
pipeline_stored_description.description;
|
|
// TODO(Triang3l): On Vulkan, skip pipelines requiring unsupported device
|
|
// features (to keep the cache files mostly shareable across devices).
|
|
// Skip already known pipelines - those have already been enqueued.
|
|
auto found_range =
|
|
pipelines_.equal_range(pipeline_stored_description.description_hash);
|
|
bool pipeline_found = false;
|
|
for (auto it = found_range.first; it != found_range.second; ++it) {
|
|
Pipeline* found_pipeline = it->second;
|
|
if (!std::memcmp(&found_pipeline->description.description,
|
|
&pipeline_description, sizeof(pipeline_description))) {
|
|
pipeline_found = true;
|
|
break;
|
|
}
|
|
}
|
|
if (pipeline_found) {
|
|
continue;
|
|
}
|
|
|
|
PipelineRuntimeDescription pipeline_runtime_description;
|
|
auto vertex_shader_it =
|
|
shaders_.find(pipeline_description.vertex_shader_hash);
|
|
if (vertex_shader_it == shaders_.end()) {
|
|
continue;
|
|
}
|
|
D3D12Shader* vertex_shader = vertex_shader_it->second;
|
|
pipeline_runtime_description.vertex_shader =
|
|
static_cast<D3D12Shader::D3D12Translation*>(
|
|
vertex_shader->GetTranslation(
|
|
pipeline_description.vertex_shader_modification));
|
|
if (!pipeline_runtime_description.vertex_shader ||
|
|
!pipeline_runtime_description.vertex_shader->is_translated() ||
|
|
!pipeline_runtime_description.vertex_shader->is_valid()) {
|
|
continue;
|
|
}
|
|
D3D12Shader* pixel_shader;
|
|
if (pipeline_description.pixel_shader_hash) {
|
|
auto pixel_shader_it =
|
|
shaders_.find(pipeline_description.pixel_shader_hash);
|
|
if (pixel_shader_it == shaders_.end()) {
|
|
continue;
|
|
}
|
|
pixel_shader = pixel_shader_it->second;
|
|
pipeline_runtime_description.pixel_shader =
|
|
static_cast<D3D12Shader::D3D12Translation*>(
|
|
pixel_shader->GetTranslation(
|
|
pipeline_description.pixel_shader_modification));
|
|
if (!pipeline_runtime_description.pixel_shader ||
|
|
!pipeline_runtime_description.pixel_shader->is_translated() ||
|
|
!pipeline_runtime_description.pixel_shader->is_valid()) {
|
|
continue;
|
|
}
|
|
} else {
|
|
pixel_shader = nullptr;
|
|
pipeline_runtime_description.pixel_shader = nullptr;
|
|
}
|
|
GeometryShaderKey pipeline_geometry_shader_key;
|
|
pipeline_runtime_description.geometry_shader =
|
|
GetGeometryShaderKey(
|
|
pipeline_description.geometry_shader,
|
|
DxbcShaderTranslator::Modification(
|
|
pipeline_description.vertex_shader_modification),
|
|
DxbcShaderTranslator::Modification(
|
|
pipeline_description.pixel_shader_modification),
|
|
pipeline_geometry_shader_key)
|
|
? &GetGeometryShader(pipeline_geometry_shader_key)
|
|
: nullptr;
|
|
pipeline_runtime_description.root_signature =
|
|
command_processor_.GetRootSignature(
|
|
vertex_shader, pixel_shader,
|
|
Shader::IsHostVertexShaderTypeDomain(
|
|
DxbcShaderTranslator::Modification(
|
|
pipeline_description.vertex_shader_modification)
|
|
.vertex.host_vertex_shader_type));
|
|
if (!pipeline_runtime_description.root_signature) {
|
|
continue;
|
|
}
|
|
std::memcpy(&pipeline_runtime_description.description,
|
|
&pipeline_description, sizeof(pipeline_description));
|
|
|
|
Pipeline* new_pipeline = new Pipeline;
|
|
new_pipeline->state = nullptr;
|
|
std::memcpy(&new_pipeline->description, &pipeline_runtime_description,
|
|
sizeof(pipeline_runtime_description));
|
|
pipelines_.emplace(pipeline_stored_description.description_hash,
|
|
new_pipeline);
|
|
COUNT_profile_set("gpu/pipeline_cache/pipelines", pipelines_.size());
|
|
if (!creation_threads_.empty()) {
|
|
// Submit the pipeline for creation to any available thread.
|
|
{
|
|
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
|
creation_queue_.push_back(new_pipeline);
|
|
}
|
|
creation_request_cond_.notify_one();
|
|
} else {
|
|
new_pipeline->state = CreateD3D12Pipeline(pipeline_runtime_description);
|
|
}
|
|
++pipelines_created;
|
|
}
|
|
|
|
if (!creation_threads_.empty()) {
|
|
CreateQueuedPipelinesOnProcessorThread();
|
|
if (creation_threads_.size() > creation_thread_original_count) {
|
|
{
|
|
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
|
creation_threads_shutdown_from_ = creation_thread_original_count;
|
|
// Assuming the queue is empty because of
|
|
// CreateQueuedPipelinesOnProcessorThread.
|
|
}
|
|
creation_request_cond_.notify_all();
|
|
while (creation_threads_.size() > creation_thread_original_count) {
|
|
xe::threading::Wait(creation_threads_.back().get(), false);
|
|
creation_threads_.pop_back();
|
|
}
|
|
bool await_creation_completion_event;
|
|
{
|
|
// Cleanup so additional threads can be created later again.
|
|
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
|
creation_threads_shutdown_from_ = SIZE_MAX;
|
|
// If the invocation is blocking, all the shader storage
|
|
// initialization is expected to be done before proceeding, to avoid
|
|
// latency in the command processor after the invocation.
|
|
await_creation_completion_event =
|
|
blocking && creation_threads_busy_ != 0;
|
|
if (await_creation_completion_event) {
|
|
creation_completion_event_->Reset();
|
|
creation_completion_set_event_ = true;
|
|
}
|
|
}
|
|
if (await_creation_completion_event) {
|
|
creation_request_cond_.notify_one();
|
|
xe::threading::Wait(creation_completion_event_.get(), false);
|
|
}
|
|
}
|
|
}
|
|
|
|
XELOGGPU(
|
|
"Created {} graphics pipelines (not including reading the "
|
|
"descriptions) from the storage in {} milliseconds",
|
|
pipelines_created,
|
|
(xe::Clock::QueryHostTickCount() - pipeline_creation_start_) * 1000 /
|
|
xe::Clock::QueryHostTickFrequency());
|
|
// If any pipeline descriptions were corrupted (or the whole file has excess
|
|
// bytes in the end), truncate to the last valid pipeline description.
|
|
xe::filesystem::TruncateStdioFile(
|
|
pipeline_storage_file_,
|
|
uint64_t(sizeof(pipeline_storage_file_header) +
|
|
sizeof(PipelineStoredDescription) *
|
|
pipeline_stored_descriptions.size()));
|
|
} else {
|
|
xe::filesystem::TruncateStdioFile(pipeline_storage_file_, 0);
|
|
pipeline_storage_file_header.magic = pipeline_storage_magic;
|
|
pipeline_storage_file_header.magic_api = pipeline_storage_magic_api;
|
|
pipeline_storage_file_header.version_swapped =
|
|
pipeline_storage_version_swapped;
|
|
fwrite(&pipeline_storage_file_header, sizeof(pipeline_storage_file_header),
|
|
1, pipeline_storage_file_);
|
|
}
|
|
|
|
shader_storage_cache_root_ = cache_root;
|
|
shader_storage_title_id_ = title_id;
|
|
|
|
// Start the storage writing thread.
|
|
storage_write_flush_shaders_ = false;
|
|
storage_write_flush_pipelines_ = false;
|
|
storage_write_thread_shutdown_ = false;
|
|
storage_write_thread_ =
|
|
xe::threading::Thread::Create({}, [this]() { StorageWriteThread(); });
|
|
assert_not_null(storage_write_thread_);
|
|
storage_write_thread_->set_name("D3D12 Storage writer");
|
|
}
|
|
|
|
void PipelineCache::ShutdownShaderStorage() {
|
|
if (storage_write_thread_) {
|
|
{
|
|
std::lock_guard<std::mutex> lock(storage_write_request_lock_);
|
|
storage_write_thread_shutdown_ = true;
|
|
}
|
|
storage_write_request_cond_.notify_all();
|
|
xe::threading::Wait(storage_write_thread_.get(), false);
|
|
storage_write_thread_.reset();
|
|
}
|
|
storage_write_shader_queue_.clear();
|
|
storage_write_pipeline_queue_.clear();
|
|
|
|
if (pipeline_storage_file_) {
|
|
fclose(pipeline_storage_file_);
|
|
pipeline_storage_file_ = nullptr;
|
|
pipeline_storage_file_flush_needed_ = false;
|
|
}
|
|
|
|
if (shader_storage_file_) {
|
|
fclose(shader_storage_file_);
|
|
shader_storage_file_ = nullptr;
|
|
shader_storage_file_flush_needed_ = false;
|
|
}
|
|
|
|
shader_storage_cache_root_.clear();
|
|
shader_storage_title_id_ = 0;
|
|
}
|
|
|
|
void PipelineCache::EndSubmission() {
|
|
if (shader_storage_file_flush_needed_ ||
|
|
pipeline_storage_file_flush_needed_) {
|
|
{
|
|
std::lock_guard<std::mutex> lock(storage_write_request_lock_);
|
|
if (shader_storage_file_flush_needed_) {
|
|
storage_write_flush_shaders_ = true;
|
|
}
|
|
if (pipeline_storage_file_flush_needed_) {
|
|
storage_write_flush_pipelines_ = true;
|
|
}
|
|
}
|
|
storage_write_request_cond_.notify_one();
|
|
shader_storage_file_flush_needed_ = false;
|
|
pipeline_storage_file_flush_needed_ = false;
|
|
}
|
|
if (!creation_threads_.empty()) {
|
|
CreateQueuedPipelinesOnProcessorThread();
|
|
// Await creation of all queued pipelines.
|
|
bool await_creation_completion_event;
|
|
{
|
|
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
|
// Assuming the creation queue is already empty (because the processor
|
|
// thread also worked on creating the leftover pipelines), so only check
|
|
// if there are threads with pipelines currently being created.
|
|
await_creation_completion_event = creation_threads_busy_ != 0;
|
|
if (await_creation_completion_event) {
|
|
creation_completion_event_->Reset();
|
|
creation_completion_set_event_ = true;
|
|
}
|
|
}
|
|
if (await_creation_completion_event) {
|
|
creation_request_cond_.notify_one();
|
|
xe::threading::Wait(creation_completion_event_.get(), false);
|
|
}
|
|
}
|
|
}
|
|
|
|
bool PipelineCache::IsCreatingPipelines() {
|
|
if (creation_threads_.empty()) {
|
|
return false;
|
|
}
|
|
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
|
return !creation_queue_.empty() || creation_threads_busy_ != 0;
|
|
}
|
|
|
|
// Convenience overload: hashes the `dword_count` dwords of microcode at
// `host_address` and forwards to the overload that takes the hash explicitly.
D3D12Shader* PipelineCache::LoadShader(xenos::ShaderType shader_type,
                                       const uint32_t* host_address,
                                       uint32_t dword_count) {
  const uint64_t ucode_hash =
      XXH3_64bits(host_address, dword_count * sizeof(uint32_t));
  return LoadShader(shader_type, host_address, dword_count, ucode_hash);
}
|
|
|
|
// Returns the shader registered under `data_hash`, creating and registering a
// new one from the provided microcode if it hasn't been loaded yet.
D3D12Shader* PipelineCache::LoadShader(xenos::ShaderType shader_type,
                                       const uint32_t* host_address,
                                       uint32_t dword_count,
                                       uint64_t data_hash) {
  if (auto found = shaders_.find(data_hash); found != shaders_.end()) {
    // Already loaded earlier.
    return found->second;
  }
  // The shader object is always created and kept, even if its translation
  // fails later, so that the translation is not attempted again.
  D3D12Shader* new_shader =
      new D3D12Shader(shader_type, data_hash, host_address, dword_count);
  shaders_.emplace(data_hash, new_shader);
  return new_shader;
}
|
|
|
|
// Builds the translator modification bits for a vertex shader from the current
// register state: starts from the translator's defaults for the dynamically
// addressable register count and the host vertex shader type, then fills in
// the interpolator mask, user clip plane setup, vertex kill mode and whether
// the point size needs to be output.
DxbcShaderTranslator::Modification
PipelineCache::GetCurrentVertexShaderModification(
    const Shader& shader, Shader::HostVertexShaderType host_vertex_shader_type,
    uint32_t interpolator_mask) const {
  assert_true(shader.type() == xenos::ShaderType::kVertex);
  assert_true(shader.is_ucode_analyzed());

  const auto dynamic_register_count =
      shader.GetDynamicAddressableRegisterCount(
          register_file_.Get<reg::SQ_PROGRAM_CNTL>().vs_num_reg);
  DxbcShaderTranslator::Modification result(
      shader_translator_->GetDefaultVertexShaderModification(
          dynamic_register_count, host_vertex_shader_type));

  result.vertex.interpolator_mask = interpolator_mask;

  // User clip planes are ignored altogether when clipping is disabled.
  const auto clip_control = register_file_.Get<reg::PA_CL_CLIP_CNTL>();
  const uint32_t enabled_clip_planes =
      clip_control.clip_disable ? 0 : clip_control.ucp_ena;
  result.vertex.user_clip_plane_count = xe::bit_count(enabled_clip_planes);
  result.vertex.user_clip_plane_cull =
      uint32_t(enabled_clip_planes && clip_control.ucp_cull_only_ena);

  // Bit 0b001 gates the point size output, bit 0b100 gates the vertex kill
  // mode.
  const auto special_exports = shader.writes_point_size_edge_flag_kill_vertex();
  result.vertex.vertex_kill_and =
      uint32_t((special_exports & 0b100) && !clip_control.vtx_kill_or);
  result.vertex.output_point_size =
      uint32_t((special_exports & 0b001) &&
               register_file_.Get<reg::VGT_DRAW_INITIATOR>().prim_type ==
                   xenos::PrimitiveType::kPointList);

  return result;
}
|
|
|
|
// Builds the translator modification bits for a pixel shader from the current
// register state: starts from the translator's defaults for the dynamically
// addressable register count, then fills in the interpolator setup, the
// parameter generation setup (enabled only if `param_gen_pos` is a valid
// interpolator index) and, on the host render targets path, the depth /
// stencil mode.
DxbcShaderTranslator::Modification
PipelineCache::GetCurrentPixelShaderModification(
    const Shader& shader, uint32_t interpolator_mask, uint32_t param_gen_pos,
    reg::RB_DEPTHCONTROL normalized_depth_control) const {
  using DepthStencilMode = DxbcShaderTranslator::Modification::DepthStencilMode;

  assert_true(shader.type() == xenos::ShaderType::kPixel);
  assert_true(shader.is_ucode_analyzed());

  const auto dynamic_register_count =
      shader.GetDynamicAddressableRegisterCount(
          register_file_.Get<reg::SQ_PROGRAM_CNTL>().ps_num_reg);
  DxbcShaderTranslator::Modification result(
      shader_translator_->GetDefaultPixelShaderModification(
          dynamic_register_count));

  // Interpolators not covered by the sampling pattern are marked as centroid.
  const auto sampling_pattern = xenos::GetInterpolatorSamplingPattern(
      register_file_.Get<reg::RB_SURFACE_INFO>().msaa_samples,
      register_file_.Get<reg::SQ_CONTEXT_MISC>().sc_sample_cntl,
      register_file_.Get<reg::SQ_INTERPOLATOR_CNTL>().sampling_pattern);
  result.pixel.interpolator_mask = interpolator_mask;
  result.pixel.interpolators_centroid = interpolator_mask & ~sampling_pattern;

  // Parameter generation - all related fields are zeroed when it's not used.
  const bool param_gen_used = param_gen_pos < xenos::kMaxInterpolators;
  result.pixel.param_gen_enable = param_gen_used ? 1 : 0;
  result.pixel.param_gen_interpolator = param_gen_used ? param_gen_pos : 0;
  result.pixel.param_gen_point =
      uint32_t(param_gen_used &&
               register_file_.Get<reg::VGT_DRAW_INITIATOR>().prim_type ==
                   xenos::PrimitiveType::kPointList);

  // The depth / stencil mode is chosen here only for host render targets.
  if (render_target_cache_.GetPath() !=
      RenderTargetCache::Path::kHostRenderTargets) {
    return result;
  }

  DepthStencilMode depth_stencil_mode;
  if (render_target_cache_.depth_float24_convert_in_pixel_shader() &&
      normalized_depth_control.z_enable &&
      register_file_.Get<reg::RB_DEPTH_INFO>().depth_format ==
          xenos::DepthRenderTargetFormat::kD24FS8) {
    // 24-bit floating-point depth is converted in the pixel shader.
    depth_stencil_mode = render_target_cache_.depth_float24_round()
                             ? DepthStencilMode::kFloat24Rounding
                             : DepthStencilMode::kFloat24Truncating;
  } else if (shader.implicit_early_z_write_allowed() &&
             (!shader.writes_color_target(0) ||
              !draw_util::DoesCoverageDependOnAlpha(
                  register_file_.Get<reg::RB_COLORCONTROL>()))) {
    depth_stencil_mode = DepthStencilMode::kEarlyHint;
  } else {
    depth_stencil_mode = DepthStencilMode::kNoModifiers;
  }
  result.pixel.depth_stencil_mode = depth_stencil_mode;

  return result;
}
|
|
|
|
// Obtains the pipeline and the root signature for the current draw state.
//
// Makes sure both shaders are translated (queueing newly translated ones for
// writing to the shader storage), builds the pipeline description for the
// current state, and then returns - in order of preference - the currently
// bound pipeline, a cached pipeline with the same description, or a newly
// created one (created on a creation thread if there are any, synchronously
// otherwise, and queued for writing to the pipeline storage).
//
// `pixel_shader` may be null. On success, writes the pipeline handle and the
// root signature to the output parameters and returns true. Returns false if a
// shader can't be translated or is invalid, or if the state description can't
// be built.
bool PipelineCache::ConfigurePipeline(
    D3D12Shader::D3D12Translation* vertex_shader,
    D3D12Shader::D3D12Translation* pixel_shader,
    const PrimitiveProcessor::ProcessingResult& primitive_processing_result,
    reg::RB_DEPTHCONTROL normalized_depth_control,
    uint32_t normalized_color_mask,
    uint32_t bound_depth_and_color_render_target_bits,
    const uint32_t* bound_depth_and_color_render_target_formats,
    void** pipeline_handle_out, ID3D12RootSignature** root_signature_out) {
#if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
  SCOPE_profile_cpu_f("gpu");
#endif  // XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES

  assert_not_null(pipeline_handle_out);
  assert_not_null(root_signature_out);

  // Edge flags are not supported yet (because polygon primitives are not).
  assert_true(register_file_.Get<reg::SQ_PROGRAM_CNTL>().vs_export_mode !=
                  xenos::VertexShaderExportMode::kPosition2VectorsEdge &&
              register_file_.Get<reg::SQ_PROGRAM_CNTL>().vs_export_mode !=
                  xenos::VertexShaderExportMode::kPosition2VectorsEdgeKill);
  assert_false(register_file_.Get<reg::SQ_PROGRAM_CNTL>().gen_index_vtx);

  // Translates the shader if that hasn't been attempted yet, and, if a shader
  // storage file is open and the microcode isn't in the current storage, queues
  // it for writing. Returns false only if the translation attempt made here
  // has failed.
  const auto translate_if_needed =
      [this](D3D12Shader::D3D12Translation& translation) -> bool {
    if (translation.is_translated()) {
      return true;
    }
    auto& shader = translation.shader();
    if (!shader.is_ucode_analyzed()) {
      shader.AnalyzeUcode(ucode_disasm_buffer_);
    }
    if (!TranslateAnalyzedShader(*shader_translator_, translation,
                                 dxbc_converter_, dxc_utils_, dxc_compiler_)) {
      return false;
    }
    if (shader_storage_file_ &&
        shader.ucode_storage_index() != shader_storage_index_) {
      shader.set_ucode_storage_index(shader_storage_index_);
      assert_not_null(storage_write_thread_);
      shader_storage_file_flush_needed_ = true;
      {
        std::lock_guard<std::mutex> request_guard(storage_write_request_lock_);
        storage_write_shader_queue_.push_back(&shader);
      }
      storage_write_request_cond_.notify_all();
    }
    return true;
  };

  // The shaders must be translated at this point because
  // GetCurrentStateDescription needs them.
  if (!translate_if_needed(*vertex_shader)) {
    XELOGE("Failed to translate the vertex shader!");
    return false;
  }
  if (!vertex_shader->is_valid()) {
    // Translated (now or previously), but the result is not usable.
    return false;
  }
  if (pixel_shader != nullptr) {
    if (!translate_if_needed(*pixel_shader)) {
      XELOGE("Failed to translate the pixel shader!");
      return false;
    }
    if (!pixel_shader->is_valid()) {
      // Translated (now or previously), but the result is not usable.
      return false;
    }
  }

  PipelineRuntimeDescription runtime_description;
  const bool description_obtained = GetCurrentStateDescription(
      vertex_shader, pixel_shader, primitive_processing_result,
      normalized_depth_control, normalized_color_mask,
      bound_depth_and_color_render_target_bits,
      bound_depth_and_color_render_target_formats, runtime_description);
  if (!description_obtained) {
    return false;
  }
  PipelineDescription& description = runtime_description.description;

  // Fast path - the state hasn't changed since the previous call.
  if (current_pipeline_ != nullptr &&
      current_pipeline_->description.description == description) {
    *pipeline_handle_out = current_pipeline_;
    *root_signature_out = runtime_description.root_signature;
    return true;
  }

  // Look for a pipeline with the same description in the cache.
  const uint64_t description_hash =
      XXH3_64bits(&description, sizeof(description));
  for (auto [candidate, candidates_end] =
           pipelines_.equal_range(description_hash);
       candidate != candidates_end; ++candidate) {
    Pipeline* cached_pipeline = candidate->second;
    if (!(cached_pipeline->description.description == description)) {
      continue;
    }
    current_pipeline_ = cached_pipeline;
    *pipeline_handle_out = cached_pipeline;
    *root_signature_out = cached_pipeline->description.root_signature;
    return true;
  }

  // Not cached - register a new pipeline. Its state object stays null until
  // the creation is done.
  Pipeline* created_pipeline = new Pipeline;
  created_pipeline->state = nullptr;
  std::memcpy(&created_pipeline->description, &runtime_description,
              sizeof(runtime_description));
  pipelines_.emplace(description_hash, created_pipeline);
  COUNT_profile_set("gpu/pipeline_cache/pipelines", pipelines_.size());

  if (creation_threads_.empty()) {
    // No creation threads - create the state object right here.
    created_pipeline->state = CreateD3D12Pipeline(runtime_description);
  } else {
    // Submit the pipeline for creation to any available thread.
    {
      std::lock_guard<xe_mutex> creation_guard(creation_request_lock_);
      creation_queue_.push_back(created_pipeline);
    }
    creation_request_cond_.notify_one();
  }

  // Queue the description for writing to the pipeline storage.
  if (pipeline_storage_file_) {
    assert_not_null(storage_write_thread_);
    pipeline_storage_file_flush_needed_ = true;
    {
      std::lock_guard<std::mutex> request_guard(storage_write_request_lock_);
      storage_write_pipeline_queue_.emplace_back();
      PipelineStoredDescription& queued_description =
          storage_write_pipeline_queue_.back();
      queued_description.description_hash = description_hash;
      std::memcpy(&queued_description.description, &description,
                  sizeof(description));
    }
    storage_write_request_cond_.notify_all();
  }

  current_pipeline_ = created_pipeline;
  *pipeline_handle_out = created_pipeline;
  *root_signature_out = runtime_description.root_signature;
  return true;
}
|
|
|
|
// Translates a shader whose microcode has already been analyzed, and performs
// the post-translation setup:
// - logs the microcode disassembly;
// - assigns the texture and sampler binding layout UIDs to the shader (once
//   per shader, guarded by EnterBindingLayoutUserUIDSetup), registering the
//   layouts in the layout maps if they are new;
// - disassembles and dumps the translated shader if requested via the cvars.
//
// `translator` performs the translation of `translation`. `dxbc_converter`,
// `dxc_utils` and `dxc_compiler` are only forwarded to the disassembly when
// the d3d12_dxbc_disasm_dxilconv cvar is set.
//
// Returns false if the translator reports a failure, otherwise whether the
// resulting translation is valid.
bool PipelineCache::TranslateAnalyzedShader(
    DxbcShaderTranslator& translator,
    D3D12Shader::D3D12Translation& translation, IDxbcConverter* dxbc_converter,
    IDxcUtils* dxc_utils, IDxcCompiler* dxc_compiler) {
  D3D12Shader& shader = static_cast<D3D12Shader&>(translation.shader());

  // Perform translation.
  // If this fails the shader will be marked as invalid and ignored later.
  if (!translator.TranslateAnalyzedShader(translation)) {
    XELOGE("Shader {:016X} translation failed; marking as ignored",
           shader.ucode_data_hash());
    return false;
  }

  // Human-readable host shader type, used only for logging.
  const char* host_shader_type;
  if (shader.type() == xenos::ShaderType::kVertex) {
    DxbcShaderTranslator::Modification modification(translation.modification());
    switch (modification.vertex.host_vertex_shader_type) {
      case Shader::HostVertexShaderType::kLineDomainCPIndexed:
        host_shader_type = "control-point-indexed line domain";
        break;
      case Shader::HostVertexShaderType::kLineDomainPatchIndexed:
        host_shader_type = "patch-indexed line domain";
        break;
      case Shader::HostVertexShaderType::kTriangleDomainCPIndexed:
        host_shader_type = "control-point-indexed triangle domain";
        break;
      case Shader::HostVertexShaderType::kTriangleDomainPatchIndexed:
        host_shader_type = "patch-indexed triangle domain";
        break;
      case Shader::HostVertexShaderType::kQuadDomainCPIndexed:
        host_shader_type = "control-point-indexed quad domain";
        break;
      case Shader::HostVertexShaderType::kQuadDomainPatchIndexed:
        host_shader_type = "patch-indexed quad domain";
        break;
      default:
        assert(modification.vertex.host_vertex_shader_type ==
               Shader::HostVertexShaderType::kVertex);
        host_shader_type = "vertex";
    }
  } else {
    host_shader_type = "pixel";
  }
  XELOGGPU("Generated {} shader ({}b) - hash {:016X}:\n{}\n", host_shader_type,
           shader.ucode_dword_count() * sizeof(uint32_t),
           shader.ucode_data_hash(), shader.ucode_disassembly().c_str());

  // Set up texture and sampler binding layouts.
  if (shader.EnterBindingLayoutUserUIDSetup()) {
    const std::vector<D3D12Shader::TextureBinding>& texture_bindings =
        shader.GetTextureBindingsAfterTranslation();
    size_t texture_binding_count = texture_bindings.size();
    const std::vector<D3D12Shader::SamplerBinding>& sampler_bindings =
        shader.GetSamplerBindingsAfterTranslation();
    size_t sampler_binding_count = sampler_bindings.size();
    assert_false(bindless_resources_used_ &&
                 texture_binding_count + sampler_binding_count >
                     D3D12_REQ_CONSTANT_BUFFER_ELEMENT_COUNT * 4);

    // Hash of the whole texture binding array (0 if there are no bindings).
    size_t texture_binding_layout_bytes =
        texture_binding_count * sizeof(*texture_bindings.data());
    uint64_t texture_binding_layout_hash = 0;
    if (texture_binding_count) {
      texture_binding_layout_hash =
          XXH3_64bits(texture_bindings.data(), texture_binding_layout_bytes);
    }

    // Hash of the bindless descriptor indices of the samplers - only those
    // are stored in and compared against the bindless sampler layouts.
    size_t bindless_sampler_count =
        bindless_resources_used_ ? sampler_binding_count : 0;
    uint64_t bindless_sampler_layout_hash = 0;
    if (bindless_sampler_count) {
      XXH3_state_t hash_state;
      XXH3_64bits_reset(&hash_state);
      for (size_t i = 0; i < bindless_sampler_count; ++i) {
        XXH3_64bits_update(
            &hash_state, &sampler_bindings[i].bindless_descriptor_index,
            sizeof(sampler_bindings[i].bindless_descriptor_index));
      }
      bindless_sampler_layout_hash = XXH3_64bits_digest(&hash_state);
    }

    // Obtain the unique IDs of binding layouts if there are any texture
    // bindings or bindless samplers, for invalidation in the command processor.
    size_t texture_binding_layout_uid = kLayoutUIDEmpty;
    // Use sampler count for the bindful case because it's the only thing that
    // must be the same for layouts to be compatible in this case
    // (instruction-specified parameters are used as overrides for actual
    // samplers).
    static_assert(
        kLayoutUIDEmpty == 0,
        "Empty layout UID is assumed to be 0 because for bindful samplers, the "
        "UID is their count");
    size_t sampler_binding_layout_uid =
        bindless_resources_used_ ? kLayoutUIDEmpty : sampler_binding_count;
    if (texture_binding_count || bindless_sampler_count) {
      std::lock_guard<std::mutex> layouts_lock(layouts_mutex_);
      if (texture_binding_count) {
        // Try to reuse an identical texture binding layout.
        auto found_range = texture_binding_layout_map_.equal_range(
            texture_binding_layout_hash);
        for (auto it = found_range.first; it != found_range.second; ++it) {
          if (it->second.vector_span_length == texture_binding_count &&
              !std::memcmp(texture_binding_layouts_.data() +
                               it->second.vector_span_offset,
                           texture_bindings.data(),
                           texture_binding_layout_bytes)) {
            texture_binding_layout_uid = it->second.uid;
            break;
          }
        }
        if (texture_binding_layout_uid == kLayoutUIDEmpty) {
          // Not found - register a new layout.
          static_assert(
              kLayoutUIDEmpty == 0,
              "Layout UID is size + 1 because it's assumed that 0 is the UID "
              "for an empty layout");
          texture_binding_layout_uid = texture_binding_layout_map_.size() + 1;
          LayoutUID new_uid;
          new_uid.uid = texture_binding_layout_uid;
          new_uid.vector_span_offset = texture_binding_layouts_.size();
          new_uid.vector_span_length = texture_binding_count;
          texture_binding_layouts_.resize(new_uid.vector_span_offset +
                                          texture_binding_count);
          std::memcpy(
              texture_binding_layouts_.data() + new_uid.vector_span_offset,
              texture_bindings.data(), texture_binding_layout_bytes);
          texture_binding_layout_map_.emplace(texture_binding_layout_hash,
                                              new_uid);
        }
      }
      if (bindless_sampler_count) {
        // Try to reuse an identical bindless sampler layout. The map is keyed
        // by the layout hash (that's what the emplace below inserts with), so
        // the lookup must be done by the hash as well - looking up by the UID,
        // which is still kLayoutUIDEmpty here, would never find the existing
        // layouts.
        auto found_range = bindless_sampler_layout_map_.equal_range(
            bindless_sampler_layout_hash);
        for (auto it = found_range.first; it != found_range.second; ++it) {
          if (it->second.vector_span_length != bindless_sampler_count) {
            continue;
          }
          const uint32_t* vector_bindless_sampler_layout =
              bindless_sampler_layouts_.data() + it->second.vector_span_offset;
          bool layout_matches = true;
          for (size_t i = 0; i < bindless_sampler_count; ++i) {
            if (vector_bindless_sampler_layout[i] !=
                sampler_bindings[i].bindless_descriptor_index) {
              layout_matches = false;
              break;
            }
          }
          if (layout_matches) {
            sampler_binding_layout_uid = it->second.uid;
            break;
          }
        }
        if (sampler_binding_layout_uid == kLayoutUIDEmpty) {
          // Not found - register a new layout. The UID given to the shader
          // must be the same as the one stored in the map, and must never be
          // kLayoutUIDEmpty for a non-empty layout.
          static_assert(
              kLayoutUIDEmpty == 0,
              "Layout UID is size + 1 because it's assumed that 0 is the UID "
              "for an empty layout");
          sampler_binding_layout_uid = bindless_sampler_layout_map_.size() + 1;
          LayoutUID new_uid;
          new_uid.uid = sampler_binding_layout_uid;
          new_uid.vector_span_offset = bindless_sampler_layouts_.size();
          new_uid.vector_span_length = sampler_binding_count;
          bindless_sampler_layouts_.resize(new_uid.vector_span_offset +
                                           sampler_binding_count);
          uint32_t* vector_bindless_sampler_layout =
              bindless_sampler_layouts_.data() + new_uid.vector_span_offset;
          for (size_t i = 0; i < bindless_sampler_count; ++i) {
            vector_bindless_sampler_layout[i] =
                sampler_bindings[i].bindless_descriptor_index;
          }
          bindless_sampler_layout_map_.emplace(bindless_sampler_layout_hash,
                                               new_uid);
        }
      }
    }
    shader.SetTextureBindingLayoutUserUID(texture_binding_layout_uid);
    shader.SetSamplerBindingLayoutUserUID(sampler_binding_layout_uid);
  }

  // Disassemble the shader for dumping.
  const ui::d3d12::D3D12Provider& provider =
      command_processor_.GetD3D12Provider();
  if (cvars::d3d12_dxbc_disasm_dxilconv) {
    translation.DisassembleDxbcAndDxil(provider, cvars::d3d12_dxbc_disasm,
                                       dxbc_converter, dxc_utils, dxc_compiler);
  } else {
    translation.DisassembleDxbcAndDxil(provider, cvars::d3d12_dxbc_disasm);
  }

  // Dump shader files if desired.
  if (!cvars::dump_shaders.empty()) {
    bool edram_rov_used = render_target_cache_.GetPath() ==
                          RenderTargetCache::Path::kPixelShaderInterlock;
    translation.Dump(cvars::dump_shaders,
                     (shader.type() == xenos::ShaderType::kPixel)
                         ? (edram_rov_used ? "d3d12_rov" : "d3d12_rtv")
                         : "d3d12");
  }

  return translation.is_valid();
}
|
|
|
|
bool PipelineCache::GetCurrentStateDescription(
|
|
D3D12Shader::D3D12Translation* vertex_shader,
|
|
D3D12Shader::D3D12Translation* pixel_shader,
|
|
const PrimitiveProcessor::ProcessingResult& primitive_processing_result,
|
|
reg::RB_DEPTHCONTROL normalized_depth_control,
|
|
uint32_t normalized_color_mask,
|
|
uint32_t bound_depth_and_color_render_target_bits,
|
|
const uint32_t* bound_depth_and_color_render_target_formats,
|
|
PipelineRuntimeDescription& runtime_description_out) {
|
|
// Translated shaders needed at least for the root signature.
|
|
assert_true(vertex_shader->is_translated() && vertex_shader->is_valid());
|
|
assert_true(!pixel_shader ||
|
|
(pixel_shader->is_translated() && pixel_shader->is_valid()));
|
|
|
|
PipelineDescription& description_out = runtime_description_out.description;
|
|
|
|
const auto& regs = register_file_;
|
|
auto pa_su_sc_mode_cntl = regs.Get<reg::PA_SU_SC_MODE_CNTL>();
|
|
|
|
// Initialize all unused fields to zero for comparison/hashing.
|
|
std::memset(&runtime_description_out, 0, sizeof(runtime_description_out));
|
|
|
|
assert_true(DxbcShaderTranslator::Modification(vertex_shader->modification())
|
|
.vertex.host_vertex_shader_type ==
|
|
primitive_processing_result.host_vertex_shader_type);
|
|
bool tessellated = primitive_processing_result.IsTessellated();
|
|
bool primitive_polygonal = draw_util::IsPrimitivePolygonal(regs);
|
|
bool rasterization_enabled =
|
|
draw_util::IsRasterizationPotentiallyDone(regs, primitive_polygonal);
|
|
// In Direct3D, rasterization (along with pixel counting) is disabled by
|
|
// disabling the pixel shader and depth / stencil. However, if rasterization
|
|
// should be disabled, the pixel shader must be disabled externally, to ensure
|
|
// things like texture binding layout is correct for the shader actually being
|
|
// used (don't replace anything here).
|
|
if (!rasterization_enabled) {
|
|
assert_null(pixel_shader);
|
|
if (pixel_shader) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
bool edram_rov_used = render_target_cache_.GetPath() ==
|
|
RenderTargetCache::Path::kPixelShaderInterlock;
|
|
|
|
// Root signature.
|
|
runtime_description_out.root_signature = command_processor_.GetRootSignature(
|
|
static_cast<const DxbcShader*>(&vertex_shader->shader()),
|
|
pixel_shader ? static_cast<const DxbcShader*>(&pixel_shader->shader())
|
|
: nullptr,
|
|
tessellated);
|
|
if (runtime_description_out.root_signature == nullptr) {
|
|
return false;
|
|
}
|
|
|
|
// Vertex shader.
|
|
runtime_description_out.vertex_shader = vertex_shader;
|
|
description_out.vertex_shader_hash =
|
|
vertex_shader->shader().ucode_data_hash();
|
|
description_out.vertex_shader_modification = vertex_shader->modification();
|
|
|
|
// Index buffer strip cut value.
|
|
if (primitive_processing_result.host_primitive_reset_enabled) {
|
|
description_out.strip_cut_index =
|
|
primitive_processing_result.host_index_format ==
|
|
xenos::IndexFormat::kInt16
|
|
? PipelineStripCutIndex::kFFFF
|
|
: PipelineStripCutIndex::kFFFFFFFF;
|
|
} else {
|
|
description_out.strip_cut_index = PipelineStripCutIndex::kNone;
|
|
}
|
|
|
|
// Host vertex shader type and primitive topology.
|
|
if (tessellated) {
|
|
description_out.primitive_topology_type_or_tessellation_mode =
|
|
uint32_t(primitive_processing_result.tessellation_mode);
|
|
} else {
|
|
switch (primitive_processing_result.host_primitive_type) {
|
|
case xenos::PrimitiveType::kPointList:
|
|
description_out.primitive_topology_type_or_tessellation_mode =
|
|
uint32_t(PipelinePrimitiveTopologyType::kPoint);
|
|
break;
|
|
case xenos::PrimitiveType::kLineList:
|
|
case xenos::PrimitiveType::kLineStrip:
|
|
// Quads are emulated as line lists with adjacency.
|
|
case xenos::PrimitiveType::kQuadList:
|
|
case xenos::PrimitiveType::k2DLineStrip:
|
|
description_out.primitive_topology_type_or_tessellation_mode =
|
|
uint32_t(PipelinePrimitiveTopologyType::kLine);
|
|
break;
|
|
default:
|
|
description_out.primitive_topology_type_or_tessellation_mode =
|
|
uint32_t(PipelinePrimitiveTopologyType::kTriangle);
|
|
break;
|
|
}
|
|
switch (primitive_processing_result.host_primitive_type) {
|
|
case xenos::PrimitiveType::kPointList:
|
|
description_out.geometry_shader = PipelineGeometryShader::kPointList;
|
|
break;
|
|
case xenos::PrimitiveType::kRectangleList:
|
|
description_out.geometry_shader =
|
|
PipelineGeometryShader::kRectangleList;
|
|
break;
|
|
case xenos::PrimitiveType::kQuadList:
|
|
description_out.geometry_shader = PipelineGeometryShader::kQuadList;
|
|
break;
|
|
default:
|
|
description_out.geometry_shader = PipelineGeometryShader::kNone;
|
|
break;
|
|
}
|
|
}
|
|
GeometryShaderKey geometry_shader_key;
|
|
runtime_description_out.geometry_shader =
|
|
GetGeometryShaderKey(
|
|
description_out.geometry_shader,
|
|
DxbcShaderTranslator::Modification(vertex_shader->modification()),
|
|
DxbcShaderTranslator::Modification(
|
|
pixel_shader ? pixel_shader->modification() : 0),
|
|
geometry_shader_key)
|
|
? &GetGeometryShader(geometry_shader_key)
|
|
: nullptr;
|
|
|
|
// The rest doesn't matter when rasterization is disabled (thus no writing to
|
|
// anywhere from post-geometry stages and no samples are counted).
|
|
if (!rasterization_enabled) {
|
|
description_out.cull_mode = PipelineCullMode::kDisableRasterization;
|
|
return true;
|
|
}
|
|
|
|
// Pixel shader.
|
|
if (pixel_shader) {
|
|
runtime_description_out.pixel_shader = pixel_shader;
|
|
description_out.pixel_shader_hash =
|
|
pixel_shader->shader().ucode_data_hash();
|
|
description_out.pixel_shader_modification = pixel_shader->modification();
|
|
}
|
|
|
|
// Rasterizer state.
|
|
// Because Direct3D 12 doesn't support per-side fill mode and depth bias, the
|
|
// values to use depends on the current culling state.
|
|
// If front faces are culled, use the ones for back faces.
|
|
// If back faces are culled, it's the other way around.
|
|
// If culling is not enabled, assume the developer wanted to draw things in a
|
|
// more special way - so if one side is wireframe or has a depth bias, then
|
|
// that's intentional (if both sides have a depth bias, the one for the front
|
|
// faces is used, though it's unlikely that they will ever be different -
|
|
// SetRenderState sets the same offset for both sides).
|
|
// Points fill mode (0) also isn't supported in Direct3D 12, but assume the
|
|
// developer didn't want to fill the whole primitive and use wireframe (like
|
|
// Xenos fill mode 1).
|
|
// Here we also assume that only one side is culled - if two sides are culled,
|
|
// rasterization will be disabled externally, or the draw call will be dropped
|
|
// early if the vertex shader doesn't export to memory.
|
|
bool cull_front, cull_back;
|
|
if (primitive_polygonal) {
|
|
description_out.front_counter_clockwise = pa_su_sc_mode_cntl.face == 0;
|
|
cull_front = pa_su_sc_mode_cntl.cull_front != 0;
|
|
cull_back = pa_su_sc_mode_cntl.cull_back != 0;
|
|
if (cull_front) {
|
|
// The case when both faces are culled should be handled by disabling
|
|
// rasterization.
|
|
assert_false(cull_back);
|
|
description_out.cull_mode = PipelineCullMode::kFront;
|
|
} else if (cull_back) {
|
|
description_out.cull_mode = PipelineCullMode::kBack;
|
|
} else {
|
|
description_out.cull_mode = PipelineCullMode::kNone;
|
|
}
|
|
// With ROV, the depth bias is applied in the pixel shader because
|
|
// per-sample depth is needed for MSAA.
|
|
if (!cull_front) {
|
|
// Front faces aren't culled.
|
|
// Direct3D 12, unfortunately, doesn't support point fill mode.
|
|
if (pa_su_sc_mode_cntl.polymode_front_ptype !=
|
|
xenos::PolygonType::kTriangles) {
|
|
description_out.fill_mode_wireframe = 1;
|
|
}
|
|
}
|
|
if (!cull_back) {
|
|
// Back faces aren't culled.
|
|
if (pa_su_sc_mode_cntl.polymode_back_ptype !=
|
|
xenos::PolygonType::kTriangles) {
|
|
description_out.fill_mode_wireframe = 1;
|
|
}
|
|
}
|
|
if (pa_su_sc_mode_cntl.poly_mode != xenos::PolygonModeEnable::kDualMode) {
|
|
description_out.fill_mode_wireframe = 0;
|
|
}
|
|
} else {
|
|
// Filled front faces only, without culling.
|
|
cull_front = false;
|
|
cull_back = false;
|
|
}
|
|
if (!edram_rov_used) {
|
|
float polygon_offset, polygon_offset_scale;
|
|
draw_util::GetPreferredFacePolygonOffset(
|
|
regs, primitive_polygonal, polygon_offset_scale, polygon_offset);
|
|
description_out.depth_bias = draw_util::GetD3D10IntegerPolygonOffset(
|
|
regs.Get<reg::RB_DEPTH_INFO>().depth_format, polygon_offset);
|
|
description_out.depth_bias_slope_scaled =
|
|
polygon_offset_scale * xenos::kPolygonOffsetScaleSubpixelUnit;
|
|
}
|
|
if (tessellated && cvars::d3d12_tessellation_wireframe) {
|
|
description_out.fill_mode_wireframe = 1;
|
|
}
|
|
description_out.depth_clip = !regs.Get<reg::PA_CL_CLIP_CNTL>().clip_disable;
|
|
bool depth_stencil_bound_and_used = false;
|
|
if (!edram_rov_used) {
|
|
// Depth/stencil. No stencil, always passing depth test and no depth writing
|
|
// means depth disabled.
|
|
if (bound_depth_and_color_render_target_bits & 1) {
|
|
if (normalized_depth_control.z_enable) {
|
|
description_out.depth_func = normalized_depth_control.zfunc;
|
|
description_out.depth_write = normalized_depth_control.z_write_enable;
|
|
} else {
|
|
description_out.depth_func = xenos::CompareFunction::kAlways;
|
|
}
|
|
if (normalized_depth_control.stencil_enable) {
|
|
description_out.stencil_enable = 1;
|
|
bool stencil_backface_enable =
|
|
primitive_polygonal && normalized_depth_control.backface_enable;
|
|
// Per-face masks not supported by Direct3D 12, choose the back face
|
|
// ones only if drawing only back faces.
|
|
Register stencil_ref_mask_reg;
|
|
if (stencil_backface_enable && cull_front) {
|
|
stencil_ref_mask_reg = XE_GPU_REG_RB_STENCILREFMASK_BF;
|
|
} else {
|
|
stencil_ref_mask_reg = XE_GPU_REG_RB_STENCILREFMASK;
|
|
}
|
|
auto stencil_ref_mask =
|
|
regs.Get<reg::RB_STENCILREFMASK>(stencil_ref_mask_reg);
|
|
description_out.stencil_read_mask = stencil_ref_mask.stencilmask;
|
|
description_out.stencil_write_mask = stencil_ref_mask.stencilwritemask;
|
|
description_out.stencil_front_fail_op =
|
|
normalized_depth_control.stencilfail;
|
|
description_out.stencil_front_depth_fail_op =
|
|
normalized_depth_control.stencilzfail;
|
|
description_out.stencil_front_pass_op =
|
|
normalized_depth_control.stencilzpass;
|
|
description_out.stencil_front_func =
|
|
normalized_depth_control.stencilfunc;
|
|
if (stencil_backface_enable) {
|
|
description_out.stencil_back_fail_op =
|
|
normalized_depth_control.stencilfail_bf;
|
|
description_out.stencil_back_depth_fail_op =
|
|
normalized_depth_control.stencilzfail_bf;
|
|
description_out.stencil_back_pass_op =
|
|
normalized_depth_control.stencilzpass_bf;
|
|
description_out.stencil_back_func =
|
|
normalized_depth_control.stencilfunc_bf;
|
|
} else {
|
|
description_out.stencil_back_fail_op =
|
|
description_out.stencil_front_fail_op;
|
|
description_out.stencil_back_depth_fail_op =
|
|
description_out.stencil_front_depth_fail_op;
|
|
description_out.stencil_back_pass_op =
|
|
description_out.stencil_front_pass_op;
|
|
description_out.stencil_back_func =
|
|
description_out.stencil_front_func;
|
|
}
|
|
}
|
|
// If not binding the DSV, ignore the format in the hash.
|
|
if (description_out.depth_func != xenos::CompareFunction::kAlways ||
|
|
description_out.depth_write || description_out.stencil_enable) {
|
|
description_out.depth_format = xenos::DepthRenderTargetFormat(
|
|
bound_depth_and_color_render_target_formats[0]);
|
|
depth_stencil_bound_and_used = true;
|
|
}
|
|
} else {
|
|
description_out.depth_func = xenos::CompareFunction::kAlways;
|
|
}
|
|
|
|
// Render targets and blending state. 32 because of 0x1F mask, for safety
|
|
// (all unknown to zero).
|
|
static const PipelineBlendFactor kBlendFactorMap[32] = {
|
|
/* 0 */ PipelineBlendFactor::kZero,
|
|
/* 1 */ PipelineBlendFactor::kOne,
|
|
/* 2 */ PipelineBlendFactor::kZero, // ?
|
|
/* 3 */ PipelineBlendFactor::kZero, // ?
|
|
/* 4 */ PipelineBlendFactor::kSrcColor,
|
|
/* 5 */ PipelineBlendFactor::kInvSrcColor,
|
|
/* 6 */ PipelineBlendFactor::kSrcAlpha,
|
|
/* 7 */ PipelineBlendFactor::kInvSrcAlpha,
|
|
/* 8 */ PipelineBlendFactor::kDestColor,
|
|
/* 9 */ PipelineBlendFactor::kInvDestColor,
|
|
/* 10 */ PipelineBlendFactor::kDestAlpha,
|
|
/* 11 */ PipelineBlendFactor::kInvDestAlpha,
|
|
// CONSTANT_COLOR
|
|
/* 12 */ PipelineBlendFactor::kBlendFactor,
|
|
// ONE_MINUS_CONSTANT_COLOR
|
|
/* 13 */ PipelineBlendFactor::kInvBlendFactor,
|
|
// CONSTANT_ALPHA
|
|
/* 14 */ PipelineBlendFactor::kBlendFactor,
|
|
// ONE_MINUS_CONSTANT_ALPHA
|
|
/* 15 */ PipelineBlendFactor::kInvBlendFactor,
|
|
/* 16 */ PipelineBlendFactor::kSrcAlphaSat,
|
|
};
|
|
// Like kBlendFactorMap, but with color modes changed to alpha. Some
|
|
// pipelines aren't created in 545407E0 because a color mode is used for
|
|
// alpha.
|
|
static const PipelineBlendFactor kBlendFactorAlphaMap[32] = {
|
|
/* 0 */ PipelineBlendFactor::kZero,
|
|
/* 1 */ PipelineBlendFactor::kOne,
|
|
/* 2 */ PipelineBlendFactor::kZero, // ?
|
|
/* 3 */ PipelineBlendFactor::kZero, // ?
|
|
/* 4 */ PipelineBlendFactor::kSrcAlpha,
|
|
/* 5 */ PipelineBlendFactor::kInvSrcAlpha,
|
|
/* 6 */ PipelineBlendFactor::kSrcAlpha,
|
|
/* 7 */ PipelineBlendFactor::kInvSrcAlpha,
|
|
/* 8 */ PipelineBlendFactor::kDestAlpha,
|
|
/* 9 */ PipelineBlendFactor::kInvDestAlpha,
|
|
/* 10 */ PipelineBlendFactor::kDestAlpha,
|
|
/* 11 */ PipelineBlendFactor::kInvDestAlpha,
|
|
/* 12 */ PipelineBlendFactor::kBlendFactor,
|
|
// ONE_MINUS_CONSTANT_COLOR
|
|
/* 13 */ PipelineBlendFactor::kInvBlendFactor,
|
|
// CONSTANT_ALPHA
|
|
/* 14 */ PipelineBlendFactor::kBlendFactor,
|
|
// ONE_MINUS_CONSTANT_ALPHA
|
|
/* 15 */ PipelineBlendFactor::kInvBlendFactor,
|
|
/* 16 */ PipelineBlendFactor::kSrcAlphaSat,
|
|
};
|
|
// While it's okay to specify fewer render targets in the pipeline state
|
|
// (even fewer than written by the shader) than actually bound to the
|
|
// command list (though this kind of truncation may only happen at the end -
|
|
// DXGI_FORMAT_UNKNOWN *requires* a null RTV descriptor to be bound), not
|
|
// doing that because sample counts of all render targets bound via
|
|
// OMSetRenderTargets, even those beyond NumRenderTargets, apparently must
|
|
// have their sample count matching the one set in the pipeline - however if
|
|
// we set NumRenderTargets to 0 and also disable depth / stencil, the sample
|
|
// count must be set to 1 - while the command list may still have
|
|
// multisampled render targets bound (happens in 4D5307E6 main menu).
|
|
// TODO(Triang3l): Investigate interaction of OMSetRenderTargets with
|
|
// non-null depth and DSVFormat DXGI_FORMAT_UNKNOWN in the same case.
|
|
for (uint32_t i = 0; i < 4; ++i) {
|
|
if (!(bound_depth_and_color_render_target_bits &
|
|
(uint32_t(1) << (1 + i)))) {
|
|
continue;
|
|
}
|
|
PipelineRenderTarget& rt = description_out.render_targets[i];
|
|
rt.used = 1;
|
|
auto color_info = regs.Get<reg::RB_COLOR_INFO>(
|
|
reg::RB_COLOR_INFO::rt_register_indices[i]);
|
|
rt.format = xenos::ColorRenderTargetFormat(
|
|
bound_depth_and_color_render_target_formats[1 + i]);
|
|
rt.write_mask = (normalized_color_mask >> (i * 4)) & 0xF;
|
|
if (rt.write_mask) {
|
|
auto blendcontrol = regs.Get<reg::RB_BLENDCONTROL>(
|
|
reg::RB_BLENDCONTROL::rt_register_indices[i]);
|
|
rt.src_blend = kBlendFactorMap[uint32_t(blendcontrol.color_srcblend)];
|
|
rt.dest_blend = kBlendFactorMap[uint32_t(blendcontrol.color_destblend)];
|
|
rt.blend_op = blendcontrol.color_comb_fcn;
|
|
rt.src_blend_alpha =
|
|
kBlendFactorAlphaMap[uint32_t(blendcontrol.alpha_srcblend)];
|
|
rt.dest_blend_alpha =
|
|
kBlendFactorAlphaMap[uint32_t(blendcontrol.alpha_destblend)];
|
|
rt.blend_op_alpha = blendcontrol.alpha_comb_fcn;
|
|
} else {
|
|
rt.src_blend = PipelineBlendFactor::kOne;
|
|
rt.dest_blend = PipelineBlendFactor::kZero;
|
|
rt.blend_op = xenos::BlendOp::kAdd;
|
|
rt.src_blend_alpha = PipelineBlendFactor::kOne;
|
|
rt.dest_blend_alpha = PipelineBlendFactor::kZero;
|
|
rt.blend_op_alpha = xenos::BlendOp::kAdd;
|
|
}
|
|
}
|
|
}
|
|
xenos::MsaaSamples host_msaa_samples =
|
|
regs.Get<reg::RB_SURFACE_INFO>().msaa_samples;
|
|
if (edram_rov_used) {
|
|
if (host_msaa_samples == xenos::MsaaSamples::k2X) {
|
|
// 2 is not supported in ForcedSampleCount on Nvidia.
|
|
host_msaa_samples = xenos::MsaaSamples::k4X;
|
|
}
|
|
} else {
|
|
if (!(bound_depth_and_color_render_target_bits & ~uint32_t(1)) &&
|
|
!depth_stencil_bound_and_used) {
|
|
// Direct3D 12 requires the sample count to be 1 when no color or depth /
|
|
// stencil render targets are bound.
|
|
// FIXME(Triang3l): Use ForcedSampleCount or some other fallback for
|
|
// sample counting when needed, though with 2x it will be as incorrect as
|
|
// with 1x / 4x anyway; or bind a dummy depth / stencil buffer if really
|
|
// needed.
|
|
host_msaa_samples = xenos::MsaaSamples::k1X;
|
|
}
|
|
// TODO(Triang3l): 4x MSAA fallback when 2x isn't supported.
|
|
}
|
|
description_out.host_msaa_samples = host_msaa_samples;
|
|
|
|
return true;
|
|
}
|
|
|
|
// Builds the key describing the geometry shader required for the given
// geometry shader type and the vertex / pixel shader modifications.
//
// Returns false without touching key_out when geometry_shader_type is kNone.
// Otherwise, writes the freshly built key to key_out and returns true.
bool PipelineCache::GetGeometryShaderKey(
    PipelineGeometryShader geometry_shader_type,
    DxbcShaderTranslator::Modification vertex_shader_modification,
    DxbcShaderTranslator::Modification pixel_shader_modification,
    GeometryShaderKey& key_out) {
  // Nothing to describe if the geometry shader stage is not used.
  if (geometry_shader_type == PipelineGeometryShader::kNone) {
    return false;
  }

  // The interpolator count in the key is derived from the vertex shader mask
  // only, so both stages are expected to carry the same mask.
  assert_true(vertex_shader_modification.vertex.interpolator_mask ==
              pixel_shader_modification.pixel.interpolator_mask);

  const auto& vertex_modification = vertex_shader_modification.vertex;
  const auto& pixel_modification = pixel_shader_modification.pixel;

  // Fill a local key first so key_out is replaced as a whole at the end.
  GeometryShaderKey new_key;
  new_key.type = geometry_shader_type;
  // Parameters taken from the vertex shader modification.
  new_key.interpolator_count =
      xe::bit_count(vertex_modification.interpolator_mask);
  new_key.user_clip_plane_count = vertex_modification.user_clip_plane_count;
  new_key.user_clip_plane_cull = vertex_modification.user_clip_plane_cull;
  new_key.has_vertex_kill_and = vertex_modification.vertex_kill_and;
  new_key.has_point_size = vertex_modification.output_point_size;
  // Parameters taken from the pixel shader modification.
  new_key.has_point_coordinates = pixel_modification.param_gen_point;

  key_out = new_key;
  return true;
}
|
|
|
|
void PipelineCache::CreateDxbcGeometryShader(
|
|
GeometryShaderKey key, std::vector<uint32_t>& shader_out) {
|
|
shader_out.clear();
|
|
|
|
// RDEF, ISGN, OSG5, SHEX, STAT.
|
|
constexpr uint32_t kBlobCount = 5;
|
|
|
|
// Allocate space for the container header and the blob offsets.
|
|
shader_out.resize(sizeof(dxbc::ContainerHeader) / sizeof(uint32_t) +
|
|
kBlobCount);
|
|
uint32_t blob_offset_position_dwords =
|
|
sizeof(dxbc::ContainerHeader) / sizeof(uint32_t);
|
|
uint32_t blob_position_dwords = uint32_t(shader_out.size());
|
|
constexpr uint32_t kBlobHeaderSizeDwords =
|
|
sizeof(dxbc::BlobHeader) / sizeof(uint32_t);
|
|
|
|
uint32_t name_ptr;
|
|
|
|
// ***************************************************************************
|
|
// Resource definition
|
|
// ***************************************************************************
|
|
|
|
shader_out[blob_offset_position_dwords] =
|
|
uint32_t(blob_position_dwords * sizeof(uint32_t));
|
|
uint32_t rdef_position_dwords = blob_position_dwords + kBlobHeaderSizeDwords;
|
|
// Not needed, as the next operation done is resize, to allocate the space for
|
|
// both the blob header and the resource definition header.
|
|
// shader_out.resize(rdef_position_dwords);
|
|
|
|
// RDEF header - the actual definitions will be written if needed.
|
|
shader_out.resize(rdef_position_dwords +
|
|
sizeof(dxbc::RdefHeader) / sizeof(uint32_t));
|
|
// Generator name.
|
|
dxbc::AppendAlignedString(shader_out, "Xenia");
|
|
{
|
|
auto& rdef_header = *reinterpret_cast<dxbc::RdefHeader*>(
|
|
shader_out.data() + rdef_position_dwords);
|
|
rdef_header.shader_model = dxbc::RdefShaderModel::kGeometryShader5_1;
|
|
rdef_header.compile_flags =
|
|
dxbc::kCompileFlagNoPreshader | dxbc::kCompileFlagPreferFlowControl |
|
|
dxbc::kCompileFlagIeeeStrictness | dxbc::kCompileFlagAllResourcesBound;
|
|
// Generator name is right after the header.
|
|
rdef_header.generator_name_ptr = sizeof(dxbc::RdefHeader);
|
|
rdef_header.fourcc = dxbc::RdefHeader::FourCC::k5_1;
|
|
rdef_header.InitializeSizes();
|
|
}
|
|
|
|
uint32_t system_cbuffer_size_vector_aligned_bytes = 0;
|
|
|
|
if (key.type == PipelineGeometryShader::kPointList) {
|
|
// Need point parameters from the system constants.
|
|
|
|
// Constant types - float2 only.
|
|
// Names.
|
|
name_ptr =
|
|
uint32_t((shader_out.size() - rdef_position_dwords) * sizeof(uint32_t));
|
|
uint32_t rdef_name_ptr_float2 = name_ptr;
|
|
name_ptr += dxbc::AppendAlignedString(shader_out, "float2");
|
|
// Types.
|
|
uint32_t rdef_type_float2_position_dwords = uint32_t(shader_out.size());
|
|
uint32_t rdef_type_float2_ptr =
|
|
uint32_t((rdef_type_float2_position_dwords - rdef_position_dwords) *
|
|
sizeof(uint32_t));
|
|
shader_out.resize(rdef_type_float2_position_dwords +
|
|
sizeof(dxbc::RdefType) / sizeof(uint32_t));
|
|
{
|
|
auto& rdef_type_float2 = *reinterpret_cast<dxbc::RdefType*>(
|
|
shader_out.data() + rdef_type_float2_position_dwords);
|
|
rdef_type_float2.variable_class = dxbc::RdefVariableClass::kVector;
|
|
rdef_type_float2.variable_type = dxbc::RdefVariableType::kFloat;
|
|
rdef_type_float2.row_count = 1;
|
|
rdef_type_float2.column_count = 2;
|
|
rdef_type_float2.name_ptr = rdef_name_ptr_float2;
|
|
}
|
|
|
|
// Constants:
|
|
// - float2 xe_point_constant_diameter
|
|
// - float2 xe_point_screen_diameter_to_ndc_radius
|
|
enum PointConstant : uint32_t {
|
|
kPointConstantConstantDiameter,
|
|
kPointConstantScreenDiameterToNDCRadius,
|
|
kPointConstantCount,
|
|
};
|
|
// Names.
|
|
name_ptr =
|
|
uint32_t((shader_out.size() - rdef_position_dwords) * sizeof(uint32_t));
|
|
uint32_t rdef_name_ptr_xe_point_constant_diameter = name_ptr;
|
|
name_ptr +=
|
|
dxbc::AppendAlignedString(shader_out, "xe_point_constant_diameter");
|
|
uint32_t rdef_name_ptr_xe_point_screen_diameter_to_ndc_radius = name_ptr;
|
|
name_ptr += dxbc::AppendAlignedString(
|
|
shader_out, "xe_point_screen_diameter_to_ndc_radius");
|
|
// Constants.
|
|
uint32_t rdef_constants_position_dwords = uint32_t(shader_out.size());
|
|
uint32_t rdef_constants_ptr =
|
|
uint32_t((rdef_constants_position_dwords - rdef_position_dwords) *
|
|
sizeof(uint32_t));
|
|
shader_out.resize(rdef_constants_position_dwords +
|
|
sizeof(dxbc::RdefVariable) / sizeof(uint32_t) *
|
|
kPointConstantCount);
|
|
{
|
|
auto rdef_constants = reinterpret_cast<dxbc::RdefVariable*>(
|
|
shader_out.data() + rdef_constants_position_dwords);
|
|
// float2 xe_point_constant_diameter
|
|
static_assert(
|
|
sizeof(DxbcShaderTranslator::SystemConstants ::
|
|
point_constant_diameter) == sizeof(float) * 2,
|
|
"DxbcShaderTranslator point_constant_diameter system constant size "
|
|
"differs between the shader translator and geometry shader "
|
|
"generation");
|
|
static_assert_size(
|
|
DxbcShaderTranslator::SystemConstants::point_constant_diameter,
|
|
sizeof(float) * 2);
|
|
dxbc::RdefVariable& rdef_constant_point_constant_diameter =
|
|
rdef_constants[kPointConstantConstantDiameter];
|
|
rdef_constant_point_constant_diameter.name_ptr =
|
|
rdef_name_ptr_xe_point_constant_diameter;
|
|
rdef_constant_point_constant_diameter.start_offset_bytes = offsetof(
|
|
DxbcShaderTranslator::SystemConstants, point_constant_diameter);
|
|
rdef_constant_point_constant_diameter.size_bytes = sizeof(float) * 2;
|
|
rdef_constant_point_constant_diameter.flags = dxbc::kRdefVariableFlagUsed;
|
|
rdef_constant_point_constant_diameter.type_ptr = rdef_type_float2_ptr;
|
|
rdef_constant_point_constant_diameter.start_texture = UINT32_MAX;
|
|
rdef_constant_point_constant_diameter.start_sampler = UINT32_MAX;
|
|
// float2 xe_point_screen_diameter_to_ndc_radius
|
|
static_assert(
|
|
sizeof(DxbcShaderTranslator::SystemConstants ::
|
|
point_screen_diameter_to_ndc_radius) == sizeof(float) * 2,
|
|
"DxbcShaderTranslator point_screen_diameter_to_ndc_radius system "
|
|
"constant size differs between the shader translator and geometry "
|
|
"shader generation");
|
|
dxbc::RdefVariable& rdef_constant_point_screen_diameter_to_ndc_radius =
|
|
rdef_constants[kPointConstantScreenDiameterToNDCRadius];
|
|
rdef_constant_point_screen_diameter_to_ndc_radius.name_ptr =
|
|
rdef_name_ptr_xe_point_screen_diameter_to_ndc_radius;
|
|
rdef_constant_point_screen_diameter_to_ndc_radius.start_offset_bytes =
|
|
offsetof(DxbcShaderTranslator::SystemConstants,
|
|
point_screen_diameter_to_ndc_radius);
|
|
rdef_constant_point_screen_diameter_to_ndc_radius.size_bytes =
|
|
sizeof(float) * 2;
|
|
rdef_constant_point_screen_diameter_to_ndc_radius.flags =
|
|
dxbc::kRdefVariableFlagUsed;
|
|
rdef_constant_point_screen_diameter_to_ndc_radius.type_ptr =
|
|
rdef_type_float2_ptr;
|
|
rdef_constant_point_screen_diameter_to_ndc_radius.start_texture =
|
|
UINT32_MAX;
|
|
rdef_constant_point_screen_diameter_to_ndc_radius.start_sampler =
|
|
UINT32_MAX;
|
|
}
|
|
|
|
// Constant buffers - xe_system_cbuffer only.
|
|
|
|
// Names.
|
|
name_ptr =
|
|
uint32_t((shader_out.size() - rdef_position_dwords) * sizeof(uint32_t));
|
|
uint32_t rdef_name_ptr_xe_system_cbuffer = name_ptr;
|
|
name_ptr += dxbc::AppendAlignedString(shader_out, "xe_system_cbuffer");
|
|
// Constant buffers.
|
|
uint32_t rdef_cbuffer_position_dwords = uint32_t(shader_out.size());
|
|
shader_out.resize(rdef_cbuffer_position_dwords +
|
|
sizeof(dxbc::RdefCbuffer) / sizeof(uint32_t));
|
|
{
|
|
auto& rdef_cbuffer_system = *reinterpret_cast<dxbc::RdefCbuffer*>(
|
|
shader_out.data() + rdef_cbuffer_position_dwords);
|
|
rdef_cbuffer_system.name_ptr = rdef_name_ptr_xe_system_cbuffer;
|
|
rdef_cbuffer_system.variable_count = kPointConstantCount;
|
|
rdef_cbuffer_system.variables_ptr = rdef_constants_ptr;
|
|
auto rdef_constants = reinterpret_cast<const dxbc::RdefVariable*>(
|
|
shader_out.data() + rdef_constants_position_dwords);
|
|
for (uint32_t i = 0; i < kPointConstantCount; ++i) {
|
|
system_cbuffer_size_vector_aligned_bytes =
|
|
std::max(system_cbuffer_size_vector_aligned_bytes,
|
|
rdef_constants[i].start_offset_bytes +
|
|
rdef_constants[i].size_bytes);
|
|
}
|
|
system_cbuffer_size_vector_aligned_bytes =
|
|
xe::align(system_cbuffer_size_vector_aligned_bytes,
|
|
uint32_t(sizeof(uint32_t) * 4));
|
|
rdef_cbuffer_system.size_vector_aligned_bytes =
|
|
system_cbuffer_size_vector_aligned_bytes;
|
|
}
|
|
|
|
// Bindings - xe_system_cbuffer only.
|
|
uint32_t rdef_binding_position_dwords = uint32_t(shader_out.size());
|
|
shader_out.resize(rdef_binding_position_dwords +
|
|
sizeof(dxbc::RdefInputBind) / sizeof(uint32_t));
|
|
{
|
|
auto& rdef_binding_cbuffer_system =
|
|
*reinterpret_cast<dxbc::RdefInputBind*>(shader_out.data() +
|
|
rdef_binding_position_dwords);
|
|
rdef_binding_cbuffer_system.name_ptr = rdef_name_ptr_xe_system_cbuffer;
|
|
rdef_binding_cbuffer_system.type = dxbc::RdefInputType::kCbuffer;
|
|
rdef_binding_cbuffer_system.bind_point =
|
|
uint32_t(DxbcShaderTranslator::CbufferRegister::kSystemConstants);
|
|
rdef_binding_cbuffer_system.bind_count = 1;
|
|
rdef_binding_cbuffer_system.flags = dxbc::kRdefInputFlagUserPacked;
|
|
}
|
|
|
|
// Pointers in the header.
|
|
{
|
|
auto& rdef_header = *reinterpret_cast<dxbc::RdefHeader*>(
|
|
shader_out.data() + rdef_position_dwords);
|
|
rdef_header.cbuffer_count = 1;
|
|
rdef_header.cbuffers_ptr =
|
|
uint32_t((rdef_cbuffer_position_dwords - rdef_position_dwords) *
|
|
sizeof(uint32_t));
|
|
rdef_header.input_bind_count = 1;
|
|
rdef_header.input_binds_ptr =
|
|
uint32_t((rdef_binding_position_dwords - rdef_position_dwords) *
|
|
sizeof(uint32_t));
|
|
}
|
|
}
|
|
|
|
{
|
|
auto& blob_header = *reinterpret_cast<dxbc::BlobHeader*>(
|
|
shader_out.data() + blob_position_dwords);
|
|
blob_header.fourcc = dxbc::BlobHeader::FourCC::kResourceDefinition;
|
|
blob_position_dwords = uint32_t(shader_out.size());
|
|
blob_header.size_bytes =
|
|
(blob_position_dwords - kBlobHeaderSizeDwords) * sizeof(uint32_t) -
|
|
shader_out[blob_offset_position_dwords++];
|
|
}
|
|
|
|
// ***************************************************************************
|
|
// Input signature
|
|
// ***************************************************************************
|
|
|
|
// Clip and cull distances are tightly packed together into registers, but
|
|
// have separate signature parameters with each being a vec4-aligned window.
|
|
uint32_t input_clip_distance_count =
|
|
key.user_clip_plane_cull ? 0 : key.user_clip_plane_count;
|
|
uint32_t input_cull_distance_count =
|
|
(key.user_clip_plane_cull ? key.user_clip_plane_count : 0) +
|
|
key.has_vertex_kill_and;
|
|
uint32_t input_clip_and_cull_distance_count =
|
|
input_clip_distance_count + input_cull_distance_count;
|
|
|
|
// Interpolators, position, clip and cull distances (parameters containing
|
|
// only clip or cull distances, and also one parameter containing both if
|
|
// present), point size.
|
|
uint32_t isgn_parameter_count =
|
|
key.interpolator_count + 1 +
|
|
((input_clip_and_cull_distance_count + 3) / 4) +
|
|
uint32_t(input_cull_distance_count &&
|
|
(input_clip_distance_count & 3) != 0) +
|
|
key.has_point_size;
|
|
|
|
// Reserve space for the header and the parameters.
|
|
shader_out[blob_offset_position_dwords] =
|
|
uint32_t(blob_position_dwords * sizeof(uint32_t));
|
|
uint32_t isgn_position_dwords = blob_position_dwords + kBlobHeaderSizeDwords;
|
|
shader_out.resize(isgn_position_dwords +
|
|
sizeof(dxbc::Signature) / sizeof(uint32_t) +
|
|
sizeof(dxbc::SignatureParameter) / sizeof(uint32_t) *
|
|
isgn_parameter_count);
|
|
|
|
// Names (after the parameters).
|
|
name_ptr =
|
|
uint32_t((shader_out.size() - isgn_position_dwords) * sizeof(uint32_t));
|
|
uint32_t isgn_name_ptr_texcoord = name_ptr;
|
|
if (key.interpolator_count) {
|
|
name_ptr += dxbc::AppendAlignedString(shader_out, "TEXCOORD");
|
|
}
|
|
uint32_t isgn_name_ptr_sv_position = name_ptr;
|
|
name_ptr += dxbc::AppendAlignedString(shader_out, "SV_Position");
|
|
uint32_t isgn_name_ptr_sv_clip_distance = name_ptr;
|
|
if (input_clip_distance_count) {
|
|
name_ptr += dxbc::AppendAlignedString(shader_out, "SV_ClipDistance");
|
|
}
|
|
uint32_t isgn_name_ptr_sv_cull_distance = name_ptr;
|
|
if (input_cull_distance_count) {
|
|
name_ptr += dxbc::AppendAlignedString(shader_out, "SV_CullDistance");
|
|
}
|
|
uint32_t isgn_name_ptr_xepsize = name_ptr;
|
|
if (key.has_point_size) {
|
|
name_ptr += dxbc::AppendAlignedString(shader_out, "XEPSIZE");
|
|
}
|
|
|
|
// Header and parameters.
|
|
uint32_t input_register_interpolators = UINT32_MAX;
|
|
uint32_t input_register_position;
|
|
uint32_t input_register_clip_and_cull_distances = UINT32_MAX;
|
|
uint32_t input_register_point_size = UINT32_MAX;
|
|
{
|
|
// Header.
|
|
auto& isgn_header = *reinterpret_cast<dxbc::Signature*>(
|
|
shader_out.data() + isgn_position_dwords);
|
|
isgn_header.parameter_count = isgn_parameter_count;
|
|
isgn_header.parameter_info_ptr = sizeof(dxbc::Signature);
|
|
|
|
// Parameters.
|
|
auto isgn_parameters = reinterpret_cast<dxbc::SignatureParameter*>(
|
|
shader_out.data() + isgn_position_dwords +
|
|
sizeof(dxbc::Signature) / sizeof(uint32_t));
|
|
uint32_t isgn_parameter_index = 0;
|
|
uint32_t input_register_index = 0;
|
|
|
|
// Interpolators (TEXCOORD#).
|
|
if (key.interpolator_count) {
|
|
input_register_interpolators = input_register_index;
|
|
for (uint32_t i = 0; i < key.interpolator_count; ++i) {
|
|
assert_true(isgn_parameter_index < isgn_parameter_count);
|
|
dxbc::SignatureParameter& isgn_interpolator =
|
|
isgn_parameters[isgn_parameter_index++];
|
|
isgn_interpolator.semantic_name_ptr = isgn_name_ptr_texcoord;
|
|
isgn_interpolator.semantic_index = i;
|
|
isgn_interpolator.component_type =
|
|
dxbc::SignatureRegisterComponentType::kFloat32;
|
|
isgn_interpolator.register_index = input_register_index++;
|
|
isgn_interpolator.mask = 0b1111;
|
|
isgn_interpolator.always_reads_mask = 0b1111;
|
|
}
|
|
}
|
|
|
|
// Position (SV_Position).
|
|
input_register_position = input_register_index;
|
|
assert_true(isgn_parameter_index < isgn_parameter_count);
|
|
dxbc::SignatureParameter& isgn_sv_position =
|
|
isgn_parameters[isgn_parameter_index++];
|
|
isgn_sv_position.semantic_name_ptr = isgn_name_ptr_sv_position;
|
|
isgn_sv_position.system_value = dxbc::Name::kPosition;
|
|
isgn_sv_position.component_type =
|
|
dxbc::SignatureRegisterComponentType::kFloat32;
|
|
isgn_sv_position.register_index = input_register_index++;
|
|
isgn_sv_position.mask = 0b1111;
|
|
isgn_sv_position.always_reads_mask = 0b1111;
|
|
|
|
// Clip and cull distances (SV_ClipDistance#, SV_CullDistance#).
|
|
if (input_clip_and_cull_distance_count) {
|
|
input_register_clip_and_cull_distances = input_register_index;
|
|
uint32_t isgn_cull_distance_semantic_index = 0;
|
|
for (uint32_t i = 0; i < input_clip_and_cull_distance_count; i += 4) {
|
|
if (i < input_clip_distance_count) {
|
|
dxbc::SignatureParameter& isgn_sv_clip_distance =
|
|
isgn_parameters[isgn_parameter_index++];
|
|
isgn_sv_clip_distance.semantic_name_ptr =
|
|
isgn_name_ptr_sv_clip_distance;
|
|
isgn_sv_clip_distance.semantic_index = i / 4;
|
|
isgn_sv_clip_distance.system_value = dxbc::Name::kClipDistance;
|
|
isgn_sv_clip_distance.component_type =
|
|
dxbc::SignatureRegisterComponentType::kFloat32;
|
|
isgn_sv_clip_distance.register_index = input_register_index;
|
|
uint8_t isgn_sv_clip_distance_mask =
|
|
(UINT8_C(1) << std::min(input_clip_distance_count - i,
|
|
UINT32_C(4))) -
|
|
1;
|
|
isgn_sv_clip_distance.mask = isgn_sv_clip_distance_mask;
|
|
isgn_sv_clip_distance.always_reads_mask = isgn_sv_clip_distance_mask;
|
|
}
|
|
if (input_cull_distance_count && i + 4 > input_clip_distance_count) {
|
|
dxbc::SignatureParameter& isgn_sv_cull_distance =
|
|
isgn_parameters[isgn_parameter_index++];
|
|
isgn_sv_cull_distance.semantic_name_ptr =
|
|
isgn_name_ptr_sv_cull_distance;
|
|
isgn_sv_cull_distance.semantic_index =
|
|
isgn_cull_distance_semantic_index++;
|
|
isgn_sv_cull_distance.system_value = dxbc::Name::kCullDistance;
|
|
isgn_sv_cull_distance.component_type =
|
|
dxbc::SignatureRegisterComponentType::kFloat32;
|
|
isgn_sv_cull_distance.register_index = input_register_index;
|
|
uint8_t isgn_sv_cull_distance_mask =
|
|
(UINT8_C(1) << std::min(input_clip_and_cull_distance_count - i,
|
|
UINT32_C(4))) -
|
|
1;
|
|
if (i < input_clip_distance_count) {
|
|
isgn_sv_cull_distance_mask &=
|
|
~((UINT8_C(1) << (input_clip_distance_count - i)) - 1);
|
|
}
|
|
isgn_sv_cull_distance.mask = isgn_sv_cull_distance_mask;
|
|
isgn_sv_cull_distance.always_reads_mask = isgn_sv_cull_distance_mask;
|
|
}
|
|
++input_register_index;
|
|
}
|
|
}
|
|
|
|
// Point size (XEPSIZE).
|
|
if (key.has_point_size) {
|
|
input_register_point_size = input_register_index;
|
|
assert_true(isgn_parameter_index < isgn_parameter_count);
|
|
dxbc::SignatureParameter& isgn_point_size =
|
|
isgn_parameters[isgn_parameter_index++];
|
|
isgn_point_size.semantic_name_ptr = isgn_name_ptr_xepsize;
|
|
isgn_point_size.component_type =
|
|
dxbc::SignatureRegisterComponentType::kFloat32;
|
|
isgn_point_size.register_index = input_register_index++;
|
|
isgn_point_size.mask = 0b0001;
|
|
isgn_point_size.always_reads_mask =
|
|
key.type == PipelineGeometryShader::kPointList ? 0b0001 : 0;
|
|
}
|
|
|
|
assert_true(isgn_parameter_index == isgn_parameter_count);
|
|
}
|
|
|
|
{
|
|
auto& blob_header = *reinterpret_cast<dxbc::BlobHeader*>(
|
|
shader_out.data() + blob_position_dwords);
|
|
blob_header.fourcc = dxbc::BlobHeader::FourCC::kInputSignature;
|
|
blob_position_dwords = uint32_t(shader_out.size());
|
|
blob_header.size_bytes =
|
|
(blob_position_dwords - kBlobHeaderSizeDwords) * sizeof(uint32_t) -
|
|
shader_out[blob_offset_position_dwords++];
|
|
}
|
|
|
|
// ***************************************************************************
|
|
// Output signature
|
|
// ***************************************************************************
|
|
|
|
// Interpolators, point coordinates, position, clip distances.
|
|
uint32_t osgn_parameter_count = key.interpolator_count +
|
|
key.has_point_coordinates + 1 +
|
|
((input_clip_distance_count + 3) / 4);
|
|
|
|
// Reserve space for the header and the parameters.
|
|
shader_out[blob_offset_position_dwords] =
|
|
uint32_t(blob_position_dwords * sizeof(uint32_t));
|
|
uint32_t osgn_position_dwords = blob_position_dwords + kBlobHeaderSizeDwords;
|
|
shader_out.resize(osgn_position_dwords +
|
|
sizeof(dxbc::Signature) / sizeof(uint32_t) +
|
|
sizeof(dxbc::SignatureParameterForGS) / sizeof(uint32_t) *
|
|
osgn_parameter_count);
|
|
|
|
// Names (after the parameters).
|
|
name_ptr =
|
|
uint32_t((shader_out.size() - osgn_position_dwords) * sizeof(uint32_t));
|
|
uint32_t osgn_name_ptr_texcoord = name_ptr;
|
|
if (key.interpolator_count) {
|
|
name_ptr += dxbc::AppendAlignedString(shader_out, "TEXCOORD");
|
|
}
|
|
uint32_t osgn_name_ptr_xespritetexcoord = name_ptr;
|
|
if (key.has_point_coordinates) {
|
|
name_ptr += dxbc::AppendAlignedString(shader_out, "XESPRITETEXCOORD");
|
|
}
|
|
uint32_t osgn_name_ptr_sv_position = name_ptr;
|
|
name_ptr += dxbc::AppendAlignedString(shader_out, "SV_Position");
|
|
uint32_t osgn_name_ptr_sv_clip_distance = name_ptr;
|
|
if (input_clip_distance_count) {
|
|
name_ptr += dxbc::AppendAlignedString(shader_out, "SV_ClipDistance");
|
|
}
|
|
|
|
// Header and parameters.
|
|
uint32_t output_register_interpolators = UINT32_MAX;
|
|
uint32_t output_register_point_coordinates = UINT32_MAX;
|
|
uint32_t output_register_position;
|
|
uint32_t output_register_clip_distances = UINT32_MAX;
|
|
{
|
|
// Header.
|
|
auto& osgn_header = *reinterpret_cast<dxbc::Signature*>(
|
|
shader_out.data() + osgn_position_dwords);
|
|
osgn_header.parameter_count = osgn_parameter_count;
|
|
osgn_header.parameter_info_ptr = sizeof(dxbc::Signature);
|
|
|
|
// Parameters.
|
|
auto osgn_parameters = reinterpret_cast<dxbc::SignatureParameterForGS*>(
|
|
shader_out.data() + osgn_position_dwords +
|
|
sizeof(dxbc::Signature) / sizeof(uint32_t));
|
|
uint32_t osgn_parameter_index = 0;
|
|
uint32_t output_register_index = 0;
|
|
|
|
// Interpolators (TEXCOORD#).
|
|
if (key.interpolator_count) {
|
|
output_register_interpolators = output_register_index;
|
|
for (uint32_t i = 0; i < key.interpolator_count; ++i) {
|
|
assert_true(osgn_parameter_index < osgn_parameter_count);
|
|
dxbc::SignatureParameterForGS& osgn_interpolator =
|
|
osgn_parameters[osgn_parameter_index++];
|
|
osgn_interpolator.semantic_name_ptr = osgn_name_ptr_texcoord;
|
|
osgn_interpolator.semantic_index = i;
|
|
osgn_interpolator.component_type =
|
|
dxbc::SignatureRegisterComponentType::kFloat32;
|
|
osgn_interpolator.register_index = output_register_index++;
|
|
osgn_interpolator.mask = 0b1111;
|
|
}
|
|
}
|
|
|
|
// Point coordinates (XESPRITETEXCOORD).
|
|
if (key.has_point_coordinates) {
|
|
output_register_point_coordinates = output_register_index;
|
|
assert_true(osgn_parameter_index < osgn_parameter_count);
|
|
dxbc::SignatureParameterForGS& osgn_point_coordinates =
|
|
osgn_parameters[osgn_parameter_index++];
|
|
osgn_point_coordinates.semantic_name_ptr = osgn_name_ptr_xespritetexcoord;
|
|
osgn_point_coordinates.component_type =
|
|
dxbc::SignatureRegisterComponentType::kFloat32;
|
|
osgn_point_coordinates.register_index = output_register_index++;
|
|
osgn_point_coordinates.mask = 0b0011;
|
|
osgn_point_coordinates.never_writes_mask = 0b1100;
|
|
}
|
|
|
|
// Position (SV_Position).
|
|
output_register_position = output_register_index;
|
|
assert_true(osgn_parameter_index < osgn_parameter_count);
|
|
dxbc::SignatureParameterForGS& osgn_sv_position =
|
|
osgn_parameters[osgn_parameter_index++];
|
|
osgn_sv_position.semantic_name_ptr = osgn_name_ptr_sv_position;
|
|
osgn_sv_position.system_value = dxbc::Name::kPosition;
|
|
osgn_sv_position.component_type =
|
|
dxbc::SignatureRegisterComponentType::kFloat32;
|
|
osgn_sv_position.register_index = output_register_index++;
|
|
osgn_sv_position.mask = 0b1111;
|
|
|
|
// Clip distances (SV_ClipDistance#).
|
|
if (input_clip_distance_count) {
|
|
output_register_clip_distances = output_register_index;
|
|
for (uint32_t i = 0; i < input_clip_distance_count; i += 4) {
|
|
dxbc::SignatureParameterForGS& osgn_sv_clip_distance =
|
|
osgn_parameters[osgn_parameter_index++];
|
|
osgn_sv_clip_distance.semantic_name_ptr =
|
|
osgn_name_ptr_sv_clip_distance;
|
|
osgn_sv_clip_distance.semantic_index = i / 4;
|
|
osgn_sv_clip_distance.system_value = dxbc::Name::kClipDistance;
|
|
osgn_sv_clip_distance.component_type =
|
|
dxbc::SignatureRegisterComponentType::kFloat32;
|
|
osgn_sv_clip_distance.register_index = output_register_index++;
|
|
uint8_t osgn_sv_clip_distance_mask =
|
|
(UINT8_C(1) << std::min(input_clip_distance_count - i,
|
|
UINT32_C(4))) -
|
|
1;
|
|
osgn_sv_clip_distance.mask = osgn_sv_clip_distance_mask;
|
|
osgn_sv_clip_distance.never_writes_mask =
|
|
osgn_sv_clip_distance_mask ^ 0b1111;
|
|
}
|
|
}
|
|
|
|
assert_true(osgn_parameter_index == osgn_parameter_count);
|
|
}
|
|
|
|
{
|
|
auto& blob_header = *reinterpret_cast<dxbc::BlobHeader*>(
|
|
shader_out.data() + blob_position_dwords);
|
|
blob_header.fourcc = dxbc::BlobHeader::FourCC::kOutputSignatureForGS;
|
|
blob_position_dwords = uint32_t(shader_out.size());
|
|
blob_header.size_bytes =
|
|
(blob_position_dwords - kBlobHeaderSizeDwords) * sizeof(uint32_t) -
|
|
shader_out[blob_offset_position_dwords++];
|
|
}
|
|
|
|
// ***************************************************************************
|
|
// Shader program
|
|
// ***************************************************************************
|
|
|
|
shader_out[blob_offset_position_dwords] =
|
|
uint32_t(blob_position_dwords * sizeof(uint32_t));
|
|
uint32_t shex_position_dwords = blob_position_dwords + kBlobHeaderSizeDwords;
|
|
shader_out.resize(shex_position_dwords);
|
|
|
|
shader_out.push_back(
|
|
dxbc::VersionToken(dxbc::ProgramType::kGeometryShader, 5, 1));
|
|
// Reserve space for the length token.
|
|
shader_out.push_back(0);
|
|
|
|
dxbc::Statistics stat;
|
|
std::memset(&stat, 0, sizeof(dxbc::Statistics));
|
|
dxbc::Assembler a(shader_out, stat);
|
|
|
|
a.OpDclGlobalFlags(dxbc::kGlobalFlagAllResourcesBound);
|
|
|
|
if (system_cbuffer_size_vector_aligned_bytes) {
|
|
a.OpDclConstantBuffer(
|
|
dxbc::Src::CB(
|
|
dxbc::Src::Dcl, 0,
|
|
uint32_t(DxbcShaderTranslator::CbufferRegister::kSystemConstants),
|
|
uint32_t(DxbcShaderTranslator::CbufferRegister::kSystemConstants)),
|
|
system_cbuffer_size_vector_aligned_bytes / (sizeof(uint32_t) * 4));
|
|
}
|
|
|
|
dxbc::Primitive input_primitive = dxbc::Primitive::kUndefined;
|
|
uint32_t input_primitive_vertex_count = 0;
|
|
dxbc::PrimitiveTopology output_primitive_topology =
|
|
dxbc::PrimitiveTopology::kUndefined;
|
|
uint32_t max_output_vertex_count = 0;
|
|
switch (key.type) {
|
|
case PipelineGeometryShader::kPointList:
|
|
// Point to a strip of 2 triangles.
|
|
input_primitive = dxbc::Primitive::kPoint;
|
|
input_primitive_vertex_count = 1;
|
|
output_primitive_topology = dxbc::PrimitiveTopology::kTriangleStrip;
|
|
max_output_vertex_count = 4;
|
|
break;
|
|
case PipelineGeometryShader::kRectangleList:
|
|
// Triangle to a strip of 2 triangles.
|
|
input_primitive = dxbc::Primitive::kTriangle;
|
|
input_primitive_vertex_count = 3;
|
|
output_primitive_topology = dxbc::PrimitiveTopology::kTriangleStrip;
|
|
max_output_vertex_count = 4;
|
|
break;
|
|
case PipelineGeometryShader::kQuadList:
|
|
// 4 vertices passed via kLineWithAdjacency to a strip of 2 triangles.
|
|
input_primitive = dxbc::Primitive::kLineWithAdjacency;
|
|
input_primitive_vertex_count = 4;
|
|
output_primitive_topology = dxbc::PrimitiveTopology::kTriangleStrip;
|
|
max_output_vertex_count = 4;
|
|
break;
|
|
default:
|
|
assert_unhandled_case(key.type);
|
|
}
|
|
|
|
assert_false(key.interpolator_count &&
|
|
input_register_interpolators == UINT32_MAX);
|
|
for (uint32_t i = 0; i < key.interpolator_count; ++i) {
|
|
a.OpDclInput(dxbc::Dest::V2D(input_primitive_vertex_count,
|
|
input_register_interpolators + i));
|
|
}
|
|
a.OpDclInputSIV(
|
|
dxbc::Dest::V2D(input_primitive_vertex_count, input_register_position),
|
|
dxbc::Name::kPosition);
|
|
// Clip and cull plane declarations are separate in FXC-generated code even
|
|
// for a single register.
|
|
assert_false(input_clip_and_cull_distance_count &&
|
|
input_register_clip_and_cull_distances == UINT32_MAX);
|
|
for (uint32_t i = 0; i < input_clip_and_cull_distance_count; i += 4) {
|
|
if (i < input_clip_distance_count) {
|
|
a.OpDclInput(
|
|
dxbc::Dest::V2D(input_primitive_vertex_count,
|
|
input_register_clip_and_cull_distances + (i >> 2),
|
|
(UINT32_C(1) << std::min(
|
|
input_clip_distance_count - i, UINT32_C(4))) -
|
|
1));
|
|
}
|
|
if (input_cull_distance_count && i + 4 > input_clip_distance_count) {
|
|
uint32_t cull_distance_mask =
|
|
(UINT32_C(1) << std::min(input_clip_and_cull_distance_count - i,
|
|
UINT32_C(4))) -
|
|
1;
|
|
if (i < input_clip_distance_count) {
|
|
cull_distance_mask &=
|
|
~((UINT32_C(1) << (input_clip_distance_count - i)) - 1);
|
|
}
|
|
a.OpDclInput(
|
|
dxbc::Dest::V2D(input_primitive_vertex_count,
|
|
input_register_clip_and_cull_distances + (i >> 2),
|
|
cull_distance_mask));
|
|
}
|
|
}
|
|
if (key.has_point_size && key.type == PipelineGeometryShader::kPointList) {
|
|
assert_true(input_register_point_size != UINT32_MAX);
|
|
a.OpDclInput(dxbc::Dest::V2D(input_primitive_vertex_count,
|
|
input_register_point_size, 0b0001));
|
|
}
|
|
|
|
// At least 1 temporary register needed to discard primitives with NaN
|
|
// position.
|
|
size_t dcl_temps_count_position_dwords = a.OpDclTemps(1);
|
|
|
|
a.OpDclInputPrimitive(input_primitive);
|
|
dxbc::Dest stream(dxbc::Dest::M(0));
|
|
a.OpDclStream(stream);
|
|
a.OpDclOutputTopology(output_primitive_topology);
|
|
|
|
assert_false(key.interpolator_count &&
|
|
output_register_interpolators == UINT32_MAX);
|
|
for (uint32_t i = 0; i < key.interpolator_count; ++i) {
|
|
a.OpDclOutput(dxbc::Dest::O(output_register_interpolators + i));
|
|
}
|
|
if (key.has_point_coordinates) {
|
|
assert_true(output_register_point_coordinates != UINT32_MAX);
|
|
a.OpDclOutput(dxbc::Dest::O(output_register_point_coordinates, 0b0011));
|
|
}
|
|
a.OpDclOutputSIV(dxbc::Dest::O(output_register_position),
|
|
dxbc::Name::kPosition);
|
|
assert_false(input_clip_distance_count &&
|
|
output_register_clip_distances == UINT32_MAX);
|
|
for (uint32_t i = 0; i < input_clip_distance_count; i += 4) {
|
|
a.OpDclOutputSIV(
|
|
dxbc::Dest::O(output_register_clip_distances + (i >> 2),
|
|
(UINT32_C(1) << std::min(input_clip_distance_count - i,
|
|
UINT32_C(4))) -
|
|
1),
|
|
dxbc::Name::kClipDistance);
|
|
}
|
|
|
|
a.OpDclMaxOutputVertexCount(max_output_vertex_count);
|
|
|
|
// Note that after every emit, all o# become uninitialized and must be written
|
|
// to again.
|
|
// Also, FXC generates only movs (from statically or dynamically indexed
|
|
// v[#][#], from r#, or from a literal) to o# for some reason.
|
|
|
|
// Discard the whole primitive if any vertex has a NaN position (may also be
|
|
// set to NaN for emulation of vertex killing with the OR operator).
|
|
for (uint32_t i = 0; i < input_primitive_vertex_count; ++i) {
|
|
a.OpNE(dxbc::Dest::R(0), dxbc::Src::V2D(i, input_register_position),
|
|
dxbc::Src::V2D(i, input_register_position));
|
|
a.OpOr(dxbc::Dest::R(0, 0b0011), dxbc::Src::R(0, 0b0100),
|
|
dxbc::Src::R(0, 0b1110));
|
|
a.OpOr(dxbc::Dest::R(0, 0b0001), dxbc::Src::R(0, dxbc::Src::kXXXX),
|
|
dxbc::Src::R(0, dxbc::Src::kYYYY));
|
|
a.OpRetC(true, dxbc::Src::R(0, dxbc::Src::kXXXX));
|
|
}
|
|
|
|
// Cull the whole primitive if any cull distance for all vertices in the
|
|
// primitive is < 0.
|
|
// TODO(Triang3l): For points, handle ps_ucp_mode (transform the host clip
|
|
// space to the guest one, calculate the distances to the user clip planes,
|
|
// cull using the distance from the center for modes 0, 1 and 2, cull and clip
|
|
// per-vertex for modes 2 and 3) - except for the vertex kill flag.
|
|
if (input_cull_distance_count) {
|
|
for (uint32_t i = 0; i < input_cull_distance_count; ++i) {
|
|
uint32_t cull_distance_register = input_register_clip_and_cull_distances +
|
|
((input_clip_distance_count + i) >> 2);
|
|
uint32_t cull_distance_component = (input_clip_distance_count + i) & 3;
|
|
a.OpLT(dxbc::Dest::R(0, 0b0001),
|
|
dxbc::Src::V2D(0, cull_distance_register)
|
|
.Select(cull_distance_component),
|
|
dxbc::Src::LF(0.0f));
|
|
for (uint32_t j = 1; j < input_primitive_vertex_count; ++j) {
|
|
a.OpLT(dxbc::Dest::R(0, 0b0010),
|
|
dxbc::Src::V2D(j, cull_distance_register)
|
|
.Select(cull_distance_component),
|
|
dxbc::Src::LF(0.0f));
|
|
a.OpAnd(dxbc::Dest::R(0, 0b0001), dxbc::Src::R(0, dxbc::Src::kXXXX),
|
|
dxbc::Src::R(0, dxbc::Src::kYYYY));
|
|
}
|
|
a.OpRetC(true, dxbc::Src::R(0, dxbc::Src::kXXXX));
|
|
}
|
|
}
|
|
|
|
switch (key.type) {
|
|
case PipelineGeometryShader::kPointList: {
|
|
// Expand the point sprite, with left-to-right, top-to-bottom UVs.
|
|
dxbc::Src point_size_src(dxbc::Src::CB(
|
|
0, uint32_t(DxbcShaderTranslator::CbufferRegister::kSystemConstants),
|
|
offsetof(DxbcShaderTranslator::SystemConstants,
|
|
point_constant_diameter) >>
|
|
4,
|
|
((offsetof(DxbcShaderTranslator::SystemConstants,
|
|
point_constant_diameter[0]) >>
|
|
2) &
|
|
3) |
|
|
(((offsetof(DxbcShaderTranslator::SystemConstants,
|
|
point_constant_diameter[1]) >>
|
|
2) &
|
|
3)
|
|
<< 2)));
|
|
if (key.has_point_size) {
|
|
// The vertex shader's header writes -1.0 to point_size by default, so
|
|
// any non-negative value means that it was overwritten by the
|
|
// translated vertex shader, and needs to be used instead of the
|
|
// constant size. The per-vertex diameter is already clamped in the
|
|
// vertex shader (combined with making it non-negative).
|
|
a.OpGE(dxbc::Dest::R(0, 0b0001),
|
|
dxbc::Src::V2D(0, input_register_point_size, dxbc::Src::kXXXX),
|
|
dxbc::Src::LF(0.0f));
|
|
a.OpMovC(dxbc::Dest::R(0, 0b0011), dxbc::Src::R(0, dxbc::Src::kXXXX),
|
|
dxbc::Src::V2D(0, input_register_point_size, dxbc::Src::kXXXX),
|
|
point_size_src);
|
|
point_size_src = dxbc::Src::R(0, 0b0100);
|
|
}
|
|
// 4D5307F1 has zero-size snowflakes, drop them quicker, and also drop
|
|
// points with a constant size of zero since point lists may also be used
|
|
// as just "compute" with memexport.
|
|
// XY may contain the point size with the per-vertex override applied, use
|
|
// Z as temporary.
|
|
for (uint32_t i = 0; i < 2; ++i) {
|
|
a.OpLT(dxbc::Dest::R(0, 0b0100), dxbc::Src::LF(0.0f),
|
|
point_size_src.SelectFromSwizzled(i));
|
|
a.OpRetC(false, dxbc::Src::R(0, dxbc::Src::kZZZZ));
|
|
}
|
|
// Transform the diameter in the guest screen coordinates to radius in the
|
|
// normalized device coordinates, and then to the clip space by
|
|
// multiplying by W.
|
|
a.OpMul(
|
|
dxbc::Dest::R(0, 0b0011), point_size_src,
|
|
dxbc::Src::CB(
|
|
0,
|
|
uint32_t(DxbcShaderTranslator::CbufferRegister::kSystemConstants),
|
|
offsetof(DxbcShaderTranslator::SystemConstants,
|
|
point_screen_diameter_to_ndc_radius) >>
|
|
4,
|
|
((offsetof(DxbcShaderTranslator::SystemConstants,
|
|
point_screen_diameter_to_ndc_radius[0]) >>
|
|
2) &
|
|
3) |
|
|
(((offsetof(DxbcShaderTranslator::SystemConstants,
|
|
point_screen_diameter_to_ndc_radius[1]) >>
|
|
2) &
|
|
3)
|
|
<< 2)));
|
|
point_size_src = dxbc::Src::R(0, 0b0100);
|
|
a.OpMul(dxbc::Dest::R(0, 0b0011), point_size_src,
|
|
dxbc::Src::V2D(0, input_register_position, dxbc::Src::kWWWW));
|
|
dxbc::Src point_radius_x_src(point_size_src.SelectFromSwizzled(0));
|
|
dxbc::Src point_radius_y_src(point_size_src.SelectFromSwizzled(1));
|
|
|
|
for (uint32_t i = 0; i < 4; ++i) {
|
|
// Same interpolators for the entire sprite.
|
|
for (uint32_t j = 0; j < key.interpolator_count; ++j) {
|
|
a.OpMov(dxbc::Dest::O(output_register_interpolators + j),
|
|
dxbc::Src::V2D(0, input_register_interpolators + j));
|
|
}
|
|
// Top-left, top-right, bottom-left, bottom-right order (chosen
|
|
// arbitrarily, simply based on clockwise meaning front with
|
|
// FrontCounterClockwise = FALSE, but faceness is ignored for
|
|
// non-polygon primitive types).
|
|
// Bottom is -Y in Direct3D NDC, +V in point sprite coordinates.
|
|
if (key.has_point_coordinates) {
|
|
a.OpMov(dxbc::Dest::O(output_register_point_coordinates, 0b0011),
|
|
dxbc::Src::LF(float(i & 1), float(i >> 1), 0.0f, 0.0f));
|
|
}
|
|
// FXC generates only `mov`s for o#, use temporary registers (r0.zw, as
|
|
// r0.xy already used for the point size) for calculations.
|
|
a.OpAdd(dxbc::Dest::R(0, 0b0100),
|
|
dxbc::Src::V2D(0, input_register_position, dxbc::Src::kXXXX),
|
|
(i & 1) ? point_radius_x_src : -point_radius_x_src);
|
|
a.OpAdd(dxbc::Dest::R(0, 0b1000),
|
|
dxbc::Src::V2D(0, input_register_position, dxbc::Src::kYYYY),
|
|
(i >> 1) ? -point_radius_y_src : point_radius_y_src);
|
|
a.OpMov(dxbc::Dest::O(output_register_position, 0b0011),
|
|
dxbc::Src::R(0, 0b1110));
|
|
a.OpMov(dxbc::Dest::O(output_register_position, 0b1100),
|
|
dxbc::Src::V2D(0, input_register_position));
|
|
// TODO(Triang3l): Handle ps_ucp_mode properly, clip expanded points if
|
|
// needed.
|
|
for (uint32_t j = 0; j < input_clip_distance_count; j += 4) {
|
|
a.OpMov(
|
|
dxbc::Dest::O(output_register_clip_distances + (j >> 2),
|
|
(UINT32_C(1) << std::min(
|
|
input_clip_distance_count - j, UINT32_C(4))) -
|
|
1),
|
|
dxbc::Src::V2D(
|
|
0, input_register_clip_and_cull_distances + (j >> 2)));
|
|
}
|
|
if (i < 3) {
|
|
a.OpEmitStream(stream);
|
|
}
|
|
}
|
|
a.OpEmitThenCutStream(stream);
|
|
} break;
|
|
|
|
case PipelineGeometryShader::kRectangleList: {
|
|
// Construct a strip with the fourth vertex generated by mirroring a
|
|
// vertex across the longest edge (the diagonal).
|
|
//
|
|
// Possible options:
|
|
//
|
|
// 0---1
|
|
// | /|
|
|
// | / | - 12 is the longest edge, strip 0123 (most commonly used)
|
|
// |/ | v3 = v0 + (v1 - v0) + (v2 - v0), or v3 = -v0 + v1 + v2
|
|
// 2--[3]
|
|
//
|
|
// 1---2
|
|
// | /|
|
|
// | / | - 20 is the longest edge, strip 1203
|
|
// |/ |
|
|
// 0--[3]
|
|
//
|
|
// 2---0
|
|
// | /|
|
|
// | / | - 01 is the longest edge, strip 2013
|
|
// |/ |
|
|
// 1--[3]
|
|
//
|
|
// Input vertices are implicitly indexable, dcl_indexRange is not needed
|
|
// for the first dimension of a v[#][#] index.
|
|
|
|
// Get squares of edge lengths into r0.xyz to choose the longest edge.
|
|
// r0.x = ||12||^2
|
|
a.OpAdd(dxbc::Dest::R(0, 0b0011),
|
|
dxbc::Src::V2D(2, input_register_position, 0b0100),
|
|
-dxbc::Src::V2D(1, input_register_position, 0b0100));
|
|
a.OpDP2(dxbc::Dest::R(0, 0b0001), dxbc::Src::R(0, 0b0100),
|
|
dxbc::Src::R(0, 0b0100));
|
|
// r0.y = ||20||^2
|
|
a.OpAdd(dxbc::Dest::R(0, 0b0110),
|
|
dxbc::Src::V2D(0, input_register_position, 0b0100 << 2),
|
|
-dxbc::Src::V2D(2, input_register_position, 0b0100 << 2));
|
|
a.OpDP2(dxbc::Dest::R(0, 0b0010), dxbc::Src::R(0, 0b1001),
|
|
dxbc::Src::R(0, 0b1001));
|
|
// r0.z = ||01||^2
|
|
a.OpAdd(dxbc::Dest::R(0, 0b1100),
|
|
dxbc::Src::V2D(1, input_register_position, 0b0100 << 4),
|
|
-dxbc::Src::V2D(0, input_register_position, 0b0100 << 4));
|
|
a.OpDP2(dxbc::Dest::R(0, 0b0100), dxbc::Src::R(0, 0b1110),
|
|
dxbc::Src::R(0, 0b1110));
|
|
|
|
// Find the longest edge, and select the strip vertex indices into r0.xyz.
|
|
// r0.w = 12 > 20
|
|
a.OpLT(dxbc::Dest::R(0, 0b1000), dxbc::Src::R(0, dxbc::Src::kYYYY),
|
|
dxbc::Src::R(0, dxbc::Src::kXXXX));
|
|
// r0.x = 12 > 01
|
|
a.OpLT(dxbc::Dest::R(0, 0b0001), dxbc::Src::R(0, dxbc::Src::kZZZZ),
|
|
dxbc::Src::R(0, dxbc::Src::kXXXX));
|
|
// r0.x = 12 > 20 && 12 > 01
|
|
a.OpAnd(dxbc::Dest::R(0, 0b0001), dxbc::Src::R(0, dxbc::Src::kWWWW),
|
|
dxbc::Src::R(0, dxbc::Src::kXXXX));
|
|
a.OpIf(true, dxbc::Src::R(0, dxbc::Src::kXXXX));
|
|
{
|
|
// 12 is the longest edge, the first triangle in the strip is 012.
|
|
a.OpMov(dxbc::Dest::R(0, 0b0111), dxbc::Src::LU(0, 1, 2, 0));
|
|
}
|
|
a.OpElse();
|
|
{
|
|
// r0.x = 20 > 01
|
|
a.OpLT(dxbc::Dest::R(0, 0b0001), dxbc::Src::R(0, dxbc::Src::kZZZZ),
|
|
dxbc::Src::R(0, dxbc::Src::kYYYY));
|
|
// If 20 is the longest edge, the first triangle in the strip is 120.
|
|
// Otherwise, it's 201.
|
|
a.OpMovC(dxbc::Dest::R(0, 0b0111), dxbc::Src::R(0, dxbc::Src::kXXXX),
|
|
dxbc::Src::LU(1, 2, 0, 0), dxbc::Src::LU(2, 0, 1, 0));
|
|
}
|
|
a.OpEndIf();
|
|
|
|
// Emit the triangle in the strip that consists of the original vertices.
|
|
for (uint32_t i = 0; i < 3; ++i) {
|
|
dxbc::Index input_vertex_index(0, i);
|
|
for (uint32_t j = 0; j < key.interpolator_count; ++j) {
|
|
a.OpMov(dxbc::Dest::O(output_register_interpolators + j),
|
|
dxbc::Src::V2D(input_vertex_index,
|
|
input_register_interpolators + j));
|
|
}
|
|
if (key.has_point_coordinates) {
|
|
a.OpMov(dxbc::Dest::O(output_register_point_coordinates, 0b0011),
|
|
dxbc::Src::LF(0.0f));
|
|
}
|
|
a.OpMov(dxbc::Dest::O(output_register_position),
|
|
dxbc::Src::V2D(input_vertex_index, input_register_position));
|
|
for (uint32_t j = 0; j < input_clip_distance_count; j += 4) {
|
|
a.OpMov(
|
|
dxbc::Dest::O(output_register_clip_distances + (j >> 2),
|
|
(UINT32_C(1) << std::min(
|
|
input_clip_distance_count - j, UINT32_C(4))) -
|
|
1),
|
|
dxbc::Src::V2D(
|
|
input_vertex_index,
|
|
input_register_clip_and_cull_distances + (j >> 2)));
|
|
}
|
|
a.OpEmitStream(stream);
|
|
}
|
|
|
|
// Construct the fourth vertex using r1 as temporary storage, including
|
|
// for the final operation as FXC generates only `mov`s for o#.
|
|
stat.temp_register_count =
|
|
std::max(UINT32_C(2), stat.temp_register_count);
|
|
for (uint32_t j = 0; j < key.interpolator_count; ++j) {
|
|
uint32_t input_register_interpolator = input_register_interpolators + j;
|
|
a.OpAdd(dxbc::Dest::R(1),
|
|
-dxbc::Src::V2D(dxbc::Index(0, 0), input_register_interpolator),
|
|
dxbc::Src::V2D(dxbc::Index(0, 1), input_register_interpolator));
|
|
a.OpAdd(dxbc::Dest::R(1), dxbc::Src::R(1),
|
|
dxbc::Src::V2D(dxbc::Index(0, 2), input_register_interpolator));
|
|
a.OpMov(dxbc::Dest::O(output_register_interpolators + j),
|
|
dxbc::Src::R(1));
|
|
}
|
|
if (key.has_point_coordinates) {
|
|
a.OpMov(dxbc::Dest::O(output_register_point_coordinates, 0b0011),
|
|
dxbc::Src::LF(0.0f));
|
|
}
|
|
a.OpAdd(dxbc::Dest::R(1),
|
|
-dxbc::Src::V2D(dxbc::Index(0, 0), input_register_position),
|
|
dxbc::Src::V2D(dxbc::Index(0, 1), input_register_position));
|
|
a.OpAdd(dxbc::Dest::R(1), dxbc::Src::R(1),
|
|
dxbc::Src::V2D(dxbc::Index(0, 2), input_register_position));
|
|
a.OpMov(dxbc::Dest::O(output_register_position), dxbc::Src::R(1));
|
|
for (uint32_t j = 0; j < input_clip_distance_count; j += 4) {
|
|
uint32_t clip_distance_mask =
|
|
(UINT32_C(1) << std::min(input_clip_distance_count - j,
|
|
UINT32_C(4))) -
|
|
1;
|
|
uint32_t input_register_clip_distance =
|
|
input_register_clip_and_cull_distances + (j >> 2);
|
|
a.OpAdd(
|
|
dxbc::Dest::R(1, clip_distance_mask),
|
|
-dxbc::Src::V2D(dxbc::Index(0, 0), input_register_clip_distance),
|
|
dxbc::Src::V2D(dxbc::Index(0, 1), input_register_clip_distance));
|
|
a.OpAdd(
|
|
dxbc::Dest::R(1, clip_distance_mask), dxbc::Src::R(1),
|
|
dxbc::Src::V2D(dxbc::Index(0, 2), input_register_clip_distance));
|
|
a.OpMov(dxbc::Dest::O(output_register_clip_distances + (j >> 2),
|
|
clip_distance_mask),
|
|
dxbc::Src::R(1));
|
|
}
|
|
a.OpEmitThenCutStream(stream);
|
|
} break;
|
|
|
|
case PipelineGeometryShader::kQuadList: {
|
|
// Build the triangle strip from the original quad vertices in the
|
|
// 0, 1, 3, 2 order (like specified for GL_QUAD_STRIP).
|
|
// TODO(Triang3l): Find the correct decomposition of quads into triangles
|
|
// on the real hardware.
|
|
for (uint32_t i = 0; i < 4; ++i) {
|
|
uint32_t input_vertex_index = i ^ (i >> 1);
|
|
for (uint32_t j = 0; j < key.interpolator_count; ++j) {
|
|
a.OpMov(dxbc::Dest::O(output_register_interpolators + j),
|
|
dxbc::Src::V2D(input_vertex_index,
|
|
input_register_interpolators + j));
|
|
}
|
|
if (key.has_point_coordinates) {
|
|
a.OpMov(dxbc::Dest::O(output_register_point_coordinates, 0b0011),
|
|
dxbc::Src::LF(0.0f));
|
|
}
|
|
a.OpMov(dxbc::Dest::O(output_register_position),
|
|
dxbc::Src::V2D(input_vertex_index, input_register_position));
|
|
for (uint32_t j = 0; j < input_clip_distance_count; j += 4) {
|
|
a.OpMov(
|
|
dxbc::Dest::O(output_register_clip_distances + (j >> 2),
|
|
(UINT32_C(1) << std::min(
|
|
input_clip_distance_count - j, UINT32_C(4))) -
|
|
1),
|
|
dxbc::Src::V2D(
|
|
input_vertex_index,
|
|
input_register_clip_and_cull_distances + (j >> 2)));
|
|
}
|
|
if (i < 3) {
|
|
a.OpEmitStream(stream);
|
|
}
|
|
}
|
|
a.OpEmitThenCutStream(stream);
|
|
} break;
|
|
|
|
default:
|
|
assert_unhandled_case(key.type);
|
|
}
|
|
|
|
a.OpRet();
|
|
|
|
// Write the actual number of temporary registers used.
|
|
shader_out[dcl_temps_count_position_dwords] = stat.temp_register_count;
|
|
|
|
// Write the shader program length in dwords.
|
|
shader_out[shex_position_dwords + 1] =
|
|
uint32_t(shader_out.size()) - shex_position_dwords;
|
|
|
|
{
|
|
auto& blob_header = *reinterpret_cast<dxbc::BlobHeader*>(
|
|
shader_out.data() + blob_position_dwords);
|
|
blob_header.fourcc = dxbc::BlobHeader::FourCC::kShaderEx;
|
|
blob_position_dwords = uint32_t(shader_out.size());
|
|
blob_header.size_bytes =
|
|
(blob_position_dwords - kBlobHeaderSizeDwords) * sizeof(uint32_t) -
|
|
shader_out[blob_offset_position_dwords++];
|
|
}
|
|
|
|
// ***************************************************************************
|
|
// Statistics
|
|
// ***************************************************************************
|
|
|
|
shader_out[blob_offset_position_dwords] =
|
|
uint32_t(blob_position_dwords * sizeof(uint32_t));
|
|
uint32_t stat_position_dwords = blob_position_dwords + kBlobHeaderSizeDwords;
|
|
shader_out.resize(stat_position_dwords +
|
|
sizeof(dxbc::Statistics) / sizeof(uint32_t));
|
|
std::memcpy(shader_out.data() + stat_position_dwords, &stat,
|
|
sizeof(dxbc::Statistics));
|
|
|
|
{
|
|
auto& blob_header = *reinterpret_cast<dxbc::BlobHeader*>(
|
|
shader_out.data() + blob_position_dwords);
|
|
blob_header.fourcc = dxbc::BlobHeader::FourCC::kStatistics;
|
|
blob_position_dwords = uint32_t(shader_out.size());
|
|
blob_header.size_bytes =
|
|
(blob_position_dwords - kBlobHeaderSizeDwords) * sizeof(uint32_t) -
|
|
shader_out[blob_offset_position_dwords++];
|
|
}
|
|
|
|
// ***************************************************************************
|
|
// Container header
|
|
// ***************************************************************************
|
|
|
|
uint32_t shader_size_bytes = uint32_t(shader_out.size() * sizeof(uint32_t));
|
|
{
|
|
auto& container_header =
|
|
*reinterpret_cast<dxbc::ContainerHeader*>(shader_out.data());
|
|
container_header.InitializeIdentification();
|
|
container_header.size_bytes = shader_size_bytes;
|
|
container_header.blob_count = kBlobCount;
|
|
CalculateDXBCChecksum(
|
|
reinterpret_cast<unsigned char*>(shader_out.data()),
|
|
static_cast<unsigned int>(shader_size_bytes),
|
|
reinterpret_cast<unsigned int*>(&container_header.hash));
|
|
}
|
|
}
|
|
|
|
// Returns the DXBC container (as dwords) of the geometry shader matching the
// key, generating it and storing it in geometry_shaders_ on the first request
// for that key. The returned reference points to the vector stored in
// geometry_shaders_.
const std::vector<uint32_t>& PipelineCache::GetGeometryShader(
    GeometryShaderKey key) {
  auto cached_shader = geometry_shaders_.find(key);
  if (cached_shader == geometry_shaders_.end()) {
    // Not generated yet - build the bytecode and move it into the cache.
    std::vector<uint32_t> generated_dxbc;
    CreateDxbcGeometryShader(key, generated_dxbc);
    cached_shader =
        geometry_shaders_.emplace(key, std::move(generated_dxbc)).first;
  }
  return cached_shader->second;
}
|
|
|
|
ID3D12PipelineState* PipelineCache::CreateD3D12Pipeline(
|
|
const PipelineRuntimeDescription& runtime_description) {
|
|
const PipelineDescription& description = runtime_description.description;
|
|
|
|
if (runtime_description.pixel_shader != nullptr) {
|
|
XELOGGPU("Creating graphics pipeline with VS {:016X}, PS {:016X}",
|
|
runtime_description.vertex_shader->shader().ucode_data_hash(),
|
|
runtime_description.pixel_shader->shader().ucode_data_hash());
|
|
} else {
|
|
XELOGGPU("Creating graphics pipeline with VS {:016X}",
|
|
runtime_description.vertex_shader->shader().ucode_data_hash());
|
|
}
|
|
|
|
D3D12_GRAPHICS_PIPELINE_STATE_DESC state_desc;
|
|
std::memset(&state_desc, 0, sizeof(state_desc));
|
|
|
|
bool edram_rov_used = render_target_cache_.GetPath() ==
|
|
RenderTargetCache::Path::kPixelShaderInterlock;
|
|
|
|
// Root signature.
|
|
state_desc.pRootSignature = runtime_description.root_signature;
|
|
|
|
// Index buffer strip cut value.
|
|
switch (description.strip_cut_index) {
|
|
case PipelineStripCutIndex::kFFFF:
|
|
state_desc.IBStripCutValue = D3D12_INDEX_BUFFER_STRIP_CUT_VALUE_0xFFFF;
|
|
break;
|
|
case PipelineStripCutIndex::kFFFFFFFF:
|
|
state_desc.IBStripCutValue =
|
|
D3D12_INDEX_BUFFER_STRIP_CUT_VALUE_0xFFFFFFFF;
|
|
break;
|
|
default:
|
|
state_desc.IBStripCutValue = D3D12_INDEX_BUFFER_STRIP_CUT_VALUE_DISABLED;
|
|
break;
|
|
}
|
|
|
|
// Primitive topology, vertex, hull, domain and geometry shaders.
|
|
if (!runtime_description.vertex_shader->is_translated()) {
|
|
XELOGE("Vertex shader {:016X} not translated",
|
|
runtime_description.vertex_shader->shader().ucode_data_hash());
|
|
assert_always();
|
|
return nullptr;
|
|
}
|
|
Shader::HostVertexShaderType host_vertex_shader_type =
|
|
DxbcShaderTranslator::Modification(
|
|
runtime_description.vertex_shader->modification())
|
|
.vertex.host_vertex_shader_type;
|
|
if (Shader::IsHostVertexShaderTypeDomain(host_vertex_shader_type)) {
|
|
state_desc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_PATCH;
|
|
xenos::TessellationMode tessellation_mode = xenos::TessellationMode(
|
|
description.primitive_topology_type_or_tessellation_mode);
|
|
if (tessellation_mode == xenos::TessellationMode::kAdaptive) {
|
|
state_desc.VS.pShaderBytecode = shaders::tessellation_adaptive_vs;
|
|
state_desc.VS.BytecodeLength = sizeof(shaders::tessellation_adaptive_vs);
|
|
} else {
|
|
state_desc.VS.pShaderBytecode = shaders::tessellation_indexed_vs;
|
|
state_desc.VS.BytecodeLength = sizeof(shaders::tessellation_indexed_vs);
|
|
}
|
|
switch (tessellation_mode) {
|
|
case xenos::TessellationMode::kDiscrete:
|
|
switch (host_vertex_shader_type) {
|
|
case Shader::HostVertexShaderType::kTriangleDomainCPIndexed:
|
|
state_desc.HS.pShaderBytecode = shaders::discrete_triangle_3cp_hs;
|
|
state_desc.HS.BytecodeLength =
|
|
sizeof(shaders::discrete_triangle_3cp_hs);
|
|
break;
|
|
case Shader::HostVertexShaderType::kTriangleDomainPatchIndexed:
|
|
state_desc.HS.pShaderBytecode = shaders::discrete_triangle_1cp_hs;
|
|
state_desc.HS.BytecodeLength =
|
|
sizeof(shaders::discrete_triangle_1cp_hs);
|
|
break;
|
|
case Shader::HostVertexShaderType::kQuadDomainCPIndexed:
|
|
state_desc.HS.pShaderBytecode = shaders::discrete_quad_4cp_hs;
|
|
state_desc.HS.BytecodeLength =
|
|
sizeof(shaders::discrete_quad_4cp_hs);
|
|
break;
|
|
case Shader::HostVertexShaderType::kQuadDomainPatchIndexed:
|
|
state_desc.HS.pShaderBytecode = shaders::discrete_quad_1cp_hs;
|
|
state_desc.HS.BytecodeLength =
|
|
sizeof(shaders::discrete_quad_1cp_hs);
|
|
break;
|
|
default:
|
|
assert_unhandled_case(host_vertex_shader_type);
|
|
return nullptr;
|
|
}
|
|
break;
|
|
case xenos::TessellationMode::kContinuous:
|
|
switch (host_vertex_shader_type) {
|
|
case Shader::HostVertexShaderType::kTriangleDomainCPIndexed:
|
|
state_desc.HS.pShaderBytecode = shaders::continuous_triangle_3cp_hs;
|
|
state_desc.HS.BytecodeLength =
|
|
sizeof(shaders::continuous_triangle_3cp_hs);
|
|
break;
|
|
case Shader::HostVertexShaderType::kTriangleDomainPatchIndexed:
|
|
state_desc.HS.pShaderBytecode = shaders::continuous_triangle_1cp_hs;
|
|
state_desc.HS.BytecodeLength =
|
|
sizeof(shaders::continuous_triangle_1cp_hs);
|
|
break;
|
|
case Shader::HostVertexShaderType::kQuadDomainCPIndexed:
|
|
state_desc.HS.pShaderBytecode = shaders::continuous_quad_4cp_hs;
|
|
state_desc.HS.BytecodeLength =
|
|
sizeof(shaders::continuous_quad_4cp_hs);
|
|
break;
|
|
case Shader::HostVertexShaderType::kQuadDomainPatchIndexed:
|
|
state_desc.HS.pShaderBytecode = shaders::continuous_quad_1cp_hs;
|
|
state_desc.HS.BytecodeLength =
|
|
sizeof(shaders::continuous_quad_1cp_hs);
|
|
break;
|
|
default:
|
|
assert_unhandled_case(host_vertex_shader_type);
|
|
return nullptr;
|
|
}
|
|
break;
|
|
case xenos::TessellationMode::kAdaptive:
|
|
switch (host_vertex_shader_type) {
|
|
case Shader::HostVertexShaderType::kTriangleDomainPatchIndexed:
|
|
state_desc.HS.pShaderBytecode = shaders::adaptive_triangle_hs;
|
|
state_desc.HS.BytecodeLength =
|
|
sizeof(shaders::adaptive_triangle_hs);
|
|
break;
|
|
case Shader::HostVertexShaderType::kQuadDomainPatchIndexed:
|
|
state_desc.HS.pShaderBytecode = shaders::adaptive_quad_hs;
|
|
state_desc.HS.BytecodeLength = sizeof(shaders::adaptive_quad_hs);
|
|
break;
|
|
default:
|
|
assert_unhandled_case(host_vertex_shader_type);
|
|
return nullptr;
|
|
}
|
|
break;
|
|
default:
|
|
assert_unhandled_case(tessellation_mode);
|
|
return nullptr;
|
|
}
|
|
state_desc.DS.pShaderBytecode =
|
|
runtime_description.vertex_shader->translated_binary().data();
|
|
state_desc.DS.BytecodeLength =
|
|
runtime_description.vertex_shader->translated_binary().size();
|
|
} else {
|
|
assert_true(host_vertex_shader_type ==
|
|
Shader::HostVertexShaderType::kVertex);
|
|
if (host_vertex_shader_type != Shader::HostVertexShaderType::kVertex) {
|
|
// Fallback vertex shaders are not needed on Direct3D 12.
|
|
return nullptr;
|
|
}
|
|
state_desc.VS.pShaderBytecode =
|
|
runtime_description.vertex_shader->translated_binary().data();
|
|
state_desc.VS.BytecodeLength =
|
|
runtime_description.vertex_shader->translated_binary().size();
|
|
PipelinePrimitiveTopologyType primitive_topology_type =
|
|
PipelinePrimitiveTopologyType(
|
|
description.primitive_topology_type_or_tessellation_mode);
|
|
switch (primitive_topology_type) {
|
|
case PipelinePrimitiveTopologyType::kPoint:
|
|
state_desc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_POINT;
|
|
break;
|
|
case PipelinePrimitiveTopologyType::kLine:
|
|
state_desc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_LINE;
|
|
break;
|
|
case PipelinePrimitiveTopologyType::kTriangle:
|
|
state_desc.PrimitiveTopologyType =
|
|
D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE;
|
|
break;
|
|
default:
|
|
assert_unhandled_case(primitive_topology_type);
|
|
return nullptr;
|
|
}
|
|
}
|
|
|
|
// Pixel shader.
|
|
if (runtime_description.pixel_shader != nullptr) {
|
|
if (!runtime_description.pixel_shader->is_translated()) {
|
|
XELOGE("Pixel shader {:016X} not translated",
|
|
runtime_description.pixel_shader->shader().ucode_data_hash());
|
|
assert_always();
|
|
return nullptr;
|
|
}
|
|
state_desc.PS.pShaderBytecode =
|
|
runtime_description.pixel_shader->translated_binary().data();
|
|
state_desc.PS.BytecodeLength =
|
|
runtime_description.pixel_shader->translated_binary().size();
|
|
} else if (edram_rov_used) {
|
|
state_desc.PS.pShaderBytecode = depth_only_pixel_shader_.data();
|
|
state_desc.PS.BytecodeLength = depth_only_pixel_shader_.size();
|
|
} else {
|
|
if (render_target_cache_.depth_float24_convert_in_pixel_shader() &&
|
|
(description.depth_func != xenos::CompareFunction::kAlways ||
|
|
description.depth_write) &&
|
|
description.depth_format == xenos::DepthRenderTargetFormat::kD24FS8) {
|
|
if (render_target_cache_.depth_float24_round()) {
|
|
state_desc.PS.pShaderBytecode = shaders::float24_round_ps;
|
|
state_desc.PS.BytecodeLength = sizeof(shaders::float24_round_ps);
|
|
} else {
|
|
state_desc.PS.pShaderBytecode = shaders::float24_truncate_ps;
|
|
state_desc.PS.BytecodeLength = sizeof(shaders::float24_truncate_ps);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Geometry shader.
|
|
if (runtime_description.geometry_shader != nullptr) {
|
|
state_desc.GS.pShaderBytecode = runtime_description.geometry_shader->data();
|
|
state_desc.GS.BytecodeLength =
|
|
sizeof(*runtime_description.geometry_shader->data()) *
|
|
runtime_description.geometry_shader->size();
|
|
}
|
|
|
|
// Rasterizer state.
|
|
state_desc.RasterizerState.FillMode = description.fill_mode_wireframe
|
|
? D3D12_FILL_MODE_WIREFRAME
|
|
: D3D12_FILL_MODE_SOLID;
|
|
switch (description.cull_mode) {
|
|
case PipelineCullMode::kFront:
|
|
state_desc.RasterizerState.CullMode = D3D12_CULL_MODE_FRONT;
|
|
break;
|
|
case PipelineCullMode::kBack:
|
|
state_desc.RasterizerState.CullMode = D3D12_CULL_MODE_BACK;
|
|
break;
|
|
default:
|
|
assert_true(description.cull_mode == PipelineCullMode::kNone ||
|
|
description.cull_mode ==
|
|
PipelineCullMode::kDisableRasterization);
|
|
state_desc.RasterizerState.CullMode = D3D12_CULL_MODE_NONE;
|
|
break;
|
|
}
|
|
state_desc.RasterizerState.FrontCounterClockwise =
|
|
description.front_counter_clockwise ? TRUE : FALSE;
|
|
state_desc.RasterizerState.DepthBias = description.depth_bias;
|
|
state_desc.RasterizerState.DepthBiasClamp = 0.0f;
|
|
// With non-square resolution scaling, make sure the worst-case impact is
|
|
// reverted (slope only along the scaled axis), thus max. More bias is better
|
|
// than less bias, because less bias means Z fighting with the background is
|
|
// more likely.
|
|
state_desc.RasterizerState.SlopeScaledDepthBias =
|
|
description.depth_bias_slope_scaled *
|
|
float(std::max(render_target_cache_.draw_resolution_scale_x(),
|
|
render_target_cache_.draw_resolution_scale_y()));
|
|
state_desc.RasterizerState.DepthClipEnable =
|
|
description.depth_clip ? TRUE : FALSE;
|
|
uint32_t msaa_sample_count = uint32_t(1)
|
|
<< uint32_t(description.host_msaa_samples);
|
|
if (edram_rov_used) {
|
|
// Only 1, 4, 8 and (not on all GPUs) 16 are allowed, using sample 0 as 0
|
|
// and 3 as 1 for 2x instead (not exactly the same sample positions, but
|
|
// still top-left and bottom-right - however, this can be adjusted with
|
|
// programmable sample positions).
|
|
assert_true(msaa_sample_count == 1 || msaa_sample_count == 4);
|
|
if (msaa_sample_count != 1 && msaa_sample_count != 4) {
|
|
return nullptr;
|
|
}
|
|
state_desc.RasterizerState.ForcedSampleCount =
|
|
uint32_t(1) << uint32_t(description.host_msaa_samples);
|
|
}
|
|
|
|
// Sample mask and description.
|
|
state_desc.SampleMask = UINT_MAX;
|
|
// TODO(Triang3l): 4x MSAA fallback when 2x isn't supported without ROV.
|
|
if (edram_rov_used) {
|
|
state_desc.SampleDesc.Count = 1;
|
|
} else {
|
|
assert_true(msaa_sample_count <= 4);
|
|
if (msaa_sample_count > 4) {
|
|
return nullptr;
|
|
}
|
|
if (msaa_sample_count == 2 && !render_target_cache_.msaa_2x_supported()) {
|
|
// Using sample 0 as 0 and 3 as 1 for 2x instead (not exactly the same
|
|
// sample positions, but still top-left and bottom-right - however, this
|
|
// can be adjusted with programmable sample positions).
|
|
state_desc.SampleMask = 0b1001;
|
|
state_desc.SampleDesc.Count = 4;
|
|
} else {
|
|
state_desc.SampleDesc.Count = msaa_sample_count;
|
|
}
|
|
}
|
|
|
|
if (!edram_rov_used) {
|
|
// Depth/stencil.
|
|
if (description.depth_func != xenos::CompareFunction::kAlways ||
|
|
description.depth_write) {
|
|
state_desc.DepthStencilState.DepthEnable = TRUE;
|
|
state_desc.DepthStencilState.DepthWriteMask =
|
|
description.depth_write ? D3D12_DEPTH_WRITE_MASK_ALL
|
|
: D3D12_DEPTH_WRITE_MASK_ZERO;
|
|
// Comparison functions are the same in Direct3D 12 but plus one (minus
|
|
// one, bit 0 for less, bit 1 for equal, bit 2 for greater).
|
|
state_desc.DepthStencilState.DepthFunc =
|
|
D3D12_COMPARISON_FUNC(uint32_t(D3D12_COMPARISON_FUNC_NEVER) +
|
|
uint32_t(description.depth_func));
|
|
}
|
|
if (description.stencil_enable) {
|
|
state_desc.DepthStencilState.StencilEnable = TRUE;
|
|
state_desc.DepthStencilState.StencilReadMask =
|
|
description.stencil_read_mask;
|
|
state_desc.DepthStencilState.StencilWriteMask =
|
|
description.stencil_write_mask;
|
|
// Stencil operations are the same in Direct3D 12 too but plus one.
|
|
state_desc.DepthStencilState.FrontFace.StencilFailOp =
|
|
D3D12_STENCIL_OP(uint32_t(D3D12_STENCIL_OP_KEEP) +
|
|
uint32_t(description.stencil_front_fail_op));
|
|
state_desc.DepthStencilState.FrontFace.StencilDepthFailOp =
|
|
D3D12_STENCIL_OP(uint32_t(D3D12_STENCIL_OP_KEEP) +
|
|
uint32_t(description.stencil_front_depth_fail_op));
|
|
state_desc.DepthStencilState.FrontFace.StencilPassOp =
|
|
D3D12_STENCIL_OP(uint32_t(D3D12_STENCIL_OP_KEEP) +
|
|
uint32_t(description.stencil_front_pass_op));
|
|
state_desc.DepthStencilState.FrontFace.StencilFunc =
|
|
D3D12_COMPARISON_FUNC(uint32_t(D3D12_COMPARISON_FUNC_NEVER) +
|
|
uint32_t(description.stencil_front_func));
|
|
state_desc.DepthStencilState.BackFace.StencilFailOp =
|
|
D3D12_STENCIL_OP(uint32_t(D3D12_STENCIL_OP_KEEP) +
|
|
uint32_t(description.stencil_back_fail_op));
|
|
state_desc.DepthStencilState.BackFace.StencilDepthFailOp =
|
|
D3D12_STENCIL_OP(uint32_t(D3D12_STENCIL_OP_KEEP) +
|
|
uint32_t(description.stencil_back_depth_fail_op));
|
|
state_desc.DepthStencilState.BackFace.StencilPassOp =
|
|
D3D12_STENCIL_OP(uint32_t(D3D12_STENCIL_OP_KEEP) +
|
|
uint32_t(description.stencil_back_pass_op));
|
|
state_desc.DepthStencilState.BackFace.StencilFunc =
|
|
D3D12_COMPARISON_FUNC(uint32_t(D3D12_COMPARISON_FUNC_NEVER) +
|
|
uint32_t(description.stencil_back_func));
|
|
}
|
|
if (state_desc.DepthStencilState.DepthEnable ||
|
|
state_desc.DepthStencilState.StencilEnable) {
|
|
state_desc.DSVFormat = D3D12RenderTargetCache::GetDepthDSVDXGIFormat(
|
|
description.depth_format);
|
|
}
|
|
|
|
// Render targets and blending.
|
|
state_desc.BlendState.IndependentBlendEnable = TRUE;
|
|
static const D3D12_BLEND kBlendFactorMap[] = {
|
|
D3D12_BLEND_ZERO, D3D12_BLEND_ONE,
|
|
D3D12_BLEND_SRC_COLOR, D3D12_BLEND_INV_SRC_COLOR,
|
|
D3D12_BLEND_SRC_ALPHA, D3D12_BLEND_INV_SRC_ALPHA,
|
|
D3D12_BLEND_DEST_COLOR, D3D12_BLEND_INV_DEST_COLOR,
|
|
D3D12_BLEND_DEST_ALPHA, D3D12_BLEND_INV_DEST_ALPHA,
|
|
D3D12_BLEND_BLEND_FACTOR, D3D12_BLEND_INV_BLEND_FACTOR,
|
|
D3D12_BLEND_SRC_ALPHA_SAT,
|
|
};
|
|
// 8 entries for safety since 3 bits from the guest are passed directly.
|
|
static const D3D12_BLEND_OP kBlendOpMap[] = {
|
|
D3D12_BLEND_OP_ADD, D3D12_BLEND_OP_SUBTRACT, D3D12_BLEND_OP_MIN,
|
|
D3D12_BLEND_OP_MAX, D3D12_BLEND_OP_REV_SUBTRACT, D3D12_BLEND_OP_ADD,
|
|
D3D12_BLEND_OP_ADD, D3D12_BLEND_OP_ADD};
|
|
for (uint32_t i = 0; i < xenos::kMaxColorRenderTargets; ++i) {
|
|
const PipelineRenderTarget& rt = description.render_targets[i];
|
|
if (!rt.used) {
|
|
// Null RTV descriptors can be used for slots with DXGI_FORMAT_UNKNOWN
|
|
// in the pipeline state.
|
|
state_desc.RTVFormats[i] = DXGI_FORMAT_UNKNOWN;
|
|
continue;
|
|
}
|
|
state_desc.NumRenderTargets = i + 1;
|
|
state_desc.RTVFormats[i] =
|
|
render_target_cache_.GetColorDrawDXGIFormat(rt.format);
|
|
if (state_desc.RTVFormats[i] == DXGI_FORMAT_UNKNOWN) {
|
|
assert_always();
|
|
return nullptr;
|
|
}
|
|
D3D12_RENDER_TARGET_BLEND_DESC& blend_desc =
|
|
state_desc.BlendState.RenderTarget[i];
|
|
if (rt.src_blend != PipelineBlendFactor::kOne ||
|
|
rt.dest_blend != PipelineBlendFactor::kZero ||
|
|
rt.blend_op != xenos::BlendOp::kAdd ||
|
|
rt.src_blend_alpha != PipelineBlendFactor::kOne ||
|
|
rt.dest_blend_alpha != PipelineBlendFactor::kZero ||
|
|
rt.blend_op_alpha != xenos::BlendOp::kAdd) {
|
|
blend_desc.BlendEnable = TRUE;
|
|
blend_desc.SrcBlend = kBlendFactorMap[uint32_t(rt.src_blend)];
|
|
blend_desc.DestBlend = kBlendFactorMap[uint32_t(rt.dest_blend)];
|
|
blend_desc.BlendOp = kBlendOpMap[uint32_t(rt.blend_op)];
|
|
blend_desc.SrcBlendAlpha =
|
|
kBlendFactorMap[uint32_t(rt.src_blend_alpha)];
|
|
blend_desc.DestBlendAlpha =
|
|
kBlendFactorMap[uint32_t(rt.dest_blend_alpha)];
|
|
blend_desc.BlendOpAlpha = kBlendOpMap[uint32_t(rt.blend_op_alpha)];
|
|
}
|
|
blend_desc.RenderTargetWriteMask = rt.write_mask;
|
|
}
|
|
}
|
|
|
|
// Disable rasterization if needed (parameter combinations that make no
|
|
// difference when rasterization is disabled have already been handled in
|
|
// GetCurrentStateDescription) the way it's disabled in Direct3D by design
|
|
// (disabling a pixel shader and depth / stencil).
|
|
// TODO(Triang3l): When it happens to be that a combination of parameters
|
|
// (no host pixel shader and depth / stencil without ROV) would disable
|
|
// rasterization when it's still needed (for occlusion query sample counting),
|
|
// ensure rasterization happens (by binding an empty pixel shader, or maybe
|
|
// via ForcedSampleCount when not using 2x MSAA - its requirements for
|
|
// OMSetRenderTargets need some investigation though).
|
|
if (description.cull_mode == PipelineCullMode::kDisableRasterization) {
|
|
state_desc.PS.pShaderBytecode = nullptr;
|
|
state_desc.PS.BytecodeLength = 0;
|
|
state_desc.DepthStencilState.DepthEnable = FALSE;
|
|
state_desc.DepthStencilState.StencilEnable = FALSE;
|
|
}
|
|
|
|
// Create the D3D12 pipeline state object.
|
|
ID3D12Device* device = command_processor_.GetD3D12Provider().GetDevice();
|
|
ID3D12PipelineState* state;
|
|
if (FAILED(device->CreateGraphicsPipelineState(&state_desc,
|
|
IID_PPV_ARGS(&state)))) {
|
|
if (runtime_description.pixel_shader != nullptr) {
|
|
XELOGE("Failed to create graphics pipeline with VS {:016X}, PS {:016X}",
|
|
runtime_description.vertex_shader->shader().ucode_data_hash(),
|
|
runtime_description.pixel_shader->shader().ucode_data_hash());
|
|
} else {
|
|
XELOGE("Failed to create graphics pipeline with VS {:016X}",
|
|
runtime_description.vertex_shader->shader().ucode_data_hash());
|
|
}
|
|
return nullptr;
|
|
}
|
|
std::wstring name;
|
|
if (runtime_description.pixel_shader != nullptr) {
|
|
name = fmt::format(
|
|
L"VS {:016X}, PS {:016X}",
|
|
runtime_description.vertex_shader->shader().ucode_data_hash(),
|
|
runtime_description.pixel_shader->shader().ucode_data_hash());
|
|
} else {
|
|
name = fmt::format(
|
|
L"VS {:016X}",
|
|
runtime_description.vertex_shader->shader().ucode_data_hash());
|
|
}
|
|
state->SetName(name.c_str());
|
|
return state;
|
|
}
|
|
|
|
void PipelineCache::StorageWriteThread() {
|
|
ShaderStoredHeader shader_header;
|
|
// Don't leak anything in unused bits.
|
|
std::memset(&shader_header, 0, sizeof(shader_header));
|
|
|
|
std::vector<uint32_t> ucode_guest_endian;
|
|
ucode_guest_endian.reserve(0xFFFF);
|
|
|
|
bool flush_shaders = false;
|
|
bool flush_pipelines = false;
|
|
|
|
while (true) {
|
|
if (flush_shaders) {
|
|
flush_shaders = false;
|
|
assert_not_null(shader_storage_file_);
|
|
fflush(shader_storage_file_);
|
|
}
|
|
if (flush_pipelines) {
|
|
flush_pipelines = false;
|
|
assert_not_null(pipeline_storage_file_);
|
|
fflush(pipeline_storage_file_);
|
|
}
|
|
|
|
const Shader* shader = nullptr;
|
|
PipelineStoredDescription pipeline_description;
|
|
bool write_pipeline = false;
|
|
{
|
|
std::unique_lock<std::mutex> lock(storage_write_request_lock_);
|
|
if (storage_write_thread_shutdown_) {
|
|
return;
|
|
}
|
|
if (!storage_write_shader_queue_.empty()) {
|
|
shader = storage_write_shader_queue_.front();
|
|
storage_write_shader_queue_.pop_front();
|
|
} else if (storage_write_flush_shaders_) {
|
|
storage_write_flush_shaders_ = false;
|
|
flush_shaders = true;
|
|
}
|
|
if (!storage_write_pipeline_queue_.empty()) {
|
|
std::memcpy(&pipeline_description,
|
|
&storage_write_pipeline_queue_.front(),
|
|
sizeof(pipeline_description));
|
|
storage_write_pipeline_queue_.pop_front();
|
|
write_pipeline = true;
|
|
} else if (storage_write_flush_pipelines_) {
|
|
storage_write_flush_pipelines_ = false;
|
|
flush_pipelines = true;
|
|
}
|
|
if (!shader && !write_pipeline) {
|
|
storage_write_request_cond_.wait(lock);
|
|
continue;
|
|
}
|
|
}
|
|
|
|
if (shader) {
|
|
shader_header.ucode_data_hash = shader->ucode_data_hash();
|
|
shader_header.ucode_dword_count = shader->ucode_dword_count();
|
|
shader_header.type = shader->type();
|
|
assert_not_null(shader_storage_file_);
|
|
fwrite(&shader_header, sizeof(shader_header), 1, shader_storage_file_);
|
|
if (shader_header.ucode_dword_count) {
|
|
ucode_guest_endian.resize(shader_header.ucode_dword_count);
|
|
// Need to swap because the hash is calculated for the shader with guest
|
|
// endianness.
|
|
xe::copy_and_swap(ucode_guest_endian.data(), shader->ucode_dwords(),
|
|
shader_header.ucode_dword_count);
|
|
fwrite(ucode_guest_endian.data(),
|
|
shader_header.ucode_dword_count * sizeof(uint32_t), 1,
|
|
shader_storage_file_);
|
|
}
|
|
}
|
|
|
|
if (write_pipeline) {
|
|
assert_not_null(pipeline_storage_file_);
|
|
fwrite(&pipeline_description, sizeof(pipeline_description), 1,
|
|
pipeline_storage_file_);
|
|
}
|
|
}
|
|
}
|
|
|
|
void PipelineCache::CreationThread(size_t thread_index) {
|
|
while (true) {
|
|
Pipeline* pipeline_to_create = nullptr;
|
|
|
|
// Check if need to shut down or set the completion event and dequeue the
|
|
// pipeline if there is any.
|
|
{
|
|
std::unique_lock<xe_mutex> lock(creation_request_lock_);
|
|
if (thread_index >= creation_threads_shutdown_from_ ||
|
|
creation_queue_.empty()) {
|
|
if (creation_completion_set_event_ && creation_threads_busy_ == 0) {
|
|
// Last pipeline in the queue created - signal the event if requested.
|
|
creation_completion_set_event_ = false;
|
|
creation_completion_event_->Set();
|
|
}
|
|
if (thread_index >= creation_threads_shutdown_from_) {
|
|
return;
|
|
}
|
|
creation_request_cond_.wait(lock);
|
|
continue;
|
|
}
|
|
// Take the pipeline from the queue and increment the busy thread count
|
|
// until the pipeline is created - other threads must be able to dequeue
|
|
// requests, but can't set the completion event until the pipelines are
|
|
// fully created (rather than just started creating).
|
|
pipeline_to_create = creation_queue_.front();
|
|
creation_queue_.pop_front();
|
|
++creation_threads_busy_;
|
|
}
|
|
|
|
// Create the D3D12 pipeline state object.
|
|
pipeline_to_create->state =
|
|
CreateD3D12Pipeline(pipeline_to_create->description);
|
|
|
|
// Pipeline created - the thread is not busy anymore, safe to set the
|
|
// completion event if needed (at the next iteration, or in some other
|
|
// thread).
|
|
{
|
|
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
|
--creation_threads_busy_;
|
|
}
|
|
}
|
|
}
|
|
|
|
void PipelineCache::CreateQueuedPipelinesOnProcessorThread() {
|
|
assert_false(creation_threads_.empty());
|
|
while (true) {
|
|
Pipeline* pipeline_to_create;
|
|
{
|
|
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
|
if (creation_queue_.empty()) {
|
|
break;
|
|
}
|
|
pipeline_to_create = creation_queue_.front();
|
|
creation_queue_.pop_front();
|
|
}
|
|
pipeline_to_create->state =
|
|
CreateD3D12Pipeline(pipeline_to_create->description);
|
|
}
|
|
}
|
|
|
|
} // namespace d3d12
|
|
} // namespace gpu
|
|
} // namespace xe
|