/**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
 * Copyright 2022 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */

#include "xenia/gpu/d3d12/pipeline_cache.h"

#include <algorithm>
#include <atomic>
#include <cinttypes>
#include <cmath>
#include <cstring>
#include <deque>
#include <mutex>
#include <set>
#include <utility>
#include <vector>

#include "third_party/dxbc/DXBCChecksum.h"
#include "third_party/fmt/include/fmt/format.h"
#include "xenia/base/assert.h"
#include "xenia/base/byte_order.h"
#include "xenia/base/clock.h"
#include "xenia/base/cvar.h"
#include "xenia/base/filesystem.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/profiling.h"
#include "xenia/base/string.h"
#include "xenia/base/string_buffer.h"
#include "xenia/base/xxhash.h"
#include "xenia/gpu/d3d12/d3d12_command_processor.h"
#include "xenia/gpu/d3d12/d3d12_render_target_cache.h"
#include "xenia/gpu/draw_util.h"
#include "xenia/gpu/dxbc.h"
#include "xenia/gpu/dxbc_shader_translator.h"
#include "xenia/gpu/gpu_flags.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/xenos.h"
#include "xenia/ui/d3d12/d3d12_util.h"

DEFINE_bool(d3d12_dxbc_disasm, false,
            "Disassemble DXBC shaders after generation.", "D3D12");
DEFINE_bool(
    d3d12_dxbc_disasm_dxilconv, false,
    "Disassemble DXBC shaders after conversion to DXIL, if DXIL shaders are "
    "supported by the OS, and DirectX Shader Compiler DLLs available at "
    "https://github.com/microsoft/DirectXShaderCompiler/releases are present.",
    "D3D12");
DEFINE_int32(
    d3d12_pipeline_creation_threads, -1,
    "Number of threads used for graphics pipeline creation. -1 to calculate "
    "automatically (75% of logical CPU cores), a positive number to specify "
    "the number of threads explicitly (up to the number of logical CPU cores), "
    "0 to disable multithreaded pipeline creation.",
    "D3D12");
DEFINE_bool(d3d12_tessellation_wireframe, false,
            "Display tessellated surfaces as wireframe for debugging.",
            "D3D12");

namespace xe {
namespace gpu {
namespace d3d12 {

// Generated with `xb buildshaders`.
namespace shaders {
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/adaptive_quad_hs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/adaptive_triangle_hs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/continuous_quad_1cp_hs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/continuous_quad_4cp_hs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/continuous_triangle_1cp_hs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/continuous_triangle_3cp_hs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/discrete_quad_1cp_hs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/discrete_quad_4cp_hs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/discrete_triangle_1cp_hs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/discrete_triangle_3cp_hs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/float24_round_ps.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/float24_truncate_ps.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/tessellation_adaptive_vs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/tessellation_indexed_vs.h"
}  // namespace shaders

PipelineCache::PipelineCache(D3D12CommandProcessor& command_processor,
                             const RegisterFile& register_file,
                             const D3D12RenderTargetCache& render_target_cache,
                             bool bindless_resources_used)
    : command_processor_(command_processor),
      register_file_(register_file),
      render_target_cache_(render_target_cache),
      bindless_resources_used_(bindless_resources_used) {
  const ui::d3d12::D3D12Provider& provider =
      command_processor_.GetD3D12Provider();

  bool edram_rov_used = render_target_cache.GetPath() ==
                        RenderTargetCache::Path::kPixelShaderInterlock;
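  // The pixel shader interlock path emulates the EDRAM through a
  // rasterizer-ordered view in pixel shaders rather than through host render
  // targets, which changes how shaders have to be translated.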

  shader_translator_ = std::make_unique<DxbcShaderTranslator>(
      provider.GetAdapterVendorID(), bindless_resources_used_, edram_rov_used,
      render_target_cache_.gamma_render_target_as_srgb(),
      render_target_cache_.msaa_2x_supported(),
      render_target_cache_.draw_resolution_scale_x(),
      render_target_cache_.draw_resolution_scale_y(),
      provider.GetGraphicsAnalysis() != nullptr);

  if (edram_rov_used) {
    depth_only_pixel_shader_ =
        std::move(shader_translator_->CreateDepthOnlyPixelShader());
  }
}

PipelineCache::~PipelineCache() { Shutdown(); }

bool PipelineCache::Initialize() {
  const ui::d3d12::D3D12Provider& provider =
      command_processor_.GetD3D12Provider();

  // Initialize the command processor thread DXIL objects.
  dxbc_converter_ = nullptr;
  dxc_utils_ = nullptr;
  dxc_compiler_ = nullptr;
  if (cvars::d3d12_dxbc_disasm_dxilconv) {
    if (FAILED(provider.DxbcConverterCreateInstance(
            CLSID_DxbcConverter, IID_PPV_ARGS(&dxbc_converter_)))) {
      XELOGE(
          "Failed to create DxbcConverter, converted DXIL disassembly for "
          "debugging will be unavailable");
    }
    if (FAILED(provider.DxcCreateInstance(CLSID_DxcUtils,
                                          IID_PPV_ARGS(&dxc_utils_)))) {
      XELOGE(
          "Failed to create DxcUtils, converted DXIL disassembly for debugging "
          "will be unavailable");
    }
    if (FAILED(provider.DxcCreateInstance(CLSID_DxcCompiler,
                                          IID_PPV_ARGS(&dxc_compiler_)))) {
      XELOGE(
          "Failed to create DxcCompiler, converted DXIL disassembly for "
          "debugging will be unavailable");
    }
  }

  uint32_t logical_processor_count = xe::threading::logical_processor_count();
  if (!logical_processor_count) {
    // Pick some reasonable amount if the number of cores couldn't be
    // determined.
    logical_processor_count = 6;
  }
  // Initialize creation thread synchronization data even if not using creation
  // threads because they may be used anyway to create pipelines from the
  // storage.
  creation_threads_busy_ = 0;
  creation_completion_event_ =
      xe::threading::Event::CreateManualResetEvent(true);
  assert_not_null(creation_completion_event_);
  creation_completion_set_event_ = false;
  creation_threads_shutdown_from_ = SIZE_MAX;
  if (cvars::d3d12_pipeline_creation_threads != 0) {
    size_t creation_thread_count;
    if (cvars::d3d12_pipeline_creation_threads < 0) {
      creation_thread_count =
          std::max(logical_processor_count * 3 / 4, uint32_t(1));
    } else {
      creation_thread_count =
          std::min(uint32_t(cvars::d3d12_pipeline_creation_threads),
                   logical_processor_count);
    }
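    // With the default of -1, a machine with 12 logical cores, for instance,
    // gets 12 * 3 / 4 = 9 creation threads (integer division); explicit
    // values are clamped to the logical core count.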
    for (size_t i = 0; i < creation_thread_count; ++i) {
      std::unique_ptr<xe::threading::Thread> creation_thread =
          xe::threading::Thread::Create({}, [this, i]() { CreationThread(i); });
      assert_not_null(creation_thread);
      creation_thread->set_name("D3D12 Pipelines");
      creation_threads_.push_back(std::move(creation_thread));
    }
  }
  return true;
}

void PipelineCache::Shutdown() {
  // Shut down all threads, before destroying the pipelines since they may be
  // creating them.
  if (!creation_threads_.empty()) {
    {
      std::lock_guard<xe_mutex> lock(creation_request_lock_);
      creation_threads_shutdown_from_ = 0;
    }
    creation_request_cond_.notify_all();
    for (size_t i = 0; i < creation_threads_.size(); ++i) {
      xe::threading::Wait(creation_threads_[i].get(), false);
    }
    creation_threads_.clear();
  }
  creation_completion_event_.reset();

  // Shut down the persistent shader / pipeline storage.
  ShutdownShaderStorage();

  // Destroy all pipelines.
  current_pipeline_ = nullptr;
  for (auto it : pipelines_) {
    it.second->state->Release();
    delete it.second;
  }
  pipelines_.clear();
  COUNT_profile_set("gpu/pipeline_cache/pipelines", 0);

  // Destroy all shaders.
  if (bindless_resources_used_) {
    bindless_sampler_layout_map_.clear();
    bindless_sampler_layouts_.clear();
  }
  texture_binding_layout_map_.clear();
  texture_binding_layouts_.clear();
  for (auto it : shaders_) {
    delete it.second;
  }
  shaders_.clear();
  shader_storage_index_ = 0;

  // Shut down shader translation.
  ui::d3d12::util::ReleaseAndNull(dxc_compiler_);
  ui::d3d12::util::ReleaseAndNull(dxc_utils_);
  ui::d3d12::util::ReleaseAndNull(dxbc_converter_);
}

void PipelineCache::InitializeShaderStorage(
    const std::filesystem::path& cache_root, uint32_t title_id, bool blocking) {
  ShutdownShaderStorage();

  auto shader_storage_root = cache_root / "shaders";
  // For files that can be moved between different hosts.
  // Host PSO blobs - if ever added - should be stored in shaders/local/ (they
  // currently aren't used because they may not be very practical - they would
  // likely need to be invalidated on every commit, and add I/O cost - though
  // D3D's internal validation would possibly be enough to ensure they are up
  // to date).
  auto shader_storage_shareable_root = shader_storage_root / "shareable";
  if (!std::filesystem::exists(shader_storage_shareable_root)) {
    if (!std::filesystem::create_directories(shader_storage_shareable_root)) {
      XELOGE(
          "Failed to create the shareable shader storage directory, persistent "
          "shader storage will be disabled: {}",
          xe::path_to_utf8(shader_storage_shareable_root));
      return;
    }
  }

  bool edram_rov_used = render_target_cache_.GetPath() ==
                        RenderTargetCache::Path::kPixelShaderInterlock;

  // Initialize the pipeline storage stream - read pipeline descriptions and
  // collect used shader modifications to translate.
  std::vector<PipelineStoredDescription> pipeline_stored_descriptions;
  // <Shader hash, modification bits>.
  std::set<std::pair<uint64_t, uint64_t>> shader_translations_needed;
  auto pipeline_storage_file_path =
      shader_storage_shareable_root /
      fmt::format("{:08X}.{}.d3d12.xpso", title_id,
                  edram_rov_used ? "rov" : "rtv");
  pipeline_storage_file_ =
      xe::filesystem::OpenFile(pipeline_storage_file_path, "a+b");
  if (!pipeline_storage_file_) {
    XELOGE(
        "Failed to open the Direct3D 12 pipeline description storage file for "
        "writing, persistent shader storage will be disabled: {}",
        xe::path_to_utf8(pipeline_storage_file_path));
    return;
  }
  pipeline_storage_file_flush_needed_ = false;
  // 'XEPS'.
  const uint32_t pipeline_storage_magic = 0x53504558;
  // 'DXRO' or 'DXRT'.
  const uint32_t pipeline_storage_magic_api =
      edram_rov_used ? 0x4F525844 : 0x54525844;
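  // These magics are FourCCs read as little-endian uint32_t: the bytes of
  // 0x53504558 are 'X' 'E' 'P' 'S', and 0x4F525844 / 0x54525844 spell "DXRO" /
  // "DXRT" for the ROV and RTV render target paths respectively.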
  const uint32_t pipeline_storage_version_swapped =
      xe::byte_swap(std::max(PipelineDescription::kVersion,
                             DxbcShaderTranslator::Modification::kVersion));
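  // The stored version is the newer of the pipeline description layout
  // version and the shader modification bits version, so bumping either one
  // invalidates previously stored pipelines; it's kept byte-swapped,
  // presumably so files written on a host with a different byte order also
  // fail the header check.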
  struct {
    uint32_t magic;
    uint32_t magic_api;
    uint32_t version_swapped;
  } pipeline_storage_file_header;
  if (fread(&pipeline_storage_file_header, sizeof(pipeline_storage_file_header),
            1, pipeline_storage_file_) &&
      pipeline_storage_file_header.magic == pipeline_storage_magic &&
      pipeline_storage_file_header.magic_api == pipeline_storage_magic_api &&
      pipeline_storage_file_header.version_swapped ==
          pipeline_storage_version_swapped) {
    xe::filesystem::Seek(pipeline_storage_file_, 0, SEEK_END);
    int64_t pipeline_storage_told_end =
        xe::filesystem::Tell(pipeline_storage_file_);
    size_t pipeline_storage_told_count =
        size_t(pipeline_storage_told_end >=
                       int64_t(sizeof(pipeline_storage_file_header))
                   ? (uint64_t(pipeline_storage_told_end) -
                      sizeof(pipeline_storage_file_header)) /
                         sizeof(PipelineStoredDescription)
                   : 0);
    if (pipeline_storage_told_count &&
        xe::filesystem::Seek(pipeline_storage_file_,
                             int64_t(sizeof(pipeline_storage_file_header)),
                             SEEK_SET)) {
      pipeline_stored_descriptions.resize(pipeline_storage_told_count);
      pipeline_stored_descriptions.resize(
          fread(pipeline_stored_descriptions.data(),
                sizeof(PipelineStoredDescription), pipeline_storage_told_count,
                pipeline_storage_file_));
      size_t pipeline_storage_read_count = pipeline_stored_descriptions.size();
      for (size_t i = 0; i < pipeline_storage_read_count; ++i) {
        const PipelineStoredDescription& pipeline_stored_description =
            pipeline_stored_descriptions[i];
        // Validate file integrity, stop and truncate the stream if data is
        // corrupted.
        if (XXH3_64bits(&pipeline_stored_description.description,
                        sizeof(pipeline_stored_description.description)) !=
            pipeline_stored_description.description_hash) {
          pipeline_stored_descriptions.resize(i);
          break;
        }
        // TODO(Triang3l): On Vulkan, skip pipelines requiring unsupported
        // device features (to keep the cache files mostly shareable across
        // devices).
        // Mark the shader modifications as needed for translation.
        shader_translations_needed.emplace(
            pipeline_stored_description.description.vertex_shader_hash,
            pipeline_stored_description.description.vertex_shader_modification);
        if (pipeline_stored_description.description.pixel_shader_hash) {
          shader_translations_needed.emplace(
              pipeline_stored_description.description.pixel_shader_hash,
              pipeline_stored_description.description
                  .pixel_shader_modification);
        }
      }
    }
  }

  size_t logical_processor_count = xe::threading::logical_processor_count();
  if (!logical_processor_count) {
    // Pick some reasonable amount if the number of cores couldn't be
    // determined.
    logical_processor_count = 6;
  }

  // Initialize the Xenos shader storage stream.
  uint64_t shader_storage_initialization_start =
      xe::Clock::QueryHostTickCount();
  auto shader_storage_file_path =
      shader_storage_shareable_root / fmt::format("{:08X}.xsh", title_id);
  shader_storage_file_ =
      xe::filesystem::OpenFile(shader_storage_file_path, "a+b");
  if (!shader_storage_file_) {
    XELOGE(
        "Failed to open the guest shader storage file for writing, persistent "
        "shader storage will be disabled: {}",
        xe::path_to_utf8(shader_storage_file_path));
    fclose(pipeline_storage_file_);
    pipeline_storage_file_ = nullptr;
    return;
  }
  ++shader_storage_index_;
  shader_storage_file_flush_needed_ = false;
  struct {
    uint32_t magic;
    uint32_t version_swapped;
  } shader_storage_file_header;
  // 'XESH'.
  const uint32_t shader_storage_magic = 0x48534558;
  if (fread(&shader_storage_file_header, sizeof(shader_storage_file_header), 1,
            shader_storage_file_) &&
      shader_storage_file_header.magic == shader_storage_magic &&
      xe::byte_swap(shader_storage_file_header.version_swapped) ==
          ShaderStoredHeader::kVersion) {
    uint64_t shader_storage_valid_bytes = sizeof(shader_storage_file_header);
    // Load and translate shaders written by previous Xenia executions until
    // the end of the file or until a corrupted one is detected.
    ShaderStoredHeader shader_header;
    std::vector<uint32_t> ucode_dwords;
    ucode_dwords.reserve(0xFFFF);
    size_t shaders_translated = 0;

    // Threads overlapping file reading.
    std::mutex shaders_translation_thread_mutex;
    std::condition_variable shaders_translation_thread_cond;
    std::deque<D3D12Shader*> shaders_to_translate;
    size_t shader_translation_threads_busy = 0;
    bool shader_translation_threads_shutdown = false;
    std::mutex shaders_failed_to_translate_mutex;
    std::vector<D3D12Shader::D3D12Translation*> shaders_failed_to_translate;
    auto shader_translation_thread_function = [&]() {
      const ui::d3d12::D3D12Provider& provider =
          command_processor_.GetD3D12Provider();
      StringBuffer ucode_disasm_buffer;
      DxbcShaderTranslator translator(
          provider.GetAdapterVendorID(), bindless_resources_used_,
          edram_rov_used, render_target_cache_.gamma_render_target_as_srgb(),
          render_target_cache_.msaa_2x_supported(),
          render_target_cache_.draw_resolution_scale_x(),
          render_target_cache_.draw_resolution_scale_y(),
          provider.GetGraphicsAnalysis() != nullptr);
      // If needed and possible, create objects needed for DXIL conversion and
      // disassembly on this thread.
      IDxbcConverter* dxbc_converter = nullptr;
      IDxcUtils* dxc_utils = nullptr;
      IDxcCompiler* dxc_compiler = nullptr;
      if (cvars::d3d12_dxbc_disasm_dxilconv && dxbc_converter_ && dxc_utils_ &&
          dxc_compiler_) {
        provider.DxbcConverterCreateInstance(CLSID_DxbcConverter,
                                             IID_PPV_ARGS(&dxbc_converter));
        provider.DxcCreateInstance(CLSID_DxcUtils, IID_PPV_ARGS(&dxc_utils));
        provider.DxcCreateInstance(CLSID_DxcCompiler,
                                   IID_PPV_ARGS(&dxc_compiler));
      }
      for (;;) {
        D3D12Shader* shader_to_translate;
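        // Standard condition-variable consumer: re-check the queue after
        // every wake so that spurious wakeups and lost races with other
        // consumers are handled by simply waiting again.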
        for (;;) {
          std::unique_lock<std::mutex> lock(shaders_translation_thread_mutex);
          if (shaders_to_translate.empty()) {
            if (shader_translation_threads_shutdown) {
              return;
            }
            shaders_translation_thread_cond.wait(lock);
            continue;
          }
          shader_to_translate = shaders_to_translate.front();
          shaders_to_translate.pop_front();
          ++shader_translation_threads_busy;
          break;
        }
        if (!shader_to_translate->is_ucode_analyzed()) {
          shader_to_translate->AnalyzeUcode(ucode_disasm_buffer);
        }
        // Translate each needed modification on this thread after performing
        // modification-independent analysis of the whole shader.
        uint64_t ucode_data_hash = shader_to_translate->ucode_data_hash();
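        // shader_translations_needed is an ordered set of (ucode hash,
        // modification bits) pairs, so lower_bound with modification 0 lands
        // on the first entry for this shader, and all of its modifications
        // follow contiguously.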
        for (auto modification_it = shader_translations_needed.lower_bound(
                 std::make_pair(ucode_data_hash, uint64_t(0)));
             modification_it != shader_translations_needed.end() &&
             modification_it->first == ucode_data_hash;
             ++modification_it) {
          D3D12Shader::D3D12Translation* translation =
              static_cast<D3D12Shader::D3D12Translation*>(
                  shader_to_translate->GetOrCreateTranslation(
                      modification_it->second));
          // Only try (and delete in case of failure) if it's a new translation.
          // If this is a previously encountered shader whose translation has
          // already failed, and the shader storage is loaded later, keep it
          // that way so the translation isn't attempted again.
          if (!translation->is_translated() &&
              !TranslateAnalyzedShader(translator, *translation, dxbc_converter,
                                       dxc_utils, dxc_compiler)) {
            std::lock_guard<std::mutex> lock(shaders_failed_to_translate_mutex);
            shaders_failed_to_translate.push_back(translation);
          }
        }
        {
          std::lock_guard<std::mutex> lock(shaders_translation_thread_mutex);
          --shader_translation_threads_busy;
        }
      }
      if (dxc_compiler) {
        dxc_compiler->Release();
      }
      if (dxc_utils) {
        dxc_utils->Release();
      }
      if (dxbc_converter) {
        dxbc_converter->Release();
      }
    };
    std::vector<std::unique_ptr<xe::threading::Thread>>
        shader_translation_threads;

    while (true) {
      if (!fread(&shader_header, sizeof(shader_header), 1,
                 shader_storage_file_)) {
        break;
      }
      size_t ucode_byte_count =
          shader_header.ucode_dword_count * sizeof(uint32_t);
      ucode_dwords.resize(shader_header.ucode_dword_count);
      if (shader_header.ucode_dword_count &&
          !fread(ucode_dwords.data(), ucode_byte_count, 1,
                 shader_storage_file_)) {
        break;
      }
      uint64_t ucode_data_hash =
          XXH3_64bits(ucode_dwords.data(), ucode_byte_count);
      if (shader_header.ucode_data_hash != ucode_data_hash) {
        // Validation failed.
        break;
      }
      shader_storage_valid_bytes += sizeof(shader_header) + ucode_byte_count;
      D3D12Shader* shader =
          LoadShader(shader_header.type, ucode_dwords.data(),
                     shader_header.ucode_dword_count, ucode_data_hash);
      if (shader->ucode_storage_index() == shader_storage_index_) {
        // Appeared twice in this file for some reason - skip, otherwise a race
        // condition would be caused by translating it twice in parallel.
        continue;
      }
      // Loaded from the current storage - don't write again.
      shader->set_ucode_storage_index(shader_storage_index_);
      // Create new threads if the currently existing threads can't keep up
      // with file reading, but not more than the number of logical processors
      // minus one.
      size_t shader_translation_threads_needed;
      {
        std::lock_guard<std::mutex> lock(shaders_translation_thread_mutex);
        shader_translation_threads_needed =
            std::min(shader_translation_threads_busy +
                         shaders_to_translate.size() + size_t(1),
                     logical_processor_count - size_t(1));
      }
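      // For instance, with 3 threads busy and 2 shaders already queued on an
      // 8-core host, 3 + 2 + 1 = 6 threads are wanted, capped at 8 - 1 = 7,
      // presumably to leave a core for this file-reading thread.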
      while (shader_translation_threads.size() <
             shader_translation_threads_needed) {
        auto thread = xe::threading::Thread::Create(
            {}, shader_translation_thread_function);
        assert_not_null(thread);
        thread->set_name("Shader Translation");
        shader_translation_threads.push_back(std::move(thread));
      }
      // Request ucode information gathering and translation of all the needed
      // shaders.
      {
        std::lock_guard<std::mutex> lock(shaders_translation_thread_mutex);
        shaders_to_translate.push_back(shader);
      }
      shaders_translation_thread_cond.notify_one();
      ++shaders_translated;
    }
    if (!shader_translation_threads.empty()) {
      {
        std::lock_guard<std::mutex> lock(shaders_translation_thread_mutex);
        shader_translation_threads_shutdown = true;
      }
      shaders_translation_thread_cond.notify_all();
      for (auto& shader_translation_thread : shader_translation_threads) {
        xe::threading::Wait(shader_translation_thread.get(), false);
      }
      shader_translation_threads.clear();
      for (D3D12Shader::D3D12Translation* translation :
           shaders_failed_to_translate) {
        D3D12Shader* shader = static_cast<D3D12Shader*>(&translation->shader());
        shader->DestroyTranslation(translation->modification());
        if (shader->translations().empty()) {
          shaders_.erase(shader->ucode_data_hash());
          delete shader;
        }
      }
    }
    XELOGGPU("Translated {} shaders from the storage in {} milliseconds",
             shaders_translated,
             (xe::Clock::QueryHostTickCount() -
              shader_storage_initialization_start) *
                 1000 / xe::Clock::QueryHostTickFrequency());
    xe::filesystem::TruncateStdioFile(shader_storage_file_,
                                      shader_storage_valid_bytes);
  } else {
    xe::filesystem::TruncateStdioFile(shader_storage_file_, 0);
    shader_storage_file_header.magic = shader_storage_magic;
    shader_storage_file_header.version_swapped =
        xe::byte_swap(ShaderStoredHeader::kVersion);
    fwrite(&shader_storage_file_header, sizeof(shader_storage_file_header), 1,
           shader_storage_file_);
  }

  // Create the pipelines.
  if (!pipeline_stored_descriptions.empty()) {
    uint64_t pipeline_creation_start_ = xe::Clock::QueryHostTickCount();

    // Launch additional creation threads to use all cores to create pipelines
    // faster. Will also be using the main thread, so minus 1.
    size_t creation_thread_original_count = creation_threads_.size();
    size_t creation_thread_needed_count = std::max(
        std::min(pipeline_stored_descriptions.size(), logical_processor_count) -
            size_t(1),
        creation_thread_original_count);
    while (creation_threads_.size() < creation_thread_needed_count) {
      size_t creation_thread_index = creation_threads_.size();
      std::unique_ptr<xe::threading::Thread> creation_thread =
          xe::threading::Thread::Create({}, [this, creation_thread_index]() {
            CreationThread(creation_thread_index);
          });
      assert_not_null(creation_thread);
      creation_thread->set_name("D3D12 Pipelines");
      creation_threads_.push_back(std::move(creation_thread));
    }

    size_t pipelines_created = 0;
    for (const PipelineStoredDescription& pipeline_stored_description :
         pipeline_stored_descriptions) {
      const PipelineDescription& pipeline_description =
          pipeline_stored_description.description;
      // TODO(Triang3l): On Vulkan, skip pipelines requiring unsupported device
      // features (to keep the cache files mostly shareable across devices).
      // Skip already known pipelines - those have already been enqueued.
      auto found_range =
          pipelines_.equal_range(pipeline_stored_description.description_hash);
      bool pipeline_found = false;
      for (auto it = found_range.first; it != found_range.second; ++it) {
        Pipeline* found_pipeline = it->second;
        if (!std::memcmp(&found_pipeline->description.description,
                         &pipeline_description, sizeof(pipeline_description))) {
          pipeline_found = true;
          break;
        }
      }
      if (pipeline_found) {
        continue;
      }

      PipelineRuntimeDescription pipeline_runtime_description;
      auto vertex_shader_it =
          shaders_.find(pipeline_description.vertex_shader_hash);
      if (vertex_shader_it == shaders_.end()) {
        continue;
      }
      D3D12Shader* vertex_shader = vertex_shader_it->second;
      pipeline_runtime_description.vertex_shader =
          static_cast<D3D12Shader::D3D12Translation*>(
              vertex_shader->GetTranslation(
                  pipeline_description.vertex_shader_modification));
      if (!pipeline_runtime_description.vertex_shader ||
          !pipeline_runtime_description.vertex_shader->is_translated() ||
          !pipeline_runtime_description.vertex_shader->is_valid()) {
        continue;
      }
      D3D12Shader* pixel_shader;
      if (pipeline_description.pixel_shader_hash) {
        auto pixel_shader_it =
            shaders_.find(pipeline_description.pixel_shader_hash);
        if (pixel_shader_it == shaders_.end()) {
          continue;
        }
        pixel_shader = pixel_shader_it->second;
        pipeline_runtime_description.pixel_shader =
            static_cast<D3D12Shader::D3D12Translation*>(
                pixel_shader->GetTranslation(
                    pipeline_description.pixel_shader_modification));
        if (!pipeline_runtime_description.pixel_shader ||
            !pipeline_runtime_description.pixel_shader->is_translated() ||
            !pipeline_runtime_description.pixel_shader->is_valid()) {
          continue;
        }
      } else {
        pixel_shader = nullptr;
        pipeline_runtime_description.pixel_shader = nullptr;
      }
      GeometryShaderKey pipeline_geometry_shader_key;
      pipeline_runtime_description.geometry_shader =
          GetGeometryShaderKey(
              pipeline_description.geometry_shader,
              DxbcShaderTranslator::Modification(
                  pipeline_description.vertex_shader_modification),
              DxbcShaderTranslator::Modification(
                  pipeline_description.pixel_shader_modification),
              pipeline_geometry_shader_key)
              ? &GetGeometryShader(pipeline_geometry_shader_key)
              : nullptr;
      pipeline_runtime_description.root_signature =
          command_processor_.GetRootSignature(
              vertex_shader, pixel_shader,
              Shader::IsHostVertexShaderTypeDomain(
                  DxbcShaderTranslator::Modification(
                      pipeline_description.vertex_shader_modification)
                      .vertex.host_vertex_shader_type));
      if (!pipeline_runtime_description.root_signature) {
        continue;
      }
      std::memcpy(&pipeline_runtime_description.description,
                  &pipeline_description, sizeof(pipeline_description));

      Pipeline* new_pipeline = new Pipeline;
      new_pipeline->state = nullptr;
      std::memcpy(&new_pipeline->description, &pipeline_runtime_description,
                  sizeof(pipeline_runtime_description));
      pipelines_.emplace(pipeline_stored_description.description_hash,
                         new_pipeline);
      COUNT_profile_set("gpu/pipeline_cache/pipelines", pipelines_.size());
      if (!creation_threads_.empty()) {
        // Submit the pipeline for creation to any available thread.
        {
          std::lock_guard<xe_mutex> lock(creation_request_lock_);
          creation_queue_.push_back(new_pipeline);
        }
        creation_request_cond_.notify_one();
      } else {
        new_pipeline->state = CreateD3D12Pipeline(pipeline_runtime_description);
      }
      ++pipelines_created;
    }

    if (!creation_threads_.empty()) {
      CreateQueuedPipelinesOnProcessorThread();
      if (creation_threads_.size() > creation_thread_original_count) {
        {
          std::lock_guard<xe_mutex> lock(creation_request_lock_);
          creation_threads_shutdown_from_ = creation_thread_original_count;
          // Assuming the queue is empty because of
          // CreateQueuedPipelinesOnProcessorThread.
        }
        creation_request_cond_.notify_all();
        while (creation_threads_.size() > creation_thread_original_count) {
          xe::threading::Wait(creation_threads_.back().get(), false);
          creation_threads_.pop_back();
        }
        bool await_creation_completion_event;
        {
          // Cleanup so additional threads can be created later again.
          std::lock_guard<xe_mutex> lock(creation_request_lock_);
          creation_threads_shutdown_from_ = SIZE_MAX;
          // If the invocation is blocking, all the shader storage
          // initialization is expected to be done before proceeding, to avoid
          // latency in the command processor after the invocation.
          await_creation_completion_event =
              blocking && creation_threads_busy_ != 0;
          if (await_creation_completion_event) {
            creation_completion_event_->Reset();
            creation_completion_set_event_ = true;
          }
        }
        if (await_creation_completion_event) {
          creation_request_cond_.notify_one();
          xe::threading::Wait(creation_completion_event_.get(), false);
        }
      }
    }

    XELOGGPU(
        "Created {} graphics pipelines (not including reading the "
        "descriptions) from the storage in {} milliseconds",
        pipelines_created,
        (xe::Clock::QueryHostTickCount() - pipeline_creation_start_) * 1000 /
            xe::Clock::QueryHostTickFrequency());
    // If any pipeline descriptions were corrupted (or the whole file has
    // excess bytes at the end), truncate to the last valid pipeline
    // description.
    xe::filesystem::TruncateStdioFile(
        pipeline_storage_file_,
        uint64_t(sizeof(pipeline_storage_file_header) +
                 sizeof(PipelineStoredDescription) *
                     pipeline_stored_descriptions.size()));
  } else {
    xe::filesystem::TruncateStdioFile(pipeline_storage_file_, 0);
    pipeline_storage_file_header.magic = pipeline_storage_magic;
    pipeline_storage_file_header.magic_api = pipeline_storage_magic_api;
    pipeline_storage_file_header.version_swapped =
        pipeline_storage_version_swapped;
    fwrite(&pipeline_storage_file_header, sizeof(pipeline_storage_file_header),
           1, pipeline_storage_file_);
  }

  shader_storage_cache_root_ = cache_root;
  shader_storage_title_id_ = title_id;

  // Start the storage writing thread.
  storage_write_flush_shaders_ = false;
  storage_write_flush_pipelines_ = false;
  storage_write_thread_shutdown_ = false;
  storage_write_thread_ =
      xe::threading::Thread::Create({}, [this]() { StorageWriteThread(); });
  assert_not_null(storage_write_thread_);
  storage_write_thread_->set_name("D3D12 Storage writer");
}

void PipelineCache::ShutdownShaderStorage() {
  if (storage_write_thread_) {
    {
      std::lock_guard<std::mutex> lock(storage_write_request_lock_);
      storage_write_thread_shutdown_ = true;
    }
    storage_write_request_cond_.notify_all();
    xe::threading::Wait(storage_write_thread_.get(), false);
    storage_write_thread_.reset();
  }
  storage_write_shader_queue_.clear();
  storage_write_pipeline_queue_.clear();

  if (pipeline_storage_file_) {
    fclose(pipeline_storage_file_);
    pipeline_storage_file_ = nullptr;
    pipeline_storage_file_flush_needed_ = false;
  }

  if (shader_storage_file_) {
    fclose(shader_storage_file_);
    shader_storage_file_ = nullptr;
    shader_storage_file_flush_needed_ = false;
  }

  shader_storage_cache_root_.clear();
  shader_storage_title_id_ = 0;
}

void PipelineCache::EndSubmission() {
  if (shader_storage_file_flush_needed_ ||
      pipeline_storage_file_flush_needed_) {
    {
      std::lock_guard<std::mutex> lock(storage_write_request_lock_);
      if (shader_storage_file_flush_needed_) {
        storage_write_flush_shaders_ = true;
      }
      if (pipeline_storage_file_flush_needed_) {
        storage_write_flush_pipelines_ = true;
      }
    }
    storage_write_request_cond_.notify_one();
    shader_storage_file_flush_needed_ = false;
    pipeline_storage_file_flush_needed_ = false;
  }
  if (!creation_threads_.empty()) {
    CreateQueuedPipelinesOnProcessorThread();
    // Await creation of all queued pipelines.
    bool await_creation_completion_event;
    {
      std::lock_guard<xe_mutex> lock(creation_request_lock_);
      // Assuming the creation queue is already empty (because the processor
      // thread also worked on creating the leftover pipelines), so only check
      // if there are threads with pipelines currently being created.
      await_creation_completion_event = creation_threads_busy_ != 0;
      if (await_creation_completion_event) {
        creation_completion_event_->Reset();
        creation_completion_set_event_ = true;
      }
    }
    if (await_creation_completion_event) {
      creation_request_cond_.notify_one();
      xe::threading::Wait(creation_completion_event_.get(), false);
    }
  }
}

bool PipelineCache::IsCreatingPipelines() {
  if (creation_threads_.empty()) {
    return false;
  }
  std::lock_guard<xe_mutex> lock(creation_request_lock_);
  return !creation_queue_.empty() || creation_threads_busy_ != 0;
}

D3D12Shader* PipelineCache::LoadShader(xenos::ShaderType shader_type,
                                       const uint32_t* host_address,
                                       uint32_t dword_count) {
  // Hash the input memory and look up the shader.
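  // The 64-bit XXH3 digest of the raw ucode serves as the shader's identity
  // throughout the cache, so two ucode blobs hashing identically would be
  // treated as the same shader - presumably an acceptable risk at 64 bits.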
  return LoadShader(shader_type, host_address, dword_count,
                    XXH3_64bits(host_address, dword_count * sizeof(uint32_t)));
}

D3D12Shader* PipelineCache::LoadShader(xenos::ShaderType shader_type,
                                       const uint32_t* host_address,
                                       uint32_t dword_count,
                                       uint64_t data_hash) {
  auto it = shaders_.find(data_hash);
  if (it != shaders_.end()) {
    // Shader has been previously loaded.
    return it->second;
  }
  // Always create the shader and stash it away.
  // We need to track it even if it fails translation so we know not to try
  // again.
  D3D12Shader* shader =
      new D3D12Shader(shader_type, data_hash, host_address, dword_count);
  shaders_.emplace(data_hash, shader);
  return shader;
}

DxbcShaderTranslator::Modification
PipelineCache::GetCurrentVertexShaderModification(
    const Shader& shader, Shader::HostVertexShaderType host_vertex_shader_type,
    uint32_t interpolator_mask) const {
  assert_true(shader.type() == xenos::ShaderType::kVertex);
  assert_true(shader.is_ucode_analyzed());
  const auto& regs = register_file_;

  DxbcShaderTranslator::Modification modification(
      shader_translator_->GetDefaultVertexShaderModification(
          shader.GetDynamicAddressableRegisterCount(
              regs.Get<reg::SQ_PROGRAM_CNTL>().vs_num_reg),
          host_vertex_shader_type));

  modification.vertex.interpolator_mask = interpolator_mask;

  auto pa_cl_clip_cntl = regs.Get<reg::PA_CL_CLIP_CNTL>();
  uint32_t user_clip_planes =
      pa_cl_clip_cntl.clip_disable ? 0 : pa_cl_clip_cntl.ucp_ena;
  modification.vertex.user_clip_plane_count = xe::bit_count(user_clip_planes);
  modification.vertex.user_clip_plane_cull =
      uint32_t(user_clip_planes && pa_cl_clip_cntl.ucp_cull_only_ena);
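  // writes_point_size_edge_flag_kill_vertex() packs, judging by its name and
  // the uses below, the point size export in bit 0 and the kill-vertex export
  // in bit 2 (with the edge flag in bit 1).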
  modification.vertex.vertex_kill_and =
      uint32_t((shader.writes_point_size_edge_flag_kill_vertex() & 0b100) &&
               !pa_cl_clip_cntl.vtx_kill_or);

  modification.vertex.output_point_size =
      uint32_t((shader.writes_point_size_edge_flag_kill_vertex() & 0b001) &&
               regs.Get<reg::VGT_DRAW_INITIATOR>().prim_type ==
                   xenos::PrimitiveType::kPointList);

  return modification;
}

DxbcShaderTranslator::Modification
PipelineCache::GetCurrentPixelShaderModification(
    const Shader& shader, uint32_t interpolator_mask, uint32_t param_gen_pos,
    reg::RB_DEPTHCONTROL normalized_depth_control) const {
  assert_true(shader.type() == xenos::ShaderType::kPixel);
  assert_true(shader.is_ucode_analyzed());
  const auto& regs = register_file_;

  DxbcShaderTranslator::Modification modification(
      shader_translator_->GetDefaultPixelShaderModification(
          shader.GetDynamicAddressableRegisterCount(
              regs.Get<reg::SQ_PROGRAM_CNTL>().ps_num_reg)));

  modification.pixel.interpolator_mask = interpolator_mask;
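  // As the expression below implies, GetInterpolatorSamplingPattern returns a
  // mask of interpolators sampled at the pixel center, so its complement
  // selects the interpolators to sample at the centroid with MSAA.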
|
|
|
|
|
modification.pixel.interpolators_centroid =
|
|
|
|
|
interpolator_mask &
|
|
|
|
|
~xenos::GetInterpolatorSamplingPattern(
|
|
|
|
|
regs.Get<reg::RB_SURFACE_INFO>().msaa_samples,
|
|
|
|
|
regs.Get<reg::SQ_CONTEXT_MISC>().sc_sample_cntl,
|
|
|
|
|
regs.Get<reg::SQ_INTERPOLATOR_CNTL>().sampling_pattern);
|
|
|
|
|
|
|
|
|
|
if (param_gen_pos < xenos::kMaxInterpolators) {
|
|
|
|
|
modification.pixel.param_gen_enable = 1;
|
|
|
|
|
modification.pixel.param_gen_interpolator = param_gen_pos;
|
|
|
|
|
modification.pixel.param_gen_point =
|
|
|
|
|
uint32_t(regs.Get<reg::VGT_DRAW_INITIATOR>().prim_type ==
|
|
|
|
|
xenos::PrimitiveType::kPointList);
|
|
|
|
|
} else {
|
|
|
|
|
modification.pixel.param_gen_enable = 0;
|
|
|
|
|
modification.pixel.param_gen_interpolator = 0;
|
|
|
|
|
modification.pixel.param_gen_point = 0;
|
|
|
|
|
}
|
|
|
|
|
|
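  // Depth/stencil mode selection below: on the host render target path,
  // Direct3D 12 can't write 20e4 floating-point depth (kD24FS8) natively, so
  // when the render target cache asks for it, the conversion is performed in
  // the pixel shader, rounding or truncating the host float32 depth.
  // Otherwise an early depth/stencil hint is used when it's provably safe
  // (no alpha-dependent coverage tied to a color target 0 write).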
  if (render_target_cache_.GetPath() ==
      RenderTargetCache::Path::kHostRenderTargets) {
    using DepthStencilMode =
        DxbcShaderTranslator::Modification::DepthStencilMode;
    if (render_target_cache_.depth_float24_convert_in_pixel_shader() &&
        normalized_depth_control.z_enable &&
        regs.Get<reg::RB_DEPTH_INFO>().depth_format ==
            xenos::DepthRenderTargetFormat::kD24FS8) {
      modification.pixel.depth_stencil_mode =
          render_target_cache_.depth_float24_round()
              ? DepthStencilMode::kFloat24Rounding
              : DepthStencilMode::kFloat24Truncating;
    } else {
      if (shader.implicit_early_z_write_allowed() &&
          (!shader.writes_color_target(0) ||
           !draw_util::DoesCoverageDependOnAlpha(
               regs.Get<reg::RB_COLORCONTROL>()))) {
        modification.pixel.depth_stencil_mode = DepthStencilMode::kEarlyHint;
      } else {
        modification.pixel.depth_stencil_mode = DepthStencilMode::kNoModifiers;
      }
    }
  }

  return modification;
}

bool PipelineCache::ConfigurePipeline(
    D3D12Shader::D3D12Translation* vertex_shader,
    D3D12Shader::D3D12Translation* pixel_shader,
    const PrimitiveProcessor::ProcessingResult& primitive_processing_result,
    reg::RB_DEPTHCONTROL normalized_depth_control,
    uint32_t normalized_color_mask,
    uint32_t bound_depth_and_color_render_target_bits,
    const uint32_t* bound_depth_and_color_render_target_formats,
    void** pipeline_handle_out, ID3D12RootSignature** root_signature_out) {
#if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
  SCOPE_profile_cpu_f("gpu");
#endif  // XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES

  assert_not_null(pipeline_handle_out);
  assert_not_null(root_signature_out);

  // Ensure shaders are translated - needed now for GetCurrentStateDescription.
  // Edge flags are not supported yet (because polygon primitives are not).
  assert_true(register_file_.Get<reg::SQ_PROGRAM_CNTL>().vs_export_mode !=
                  xenos::VertexShaderExportMode::kPosition2VectorsEdge &&
              register_file_.Get<reg::SQ_PROGRAM_CNTL>().vs_export_mode !=
                  xenos::VertexShaderExportMode::kPosition2VectorsEdgeKill);
  assert_false(register_file_.Get<reg::SQ_PROGRAM_CNTL>().gen_index_vtx);
  if (!vertex_shader->is_translated()) {
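    // The is_ucode_analyzed() check is intentionally done at the call site
    // rather than inside AnalyzeUcode: per the profiling notes in the commit
    // history, the check is usually true, so hoisting it avoids paying the
    // callee's stack frame setup on the common path.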
    if (!vertex_shader->shader().is_ucode_analyzed()) {
      vertex_shader->shader().AnalyzeUcode(ucode_disasm_buffer_);
    }
    if (!TranslateAnalyzedShader(*shader_translator_, *vertex_shader,
                                 dxbc_converter_, dxc_utils_, dxc_compiler_)) {
      XELOGE("Failed to translate the vertex shader!");
      return false;
    }
    if (shader_storage_file_ && vertex_shader->shader().ucode_storage_index() !=
                                    shader_storage_index_) {
      vertex_shader->shader().set_ucode_storage_index(shader_storage_index_);
      assert_not_null(storage_write_thread_);
      shader_storage_file_flush_needed_ = true;
      {
        std::lock_guard<std::mutex> lock(storage_write_request_lock_);
        storage_write_shader_queue_.push_back(&vertex_shader->shader());
      }
      storage_write_request_cond_.notify_all();
    }
  }
  if (!vertex_shader->is_valid()) {
    // Translation attempted previously, but not valid.
    return false;
  }
  if (pixel_shader != nullptr) {
    if (!pixel_shader->is_translated()) {
      if (!pixel_shader->shader().is_ucode_analyzed()) {
        pixel_shader->shader().AnalyzeUcode(ucode_disasm_buffer_);
      }
      if (!TranslateAnalyzedShader(*shader_translator_, *pixel_shader,
                                   dxbc_converter_, dxc_utils_,
                                   dxc_compiler_)) {
        XELOGE("Failed to translate the pixel shader!");
        return false;
      }
      if (shader_storage_file_ &&
          pixel_shader->shader().ucode_storage_index() !=
              shader_storage_index_) {
        pixel_shader->shader().set_ucode_storage_index(shader_storage_index_);
        assert_not_null(storage_write_thread_);
        shader_storage_file_flush_needed_ = true;
        {
          std::lock_guard<std::mutex> lock(storage_write_request_lock_);
          storage_write_shader_queue_.push_back(&pixel_shader->shader());
        }
        storage_write_request_cond_.notify_all();
      }
    }
    if (!pixel_shader->is_valid()) {
      // Translation attempted previously, but not valid.
      return false;
    }
  }

  PipelineRuntimeDescription runtime_description;
  if (!GetCurrentStateDescription(
          vertex_shader, pixel_shader, primitive_processing_result,
          normalized_depth_control, normalized_color_mask,
          bound_depth_and_color_render_target_bits,
          bound_depth_and_color_render_target_formats, runtime_description)) {
    return false;
  }
  PipelineDescription& description = runtime_description.description;

  if (current_pipeline_ != nullptr &&
      current_pipeline_->description.description == description) {
    *pipeline_handle_out = current_pipeline_;
    *root_signature_out = runtime_description.root_signature;
    return true;
  }

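  // The lookup below is hash-then-verify: pipelines_ is keyed by the XXH3
  // hash of the description, and equal_range may return several candidates
  // on a collision, so the full description is compared before a cached
  // pipeline is reused.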
  // Find an existing pipeline in the cache.
  uint64_t hash = XXH3_64bits(&description, sizeof(description));
  auto found_range = pipelines_.equal_range(hash);
  for (auto it = found_range.first; it != found_range.second; ++it) {
    Pipeline* found_pipeline = it->second;
    if (found_pipeline->description.description == description) {
      current_pipeline_ = found_pipeline;
      *pipeline_handle_out = found_pipeline;
      *root_signature_out = found_pipeline->description.root_signature;
      return true;
    }
  }

  Pipeline* new_pipeline = new Pipeline;
  new_pipeline->state = nullptr;
  std::memcpy(&new_pipeline->description, &runtime_description,
              sizeof(runtime_description));
  pipelines_.emplace(hash, new_pipeline);
  COUNT_profile_set("gpu/pipeline_cache/pipelines", pipelines_.size());

  if (!creation_threads_.empty()) {
    // Submit the pipeline for creation to any available thread.
    {
      std::lock_guard<xe_mutex> lock(creation_request_lock_);
      creation_queue_.push_back(new_pipeline);
    }
    creation_request_cond_.notify_one();
  } else {
    new_pipeline->state = CreateD3D12Pipeline(runtime_description);
  }

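  // Persist the new pipeline for future runs: the description and its hash
  // are queued to the storage write thread, which appends them to the
  // pipeline storage file so the pipeline can be created ahead of time the
  // next time the title is loaded.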
  if (pipeline_storage_file_) {
    assert_not_null(storage_write_thread_);
    pipeline_storage_file_flush_needed_ = true;
    {
      std::lock_guard<std::mutex> lock(storage_write_request_lock_);
      storage_write_pipeline_queue_.emplace_back();
      PipelineStoredDescription& stored_description =
          storage_write_pipeline_queue_.back();
      stored_description.description_hash = hash;
      std::memcpy(&stored_description.description, &description,
                  sizeof(description));
    }
    storage_write_request_cond_.notify_all();
  }

  current_pipeline_ = new_pipeline;
  *pipeline_handle_out = new_pipeline;
  *root_signature_out = runtime_description.root_signature;
  return true;
}

bool PipelineCache::TranslateAnalyzedShader(
    DxbcShaderTranslator& translator,
    D3D12Shader::D3D12Translation& translation, IDxbcConverter* dxbc_converter,
    IDxcUtils* dxc_utils, IDxcCompiler* dxc_compiler) {
  D3D12Shader& shader = static_cast<D3D12Shader&>(translation.shader());

  // Perform translation.
  // If this fails the shader will be marked as invalid and ignored later.
  if (!translator.TranslateAnalyzedShader(translation)) {
    XELOGE("Shader {:016X} translation failed; marking as ignored",
           shader.ucode_data_hash());
    return false;
  }

  const char* host_shader_type;
  if (shader.type() == xenos::ShaderType::kVertex) {
    DxbcShaderTranslator::Modification modification(
        translation.modification());
    switch (modification.vertex.host_vertex_shader_type) {
      case Shader::HostVertexShaderType::kLineDomainCPIndexed:
        host_shader_type = "control-point-indexed line domain";
        break;
      case Shader::HostVertexShaderType::kLineDomainPatchIndexed:
        host_shader_type = "patch-indexed line domain";
        break;
      case Shader::HostVertexShaderType::kTriangleDomainCPIndexed:
        host_shader_type = "control-point-indexed triangle domain";
        break;
      case Shader::HostVertexShaderType::kTriangleDomainPatchIndexed:
        host_shader_type = "patch-indexed triangle domain";
        break;
      case Shader::HostVertexShaderType::kQuadDomainCPIndexed:
        host_shader_type = "control-point-indexed quad domain";
        break;
      case Shader::HostVertexShaderType::kQuadDomainPatchIndexed:
        host_shader_type = "patch-indexed quad domain";
        break;
      default:
        assert(modification.vertex.host_vertex_shader_type ==
               Shader::HostVertexShaderType::kVertex);
        host_shader_type = "vertex";
    }
  } else {
    host_shader_type = "pixel";
  }
  XELOGGPU("Generated {} shader ({}b) - hash {:016X}:\n{}\n", host_shader_type,
           shader.ucode_dword_count() * sizeof(uint32_t),
           shader.ucode_data_hash(), shader.ucode_disassembly().c_str());

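  // Binding layout deduplication: texture and bindless-sampler layouts are
  // content-hashed and assigned small user UIDs, letting the command
  // processor compare UIDs instead of whole binding vectors when deciding
  // whether descriptor layouts changed between shaders.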
  // Set up texture and sampler binding layouts.
  if (shader.EnterBindingLayoutUserUIDSetup()) {
    const std::vector<D3D12Shader::TextureBinding>& texture_bindings =
        shader.GetTextureBindingsAfterTranslation();
    size_t texture_binding_count = texture_bindings.size();
    const std::vector<D3D12Shader::SamplerBinding>& sampler_bindings =
        shader.GetSamplerBindingsAfterTranslation();
    size_t sampler_binding_count = sampler_bindings.size();
    assert_false(bindless_resources_used_ &&
                 texture_binding_count + sampler_binding_count >
                     D3D12_REQ_CONSTANT_BUFFER_ELEMENT_COUNT * 4);
    size_t texture_binding_layout_bytes =
        texture_binding_count * sizeof(*texture_bindings.data());
    uint64_t texture_binding_layout_hash = 0;
    if (texture_binding_count) {
      texture_binding_layout_hash =
          XXH3_64bits(texture_bindings.data(), texture_binding_layout_bytes);
    }
    size_t bindless_sampler_count =
        bindless_resources_used_ ? sampler_binding_count : 0;
    uint64_t bindless_sampler_layout_hash = 0;
    if (bindless_sampler_count) {
      XXH3_state_t hash_state;
      XXH3_64bits_reset(&hash_state);
      for (size_t i = 0; i < bindless_sampler_count; ++i) {
        XXH3_64bits_update(
            &hash_state, &sampler_bindings[i].bindless_descriptor_index,
            sizeof(sampler_bindings[i].bindless_descriptor_index));
      }
      bindless_sampler_layout_hash = XXH3_64bits_digest(&hash_state);
    }
    // Obtain the unique IDs of binding layouts if there are any texture
    // bindings or bindless samplers, for invalidation in the command
    // processor.
    size_t texture_binding_layout_uid = kLayoutUIDEmpty;
    // Use sampler count for the bindful case because it's the only thing that
    // must be the same for layouts to be compatible in this case
    // (instruction-specified parameters are used as overrides for actual
    // samplers).
    static_assert(
        kLayoutUIDEmpty == 0,
        "Empty layout UID is assumed to be 0 because for bindful samplers, the "
        "UID is their count");
    size_t sampler_binding_layout_uid =
        bindless_resources_used_ ? kLayoutUIDEmpty : sampler_binding_count;
    if (texture_binding_count || bindless_sampler_count) {
      std::lock_guard<std::mutex> layouts_lock(layouts_mutex_);
      if (texture_binding_count) {
        auto found_range = texture_binding_layout_map_.equal_range(
            texture_binding_layout_hash);
        for (auto it = found_range.first; it != found_range.second; ++it) {
          if (it->second.vector_span_length == texture_binding_count &&
              !std::memcmp(texture_binding_layouts_.data() +
                               it->second.vector_span_offset,
                           texture_bindings.data(),
                           texture_binding_layout_bytes)) {
            texture_binding_layout_uid = it->second.uid;
            break;
          }
        }
        if (texture_binding_layout_uid == kLayoutUIDEmpty) {
          static_assert(
              kLayoutUIDEmpty == 0,
              "Layout UID is size + 1 because it's assumed that 0 is the UID "
              "for an empty layout");
          texture_binding_layout_uid = texture_binding_layout_map_.size() + 1;
          LayoutUID new_uid;
          new_uid.uid = texture_binding_layout_uid;
          new_uid.vector_span_offset = texture_binding_layouts_.size();
          new_uid.vector_span_length = texture_binding_count;
          texture_binding_layouts_.resize(new_uid.vector_span_offset +
                                          texture_binding_count);
          std::memcpy(
              texture_binding_layouts_.data() + new_uid.vector_span_offset,
              texture_bindings.data(), texture_binding_layout_bytes);
          texture_binding_layout_map_.emplace(texture_binding_layout_hash,
                                              new_uid);
        }
      }
      if (bindless_sampler_count) {
        // The map is keyed by the layout hash (it's also what the emplace
        // below inserts), so look up the hash here, not the UID.
        auto found_range = bindless_sampler_layout_map_.equal_range(
            bindless_sampler_layout_hash);
        for (auto it = found_range.first; it != found_range.second; ++it) {
          if (it->second.vector_span_length != bindless_sampler_count) {
            continue;
          }
          sampler_binding_layout_uid = it->second.uid;
          const uint32_t* vector_bindless_sampler_layout =
              bindless_sampler_layouts_.data() + it->second.vector_span_offset;
          for (size_t i = 0; i < bindless_sampler_count; ++i) {
            if (vector_bindless_sampler_layout[i] !=
                sampler_bindings[i].bindless_descriptor_index) {
              sampler_binding_layout_uid = kLayoutUIDEmpty;
              break;
            }
          }
          if (sampler_binding_layout_uid != kLayoutUIDEmpty) {
            break;
          }
        }
        if (sampler_binding_layout_uid == kLayoutUIDEmpty) {
          static_assert(
              kLayoutUIDEmpty == 0,
              "Layout UID is size + 1 because it's assumed that 0 is the UID "
              "for an empty layout");
          // Assign size() + 1 consistently both to the shader's UID and to
          // the stored map entry, mirroring the texture path, so repeated
          // lookups of the same layout return the same UID.
          sampler_binding_layout_uid = bindless_sampler_layout_map_.size() + 1;
          LayoutUID new_uid;
          new_uid.uid = sampler_binding_layout_uid;
          new_uid.vector_span_offset = bindless_sampler_layouts_.size();
          new_uid.vector_span_length = sampler_binding_count;
          bindless_sampler_layouts_.resize(new_uid.vector_span_offset +
                                           sampler_binding_count);
          uint32_t* vector_bindless_sampler_layout =
              bindless_sampler_layouts_.data() + new_uid.vector_span_offset;
          for (size_t i = 0; i < bindless_sampler_count; ++i) {
            vector_bindless_sampler_layout[i] =
                sampler_bindings[i].bindless_descriptor_index;
          }
          bindless_sampler_layout_map_.emplace(bindless_sampler_layout_hash,
                                               new_uid);
        }
      }
    }
    shader.SetTextureBindingLayoutUserUID(texture_binding_layout_uid);
    shader.SetSamplerBindingLayoutUserUID(sampler_binding_layout_uid);
  }

  // Disassemble the shader for dumping.
  const ui::d3d12::D3D12Provider& provider =
      command_processor_.GetD3D12Provider();
  if (cvars::d3d12_dxbc_disasm_dxilconv) {
    translation.DisassembleDxbcAndDxil(provider, cvars::d3d12_dxbc_disasm,
                                       dxbc_converter, dxc_utils,
                                       dxc_compiler);
  } else {
    translation.DisassembleDxbcAndDxil(provider, cvars::d3d12_dxbc_disasm);
  }

  // Dump shader files if desired.
  if (!cvars::dump_shaders.empty()) {
    bool edram_rov_used = render_target_cache_.GetPath() ==
                          RenderTargetCache::Path::kPixelShaderInterlock;
    translation.Dump(cvars::dump_shaders,
                     (shader.type() == xenos::ShaderType::kPixel)
                         ? (edram_rov_used ? "d3d12_rov" : "d3d12_rtv")
                         : "d3d12");
  }

  return translation.is_valid();
}

bool PipelineCache::GetCurrentStateDescription(
    D3D12Shader::D3D12Translation* vertex_shader,
    D3D12Shader::D3D12Translation* pixel_shader,
    const PrimitiveProcessor::ProcessingResult& primitive_processing_result,
    reg::RB_DEPTHCONTROL normalized_depth_control,
    uint32_t normalized_color_mask,
    uint32_t bound_depth_and_color_render_target_bits,
    const uint32_t* bound_depth_and_color_render_target_formats,
    PipelineRuntimeDescription& runtime_description_out) {
  // Translated shaders needed at least for the root signature.
  assert_true(vertex_shader->is_translated() && vertex_shader->is_valid());
  assert_true(!pixel_shader ||
              (pixel_shader->is_translated() && pixel_shader->is_valid()));

  PipelineDescription& description_out = runtime_description_out.description;

  const auto& regs = register_file_;
  auto pa_su_sc_mode_cntl = regs.Get<reg::PA_SU_SC_MODE_CNTL>();

  // Initialize all unused fields to zero for comparison/hashing.
  std::memset(&runtime_description_out, 0, sizeof(runtime_description_out));

  assert_true(DxbcShaderTranslator::Modification(vertex_shader->modification())
                  .vertex.host_vertex_shader_type ==
              primitive_processing_result.host_vertex_shader_type);
  bool tessellated = primitive_processing_result.IsTessellated();
  bool primitive_polygonal = draw_util::IsPrimitivePolygonal(regs);
  bool rasterization_enabled =
      draw_util::IsRasterizationPotentiallyDone(regs, primitive_polygonal);
  // In Direct3D, rasterization (along with pixel counting) is disabled by
  // disabling the pixel shader and depth / stencil. However, if rasterization
  // should be disabled, the pixel shader must be disabled externally, to
  // ensure things like the texture binding layout are correct for the shader
  // actually being used (don't replace anything here).
  if (!rasterization_enabled) {
    assert_null(pixel_shader);
    if (pixel_shader) {
      return false;
    }
  }

  bool edram_rov_used = render_target_cache_.GetPath() ==
                        RenderTargetCache::Path::kPixelShaderInterlock;

  // Root signature.
  runtime_description_out.root_signature = command_processor_.GetRootSignature(
      static_cast<const DxbcShader*>(&vertex_shader->shader()),
      pixel_shader ? static_cast<const DxbcShader*>(&pixel_shader->shader())
                   : nullptr,
      tessellated);
  if (runtime_description_out.root_signature == nullptr) {
    return false;
  }

  // Vertex shader.
  runtime_description_out.vertex_shader = vertex_shader;
  description_out.vertex_shader_hash =
      vertex_shader->shader().ucode_data_hash();
  description_out.vertex_shader_modification = vertex_shader->modification();

  // Index buffer strip cut value.
  if (primitive_processing_result.host_primitive_reset_enabled) {
    description_out.strip_cut_index =
        primitive_processing_result.host_index_format ==
                xenos::IndexFormat::kInt16
            ? PipelineStripCutIndex::kFFFF
            : PipelineStripCutIndex::kFFFFFFFF;
  } else {
    description_out.strip_cut_index = PipelineStripCutIndex::kNone;
  }

  // Host vertex shader type and primitive topology.
  if (tessellated) {
    description_out.primitive_topology_type_or_tessellation_mode =
        uint32_t(primitive_processing_result.tessellation_mode);
  } else {
    switch (primitive_processing_result.host_primitive_type) {
      case xenos::PrimitiveType::kPointList:
        description_out.primitive_topology_type_or_tessellation_mode =
            uint32_t(PipelinePrimitiveTopologyType::kPoint);
        break;
      case xenos::PrimitiveType::kLineList:
      case xenos::PrimitiveType::kLineStrip:
      // Quads are emulated as line lists with adjacency.
      case xenos::PrimitiveType::kQuadList:
      case xenos::PrimitiveType::k2DLineStrip:
        description_out.primitive_topology_type_or_tessellation_mode =
            uint32_t(PipelinePrimitiveTopologyType::kLine);
        break;
      default:
        description_out.primitive_topology_type_or_tessellation_mode =
            uint32_t(PipelinePrimitiveTopologyType::kTriangle);
        break;
    }
    switch (primitive_processing_result.host_primitive_type) {
      case xenos::PrimitiveType::kPointList:
        description_out.geometry_shader = PipelineGeometryShader::kPointList;
        break;
      case xenos::PrimitiveType::kRectangleList:
        description_out.geometry_shader =
            PipelineGeometryShader::kRectangleList;
        break;
      case xenos::PrimitiveType::kQuadList:
        description_out.geometry_shader = PipelineGeometryShader::kQuadList;
        break;
      default:
        description_out.geometry_shader = PipelineGeometryShader::kNone;
        break;
    }
  }
  GeometryShaderKey geometry_shader_key;
  runtime_description_out.geometry_shader =
      GetGeometryShaderKey(
          description_out.geometry_shader,
          DxbcShaderTranslator::Modification(vertex_shader->modification()),
          DxbcShaderTranslator::Modification(
              pixel_shader ? pixel_shader->modification() : 0),
          geometry_shader_key)
          ? &GetGeometryShader(geometry_shader_key)
          : nullptr;
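  // The geometry shader chosen above emulates Xenos-specific primitives on
  // the host - expanding point sprites, rectangle lists and quad lists into
  // triangles - which is why it's keyed off the host primitive type rather
  // than anything in the guest shaders.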
  // The rest doesn't matter when rasterization is disabled (thus no writing to
  // anywhere from post-geometry stages and no samples are counted).
  if (!rasterization_enabled) {
    description_out.cull_mode = PipelineCullMode::kDisableRasterization;
    return true;
  }

  // Pixel shader.
  if (pixel_shader) {
    runtime_description_out.pixel_shader = pixel_shader;
    description_out.pixel_shader_hash =
        pixel_shader->shader().ucode_data_hash();
    description_out.pixel_shader_modification = pixel_shader->modification();
  }

  // Rasterizer state.
  // Because Direct3D 12 doesn't support per-side fill mode and depth bias, the
  // values to use depend on the current culling state.
  // If front faces are culled, use the ones for back faces.
  // If back faces are culled, it's the other way around.
  // If culling is not enabled, assume the developer wanted to draw things in a
  // more special way - so if one side is wireframe or has a depth bias, then
  // that's intentional (if both sides have a depth bias, the one for the front
  // faces is used, though it's unlikely that they will ever be different -
  // SetRenderState sets the same offset for both sides).
  // Points fill mode (0) also isn't supported in Direct3D 12, but assume the
  // developer didn't want to fill the whole primitive and use wireframe (like
  // Xenos fill mode 1).
  // Here we also assume that only one side is culled - if two sides are
  // culled, rasterization will be disabled externally, or the draw call will
  // be dropped early if the vertex shader doesn't export to memory.
  bool cull_front, cull_back;
  if (primitive_polygonal) {
    description_out.front_counter_clockwise = pa_su_sc_mode_cntl.face == 0;
    cull_front = pa_su_sc_mode_cntl.cull_front != 0;
    cull_back = pa_su_sc_mode_cntl.cull_back != 0;
    if (cull_front) {
      // The case when both faces are culled should be handled by disabling
      // rasterization.
      assert_false(cull_back);
      description_out.cull_mode = PipelineCullMode::kFront;
    } else if (cull_back) {
      description_out.cull_mode = PipelineCullMode::kBack;
    } else {
      description_out.cull_mode = PipelineCullMode::kNone;
    }
    // With ROV, the depth bias is applied in the pixel shader because
    // per-sample depth is needed for MSAA.
    if (!cull_front) {
      // Front faces aren't culled.
      // Direct3D 12, unfortunately, doesn't support point fill mode.
      if (pa_su_sc_mode_cntl.polymode_front_ptype !=
          xenos::PolygonType::kTriangles) {
        description_out.fill_mode_wireframe = 1;
      }
    }
    if (!cull_back) {
      // Back faces aren't culled.
      if (pa_su_sc_mode_cntl.polymode_back_ptype !=
          xenos::PolygonType::kTriangles) {
        description_out.fill_mode_wireframe = 1;
      }
    }
    if (pa_su_sc_mode_cntl.poly_mode != xenos::PolygonModeEnable::kDualMode) {
      description_out.fill_mode_wireframe = 0;
    }
  } else {
    // Filled front faces only, without culling.
    cull_front = false;
    cull_back = false;
  }
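  // Depth bias note: the guest programs the slope-scaled polygon offset in
  // subpixel units, so it's rescaled via
  // xenos::kPolygonOffsetScaleSubpixelUnit, while the constant offset is
  // converted to a D3D10-style integer bias that depends on the depth buffer
  // format.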
  if (!edram_rov_used) {
    float polygon_offset, polygon_offset_scale;
    draw_util::GetPreferredFacePolygonOffset(
        regs, primitive_polygonal, polygon_offset_scale, polygon_offset);
    description_out.depth_bias = draw_util::GetD3D10IntegerPolygonOffset(
        regs.Get<reg::RB_DEPTH_INFO>().depth_format, polygon_offset);
    description_out.depth_bias_slope_scaled =
        polygon_offset_scale * xenos::kPolygonOffsetScaleSubpixelUnit;
  }
  if (tessellated && cvars::d3d12_tessellation_wireframe) {
    description_out.fill_mode_wireframe = 1;
  }
  description_out.depth_clip = !regs.Get<reg::PA_CL_CLIP_CNTL>().clip_disable;
  bool depth_stencil_bound_and_used = false;
  if (!edram_rov_used) {
    // Depth/stencil. No stencil, always passing depth test and no depth
    // writing means depth disabled.
    if (bound_depth_and_color_render_target_bits & 1) {
      if (normalized_depth_control.z_enable) {
        description_out.depth_func = normalized_depth_control.zfunc;
        description_out.depth_write = normalized_depth_control.z_write_enable;
      } else {
        description_out.depth_func = xenos::CompareFunction::kAlways;
      }
      if (normalized_depth_control.stencil_enable) {
        description_out.stencil_enable = 1;
        bool stencil_backface_enable =
            primitive_polygonal && normalized_depth_control.backface_enable;
        // Per-face masks not supported by Direct3D 12, choose the back face
        // ones only if drawing only back faces.
        Register stencil_ref_mask_reg;
        if (stencil_backface_enable && cull_front) {
          stencil_ref_mask_reg = XE_GPU_REG_RB_STENCILREFMASK_BF;
        } else {
          stencil_ref_mask_reg = XE_GPU_REG_RB_STENCILREFMASK;
        }
        auto stencil_ref_mask =
            regs.Get<reg::RB_STENCILREFMASK>(stencil_ref_mask_reg);
        description_out.stencil_read_mask = stencil_ref_mask.stencilmask;
        description_out.stencil_write_mask = stencil_ref_mask.stencilwritemask;
        description_out.stencil_front_fail_op =
            normalized_depth_control.stencilfail;
        description_out.stencil_front_depth_fail_op =
            normalized_depth_control.stencilzfail;
        description_out.stencil_front_pass_op =
            normalized_depth_control.stencilzpass;
        description_out.stencil_front_func =
            normalized_depth_control.stencilfunc;
        if (stencil_backface_enable) {
          description_out.stencil_back_fail_op =
              normalized_depth_control.stencilfail_bf;
          description_out.stencil_back_depth_fail_op =
              normalized_depth_control.stencilzfail_bf;
          description_out.stencil_back_pass_op =
              normalized_depth_control.stencilzpass_bf;
          description_out.stencil_back_func =
              normalized_depth_control.stencilfunc_bf;
        } else {
          description_out.stencil_back_fail_op =
              description_out.stencil_front_fail_op;
          description_out.stencil_back_depth_fail_op =
              description_out.stencil_front_depth_fail_op;
          description_out.stencil_back_pass_op =
              description_out.stencil_front_pass_op;
          description_out.stencil_back_func =
              description_out.stencil_front_func;
        }
      }
      // If not binding the DSV, ignore the format in the hash.
      if (description_out.depth_func != xenos::CompareFunction::kAlways ||
          description_out.depth_write || description_out.stencil_enable) {
        description_out.depth_format = xenos::DepthRenderTargetFormat(
            bound_depth_and_color_render_target_formats[0]);
        depth_stencil_bound_and_used = true;
      }
    } else {
      description_out.depth_func = xenos::CompareFunction::kAlways;
    }

    // Render targets and blending state. 32 because of 0x1F mask, for safety
    // (all unknown to zero).
    static const PipelineBlendFactor kBlendFactorMap[32] = {
        /* 0 */ PipelineBlendFactor::kZero,
        /* 1 */ PipelineBlendFactor::kOne,
        /* 2 */ PipelineBlendFactor::kZero,  // ?
        /* 3 */ PipelineBlendFactor::kZero,  // ?
        /* 4 */ PipelineBlendFactor::kSrcColor,
        /* 5 */ PipelineBlendFactor::kInvSrcColor,
        /* 6 */ PipelineBlendFactor::kSrcAlpha,
        /* 7 */ PipelineBlendFactor::kInvSrcAlpha,
        /* 8 */ PipelineBlendFactor::kDestColor,
        /* 9 */ PipelineBlendFactor::kInvDestColor,
        /* 10 */ PipelineBlendFactor::kDestAlpha,
        /* 11 */ PipelineBlendFactor::kInvDestAlpha,
        // CONSTANT_COLOR
        /* 12 */ PipelineBlendFactor::kBlendFactor,
        // ONE_MINUS_CONSTANT_COLOR
        /* 13 */ PipelineBlendFactor::kInvBlendFactor,
        // CONSTANT_ALPHA
        /* 14 */ PipelineBlendFactor::kBlendFactor,
        // ONE_MINUS_CONSTANT_ALPHA
        /* 15 */ PipelineBlendFactor::kInvBlendFactor,
        /* 16 */ PipelineBlendFactor::kSrcAlphaSat,
    };
    // Like kBlendFactorMap, but with color modes changed to alpha. Some
    // pipelines aren't created in 545407E0 because a color mode is used for
    // alpha.
    static const PipelineBlendFactor kBlendFactorAlphaMap[32] = {
        /* 0 */ PipelineBlendFactor::kZero,
        /* 1 */ PipelineBlendFactor::kOne,
        /* 2 */ PipelineBlendFactor::kZero,  // ?
        /* 3 */ PipelineBlendFactor::kZero,  // ?
        /* 4 */ PipelineBlendFactor::kSrcAlpha,
        /* 5 */ PipelineBlendFactor::kInvSrcAlpha,
        /* 6 */ PipelineBlendFactor::kSrcAlpha,
        /* 7 */ PipelineBlendFactor::kInvSrcAlpha,
        /* 8 */ PipelineBlendFactor::kDestAlpha,
        /* 9 */ PipelineBlendFactor::kInvDestAlpha,
        /* 10 */ PipelineBlendFactor::kDestAlpha,
        /* 11 */ PipelineBlendFactor::kInvDestAlpha,
        // CONSTANT_COLOR
        /* 12 */ PipelineBlendFactor::kBlendFactor,
        // ONE_MINUS_CONSTANT_COLOR
        /* 13 */ PipelineBlendFactor::kInvBlendFactor,
        // CONSTANT_ALPHA
        /* 14 */ PipelineBlendFactor::kBlendFactor,
        // ONE_MINUS_CONSTANT_ALPHA
        /* 15 */ PipelineBlendFactor::kInvBlendFactor,
        /* 16 */ PipelineBlendFactor::kSrcAlphaSat,
    };
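    // For example, guest factor 14 (CONSTANT_ALPHA) resolves to kBlendFactor
    // in both tables: Direct3D 12 exposes a single blend constant, so
    // constant-color and constant-alpha factors share it, with the constant
    // value itself supplied at draw time.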
2021-04-26 21:12:09 +02:00
|
|
|
// While it's okay to specify fewer render targets in the pipeline state
|
|
|
|
|
// (even fewer than written by the shader) than actually bound to the
|
|
|
|
|
// command list (though this kind of truncation may only happen at the end -
|
|
|
|
|
// DXGI_FORMAT_UNKNOWN *requires* a null RTV descriptor to be bound), not
|
|
|
|
|
// doing that because sample counts of all render targets bound via
|
|
|
|
|
// OMSetRenderTargets, even those beyond NumRenderTargets, apparently must
|
|
|
|
|
// have their sample count matching the one set in the pipeline - however if
|
|
|
|
|
// we set NumRenderTargets to 0 and also disable depth / stencil, the sample
|
|
|
|
|
// count must be set to 1 - while the command list may still have
|
2021-09-05 20:03:05 +02:00
|
|
|
// multisampled render targets bound (happens in 4D5307E6 main menu).
|
2021-04-26 21:12:09 +02:00
|
|
|
// TODO(Triang3l): Investigate interaction of OMSetRenderTargets with
|
|
|
|
|
// non-null depth and DSVFormat DXGI_FORMAT_UNKNOWN in the same case.
    for (uint32_t i = 0; i < 4; ++i) {
      if (!(bound_depth_and_color_render_target_bits &
            (uint32_t(1) << (1 + i)))) {
        continue;
      }
      PipelineRenderTarget& rt = description_out.render_targets[i];
      rt.used = 1;
      auto color_info = regs.Get<reg::RB_COLOR_INFO>(
          reg::RB_COLOR_INFO::rt_register_indices[i]);
      rt.format = xenos::ColorRenderTargetFormat(
          bound_depth_and_color_render_target_formats[1 + i]);
      rt.write_mask = (normalized_color_mask >> (i * 4)) & 0xF;
      if (rt.write_mask) {
        auto blendcontrol = regs.Get<reg::RB_BLENDCONTROL>(
            reg::RB_BLENDCONTROL::rt_register_indices[i]);
        rt.src_blend = kBlendFactorMap[uint32_t(blendcontrol.color_srcblend)];
        rt.dest_blend = kBlendFactorMap[uint32_t(blendcontrol.color_destblend)];
        rt.blend_op = blendcontrol.color_comb_fcn;
        rt.src_blend_alpha =
            kBlendFactorAlphaMap[uint32_t(blendcontrol.alpha_srcblend)];
        rt.dest_blend_alpha =
            kBlendFactorAlphaMap[uint32_t(blendcontrol.alpha_destblend)];
        rt.blend_op_alpha = blendcontrol.alpha_comb_fcn;
      } else {
        rt.src_blend = PipelineBlendFactor::kOne;
        rt.dest_blend = PipelineBlendFactor::kZero;
        rt.blend_op = xenos::BlendOp::kAdd;
        rt.src_blend_alpha = PipelineBlendFactor::kOne;
        rt.dest_blend_alpha = PipelineBlendFactor::kZero;
        rt.blend_op_alpha = xenos::BlendOp::kAdd;
      }
    }
  }

  xenos::MsaaSamples host_msaa_samples =
      regs.Get<reg::RB_SURFACE_INFO>().msaa_samples;
  if (edram_rov_used) {
    if (host_msaa_samples == xenos::MsaaSamples::k2X) {
      // 2 is not supported in ForcedSampleCount on Nvidia.
      host_msaa_samples = xenos::MsaaSamples::k4X;
    }
  } else {
    if (!(bound_depth_and_color_render_target_bits & ~uint32_t(1)) &&
        !depth_stencil_bound_and_used) {
      // Direct3D 12 requires the sample count to be 1 when no color or depth /
      // stencil render targets are bound.
      // FIXME(Triang3l): Use ForcedSampleCount or some other fallback for
      // sample counting when needed, though with 2x it will be as incorrect as
      // with 1x / 4x anyway; or bind a dummy depth / stencil buffer if really
      // needed.
      host_msaa_samples = xenos::MsaaSamples::k1X;
    }
    // TODO(Triang3l): 4x MSAA fallback when 2x isn't supported.
  }
  description_out.host_msaa_samples = host_msaa_samples;

  return true;
}

bool PipelineCache::GetGeometryShaderKey(
    PipelineGeometryShader geometry_shader_type,
    DxbcShaderTranslator::Modification vertex_shader_modification,
    DxbcShaderTranslator::Modification pixel_shader_modification,
    GeometryShaderKey& key_out) {
  if (geometry_shader_type == PipelineGeometryShader::kNone) {
    return false;
  }
  assert_true(vertex_shader_modification.vertex.interpolator_mask ==
              pixel_shader_modification.pixel.interpolator_mask);
  GeometryShaderKey key;
  key.type = geometry_shader_type;
  key.interpolator_count =
      xe::bit_count(vertex_shader_modification.vertex.interpolator_mask);
  key.user_clip_plane_count =
      vertex_shader_modification.vertex.user_clip_plane_count;
  key.user_clip_plane_cull =
      vertex_shader_modification.vertex.user_clip_plane_cull;
  key.has_vertex_kill_and = vertex_shader_modification.vertex.vertex_kill_and;
  key.has_point_size = vertex_shader_modification.vertex.output_point_size;
  key.has_point_coordinates = pixel_shader_modification.pixel.param_gen_point;
  key_out = key;
  return true;
}
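
// As an illustrative example of the keys produced above (hypothetical values,
// not taken from any specific title): a point-list draw whose vertex shader
// writes four interpolators and a per-vertex point size, with no user clip
// planes and no vertex kill, would yield a key with type = kPointList,
// interpolator_count = 4, user_clip_plane_count = 0, user_clip_plane_cull = 0,
// has_vertex_kill_and = 0, has_point_size = 1, and has_point_coordinates set
// if the pixel shader consumes point sprite coordinates (param_gen_point).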

void PipelineCache::CreateDxbcGeometryShader(
    GeometryShaderKey key, std::vector<uint32_t>& shader_out) {
  shader_out.clear();

  // RDEF, ISGN, OSG5, SHEX, STAT.
  constexpr uint32_t kBlobCount = 5;

  // Allocate space for the container header and the blob offsets.
  shader_out.resize(sizeof(dxbc::ContainerHeader) / sizeof(uint32_t) +
                    kBlobCount);
  uint32_t blob_offset_position_dwords =
      sizeof(dxbc::ContainerHeader) / sizeof(uint32_t);
  uint32_t blob_position_dwords = uint32_t(shader_out.size());
  constexpr uint32_t kBlobHeaderSizeDwords =
      sizeof(dxbc::BlobHeader) / sizeof(uint32_t);
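  // The DXBC container being assembled has the following layout:
  //   dxbc::ContainerHeader (fourcc, checksum, total size, blob count);
  //   kBlobCount uint32_t blob offsets;
  //   for each blob, a dxbc::BlobHeader (fourcc and size in bytes) followed by
  //   the blob contents.
  // Each blob's offset is stored when the blob is started, and its size is
  // stored when the next blob (or the end of the container) is reached.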

  uint32_t name_ptr;

  // ***************************************************************************
  // Resource definition
  // ***************************************************************************

  shader_out[blob_offset_position_dwords] =
      uint32_t(blob_position_dwords * sizeof(uint32_t));
  uint32_t rdef_position_dwords = blob_position_dwords + kBlobHeaderSizeDwords;
  // Not needed, as the next operation done is resize, to allocate the space
  // for both the blob header and the resource definition header.
  // shader_out.resize(rdef_position_dwords);

  // RDEF header - the actual definitions will be written if needed.
  shader_out.resize(rdef_position_dwords +
                    sizeof(dxbc::RdefHeader) / sizeof(uint32_t));
  // Generator name.
  dxbc::AppendAlignedString(shader_out, "Xenia");
  {
    auto& rdef_header = *reinterpret_cast<dxbc::RdefHeader*>(
        shader_out.data() + rdef_position_dwords);
    rdef_header.shader_model = dxbc::RdefShaderModel::kGeometryShader5_1;
    rdef_header.compile_flags =
        dxbc::kCompileFlagNoPreshader | dxbc::kCompileFlagPreferFlowControl |
        dxbc::kCompileFlagIeeeStrictness | dxbc::kCompileFlagAllResourcesBound;
    // Generator name is right after the header.
    rdef_header.generator_name_ptr = sizeof(dxbc::RdefHeader);
    rdef_header.fourcc = dxbc::RdefHeader::FourCC::k5_1;
    rdef_header.InitializeSizes();
  }

  uint32_t system_cbuffer_size_vector_aligned_bytes = 0;

  if (key.type == PipelineGeometryShader::kPointList) {
    // Need point parameters from the system constants.

    // Constant types - float2 only.
    // Names.
    name_ptr =
        uint32_t((shader_out.size() - rdef_position_dwords) * sizeof(uint32_t));
    uint32_t rdef_name_ptr_float2 = name_ptr;
    name_ptr += dxbc::AppendAlignedString(shader_out, "float2");
    // Types.
    uint32_t rdef_type_float2_position_dwords = uint32_t(shader_out.size());
    uint32_t rdef_type_float2_ptr =
        uint32_t((rdef_type_float2_position_dwords - rdef_position_dwords) *
                 sizeof(uint32_t));
    shader_out.resize(rdef_type_float2_position_dwords +
                      sizeof(dxbc::RdefType) / sizeof(uint32_t));
    {
      auto& rdef_type_float2 = *reinterpret_cast<dxbc::RdefType*>(
          shader_out.data() + rdef_type_float2_position_dwords);
      rdef_type_float2.variable_class = dxbc::RdefVariableClass::kVector;
      rdef_type_float2.variable_type = dxbc::RdefVariableType::kFloat;
      rdef_type_float2.row_count = 1;
      rdef_type_float2.column_count = 2;
      rdef_type_float2.name_ptr = rdef_name_ptr_float2;
    }

    // Constants:
    // - float2 xe_point_constant_diameter
    // - float2 xe_point_screen_diameter_to_ndc_radius
    enum PointConstant : uint32_t {
      kPointConstantConstantDiameter,
      kPointConstantScreenDiameterToNDCRadius,
      kPointConstantCount,
    };
    // Names.
    name_ptr =
        uint32_t((shader_out.size() - rdef_position_dwords) * sizeof(uint32_t));
    uint32_t rdef_name_ptr_xe_point_constant_diameter = name_ptr;
    name_ptr +=
        dxbc::AppendAlignedString(shader_out, "xe_point_constant_diameter");
    uint32_t rdef_name_ptr_xe_point_screen_diameter_to_ndc_radius = name_ptr;
    name_ptr += dxbc::AppendAlignedString(
        shader_out, "xe_point_screen_diameter_to_ndc_radius");
    // Constants.
    uint32_t rdef_constants_position_dwords = uint32_t(shader_out.size());
    uint32_t rdef_constants_ptr =
        uint32_t((rdef_constants_position_dwords - rdef_position_dwords) *
                 sizeof(uint32_t));
    shader_out.resize(rdef_constants_position_dwords +
                      sizeof(dxbc::RdefVariable) / sizeof(uint32_t) *
                          kPointConstantCount);
    {
      auto rdef_constants = reinterpret_cast<dxbc::RdefVariable*>(
          shader_out.data() + rdef_constants_position_dwords);
      // float2 xe_point_constant_diameter
      static_assert(
          sizeof(DxbcShaderTranslator::SystemConstants ::
                     point_constant_diameter) == sizeof(float) * 2,
          "DxbcShaderTranslator point_constant_diameter system constant size "
          "differs between the shader translator and geometry shader "
          "generation");
      dxbc::RdefVariable& rdef_constant_point_constant_diameter =
          rdef_constants[kPointConstantConstantDiameter];
      rdef_constant_point_constant_diameter.name_ptr =
          rdef_name_ptr_xe_point_constant_diameter;
      rdef_constant_point_constant_diameter.start_offset_bytes = offsetof(
          DxbcShaderTranslator::SystemConstants, point_constant_diameter);
      rdef_constant_point_constant_diameter.size_bytes = sizeof(float) * 2;
      rdef_constant_point_constant_diameter.flags = dxbc::kRdefVariableFlagUsed;
      rdef_constant_point_constant_diameter.type_ptr = rdef_type_float2_ptr;
      rdef_constant_point_constant_diameter.start_texture = UINT32_MAX;
      rdef_constant_point_constant_diameter.start_sampler = UINT32_MAX;
      // float2 xe_point_screen_diameter_to_ndc_radius
      static_assert(
          sizeof(DxbcShaderTranslator::SystemConstants ::
                     point_screen_diameter_to_ndc_radius) == sizeof(float) * 2,
          "DxbcShaderTranslator point_screen_diameter_to_ndc_radius system "
          "constant size differs between the shader translator and geometry "
          "shader generation");
      dxbc::RdefVariable& rdef_constant_point_screen_diameter_to_ndc_radius =
          rdef_constants[kPointConstantScreenDiameterToNDCRadius];
      rdef_constant_point_screen_diameter_to_ndc_radius.name_ptr =
          rdef_name_ptr_xe_point_screen_diameter_to_ndc_radius;
      rdef_constant_point_screen_diameter_to_ndc_radius.start_offset_bytes =
          offsetof(DxbcShaderTranslator::SystemConstants,
                   point_screen_diameter_to_ndc_radius);
      rdef_constant_point_screen_diameter_to_ndc_radius.size_bytes =
          sizeof(float) * 2;
      rdef_constant_point_screen_diameter_to_ndc_radius.flags =
          dxbc::kRdefVariableFlagUsed;
      rdef_constant_point_screen_diameter_to_ndc_radius.type_ptr =
          rdef_type_float2_ptr;
      rdef_constant_point_screen_diameter_to_ndc_radius.start_texture =
          UINT32_MAX;
      rdef_constant_point_screen_diameter_to_ndc_radius.start_sampler =
          UINT32_MAX;
    }

    // Constant buffers - xe_system_cbuffer only.

    // Names.
    name_ptr =
        uint32_t((shader_out.size() - rdef_position_dwords) * sizeof(uint32_t));
    uint32_t rdef_name_ptr_xe_system_cbuffer = name_ptr;
    name_ptr += dxbc::AppendAlignedString(shader_out, "xe_system_cbuffer");
    // Constant buffers.
    uint32_t rdef_cbuffer_position_dwords = uint32_t(shader_out.size());
    shader_out.resize(rdef_cbuffer_position_dwords +
                      sizeof(dxbc::RdefCbuffer) / sizeof(uint32_t));
    {
      auto& rdef_cbuffer_system = *reinterpret_cast<dxbc::RdefCbuffer*>(
          shader_out.data() + rdef_cbuffer_position_dwords);
      rdef_cbuffer_system.name_ptr = rdef_name_ptr_xe_system_cbuffer;
      rdef_cbuffer_system.variable_count = kPointConstantCount;
      rdef_cbuffer_system.variables_ptr = rdef_constants_ptr;
      auto rdef_constants = reinterpret_cast<const dxbc::RdefVariable*>(
          shader_out.data() + rdef_constants_position_dwords);
      for (uint32_t i = 0; i < kPointConstantCount; ++i) {
        system_cbuffer_size_vector_aligned_bytes =
            std::max(system_cbuffer_size_vector_aligned_bytes,
                     rdef_constants[i].start_offset_bytes +
                         rdef_constants[i].size_bytes);
      }
      system_cbuffer_size_vector_aligned_bytes =
          xe::align(system_cbuffer_size_vector_aligned_bytes,
                    uint32_t(sizeof(uint32_t) * 4));
      rdef_cbuffer_system.size_vector_aligned_bytes =
          system_cbuffer_size_vector_aligned_bytes;
    }
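    // The size is aligned to a whole 16-byte vector because constant buffers
    // are addressed in float4 vectors - the cb# length in the declaration in
    // the shader code below is given in vectors, not in scalars. For instance,
    // if the farthest constant ended at byte 40, the declared size would be
    // 48 bytes, or 3 vectors.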

    // Bindings - xe_system_cbuffer only.
    uint32_t rdef_binding_position_dwords = uint32_t(shader_out.size());
    shader_out.resize(rdef_binding_position_dwords +
                      sizeof(dxbc::RdefInputBind) / sizeof(uint32_t));
    {
      auto& rdef_binding_cbuffer_system =
          *reinterpret_cast<dxbc::RdefInputBind*>(shader_out.data() +
                                                  rdef_binding_position_dwords);
      rdef_binding_cbuffer_system.name_ptr = rdef_name_ptr_xe_system_cbuffer;
      rdef_binding_cbuffer_system.type = dxbc::RdefInputType::kCbuffer;
      rdef_binding_cbuffer_system.bind_point =
          uint32_t(DxbcShaderTranslator::CbufferRegister::kSystemConstants);
      rdef_binding_cbuffer_system.bind_count = 1;
      rdef_binding_cbuffer_system.flags = dxbc::kRdefInputFlagUserPacked;
    }

    // Pointers in the header.
    {
      auto& rdef_header = *reinterpret_cast<dxbc::RdefHeader*>(
          shader_out.data() + rdef_position_dwords);
      rdef_header.cbuffer_count = 1;
      rdef_header.cbuffers_ptr =
          uint32_t((rdef_cbuffer_position_dwords - rdef_position_dwords) *
                   sizeof(uint32_t));
      rdef_header.input_bind_count = 1;
      rdef_header.input_binds_ptr =
          uint32_t((rdef_binding_position_dwords - rdef_position_dwords) *
                   sizeof(uint32_t));
    }
  }

  {
    auto& blob_header = *reinterpret_cast<dxbc::BlobHeader*>(
        shader_out.data() + blob_position_dwords);
    blob_header.fourcc = dxbc::BlobHeader::FourCC::kResourceDefinition;
    blob_position_dwords = uint32_t(shader_out.size());
    blob_header.size_bytes =
        (blob_position_dwords - kBlobHeaderSizeDwords) * sizeof(uint32_t) -
        shader_out[blob_offset_position_dwords++];
  }

  // ***************************************************************************
  // Input signature
  // ***************************************************************************

  // Clip and cull distances are tightly packed together into registers, but
  // have separate signature parameters with each being a vec4-aligned window.
  uint32_t input_clip_distance_count =
      key.user_clip_plane_cull ? 0 : key.user_clip_plane_count;
  uint32_t input_cull_distance_count =
      (key.user_clip_plane_cull ? key.user_clip_plane_count : 0) +
      key.has_vertex_kill_and;
  uint32_t input_clip_and_cull_distance_count =
      input_clip_distance_count + input_cull_distance_count;
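  // Vertex kill via the AND operator is emulated with one extra cull distance
  // written by the vertex shader - that's why has_vertex_kill_and contributes
  // to the cull distance count here (kill via the OR operator is instead
  // emulated by setting the position to NaN, handled by the NaN check below).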

  // Interpolators, position, clip and cull distances (parameters containing
  // only clip or cull distances, and also one parameter containing both if
  // present), point size.
  uint32_t isgn_parameter_count =
      key.interpolator_count + 1 +
      ((input_clip_and_cull_distance_count + 3) / 4) +
      uint32_t(input_cull_distance_count &&
               (input_clip_distance_count & 3) != 0) +
      key.has_point_size;
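  // A worked example with hypothetical counts: 3 clip distances and 2 cull
  // distances occupy two vec4 registers - clip in .xyz and the first cull
  // distance in .w of the first register, the second cull distance in .x of
  // the second. That's (5 + 3) / 4 = 2 parameters for the registers, plus 1
  // because the SV_CullDistance window sharing the first register with
  // SV_ClipDistance needs its own parameter - 3 clip / cull parameters total.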

  // Reserve space for the header and the parameters.
  shader_out[blob_offset_position_dwords] =
      uint32_t(blob_position_dwords * sizeof(uint32_t));
  uint32_t isgn_position_dwords = blob_position_dwords + kBlobHeaderSizeDwords;
  shader_out.resize(isgn_position_dwords +
                    sizeof(dxbc::Signature) / sizeof(uint32_t) +
                    sizeof(dxbc::SignatureParameter) / sizeof(uint32_t) *
                        isgn_parameter_count);

  // Names (after the parameters).
  name_ptr =
      uint32_t((shader_out.size() - isgn_position_dwords) * sizeof(uint32_t));
  uint32_t isgn_name_ptr_texcoord = name_ptr;
  if (key.interpolator_count) {
    name_ptr += dxbc::AppendAlignedString(shader_out, "TEXCOORD");
  }
  uint32_t isgn_name_ptr_sv_position = name_ptr;
  name_ptr += dxbc::AppendAlignedString(shader_out, "SV_Position");
  uint32_t isgn_name_ptr_sv_clip_distance = name_ptr;
  if (input_clip_distance_count) {
    name_ptr += dxbc::AppendAlignedString(shader_out, "SV_ClipDistance");
  }
  uint32_t isgn_name_ptr_sv_cull_distance = name_ptr;
  if (input_cull_distance_count) {
    name_ptr += dxbc::AppendAlignedString(shader_out, "SV_CullDistance");
  }
  uint32_t isgn_name_ptr_xepsize = name_ptr;
  if (key.has_point_size) {
    name_ptr += dxbc::AppendAlignedString(shader_out, "XEPSIZE");
  }

  // Header and parameters.
  uint32_t input_register_interpolators = UINT32_MAX;
  uint32_t input_register_position;
  uint32_t input_register_clip_and_cull_distances = UINT32_MAX;
  uint32_t input_register_point_size = UINT32_MAX;
  {
    // Header.
    auto& isgn_header = *reinterpret_cast<dxbc::Signature*>(
        shader_out.data() + isgn_position_dwords);
    isgn_header.parameter_count = isgn_parameter_count;
    isgn_header.parameter_info_ptr = sizeof(dxbc::Signature);

    // Parameters.
    auto isgn_parameters = reinterpret_cast<dxbc::SignatureParameter*>(
        shader_out.data() + isgn_position_dwords +
        sizeof(dxbc::Signature) / sizeof(uint32_t));
    uint32_t isgn_parameter_index = 0;
    uint32_t input_register_index = 0;

    // Interpolators (TEXCOORD#).
    if (key.interpolator_count) {
      input_register_interpolators = input_register_index;
      for (uint32_t i = 0; i < key.interpolator_count; ++i) {
        assert_true(isgn_parameter_index < isgn_parameter_count);
        dxbc::SignatureParameter& isgn_interpolator =
            isgn_parameters[isgn_parameter_index++];
        isgn_interpolator.semantic_name_ptr = isgn_name_ptr_texcoord;
        isgn_interpolator.semantic_index = i;
        isgn_interpolator.component_type =
            dxbc::SignatureRegisterComponentType::kFloat32;
        isgn_interpolator.register_index = input_register_index++;
        isgn_interpolator.mask = 0b1111;
        isgn_interpolator.always_reads_mask = 0b1111;
      }
    }

    // Position (SV_Position).
    input_register_position = input_register_index;
    assert_true(isgn_parameter_index < isgn_parameter_count);
    dxbc::SignatureParameter& isgn_sv_position =
        isgn_parameters[isgn_parameter_index++];
    isgn_sv_position.semantic_name_ptr = isgn_name_ptr_sv_position;
    isgn_sv_position.system_value = dxbc::Name::kPosition;
    isgn_sv_position.component_type =
        dxbc::SignatureRegisterComponentType::kFloat32;
    isgn_sv_position.register_index = input_register_index++;
    isgn_sv_position.mask = 0b1111;
    isgn_sv_position.always_reads_mask = 0b1111;

    // Clip and cull distances (SV_ClipDistance#, SV_CullDistance#).
    if (input_clip_and_cull_distance_count) {
      input_register_clip_and_cull_distances = input_register_index;
      uint32_t isgn_cull_distance_semantic_index = 0;
      for (uint32_t i = 0; i < input_clip_and_cull_distance_count; i += 4) {
        if (i < input_clip_distance_count) {
          dxbc::SignatureParameter& isgn_sv_clip_distance =
              isgn_parameters[isgn_parameter_index++];
          isgn_sv_clip_distance.semantic_name_ptr =
              isgn_name_ptr_sv_clip_distance;
          isgn_sv_clip_distance.semantic_index = i / 4;
          isgn_sv_clip_distance.system_value = dxbc::Name::kClipDistance;
          isgn_sv_clip_distance.component_type =
              dxbc::SignatureRegisterComponentType::kFloat32;
          isgn_sv_clip_distance.register_index = input_register_index;
          uint8_t isgn_sv_clip_distance_mask =
              (UINT8_C(1) << std::min(input_clip_distance_count - i,
                                      UINT32_C(4))) -
              1;
          isgn_sv_clip_distance.mask = isgn_sv_clip_distance_mask;
          isgn_sv_clip_distance.always_reads_mask = isgn_sv_clip_distance_mask;
        }
        if (input_cull_distance_count && i + 4 > input_clip_distance_count) {
          dxbc::SignatureParameter& isgn_sv_cull_distance =
              isgn_parameters[isgn_parameter_index++];
          isgn_sv_cull_distance.semantic_name_ptr =
              isgn_name_ptr_sv_cull_distance;
          isgn_sv_cull_distance.semantic_index =
              isgn_cull_distance_semantic_index++;
          isgn_sv_cull_distance.system_value = dxbc::Name::kCullDistance;
          isgn_sv_cull_distance.component_type =
              dxbc::SignatureRegisterComponentType::kFloat32;
          isgn_sv_cull_distance.register_index = input_register_index;
          uint8_t isgn_sv_cull_distance_mask =
              (UINT8_C(1) << std::min(input_clip_and_cull_distance_count - i,
                                      UINT32_C(4))) -
              1;
          if (i < input_clip_distance_count) {
            isgn_sv_cull_distance_mask &=
                ~((UINT8_C(1) << (input_clip_distance_count - i)) - 1);
          }
          isgn_sv_cull_distance.mask = isgn_sv_cull_distance_mask;
          isgn_sv_cull_distance.always_reads_mask = isgn_sv_cull_distance_mask;
        }
        ++input_register_index;
      }
    }

    // Point size (XEPSIZE).
    if (key.has_point_size) {
      input_register_point_size = input_register_index;
      assert_true(isgn_parameter_index < isgn_parameter_count);
      dxbc::SignatureParameter& isgn_point_size =
          isgn_parameters[isgn_parameter_index++];
      isgn_point_size.semantic_name_ptr = isgn_name_ptr_xepsize;
      isgn_point_size.component_type =
          dxbc::SignatureRegisterComponentType::kFloat32;
      isgn_point_size.register_index = input_register_index++;
      isgn_point_size.mask = 0b0001;
      isgn_point_size.always_reads_mask =
          key.type == PipelineGeometryShader::kPointList ? 0b0001 : 0;
    }

    assert_true(isgn_parameter_index == isgn_parameter_count);
  }

  {
    auto& blob_header = *reinterpret_cast<dxbc::BlobHeader*>(
        shader_out.data() + blob_position_dwords);
    blob_header.fourcc = dxbc::BlobHeader::FourCC::kInputSignature;
    blob_position_dwords = uint32_t(shader_out.size());
    blob_header.size_bytes =
        (blob_position_dwords - kBlobHeaderSizeDwords) * sizeof(uint32_t) -
        shader_out[blob_offset_position_dwords++];
  }

  // ***************************************************************************
  // Output signature
  // ***************************************************************************

  // Interpolators, point coordinates, position, clip distances.
  uint32_t osgn_parameter_count = key.interpolator_count +
                                  key.has_point_coordinates + 1 +
                                  ((input_clip_distance_count + 3) / 4);
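  // Cull distances are consumed within this geometry shader itself (the whole
  // primitive is dropped when they indicate culling), so, unlike clip
  // distances, they don't appear in the output signature.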

  // Reserve space for the header and the parameters.
  shader_out[blob_offset_position_dwords] =
      uint32_t(blob_position_dwords * sizeof(uint32_t));
  uint32_t osgn_position_dwords = blob_position_dwords + kBlobHeaderSizeDwords;
  shader_out.resize(osgn_position_dwords +
                    sizeof(dxbc::Signature) / sizeof(uint32_t) +
                    sizeof(dxbc::SignatureParameterForGS) / sizeof(uint32_t) *
                        osgn_parameter_count);

  // Names (after the parameters).
  name_ptr =
      uint32_t((shader_out.size() - osgn_position_dwords) * sizeof(uint32_t));
  uint32_t osgn_name_ptr_texcoord = name_ptr;
  if (key.interpolator_count) {
    name_ptr += dxbc::AppendAlignedString(shader_out, "TEXCOORD");
  }
  uint32_t osgn_name_ptr_xespritetexcoord = name_ptr;
  if (key.has_point_coordinates) {
    name_ptr += dxbc::AppendAlignedString(shader_out, "XESPRITETEXCOORD");
  }
  uint32_t osgn_name_ptr_sv_position = name_ptr;
  name_ptr += dxbc::AppendAlignedString(shader_out, "SV_Position");
  uint32_t osgn_name_ptr_sv_clip_distance = name_ptr;
  if (input_clip_distance_count) {
    name_ptr += dxbc::AppendAlignedString(shader_out, "SV_ClipDistance");
  }

  // Header and parameters.
  uint32_t output_register_interpolators = UINT32_MAX;
  uint32_t output_register_point_coordinates = UINT32_MAX;
  uint32_t output_register_position;
  uint32_t output_register_clip_distances = UINT32_MAX;
  {
    // Header.
    auto& osgn_header = *reinterpret_cast<dxbc::Signature*>(
        shader_out.data() + osgn_position_dwords);
    osgn_header.parameter_count = osgn_parameter_count;
    osgn_header.parameter_info_ptr = sizeof(dxbc::Signature);

    // Parameters.
    auto osgn_parameters = reinterpret_cast<dxbc::SignatureParameterForGS*>(
        shader_out.data() + osgn_position_dwords +
        sizeof(dxbc::Signature) / sizeof(uint32_t));
    uint32_t osgn_parameter_index = 0;
    uint32_t output_register_index = 0;

    // Interpolators (TEXCOORD#).
    if (key.interpolator_count) {
      output_register_interpolators = output_register_index;
      for (uint32_t i = 0; i < key.interpolator_count; ++i) {
        assert_true(osgn_parameter_index < osgn_parameter_count);
        dxbc::SignatureParameterForGS& osgn_interpolator =
            osgn_parameters[osgn_parameter_index++];
        osgn_interpolator.semantic_name_ptr = osgn_name_ptr_texcoord;
        osgn_interpolator.semantic_index = i;
        osgn_interpolator.component_type =
            dxbc::SignatureRegisterComponentType::kFloat32;
        osgn_interpolator.register_index = output_register_index++;
        osgn_interpolator.mask = 0b1111;
      }
    }

    // Point coordinates (XESPRITETEXCOORD).
    if (key.has_point_coordinates) {
      output_register_point_coordinates = output_register_index;
      assert_true(osgn_parameter_index < osgn_parameter_count);
      dxbc::SignatureParameterForGS& osgn_point_coordinates =
          osgn_parameters[osgn_parameter_index++];
      osgn_point_coordinates.semantic_name_ptr = osgn_name_ptr_xespritetexcoord;
      osgn_point_coordinates.component_type =
          dxbc::SignatureRegisterComponentType::kFloat32;
      osgn_point_coordinates.register_index = output_register_index++;
      osgn_point_coordinates.mask = 0b0011;
      osgn_point_coordinates.never_writes_mask = 0b1100;
    }

    // Position (SV_Position).
    output_register_position = output_register_index;
    assert_true(osgn_parameter_index < osgn_parameter_count);
    dxbc::SignatureParameterForGS& osgn_sv_position =
        osgn_parameters[osgn_parameter_index++];
    osgn_sv_position.semantic_name_ptr = osgn_name_ptr_sv_position;
    osgn_sv_position.system_value = dxbc::Name::kPosition;
    osgn_sv_position.component_type =
        dxbc::SignatureRegisterComponentType::kFloat32;
    osgn_sv_position.register_index = output_register_index++;
    osgn_sv_position.mask = 0b1111;

    // Clip distances (SV_ClipDistance#).
    if (input_clip_distance_count) {
      output_register_clip_distances = output_register_index;
      for (uint32_t i = 0; i < input_clip_distance_count; i += 4) {
        dxbc::SignatureParameterForGS& osgn_sv_clip_distance =
            osgn_parameters[osgn_parameter_index++];
        osgn_sv_clip_distance.semantic_name_ptr =
            osgn_name_ptr_sv_clip_distance;
        osgn_sv_clip_distance.semantic_index = i / 4;
        osgn_sv_clip_distance.system_value = dxbc::Name::kClipDistance;
        osgn_sv_clip_distance.component_type =
            dxbc::SignatureRegisterComponentType::kFloat32;
        osgn_sv_clip_distance.register_index = output_register_index++;
        uint8_t osgn_sv_clip_distance_mask =
            (UINT8_C(1) << std::min(input_clip_distance_count - i,
                                    UINT32_C(4))) -
            1;
        osgn_sv_clip_distance.mask = osgn_sv_clip_distance_mask;
        osgn_sv_clip_distance.never_writes_mask =
            osgn_sv_clip_distance_mask ^ 0b1111;
      }
    }

    assert_true(osgn_parameter_index == osgn_parameter_count);
  }

  {
    auto& blob_header = *reinterpret_cast<dxbc::BlobHeader*>(
        shader_out.data() + blob_position_dwords);
    blob_header.fourcc = dxbc::BlobHeader::FourCC::kOutputSignatureForGS;
    blob_position_dwords = uint32_t(shader_out.size());
    blob_header.size_bytes =
        (blob_position_dwords - kBlobHeaderSizeDwords) * sizeof(uint32_t) -
        shader_out[blob_offset_position_dwords++];
  }

  // ***************************************************************************
  // Shader program
  // ***************************************************************************

  shader_out[blob_offset_position_dwords] =
      uint32_t(blob_position_dwords * sizeof(uint32_t));
  uint32_t shex_position_dwords = blob_position_dwords + kBlobHeaderSizeDwords;
  shader_out.resize(shex_position_dwords);

  shader_out.push_back(
      dxbc::VersionToken(dxbc::ProgramType::kGeometryShader, 5, 1));
  // Reserve space for the length token.
  shader_out.push_back(0);

  dxbc::Statistics stat;
  std::memset(&stat, 0, sizeof(dxbc::Statistics));
  dxbc::Assembler a(shader_out, stat);

  a.OpDclGlobalFlags(dxbc::kGlobalFlagAllResourcesBound);

  if (system_cbuffer_size_vector_aligned_bytes) {
    a.OpDclConstantBuffer(
        dxbc::Src::CB(
            dxbc::Src::Dcl, 0,
            uint32_t(DxbcShaderTranslator::CbufferRegister::kSystemConstants),
            uint32_t(DxbcShaderTranslator::CbufferRegister::kSystemConstants)),
        system_cbuffer_size_vector_aligned_bytes / (sizeof(uint32_t) * 4));
  }

  dxbc::Primitive input_primitive = dxbc::Primitive::kUndefined;
  uint32_t input_primitive_vertex_count = 0;
  dxbc::PrimitiveTopology output_primitive_topology =
      dxbc::PrimitiveTopology::kUndefined;
  uint32_t max_output_vertex_count = 0;
  switch (key.type) {
    case PipelineGeometryShader::kPointList:
      // Point to a strip of 2 triangles.
      input_primitive = dxbc::Primitive::kPoint;
      input_primitive_vertex_count = 1;
      output_primitive_topology = dxbc::PrimitiveTopology::kTriangleStrip;
      max_output_vertex_count = 4;
      break;
    case PipelineGeometryShader::kRectangleList:
      // Triangle to a strip of 2 triangles.
      input_primitive = dxbc::Primitive::kTriangle;
      input_primitive_vertex_count = 3;
      output_primitive_topology = dxbc::PrimitiveTopology::kTriangleStrip;
      max_output_vertex_count = 4;
      break;
    case PipelineGeometryShader::kQuadList:
      // 4 vertices passed via kLineWithAdjacency to a strip of 2 triangles.
      input_primitive = dxbc::Primitive::kLineWithAdjacency;
      input_primitive_vertex_count = 4;
      output_primitive_topology = dxbc::PrimitiveTopology::kTriangleStrip;
      max_output_vertex_count = 4;
      break;
    default:
      assert_unhandled_case(key.type);
  }
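  // Direct3D geometry shaders have no native 4-vertex input primitive, so the
  // quad list case reuses "line with adjacency" - the only geometry shader
  // input primitive type that carries exactly 4 vertices.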

  assert_false(key.interpolator_count &&
               input_register_interpolators == UINT32_MAX);
  for (uint32_t i = 0; i < key.interpolator_count; ++i) {
    a.OpDclInput(dxbc::Dest::V2D(input_primitive_vertex_count,
                                 input_register_interpolators + i));
  }
  a.OpDclInputSIV(
      dxbc::Dest::V2D(input_primitive_vertex_count, input_register_position),
      dxbc::Name::kPosition);
  // Clip and cull plane declarations are separate in FXC-generated code even
  // for a single register.
  assert_false(input_clip_and_cull_distance_count &&
               input_register_clip_and_cull_distances == UINT32_MAX);
  for (uint32_t i = 0; i < input_clip_and_cull_distance_count; i += 4) {
    if (i < input_clip_distance_count) {
      a.OpDclInput(
          dxbc::Dest::V2D(input_primitive_vertex_count,
                          input_register_clip_and_cull_distances + (i >> 2),
                          (UINT32_C(1) << std::min(
                               input_clip_distance_count - i, UINT32_C(4))) -
                              1));
    }
    if (input_cull_distance_count && i + 4 > input_clip_distance_count) {
      uint32_t cull_distance_mask =
          (UINT32_C(1) << std::min(input_clip_and_cull_distance_count - i,
                                   UINT32_C(4))) -
          1;
      if (i < input_clip_distance_count) {
        cull_distance_mask &=
            ~((UINT32_C(1) << (input_clip_distance_count - i)) - 1);
      }
      a.OpDclInput(
          dxbc::Dest::V2D(input_primitive_vertex_count,
                          input_register_clip_and_cull_distances + (i >> 2),
                          cull_distance_mask));
    }
  }
  if (key.has_point_size && key.type == PipelineGeometryShader::kPointList) {
    assert_true(input_register_point_size != UINT32_MAX);
    a.OpDclInput(dxbc::Dest::V2D(input_primitive_vertex_count,
                                 input_register_point_size, 0b0001));
  }

  // At least 1 temporary register needed to discard primitives with NaN
  // position.
  size_t dcl_temps_count_position_dwords = a.OpDclTemps(1);
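  // OpDclTemps returns the position of the count operand within shader_out so
  // the declaration can be patched after assembly, once the actual maximum
  // number of temporaries used (tracked in stat.temp_register_count) is known
  // - see the write near the end of this function.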

  a.OpDclInputPrimitive(input_primitive);
  dxbc::Dest stream(dxbc::Dest::M(0));
  a.OpDclStream(stream);
  a.OpDclOutputTopology(output_primitive_topology);

  assert_false(key.interpolator_count &&
               output_register_interpolators == UINT32_MAX);
  for (uint32_t i = 0; i < key.interpolator_count; ++i) {
    a.OpDclOutput(dxbc::Dest::O(output_register_interpolators + i));
  }
  if (key.has_point_coordinates) {
    assert_true(output_register_point_coordinates != UINT32_MAX);
    a.OpDclOutput(dxbc::Dest::O(output_register_point_coordinates, 0b0011));
  }
  a.OpDclOutputSIV(dxbc::Dest::O(output_register_position),
                   dxbc::Name::kPosition);
  assert_false(input_clip_distance_count &&
               output_register_clip_distances == UINT32_MAX);
  for (uint32_t i = 0; i < input_clip_distance_count; i += 4) {
    a.OpDclOutputSIV(
        dxbc::Dest::O(output_register_clip_distances + (i >> 2),
                      (UINT32_C(1) << std::min(input_clip_distance_count - i,
                                               UINT32_C(4))) -
                          1),
        dxbc::Name::kClipDistance);
  }

  a.OpDclMaxOutputVertexCount(max_output_vertex_count);

  // Note that after every emit, all o# become uninitialized and must be
  // written to again.
  // Also, FXC generates only movs (from statically or dynamically indexed
  // v[#][#], from r#, or from a literal) to o# for some reason.

  // Discard the whole primitive if any vertex has a NaN position (may also be
  // set to NaN for emulation of vertex killing with the OR operator).
  for (uint32_t i = 0; i < input_primitive_vertex_count; ++i) {
    a.OpNE(dxbc::Dest::R(0), dxbc::Src::V2D(i, input_register_position),
           dxbc::Src::V2D(i, input_register_position));
    a.OpOr(dxbc::Dest::R(0, 0b0011), dxbc::Src::R(0, 0b0100),
           dxbc::Src::R(0, 0b1110));
    a.OpOr(dxbc::Dest::R(0, 0b0001), dxbc::Src::R(0, dxbc::Src::kXXXX),
           dxbc::Src::R(0, dxbc::Src::kYYYY));
    a.OpRetC(true, dxbc::Src::R(0, dxbc::Src::kXXXX));
  }
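  // In IEEE 754 arithmetic, x != x holds only for NaN, so the per-component ne
  // marks the NaN components of the position, and the two or instructions then
  // reduce the four per-component results into r0.x.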

  // Cull the whole primitive if any of the cull distances is < 0 for all
  // vertices of the primitive.
  // TODO(Triang3l): For points, handle ps_ucp_mode (transform the host clip
  // space to the guest one, calculate the distances to the user clip planes,
  // cull using the distance from the center for modes 0, 1 and 2, cull and
  // clip per-vertex for modes 2 and 3) - except for the vertex kill flag.
  if (input_cull_distance_count) {
    for (uint32_t i = 0; i < input_cull_distance_count; ++i) {
      uint32_t cull_distance_register = input_register_clip_and_cull_distances +
                                        ((input_clip_distance_count + i) >> 2);
      uint32_t cull_distance_component = (input_clip_distance_count + i) & 3;
      a.OpLT(dxbc::Dest::R(0, 0b0001),
             dxbc::Src::V2D(0, cull_distance_register)
                 .Select(cull_distance_component),
             dxbc::Src::LF(0.0f));
      for (uint32_t j = 1; j < input_primitive_vertex_count; ++j) {
        a.OpLT(dxbc::Dest::R(0, 0b0010),
               dxbc::Src::V2D(j, cull_distance_register)
                   .Select(cull_distance_component),
               dxbc::Src::LF(0.0f));
        a.OpAnd(dxbc::Dest::R(0, 0b0001), dxbc::Src::R(0, dxbc::Src::kXXXX),
                dxbc::Src::R(0, dxbc::Src::kYYYY));
      }
      a.OpRetC(true, dxbc::Src::R(0, dxbc::Src::kXXXX));
    }
  }

  switch (key.type) {
    case PipelineGeometryShader::kPointList: {
      // Expand the point sprite, with left-to-right, top-to-bottom UVs.
      dxbc::Src point_size_src(dxbc::Src::CB(
          0, uint32_t(DxbcShaderTranslator::CbufferRegister::kSystemConstants),
          offsetof(DxbcShaderTranslator::SystemConstants,
                   point_constant_diameter) >>
              4,
          ((offsetof(DxbcShaderTranslator::SystemConstants,
                     point_constant_diameter[0]) >>
            2) &
           3) |
              (((offsetof(DxbcShaderTranslator::SystemConstants,
                          point_constant_diameter[1]) >>
                 2) &
                3)
               << 2)));
      if (key.has_point_size) {
        // The vertex shader's header writes -1.0 to point_size by default, so
        // any non-negative value means that it was overwritten by the
        // translated vertex shader, and needs to be used instead of the
        // constant size. The per-vertex diameter is already clamped in the
        // vertex shader (combined with making it non-negative).
        a.OpGE(dxbc::Dest::R(0, 0b0001),
               dxbc::Src::V2D(0, input_register_point_size, dxbc::Src::kXXXX),
               dxbc::Src::LF(0.0f));
        a.OpMovC(dxbc::Dest::R(0, 0b0011), dxbc::Src::R(0, dxbc::Src::kXXXX),
                 dxbc::Src::V2D(0, input_register_point_size, dxbc::Src::kXXXX),
                 point_size_src);
        point_size_src = dxbc::Src::R(0, 0b0100);
      }
      // 4D5307F1 has zero-size snowflakes, drop them quicker, and also drop
      // points with a constant size of zero since point lists may also be used
      // as just "compute" with memexport.
      // XY may contain the point size with the per-vertex override applied,
      // use Z as temporary.
      for (uint32_t i = 0; i < 2; ++i) {
        a.OpLT(dxbc::Dest::R(0, 0b0100), dxbc::Src::LF(0.0f),
               point_size_src.SelectFromSwizzled(i));
        a.OpRetC(false, dxbc::Src::R(0, dxbc::Src::kZZZZ));
      }
      // Transform the diameter in the guest screen coordinates to radius in
      // the normalized device coordinates, and then to the clip space by
      // multiplying by W.
      a.OpMul(
          dxbc::Dest::R(0, 0b0011), point_size_src,
          dxbc::Src::CB(
              0,
              uint32_t(DxbcShaderTranslator::CbufferRegister::kSystemConstants),
              offsetof(DxbcShaderTranslator::SystemConstants,
                       point_screen_diameter_to_ndc_radius) >>
                  4,
              ((offsetof(DxbcShaderTranslator::SystemConstants,
                         point_screen_diameter_to_ndc_radius[0]) >>
                2) &
               3) |
                  (((offsetof(DxbcShaderTranslator::SystemConstants,
                              point_screen_diameter_to_ndc_radius[1]) >>
                     2) &
                    3)
                   << 2)));
      point_size_src = dxbc::Src::R(0, 0b0100);
      a.OpMul(dxbc::Dest::R(0, 0b0011), point_size_src,
              dxbc::Src::V2D(0, input_register_position, dxbc::Src::kWWWW));
      dxbc::Src point_radius_x_src(point_size_src.SelectFromSwizzled(0));
      dxbc::Src point_radius_y_src(point_size_src.SelectFromSwizzled(1));

      for (uint32_t i = 0; i < 4; ++i) {
        // Same interpolators for the entire sprite.
        for (uint32_t j = 0; j < key.interpolator_count; ++j) {
          a.OpMov(dxbc::Dest::O(output_register_interpolators + j),
                  dxbc::Src::V2D(0, input_register_interpolators + j));
        }
        // Top-left, top-right, bottom-left, bottom-right order (chosen
        // arbitrarily, simply based on clockwise meaning front with
        // FrontCounterClockwise = FALSE, but faceness is ignored for
        // non-polygon primitive types).
        // Bottom is -Y in Direct3D NDC, +V in point sprite coordinates.
        if (key.has_point_coordinates) {
          a.OpMov(dxbc::Dest::O(output_register_point_coordinates, 0b0011),
                  dxbc::Src::LF(float(i & 1), float(i >> 1), 0.0f, 0.0f));
        }
        // FXC generates only `mov`s for o#, use temporary registers (r0.zw, as
        // r0.xy already used for the point size) for calculations.
        a.OpAdd(dxbc::Dest::R(0, 0b0100),
                dxbc::Src::V2D(0, input_register_position, dxbc::Src::kXXXX),
                (i & 1) ? point_radius_x_src : -point_radius_x_src);
        a.OpAdd(dxbc::Dest::R(0, 0b1000),
                dxbc::Src::V2D(0, input_register_position, dxbc::Src::kYYYY),
                (i >> 1) ? -point_radius_y_src : point_radius_y_src);
        a.OpMov(dxbc::Dest::O(output_register_position, 0b0011),
                dxbc::Src::R(0, 0b1110));
        a.OpMov(dxbc::Dest::O(output_register_position, 0b1100),
                dxbc::Src::V2D(0, input_register_position));
        // TODO(Triang3l): Handle ps_ucp_mode properly, clip expanded points if
        // needed.
        for (uint32_t j = 0; j < input_clip_distance_count; j += 4) {
          a.OpMov(
              dxbc::Dest::O(output_register_clip_distances + (j >> 2),
                            (UINT32_C(1) << std::min(
                                 input_clip_distance_count - j, UINT32_C(4))) -
                                1),
              dxbc::Src::V2D(
                  0, input_register_clip_and_cull_distances + (j >> 2)));
        }
        if (i < 3) {
          a.OpEmitStream(stream);
        }
      }
      a.OpEmitThenCutStream(stream);
    } break;

    case PipelineGeometryShader::kRectangleList: {
      // Construct a strip with the fourth vertex generated by mirroring a
      // vertex across the longest edge (the diagonal).
      //
      // Possible options:
      //
      // 0---1
      // |  /|
      // | / | - 12 is the longest edge, strip 0123 (most commonly used)
      // |/  |   v3 = v0 + (v1 - v0) + (v2 - v0), or v3 = -v0 + v1 + v2
      // 2--[3]
      //
      // 1---2
      // |  /|
      // | / | - 20 is the longest edge, strip 1203
      // |/  |
      // 0--[3]
      //
      // 2---0
      // |  /|
      // | / | - 01 is the longest edge, strip 2013
      // |/  |
      // 1--[3]
      //
      // Input vertices are implicitly indexable, dcl_indexRange is not needed
      // for the first dimension of a v[#][#] index.

      // Get squares of edge lengths into r0.xyz to choose the longest edge.
      // r0.x = ||12||^2
      a.OpAdd(dxbc::Dest::R(0, 0b0011),
              dxbc::Src::V2D(2, input_register_position, 0b0100),
              -dxbc::Src::V2D(1, input_register_position, 0b0100));
      a.OpDP2(dxbc::Dest::R(0, 0b0001), dxbc::Src::R(0, 0b0100),
              dxbc::Src::R(0, 0b0100));
      // r0.y = ||20||^2
      a.OpAdd(dxbc::Dest::R(0, 0b0110),
              dxbc::Src::V2D(0, input_register_position, 0b0100 << 2),
              -dxbc::Src::V2D(2, input_register_position, 0b0100 << 2));
      a.OpDP2(dxbc::Dest::R(0, 0b0010), dxbc::Src::R(0, 0b1001),
              dxbc::Src::R(0, 0b1001));
      // r0.z = ||01||^2
      a.OpAdd(dxbc::Dest::R(0, 0b1100),
              dxbc::Src::V2D(1, input_register_position, 0b0100 << 4),
              -dxbc::Src::V2D(0, input_register_position, 0b0100 << 4));
      a.OpDP2(dxbc::Dest::R(0, 0b0100), dxbc::Src::R(0, 0b1110),
              dxbc::Src::R(0, 0b1110));

      // Find the longest edge, and select the strip vertex indices into
      // r0.xyz.
      // r0.w = 12 > 20
      a.OpLT(dxbc::Dest::R(0, 0b1000), dxbc::Src::R(0, dxbc::Src::kYYYY),
             dxbc::Src::R(0, dxbc::Src::kXXXX));
      // r0.x = 12 > 01
      a.OpLT(dxbc::Dest::R(0, 0b0001), dxbc::Src::R(0, dxbc::Src::kZZZZ),
             dxbc::Src::R(0, dxbc::Src::kXXXX));
      // r0.x = 12 > 20 && 12 > 01
      a.OpAnd(dxbc::Dest::R(0, 0b0001), dxbc::Src::R(0, dxbc::Src::kWWWW),
              dxbc::Src::R(0, dxbc::Src::kXXXX));
      a.OpIf(true, dxbc::Src::R(0, dxbc::Src::kXXXX));
      {
        // 12 is the longest edge, the first triangle in the strip is 012.
        a.OpMov(dxbc::Dest::R(0, 0b0111), dxbc::Src::LU(0, 1, 2, 0));
      }
      a.OpElse();
      {
        // r0.x = 20 > 01
        a.OpLT(dxbc::Dest::R(0, 0b0001), dxbc::Src::R(0, dxbc::Src::kZZZZ),
               dxbc::Src::R(0, dxbc::Src::kYYYY));
        // If 20 is the longest edge, the first triangle in the strip is 120.
        // Otherwise, it's 201.
        a.OpMovC(dxbc::Dest::R(0, 0b0111), dxbc::Src::R(0, dxbc::Src::kXXXX),
                 dxbc::Src::LU(1, 2, 0, 0), dxbc::Src::LU(2, 0, 1, 0));
      }
      a.OpEndIf();

      // Emit the triangle in the strip that consists of the original vertices.
      for (uint32_t i = 0; i < 3; ++i) {
        dxbc::Index input_vertex_index(0, i);
        for (uint32_t j = 0; j < key.interpolator_count; ++j) {
          a.OpMov(dxbc::Dest::O(output_register_interpolators + j),
                  dxbc::Src::V2D(input_vertex_index,
                                 input_register_interpolators + j));
        }
        if (key.has_point_coordinates) {
          a.OpMov(dxbc::Dest::O(output_register_point_coordinates, 0b0011),
                  dxbc::Src::LF(0.0f));
        }
        a.OpMov(dxbc::Dest::O(output_register_position),
                dxbc::Src::V2D(input_vertex_index, input_register_position));
        for (uint32_t j = 0; j < input_clip_distance_count; j += 4) {
          a.OpMov(
              dxbc::Dest::O(output_register_clip_distances + (j >> 2),
                            (UINT32_C(1) << std::min(
                                 input_clip_distance_count - j, UINT32_C(4))) -
                                1),
              dxbc::Src::V2D(
                  input_vertex_index,
                  input_register_clip_and_cull_distances + (j >> 2)));
        }
        a.OpEmitStream(stream);
      }

      // Construct the fourth vertex using r1 as temporary storage, including
      // for the final operation as FXC generates only `mov`s for o#.
      stat.temp_register_count =
          std::max(UINT32_C(2), stat.temp_register_count);
      for (uint32_t j = 0; j < key.interpolator_count; ++j) {
        uint32_t input_register_interpolator = input_register_interpolators + j;
        a.OpAdd(dxbc::Dest::R(1),
                -dxbc::Src::V2D(dxbc::Index(0, 0), input_register_interpolator),
                dxbc::Src::V2D(dxbc::Index(0, 1), input_register_interpolator));
        a.OpAdd(dxbc::Dest::R(1), dxbc::Src::R(1),
                dxbc::Src::V2D(dxbc::Index(0, 2), input_register_interpolator));
        a.OpMov(dxbc::Dest::O(output_register_interpolators + j),
                dxbc::Src::R(1));
      }
      if (key.has_point_coordinates) {
        a.OpMov(dxbc::Dest::O(output_register_point_coordinates, 0b0011),
                dxbc::Src::LF(0.0f));
      }
      a.OpAdd(dxbc::Dest::R(1),
              -dxbc::Src::V2D(dxbc::Index(0, 0), input_register_position),
              dxbc::Src::V2D(dxbc::Index(0, 1), input_register_position));
      a.OpAdd(dxbc::Dest::R(1), dxbc::Src::R(1),
              dxbc::Src::V2D(dxbc::Index(0, 2), input_register_position));
      a.OpMov(dxbc::Dest::O(output_register_position), dxbc::Src::R(1));
      for (uint32_t j = 0; j < input_clip_distance_count; j += 4) {
        uint32_t clip_distance_mask =
            (UINT32_C(1) << std::min(input_clip_distance_count - j,
                                     UINT32_C(4))) -
            1;
        uint32_t input_register_clip_distance =
            input_register_clip_and_cull_distances + (j >> 2);
        a.OpAdd(
            dxbc::Dest::R(1, clip_distance_mask),
            -dxbc::Src::V2D(dxbc::Index(0, 0), input_register_clip_distance),
            dxbc::Src::V2D(dxbc::Index(0, 1), input_register_clip_distance));
        a.OpAdd(
            dxbc::Dest::R(1, clip_distance_mask), dxbc::Src::R(1),
            dxbc::Src::V2D(dxbc::Index(0, 2), input_register_clip_distance));
        a.OpMov(dxbc::Dest::O(output_register_clip_distances + (j >> 2),
                              clip_distance_mask),
                dxbc::Src::R(1));
      }
      a.OpEmitThenCutStream(stream);
    } break;

    case PipelineGeometryShader::kQuadList: {
      // Build the triangle strip from the original quad vertices in the
      // 0, 1, 3, 2 order (like specified for GL_QUAD_STRIP).
      // TODO(Triang3l): Find the correct decomposition of quads into triangles
      // on the real hardware.
      for (uint32_t i = 0; i < 4; ++i) {
        uint32_t input_vertex_index = i ^ (i >> 1);
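        // i ^ (i >> 1) is the 2-bit Gray code, mapping the strip vertex
        // sequence 0, 1, 2, 3 to the quad vertices 0, 1, 3, 2.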
|
|
|
|
|
for (uint32_t j = 0; j < key.interpolator_count; ++j) {
|
|
|
|
|
a.OpMov(dxbc::Dest::O(output_register_interpolators + j),
|
|
|
|
|
dxbc::Src::V2D(input_vertex_index,
|
|
|
|
|
input_register_interpolators + j));
|
|
|
|
|
}
|
|
|
|
|
if (key.has_point_coordinates) {
|
|
|
|
|
a.OpMov(dxbc::Dest::O(output_register_point_coordinates, 0b0011),
|
|
|
|
|
dxbc::Src::LF(0.0f));
|
|
|
|
|
}
|
|
|
|
|
a.OpMov(dxbc::Dest::O(output_register_position),
|
|
|
|
|
dxbc::Src::V2D(input_vertex_index, input_register_position));
|
|
|
|
|
for (uint32_t j = 0; j < input_clip_distance_count; j += 4) {
|
|
|
|
|
a.OpMov(
|
|
|
|
|
dxbc::Dest::O(output_register_clip_distances + (j >> 2),
|
|
|
|
|
(UINT32_C(1) << std::min(
|
|
|
|
|
input_clip_distance_count - j, UINT32_C(4))) -
|
|
|
|
|
1),
|
|
|
|
|
dxbc::Src::V2D(
|
|
|
|
|
input_vertex_index,
|
|
|
|
|
input_register_clip_and_cull_distances + (j >> 2)));
|
|
|
|
|
}
|
|
|
|
|
if (i < 3) {
|
|
|
|
|
a.OpEmitStream(stream);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
a.OpEmitThenCutStream(stream);
|
|
|
|
|
} break;
|
|
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
assert_unhandled_case(key.type);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
a.OpRet();
|
|
|
|
|
|
2022-05-09 21:34:17 +02:00
|
|
|
// Write the actual number of temporary registers used.
|
|
|
|
|
shader_out[dcl_temps_count_position_dwords] = stat.temp_register_count;
|
2022-05-09 18:16:22 +02:00
|
|
|
|
|
|
|
|
// Write the shader program length in dwords.
|
|
|
|
|
shader_out[shex_position_dwords + 1] =
|
|
|
|
|
uint32_t(shader_out.size()) - shex_position_dwords;
|
|
|
|
|
|
|
|
|
|
{
|
|
|
|
|
auto& blob_header = *reinterpret_cast<dxbc::BlobHeader*>(
|
|
|
|
|
shader_out.data() + blob_position_dwords);
|
|
|
|
|
blob_header.fourcc = dxbc::BlobHeader::FourCC::kShaderEx;
|
|
|
|
|
blob_position_dwords = uint32_t(shader_out.size());
|
|
|
|
|
blob_header.size_bytes =
|
|
|
|
|
(blob_position_dwords - kBlobHeaderSizeDwords) * sizeof(uint32_t) -
|
|
|
|
|
shader_out[blob_offset_position_dwords++];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ***************************************************************************
|
|
|
|
|
// Statistics
|
|
|
|
|
// ***************************************************************************
|
|
|
|
|
|
|
|
|
|
  shader_out[blob_offset_position_dwords] =
      uint32_t(blob_position_dwords * sizeof(uint32_t));
  uint32_t stat_position_dwords = blob_position_dwords + kBlobHeaderSizeDwords;
  shader_out.resize(stat_position_dwords +
                    sizeof(dxbc::Statistics) / sizeof(uint32_t));
  std::memcpy(shader_out.data() + stat_position_dwords, &stat,
              sizeof(dxbc::Statistics));

  {
    auto& blob_header = *reinterpret_cast<dxbc::BlobHeader*>(
        shader_out.data() + blob_position_dwords);
    blob_header.fourcc = dxbc::BlobHeader::FourCC::kStatistics;
    blob_position_dwords = uint32_t(shader_out.size());
    blob_header.size_bytes =
        (blob_position_dwords - kBlobHeaderSizeDwords) * sizeof(uint32_t) -
        shader_out[blob_offset_position_dwords++];
  }

  // ***************************************************************************
  // Container header
  // ***************************************************************************

  uint32_t shader_size_bytes = uint32_t(shader_out.size() * sizeof(uint32_t));
  {
    auto& container_header =
        *reinterpret_cast<dxbc::ContainerHeader*>(shader_out.data());
    container_header.InitializeIdentification();
    container_header.size_bytes = shader_size_bytes;
    container_header.blob_count = kBlobCount;
    CalculateDXBCChecksum(
        reinterpret_cast<unsigned char*>(shader_out.data()),
        static_cast<unsigned int>(shader_size_bytes),
        reinterpret_cast<unsigned int*>(&container_header.hash));
  }
}

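// Returns the DXBC geometry shader microcode for the key, building and
// caching it on first use.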
const std::vector<uint32_t>& PipelineCache::GetGeometryShader(
    GeometryShaderKey key) {
  auto it = geometry_shaders_.find(key);
  if (it != geometry_shaders_.end()) {
    return it->second;
  }
  std::vector<uint32_t> shader;
  CreateDxbcGeometryShader(key, shader);
  return geometry_shaders_.emplace(key, std::move(shader)).first->second;
}

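// Builds a Direct3D 12 pipeline state object for the given description,
// returning nullptr on failure. Runs on the command processor thread and on
// the pipeline creation threads.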
ID3D12PipelineState* PipelineCache::CreateD3D12Pipeline(
    const PipelineRuntimeDescription& runtime_description) {
  const PipelineDescription& description = runtime_description.description;

  if (runtime_description.pixel_shader != nullptr) {
    XELOGGPU("Creating graphics pipeline with VS {:016X}, PS {:016X}",
             runtime_description.vertex_shader->shader().ucode_data_hash(),
             runtime_description.pixel_shader->shader().ucode_data_hash());
  } else {
    XELOGGPU("Creating graphics pipeline with VS {:016X}",
             runtime_description.vertex_shader->shader().ucode_data_hash());
  }

  D3D12_GRAPHICS_PIPELINE_STATE_DESC state_desc;
  std::memset(&state_desc, 0, sizeof(state_desc));

  bool edram_rov_used = render_target_cache_.GetPath() ==
                        RenderTargetCache::Path::kPixelShaderInterlock;

  // Root signature.
  state_desc.pRootSignature = runtime_description.root_signature;

  // Index buffer strip cut value.
  switch (description.strip_cut_index) {
    case PipelineStripCutIndex::kFFFF:
      state_desc.IBStripCutValue = D3D12_INDEX_BUFFER_STRIP_CUT_VALUE_0xFFFF;
      break;
    case PipelineStripCutIndex::kFFFFFFFF:
      state_desc.IBStripCutValue =
          D3D12_INDEX_BUFFER_STRIP_CUT_VALUE_0xFFFFFFFF;
      break;
    default:
      state_desc.IBStripCutValue = D3D12_INDEX_BUFFER_STRIP_CUT_VALUE_DISABLED;
      break;
  }

  // Primitive topology, vertex, hull, domain and geometry shaders.
  if (!runtime_description.vertex_shader->is_translated()) {
    XELOGE("Vertex shader {:016X} not translated",
           runtime_description.vertex_shader->shader().ucode_data_hash());
    assert_always();
    return nullptr;
  }
  Shader::HostVertexShaderType host_vertex_shader_type =
      DxbcShaderTranslator::Modification(
          runtime_description.vertex_shader->modification())
          .vertex.host_vertex_shader_type;
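  // For tessellated draws, the vertex shader stage is a precompiled helper
  // (tessellation_adaptive_vs or tessellation_indexed_vs), the hull shader is
  // picked from precompiled variants in the switch below, and the translated
  // guest vertex shader is bound as the domain shader.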
  if (Shader::IsHostVertexShaderTypeDomain(host_vertex_shader_type)) {
    state_desc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_PATCH;
    xenos::TessellationMode tessellation_mode = xenos::TessellationMode(
        description.primitive_topology_type_or_tessellation_mode);
    if (tessellation_mode == xenos::TessellationMode::kAdaptive) {
      state_desc.VS.pShaderBytecode = shaders::tessellation_adaptive_vs;
      state_desc.VS.BytecodeLength = sizeof(shaders::tessellation_adaptive_vs);
    } else {
      state_desc.VS.pShaderBytecode = shaders::tessellation_indexed_vs;
      state_desc.VS.BytecodeLength = sizeof(shaders::tessellation_indexed_vs);
    }
    switch (tessellation_mode) {
      case xenos::TessellationMode::kDiscrete:
        switch (host_vertex_shader_type) {
          case Shader::HostVertexShaderType::kTriangleDomainCPIndexed:
            state_desc.HS.pShaderBytecode = shaders::discrete_triangle_3cp_hs;
            state_desc.HS.BytecodeLength =
                sizeof(shaders::discrete_triangle_3cp_hs);
            break;
          case Shader::HostVertexShaderType::kTriangleDomainPatchIndexed:
            state_desc.HS.pShaderBytecode = shaders::discrete_triangle_1cp_hs;
            state_desc.HS.BytecodeLength =
                sizeof(shaders::discrete_triangle_1cp_hs);
            break;
          case Shader::HostVertexShaderType::kQuadDomainCPIndexed:
            state_desc.HS.pShaderBytecode = shaders::discrete_quad_4cp_hs;
            state_desc.HS.BytecodeLength =
                sizeof(shaders::discrete_quad_4cp_hs);
            break;
          case Shader::HostVertexShaderType::kQuadDomainPatchIndexed:
            state_desc.HS.pShaderBytecode = shaders::discrete_quad_1cp_hs;
            state_desc.HS.BytecodeLength =
                sizeof(shaders::discrete_quad_1cp_hs);
            break;
          default:
            assert_unhandled_case(host_vertex_shader_type);
            return nullptr;
        }
        break;
      case xenos::TessellationMode::kContinuous:
        switch (host_vertex_shader_type) {
          case Shader::HostVertexShaderType::kTriangleDomainCPIndexed:
            state_desc.HS.pShaderBytecode = shaders::continuous_triangle_3cp_hs;
            state_desc.HS.BytecodeLength =
                sizeof(shaders::continuous_triangle_3cp_hs);
            break;
          case Shader::HostVertexShaderType::kTriangleDomainPatchIndexed:
            state_desc.HS.pShaderBytecode = shaders::continuous_triangle_1cp_hs;
            state_desc.HS.BytecodeLength =
                sizeof(shaders::continuous_triangle_1cp_hs);
            break;
          case Shader::HostVertexShaderType::kQuadDomainCPIndexed:
            state_desc.HS.pShaderBytecode = shaders::continuous_quad_4cp_hs;
            state_desc.HS.BytecodeLength =
                sizeof(shaders::continuous_quad_4cp_hs);
            break;
          case Shader::HostVertexShaderType::kQuadDomainPatchIndexed:
            state_desc.HS.pShaderBytecode = shaders::continuous_quad_1cp_hs;
            state_desc.HS.BytecodeLength =
                sizeof(shaders::continuous_quad_1cp_hs);
            break;
          default:
            assert_unhandled_case(host_vertex_shader_type);
            return nullptr;
        }
        break;
      case xenos::TessellationMode::kAdaptive:
        switch (host_vertex_shader_type) {
          case Shader::HostVertexShaderType::kTriangleDomainPatchIndexed:
            state_desc.HS.pShaderBytecode = shaders::adaptive_triangle_hs;
            state_desc.HS.BytecodeLength =
                sizeof(shaders::adaptive_triangle_hs);
            break;
          case Shader::HostVertexShaderType::kQuadDomainPatchIndexed:
            state_desc.HS.pShaderBytecode = shaders::adaptive_quad_hs;
            state_desc.HS.BytecodeLength = sizeof(shaders::adaptive_quad_hs);
            break;
          default:
            assert_unhandled_case(host_vertex_shader_type);
            return nullptr;
        }
        break;
      default:
        assert_unhandled_case(tessellation_mode);
        return nullptr;
    }
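    // The guest vertex shader has been translated as a Direct3D 12 domain
    // shader: when tessellation is used, it runs per tessellated vertex after
    // the fixed-function tessellator instead of in the vertex shader stage.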
    state_desc.DS.pShaderBytecode =
        runtime_description.vertex_shader->translated_binary().data();
    state_desc.DS.BytecodeLength =
        runtime_description.vertex_shader->translated_binary().size();
  } else {
    assert_true(host_vertex_shader_type ==
                Shader::HostVertexShaderType::kVertex);
    if (host_vertex_shader_type != Shader::HostVertexShaderType::kVertex) {
      // Fallback vertex shaders are not needed on Direct3D 12.
      return nullptr;
    }
    state_desc.VS.pShaderBytecode =
        runtime_description.vertex_shader->translated_binary().data();
    state_desc.VS.BytecodeLength =
        runtime_description.vertex_shader->translated_binary().size();
    PipelinePrimitiveTopologyType primitive_topology_type =
        PipelinePrimitiveTopologyType(
            description.primitive_topology_type_or_tessellation_mode);
    switch (primitive_topology_type) {
      case PipelinePrimitiveTopologyType::kPoint:
        state_desc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_POINT;
        break;
      case PipelinePrimitiveTopologyType::kLine:
        state_desc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_LINE;
        break;
      case PipelinePrimitiveTopologyType::kTriangle:
        state_desc.PrimitiveTopologyType =
            D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE;
        break;
      default:
        assert_unhandled_case(primitive_topology_type);
        return nullptr;
    }
  }

  // Pixel shader.
  if (runtime_description.pixel_shader != nullptr) {
    if (!runtime_description.pixel_shader->is_translated()) {
      XELOGE("Pixel shader {:016X} not translated",
             runtime_description.pixel_shader->shader().ucode_data_hash());
      assert_always();
      return nullptr;
    }
    state_desc.PS.pShaderBytecode =
        runtime_description.pixel_shader->translated_binary().data();
    state_desc.PS.BytecodeLength =
        runtime_description.pixel_shader->translated_binary().size();
  } else if (edram_rov_used) {
    state_desc.PS.pShaderBytecode = depth_only_pixel_shader_.data();
    state_desc.PS.BytecodeLength = depth_only_pixel_shader_.size();
  } else {
    if (render_target_cache_.depth_float24_convert_in_pixel_shader() &&
        (description.depth_func != xenos::CompareFunction::kAlways ||
         description.depth_write) &&
        description.depth_format == xenos::DepthRenderTargetFormat::kD24FS8) {
      if (render_target_cache_.depth_float24_round()) {
        state_desc.PS.pShaderBytecode = shaders::float24_round_ps;
        state_desc.PS.BytecodeLength = sizeof(shaders::float24_round_ps);
      } else {
        state_desc.PS.pShaderBytecode = shaders::float24_truncate_ps;
        state_desc.PS.BytecodeLength = sizeof(shaders::float24_truncate_ps);
      }
    }
  }

  // Geometry shader.
  if (runtime_description.geometry_shader != nullptr) {
    state_desc.GS.pShaderBytecode = runtime_description.geometry_shader->data();
    state_desc.GS.BytecodeLength =
        sizeof(*runtime_description.geometry_shader->data()) *
        runtime_description.geometry_shader->size();
  }

  // Rasterizer state.
  state_desc.RasterizerState.FillMode = description.fill_mode_wireframe
                                            ? D3D12_FILL_MODE_WIREFRAME
                                            : D3D12_FILL_MODE_SOLID;
  switch (description.cull_mode) {
    case PipelineCullMode::kFront:
      state_desc.RasterizerState.CullMode = D3D12_CULL_MODE_FRONT;
      break;
    case PipelineCullMode::kBack:
      state_desc.RasterizerState.CullMode = D3D12_CULL_MODE_BACK;
      break;
    default:
      assert_true(description.cull_mode == PipelineCullMode::kNone ||
                  description.cull_mode ==
                      PipelineCullMode::kDisableRasterization);
      state_desc.RasterizerState.CullMode = D3D12_CULL_MODE_NONE;
      break;
  }
  state_desc.RasterizerState.FrontCounterClockwise =
      description.front_counter_clockwise ? TRUE : FALSE;
  state_desc.RasterizerState.DepthBias = description.depth_bias;
  state_desc.RasterizerState.DepthBiasClamp = 0.0f;
  // With non-square resolution scaling, use the larger of the two axis scales
  // so the worst case (slope entirely along the more heavily scaled axis) is
  // fully compensated. Overestimating the bias is safer than underestimating
  // it, as too little bias makes Z fighting with the background more likely.
  state_desc.RasterizerState.SlopeScaledDepthBias =
      description.depth_bias_slope_scaled *
      float(std::max(render_target_cache_.draw_resolution_scale_x(),
                     render_target_cache_.draw_resolution_scale_y()));
  state_desc.RasterizerState.DepthClipEnable =
      description.depth_clip ? TRUE : FALSE;
  uint32_t msaa_sample_count = uint32_t(1)
                               << uint32_t(description.host_msaa_samples);
  if (edram_rov_used) {
    // Only 1, 4, 8 and (not on all GPUs) 16 are allowed for the forced sample
    // count; 2x is done as 4x instead, using sample 0 as 0 and sample 3 as 1
    // (not exactly the same sample positions, but still top-left and
    // bottom-right - however, this can be adjusted with programmable sample
    // positions).
    assert_true(msaa_sample_count == 1 || msaa_sample_count == 4);
    if (msaa_sample_count != 1 && msaa_sample_count != 4) {
      return nullptr;
    }
    state_desc.RasterizerState.ForcedSampleCount =
        uint32_t(1) << uint32_t(description.host_msaa_samples);
  }

  // Sample mask and sample description.
  state_desc.SampleMask = UINT_MAX;
  // TODO(Triang3l): 4x MSAA fallback when 2x isn't supported without ROV.
  if (edram_rov_used) {
    state_desc.SampleDesc.Count = 1;
  } else {
    assert_true(msaa_sample_count <= 4);
    if (msaa_sample_count > 4) {
      return nullptr;
    }
    if (msaa_sample_count == 2 && !render_target_cache_.msaa_2x_supported()) {
      // Use samples 0 and 3 of the 4x pattern for 2x instead (not exactly the
      // same sample positions, but still top-left and bottom-right - however,
      // this can be adjusted with programmable sample positions).
      state_desc.SampleMask = 0b1001;
      state_desc.SampleDesc.Count = 4;
    } else {
      state_desc.SampleDesc.Count = msaa_sample_count;
    }
  }

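  // Output-merger state is only set up on the conventional render target
  // path. On the rasterizer-ordered-view path, depth / stencil and blending
  // are performed by the pixel shader itself writing to the EDRAM buffer, so
  // no DSV or RTVs are bound.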
  if (!edram_rov_used) {
    // Depth/stencil.
    if (description.depth_func != xenos::CompareFunction::kAlways ||
        description.depth_write) {
      state_desc.DepthStencilState.DepthEnable = TRUE;
      state_desc.DepthStencilState.DepthWriteMask =
          description.depth_write ? D3D12_DEPTH_WRITE_MASK_ALL
                                  : D3D12_DEPTH_WRITE_MASK_ZERO;
      // Guest comparison functions are a 3-bit mask (bit 0 - less, bit 1 -
      // equal, bit 2 - greater); D3D12_COMPARISON_FUNC uses the same encoding,
      // just offset by one (D3D12_COMPARISON_FUNC_NEVER is 1).
      state_desc.DepthStencilState.DepthFunc =
          D3D12_COMPARISON_FUNC(uint32_t(D3D12_COMPARISON_FUNC_NEVER) +
                                uint32_t(description.depth_func));
    }
    if (description.stencil_enable) {
      state_desc.DepthStencilState.StencilEnable = TRUE;
      state_desc.DepthStencilState.StencilReadMask =
          description.stencil_read_mask;
      state_desc.DepthStencilState.StencilWriteMask =
          description.stencil_write_mask;
      // Stencil operations use the same encoding in Direct3D 12 as well, also
      // offset by one (D3D12_STENCIL_OP_KEEP is 1).
      state_desc.DepthStencilState.FrontFace.StencilFailOp =
          D3D12_STENCIL_OP(uint32_t(D3D12_STENCIL_OP_KEEP) +
                           uint32_t(description.stencil_front_fail_op));
      state_desc.DepthStencilState.FrontFace.StencilDepthFailOp =
          D3D12_STENCIL_OP(uint32_t(D3D12_STENCIL_OP_KEEP) +
                           uint32_t(description.stencil_front_depth_fail_op));
      state_desc.DepthStencilState.FrontFace.StencilPassOp =
          D3D12_STENCIL_OP(uint32_t(D3D12_STENCIL_OP_KEEP) +
                           uint32_t(description.stencil_front_pass_op));
      state_desc.DepthStencilState.FrontFace.StencilFunc =
          D3D12_COMPARISON_FUNC(uint32_t(D3D12_COMPARISON_FUNC_NEVER) +
                                uint32_t(description.stencil_front_func));
      state_desc.DepthStencilState.BackFace.StencilFailOp =
          D3D12_STENCIL_OP(uint32_t(D3D12_STENCIL_OP_KEEP) +
                           uint32_t(description.stencil_back_fail_op));
      state_desc.DepthStencilState.BackFace.StencilDepthFailOp =
          D3D12_STENCIL_OP(uint32_t(D3D12_STENCIL_OP_KEEP) +
                           uint32_t(description.stencil_back_depth_fail_op));
      state_desc.DepthStencilState.BackFace.StencilPassOp =
          D3D12_STENCIL_OP(uint32_t(D3D12_STENCIL_OP_KEEP) +
                           uint32_t(description.stencil_back_pass_op));
      state_desc.DepthStencilState.BackFace.StencilFunc =
          D3D12_COMPARISON_FUNC(uint32_t(D3D12_COMPARISON_FUNC_NEVER) +
                                uint32_t(description.stencil_back_func));
    }
    if (state_desc.DepthStencilState.DepthEnable ||
        state_desc.DepthStencilState.StencilEnable) {
      state_desc.DSVFormat = D3D12RenderTargetCache::GetDepthDSVDXGIFormat(
          description.depth_format);
    }

    // Render targets and blending.
    state_desc.BlendState.IndependentBlendEnable = TRUE;
    static const D3D12_BLEND kBlendFactorMap[] = {
        D3D12_BLEND_ZERO,          D3D12_BLEND_ONE,
        D3D12_BLEND_SRC_COLOR,     D3D12_BLEND_INV_SRC_COLOR,
        D3D12_BLEND_SRC_ALPHA,     D3D12_BLEND_INV_SRC_ALPHA,
        D3D12_BLEND_DEST_COLOR,    D3D12_BLEND_INV_DEST_COLOR,
        D3D12_BLEND_DEST_ALPHA,    D3D12_BLEND_INV_DEST_ALPHA,
        D3D12_BLEND_BLEND_FACTOR,  D3D12_BLEND_INV_BLEND_FACTOR,
        D3D12_BLEND_SRC_ALPHA_SAT,
    };
    // 8 entries for safety since 3 bits from the guest are passed directly.
    static const D3D12_BLEND_OP kBlendOpMap[] = {
        D3D12_BLEND_OP_ADD, D3D12_BLEND_OP_SUBTRACT, D3D12_BLEND_OP_MIN,
        D3D12_BLEND_OP_MAX, D3D12_BLEND_OP_REV_SUBTRACT, D3D12_BLEND_OP_ADD,
        D3D12_BLEND_OP_ADD, D3D12_BLEND_OP_ADD};
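    // Below, blending is enabled for a render target only when its guest
    // state differs from the no-op source * 1 + dest * 0 configuration for
    // both the color and the alpha channels.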
    for (uint32_t i = 0; i < xenos::kMaxColorRenderTargets; ++i) {
      const PipelineRenderTarget& rt = description.render_targets[i];
      if (!rt.used) {
        // Null RTV descriptors can be used for slots with DXGI_FORMAT_UNKNOWN
        // in the pipeline state.
        state_desc.RTVFormats[i] = DXGI_FORMAT_UNKNOWN;
        continue;
      }
      state_desc.NumRenderTargets = i + 1;
      state_desc.RTVFormats[i] =
          render_target_cache_.GetColorDrawDXGIFormat(rt.format);
      if (state_desc.RTVFormats[i] == DXGI_FORMAT_UNKNOWN) {
        assert_always();
        return nullptr;
      }
      D3D12_RENDER_TARGET_BLEND_DESC& blend_desc =
          state_desc.BlendState.RenderTarget[i];
      if (rt.src_blend != PipelineBlendFactor::kOne ||
          rt.dest_blend != PipelineBlendFactor::kZero ||
          rt.blend_op != xenos::BlendOp::kAdd ||
          rt.src_blend_alpha != PipelineBlendFactor::kOne ||
          rt.dest_blend_alpha != PipelineBlendFactor::kZero ||
          rt.blend_op_alpha != xenos::BlendOp::kAdd) {
        blend_desc.BlendEnable = TRUE;
        blend_desc.SrcBlend = kBlendFactorMap[uint32_t(rt.src_blend)];
        blend_desc.DestBlend = kBlendFactorMap[uint32_t(rt.dest_blend)];
        blend_desc.BlendOp = kBlendOpMap[uint32_t(rt.blend_op)];
        blend_desc.SrcBlendAlpha =
            kBlendFactorMap[uint32_t(rt.src_blend_alpha)];
        blend_desc.DestBlendAlpha =
            kBlendFactorMap[uint32_t(rt.dest_blend_alpha)];
        blend_desc.BlendOpAlpha = kBlendOpMap[uint32_t(rt.blend_op_alpha)];
      }
      blend_desc.RenderTargetWriteMask = rt.write_mask;
    }
  }

  // Disable rasterization if requested, the way it's done by design in
  // Direct3D - with no pixel shader and no depth / stencil (parameter
  // combinations that make no difference when rasterization is disabled have
  // already been handled in GetCurrentStateDescription).
  // TODO(Triang3l): If a parameter combination (no host pixel shader, and
  // depth / stencil disabled without ROV) would disable rasterization while
  // it's still needed (for occlusion query sample counting), ensure
  // rasterization happens - by binding an empty pixel shader, or possibly via
  // ForcedSampleCount when not using 2x MSAA (its requirements for
  // OMSetRenderTargets need some investigation though).
  if (description.cull_mode == PipelineCullMode::kDisableRasterization) {
    state_desc.PS.pShaderBytecode = nullptr;
    state_desc.PS.BytecodeLength = 0;
    state_desc.DepthStencilState.DepthEnable = FALSE;
    state_desc.DepthStencilState.StencilEnable = FALSE;
  }

  // Create the D3D12 pipeline state object.
  ID3D12Device* device = command_processor_.GetD3D12Provider().GetDevice();
  ID3D12PipelineState* state;
  if (FAILED(device->CreateGraphicsPipelineState(&state_desc,
                                                 IID_PPV_ARGS(&state)))) {
    if (runtime_description.pixel_shader != nullptr) {
      XELOGE("Failed to create graphics pipeline with VS {:016X}, PS {:016X}",
             runtime_description.vertex_shader->shader().ucode_data_hash(),
             runtime_description.pixel_shader->shader().ucode_data_hash());
    } else {
      XELOGE("Failed to create graphics pipeline with VS {:016X}",
             runtime_description.vertex_shader->shader().ucode_data_hash());
    }
    return nullptr;
  }
  std::wstring name;
  if (runtime_description.pixel_shader != nullptr) {
    name = fmt::format(
        L"VS {:016X}, PS {:016X}",
        runtime_description.vertex_shader->shader().ucode_data_hash(),
        runtime_description.pixel_shader->shader().ucode_data_hash());
  } else {
    name = fmt::format(
        L"VS {:016X}",
        runtime_description.vertex_shader->shader().ucode_data_hash());
  }
  state->SetName(name.c_str());
  return state;
}

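// Writer thread for the on-disk shader and pipeline storage, serializing
// shader microcode and pipeline descriptions so they can be reloaded on the
// next run.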
void PipelineCache::StorageWriteThread() {
  ShaderStoredHeader shader_header;
  // Don't leak anything in unused bits.
  std::memset(&shader_header, 0, sizeof(shader_header));

  std::vector<uint32_t> ucode_guest_endian;
  ucode_guest_endian.reserve(0xFFFF);

  bool flush_shaders = false;
  bool flush_pipelines = false;

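  // Consumer loop: dequeue at most one shader and one pipeline description
  // per iteration while holding storage_write_request_lock_, then perform the
  // fwrite calls with the lock released so producers aren't blocked on file
  // I/O.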
  while (true) {
    if (flush_shaders) {
      flush_shaders = false;
      assert_not_null(shader_storage_file_);
      fflush(shader_storage_file_);
    }
    if (flush_pipelines) {
      flush_pipelines = false;
      assert_not_null(pipeline_storage_file_);
      fflush(pipeline_storage_file_);
    }

    const Shader* shader = nullptr;
    PipelineStoredDescription pipeline_description;
    bool write_pipeline = false;
    {
      std::unique_lock<std::mutex> lock(storage_write_request_lock_);
      if (storage_write_thread_shutdown_) {
        return;
      }
      if (!storage_write_shader_queue_.empty()) {
        shader = storage_write_shader_queue_.front();
        storage_write_shader_queue_.pop_front();
      } else if (storage_write_flush_shaders_) {
        storage_write_flush_shaders_ = false;
        flush_shaders = true;
      }
      if (!storage_write_pipeline_queue_.empty()) {
        std::memcpy(&pipeline_description,
                    &storage_write_pipeline_queue_.front(),
                    sizeof(pipeline_description));
        storage_write_pipeline_queue_.pop_front();
        write_pipeline = true;
      } else if (storage_write_flush_pipelines_) {
        storage_write_flush_pipelines_ = false;
        flush_pipelines = true;
      }
      if (!shader && !write_pipeline) {
        storage_write_request_cond_.wait(lock);
        continue;
      }
    }

    if (shader) {
      shader_header.ucode_data_hash = shader->ucode_data_hash();
      shader_header.ucode_dword_count = shader->ucode_dword_count();
      shader_header.type = shader->type();
      assert_not_null(shader_storage_file_);
      fwrite(&shader_header, sizeof(shader_header), 1, shader_storage_file_);
      if (shader_header.ucode_dword_count) {
        ucode_guest_endian.resize(shader_header.ucode_dword_count);
        // Need to swap because the hash is calculated for the shader with
        // guest endianness.
        xe::copy_and_swap(ucode_guest_endian.data(), shader->ucode_dwords(),
                          shader_header.ucode_dword_count);
        fwrite(ucode_guest_endian.data(),
               shader_header.ucode_dword_count * sizeof(uint32_t), 1,
               shader_storage_file_);
      }
    }

    if (write_pipeline) {
      assert_not_null(pipeline_storage_file_);
      fwrite(&pipeline_description, sizeof(pipeline_description), 1,
             pipeline_storage_file_);
    }
  }
}

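// Worker body of a pipeline creation thread. thread_index is compared against
// creation_threads_shutdown_from_, so a subset of the pool (indices at or
// above the cutoff) can be shut down without stopping all of the workers.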
void PipelineCache::CreationThread(size_t thread_index) {
  while (true) {
    Pipeline* pipeline_to_create = nullptr;

    // Check whether the thread needs to shut down or to set the completion
    // event, and dequeue a pipeline if there is one.
    {
      std::unique_lock<xe_mutex> lock(creation_request_lock_);
      if (thread_index >= creation_threads_shutdown_from_ ||
          creation_queue_.empty()) {
        if (creation_completion_set_event_ && creation_threads_busy_ == 0) {
          // Last pipeline in the queue created - signal the event if
          // requested.
          creation_completion_set_event_ = false;
          creation_completion_event_->Set();
        }
        if (thread_index >= creation_threads_shutdown_from_) {
          return;
        }
        creation_request_cond_.wait(lock);
        continue;
      }
      // Take the pipeline from the queue and increment the busy thread count
      // until the pipeline is created - other threads must be able to dequeue
      // requests, but can't set the completion event until the pipelines are
      // fully created (rather than just started creating).
      pipeline_to_create = creation_queue_.front();
      creation_queue_.pop_front();
      ++creation_threads_busy_;
    }

    // Create the D3D12 pipeline state object.
    pipeline_to_create->state =
        CreateD3D12Pipeline(pipeline_to_create->description);

    // Pipeline created - the thread is not busy anymore, safe to set the
    // completion event if needed (at the next iteration, or in some other
    // thread).
    {
      std::lock_guard<xe_mutex> lock(creation_request_lock_);
      --creation_threads_busy_;
    }
  }
}

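// Drains the pipeline creation queue on the calling (command processor)
// thread as well, so it contributes work instead of only waiting for the
// creation threads to catch up.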
void PipelineCache::CreateQueuedPipelinesOnProcessorThread() {
  assert_false(creation_threads_.empty());
  while (true) {
    Pipeline* pipeline_to_create;
    {
      std::lock_guard<xe_mutex> lock(creation_request_lock_);
      if (creation_queue_.empty()) {
        break;
      }
      pipeline_to_create = creation_queue_.front();
      creation_queue_.pop_front();
    }
    pipeline_to_create->state =
        CreateD3D12Pipeline(pipeline_to_create->description);
  }
}

}  // namespace d3d12
}  // namespace gpu
}  // namespace xe