rpcsx/rpcs3/Emu/RSX/VK/VKCompute.h

#pragma once
#include "VKHelpers.h"
#include "Utilities/StrUtil.h"

#define VK_MAX_COMPUTE_TASKS 4096   // Max number of jobs per frame

namespace vk
{
	// Base type for every compute job declared in this header.
	// Holds the GLSL source, the lazily-built pipeline, and the descriptor pool/layout objects.
	// Each dispatch allocates one descriptor set from the pool; at most VK_MAX_COMPUTE_TASKS sets
	// may be allocated between two calls to free_resources().
	struct compute_task
	{
		std::string m_src;                                // GLSL compute source; compiled on first load_program()
		vk::glsl::shader m_shader;
		std::unique_ptr<vk::glsl::program> m_program;     // Null until the first load_program()
		std::unique_ptr<vk::buffer> m_param_buffer;

		vk::descriptor_pool m_descriptor_pool;
		VkDescriptorSet m_descriptor_set = nullptr;       // Set allocated by the most recent load_program()
		VkDescriptorSetLayout m_descriptor_layout = nullptr;
		VkPipelineLayout m_pipeline_layout = nullptr;
		u32 m_used_descriptors = 0;                       // Sets allocated since the last free_resources()

		bool initialized = false;
		bool unroll_loops = true;
		bool use_push_constants = false;
		u32 ssbo_count = 1;                               // Number of storage buffer bindings in the default layout
		u32 push_constants_size = 0;                      // Size in bytes of the push constant range (if enabled)
		u32 optimal_group_size = 1;
		u32 optimal_kernel_size = 1;
		u32 max_invocations_x = 65535;                    // Overwritten in create() with maxComputeWorkGroupCount[0]

		// Tasks are owned and deleted through base-class pointers (see g_compute_tasks),
		// so the destructor has to be virtual. Vulkan objects are released by destroy(), not here.
		virtual ~compute_task() = default;

		// Returns the descriptor set contents as (descriptor type, count) pairs.
		// The default is 'ssbo_count' storage buffers; bindings are numbered in order of appearance.
		virtual std::vector<std::pair<VkDescriptorType, u8>> get_descriptor_layout()
		{
			std::vector<std::pair<VkDescriptorType, u8>> result;
			result.emplace_back(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, ssbo_count);
			return result;
		}

		// Creates the descriptor pool, the descriptor set layout and the pipeline layout
		// from get_descriptor_layout() and the push constant settings.
		void init_descriptors()
		{
			std::vector<VkDescriptorPoolSize> descriptor_pool_sizes;
			std::vector<VkDescriptorSetLayoutBinding> bindings;

			const auto layout = get_descriptor_layout();
			for (const auto &e : layout)
			{
				// Pool must be able to serve VK_MAX_COMPUTE_TASKS sets of this layout
				descriptor_pool_sizes.push_back({e.first, u32(VK_MAX_COMPUTE_TASKS * e.second)});

				// One binding slot per descriptor, numbered sequentially across all types
				for (unsigned n = 0; n < e.second; ++n)
				{
					bindings.push_back
					({
						uint32_t(bindings.size()),
						e.first,
						1,
						VK_SHADER_STAGE_COMPUTE_BIT,
						nullptr
					});
				}
			}

			// Reserve descriptor pools
			m_descriptor_pool.create(*get_current_renderer(), descriptor_pool_sizes.data(), ::size32(descriptor_pool_sizes), VK_MAX_COMPUTE_TASKS, 2);

			VkDescriptorSetLayoutCreateInfo infos = {};
			infos.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO;
			infos.pBindings = bindings.data();
			infos.bindingCount = ::size32(bindings);

			CHECK_RESULT(vkCreateDescriptorSetLayout(*get_current_renderer(), &infos, nullptr, &m_descriptor_layout));

			VkPipelineLayoutCreateInfo layout_info = {};
			layout_info.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
			layout_info.setLayoutCount = 1;
			layout_info.pSetLayouts = &m_descriptor_layout;

			// Declared outside the branch so it is still alive when the layout is created
			VkPushConstantRange push_constants{};
			if (use_push_constants)
			{
				push_constants.size = push_constants_size;
				push_constants.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;

				layout_info.pushConstantRangeCount = 1;
				layout_info.pPushConstantRanges = &push_constants;
			}

			CHECK_RESULT(vkCreatePipelineLayout(*get_current_renderer(), &layout_info, nullptr, &m_pipeline_layout));
		}

		// One-time setup: builds descriptor objects and selects per-vendor tuning values.
		// Does nothing if the task is already initialized.
		void create()
		{
			if (!initialized)
			{
				init_descriptors();

				switch (vk::get_driver_vendor())
				{
				case vk::driver_vendor::unknown:
				case vk::driver_vendor::INTEL:
					// Intel hw has 8 threads, but LDS allocation behavior makes optimal group size between 64 and 256
					// Based on intel's own OpenCL recommended settings
					unroll_loops = true;
					optimal_kernel_size = 1;
					optimal_group_size = 128;
					break;
				case vk::driver_vendor::NVIDIA:
					// Warps are multiples of 32. Increasing kernel depth seems to hurt performance (Nier, Big Duck sample)
					unroll_loops = true;
					optimal_group_size = 32;
					optimal_kernel_size = 1;
					break;
				case vk::driver_vendor::AMD:
				case vk::driver_vendor::RADV:
					// Wavefronts are multiples of 64
					unroll_loops = false;
					optimal_kernel_size = 1;
					optimal_group_size = 64;
					break;
				}

				const auto& gpu = vk::get_current_renderer()->gpu();
				max_invocations_x = gpu.get_limits().maxComputeWorkGroupCount[0];

				initialized = true;
			}
		}

		// Releases the shader, pipeline, parameter buffer and descriptor objects.
		// Does nothing if the task is not initialized.
		void destroy()
		{
			if (initialized)
			{
				m_shader.destroy();
				m_program.reset();
				m_param_buffer.reset();

				vkDestroyDescriptorSetLayout(*get_current_renderer(), m_descriptor_layout, nullptr);
				vkDestroyPipelineLayout(*get_current_renderer(), m_pipeline_layout, nullptr);
				m_descriptor_pool.destroy();

				initialized = false;
			}
		}

		// Recycles all descriptor sets handed out so far by resetting the pool.
		void free_resources()
		{
			if (m_used_descriptors == 0)
				return;

			m_descriptor_pool.reset(0);
			m_used_descriptors = 0;
		}

		// Hook: write the buffers for the upcoming dispatch into m_descriptor_set.
		virtual void bind_resources()
		{}

		// Hook: invoked once, right after the pipeline object has been created.
		virtual void declare_inputs()
		{}

		// Compiles the pipeline on first use, allocates a fresh descriptor set, lets the
		// derived type bind its buffers, then binds pipeline and descriptor set on 'cmd'.
		void load_program(VkCommandBuffer cmd)
		{
			if (!m_program)
			{
				m_shader.create(::glsl::program_domain::glsl_compute_program, m_src);
				auto handle = m_shader.compile();

				VkPipelineShaderStageCreateInfo shader_stage{};
				shader_stage.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
				shader_stage.stage = VK_SHADER_STAGE_COMPUTE_BIT;
				shader_stage.module = handle;
				shader_stage.pName = "main";

				VkComputePipelineCreateInfo info{};
				info.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
				info.stage = shader_stage;
				info.layout = m_pipeline_layout;
				info.basePipelineIndex = -1;
				info.basePipelineHandle = VK_NULL_HANDLE;

				// Check the result like every other Vulkan object creation in this type;
				// otherwise a failed compile would leave 'pipeline' uninitialized
				VkPipeline pipeline;
				CHECK_RESULT(vkCreateComputePipelines(*get_current_renderer(), nullptr, 1, &info, nullptr, &pipeline));

				m_program = std::make_unique<vk::glsl::program>(*get_current_renderer(), pipeline);
				declare_inputs();
			}

			verify(HERE), m_used_descriptors < VK_MAX_COMPUTE_TASKS;

			VkDescriptorSetAllocateInfo alloc_info = {};
			alloc_info.descriptorPool = m_descriptor_pool;
			alloc_info.descriptorSetCount = 1;
			alloc_info.pSetLayouts = &m_descriptor_layout;
			alloc_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;

			CHECK_RESULT(vkAllocateDescriptorSets(*get_current_renderer(), &alloc_info, &m_descriptor_set));
			m_used_descriptors++;

			bind_resources();

			vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, m_program->pipeline);
			vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, m_pipeline_layout, 0, 1, &m_descriptor_set, 0, nullptr);
		}

		// Binds the program and records a dispatch with the given workgroup counts.
		void run(VkCommandBuffer cmd, u32 invocations_x, u32 invocations_y, u32 invocations_z)
		{
			load_program(cmd);
			vkCmdDispatch(cmd, invocations_x, invocations_y, invocations_z);
		}

		// Records a dispatch of at least 'num_invocations' workgroups.
		// Counts above max_invocations_x are folded into a 2D grid; the grid may contain a few
		// surplus groups, so kernels are expected to linearize (y * width + x) their id.
		void run(VkCommandBuffer cmd, u32 num_invocations)
		{
			u32 invocations_x, invocations_y;
			if (num_invocations > max_invocations_x)
			{
				// AMD hw reports an annoyingly small maximum number of invocations in the X dimension
				// Split the 1D job into 2 dimensions to accommodate this
				invocations_x = static_cast<u32>(floor(std::sqrt(num_invocations)));

				// Round the row count up so that x * y always covers the whole job.
				// Starting from y = x and adding at most one row can fall short, e.g. 70000 -> 264 * 265 = 69960
				invocations_y = num_invocations / invocations_x;
				if (num_invocations % invocations_x) invocations_y++;
			}
			else
			{
				invocations_x = num_invocations;
				invocations_y = 1;
			}

			run(cmd, invocations_x, invocations_y, 1);
		}
	};

	// Common base for kernels that rewrite a buffer of 32-bit words in place.
	// build() assembles the GLSL source from a fixed prologue plus the 'variables', 'work_kernel',
	// 'loop_advance' and 'suffix' fragments; derived types may replace the fragments before calling it.
	struct cs_shuffle_base : compute_task
	{
		// Buffer bound for the current dispatch. Starts out null like the buffer pointers of the sibling tasks
		const vk::buffer* m_data = nullptr;
		u32 m_data_offset = 0;
		u32 m_data_length = 0;
		u32 kernel_size = 1;   // Words processed per shader invocation (KERNEL_SIZE in the shader)

		std::string variables, work_kernel, loop_advance, suffix;
		std::string method_declarations;

		// Installs the default fragments: apply %f to one word, advance by one word, close main()
		cs_shuffle_base()
		{
			work_kernel =
				"		value = data[index];\n"
				"		data[index] = %f(value);\n";

			loop_advance =
				"		index++;\n";

			suffix =
				"}\n";
		}

		// Generates m_src.
		// function_name: text substituted for %f in the work kernel (a macro from the prologue, or empty)
		// _kernel_size: words per invocation; 0 selects optimal_kernel_size
		void build(const char* function_name, u32 _kernel_size = 0)
		{
			// Initialize to allow detecting optimal settings
			create();

			kernel_size = _kernel_size? _kernel_size : optimal_kernel_size;

			m_src =
				"#version 430\n"
				"layout(local_size_x=%ws, local_size_y=1, local_size_z=1) in;\n"
				"layout(std430, set=0, binding=0) buffer ssbo{ uint data[]; };\n"
				"%ub"
				"\n"
				"#define KERNEL_SIZE %ks\n"
				"\n"
				"// Generic swap routines\n"
				"#define bswap_u16(bits)     (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n"
				"#define bswap_u32(bits)     (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n"
				"#define bswap_u16_u32(bits) (bits & 0xFFFF) << 16 | (bits & 0xFFFF0000) >> 16\n"
				"\n"
				"// Depth format conversions\n"
				"#define d24_to_f32(bits)             floatBitsToUint(float(bits) / 16777215.f)\n"
				"#define f32_to_d24(bits)             uint(uintBitsToFloat(bits) * 16777215.f)\n"
				"#define d24x8_to_f32(bits)           d24_to_f32(bits >> 8)\n"
				"#define d24x8_to_d24x8_swapped(bits) (bits & 0xFF00) | (bits & 0xFF0000) >> 16 | (bits & 0xFF) << 16\n"
				"#define f32_to_d24x8_swapped(bits)   d24x8_to_d24x8_swapped(f32_to_d24(bits))\n"
				"\n"
				"void main()\n"
				"{\n"
				"	uint invocations_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);"
				"	uint invocation_id = (gl_GlobalInvocationID.y * invocations_x) + gl_GlobalInvocationID.x;\n"
				"	uint index = invocation_id * KERNEL_SIZE;\n"
				"	uint value;\n"
				"	%vars"
				"\n";

			// Push constants are exposed to the shader as an array of uvec4, hence the 16-byte granularity
			const auto parameters_size = align(push_constants_size, 16) / 16;
			const std::pair<std::string, std::string> syntax_replace[] =
			{
				{ "%ws", std::to_string(optimal_group_size) },
				{ "%ks", std::to_string(kernel_size) },
				{ "%vars", variables },
				{ "%f", function_name },
				{ "%ub", use_push_constants? "layout(push_constant) uniform ubo{ uvec4 params[" + std::to_string(parameters_size) + "]; };\n" : "" },
			};

			m_src = fmt::replace_all(m_src, syntax_replace);
			work_kernel = fmt::replace_all(work_kernel, syntax_replace);

			if (kernel_size <= 1)
			{
				// Single word per invocation: no loop and no index advance needed
				m_src += "	{\n" + work_kernel + "	}\n";
			}
			else if (unroll_loops)
			{
				work_kernel += loop_advance + "\n";

				m_src += std::string
				(
					"	//Unrolled loop\n"
					"	{\n"
				);

				// Assemble body with manual loop unroll to try lowering GPR usage
				for (u32 n = 0; n < kernel_size; ++n)
				{
					m_src += work_kernel;
				}

				m_src += "	}\n";
			}
			else
			{
				m_src += "	for (int loop = 0; loop < KERNEL_SIZE; ++loop)\n";
				m_src += "	{\n";
				m_src += work_kernel;
				m_src += loop_advance;
				m_src += "	}\n";
			}

			m_src += suffix;
		}

		// Binds [m_data_offset, m_data_offset + m_data_length) of m_data as the storage buffer at binding 0
		void bind_resources() override
		{
			m_program->bind_buffer({ m_data->value, m_data_offset, m_data_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
		}

		// Records a push constant update of 'count' 32-bit words starting at offset 0.
		// Only valid for tasks that enabled use_push_constants.
		void set_parameters(VkCommandBuffer cmd, const u32* params, u8 count)
		{
			verify(HERE), use_push_constants;
			vkCmdPushConstants(cmd, m_pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, count * 4, params);
		}

		// Dispatches the kernel over 'data_length' bytes of 'data' starting at 'data_offset'.
		// The length is rounded up to whole workgroups; an error is logged when the rounded
		// range does not fit inside the buffer, but the dispatch is still recorded.
		void run(VkCommandBuffer cmd, const vk::buffer* data, u32 data_length, u32 data_offset = 0)
		{
			m_data = data;
			m_data_offset = data_offset;
			m_data_length = data_length;

			// Bytes covered by one workgroup: group size * words per invocation * 4 bytes per word
			const auto num_bytes_per_invocation = optimal_group_size * kernel_size * 4;
			const auto num_bytes_to_process = rsx::align2(data_length, num_bytes_per_invocation);
			const auto num_invocations = num_bytes_to_process / num_bytes_per_invocation;

			if ((num_bytes_to_process + data_offset) > data->size())
			{
				// Technically robust buffer access should keep the driver from crashing in OOB situations
				LOG_ERROR(RSX, "Inadequate buffer length submitted for a compute operation."
					"Required=%d bytes, Available=%d bytes", num_bytes_to_process, data->size());
			}

			compute_task::run(cmd, num_invocations);
		}
	};

	struct cs_shuffle_16 : cs_shuffle_base
	{
		// byteswap ushort
		cs_shuffle_16()
		{
			cs_shuffle_base::build("bswap_u16");
		}
	};

	struct cs_shuffle_32 : cs_shuffle_base
	{
		// byteswap_ulong
		cs_shuffle_32()
		{
			cs_shuffle_base::build("bswap_u32");
		}
	};

	struct cs_shuffle_32_16 : cs_shuffle_base
	{
		// byteswap_ulong + byteswap_ushort
		cs_shuffle_32_16()
		{
			cs_shuffle_base::build("bswap_u16_u32");
		}
	};

	struct cs_shuffle_d24x8_f32 : cs_shuffle_base
	{
		// convert d24x8 to f32
		cs_shuffle_d24x8_f32()
		{
			cs_shuffle_base::build("d24x8_to_f32");
		}
	};

	struct cs_shuffle_se_f32_d24x8 : cs_shuffle_base
	{
		// convert f32 to d24x8 and swap endianness
		cs_shuffle_se_f32_d24x8()
		{
			cs_shuffle_base::build("f32_to_d24x8_swapped");
		}
	};

	struct cs_shuffle_se_d24x8 : cs_shuffle_base
	{
		// swap endianness of d24x8
		cs_shuffle_se_d24x8()
		{
			cs_shuffle_base::build("d24x8_to_d24x8_swapped");
		}
	};

	// NOTE: D24S8 layout has the stencil in the MSB! It's actually S8|D24|S8|D24 starting at offset 0
	// Shared base for the gather/scatter kernels that move data between a packed depth-stencil
	// block and separate depth and stencil regions of the same buffer.
	// Push constants (params[0]): x = block length in bytes, y = depth region offset, z = stencil region
	// offset; both offsets are in bytes relative to the start of the bound range.
	struct cs_interleave_task : cs_shuffle_base
	{
		// Size in bytes of the buffer range bound for the dispatch
		u32 m_ssbo_length = 0;

		cs_interleave_task()
		{
			push_constants_size = 16;
			use_push_constants = true;

			// Shader-side locals; byte quantities from the push constants are converted to word indices
			variables =
				"	uint block_length = params[0].x >> 2;\n"
				"	uint z_offset = params[0].y >> 2;\n"
				"	uint s_offset = params[0].z >> 2;\n"
				"	uint depth;\n"
				"	uint stencil;\n"
				"	uint stencil_shift;\n"
				"	uint stencil_offset;\n";
		}

		// Binds m_ssbo_length bytes instead of m_data_length so the stencil region is reachable
		void bind_resources() override
		{
			m_program->bind_buffer({ m_data->value, m_data_offset, m_ssbo_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
		}

		// Pushes the kernel parameters, then dispatches over 'data_length' bytes at 'data_offset'.
		// zeta_offset / stencil_offset are absolute byte offsets into 'data'.
		void run(VkCommandBuffer cmd, const vk::buffer* data, u32 data_offset, u32 data_length, u32 zeta_offset, u32 stencil_offset)
		{
			// Rebase the region offsets onto the start of the bound range
			const u32 depth_rel = zeta_offset - data_offset;
			const u32 stencil_rel = stencil_offset - data_offset;

			const u32 push_data[4] = { data_length, depth_rel, stencil_rel, 0 };
			set_parameters(cmd, push_data, 4);

			// Bound range ends after the stencil region, which holds one byte per 32-bit input word
			m_ssbo_length = stencil_rel + (data_length / 4);
			cs_shuffle_base::run(cmd, data, data_length, data_offset);
		}
	};

	// Packs separate depth and stencil regions into one word per texel: (depth << 8) | stencil.
	// Depth is the low 24 bits of data[index + z_offset]; stencil is byte (index % 4) of
	// data[index / 4 + s_offset]. With _SwapBytes the packed word is stored byte-reversed.
	template<bool _SwapBytes = false>
	struct cs_gather_d24x8 : cs_interleave_task
	{
		cs_gather_d24x8()
		{
			// Final store of the packed word, chosen at compile time
			const char* store_op = _SwapBytes ?
				"		data[index] = bswap_u32(value);\n" :
				"		data[index] = value;\n";

			work_kernel =
				"		if (index >= block_length)\n"
				"			return;\n"
				"\n"
				"		depth = data[index + z_offset] & 0x00FFFFFF;\n"
				"		stencil_offset = (index / 4);\n"
				"		stencil_shift = (index % 4) * 8;\n"
				"		stencil = data[stencil_offset + s_offset];\n"
				"		stencil = (stencil >> stencil_shift) & 0xFF;\n"
				"		value = (depth << 8) | stencil;\n";
			work_kernel += store_op;

			// The kernel does not use %f, so no transform name is needed
			this->build("");
		}
	};

	// Packs separate float-depth and stencil regions into one word per texel: (depth << 8) | stencil.
	// Depth is data[index + z_offset] converted with f32_to_d24; stencil is byte (index % 4) of
	// data[index / 4 + s_offset]. With _SwapBytes the packed word is stored byte-reversed.
	template<bool _SwapBytes = false>
	struct cs_gather_d32x8 : cs_interleave_task
	{
		cs_gather_d32x8()
		{
			// Final store of the packed word, chosen at compile time
			const char* store_op = _SwapBytes ?
				"		data[index] = bswap_u32(value);\n" :
				"		data[index] = value;\n";

			work_kernel =
				"		if (index >= block_length)\n"
				"			return;\n"
				"\n"
				"		depth = f32_to_d24(data[index + z_offset]);\n"
				"		stencil_offset = (index / 4);\n"
				"		stencil_shift = (index % 4) * 8;\n"
				"		stencil = data[stencil_offset + s_offset];\n"
				"		stencil = (stencil >> stencil_shift) & 0xFF;\n"
				"		value = (depth << 8) | stencil;\n";
			work_kernel += store_op;

			// The kernel does not use %f, so no transform name is needed
			this->build("");
		}
	};

	// Splits packed depth-stencil words into separate regions of the same buffer:
	// the upper 24 bits of data[index] go to data[index + z_offset], the low byte is merged
	// into byte (index % 4) of data[index / 4 + s_offset].
	// NOTE(review): the stencil bytes are OR-ed in, so the stencil region is presumably expected
	// to be zeroed before the dispatch - confirm with callers.
	struct cs_scatter_d24x8 : cs_interleave_task
	{
		cs_scatter_d24x8()
		{
			// Four neighbouring invocations target the same stencil word, so the merge has to be
			// atomic; a plain read-modify-write (|=) can lose the bytes written by the other three
			work_kernel =
				"		if (index >= block_length)\n"
				"			return;\n"
				"\n"
				"		value = data[index];\n"
				"		data[index + z_offset] = (value >> 8);\n"
				"		stencil_offset = (index / 4);\n"
				"		stencil_shift = (index % 4) * 8;\n"
				"		stencil = (value & 0xFF) << stencil_shift;\n"
				"		atomicOr(data[stencil_offset + s_offset], stencil);\n";

			cs_shuffle_base::build("");
		}
	};

	// Splits packed depth-stencil words into separate regions of the same buffer:
	// the upper 24 bits of data[index] are converted with d24_to_f32 and written to
	// data[index + z_offset], the low byte is merged into byte (index % 4) of data[index / 4 + s_offset].
	// NOTE(review): the stencil bytes are OR-ed in, so the stencil region is presumably expected
	// to be zeroed before the dispatch - confirm with callers.
	struct cs_scatter_d32x8 : cs_interleave_task
	{
		cs_scatter_d32x8()
		{
			// Four neighbouring invocations target the same stencil word, so the merge has to be
			// atomic; a plain read-modify-write (|=) can lose the bytes written by the other three
			work_kernel =
				"		if (index >= block_length)\n"
				"			return;\n"
				"\n"
				"		value = data[index];\n"
				"		data[index + z_offset] = d24_to_f32(value >> 8);\n"
				"		stencil_offset = (index / 4);\n"
				"		stencil_shift = (index % 4) * 8;\n"
				"		stencil = (value & 0xFF) << stencil_shift;\n"
				"		atomicOr(data[stencil_offset + s_offset], stencil);\n";

			cs_shuffle_base::build("");
		}
	};

	// Reverse morton-order block arrangement
	// Type-erased interface of the deswizzle kernels so callers can hold any block-type
	// specialization behind one pointer.
	struct cs_deswizzle_base : compute_task
	{
		// Records a deswizzle of 'data_length' bytes from 'src' (at in_offset) into 'dst' (at out_offset)
		// for an image of the given width/height/depth with 'mipmaps' levels.
		virtual void run(
			VkCommandBuffer cmd,
			const vk::buffer* dst, u32 out_offset,
			const vk::buffer* src, u32 in_offset,
			u32 data_length,
			u32 width, u32 height, u32 depth, u32 mipmaps) = 0;
	};

	// Deswizzle kernel: copies texels from a morton-ordered source buffer into a linearly
	// ordered destination buffer, optionally byte-swapping each 32-bit word on the way.
	//   _BlockType - one texel; its size must be a multiple of 4 bytes and sets the per-texel word count
	//   _BaseType  - element type inside a texel; its size (4 or 2) selects bswap_u32 or bswap_u16
	//   _SwapBytes - when false, words are copied unchanged
	template <typename _BlockType, typename _BaseType, bool _SwapBytes>
	struct cs_deswizzle_3d : cs_deswizzle_base
	{
		// Push constant block; the field order mirrors the 'parameters' uniform block in the shader.
		// 'data' aliases the named fields so all 7 words can be pushed in one call.
		union params_t
		{
			u32 data[7];

			struct
			{
				u32 width;
				u32 height;
				u32 depth;
				u32 logw;
				u32 logh;
				u32 logd;
				u32 mipmaps;
			};
		}
		params;

		// Inputs of the dispatch being recorded; written by run() and read by bind_resources()
		const vk::buffer* src_buffer = nullptr;
		const vk::buffer* dst_buffer = nullptr;
		u32 in_offset = 0;
		u32 out_offset = 0;
		u32 block_length = 0;

		// Configures two storage buffers plus 28 bytes of push constants and generates the shader source
		cs_deswizzle_3d()
		{
			// Texels are moved as whole 32-bit words
			verify("Unsupported block type" HERE), (sizeof(_BlockType) & 3) == 0;

			ssbo_count = 2;
			use_push_constants = true;
			push_constants_size = 28;

			// Must run before the source is generated: it selects optimal_group_size
			create();

			// Shader outline: one invocation per texel. init_invocation_properties() finds the mip level
			// containing the linear texel id, get_z_index() interleaves the x/y/z bits to locate the
			// source texel, and main() copies the texel's words from data_in to data_out through %f.
			m_src =
			"#version 450\n"
			"layout(local_size_x = %ws, local_size_y = 1, local_size_z = 1) in;\n\n"

			"layout(set=0, binding=0, std430) buffer ssbo0{ uint data_in[]; };\n"
			"layout(set=0, binding=1, std430) buffer ssbo1{ uint data_out[]; };\n"
			"layout(push_constant) uniform parameters\n"
			"{\n"
			"	uint image_width;\n"
			"	uint image_height;\n"
			"	uint image_depth;\n"
			"	uint image_logw;\n"
			"	uint image_logh;\n"
			"	uint image_logd;\n"
			"	uint lod_count;\n"
			"};\n\n"

			"struct invocation_properties\n"
			"{\n"
			"	uint data_offset;\n"
			"	uvec3 size;\n"
			"	uvec3 size_log2;\n"
			"};\n\n"

			"#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n"
			"#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n\n"

			"invocation_properties invocation;\n\n"

			"bool init_invocation_properties(const in uint offset)\n"
			"{\n"
			"	invocation.data_offset = 0;\n"
			"	invocation.size.x = image_width;\n"
			"	invocation.size.y = image_height;\n"
			"	invocation.size.z = image_depth;\n"
			"	invocation.size_log2.x = image_logw;\n"
			"	invocation.size_log2.y = image_logh;\n"
			"	invocation.size_log2.z = image_logd;\n"
			"	uint level_end = image_width * image_height * image_depth;\n"
			"	uint level = 1;\n\n"

			"	while (offset >= level_end && level < lod_count)\n"
			"	{\n"
			"		invocation.data_offset = level_end;\n"
			"		invocation.size.xy /= 2;\n"
			"		invocation.size.xy = max(invocation.size.xy, uvec2(1));\n"
			"		invocation.size_log2.xy = max(invocation.size_log2.xy, uvec2(1));\n"
			"		invocation.size_log2.xy --;\n"
			"		level_end += (invocation.size.x * invocation.size.y * image_depth);\n"
			"		level++;"
			"	}\n\n"

			"	return (offset < level_end);\n"
			"}\n\n"

			"uint get_z_index(const in uint x_, const in uint y_, const in uint z_)\n"
			"{\n"
			"	uint offset = 0;\n"
			"	uint shift = 0;\n"
			"	uint x = x_;\n"
			"	uint y = y_;\n"
			"	uint z = z_;\n"
			"	uint log2w = invocation.size_log2.x;\n"
			"	uint log2h = invocation.size_log2.y;\n"
			"	uint log2d = invocation.size_log2.z;\n"
			"\n"
			"	do\n"
			"	{\n"
			"		if (log2w > 0)\n"
			"		{\n"
			"			offset |= (x & 1) << shift;\n"
			"			shift++;\n"
			"			x >>= 1;\n"
			"			log2w--;\n"
			"		}\n"
			"\n"
			"		if (log2h > 0)\n"
			"		{\n"
			"			offset |= (y & 1) << shift;\n"
			"			shift++;\n"
			"			y >>= 1;\n"
			"			log2h--;\n"
			"		}\n"
			"\n"
			"		if (log2d > 0)\n"
			"		{\n"
			"			offset |= (z & 1) << shift;\n"
			"			shift++;\n"
			"			z >>= 1;\n"
			"			log2d--;\n"
			"		}\n"
			"	}\n"
			"	while(x > 0 || y > 0 || z > 0);\n"
			"\n"
			"	return offset;\n"
			"}\n\n"

			"void main()\n"
			"{\n"
			"	uint invocations_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);"
			"	uint texel_id = (gl_GlobalInvocationID.y * invocations_x) + gl_GlobalInvocationID.x;\n"
			"	uint word_count = %_wordcount;\n\n"

			"	if (!init_invocation_properties(texel_id))\n"
			"		return;\n\n"

			"	// Calculations done in texels, not bytes\n"
			"	uint row_length = invocation.size.x;\n"
			"	uint slice_length = (invocation.size.y * row_length);\n"
			"	uint level_offset = (texel_id - invocation.data_offset);\n"
			"	uint slice_offset = (level_offset % slice_length);\n"
			"	uint z = (level_offset / slice_length);\n"
			"	uint y = (slice_offset / row_length);\n"
			"	uint x = (slice_offset % row_length);\n\n"

			"	uint src_texel_id = get_z_index(x, y, z);\n"
			"	uint dst_id = (texel_id * word_count);\n"
			"	uint src_id = (src_texel_id + invocation.data_offset) * word_count;\n\n"

			"	for (uint i = 0; i < word_count; ++i)\n"
			"	{\n"
			"		uint value = data_in[src_id++];\n"
			"		data_out[dst_id++] = %f(value);\n"
			"	}\n\n"

			"}\n";

			// Name of the per-word transform macro. Left empty when no swap is requested,
			// in which case "%f(value)" collapses to "(value)"
			std::string transform;
			if constexpr (_SwapBytes)
			{
				if constexpr (sizeof(_BaseType) == 4)
				{
					transform = "bswap_u32";
				}
				else if constexpr (sizeof(_BaseType) == 2)
				{
					transform = "bswap_u16";
				}
				else
				{
					fmt::throw_exception("Unreachable" HERE);
				}
			}

			const std::pair<std::string, std::string> syntax_replace[] =
			{
				{ "%ws", std::to_string(optimal_group_size) },
				{ "%_wordcount", std::to_string(sizeof(_BlockType) / 4) },
				{ "%f", transform }
			};

			m_src = fmt::replace_all(m_src, syntax_replace);
		}

		// Binding 0 = source range, binding 1 = destination range; both are block_length bytes long
		void bind_resources() override
		{
			m_program->bind_buffer({ src_buffer->value, in_offset, block_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
			m_program->bind_buffer({ dst_buffer->value, out_offset, block_length }, 1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
		}

		// Records a push constant update with the current contents of 'params'
		void set_parameters(VkCommandBuffer cmd)
		{
			vkCmdPushConstants(cmd, m_pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, push_constants_size, params.data);
		}

		// Stores the buffers and offsets for bind_resources(), pushes the image dimensions together
		// with their ceil_log2 values, then dispatches enough workgroups for data_length bytes.
		void run(VkCommandBuffer cmd, const vk::buffer* dst, u32 out_offset, const vk::buffer* src, u32 in_offset, u32 data_length, u32 width, u32 height, u32 depth, u32 mipmaps) override
		{
			dst_buffer = dst;
			src_buffer = src;

			// Parameters shadow the members of the same name, hence the explicit this->
			this->in_offset = in_offset;
			this->out_offset = out_offset;
			this->block_length = data_length;

			params.width = width;
			params.height = height;
			params.depth = depth;
			params.mipmaps = mipmaps;
			params.logw = rsx::ceil_log2(width);
			params.logh = rsx::ceil_log2(height);
			params.logd = rsx::ceil_log2(depth);
			set_parameters(cmd);

			// One invocation handles one texel, so a workgroup covers sizeof(_BlockType) * group size bytes
			// NOTE(review): aligned_div presumably rounds the quotient up - confirm against its definition
			const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size);
			const u32 linear_invocations = aligned_div(data_length, num_bytes_per_invocation);
			compute_task::run(cmd, linear_invocations);
		}
	};

	// Adds every 32-bit word of a source buffer into the first word of a destination buffer
	// using atomicAdd, one shader invocation per source word.
	// NOTE(review): the destination word is only ever added to, so it is presumably expected to
	// be initialized by the caller - confirm.
	struct cs_aggregator : compute_task
	{
		const buffer* src = nullptr;
		const buffer* dst = nullptr;
		u32 block_length = 0;   // Size in bytes of the bound source range
		u32 word_count = 0;     // Number of source words of the current dispatch

		// Configures two storage buffers (source, result) and generates the shader source
		cs_aggregator()
		{
			ssbo_count = 2;

			// Must run before the source is generated: it selects optimal_group_size
			create();

			// The invocation id is linearized over x and y exactly like in the other kernels of this file.
			// compute_task::run() folds large jobs into a 2D grid, and indexing by x alone would then
			// add the first rows' words several times and skip the rest. For 1D dispatches y is 0 and
			// the id equals gl_GlobalInvocationID.x.
			m_src =
				"#version 450\n"
				"layout(local_size_x = %ws, local_size_y = 1, local_size_z = 1) in;\n\n"

				"layout(set=0, binding=0, std430) readonly buffer ssbo0{ uint src[]; };\n"
				"layout(set=0, binding=1, std430) writeonly buffer ssbo1{ uint result; };\n\n"

				"void main()\n"
				"{\n"
				"	uint invocations_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);\n"
				"	uint index = (gl_GlobalInvocationID.y * invocations_x) + gl_GlobalInvocationID.x;\n"
				"	if (index < src.length())\n"
				"	{\n"
				"		atomicAdd(result, src[index]);\n"
				"	}\n"
				"}\n";

			const std::pair<std::string, std::string> syntax_replace[] =
			{
				{ "%ws", std::to_string(optimal_group_size) },
			};

			m_src = fmt::replace_all(m_src, syntax_replace);
		}

		// Binding 0 = first block_length bytes of src, binding 1 = first 4 bytes of dst
		void bind_resources() override
		{
			m_program->bind_buffer({ src->value, 0, block_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
			m_program->bind_buffer({ dst->value, 0, 4 }, 1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
		}

		// Records a dispatch that sums the first 'num_words' words of 'src' into 'dst'
		void run(VkCommandBuffer cmd, const vk::buffer* dst, const vk::buffer* src, u32 num_words)
		{
			this->dst = dst;
			this->src = src;
			word_count = num_words;
			block_length = num_words * 4;

			// One invocation per word
			// NOTE(review): aligned_div presumably rounds the quotient up - confirm against its definition
			const u32 linear_invocations = aligned_div(word_count, optimal_group_size);
			compute_task::run(cmd, linear_invocations);
		}
	};

	// TODO: Replace with a proper manager
	extern std::unordered_map<u32, std::unique_ptr<vk::compute_task>> g_compute_tasks;

	template<class T>
	T* get_compute_task()
	{
		u32 index = id_manager::typeinfo::get_index<T>();
		auto &e = g_compute_tasks[index];

		if (!e)
		{
			e = std::make_unique<T>();
			e->create();
		}

		return static_cast<T*>(e.get());
	}

	void reset_compute_tasks();
}
-												vk: bump max number of compute jobs from 120 to 1024
- It is possible without bugs to have a very high number of compute invocations.

											
										
										
											2018-12-12 10:24:33 +01:00
+								#pragma once
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
+								#include "VKHelpers.h"
-												gl/vk: Add constexpr to varying_registers and sync functions between the two backends

											
										
										
											2019-06-09 09:03:27 +02:00
+								#include "Utilities/StrUtil.h"
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
-												vk: Implement layer batching for the GPU swizzle decoder

- Handles all LODs per layer meaning cubemaps are now fully handled in 6 passes instead of 6 * (log2(width)) passes.
- Handles all LODs of a 3D texture in one pass as well.
- The improvements do warrant dropping down the number of allowed compute invocations a bit

											
										
										
											2019-11-05 15:00:07 +01:00
+								#define VK_MAX_COMPUTE_TASKS 4096   // Max number of jobs per frame
-												vk: bump max number of compute jobs from 120 to 1024
- It is possible without bugs to have a very high number of compute invocations.

											
										
										
											2018-12-12 10:24:33 +01:00
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
+								namespace vk
 								{
 									struct compute_task
 									{
 										std::string m_src;
 										vk::glsl::shader m_shader;
 										std::unique_ptr<vk::glsl::program> m_program;
-												vk: Implement copy-to-buffer and copy-from-buffer for depth_stencil
formats
- Allows D24S8 and D32S8 transport via typeless channels
- Allows uploading and downloading D24S8 data easily
- TODO: Implement optional byteswapping to fix flushed readbacks with
the same method

											
										
										
											2019-04-02 14:16:52 +02:00
+										std::unique_ptr<vk::buffer> m_param_buffer;
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
 										vk::descriptor_pool m_descriptor_pool;
 										VkDescriptorSet m_descriptor_set = nullptr;
 										VkDescriptorSetLayout m_descriptor_layout = nullptr;
 										VkPipelineLayout m_pipeline_layout = nullptr;
 										u32 m_used_descriptors = 0;
 										bool initialized = false;
-												vk: Tuning [WIP]
- Unroll main compute queue loop
- Do NOT run GPU cores on mappable memory! This has a dreadful impact on performance for obvious reasons
- Enable dynamic SSBO indexing (affects AMD)
- Make loop unrolling and loop length variable depending on hardware and find optimum

											
										
										
											2018-06-23 14:15:55 +02:00
+										bool unroll_loops = true;
-												vk: Enable gpu deswizzling

											
										
										
											2019-10-29 13:21:53 +01:00
+										bool use_push_constants = false;
-												rsx: Set up framework for hw deswizzle

											
										
										
											2019-10-29 13:13:10 +01:00
+										u32 ssbo_count = 1;
-												vk: Enable gpu deswizzling

											
										
										
											2019-10-29 13:21:53 +01:00
+										u32 push_constants_size = 0;
-												vk: Tuning [WIP]
- Unroll main compute queue loop
- Do NOT run GPU cores on mappable memory! This has a dreadful impact on performance for obvious reasons
- Enable dynamic SSBO indexing (affects AMD)
- Make loop unrolling and loop length variable depending on hardware and find optimum

											
										
										
											2018-06-23 14:15:55 +02:00
+										u32 optimal_group_size = 1;
 										u32 optimal_kernel_size = 1;
-												vk: Compute kernel fixups

- Adhere to workgroup count limits as exposed by the GPU vendor.
  They already execute properly even when going beyond the limits but this removes validation noise.
- Fix invocation counts for deswizzle kernel. The count was incorrect if blocksize was not 4, causing a bunch of useless work to be done.

											
										
										
											2019-11-05 15:03:25 +01:00
+										u32 max_invocations_x = 65535;
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
-												rsx: Enable MSAA

- vk: Enable depth buffer resolve+unresolve
- vk: Add AMD stenciling extension support
- rsx: Temporarily disables MSAA-compatible hacks such as transparency AA
- TODO: Add paths to optionally disable MSAA

											
										
										
											2019-05-30 17:38:18 +02:00
+										virtual std::vector<std::pair<VkDescriptorType, u8>> get_descriptor_layout()
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
+										{
-												rsx: Enable MSAA

- vk: Enable depth buffer resolve+unresolve
- vk: Add AMD stenciling extension support
- rsx: Temporarily disables MSAA-compatible hacks such as transparency AA
- TODO: Add paths to optionally disable MSAA

											
										
										
											2019-05-30 17:38:18 +02:00
+											std::vector<std::pair<VkDescriptorType, u8>> result;
-												rsx: Set up framework for hw deswizzle

											
										
										
											2019-10-29 13:13:10 +01:00
+											result.emplace_back(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, ssbo_count);
-												rsx: Enable MSAA

- vk: Enable depth buffer resolve+unresolve
- vk: Add AMD stenciling extension support
- rsx: Temporarily disables MSAA-compatible hacks such as transparency AA
- TODO: Add paths to optionally disable MSAA

											
										
										
											2019-05-30 17:38:18 +02:00
+											return result;
 										}
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
-												rsx: Enable MSAA

- vk: Enable depth buffer resolve+unresolve
- vk: Add AMD stenciling extension support
- rsx: Temporarily disables MSAA-compatible hacks such as transparency AA
- TODO: Add paths to optionally disable MSAA

											
										
										
											2019-05-30 17:38:18 +02:00
+										void init_descriptors()
 										{
 											std::vector<VkDescriptorPoolSize> descriptor_pool_sizes;
 											std::vector<VkDescriptorSetLayoutBinding> bindings;
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
-												rsx: Enable MSAA

- vk: Enable depth buffer resolve+unresolve
- vk: Add AMD stenciling extension support
- rsx: Temporarily disables MSAA-compatible hacks such as transparency AA
- TODO: Add paths to optionally disable MSAA

											
										
										
											2019-05-30 17:38:18 +02:00
+											const auto layout = get_descriptor_layout();
 											for (const auto &e : layout)
 											{
 												descriptor_pool_sizes.push_back({e.first, u32(VK_MAX_COMPUTE_TASKS * e.second)});
 												for (unsigned n = 0; n < e.second; ++n)
 												{
 													bindings.push_back
 													({
 														uint32_t(bindings.size()),
 														e.first,
 ,
 														VK_SHADER_STAGE_COMPUTE_BIT,
 														nullptr
 													});
 												}
 											}
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
-												rsx: Enable MSAA

- vk: Enable depth buffer resolve+unresolve
- vk: Add AMD stenciling extension support
- rsx: Temporarily disables MSAA-compatible hacks such as transparency AA
- TODO: Add paths to optionally disable MSAA

											
										
										
											2019-05-30 17:38:18 +02:00
+											// Reserve descriptor pools
-												C-style cast cleanup VI

											
										
										
											2019-12-03 23:34:23 +01:00
+											m_descriptor_pool.create(*get_current_renderer(), descriptor_pool_sizes.data(), ::size32(descriptor_pool_sizes), VK_MAX_COMPUTE_TASKS, 2);
-												vk: Implement copy-to-buffer and copy-from-buffer for depth_stencil
formats
- Allows D24S8 and D32S8 transport via typeless channels
- Allows uploading and downloading D24S8 data easily
- TODO: Implement optional byteswapping to fix flushed readbacks with
the same method

											
										
										
											2019-04-02 14:16:52 +02:00
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
+											VkDescriptorSetLayoutCreateInfo infos = {};
 											infos.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO;
 											infos.pBindings = bindings.data();
-												C-style cast cleanup VI

											
										
										
											2019-12-03 23:34:23 +01:00
+											infos.bindingCount = ::size32(bindings);
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
 											CHECK_RESULT(vkCreateDescriptorSetLayout(*get_current_renderer(), &infos, nullptr, &m_descriptor_layout));
 											VkPipelineLayoutCreateInfo layout_info = {};
 											layout_info.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
 											layout_info.setLayoutCount = 1;
 											layout_info.pSetLayouts = &m_descriptor_layout;
-												vk: Enable gpu deswizzling

											
										
										
											2019-10-29 13:21:53 +01:00
+											VkPushConstantRange push_constants{};
 											if (use_push_constants)
 											{
 												push_constants.size = push_constants_size;
 												push_constants.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
 												layout_info.pushConstantRangeCount = 1;
 												layout_info.pPushConstantRanges = &push_constants;
 											}
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
+											CHECK_RESULT(vkCreatePipelineLayout(*get_current_renderer(), &layout_info, nullptr, &m_pipeline_layout));
 										}
 										void create()
 										{
 											if (!initialized)
 											{
 												init_descriptors();
 												switch (vk::get_driver_vendor())
 												{
 												case vk::driver_vendor::unknown:
-												vk: Allow some drivers to bypass window polling if not needed

											
										
										
											2019-05-04 15:56:57 +02:00
+												case vk::driver_vendor::INTEL:
-												vk: Workgroup tuning for different vendors

											
										
										
											2019-08-30 13:46:48 +02:00
+													// Intel hw has 8 threads, but LDS allocation behavior makes optimal group size between 64 and 256
 													// Based on intel's own OpenCL recommended settings
 													unroll_loops = true;
 													optimal_kernel_size = 1;
 													optimal_group_size = 128;
 													break;
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
+												case vk::driver_vendor::NVIDIA:
-												vk: Workgroup tuning for different vendors

											
										
										
											2019-08-30 13:46:48 +02:00
+													// Warps are multiples of 32. Increasing kernel depth seems to hurt performance (Nier, Big Duck sample)
-												vk: Tuning [WIP]
- Unroll main compute queue loop
- Do NOT run GPU cores on mappable memory! This has a dreadful impact on performance for obvious reasons
- Enable dynamic SSBO indexing (affects AMD)
- Make loop unrolling and loop length variable depending on hardware and find optimum

											
										
										
											2018-06-23 14:15:55 +02:00
+													unroll_loops = true;
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
+													optimal_group_size = 32;
-												vk: Workgroup tuning for different vendors

											
										
										
											2019-08-30 13:46:48 +02:00
+													optimal_kernel_size = 1;
-												vk: Tuning [WIP]
- Unroll main compute queue loop
- Do NOT run GPU cores on mappable memory! This has a dreadful impact on performance for obvious reasons
- Enable dynamic SSBO indexing (affects AMD)
- Make loop unrolling and loop length variable depending on hardware and find optimum

											
										
										
											2018-06-23 14:15:55 +02:00
+													break;
 												case vk::driver_vendor::AMD:
 												case vk::driver_vendor::RADV:
-												vk: Workgroup tuning for different vendors

											
										
										
											2019-08-30 13:46:48 +02:00
+													// Wavefronts are multiples of 64
-												vk: Tuning [WIP]
- Unroll main compute queue loop
- Do NOT run GPU cores on mappable memory! This has a dreadful impact on performance for obvious reasons
- Enable dynamic SSBO indexing (affects AMD)
- Make loop unrolling and loop length variable depending on hardware and find optimum

											
										
										
											2018-06-23 14:15:55 +02:00
+													unroll_loops = false;
 													optimal_kernel_size = 1;
 													optimal_group_size = 64;
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
+													break;
 												}
-												vk: Compute kernel fixups

- Adhere to workgroup count limits as exposed by the GPU vendor.
  They already execute properly even when going beyond the limits but this removes validation noise.
- Fix invocation counts for deswizzle kernel. The count was incorrect if blocksize was not 4, causing a bunch of useless work to be done.

											
										
										
											2019-11-05 15:03:25 +01:00
+												const auto& gpu = vk::get_current_renderer()->gpu();
 												max_invocations_x = gpu.get_limits().maxComputeWorkGroupCount[0];
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
+												initialized = true;
 											}
 										}
 										void destroy()
 										{
 											if (initialized)
 											{
 												m_shader.destroy();
 												m_program.reset();
-												vk: Implement copy-to-buffer and copy-from-buffer for depth_stencil
formats
- Allows D24S8 and D32S8 transport via typeless channels
- Allows uploading and downloading D24S8 data easily
- TODO: Implement optional byteswapping to fix flushed readbacks with
the same method

											
										
										
											2019-04-02 14:16:52 +02:00
+												m_param_buffer.reset();
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
 												vkDestroyDescriptorSetLayout(*get_current_renderer(), m_descriptor_layout, nullptr);
 												vkDestroyPipelineLayout(*get_current_renderer(), m_pipeline_layout, nullptr);
 												m_descriptor_pool.destroy();
 												initialized = false;
 											}
 										}
 										void free_resources()
 										{
 											if (m_used_descriptors == 0)
 												return;
-												vk: Improve descriptor pool management
- Add double-buffered descriptor pools to avoid use-after-free situations
- Make descriptor pools more configurable
- Also adds in a hack to allow renderdoc to capture properly

											
										
										
											2019-05-21 19:17:48 +02:00
+											m_descriptor_pool.reset(0);
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
+											m_used_descriptors = 0;
 										}
 										// Hook for derived tasks: load_program() calls this after allocating a descriptor set
 										// and before binding it. The base implementation binds nothing.
 										virtual void bind_resources()
 										{
 										}
-												rsx: Enable MSAA

- vk: Enable depth buffer resolve+unresolve
- vk: Add AMD stenciling extension support
- rsx: Temporarily disables MSAA-compatible hacks such as transparency AA
- TODO: Add paths to optionally disable MSAA

											
										
										
											2019-05-30 17:38:18 +02:00
+										virtual void declare_inputs()
 										{}
-												vk; Add more compute routines to handle texture format conversions
- Implement le D24x8 to le D32 upload routine
- Implement endianness swapping and depth format conversions routines (readback)

											
										
										
											2018-06-22 21:09:20 +02:00
+										void load_program(VkCommandBuffer cmd)
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
+										{
 											if (!m_program)
 											{
 												m_shader.create(::glsl::program_domain::glsl_compute_program, m_src);
 												auto handle = m_shader.compile();
 												VkPipelineShaderStageCreateInfo shader_stage{};
 												shader_stage.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
 												shader_stage.stage = VK_SHADER_STAGE_COMPUTE_BIT;
 												shader_stage.module = handle;
 												shader_stage.pName = "main";
 												VkComputePipelineCreateInfo info{};
 												info.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
 												info.stage = shader_stage;
 												info.layout = m_pipeline_layout;
 												info.basePipelineIndex = -1;
 												info.basePipelineHandle = VK_NULL_HANDLE;
 												VkPipeline pipeline;
 												vkCreateComputePipelines(*get_current_renderer(), nullptr, 1, &info, nullptr, &pipeline);
-												rsx: Enable MSAA

- vk: Enable depth buffer resolve+unresolve
- vk: Add AMD stenciling extension support
- rsx: Temporarily disables MSAA-compatible hacks such as transparency AA
- TODO: Add paths to optionally disable MSAA

											
										
										
											2019-05-30 17:38:18 +02:00
+												m_program = std::make_unique<vk::glsl::program>(*get_current_renderer(), pipeline);
 												declare_inputs();
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
+											}
-												vk: bump max number of compute jobs from 120 to 1024
- It is possible without bugs to have a very high number of compute invocations.

											
										
										
											2018-12-12 10:24:33 +01:00
+											verify(HERE), m_used_descriptors < VK_MAX_COMPUTE_TASKS;
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
 											VkDescriptorSetAllocateInfo alloc_info = {};
 											alloc_info.descriptorPool = m_descriptor_pool;
 											alloc_info.descriptorSetCount = 1;
 											alloc_info.pSetLayouts = &m_descriptor_layout;
 											alloc_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;
 											CHECK_RESULT(vkAllocateDescriptorSets(*get_current_renderer(), &alloc_info, &m_descriptor_set));
 											m_used_descriptors++;
 											bind_resources();
 											vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, m_program->pipeline);
 											vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, m_pipeline_layout, 0, 1, &m_descriptor_set, 0, nullptr);
 										}
-												rsx: Fix linux build

											
										
										
											2019-12-15 11:38:42 +01:00
+										void run(VkCommandBuffer cmd, u32 invocations_x, u32 invocations_y, u32 invocations_z)
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
+										{
 											load_program(cmd);
-												rsx: Set up framework for hw deswizzle

											
										
										
											2019-10-29 13:13:10 +01:00
+											vkCmdDispatch(cmd, invocations_x, invocations_y, invocations_z);
-												rsx: Enable MSAA

- vk: Enable depth buffer resolve+unresolve
- vk: Add AMD stenciling extension support
- rsx: Temporarily disables MSAA-compatible hacks such as transparency AA
- TODO: Add paths to optionally disable MSAA

											
										
										
											2019-05-30 17:38:18 +02:00
+										}
-												rsx: Fix linux build

											
										
										
											2019-12-15 11:38:42 +01:00
+										void run(VkCommandBuffer cmd, u32 num_invocations)
-												rsx: Enable MSAA

- vk: Enable depth buffer resolve+unresolve
- vk: Add AMD stenciling extension support
- rsx: Temporarily disables MSAA-compatible hacks such as transparency AA
- TODO: Add paths to optionally disable MSAA

											
										
										
											2019-05-30 17:38:18 +02:00
+										{
-												vk: Compute kernel fixups

- Adhere to workgroup count limits as exposed by the GPU vendor.
  They already execute properly even when going beyond the limits but this removes validation noise.
- Fix invocation counts for deswizzle kernel. The count was incorrect if blocksize was not 4, causing a bunch of useless work to be done.

											
										
										
											2019-11-05 15:03:25 +01:00
+											u32 invocations_x, invocations_y;
 											if (num_invocations > max_invocations_x)
 											{
 												// AMD hw reports an annoyingly small maximum number of invocations in the X dimension
 												// Split the 1D job into 2 dimensions to accomodate this
-												C-style cast cleanup VI

											
										
										
											2019-12-03 23:34:23 +01:00
+												invocations_x = static_cast<u32>(floor(std::sqrt(num_invocations)));
-												vk: Compute kernel fixups

- Adhere to workgroup count limits as exposed by the GPU vendor.
  They already execute properly even when going beyond the limits but this removes validation noise.
- Fix invocation counts for deswizzle kernel. The count was incorrect if blocksize was not 4, causing a bunch of useless work to be done.

											
										
										
											2019-11-05 15:03:25 +01:00
+												invocations_y = invocations_x;
 												if (num_invocations % invocations_x) invocations_y++;
 											}
 											else
 											{
 												invocations_x = num_invocations;
 												invocations_y = 1;
 											}
 											run(cmd, invocations_x, invocations_y, 1);
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
+										}
 									};
 									struct cs_shuffle_base : compute_task
 									{
-												vk: Implement copy-to-buffer and copy-from-buffer for depth_stencil
formats
- Allows D24S8 and D32S8 transport via typeless channels
- Allows uploading and downloading D24S8 data easily
- TODO: Implement optional byteswapping to fix flushed readbacks with
the same method

											
										
										
											2019-04-02 14:16:52 +02:00
+										const vk::buffer* m_data;
-												vk; Add more compute routines to handle texture format conversions
- Implement le D24x8 to le D32 upload routine
- Implement endianness swapping and depth format conversions routines (readback)

											
										
										
											2018-06-22 21:09:20 +02:00
+										u32 m_data_offset = 0;
 										u32 m_data_length = 0;
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
+										u32 kernel_size = 1;
-												vk: Implement copy-to-buffer and copy-from-buffer for depth_stencil
formats
- Allows D24S8 and D32S8 transport via typeless channels
- Allows uploading and downloading D24S8 data easily
- TODO: Implement optional byteswapping to fix flushed readbacks with
the same method

											
										
										
											2019-04-02 14:16:52 +02:00
+										std::string variables, work_kernel, loop_advance, suffix;
-												rsx: Set up framework for hw deswizzle

											
										
										
											2019-10-29 13:13:10 +01:00
+										std::string method_declarations;
-												vk: Implement copy-to-buffer and copy-from-buffer for depth_stencil
formats
- Allows D24S8 and D32S8 transport via typeless channels
- Allows uploading and downloading D24S8 data easily
- TODO: Implement optional byteswapping to fix flushed readbacks with
the same method

											
										
										
											2019-04-02 14:16:52 +02:00
 										cs_shuffle_base()
 										{
 											work_kernel =
 												"		value = data[index];\n"
-												Remove braces around shader source strings (warnings)

											
										
										
											2019-04-06 08:48:58 +02:00
+												"		data[index] = %f(value);\n";
-												vk: Implement copy-to-buffer and copy-from-buffer for depth_stencil
formats
- Allows D24S8 and D32S8 transport via typeless channels
- Allows uploading and downloading D24S8 data easily
- TODO: Implement optional byteswapping to fix flushed readbacks with
the same method

											
										
										
											2019-04-02 14:16:52 +02:00
 											loop_advance =
-												Remove braces around shader source strings (warnings)

											
										
										
											2019-04-06 08:48:58 +02:00
+												"		index++;\n";
-												vk: Implement copy-to-buffer and copy-from-buffer for depth_stencil
formats
- Allows D24S8 and D32S8 transport via typeless channels
- Allows uploading and downloading D24S8 data easily
- TODO: Implement optional byteswapping to fix flushed readbacks with
the same method

											
										
										
											2019-04-02 14:16:52 +02:00
 											suffix =
-												Remove braces around shader source strings (warnings)

											
										
										
											2019-04-06 08:48:58 +02:00
+												"}\n";
-												vk: Implement copy-to-buffer and copy-from-buffer for depth_stencil
formats
- Allows D24S8 and D32S8 transport via typeless channels
- Allows uploading and downloading D24S8 data easily
- TODO: Implement optional byteswapping to fix flushed readbacks with
the same method

											
										
										
											2019-04-02 14:16:52 +02:00
+										}
-												vk: Tuning [WIP]
- Unroll main compute queue loop
- Do NOT run GPU cores on mappable memory! This has a dreadful impact on performance for obvious reasons
- Enable dynamic SSBO indexing (affects AMD)
- Make loop unrolling and loop length variable depending on hardware and find optimum

											
										
										
											2018-06-23 14:15:55 +02:00
+										void build(const char* function_name, u32 _kernel_size = 0)
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
+										{
-												vk: Tuning [WIP]
- Unroll main compute queue loop
- Do NOT run GPU cores on mappable memory! This has a dreadful impact on performance for obvious reasons
- Enable dynamic SSBO indexing (affects AMD)
- Make loop unrolling and loop length variable depending on hardware and find optimum

											
										
										
											2018-06-23 14:15:55 +02:00
+											// Initialize to allow detecting optimal settings
 											create();
 											kernel_size = _kernel_size? _kernel_size : optimal_kernel_size;
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
 											m_src =
 												"#version 430\n"
 												"layout(local_size_x=%ws, local_size_y=1, local_size_z=1) in;\n"
-												vk: Implement copy-to-buffer and copy-from-buffer for depth_stencil
formats
- Allows D24S8 and D32S8 transport via typeless channels
- Allows uploading and downloading D24S8 data easily
- TODO: Implement optional byteswapping to fix flushed readbacks with
the same method

											
										
										
											2019-04-02 14:16:52 +02:00
+												"layout(std430, set=0, binding=0) buffer ssbo{ uint data[]; };\n"
 												"%ub"
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
+												"\n"
 												"#define KERNEL_SIZE %ks\n"
-												vk; Add more compute routines to handle texture format conversions
- Implement le D24x8 to le D32 upload routine
- Implement endianness swapping and depth format conversions routines (readback)

											
										
										
											2018-06-22 21:09:20 +02:00
+												"\n"
 												"// Generic swap routines\n"
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
+												"#define bswap_u16(bits)     (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n"
 												"#define bswap_u32(bits)     (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n"
 												"#define bswap_u16_u32(bits) (bits & 0xFFFF) << 16 | (bits & 0xFFFF0000) >> 16\n"
 												"\n"
-												vk; Add more compute routines to handle texture format conversions
- Implement le D24x8 to le D32 upload routine
- Implement endianness swapping and depth format conversions routines (readback)

											
										
										
											2018-06-22 21:09:20 +02:00
+												"// Depth format conversions\n"
-												vk: Implement copy-to-buffer and copy-from-buffer for depth_stencil
formats
- Allows D24S8 and D32S8 transport via typeless channels
- Allows uploading and downloading D24S8 data easily
- TODO: Implement optional byteswapping to fix flushed readbacks with
the same method

											
										
										
											2019-04-02 14:16:52 +02:00
+												"#define d24_to_f32(bits)             floatBitsToUint(float(bits) / 16777215.f)\n"
 												"#define f32_to_d24(bits)             uint(uintBitsToFloat(bits) * 16777215.f)\n"
 												"#define d24x8_to_f32(bits)           d24_to_f32(bits >> 8)\n"
-												vk: Strip 'stencil' MSB when writing d24x8 data
- Seems to contains garbage in MSB when DEPTH aspect is read back
- TODO: Implement custom depth and stencil readback routine

											
										
										
											2018-06-24 00:37:24 +02:00
+												"#define d24x8_to_d24x8_swapped(bits) (bits & 0xFF00) | (bits & 0xFF0000) >> 16 | (bits & 0xFF) << 16\n"
-												vk: Implement copy-to-buffer and copy-from-buffer for depth_stencil
formats
- Allows D24S8 and D32S8 transport via typeless channels
- Allows uploading and downloading D24S8 data easily
- TODO: Implement optional byteswapping to fix flushed readbacks with
the same method

											
										
										
											2019-04-02 14:16:52 +02:00
+												"#define f32_to_d24x8_swapped(bits)   d24x8_to_d24x8_swapped(f32_to_d24(bits))\n"
-												vk; Add more compute routines to handle texture format conversions
- Implement le D24x8 to le D32 upload routine
- Implement endianness swapping and depth format conversions routines (readback)

											
										
										
											2018-06-22 21:09:20 +02:00
+												"\n"
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
+												"void main()\n"
 												"{\n"
-												vk: Compute kernel fixups

- Adhere to workgroup count limits as exposed by the GPU vendor.
  They already execute properly even when going beyond the limits but this removes validation noise.
- Fix invocation counts for deswizzle kernel. The count was incorrect if blocksize was not 4, causing a bunch of useless work to be done.

											
										
										
											2019-11-05 15:03:25 +01:00
+												"	uint invocations_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);"
-												vk: Fix word index counting for shuffle tasks

											
										
										
											2020-01-14 14:32:13 +01:00
+												"	uint invocation_id = (gl_GlobalInvocationID.y * invocations_x) + gl_GlobalInvocationID.x;\n"
 												"	uint index = invocation_id * KERNEL_SIZE;\n"
-												vk: Tuning [WIP]
- Unroll main compute queue loop
- Do NOT run GPU cores on mappable memory! This has a dreadful impact on performance for obvious reasons
- Enable dynamic SSBO indexing (affects AMD)
- Make loop unrolling and loop length variable depending on hardware and find optimum

											
										
										
											2018-06-23 14:15:55 +02:00
+												"	uint value;\n"
-												vk: Implement copy-to-buffer and copy-from-buffer for depth_stencil
formats
- Allows D24S8 and D32S8 transport via typeless channels
- Allows uploading and downloading D24S8 data easily
- TODO: Implement optional byteswapping to fix flushed readbacks with
the same method

											
										
										
											2019-04-02 14:16:52 +02:00
+												"	%vars"
-												Remove braces around shader source strings (warnings)

											
										
										
											2019-04-06 08:48:58 +02:00
+												"\n";
-												vk: Tuning [WIP]
- Unroll main compute queue loop
- Do NOT run GPU cores on mappable memory! This has a dreadful impact on performance for obvious reasons
- Enable dynamic SSBO indexing (affects AMD)
- Make loop unrolling and loop length variable depending on hardware and find optimum

											
										
										
											2018-06-23 14:15:55 +02:00
-												vk: Minor compute optimizations
- Remove use of uniform buffers for compute static data. Use push
constants instead.
- Minor touchups to the deswizzle code to avoid redundant data copies.

											
										
										
											2019-11-02 19:15:19 +01:00
+											const auto parameters_size = align(push_constants_size, 16) / 16;
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
+											const std::pair<std::string, std::string> syntax_replace[] =
 											{
 												{ "%ws", std::to_string(optimal_group_size) },
 												{ "%ks", std::to_string(kernel_size) },
-												vk: Implement copy-to-buffer and copy-from-buffer for depth_stencil
formats
- Allows D24S8 and D32S8 transport via typeless channels
- Allows uploading and downloading D24S8 data easily
- TODO: Implement optional byteswapping to fix flushed readbacks with
the same method

											
										
										
											2019-04-02 14:16:52 +02:00
+												{ "%vars", variables },
 												{ "%f", function_name },
-												vk: Minor compute optimizations
- Remove use of uniform buffers for compute static data. Use push
constants instead.
- Minor touchups to the deswizzle code to avoid redundant data copies.

											
										
										
											2019-11-02 19:15:19 +01:00
+												{ "%ub", use_push_constants? "layout(push_constant) uniform ubo{ uvec4 params[" + std::to_string(parameters_size) + "]; };\n" : "" },
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
+											};
 											m_src = fmt::replace_all(m_src, syntax_replace);
-												vk: Tuning [WIP]
- Unroll main compute queue loop
- Do NOT run GPU cores on mappable memory! This has a dreadful impact on performance for obvious reasons
- Enable dynamic SSBO indexing (affects AMD)
- Make loop unrolling and loop length variable depending on hardware and find optimum

											
										
										
											2018-06-23 14:15:55 +02:00
+											work_kernel = fmt::replace_all(work_kernel, syntax_replace);
 											if (kernel_size <= 1)
 											{
 												m_src += "	{\n" + work_kernel + "	}\n";
 											}
 											else if (unroll_loops)
 											{
 												work_kernel += loop_advance + "\n";
 												m_src += std::string
 												(
 													"	//Unrolled loop\n"
 													"	{\n"
 												);
 												// Assemble body with manual loop unroll to try lowering GPR usage
 												for (u32 n = 0; n < kernel_size; ++n)
 												{
 													m_src += work_kernel;
 												}
 												m_src += "	}\n";
 											}
 											else
 											{
 												m_src += "	for (int loop = 0; loop < KERNEL_SIZE; ++loop)\n";
 												m_src += "	{\n";
 												m_src += work_kernel;
 												m_src += loop_advance;
 												m_src += "	}\n";
 											}
 											m_src += suffix;
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
+										}
 										// Binds m_data as a storage buffer at binding 0 of m_descriptor_set.
 										// m_data, m_data_offset and m_data_length are the values stored by run();
 										// the braced triple is presumably {buffer, offset, range} - confirm against bind_buffer().
 										void bind_resources() override
 										{
-												vk; Add more compute routines to handle texture format conversions
- Implement le D24x8 to le D32 upload routine
- Implement endianness swapping and depth format conversions routines (readback)

											
										
										
											2018-06-22 21:09:20 +02:00
+											m_program->bind_buffer({ m_data->value, m_data_offset, m_data_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
-												vk: Implement copy-to-buffer and copy-from-buffer for depth_stencil
formats
- Allows D24S8 and D32S8 transport via typeless channels
- Allows uploading and downloading D24S8 data easily
- TODO: Implement optional byteswapping to fix flushed readbacks with
the same method

											
										
										
											2019-04-02 14:16:52 +02:00
+										}
 										// Records a push-constant update on cmd for the compute stage.
 										//   params - source words for the update
 										//   count  - number of 32-bit words to push (count * 4 bytes are written at offset 0)
 										// Only meaningful for tasks that set use_push_constants; verify() presumably
 										// traps when that flag is false - confirm against its definition.
 										void set_parameters(VkCommandBuffer cmd, const u32* params, u8 count)
 										{
-												vk: Minor compute optimizations
- Remove use of uniform buffers for compute static data. Use push
constants instead.
- Minor touchups to the deswizzle code to avoid redundant data copies.

											
										
										
											2019-11-02 19:15:19 +01:00
+											verify(HERE), use_push_constants;
 											vkCmdPushConstants(cmd, m_pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, count * 4, params);
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
+										}
-												vk: Implement copy-to-buffer and copy-from-buffer for depth_stencil
formats
- Allows D24S8 and D32S8 transport via typeless channels
- Allows uploading and downloading D24S8 data easily
- TODO: Implement optional byteswapping to fix flushed readbacks with
the same method

											
										
										
											2019-04-02 14:16:52 +02:00
+										void run(VkCommandBuffer cmd, const vk::buffer* data, u32 data_length, u32 data_offset = 0)
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
+										{
 											// Runs the shuffle kernel over data_length bytes of data starting at data_offset.
 											// The buffer, offset and length are stored in members so bind_resources() can bind them.
 											m_data = data;
-												vk; Add more compute routines to handle texture format conversions
- Implement le D24x8 to le D32 upload routine
- Implement endianness swapping and depth format conversions routines (readback)

											
										
										
											2018-06-22 21:09:20 +02:00
+											m_data_offset = data_offset;
 											m_data_length = data_length;
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
 											// One "invocation" here covers optimal_group_size * kernel_size 32-bit words.
 											const auto num_bytes_per_invocation = optimal_group_size * kernel_size * 4;
-												vk: Fix invocation alignment to support non-power-of-2 alignment

											
										
										
											2020-01-14 14:40:29 +01:00
 											// rsx::align2 presumably rounds data_length up to a whole multiple of the per-invocation size - confirm.
+											const auto num_bytes_to_process = rsx::align2(data_length, num_bytes_per_invocation);
-												vk: Fixup

											
										
										
											2018-06-25 21:23:00 +02:00
+											const auto num_invocations = num_bytes_to_process / num_bytes_per_invocation;
-												vk: Implement copy-to-buffer and copy-from-buffer for depth_stencil
formats
- Allows D24S8 and D32S8 transport via typeless channels
- Allows uploading and downloading D24S8 data easily
- TODO: Implement optional byteswapping to fix flushed readbacks with
the same method

											
										
										
											2019-04-02 14:16:52 +02:00
+											if ((num_bytes_to_process + data_offset) > data->size())
-												vk: Fixup

											
										
										
											2018-06-25 21:23:00 +02:00
+											{
 												// Technically robust buffer access should keep the driver from crashing in OOB situations
 												// The task is still dispatched below; this only reports the problem.
 												// "Required" is the same end offset that was compared against the buffer size above.
 												LOG_ERROR(RSX, "Inadequate buffer length submitted for a compute operation. "
 													"Required=%d bytes, Available=%d bytes", num_bytes_to_process + data_offset, data->size());
 											}
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
+											compute_task::run(cmd, num_invocations);
 										}
 									};
 									// Shuffle task whose only customisation is the name "bswap_u16" handed to
 									// cs_shuffle_base::build() from the constructor.
 									struct cs_shuffle_16 : cs_shuffle_base
 									{
 										// byteswap ushort
 										cs_shuffle_16()
 										{
-												vk: Tuning [WIP]
- Unroll main compute queue loop
- Do NOT run GPU cores on mappable memory! This has a dreadful impact on performance for obvious reasons
- Enable dynamic SSBO indexing (affects AMD)
- Make loop unrolling and loop length variable depending on hardware and find optimum

											
										
										
											2018-06-23 14:15:55 +02:00
+											cs_shuffle_base::build("bswap_u16");
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
+										}
 									};
 									// Shuffle task whose only customisation is the name "bswap_u32" handed to
 									// cs_shuffle_base::build() from the constructor.
 									struct cs_shuffle_32 : cs_shuffle_base
 									{
 										// byteswap_ulong
 										cs_shuffle_32()
 										{
-												vk: Tuning [WIP]
- Unroll main compute queue loop
- Do NOT run GPU cores on mappable memory! This has a dreadful impact on performance for obvious reasons
- Enable dynamic SSBO indexing (affects AMD)
- Make loop unrolling and loop length variable depending on hardware and find optimum

											
										
										
											2018-06-23 14:15:55 +02:00
+											cs_shuffle_base::build("bswap_u32");
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
+										}
 									};
 									// Shuffle task whose only customisation is the name "bswap_u16_u32" handed to
 									// cs_shuffle_base::build() from the constructor.
 									struct cs_shuffle_32_16 : cs_shuffle_base
 									{
 										// byteswap_ulong + byteswap_ushort
 										cs_shuffle_32_16()
 										{
-												vk: Tuning [WIP]
- Unroll main compute queue loop
- Do NOT run GPU cores on mappable memory! This has a dreadful impact on performance for obvious reasons
- Enable dynamic SSBO indexing (affects AMD)
- Make loop unrolling and loop length variable depending on hardware and find optimum

											
										
										
											2018-06-23 14:15:55 +02:00
+											cs_shuffle_base::build("bswap_u16_u32");
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
+										}
 									};
-												vk; Add more compute routines to handle texture format conversions
- Implement le D24x8 to le D32 upload routine
- Implement endianness swapping and depth format conversions routines (readback)

											
										
										
											2018-06-22 21:09:20 +02:00
+									struct cs_shuffle_d24x8_f32 : cs_shuffle_base
 									{
 										// Shuffle task whose only customisation is the name "d24x8_to_f32" handed to
 										// cs_shuffle_base::build() from the constructor.
 										// convert d24x8 to f32
 										cs_shuffle_d24x8_f32()
 										{
-												vk: Tuning [WIP]
- Unroll main compute queue loop
- Do NOT run GPU cores on mappable memory! This has a dreadful impact on performance for obvious reasons
- Enable dynamic SSBO indexing (affects AMD)
- Make loop unrolling and loop length variable depending on hardware and find optimum

											
										
										
											2018-06-23 14:15:55 +02:00
+											cs_shuffle_base::build("d24x8_to_f32");
-												vk; Add more compute routines to handle texture format conversions
- Implement le D24x8 to le D32 upload routine
- Implement endianness swapping and depth format conversions routines (readback)

											
										
										
											2018-06-22 21:09:20 +02:00
+										}
 									};
 									// Shuffle task whose only customisation is the name "f32_to_d24x8_swapped" handed to
 									// cs_shuffle_base::build() from the constructor.
 									struct cs_shuffle_se_f32_d24x8 : cs_shuffle_base
 									{
 										// convert f32 to d24x8 and swap endianness
 										cs_shuffle_se_f32_d24x8()
 										{
-												vk: Tuning [WIP]
- Unroll main compute queue loop
- Do NOT run GPU cores on mappable memory! This has a dreadful impact on performance for obvious reasons
- Enable dynamic SSBO indexing (affects AMD)
- Make loop unrolling and loop length variable depending on hardware and find optimum

											
										
										
											2018-06-23 14:15:55 +02:00
+											cs_shuffle_base::build("f32_to_d24x8_swapped");
-												vk; Add more compute routines to handle texture format conversions
- Implement le D24x8 to le D32 upload routine
- Implement endianness swapping and depth format conversions routines (readback)

											
										
										
											2018-06-22 21:09:20 +02:00
+										}
 									};
 									// Shuffle task whose only customisation is the name "d24x8_to_d24x8_swapped" handed to
 									// cs_shuffle_base::build() from the constructor.
 									struct cs_shuffle_se_d24x8 : cs_shuffle_base
 									{
 										// swap endianness of d24x8
 										cs_shuffle_se_d24x8()
 										{
-												vk: Tuning [WIP]
- Unroll main compute queue loop
- Do NOT run GPU cores on mappable memory! This has a dreadful impact on performance for obvious reasons
- Enable dynamic SSBO indexing (affects AMD)
- Make loop unrolling and loop length variable depending on hardware and find optimum

											
										
										
											2018-06-23 14:15:55 +02:00
+											cs_shuffle_base::build("d24x8_to_d24x8_swapped");
-												vk; Add more compute routines to handle texture format conversions
- Implement le D24x8 to le D32 upload routine
- Implement endianness swapping and depth format conversions routines (readback)

											
										
										
											2018-06-22 21:09:20 +02:00
+										}
 									};
-												vk: Implement copy-to-buffer and copy-from-buffer for depth_stencil
formats
- Allows D24S8 and D32S8 transport via typeless channels
- Allows uploading and downloading D24S8 data easily
- TODO: Implement optional byteswapping to fix flushed readbacks with
the same method

											
										
										
											2019-04-02 14:16:52 +02:00
+									// NOTE: D24S8 layout has the stencil in the MSB! It's actually S8|D24|S8|D24 starting at offset 0
 									// Shared base for tasks that work on separate depth and stencil regions of one buffer.
 									// It enables a 16-byte push-constant block (read in the shader as params[0]) and
 									// declares the shader-side variables the derived work kernels reference.
 									struct cs_interleave_task : cs_shuffle_base
 									{
 										// Range handed to bind_buffer() in bind_resources(); recomputed by every run() call.
 										u32 m_ssbo_length = 0;
 										// Sets up push constants and the GLSL variable declarations.
 										// params[0].x/.y/.z are shifted right by 2 to form block_length, z_offset and s_offset;
 										// presumably this converts byte quantities to 32-bit word indices - confirm.
 										cs_interleave_task()
 										{
-												vk: Minor compute optimizations
- Remove use of uniform buffers for compute static data. Use push
constants instead.
- Minor touchups to the deswizzle code to avoid redundant data copies.

											
										
										
											2019-11-02 19:15:19 +01:00
+											use_push_constants = true;
 											push_constants_size = 16;
-												vk: Implement copy-to-buffer and copy-from-buffer for depth_stencil
formats
- Allows D24S8 and D32S8 transport via typeless channels
- Allows uploading and downloading D24S8 data easily
- TODO: Implement optional byteswapping to fix flushed readbacks with
the same method

											
										
										
											2019-04-02 14:16:52 +02:00
 											variables =
 												"	uint block_length = params[0].x >> 2;\n"
 												"	uint z_offset = params[0].y >> 2;\n"
 												"	uint s_offset = params[0].z >> 2;\n"
 												"	uint depth;\n"
 												"	uint stencil;\n"
 												"	uint stencil_shift;\n"
-												Remove braces around shader source strings (warnings)

											
										
										
											2019-04-06 08:48:58 +02:00
+												"	uint stencil_offset;\n";
-												vk: Implement copy-to-buffer and copy-from-buffer for depth_stencil
formats
- Allows D24S8 and D32S8 transport via typeless channels
- Allows uploading and downloading D24S8 data easily
- TODO: Implement optional byteswapping to fix flushed readbacks with
the same method

											
										
										
											2019-04-02 14:16:52 +02:00
+										}
 										// Binds m_data at binding 0 like the base class does, but passes m_ssbo_length
 										// where the base passes m_data_length.
 										void bind_resources() override
 										{
 											m_program->bind_buffer({ m_data->value, m_data_offset, m_ssbo_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
 										}
 										// Pushes {data_length, zeta_offset - data_offset, stencil_offset - data_offset, 0}
 										// as the four push-constant words, sizes the bound range, then dispatches through
 										// cs_shuffle_base::run.
 										// Note the argument order: data_offset comes before data_length here, the reverse
 										// of cs_shuffle_base::run.
 										void run(VkCommandBuffer cmd, const vk::buffer* data, u32 data_offset, u32 data_length, u32 zeta_offset, u32 stencil_offset)
 										{
-												vk: Minor compute optimizations
- Remove use of uniform buffers for compute static data. Use push
constants instead.
- Minor touchups to the deswizzle code to avoid redundant data copies.

											
										
										
											2019-11-02 19:15:19 +01:00
+											u32 parameters[4] = { data_length, zeta_offset - data_offset, stencil_offset - data_offset, 0 };
 											set_parameters(cmd, parameters, 4);
-												vk: Implement copy-to-buffer and copy-from-buffer for depth_stencil
formats
- Allows D24S8 and D32S8 transport via typeless channels
- Allows uploading and downloading D24S8 data easily
- TODO: Implement optional byteswapping to fix flushed readbacks with
the same method

											
										
										
											2019-04-02 14:16:52 +02:00
 											// Range runs from data_offset to stencil_offset + data_length / 4.
 											// NOTE(review): data_length / 4 is presumably the stencil region size (one byte per 32-bit texel) - confirm.
 											m_ssbo_length = stencil_offset + (data_length / 4) - data_offset;
 											cs_shuffle_base::run(cmd, data, data_length, data_offset);
 										}
 									};
-												rsx: Experiments with nul sink

											
										
										
											2019-09-04 21:19:58 +02:00
+									template<bool _SwapBytes = false>
-												vk: Implement copy-to-buffer and copy-from-buffer for depth_stencil
formats
- Allows D24S8 and D32S8 transport via typeless channels
- Allows uploading and downloading D24S8 data easily
- TODO: Implement optional byteswapping to fix flushed readbacks with
the same method

											
										
										
											2019-04-02 14:16:52 +02:00
+									struct cs_gather_d24x8 : cs_interleave_task
 									{
 										// Builds a kernel that, for each word index below block_length:
 										//  - takes the low 24 bits of data[index + z_offset] as depth,
 										//  - extracts byte (index % 4) of data[index / 4 + s_offset] as stencil,
 										//  - stores (depth << 8) | stencil into data[index].
 										// When _SwapBytes is true the stored word goes through bswap_u32() first.
 										cs_gather_d24x8()
 										{
 											work_kernel =
 												"		if (index >= block_length)\n"
 												"			return;\n"
 												"\n"
 												"		depth = data[index + z_offset] & 0x00FFFFFF;\n"
 												"		stencil_offset = (index / 4);\n"
 												"		stencil_shift = (index % 4) * 8;\n"
 												"		stencil = data[stencil_offset + s_offset];\n"
 												"		stencil = (stencil >> stencil_shift) & 0xFF;\n"
-												rsx: Experiments with nul sink

											
										
										
											2019-09-04 21:19:58 +02:00
+												"		value = (depth << 8) | stencil;\n";
 											if constexpr (!_SwapBytes)
 											{
 												work_kernel +=
-												Remove braces around shader source strings (warnings)

											
										
										
											2019-04-06 08:48:58 +02:00
+												"		data[index] = value;\n";
-												rsx: Experiments with nul sink

											
										
										
											2019-09-04 21:19:58 +02:00
+											}
 											else
 											{
 												work_kernel +=
 												"		data[index] = bswap_u32(value);\n";
 											}
-												vk: Implement copy-to-buffer and copy-from-buffer for depth_stencil
formats
- Allows D24S8 and D32S8 transport via typeless channels
- Allows uploading and downloading D24S8 data easily
- TODO: Implement optional byteswapping to fix flushed readbacks with
the same method

											
										
										
											2019-04-02 14:16:52 +02:00
 											// The kernel text is supplied through work_kernel, so an empty name is passed to build().
 											cs_shuffle_base::build("");
 										}
 									};
-												rsx: Experiments with nul sink

											
										
										
											2019-09-04 21:19:58 +02:00
+									template<bool _SwapBytes = false>
-												vk: Implement copy-to-buffer and copy-from-buffer for depth_stencil
formats
- Allows D24S8 and D32S8 transport via typeless channels
- Allows uploading and downloading D24S8 data easily
- TODO: Implement optional byteswapping to fix flushed readbacks with
the same method

											
										
										
											2019-04-02 14:16:52 +02:00
+									struct cs_gather_d32x8 : cs_interleave_task
 									{
 										// Builds a kernel that, for each word index below block_length:
 										//  - converts data[index + z_offset] with f32_to_d24() to get depth,
 										//  - extracts byte (index % 4) of data[index / 4 + s_offset] as stencil,
 										//  - stores (depth << 8) | stencil into data[index].
 										// When _SwapBytes is true the stored word goes through bswap_u32() first.
 										cs_gather_d32x8()
 										{
 											work_kernel =
 												"		if (index >= block_length)\n"
 												"			return;\n"
 												"\n"
 												"		depth = f32_to_d24(data[index + z_offset]);\n"
 												"		stencil_offset = (index / 4);\n"
 												"		stencil_shift = (index % 4) * 8;\n"
 												"		stencil = data[stencil_offset + s_offset];\n"
 												"		stencil = (stencil >> stencil_shift) & 0xFF;\n"
-												rsx: Experiments with nul sink

											
										
										
											2019-09-04 21:19:58 +02:00
+												"		value = (depth << 8) | stencil;\n";
 											if constexpr (!_SwapBytes)
 											{
 												work_kernel +=
-												Remove braces around shader source strings (warnings)

											
										
										
											2019-04-06 08:48:58 +02:00
+												"		data[index] = value;\n";
-												rsx: Experiments with nul sink

											
										
										
											2019-09-04 21:19:58 +02:00
+											}
 											else
 											{
 												work_kernel +=
 												"		data[index] = bswap_u32(value);\n";
 											}
-												vk: Implement copy-to-buffer and copy-from-buffer for depth_stencil
formats
- Allows D24S8 and D32S8 transport via typeless channels
- Allows uploading and downloading D24S8 data easily
- TODO: Implement optional byteswapping to fix flushed readbacks with
the same method

											
										
										
											2019-04-02 14:16:52 +02:00
 											// The kernel text is supplied through work_kernel, so an empty name is passed to build().
 											cs_shuffle_base::build("");
 										}
 									};
 									// Builds a kernel that, for each word index below block_length:
 									//  - reads the packed word from data[index],
 									//  - writes its upper 24 bits (value >> 8) to data[index + z_offset],
 									//  - ORs its low byte, shifted to byte slot (index % 4), into data[index / 4 + s_offset].
 									// NOTE(review): the stencil write is a plain (non-atomic) |=, so it appears to rely on the
 									// destination stencil words starting out cleared and on no two shader invocations updating
 									// the same word concurrently - confirm against callers and the kernel size in use.
 									struct cs_scatter_d24x8 : cs_interleave_task
 									{
 										cs_scatter_d24x8()
 										{
 											work_kernel =
 												"		if (index >= block_length)\n"
 												"			return;\n"
 												"\n"
 												"		value = data[index];\n"
 												"		data[index + z_offset] = (value >> 8);\n"
 												"		stencil_offset = (index / 4);\n"
 												"		stencil_shift = (index % 4) * 8;\n"
 												"		stencil = (value & 0xFF) << stencil_shift;\n"
-												Remove braces around shader source strings (warnings)

											
										
										
											2019-04-06 08:48:58 +02:00
+												"		data[stencil_offset + s_offset] |= stencil;\n";
-												vk: Implement copy-to-buffer and copy-from-buffer for depth_stencil
formats
- Allows D24S8 and D32S8 transport via typeless channels
- Allows uploading and downloading D24S8 data easily
- TODO: Implement optional byteswapping to fix flushed readbacks with
the same method

											
										
										
											2019-04-02 14:16:52 +02:00
 											// The kernel text is supplied through work_kernel, so an empty name is passed to build().
 											cs_shuffle_base::build("");
 										}
 									};
 									// Builds a kernel that, for each word index below block_length:
 									//  - reads the packed word from data[index],
 									//  - writes d24_to_f32(value >> 8) to data[index + z_offset],
 									//  - ORs its low byte, shifted to byte slot (index % 4), into data[index / 4 + s_offset].
 									// NOTE(review): the stencil write is a plain (non-atomic) |=, so it appears to rely on the
 									// destination stencil words starting out cleared and on no two shader invocations updating
 									// the same word concurrently - confirm against callers and the kernel size in use.
 									struct cs_scatter_d32x8 : cs_interleave_task
 									{
 										cs_scatter_d32x8()
 										{
 											work_kernel =
 												"		if (index >= block_length)\n"
 												"			return;\n"
 												"\n"
 												"		value = data[index];\n"
 												"		data[index + z_offset] = d24_to_f32(value >> 8);\n"
 												"		stencil_offset = (index / 4);\n"
 												"		stencil_shift = (index % 4) * 8;\n"
 												"		stencil = (value & 0xFF) << stencil_shift;\n"
-												Remove braces around shader source strings (warnings)

											
										
										
											2019-04-06 08:48:58 +02:00
+												"		data[stencil_offset + s_offset] |= stencil;\n";
-												vk: Implement copy-to-buffer and copy-from-buffer for depth_stencil
formats
- Allows D24S8 and D32S8 transport via typeless channels
- Allows uploading and downloading D24S8 data easily
- TODO: Implement optional byteswapping to fix flushed readbacks with
the same method

											
										
										
											2019-04-02 14:16:52 +02:00
 											// The kernel text is supplied through work_kernel, so an empty name is passed to build().
 											cs_shuffle_base::build("");
 										}
 									};
-												rsx: Set up framework for hw deswizzle

											
										
										
											2019-10-29 13:13:10 +01:00
+									// Reverse morton-order block arrangement
-												vk: Enable gpu deswizzling

											
										
										
											2019-10-29 13:21:53 +01:00
+									struct cs_deswizzle_base : compute_task
 									{
 										// Abstract interface for deswizzle tasks: the only member is this pure virtual run().
 										// It takes separate destination and source buffers with their offsets, a data length,
 										// the image width/height/depth and a mipmap count; what each implementation does with
 										// them is defined by the derived class.
-												vk: Implement layer batching for the GPU swizzle decoder

- Handles all LODs per layer meaning cubemaps are now fully handled in 6 passes instead of 6 * (log2(width)) passes.
- Handles all LODs of a 3D texture in one pass as well.
- The improvements do warrant dropping down the number of allowed compute invocations a bit

											
										
										
											2019-11-05 15:00:07 +01:00
+										virtual void run(VkCommandBuffer cmd, const vk::buffer* dst, u32 out_offset, const vk::buffer* src, u32 in_offset, u32 data_length, u32 width, u32 height, u32 depth, u32 mipmaps) = 0;
-												vk: Enable gpu deswizzling

											
										
										
											2019-10-29 13:21:53 +01:00
+									};
-												rsx: Set up framework for hw deswizzle

											
										
										
											2019-10-29 13:13:10 +01:00
+									template <typename _BlockType, typename _BaseType, bool _SwapBytes>
-												vk: Enable gpu deswizzling

											
										
										
											2019-10-29 13:21:53 +01:00
+									struct cs_deswizzle_3d : cs_deswizzle_base
-												rsx: Set up framework for hw deswizzle

											
										
										
											2019-10-29 13:13:10 +01:00
+									{
 										union params_t
 										{
-												vk: Implement layer batching for the GPU swizzle decoder

- Handles all LODs per layer meaning cubemaps are now fully handled in 6 passes instead of 6 * (log2(width)) passes.
- Handles all LODs of a 3D texture in one pass as well.
- The improvements do warrant dropping down the number of allowed compute invocations a bit

											
										
										
											2019-11-05 15:00:07 +01:00
+											u32 data[7];
-												rsx: Set up framework for hw deswizzle

											
										
										
											2019-10-29 13:13:10 +01:00
 											struct
 											{
 												u32 width;
 												u32 height;
-												vk: Enable gpu deswizzling

											
										
										
											2019-10-29 13:21:53 +01:00
+												u32 depth;
-												rsx: Set up framework for hw deswizzle

											
										
										
											2019-10-29 13:13:10 +01:00
+												u32 logw;
 												u32 logh;
-												vk: Enable gpu deswizzling

											
										
										
											2019-10-29 13:21:53 +01:00
+												u32 logd;
-												vk: Implement layer batching for the GPU swizzle decoder

- Handles all LODs per layer meaning cubemaps are now fully handled in 6 passes instead of 6 * (log2(width)) passes.
- Handles all LODs of a 3D texture in one pass as well.
- The improvements do warrant dropping down the number of allowed compute invocations a bit

											
										
										
											2019-11-05 15:00:07 +01:00
+												u32 mipmaps;
-												rsx: Set up framework for hw deswizzle

											
										
										
											2019-10-29 13:13:10 +01:00
+											};
 										}
 										params;
 										const vk::buffer* src_buffer = nullptr;
 										const vk::buffer* dst_buffer = nullptr;
 										u32 in_offset = 0;
 										u32 out_offset = 0;
 										u32 block_length = 0;
-												C-style cast cleanup VI

											
										
										
											2019-12-03 23:34:23 +01:00
-												rsx: Set up framework for hw deswizzle

											
										
										
											2019-10-29 13:13:10 +01:00
+										cs_deswizzle_3d()
 										{
 											verify("Unsupported block type" HERE), (sizeof(_BlockType) & 3) == 0;
 											ssbo_count = 2;
-												vk: Enable gpu deswizzling

											
										
										
											2019-10-29 13:21:53 +01:00
+											use_push_constants = true;
-												vk: Implement layer batching for the GPU swizzle decoder

- Handles all LODs per layer meaning cubemaps are now fully handled in 6 passes instead of 6 * (log2(width)) passes.
- Handles all LODs of a 3D texture in one pass as well.
- The improvements do warrant dropping down the number of allowed compute invocations a bit

											
										
										
											2019-11-05 15:00:07 +01:00
+											push_constants_size = 28;
-												vk: Enable gpu deswizzling

											
										
										
											2019-10-29 13:21:53 +01:00
-												rsx: Set up framework for hw deswizzle

											
										
										
											2019-10-29 13:13:10 +01:00
+											create();
 											m_src =
 											"#version 450\n"
-												vk: Implement layer batching for the GPU swizzle decoder

- Handles all LODs per layer meaning cubemaps are now fully handled in 6 passes instead of 6 * (log2(width)) passes.
- Handles all LODs of a 3D texture in one pass as well.
- The improvements do warrant dropping down the number of allowed compute invocations a bit

											
										
										
											2019-11-05 15:00:07 +01:00
+											"layout(local_size_x = %ws, local_size_y = 1, local_size_z = 1) in;\n\n"
-												rsx: Set up framework for hw deswizzle

											
										
										
											2019-10-29 13:13:10 +01:00
-												vk: Enable gpu deswizzling

											
										
										
											2019-10-29 13:21:53 +01:00
+											"layout(set=0, binding=0, std430) buffer ssbo0{ uint data_in[]; };\n"
 											"layout(set=0, binding=1, std430) buffer ssbo1{ uint data_out[]; };\n"
 											"layout(push_constant) uniform parameters\n"
-												rsx: Set up framework for hw deswizzle

											
										
										
											2019-10-29 13:13:10 +01:00
+											"{\n"
 											"	uint image_width;\n"
 											"	uint image_height;\n"
-												vk: Enable gpu deswizzling

											
										
										
											2019-10-29 13:21:53 +01:00
+											"	uint image_depth;\n"
-												rsx: Set up framework for hw deswizzle

											
										
										
											2019-10-29 13:13:10 +01:00
+											"	uint image_logw;\n"
 											"	uint image_logh;\n"
-												vk: Enable gpu deswizzling

											
										
										
											2019-10-29 13:21:53 +01:00
+											"	uint image_logd;\n"
-												vk: Implement layer batching for the GPU swizzle decoder

- Handles all LODs per layer meaning cubemaps are now fully handled in 6 passes instead of 6 * (log2(width)) passes.
- Handles all LODs of a 3D texture in one pass as well.
- The improvements do warrant dropping down the number of allowed compute invocations a bit

											
										
										
											2019-11-05 15:00:07 +01:00
+											"	uint lod_count;\n"
 											"};\n\n"
 											"struct invocation_properties\n"
 											"{\n"
 											"	uint data_offset;\n"
 											"	uvec3 size;\n"
 											"	uvec3 size_log2;\n"
-												rsx: Set up framework for hw deswizzle

											
										
										
											2019-10-29 13:13:10 +01:00
+											"};\n\n"
-												vk: Enable gpu deswizzling

											
										
										
											2019-10-29 13:21:53 +01:00
+											"#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n"
-												vk: Implement layer batching for the GPU swizzle decoder

- Handles all LODs per layer meaning cubemaps are now fully handled in 6 passes instead of 6 * (log2(width)) passes.
- Handles all LODs of a 3D texture in one pass as well.
- The improvements do warrant dropping down the number of allowed compute invocations a bit

											
										
										
											2019-11-05 15:00:07 +01:00
+											"#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n\n"
 											"invocation_properties invocation;\n\n"
 											"bool init_invocation_properties(const in uint offset)\n"
 											"{\n"
 											"	invocation.data_offset = 0;\n"
 											"	invocation.size.x = image_width;\n"
 											"	invocation.size.y = image_height;\n"
 											"	invocation.size.z = image_depth;\n"
 											"	invocation.size_log2.x = image_logw;\n"
 											"	invocation.size_log2.y = image_logh;\n"
 											"	invocation.size_log2.z = image_logd;\n"
 											"	uint level_end = image_width * image_height * image_depth;\n"
 											"	uint level = 1;\n\n"
 											"	while (offset >= level_end && level < lod_count)\n"
 											"	{\n"
 											"		invocation.data_offset = level_end;\n"
 											"		invocation.size.xy /= 2;\n"
 											"		invocation.size.xy = max(invocation.size.xy, uvec2(1));\n"
 											"		invocation.size_log2.xy = max(invocation.size_log2.xy, uvec2(1));\n"
 											"		invocation.size_log2.xy --;\n"
 											"		level_end += (invocation.size.x * invocation.size.y * image_depth);\n"
 											"		level++;"
 											"	}\n\n"
 											"	return (offset < level_end);\n"
 											"}\n\n"
-												rsx: Set up framework for hw deswizzle

											
										
										
											2019-10-29 13:13:10 +01:00
-												vk: Minor compute optimizations
- Remove use of uniform buffers for compute static data. Use push
constants instead.
- Minor touchups to the deswizzle code to avoid redundant data copies.

											
										
										
											2019-11-02 19:15:19 +01:00
+											"uint get_z_index(const in uint x_, const in uint y_, const in uint z_)\n"
-												rsx: Set up framework for hw deswizzle

											
										
										
											2019-10-29 13:13:10 +01:00
+											"{\n"
 											"	uint offset = 0;\n"
 											"	uint shift = 0;\n"
-												vk: Minor compute optimizations
- Remove use of uniform buffers for compute static data. Use push
constants instead.
- Minor touchups to the deswizzle code to avoid redundant data copies.

											
										
										
											2019-11-02 19:15:19 +01:00
+											"	uint x = x_;\n"
 											"	uint y = y_;\n"
 											"	uint z = z_;\n"
-												vk: Implement layer batching for the GPU swizzle decoder

- Handles all LODs per layer meaning cubemaps are now fully handled in 6 passes instead of 6 * (log2(width)) passes.
- Handles all LODs of a 3D texture in one pass as well.
- The improvements do warrant dropping down the number of allowed compute invocations a bit

											
										
										
											2019-11-05 15:00:07 +01:00
+											"	uint log2w = invocation.size_log2.x;\n"
 											"	uint log2h = invocation.size_log2.y;\n"
 											"	uint log2d = invocation.size_log2.z;\n"
-												rsx: Set up framework for hw deswizzle

											
										
										
											2019-10-29 13:13:10 +01:00
+											"\n"
 											"	do\n"
 											"	{\n"
 											"		if (log2w > 0)\n"
 											"		{\n"
 											"			offset |= (x & 1) << shift;\n"
 											"			shift++;\n"
 											"			x >>= 1;\n"
 											"			log2w--;\n"
 											"		}\n"
 											"\n"
 											"		if (log2h > 0)\n"
 											"		{\n"
 											"			offset |= (y & 1) << shift;\n"
 											"			shift++;\n"
 											"			y >>= 1;\n"
 											"			log2h--;\n"
 											"		}\n"
 											"\n"
 											"		if (log2d > 0)\n"
 											"		{\n"
 											"			offset |= (z & 1) << shift;\n"
 											"			shift++;\n"
 											"			z >>= 1;\n"
 											"			log2d--;\n"
 											"		}\n"
 											"	}\n"
-												vk: Enable gpu deswizzling

											
										
										
											2019-10-29 13:21:53 +01:00
+											"	while(x > 0 || y > 0 || z > 0);\n"
-												rsx: Set up framework for hw deswizzle

											
										
										
											2019-10-29 13:13:10 +01:00
+											"\n"
 											"	return offset;\n"
 											"}\n\n"
 											"void main()\n"
 											"{\n"
-												vk: Implement layer batching for the GPU swizzle decoder

- Handles all LODs per layer meaning cubemaps are now fully handled in 6 passes instead of 6 * (log2(width)) passes.
- Handles all LODs of a 3D texture in one pass as well.
- The improvements do warrant dropping down the number of allowed compute invocations a bit

											
										
										
											2019-11-05 15:00:07 +01:00
+											"	uint invocations_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);"
 											"	uint texel_id = (gl_GlobalInvocationID.y * invocations_x) + gl_GlobalInvocationID.x;\n"
 											"	uint word_count = %_wordcount;\n\n"
 											"	if (!init_invocation_properties(texel_id))\n"
-												rsx: Set up framework for hw deswizzle

											
										
										
											2019-10-29 13:13:10 +01:00
+											"		return;\n\n"
-												vk: Implement layer batching for the GPU swizzle decoder

- Handles all LODs per layer meaning cubemaps are now fully handled in 6 passes instead of 6 * (log2(width)) passes.
- Handles all LODs of a 3D texture in one pass as well.
- The improvements do warrant dropping down the number of allowed compute invocations a bit

											
										
										
											2019-11-05 15:00:07 +01:00
+											"	// Calculations done in texels, not bytes\n"
 											"	uint row_length = invocation.size.x;\n"
 											"	uint slice_length = (invocation.size.y * row_length);\n"
 											"	uint level_offset = (texel_id - invocation.data_offset);\n"
 											"	uint slice_offset = (level_offset % slice_length);\n"
 											"	uint z = (level_offset / slice_length);\n"
 											"	uint y = (slice_offset / row_length);\n"
 											"	uint x = (slice_offset % row_length);\n\n"
-												vk: Enable gpu deswizzling

											
										
										
											2019-10-29 13:21:53 +01:00
-												vk: Implement layer batching for the GPU swizzle decoder

- Handles all LODs per layer meaning cubemaps are now fully handled in 6 passes instead of 6 * (log2(width)) passes.
- Handles all LODs of a 3D texture in one pass as well.
- The improvements do warrant dropping down the number of allowed compute invocations a bit

											
										
										
											2019-11-05 15:00:07 +01:00
+											"	uint src_texel_id = get_z_index(x, y, z);\n"
 											"	uint dst_id = (texel_id * word_count);\n"
 											"	uint src_id = (src_texel_id + invocation.data_offset) * word_count;\n\n"
-												rsx: Set up framework for hw deswizzle

											
										
										
											2019-10-29 13:13:10 +01:00
 											"	for (uint i = 0; i < word_count; ++i)\n"
 											"	{\n"
-												vk: Enable gpu deswizzling

											
										
										
											2019-10-29 13:21:53 +01:00
+											"		uint value = data_in[src_id++];\n"
 											"		data_out[dst_id++] = %f(value);\n"
-												rsx: Set up framework for hw deswizzle

											
										
										
											2019-10-29 13:13:10 +01:00
+											"	}\n\n"
-												vk: Enable gpu deswizzling

											
										
										
											2019-10-29 13:21:53 +01:00
-												rsx: Set up framework for hw deswizzle

											
										
										
											2019-10-29 13:13:10 +01:00
+											"}\n";
 											std::string transform;
 											if constexpr (_SwapBytes)
 											{
 												if constexpr (sizeof(_BaseType) == 4)
 												{
 													transform = "bswap_u32";
 												}
 												else if constexpr (sizeof(_BaseType) == 2)
 												{
 													transform = "bswap_u16";
 												}
 												else
 												{
 													fmt::throw_exception("Unreachable" HERE);
 												}
 											}
 											const std::pair<std::string, std::string> syntax_replace[] =
 											{
-												vk: Implement layer batching for the GPU swizzle decoder

- Handles all LODs per layer meaning cubemaps are now fully handled in 6 passes instead of 6 * (log2(width)) passes.
- Handles all LODs of a 3D texture in one pass as well.
- The improvements do warrant dropping down the number of allowed compute invocations a bit

											
										
										
											2019-11-05 15:00:07 +01:00
+												{ "%ws", std::to_string(optimal_group_size) },
-												rsx: Set up framework for hw deswizzle

											
										
										
											2019-10-29 13:13:10 +01:00
+												{ "%_wordcount", std::to_string(sizeof(_BlockType) / 4) },
 												{ "%f", transform }
 											};
 											m_src = fmt::replace_all(m_src, syntax_replace);
 										}
 										void bind_resources() override
 										{
 											m_program->bind_buffer({ src_buffer->value, in_offset, block_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
 											m_program->bind_buffer({ dst_buffer->value, out_offset, block_length }, 1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
 										}
 										// Records the push-constant upload for the next dispatch on `cmd`:
 										// push_constants_size bytes read from params.data, written at offset 0 of the
 										// compute-stage push-constant range of m_pipeline_layout.
 										void set_parameters(VkCommandBuffer cmd)
 										{
 											vkCmdPushConstants(cmd, m_pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, push_constants_size, params.data);
 										}
-												vk: Implement layer batching for the GPU swizzle decoder

- Handles all LODs per layer meaning cubemaps are now fully handled in 6 passes instead of 6 * (log2(width)) passes.
- Handles all LODs of a 3D texture in one pass as well.
- The improvements do warrant dropping down the number of allowed compute invocations a bit

											
										
										
											2019-11-05 15:00:07 +01:00
+										void run(VkCommandBuffer cmd, const vk::buffer* dst, u32 out_offset, const vk::buffer* src, u32 in_offset, u32 data_length, u32 width, u32 height, u32 depth, u32 mipmaps) override
-												rsx: Set up framework for hw deswizzle

											
										
										
											2019-10-29 13:13:10 +01:00
+										{
 											dst_buffer = dst;
 											src_buffer = src;
 											this->in_offset = in_offset;
 											this->out_offset = out_offset;
-												vk: Implement layer batching for the GPU swizzle decoder

- Handles all LODs per layer meaning cubemaps are now fully handled in 6 passes instead of 6 * (log2(width)) passes.
- Handles all LODs of a 3D texture in one pass as well.
- The improvements do warrant dropping down the number of allowed compute invocations a bit

											
										
										
											2019-11-05 15:00:07 +01:00
+											this->block_length = data_length;
-												rsx: Set up framework for hw deswizzle

											
										
										
											2019-10-29 13:13:10 +01:00
 											params.width = width;
-												vk: Enable gpu deswizzling

											
										
										
											2019-10-29 13:21:53 +01:00
+											params.height = height;
 											params.depth = depth;
-												vk: Implement layer batching for the GPU swizzle decoder

- Handles all LODs per layer meaning cubemaps are now fully handled in 6 passes instead of 6 * (log2(width)) passes.
- Handles all LODs of a 3D texture in one pass as well.
- The improvements do warrant dropping down the number of allowed compute invocations a bit

											
										
										
											2019-11-05 15:00:07 +01:00
+											params.mipmaps = mipmaps;
-												rsx: Set up framework for hw deswizzle

											
										
										
											2019-10-29 13:13:10 +01:00
+											params.logw = rsx::ceil_log2(width);
 											params.logh = rsx::ceil_log2(height);
-												vk: Enable gpu deswizzling

											
										
										
											2019-10-29 13:21:53 +01:00
+											params.logd = rsx::ceil_log2(depth);
 											set_parameters(cmd);
-												rsx: Set up framework for hw deswizzle

											
										
										
											2019-10-29 13:13:10 +01:00
-												vk: Compute kernel fixups

- Adhere to workgroup count limits as exposed by the GPU vendor.
  They already execute properly even when going beyond the limits but this removes validation noise.
- Fix invocation counts for deswizzle kernel. The count was incorrect if blocksize was not 4, causing a bunch of useless work to be done.

											
										
										
											2019-11-05 15:03:25 +01:00
+											const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size);
-												Implement rounded_div

Round-to-nearest integral based division, optimized for unsigned integral.
Used in sceNpTrophyGetGameProgress.
Do not allow signed values for aligned_div(), align().

											
										
										
											2019-12-16 20:56:14 +01:00
+											const u32 linear_invocations = aligned_div(data_length, num_bytes_per_invocation);
-												vk: Implement layer batching for the GPU swizzle decoder

- Handles all LODs per layer meaning cubemaps are now fully handled in 6 passes instead of 6 * (log2(width)) passes.
- Handles all LODs of a 3D texture in one pass as well.
- The improvements do warrant dropping down the number of allowed compute invocations a bit

											
										
										
											2019-11-05 15:00:07 +01:00
+											compute_task::run(cmd, linear_invocations);
-												rsx: Set up framework for hw deswizzle

											
										
										
											2019-10-29 13:13:10 +01:00
+										}
 									};
-												vk: Implement hw conditional rendering

											
										
										
											2019-12-10 05:56:44 +01:00
+									struct cs_aggregator : compute_task
 									{
 										const buffer* src = nullptr;
 										const buffer* dst = nullptr;
 										u32 block_length = 0;
 										u32 word_count = 0;
 										cs_aggregator()
 										{
 											ssbo_count = 2;
 											create();
 											m_src =
 												"#version 450\n"
 												"layout(local_size_x = %ws, local_size_y = 1, local_size_z = 1) in;\n\n"
 												"layout(set=0, binding=0, std430) readonly buffer ssbo0{ uint src[]; };\n"
 												"layout(set=0, binding=1, std430) writeonly buffer ssbo1{ uint result; };\n\n"
 												"void main()\n"
 												"{\n"
 												"	if (gl_GlobalInvocationID.x < src.length())\n"
 												"	{\n"
 												"		atomicAdd(result, src[gl_GlobalInvocationID.x]);\n"
 												"	}\n"
 												"}\n";
 											const std::pair<std::string, std::string> syntax_replace[] =
 											{
 												{ "%ws", std::to_string(optimal_group_size) },
 											};
 											m_src = fmt::replace_all(m_src, syntax_replace);
 										}
 										void bind_resources() override
 										{
 											m_program->bind_buffer({ src->value, 0, block_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
 											m_program->bind_buffer({ dst->value, 0, 4 }, 1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
 										}
 										void run(VkCommandBuffer cmd, const vk::buffer* dst, const vk::buffer* src, u32 num_words)
 										{
 											this->dst = dst;
 											this->src = src;
 											word_count = num_words;
 											block_length = num_words * 4;
 											const u32 linear_invocations = aligned_div(word_count, optimal_group_size);
 											compute_task::run(cmd, linear_invocations);
 										}
 									};
-												vk: Add synchronous compute pipelines
- Compute is now used to assist in some parts of blit operations, since there are no format conversions with vulkan like OGL does
- TODO: Integrate this into all types of GPU memory conversion operations instead of downloading to CPU then converting

											
										
										
											2018-06-12 17:46:59 +02:00
+									// TODO: Replace with a proper manager
 									extern std::unordered_map<u32, std::unique_ptr<vk::compute_task>> g_compute_tasks;
 									template<class T>
 									T* get_compute_task()
 									{
 										u32 index = id_manager::typeinfo::get_index<T>();
 										auto &e = g_compute_tasks[index];
 										if (!e)
 										{
 											e = std::make_unique<T>();
 											e->create();
 										}
 										return static_cast<T*>(e.get());
 									}
 									void reset_compute_tasks();
 								}