#pragma once #include "../system_config.h" #include "util/File.h" #include "util/lockless.h" #include "util/Thread.h" #include "Common/bitfield.hpp" #include "Common/unordered_map.hpp" #include "Emu/System.h" #include "Emu/cache_utils.hpp" #include "Emu/RSX/Program/RSXVertexProgram.h" #include "Emu/RSX/Program/RSXFragmentProgram.h" #include "Overlays/Shaders/shader_loading_dialog.h" #include #include #include #include #include #include "util/sysinfo.hpp" #include "util/fnv_hash.hpp" namespace rsx { template class shaders_cache { struct unpacked_shader { pipeline_storage_type props; RSXVertexProgram vp; RSXFragmentProgram fp; }; using unpacked_type = lf_fifo; struct pipeline_data { u64 vertex_program_hash; u64 fragment_program_hash; u64 pipeline_storage_hash; u32 vp_ctrl0; u32 vp_ctrl1; u32 vp_texture_dimensions; u32 vp_reserved_0; u64 vp_instruction_mask[9]; u32 vp_base_address; u32 vp_entry; u16 vp_jump_table[32]; u16 vp_multisampled_textures; u16 vp_reserved_1; u32 vp_reserved_2; u32 fp_ctrl; u32 fp_texture_dimensions; u32 fp_texcoord_control; u16 fp_height; u16 fp_pixel_layout; u16 fp_lighting_flags; u16 fp_shadow_textures; u16 fp_redirected_textures; u16 fp_multisampled_textures; u8 fp_mrt_count; u8 fp_reserved0; u16 fp_reserved1; u32 fp_reserved2; pipeline_storage_type pipeline_properties; }; std::string version_prefix; std::string root_path; std::string pipeline_class_name; lf_fifo, 100> fragment_program_data; backend_storage& m_storage; std::atomic m_shader_storage_exit{false}; std::condition_variable m_shader_storage_cv; std::mutex m_shader_storage_mtx; std::vector m_shader_storage_worker_queue; std::thread m_shader_storage_worker_thread = std::thread([this] { while (!m_shader_storage_exit.load()) { unpacked_shader item; { std::unique_lock lock(m_shader_storage_mtx); m_shader_storage_cv.wait(lock); if (m_shader_storage_worker_queue.empty()) { continue; } item = std::move(m_shader_storage_worker_queue.back()); m_shader_storage_worker_queue.pop_back(); } pipeline_data data = pack(item.props, item.vp, item.fp); std::string fp_name = root_path + "/raw/" + fmt::format("%llX.fp", data.fragment_program_hash); std::string vp_name = root_path + "/raw/" + fmt::format("%llX.vp", data.vertex_program_hash); if (fs::stat_t s{}; !fs::get_stat(fp_name, s) || s.size != item.fp.ucode_length) { fs::write_pending_file(fp_name, item.fp.get_data(), item.fp.ucode_length); } if (fs::stat_t s{}; !fs::get_stat(vp_name, s) || s.size != item.vp.data.size() * sizeof(u32)) { fs::write_pending_file(vp_name, item.vp.data); } const u32 state_params[] = { data.vp_ctrl0, data.vp_ctrl1, data.fp_ctrl, data.vp_texture_dimensions, data.fp_texture_dimensions, data.fp_texcoord_control, data.fp_height, data.fp_pixel_layout, data.fp_lighting_flags, data.fp_shadow_textures, data.fp_redirected_textures, data.vp_multisampled_textures, data.fp_multisampled_textures, data.fp_mrt_count, }; const usz state_hash = rpcs3::hash_array(state_params); const std::string pipeline_file_name = fmt::format("%llX+%llX+%llX+%llX.bin", data.vertex_program_hash, data.fragment_program_hash, data.pipeline_storage_hash, state_hash); const std::string pipeline_path = root_path + "/pipelines/" + pipeline_class_name + "/" + version_prefix + "/" + pipeline_file_name; fs::write_pending_file(pipeline_path, &data, sizeof(data)); } }); static std::string get_message(u32 index, u32 processed, u32 entry_count) { return fmt::format("%s pipeline object %u of %u", index == 0 ? "Loading" : "Compiling", processed, entry_count); } void load_shaders(uint nb_workers, unpacked_type& unpacked, std::string& directory_path, std::vector& entries, u32 entry_count, shader_loading_dialog* dlg) { atomic_t processed(0); std::function shader_load_worker = [&](u32 start_at, u32 stop_at) { u32 thread_processed = 0; auto update_stats = [&] { if (thread_processed == 0) { return true; } processed += thread_processed; thread_processed = 0; return !Emu.IsStopped(); }; for (u32 pos = start_at; pos < stop_at; ++pos) { const fs::dir_entry& tmp = entries[pos]; thread_processed++; const auto filename = directory_path + "/" + tmp.name; fs::file f(filename); if (!f) { fs::remove_file(filename); continue; } if (f.size() != sizeof(pipeline_data)) { rsx_log.error("Removing cached pipeline object %s since it's not binary compatible with the current shader cache", tmp.name.c_str()); fs::remove_file(filename); continue; } pipeline_data pdata{}; f.read(&pdata, f.size()); auto entry = unpack(pdata); if (entry.vp.data.empty() || !entry.fp.ucode_length) { continue; } m_storage.preload_programs(nullptr, entry.vp, entry.fp); unpacked[unpacked.push_begin()] = std::move(entry); if (thread_processed >= 10 && !update_stats()) { return; } } update_stats(); }; await_workers(nb_workers, 0, shader_load_worker, processed, entry_count, dlg); } template void compile_shaders(uint nb_workers, unpacked_type& unpacked, u32 entry_count, shader_loading_dialog* dlg, Args&&... args) { atomic_t processed(0); std::function shader_comp_worker = [&](u32 start_at, u32 stop_at) { u32 thread_processed = 0; auto update_stats = [&] { if (thread_processed == 0) { return true; } processed += thread_processed; thread_processed = 0; return !Emu.IsStopped(); }; for (u32 pos = start_at; pos < stop_at; ++pos) { unpacked_shader& entry = unpacked[pos]; m_storage.add_pipeline_entry(entry.vp, entry.fp, entry.props, std::forward(args)...); thread_processed++; if (thread_processed >= 3 && !update_stats()) { return; } } update_stats(); }; await_workers(nb_workers, 1, shader_comp_worker, processed, entry_count, dlg); } void await_workers(uint nb_workers, u8 step, std::function& worker, atomic_t& processed, u32 entry_count, shader_loading_dialog* dlg) { if (nb_workers > entry_count) { nb_workers = entry_count; } if (nb_workers == 1) { steady_clock::time_point last_update; // Call the worker function directly, stopping it prematurely to be able update the screen u32 stop_at = 0; u32 start_at = 0; do { stop_at = std::min(start_at + 10, entry_count); worker(start_at, stop_at); start_at = stop_at; // Only update the screen at about 60fps since updating it everytime slows down the process steady_clock::time_point now = steady_clock::now(); if ((std::chrono::duration_cast(now - last_update) > 16ms) || (stop_at == entry_count)) { dlg->update_msg(step, get_message(step, stop_at, entry_count)); dlg->set_value(step, stop_at); last_update = now; } } while (stop_at < entry_count && !Emu.IsStopped()); } else { named_thread_group workers("RSX Worker ", nb_workers, [&](u32 thread_index) { if (nb_workers == entry_count) { worker(thread_index, thread_index + 1); return; } auto per_thread_entries = entry_count / nb_workers; auto start_at = per_thread_entries * thread_index; auto stop_at = thread_index == nb_workers - 1 ? entry_count : start_at + per_thread_entries; worker(start_at, stop_at); }); u32 current_progress = 0; u32 last_update_progress = 0; while ((current_progress < entry_count) && !Emu.IsStopped()) { thread_ctrl::wait_for(16'000); // Around 60fps should be good enough if (Emu.IsStopped()) break; current_progress = std::min(processed.load(), entry_count); if (last_update_progress != current_progress) { last_update_progress = current_progress; dlg->update_msg(step, get_message(step, current_progress, entry_count)); dlg->set_value(step, current_progress); } } } if (!Emu.IsStopped()) { ensure(processed == entry_count); } } public: shaders_cache(backend_storage& storage, std::string pipeline_class, std::string version_prefix_str = "v1") : version_prefix(std::move(version_prefix_str)), pipeline_class_name(std::move(pipeline_class)), m_storage(storage) { if (!g_cfg.video.disable_on_disk_shader_cache) { if (std::string cache_path = rpcs3::cache::get_ppu_cache(); !cache_path.empty()) { root_path = std::move(cache_path) + "shaders_cache/"; } } } ~shaders_cache() { { std::lock_guard lock(m_shader_storage_mtx); m_shader_storage_exit = true; m_shader_storage_cv.notify_one(); } m_shader_storage_worker_thread.join(); } template void load(shader_loading_dialog* dlg, Args&&... args) { if (root_path.empty()) { return; } std::string directory_path = root_path + "/pipelines/" + pipeline_class_name + "/" + version_prefix; fs::dir root = fs::dir(directory_path); if (!root) { fs::create_path(directory_path); fs::create_path(root_path + "/raw"); return; } std::vector entries; for (auto&& entry : root) { if (entry.is_directory) continue; if (entry.name.ends_with(".bin")) { entries.push_back(std::move(entry)); } } u32 entry_count = ::size32(entries); if (!entry_count) return; root.rewind(); // Progress dialog std::unique_ptr fallback_dlg; if (!dlg) { fallback_dlg = std::make_unique(); dlg = fallback_dlg.get(); } dlg->create("Preloading cached shaders from disk.\nPlease wait...", "Shader Compilation"); dlg->set_limit(0, entry_count); dlg->set_limit(1, entry_count); dlg->update_msg(0, get_message(0, 0, entry_count)); dlg->update_msg(1, get_message(1, 0, entry_count)); // Preload everything needed to compile the shaders unpacked_type unpacked; uint nb_workers = g_cfg.video.renderer == video_renderer::vulkan ? utils::get_thread_count() * 2 : 1; load_shaders(nb_workers, unpacked, directory_path, entries, entry_count, dlg); // Account for any invalid entries entry_count = unpacked.size(); compile_shaders(nb_workers, unpacked, entry_count, dlg, std::forward(args)...); dlg->refresh(); dlg->close(); } void store(const pipeline_storage_type& pipeline, const RSXVertexProgram& vp, const RSXFragmentProgram& fp) { if (root_path.empty()) { return; } if (vp.jump_table.size() > 32) { rsx_log.error("shaders_cache: vertex program has more than 32 jump addresses. Entry not saved to cache"); return; } auto item = unpacked_shader{pipeline, vp, RSXFragmentProgram::clone(fp) /* ???? */}; std::lock_guard lock(m_shader_storage_mtx); m_shader_storage_worker_queue.push_back(std::move(item)); m_shader_storage_cv.notify_one(); } void wait_stores() { while (true) { { std::lock_guard lock(m_shader_storage_mtx); if (m_shader_storage_worker_queue.empty()) { return; } } std::this_thread::sleep_for(std::chrono::milliseconds(50)); } } RSXVertexProgram load_vp_raw(u64 program_hash) const { RSXVertexProgram vp = {}; fs::file f(fmt::format("%s/raw/%llX.vp", root_path, program_hash)); if (f) f.read(vp.data, f.size() / sizeof(u32)); return vp; } RSXFragmentProgram load_fp_raw(u64 program_hash) { fs::file f(fmt::format("%s/raw/%llX.fp", root_path, program_hash)); RSXFragmentProgram fp = {}; const u32 size = fp.ucode_length = f ? ::size32(f) : 0; if (!size) { return fp; } auto buf = std::make_unique(size); fp.data = buf.get(); f.read(buf.get(), size); fragment_program_data[fragment_program_data.push_begin()] = std::move(buf); return fp; } unpacked_shader unpack(pipeline_data& data) { unpacked_shader result; result.vp = load_vp_raw(data.vertex_program_hash); result.fp = load_fp_raw(data.fragment_program_hash); result.props = data.pipeline_properties; result.vp.ctrl = data.vp_ctrl0; result.vp.output_mask = data.vp_ctrl1; result.vp.texture_state.texture_dimensions = data.vp_texture_dimensions; result.vp.texture_state.multisampled_textures = data.vp_multisampled_textures; result.vp.base_address = data.vp_base_address; result.vp.entry = data.vp_entry; pack_bitset(result.vp.instruction_mask, data.vp_instruction_mask); for (u8 index = 0; index < 32; ++index) { const auto address = data.vp_jump_table[index]; if (address == u16{umax}) { // End of list marker break; } result.vp.jump_table.emplace(address); } result.fp.ctrl = data.fp_ctrl; result.fp.texture_state.texture_dimensions = data.fp_texture_dimensions; result.fp.texture_state.shadow_textures = data.fp_shadow_textures; result.fp.texture_state.redirected_textures = data.fp_redirected_textures; result.fp.texture_state.multisampled_textures = data.fp_multisampled_textures; result.fp.texcoord_control_mask = data.fp_texcoord_control; result.fp.two_sided_lighting = !!(data.fp_lighting_flags & 0x1); result.fp.mrt_buffers_count = data.fp_mrt_count; return result; } pipeline_data pack(const pipeline_storage_type& pipeline, const RSXVertexProgram& vp, const RSXFragmentProgram& fp) { pipeline_data data_block = {}; data_block.pipeline_properties = pipeline; data_block.vertex_program_hash = m_storage.get_hash(vp); data_block.fragment_program_hash = m_storage.get_hash(fp); data_block.pipeline_storage_hash = m_storage.get_hash(pipeline); data_block.vp_ctrl0 = vp.ctrl; data_block.vp_ctrl1 = vp.output_mask; data_block.vp_texture_dimensions = vp.texture_state.texture_dimensions; data_block.vp_multisampled_textures = vp.texture_state.multisampled_textures; data_block.vp_base_address = vp.base_address; data_block.vp_entry = vp.entry; unpack_bitset(vp.instruction_mask, data_block.vp_instruction_mask); u8 index = 0; while (index < 32) { if (!index && !vp.jump_table.empty()) { for (auto& address : vp.jump_table) { data_block.vp_jump_table[index++] = static_cast(address); } } else { // End of list marker data_block.vp_jump_table[index] = u16{umax}; break; } } data_block.fp_ctrl = fp.ctrl; data_block.fp_texture_dimensions = fp.texture_state.texture_dimensions; data_block.fp_texcoord_control = fp.texcoord_control_mask; data_block.fp_lighting_flags = u16(fp.two_sided_lighting); data_block.fp_shadow_textures = fp.texture_state.shadow_textures; data_block.fp_redirected_textures = fp.texture_state.redirected_textures; data_block.fp_multisampled_textures = fp.texture_state.multisampled_textures; data_block.fp_mrt_count = fp.mrt_buffers_count; return data_block; } }; namespace vertex_cache { // A null vertex cache template class default_vertex_cache { public: virtual ~default_vertex_cache() = default; virtual const storage_type* find_vertex_range(u32 /*local_addr*/, u32 /*data_length*/) { return nullptr; } virtual void store_range(u32 /*local_addr*/, u32 /*data_length*/, u32 /*offset_in_heap*/) {} virtual void purge() {} }; struct uploaded_range { uptr local_address; u32 offset_in_heap; u32 data_length; }; // A weak vertex cache with no data checks or memory range locks // Of limited use since contents are only guaranteed to be valid once per frame // Supports upto 1GiB block lengths if typed and full 4GiB otherwise. // Using a 1:1 hash-value with robin-hood is 2x faster than what we had before with std-map-of-arrays. class weak_vertex_cache : public default_vertex_cache { using storage_type = uploaded_range; private: rsx::unordered_map vertex_ranges; FORCE_INLINE u64 hash(u32 local_addr, u32 data_length) const { return u64(local_addr) | (u64(data_length) << 32); } public: const storage_type* find_vertex_range(u32 local_addr, u32 data_length) override { const auto key = hash(local_addr, data_length); const auto found = vertex_ranges.find(key); if (found == vertex_ranges.end()) { return nullptr; } return std::addressof(found->second); } void store_range(u32 local_addr, u32 data_length, u32 offset_in_heap) override { storage_type v = {}; v.data_length = data_length; v.local_address = local_addr; v.offset_in_heap = offset_in_heap; const auto key = hash(local_addr, data_length); vertex_ranges[key] = v; } void purge() override { vertex_ranges.clear(); } }; } // namespace vertex_cache } // namespace rsx