#pragma once #include "Utilities/VirtualMemory.h" #include "Utilities/hash.h" #include "Utilities/File.h" #include "Emu/Memory/vm.h" #include "gcm_enums.h" #include "Common/ProgramStateCache.h" #include "Emu/Cell/Modules/cellMsgDialog.h" #include "Emu/System.h" #include "Common/texture_cache_checker.h" #include "rsx_utils.h" #include #include namespace rsx { enum protection_policy { protect_policy_one_page, //Only guard one page, preferrably one where this section 'wholly' fits protect_policy_conservative, //Guards as much memory as possible that is guaranteed to only be covered by the defined range without sharing protect_policy_full_range //Guard the full memory range. Shared pages may be invalidated by access outside the object we're guarding }; enum section_bounds { full_range, locked_range, confirmed_range }; static inline void memory_protect(const address_range& range, utils::protection prot) { verify(HERE), range.is_page_range(); //LOG_ERROR(RSX, "memory_protect(0x%x, 0x%x, %x)", static_cast(range.start), static_cast(range.length()), static_cast(prot)); utils::memory_protect(vm::base(range.start), range.length(), prot); #ifdef TEXTURE_CACHE_DEBUG tex_cache_checker.set_protection(range, prot); #endif } class buffered_section { public: static const protection_policy guard_policy = protect_policy_full_range; private: address_range locked_range; address_range cpu_range = {}; address_range confirmed_range; utils::protection protection = utils::protection::rw; bool locked = false; inline void init_lockable_range(const address_range &range) { locked_range = range.to_page_range(); if ((guard_policy != protect_policy_full_range) && (range.length() >= 4096)) { const u32 block_start = (locked_range.start < range.start) ? (locked_range.start + 4096u) : locked_range.start; const u32 block_end = locked_range.end; if (block_start < block_end) { // protect unique page range locked_range.start = block_start; locked_range.end = block_end; } if (guard_policy == protect_policy_one_page) { // protect exactly one page locked_range.set_length(4096u); } } AUDIT( (locked_range.start == page_start(range.start)) || (locked_range.start == next_page(range.start)) ); AUDIT( locked_range.end <= page_end(range.end) ); verify(HERE), locked_range.is_page_range(); } public: buffered_section() = default; ~buffered_section() = default; void reset(const address_range &memory_range) { verify(HERE), memory_range.valid() && locked == false; cpu_range = address_range(memory_range); confirmed_range.invalidate(); locked_range.invalidate(); protection = utils::protection::rw; locked = false; init_lockable_range(cpu_range); } protected: void invalidate_range() { ASSERT(!locked); cpu_range.invalidate(); confirmed_range.invalidate(); locked_range.invalidate(); } public: void protect(utils::protection new_prot, bool force = false) { if (new_prot == protection && !force) return; verify(HERE), locked_range.is_page_range(); AUDIT( !confirmed_range.valid() || confirmed_range.inside(cpu_range) ); #ifdef TEXTURE_CACHE_DEBUG if (new_prot != protection || force) { if (locked && !force) // When force=true, it is the responsibility of the caller to remove this section from the checker refcounting tex_cache_checker.remove(locked_range, protection); if (new_prot != utils::protection::rw) tex_cache_checker.add(locked_range, new_prot); } #endif // TEXTURE_CACHE_DEBUG rsx::memory_protect(locked_range, new_prot); protection = new_prot; locked = (protection != utils::protection::rw); if (protection == utils::protection::no) { tag_memory(); } else { if (!locked) { //Unprotect range also invalidates secured range confirmed_range.invalidate(); } } } void protect(utils::protection prot, const std::pair& new_confirm) { // new_confirm.first is an offset after cpu_range.start // new_confirm.second is the length (after cpu_range.start + new_confirm.first) #ifdef TEXTURE_CACHE_DEBUG // We need to remove the lockable range from page_info as we will be re-protecting with force==true if (locked) tex_cache_checker.remove(locked_range, protection); #endif if (prot != utils::protection::rw) { if (confirmed_range.valid()) { confirmed_range.start = std::min(confirmed_range.start, cpu_range.start + new_confirm.first); confirmed_range.end = std::max(confirmed_range.end, cpu_range.start + new_confirm.first + new_confirm.second - 1); } else { confirmed_range = address_range::start_length(cpu_range.start + new_confirm.first, new_confirm.second); ASSERT(!locked || locked_range.inside(confirmed_range.to_page_range())); } verify(HERE), confirmed_range.inside(cpu_range); init_lockable_range(confirmed_range); } protect(prot, true); } inline void unprotect() { AUDIT(protection != utils::protection::rw); protect(utils::protection::rw); } inline void discard() { #ifdef TEXTURE_CACHE_DEBUG if (locked) tex_cache_checker.remove(locked_range, protection); #endif protection = utils::protection::rw; confirmed_range.invalidate(); locked = false; } inline const address_range& get_bounds(section_bounds bounds) const { switch (bounds) { case section_bounds::full_range: return cpu_range; case section_bounds::locked_range: return locked_range; case section_bounds::confirmed_range: return confirmed_range.valid() ? confirmed_range : cpu_range; default: ASSUME(0); } } /** * Overlapping checks */ inline bool overlaps(const u32 address, section_bounds bounds) const { return get_bounds(bounds).overlaps(address); } inline bool overlaps(const address_range &other, section_bounds bounds) const { return get_bounds(bounds).overlaps(other); } inline bool overlaps(const address_range_vector &other, section_bounds bounds) const { return get_bounds(bounds).overlaps(other); } inline bool overlaps(const buffered_section &other, section_bounds bounds) const { return get_bounds(bounds).overlaps(other.get_bounds(bounds)); } inline bool inside(const address_range &other, section_bounds bounds) const { return get_bounds(bounds).inside(other); } inline bool inside(const address_range_vector &other, section_bounds bounds) const { return get_bounds(bounds).inside(other); } inline bool inside(const buffered_section &other, section_bounds bounds) const { return get_bounds(bounds).inside(other.get_bounds(bounds)); } inline s32 signed_distance(const address_range &other, section_bounds bounds) const { return get_bounds(bounds).signed_distance(other); } inline u32 distance(const address_range &other, section_bounds bounds) const { return get_bounds(bounds).distance(other); } /** * Utilities */ inline bool valid_range() const { return cpu_range.valid(); } inline bool is_locked() const { return locked; } inline u32 get_section_base() const { return cpu_range.start; } inline u32 get_section_size() const { return cpu_range.valid() ? cpu_range.length() : 0; } inline const address_range& get_locked_range() const { AUDIT( locked ); return locked_range; } inline const address_range& get_section_range() const { return cpu_range; } const address_range& get_confirmed_range() const { return confirmed_range.valid() ? confirmed_range : cpu_range; } const std::pair get_confirmed_range_delta() const { if (!confirmed_range.valid()) return { 0, cpu_range.length() }; return { confirmed_range.start - cpu_range.start, confirmed_range.length() }; } inline bool matches(const address_range &range) const { return cpu_range.valid() && cpu_range == range; } inline utils::protection get_protection() const { return protection; } inline address_range get_min_max(const address_range& current_min_max, section_bounds bounds) const { return get_bounds(bounds).get_min_max(current_min_max); } /** * Super Pointer */ template inline T* get_ptr(u32 address) const { return reinterpret_cast(vm::g_sudo_addr + address); } /** * Memory tagging */ private: inline void tag_memory() { // We only need to tag memory if we are in full-range mode if (guard_policy == protect_policy_full_range) return; AUDIT(locked); const address_range& range = get_confirmed_range(); volatile u32* first = get_ptr(range.start); volatile u32* last = get_ptr(range.end - 3); *first = range.start; *last = range.end; } public: bool test_memory_head() { if (guard_policy == protect_policy_full_range) return true; AUDIT(locked); const auto& range = get_confirmed_range(); volatile const u32* first = get_ptr(range.start); return (*first == range.start); } bool test_memory_tail() { if (guard_policy == protect_policy_full_range) return true; AUDIT(locked); const auto& range = get_confirmed_range(); volatile const u32* last = get_ptr(range.end-3); return (*last == range.end); } }; template class shaders_cache { struct pipeline_data { u64 vertex_program_hash; u64 fragment_program_hash; u64 pipeline_storage_hash; u32 vp_ctrl; u32 vp_texture_dimensions; u64 vp_instruction_mask[8]; u32 vp_base_address; u32 vp_entry; u16 vp_jump_table[32]; u32 fp_ctrl; u32 fp_texture_dimensions; u32 fp_texcoord_control; u16 fp_unnormalized_coords; u16 fp_height; u16 fp_pixel_layout; u16 fp_lighting_flags; u16 fp_shadow_textures; u16 fp_redirected_textures; u16 fp_alphakill_mask; u64 fp_zfunc_mask; pipeline_storage_type pipeline_properties; }; std::string version_prefix; std::string root_path; std::string pipeline_class_name; std::unordered_map> fragment_program_data; backend_storage& m_storage; public: struct progress_dialog_helper { std::shared_ptr dlg; atomic_t ref_cnt; virtual void create() { dlg = Emu.GetCallbacks().get_msg_dialog(); if (dlg) { dlg->type.se_normal = true; dlg->type.bg_invisible = true; dlg->type.progress_bar_count = 2; dlg->ProgressBarSetTaskbarIndex(-1); // -1 to combine all progressbars in the taskbar progress dlg->on_close = [](s32 status) { Emu.CallAfter([]() { Emu.Stop(); }); }; ref_cnt++; Emu.CallAfter([&]() { dlg->Create("Preloading cached shaders from disk.\nPlease wait...", "Shader Compilation"); ref_cnt--; }); } while (ref_cnt.load() && !Emu.IsStopped()) { _mm_pause(); } } virtual void update_msg(u32 index, u32 processed, u32 entry_count) { if (!dlg) { return; } ref_cnt++; Emu.CallAfter([&, index, processed, entry_count]() { const char *text = index == 0 ? "Loading pipeline object %u of %u" : "Compiling pipeline object %u of %u"; dlg->ProgressBarSetMsg(index, fmt::format(text, processed, entry_count)); ref_cnt--; }); } virtual void inc_value(u32 index, u32 value) { if (!dlg) { return; } ref_cnt++; Emu.CallAfter([&, index, value]() { dlg->ProgressBarInc(index, value); ref_cnt--; }); } virtual void set_limit(u32 index, u32 limit) { if (!dlg) { return; } ref_cnt++; Emu.CallAfter([&, index, limit]() { dlg->ProgressBarSetLimit(index, limit); ref_cnt--; }); } virtual void refresh() {} virtual void close() { while (ref_cnt.load() && !Emu.IsStopped()) { _mm_pause(); } } }; shaders_cache(backend_storage& storage, std::string pipeline_class, std::string version_prefix_str = "v1") : version_prefix(std::move(version_prefix_str)) , pipeline_class_name(std::move(pipeline_class)) , m_storage(storage) { if (!g_cfg.video.disable_on_disk_shader_cache) { root_path = Emu.PPUCache() + "shaders_cache"; } } template void load(progress_dialog_helper* dlg, Args&& ...args) { if (g_cfg.video.disable_on_disk_shader_cache) { return; } std::string directory_path = root_path + "/pipelines/" + pipeline_class_name + "/" + version_prefix; if (!fs::is_dir(directory_path)) { fs::create_path(directory_path); fs::create_path(root_path + "/raw"); return; } fs::dir root = fs::dir(directory_path); u32 entry_count = 0; std::vector entries; for (auto It = root.begin(); It != root.end(); ++It, entry_count++) { fs::dir_entry tmp = *It; if (tmp.name == "." || tmp.name == "..") continue; entries.push_back(tmp); } if ((entry_count = (u32)entries.size()) <= 2) return; root.rewind(); // Invalid pipeline entries to be removed std::vector invalid_entries; // Progress dialog std::unique_ptr fallback_dlg; if (!dlg) { fallback_dlg = std::make_unique(); dlg = fallback_dlg.get(); } dlg->create(); dlg->set_limit(0, entry_count); dlg->set_limit(1, entry_count); dlg->update_msg(0, 0, entry_count); dlg->update_msg(1, 0, entry_count); // Setup worker threads unsigned nb_threads = std::thread::hardware_concurrency(); std::vector worker_threads(nb_threads); // Preload everything needed to compile the shaders // Can probably be parallelized too, but since it's mostly reading files it's probably not worth it std::vector> unpacked; std::chrono::time_point last_update; u32 processed_since_last_update = 0; for (u32 i = 0; (i < entry_count) && !Emu.IsStopped(); i++) { fs::dir_entry tmp = entries[i]; const auto filename = directory_path + "/" + tmp.name; std::vector bytes; fs::file f(filename); if (f.size() != sizeof(pipeline_data)) { LOG_ERROR(RSX, "Cached pipeline object %s is not binary compatible with the current shader cache", tmp.name.c_str()); invalid_entries.push_back(filename); continue; } f.read(bytes, f.size()); auto entry = unpack(*(pipeline_data*)bytes.data()); m_storage.preload_programs(std::get<1>(entry), std::get<2>(entry)); unpacked.push_back(entry); // Only update the screen at about 10fps since updating it everytime slows down the process std::chrono::time_point now = std::chrono::steady_clock::now(); processed_since_last_update++; if ((std::chrono::duration_cast(now - last_update) > 100ms) || (i == entry_count - 1)) { dlg->update_msg(0, i + 1, entry_count); dlg->inc_value(0, processed_since_last_update); last_update = now; processed_since_last_update = 0; } } // Account for any invalid entries entry_count = u32(unpacked.size()); atomic_t processed(0); std::function shader_comp_worker = [&](u32 index) { u32 pos; while (((pos = processed++) < entry_count) && !Emu.IsStopped()) { auto& entry = unpacked[pos]; m_storage.add_pipeline_entry(std::get<1>(entry), std::get<2>(entry), std::get<0>(entry), std::forward(args)...); } }; if (g_cfg.video.renderer == video_renderer::vulkan) { // Start workers for (u32 i = 0; i < nb_threads; i++) { worker_threads[i] = std::thread(shader_comp_worker, i); } // Wait for the workers to finish their task while updating UI u32 current_progress = 0; u32 last_update_progress = 0; while ((current_progress < entry_count) && !Emu.IsStopped()) { std::this_thread::sleep_for(100ms); // Around 10fps should be good enough current_progress = std::min(processed.load(), entry_count); processed_since_last_update = current_progress - last_update_progress; last_update_progress = current_progress; if (processed_since_last_update > 0) { dlg->update_msg(1, current_progress, entry_count); dlg->inc_value(1, processed_since_last_update); } } // Need to join the threads to be absolutely sure shader compilation is done. for (std::thread& worker_thread : worker_threads) worker_thread.join(); } else { u32 pos; while (((pos = processed++) < entry_count) && !Emu.IsStopped()) { auto& entry = unpacked[pos]; m_storage.add_pipeline_entry(std::get<1>(entry), std::get<2>(entry), std::get<0>(entry), std::forward(args)...); // Update screen at about 10fps std::chrono::time_point now = std::chrono::steady_clock::now(); processed_since_last_update++; if ((std::chrono::duration_cast(now - last_update) > 100ms) || (pos == entry_count - 1)) { dlg->update_msg(1, pos + 1, entry_count); dlg->inc_value(1, processed_since_last_update); last_update = now; processed_since_last_update = 0; } } } if (!invalid_entries.empty()) { for (const auto &filename : invalid_entries) { fs::remove_file(filename); } LOG_NOTICE(RSX, "shader cache: %d entries were marked as invalid and removed", invalid_entries.size()); } dlg->refresh(); dlg->close(); } void store(pipeline_storage_type &pipeline, RSXVertexProgram &vp, RSXFragmentProgram &fp) { if (g_cfg.video.disable_on_disk_shader_cache) { return; } if (vp.jump_table.size() > 32) { LOG_ERROR(RSX, "shaders_cache: vertex program has more than 32 jump addresses. Entry not saved to cache"); return; } pipeline_data data = pack(pipeline, vp, fp); std::string fp_name = root_path + "/raw/" + fmt::format("%llX.fp", data.fragment_program_hash); std::string vp_name = root_path + "/raw/" + fmt::format("%llX.vp", data.vertex_program_hash); if (!fs::is_file(fp_name)) { fs::file(fp_name, fs::rewrite).write(fp.addr, fp.ucode_length); } if (!fs::is_file(vp_name)) { fs::file(vp_name, fs::rewrite).write(vp.data); } u64 state_hash = 0; state_hash ^= rpcs3::hash_base(data.vp_ctrl); state_hash ^= rpcs3::hash_base(data.fp_ctrl); state_hash ^= rpcs3::hash_base(data.vp_texture_dimensions); state_hash ^= rpcs3::hash_base(data.fp_texture_dimensions); state_hash ^= rpcs3::hash_base(data.fp_texcoord_control); state_hash ^= rpcs3::hash_base(data.fp_unnormalized_coords); state_hash ^= rpcs3::hash_base(data.fp_height); state_hash ^= rpcs3::hash_base(data.fp_pixel_layout); state_hash ^= rpcs3::hash_base(data.fp_lighting_flags); state_hash ^= rpcs3::hash_base(data.fp_shadow_textures); state_hash ^= rpcs3::hash_base(data.fp_redirected_textures); state_hash ^= rpcs3::hash_base(data.fp_alphakill_mask); state_hash ^= rpcs3::hash_base(data.fp_zfunc_mask); std::string pipeline_file_name = fmt::format("%llX+%llX+%llX+%llX.bin", data.vertex_program_hash, data.fragment_program_hash, data.pipeline_storage_hash, state_hash); std::string pipeline_path = root_path + "/pipelines/" + pipeline_class_name + "/" + version_prefix + "/" + pipeline_file_name; fs::file(pipeline_path, fs::rewrite).write(&data, sizeof(pipeline_data)); } RSXVertexProgram load_vp_raw(u64 program_hash) { std::vector data; std::string filename = fmt::format("%llX.vp", program_hash); fs::file f(root_path + "/raw/" + filename); f.read(data, f.size() / sizeof(u32)); RSXVertexProgram vp = {}; vp.data = data; vp.skip_vertex_input_check = true; return vp; } RSXFragmentProgram load_fp_raw(u64 program_hash) { std::vector data; std::string filename = fmt::format("%llX.fp", program_hash); fs::file f(root_path + "/raw/" + filename); f.read(data, f.size()); RSXFragmentProgram fp = {}; fragment_program_data[program_hash] = data; fp.addr = fragment_program_data[program_hash].data(); fp.ucode_length = (u32)data.size(); return fp; } std::tuple unpack(pipeline_data &data) { RSXVertexProgram vp = load_vp_raw(data.vertex_program_hash); RSXFragmentProgram fp = load_fp_raw(data.fragment_program_hash); pipeline_storage_type pipeline = data.pipeline_properties; vp.output_mask = data.vp_ctrl; vp.texture_dimensions = data.vp_texture_dimensions; vp.base_address = data.vp_base_address; vp.entry = data.vp_entry; pack_bitset<512>(vp.instruction_mask, data.vp_instruction_mask); for (u8 index = 0; index < 32; ++index) { const auto address = data.vp_jump_table[index]; if (address == UINT16_MAX) { // End of list marker break; } vp.jump_table.emplace(address); } fp.ctrl = data.fp_ctrl; fp.texture_dimensions = data.fp_texture_dimensions; fp.texcoord_control_mask = data.fp_texcoord_control; fp.unnormalized_coords = data.fp_unnormalized_coords; fp.two_sided_lighting = !!(data.fp_lighting_flags & 0x1); fp.shadow_textures = data.fp_shadow_textures; fp.redirected_textures = data.fp_redirected_textures; for (u8 index = 0; index < 16; ++index) { fp.textures_alpha_kill[index] = (data.fp_alphakill_mask & (1 << index))? 1: 0; fp.textures_zfunc[index] = (data.fp_zfunc_mask >> (index << 2)) & 0xF; } return std::make_tuple(pipeline, vp, fp); } pipeline_data pack(pipeline_storage_type &pipeline, RSXVertexProgram &vp, RSXFragmentProgram &fp) { pipeline_data data_block = {}; data_block.pipeline_properties = pipeline; data_block.vertex_program_hash = m_storage.get_hash(vp); data_block.fragment_program_hash = m_storage.get_hash(fp); data_block.pipeline_storage_hash = m_storage.get_hash(pipeline); data_block.vp_ctrl = vp.output_mask; data_block.vp_texture_dimensions = vp.texture_dimensions; data_block.vp_base_address = vp.base_address; data_block.vp_entry = vp.entry; unpack_bitset<512>(vp.instruction_mask, data_block.vp_instruction_mask); u8 index = 0; while (index < 32) { if (!index && !vp.jump_table.empty()) { for (auto &address : vp.jump_table) { data_block.vp_jump_table[index++] = (u16)address; } } else { // End of list marker data_block.vp_jump_table[index] = UINT16_MAX; break; } } data_block.fp_ctrl = fp.ctrl; data_block.fp_texture_dimensions = fp.texture_dimensions; data_block.fp_texcoord_control = fp.texcoord_control_mask; data_block.fp_unnormalized_coords = fp.unnormalized_coords; data_block.fp_lighting_flags = u16(fp.two_sided_lighting); data_block.fp_shadow_textures = fp.shadow_textures; data_block.fp_redirected_textures = fp.redirected_textures; for (u8 index = 0; index < 16; ++index) { data_block.fp_alphakill_mask |= u32(fp.textures_alpha_kill[index] & 0x1) << index; data_block.fp_zfunc_mask |= u64(fp.textures_zfunc[index] & 0xF) << (index << 2); } return data_block; } }; namespace vertex_cache { // A null vertex cache template class default_vertex_cache { public: virtual ~default_vertex_cache() = default; virtual storage_type* find_vertex_range(uintptr_t /*local_addr*/, upload_format, u32 /*data_length*/) { return nullptr; } virtual void store_range(uintptr_t /*local_addr*/, upload_format, u32 /*data_length*/, u32 /*offset_in_heap*/) {} virtual void purge() {} }; // A weak vertex cache with no data checks or memory range locks // Of limited use since contents are only guaranteed to be valid once per frame // TODO: Strict vertex cache with range locks template struct uploaded_range { uintptr_t local_address; upload_format buffer_format; u32 offset_in_heap; u32 data_length; }; template class weak_vertex_cache : public default_vertex_cache, upload_format> { using storage_type = uploaded_range; private: std::unordered_map> vertex_ranges; public: storage_type* find_vertex_range(uintptr_t local_addr, upload_format fmt, u32 data_length) override { const auto data_end = local_addr + data_length; for (auto &v : vertex_ranges[local_addr]) { // NOTE: This has to match exactly. Using sized shortcuts such as >= comparison causes artifacting in some applications (UC1) if (v.buffer_format == fmt && v.data_length == data_length) return &v; } return nullptr; } void store_range(uintptr_t local_addr, upload_format fmt, u32 data_length, u32 offset_in_heap) override { storage_type v = {}; v.buffer_format = fmt; v.data_length = data_length; v.local_address = local_addr; v.offset_in_heap = offset_in_heap; vertex_ranges[local_addr].push_back(v); } void purge() override { vertex_ranges.clear(); } }; } }