llvm: try better protect from oom

2025-12-06 07:12:14 +01:00 · 2025-03-22 04:28:00 +03:00 · 2025-03-22 04:28:00 +03:00 · 0e639725c1
parent 745b0c45e0
commit 0e639725c1
1 changed files with 131 additions and 78 deletions
--- a/rpcs3/rpcs3/Emu/Cell/PPUThread.cpp
+++ b/rpcs3/rpcs3/Emu/Cell/PPUThread.cpp
@ -167,9 +167,104 @@ bool serialize<ppu_thread::cr_bits>(utils::serial& ar, typename ppu_thread::cr_b
 	return true;
 }
 class concurent_memory_limit
 {
 	u32 m_total = 0;
 	atomic_t<u32> m_free = 0;
 	static constexpr auto k_block_size = 1024 * 8;
 public:
 	class [[nodiscard]] user
 	{
 		concurent_memory_limit *m_limit = nullptr;
 		u32 m_used = 0;		
 	public:
 		user(concurent_memory_limit *limit, u32 used) : m_limit(limit), m_used(used) {}
 		user() = default;
 		user(user &&other)
 		{
 			*this = std::move(other);
 		}
 		~user()
 		{
 			if (m_used != 0)
 			{
 				m_limit->release(m_used);
 			}
 		}
 		user &operator=(user &&other)
 		{
 			std::swap(other.m_limit, m_limit);
 			std::swap(other.m_used, m_used);
 			return *this;
 		}
 		explicit operator bool() const { return m_limit != nullptr; }
 	};
 	concurent_memory_limit(u64 total)
 		: m_total(u32(std::min<u64>(total / k_block_size, std::numeric_limits<u32>::max()))), m_free(m_total) {}
 	user acquire(u64 amount)
 	{
 		amount = utils::aligned_div<u64>(amount, k_block_size);
 		u32 allocated = 0;
 		while (!m_free.fetch_op([&, this](u32& value)
 		{
 			if (value >= amount || value == m_total)
 			{
 				// Allow at least allocation, make 0 the "memory unavailable" sign value for atomic waiting efficiency 
 				const u32 new_val = static_cast<u32>(utils::sub_saturate<u64>(value, amount));
 				allocated = value - new_val;
 				value = new_val;
 				return true;
 			}
 			// Resort to waiting
 			allocated = 0;
 			return Emu.IsStopped();
 		}).second)
 		{
 			// Wait until not 0
 			m_free.wait(0);
 		}
 		if (Emu.IsStopped())
 		{
 			return {};
 		}
 		return user(this, allocated);
 	}
 	std::size_t free_memory() const {
 		return m_free.load() * k_block_size;
 	}
 	std::uint64_t total_memory() const {
 		return m_total * k_block_size;
 	}
 private:
 	void release(u32 amount)
 	{
 		if (!m_free.fetch_add(amount))
 		{
 			m_free.notify_all();
 		}
 	}
 };
 extern void ppu_initialize();
 extern void ppu_finalize(const ppu_module<lv2_obj>& info, bool force_mem_release = false);
 extern bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only = false, u64 file_size = 0);
 extern bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_size, concurent_memory_limit &memory_limit);
 static void ppu_initialize2(class jit_compiler& jit, const ppu_module<lv2_obj>& module_part, const std::string& cache_path, const std::string& obj_name);
 extern bool ppu_load_exec(const ppu_exec_object&, bool virtual_load, const std::string&, utils::serial* = nullptr);
 extern std::pair<shared_ptr<lv2_overlay>, CellError> ppu_load_overlay(const ppu_exec_object&, bool virtual_load, const std::string& path, s64 file_offset, utils::serial* = nullptr);
@ -4171,13 +4266,7 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu_
 	lf_queue<file_info> possible_exec_file_paths;
-	// Allow to allocate 2000 times the size of each file for the use of LLVM
+	concurent_memory_limit memory_limit(utils::get_total_memory() / 3);
 	// This works very nicely with Metal Gear Solid 4 for example:
 	// 2 7MB overlay files -> 14GB
 	// The growth in memory requirements of LLVM is not linear with file size of course
 	// But these estimates should hopefully protect RPCS3 in the coming years
 	// Especially when thread count is on the rise with each CPU generation 
 	atomic_t<u32> file_size_limit = static_cast<u32>(std::clamp<u64>(utils::aligned_div<u64>(utils::get_total_memory(), 2000), 65536, u32{umax}));
 	const u32 software_thread_limit = std::min<u32>(g_cfg.core.llvm_threads ? g_cfg.core.llvm_threads : u32{umax}, ::size32(file_queue));
 	const u32 cpu_thread_limit = utils::get_thread_count() > 8u ? std::max<u32>(utils::get_thread_count(), 2) - 1 : utils::get_thread_count(); // One LLVM thread less
@ -4236,7 +4325,6 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu_
 		// Set low priority
 		thread_ctrl::scoped_priority low_prio(-1);
 		u32 inc_fdone = 1;
 		u32 restore_mem = 0;
 		for (usz func_i = fnext++; func_i < file_queue.size(); func_i = fnext++, g_progr_fdone += std::exchange(inc_fdone, 1))
 		{
@ -4245,20 +4333,12 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu_
 				continue;
 			}
 			if (restore_mem)
 			{
 				if (!file_size_limit.fetch_add(restore_mem))
 				{
 					file_size_limit.notify_all();
 				}
 				restore_mem = 0;
 			}
 			auto& [path, offset, file_size] = file_queue[func_i];
 			ppu_log.notice("Trying to load: %s", path);
 			auto file_allocation = memory_limit.acquire(file_size * 2);
 			// Load MSELF, SPRX or SELF
 			fs::file src{path};
@ -4322,52 +4402,15 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu_
 				continue;
 			}
 			auto wait_for_memory = [&]() -> bool
 			{
 				// Try not to process too many files at once because it seems to reduce performance and cause RAM shortages
 				// Concurrently compiling more OVL or huge PRX files does not have much theoretical benefit
 				while (!file_size_limit.fetch_op([&](u32& value)
 				{
 					if (value)
 					{
 						// Allow at least one file, make 0 the "memory unavailable" sign value for atomic waiting efficiency 
 						const u32 new_val = static_cast<u32>(utils::sub_saturate<u64>(value, file_size));
 						restore_mem = value - new_val;
 						value = new_val;
 						return true;
 					}
 					// Resort to waiting
 					restore_mem = 0;
 					return false;
 				}).second)
 				{
 					// Wait until not 0
 					file_size_limit.wait(0);
 				}
 				if (Emu.IsStopped())
 				{
 					return false;
 				}
 				return true;
 			};
 			elf_error prx_err{}, ovl_err{};
 			if (ppu_prx_object obj = src; (prx_err = obj, obj == elf_error::ok))
 			{
 				if (!wait_for_memory())
 				{
 					// Emulation stopped
 					continue;
 				}
 				if (auto prx = ppu_load_prx(obj, true, path, offset))
 				{
 					obj.clear(), src.close(); // Clear decrypted file and elf object memory
-					ppu_initialize(*prx, false, file_size);
+					file_allocation = {}; // release used file memory
 					ppu_initialize(*prx, false, file_size, memory_limit);
 					ppu_finalize(*prx, true);
 					continue;
 				}
@ -4400,11 +4443,8 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu_
 						break;
 					}
-					if (!wait_for_memory())
+					obj.clear(), src.close(); // Clear decrypted file and elf object memory
-					{
+					file_allocation = {}; // release used file memory
 						// Emulation stopped
 						break;
 					}
 					// Participate in thread execution limitation (takes a long time)
 					if (std::lock_guard lock(g_fxo->get<jit_core_allocator>().sem); !ovlm->analyse(0, ovlm->entry, ovlm->seg0_code_end, ovlm->applied_patches, std::vector<u32>{}, []()
@ -4416,8 +4456,7 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu_
 						break;
 					}
-					obj.clear(), src.close(); // Clear decrypted file and elf object memory
+					ppu_initialize(*ovlm, false, file_size, memory_limit);
 					ppu_initialize(*ovlm, false, file_size);
 					ppu_finalize(*ovlm, true);
 					break;
 				}
@ -4432,14 +4471,6 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu_
 			possible_exec_file_paths.push(path, offset, file_size);
 			inc_fdone = 0;
 		}
 		if (restore_mem)
 		{
 			if (!file_size_limit.fetch_add(restore_mem))
 			{
 				file_size_limit.notify_all();
 			}
 		}
 	});
 	// Join every thread
@ -4482,6 +4513,8 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu_
 				continue;
 			}
 			auto file_allocation = memory_limit.acquire(file_size * 2);
 			for (usz i = 0;; i++)
 			{
 				if (i > decrypt_klics.size())
@ -4550,10 +4583,11 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu_
 					}
 					obj.clear(), src.close(); // Clear decrypted file and elf object memory
 					file_allocation = {};
 					_main.name = ' '; // Make ppu_finalize work
 					Emu.ConfigurePPUCache();
-					ppu_initialize(_main, false, file_size);
+					ppu_initialize(_main, false, file_size, memory_limit);
 					spu_cache::initialize(false);
 					ppu_finalize(_main, true);
 					_main = {};
@ -4719,7 +4753,7 @@ extern void ppu_initialize()
 	}
 }
-bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_size)
+bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_size, concurent_memory_limit &memory_limit)
 {
 	if (g_cfg.core.ppu_decoder != ppu_decoder_type::llvm)
 	{
@ -5466,6 +5500,7 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
 		struct thread_op
 		{
 			concurent_memory_limit &memory_limit;
 			atomic_t<u32>& work_cv;
 			std::vector<std::pair<std::string, ppu_module<lv2_obj>>>& workload;
 			const ppu_module<lv2_obj>& main_module;
@ -5474,10 +5509,11 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
 			std::unique_lock<decltype(jit_core_allocator::sem)> core_lock;
-			thread_op(atomic_t<u32>& work_cv, std::vector<std::pair<std::string, ppu_module<lv2_obj>>>& workload
+			thread_op(concurent_memory_limit &memory_limit, atomic_t<u32>& work_cv, std::vector<std::pair<std::string, ppu_module<lv2_obj>>>& workload
 				, const cpu_thread* cpu, const ppu_module<lv2_obj>& main_module, const std::string& cache_path, decltype(jit_core_allocator::sem)& sem) noexcept
-				: work_cv(work_cv)
+				: memory_limit(memory_limit)
 				, work_cv(work_cv)
 				, workload(workload)
 				, main_module(main_module)
 				, cache_path(cache_path)
@ -5488,7 +5524,8 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
 			}
 			thread_op(const thread_op& other) noexcept
-				: work_cv(other.work_cv)
+				: memory_limit(other.memory_limit)
 				, work_cv(other.work_cv)
 				, workload(other.workload)
 				, main_module(other.main_module)
 				, cache_path(other.cache_path)
@ -5521,6 +5558,16 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
 					// Keep allocating workload
 					const auto& [obj_name, part] = std::as_const(workload)[i];
 					std::size_t total_fn_size = 0;
 					for (auto &fn : part.get_funcs())
 					{
 						total_fn_size += fn.size;
 					}
 					ppu_log.warning("LLVM: reporting used memory %u (free/total: %u/%u) by %s%s", total_fn_size * 1024 * 16, memory_limit.free_memory(), memory_limit.total_memory(), cache_path, obj_name);
 					auto used_memory = memory_limit.acquire(total_fn_size * 1024 * 16);
 					std::shared_lock rlock(g_fxo->get<jit_core_allocator>().shared_mtx, std::defer_lock);
 					std::unique_lock lock(g_fxo->get<jit_core_allocator>().shared_mtx, std::defer_lock);
@ -5553,7 +5600,7 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
 		g_watchdog_hold_ctr++;
 		named_thread_group threads(fmt::format("PPUW.%u.", ++g_fxo->get<thread_index_allocator>().index), thread_count
-			, thread_op(work_cv, workload, cpu, info, cache_path, g_fxo->get<jit_core_allocator>().sem)
+			, thread_op(memory_limit, work_cv, workload, cpu, info, cache_path, g_fxo->get<jit_core_allocator>().sem)
 			, [&](u32 /*thread_index*/, thread_op& op)
 		{
 			// Allocate "core"
@ -5728,6 +5775,12 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
 #endif
 }
 bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_size)
 {
 	concurent_memory_limit memory_limit(utils::aligned_div<u64>(utils::get_total_memory(), 2));
 	return ppu_initialize(info, check_only, file_size, memory_limit);
 }
 static void ppu_initialize2(jit_compiler& jit, const ppu_module<lv2_obj>& module_part, const std::string& cache_path, const std::string& obj_name)
 {
 #ifdef LLVM_AVAILABLE