Compare commits

...

6 commits

Author SHA1 Message Date
DH 3986f77869 orbis/umtx: remove state from context 2025-10-06 01:58:24 +03:00
DH e66ce512d2 kernel: Add GlobalKernelObject utility 2025-10-06 01:57:23 +03:00
DH fd9bf42538 rx: shared_cv/shared_mtx disable copying/moving 2025-10-06 01:55:11 +03:00
DH be56f0745a rx/serializer: fixed compilation with gcc 2025-10-06 01:54:10 +03:00
DH 37f423aec3 add missed file changes 2025-10-05 20:07:19 +03:00
DH 640df36c48 moved tsc and asm utilities to rx 2025-10-05 19:28:03 +03:00
133 changed files with 1032 additions and 1381 deletions

View file

@ -272,14 +272,13 @@ target_compile_definitions(rx PRIVATE
if (WITH_PS4)
find_package(nlohmann_json CONFIG)
add_subdirectory(tools)
add_subdirectory(kernel/orbis)
endif()
add_subdirectory(rpcsx)
add_subdirectory(kernel)
if (WITH_PS3)
include(ConfigureCompiler)
add_subdirectory(kernel/cellos)
add_subdirectory(rpcs3)
add_subdirectory(ps3fw)
endif()

View file

@ -41,12 +41,13 @@
#include "rpcs3_version.h"
#include "rpcsx/fw/ps3/cellMsgDialog.h"
#include "rpcsx/fw/ps3/cellSysutil.h"
#include "rx/asm.hpp"
#include "rx/debug.hpp"
#include "util/File.h"
#include "util/JIT.h"
#include "util/StrFmt.h"
#include "util/StrUtil.h"
#include "util/Thread.h"
#include "util/asm.hpp"
#include "util/console.h"
#include "util/fixed_typemap.hpp"
#include "util/logs.hpp"
@ -241,7 +242,7 @@ void jit_announce(uptr, usz, std::string_view);
__android_log_write(ANDROID_LOG_FATAL, "RPCS3", buf.c_str());
jit_announce(0, 0, "");
utils::trap();
rx::breakpoint();
std::abort();
std::terminate();
}

kernel/CMakeLists.txt Normal file
View file

@ -0,0 +1,10 @@
add_library(kernel INTERFACE)
target_include_directories(kernel INTERFACE include)
if (WITH_PS3)
add_subdirectory(cellos)
endif()
if (WITH_PS4)
add_subdirectory(orbis)
endif()
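
Together with the root CMakeLists change above, this defines kernel as a header-only INTERFACE target: linking against it (as obj.orbis-kernel does later in this diff) propagates only the include directory, while the PS3 and PS4 kernel implementations stay gated behind their WITH_PS3/WITH_PS4 options.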

View file

@ -53,10 +53,10 @@
#include "sys_usbd.h"
#include "sys_vm.h"
#include "rx/tsc.hpp"
#include "util/atomic_bit_set.h"
#include "util/init_mutex.hpp"
#include "util/sysinfo.hpp"
#include "util/tsc.hpp"
#include <algorithm>
#include <deque>
#include <optional>
@ -2138,7 +2138,7 @@ void lv2_obj::schedule_all(u64 current_time) {
}
if (const u64 freq = s_yield_frequency) {
const u64 tsc = utils::get_tsc();
const u64 tsc = rx::get_tsc();
const u64 last_tsc = s_last_yield_tsc;
if (tsc >= last_tsc && tsc <= s_max_allowed_yield_tsc &&
@ -2297,7 +2297,7 @@ mwaitx_func static void __mwaitx(u32 cycles, u32 cstate) {
// First bit indicates cstate, 0x0 for C0.2 state (lower power) or 0x1 for
// C0.1 state (higher power)
waitpkg_func static void __tpause(u32 cycles, u32 cstate) {
const u64 tsc = utils::get_tsc() + cycles;
const u64 tsc = rx::get_tsc() + cycles;
_tpause(cstate, tsc);
}
#endif

View file

@ -9,7 +9,7 @@
#include "sys_cond.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
LOG_CHANNEL(sys_cond);
@ -454,7 +454,7 @@ error_code sys_cond_wait(ppu_thread &ppu, u32 cond_id, u64 timeout) {
}
for (usz i = 0; cpu_flag::signal - ppu.state && i < 50; i++) {
busy_wait(500);
rx::busy_wait(500);
}
if (ppu.state & cpu_flag::signal) {

View file

@ -8,7 +8,8 @@
#include "Emu/Memory/vm_locking.h"
#include "rpcsx/fw/ps3/sys_lv2dbg.h"
#include "util/asm.hpp"
#include "rx/align.hpp"
#include "rx/asm.hpp"
void ppu_register_function_at(u32 addr, u32 size,
ppu_intrp_func_t ptr = nullptr);
@ -92,7 +93,7 @@ error_code sys_dbg_write_process_memory(s32 pid, u32 address, u32 size,
for (u32 i = address, exec_update_size = 0; i < end;) {
const u32 op_size =
std::min<u32>(utils::align<u32>(i + 1, 0x10000), end) - i;
std::min<u32>(rx::alignUp<u32>(i + 1, 0x10000), end) - i;
const bool is_exec =
vm::check_addr(i, vm::page_executable | vm::page_readable);

View file

@ -11,7 +11,7 @@
#include "Emu/Cell/SPUThread.h"
#include "sys_process.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
LOG_CHANNEL(sys_event);
@ -468,7 +468,7 @@ error_code sys_event_queue_receive(ppu_thread &ppu, u32 equeue_id,
}
for (usz i = 0; cpu_flag::signal - ppu.state && i < 50; i++) {
busy_wait(500);
rx::busy_wait(500);
}
if (ppu.state & cpu_flag::signal) {

View file

@ -7,7 +7,7 @@
#include "Emu/Cell/ErrorCodes.h"
#include "Emu/Cell/PPUThread.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
LOG_CHANNEL(sys_event_flag);
@ -195,7 +195,7 @@ error_code sys_event_flag_wait(ppu_thread &ppu, u32 id, u64 bitptn, u32 mode,
}
for (usz i = 0; cpu_flag::signal - ppu.state && i < 50; i++) {
busy_wait(500);
rx::busy_wait(500);
}
if (ppu.state & cpu_flag::signal) {

View file

@ -1,9 +1,9 @@
#include "stdafx.h"
#include "rx/asm.hpp"
#include "sys_fs.h"
#include "sys_memory.h"
#include "sys_sync.h"
#include "util/asm.hpp"
#include "Crypto/unedat.h"
#include "Emu/Cell/PPUThread.h"
@ -618,7 +618,7 @@ struct lv2_file::file_view : fs::file_base {
fs::stat_t stat = m_file->file.get_stat();
// TODO: Check this on realhw
// stat.size = utils::sub_saturate<u64>(stat.size, m_off);
// stat.size = rx::sub_saturate<u64>(stat.size, m_off);
stat.is_writable = false;
return stat;
@ -655,7 +655,7 @@ struct lv2_file::file_view : fs::file_base {
}
u64 size() override {
return utils::sub_saturate<u64>(m_file->file.size(), m_off);
return rx::sub_saturate<u64>(m_file->file.size(), m_off);
}
fs::file_id get_id() override {

View file

@ -8,7 +8,7 @@
#include "Emu/Cell/PPUThread.h"
#include "sys_lwmutex.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
LOG_CHANNEL(sys_lwcond);
@ -490,7 +490,7 @@ error_code _sys_lwcond_queue_wait(ppu_thread &ppu, u32 lwcond_id,
}
for (usz i = 0; cpu_flag::signal - ppu.state && i < 50; i++) {
busy_wait(500);
rx::busy_wait(500);
}
if (ppu.state & cpu_flag::signal) {

View file

@ -7,7 +7,7 @@
#include "Emu/Cell/ErrorCodes.h"
#include "Emu/Cell/PPUThread.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
LOG_CHANNEL(sys_lwmutex);
@ -194,7 +194,7 @@ error_code _sys_lwmutex_lock(ppu_thread &ppu, u32 lwmutex_id, u64 timeout) {
}
for (usz i = 0; cpu_flag::signal - ppu.state && i < 50; i++) {
busy_wait(500);
rx::busy_wait(500);
}
if (ppu.state & cpu_flag::signal) {

View file

@ -8,7 +8,8 @@
#include "Emu/IdManager.h"
#include "Emu/Memory/vm_locking.h"
#include "util/asm.hpp"
#include "rx/align.hpp"
#include "rx/asm.hpp"
LOG_CHANNEL(sys_memory);
@ -75,11 +76,11 @@ struct sys_memory_address_table {
};
std::shared_ptr<vm::block_t> reserve_map(u32 alloc_size, u32 align) {
return vm::reserve_map(
align == 0x10000 ? vm::user64k : vm::user1m, 0,
align == 0x10000 ? 0x20000000 : utils::align(alloc_size, 0x10000000),
align == 0x10000 ? (vm::page_size_64k | vm::bf0_0x1)
: (vm::page_size_1m | vm::bf0_0x1));
return vm::reserve_map(align == 0x10000 ? vm::user64k : vm::user1m, 0,
align == 0x10000 ? 0x20000000
: rx::alignUp(alloc_size, 0x10000000),
align == 0x10000 ? (vm::page_size_64k | vm::bf0_0x1)
: (vm::page_size_1m | vm::bf0_0x1));
}
// Todo: fix order of error checks

View file

@ -5,7 +5,7 @@
#include "Emu/Cell/ErrorCodes.h"
#include "Emu/Cell/PPUThread.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "sys_mutex.h"
@ -147,7 +147,7 @@ error_code sys_mutex_lock(ppu_thread &ppu, u32 mutex_id, u64 timeout) {
// Try busy waiting a bit if advantageous
for (u32 i = 0, end = lv2_obj::has_ppus_in_running_state() ? 3 : 10;
id_manager::g_mutex.is_lockable() && i < end; i++) {
busy_wait(300);
rx::busy_wait(300);
result = mutex.try_lock(ppu);
if (!result ||
@ -212,7 +212,7 @@ error_code sys_mutex_lock(ppu_thread &ppu, u32 mutex_id, u64 timeout) {
}
for (usz i = 0; cpu_flag::signal - ppu.state && i < 40; i++) {
busy_wait(500);
rx::busy_wait(500);
}
if (ppu.state & cpu_flag::signal) {

View file

@ -15,7 +15,8 @@
#include "sys_mmapper.h"
#include "sys_process.h"
#include "util/asm.hpp"
#include "rx/align.hpp"
#include "rx/asm.hpp"
#include <thread>
@ -148,7 +149,7 @@ void _sys_ppu_thread_exit(ppu_thread &ppu, u64 errorcode) {
// Need to wait until the current writers finish
if (ppu.state & cpu_flag::memory) {
for (; writer_mask; writer_mask &= vm::g_range_lock_bits[1]) {
busy_wait(200);
rx::busy_wait(200);
}
}
}
@ -468,7 +469,7 @@ error_code _sys_ppu_thread_create(ppu_thread &ppu, vm::ptr<u64> thread_id,
const u32 tls = param->tls;
// Compute actual stack size and allocate
const u32 stack_size = utils::align<u32>(std::max<u32>(_stacksz, 4096), 4096);
const u32 stack_size = rx::alignUp<u32>(std::max<u32>(_stacksz, 4096), 4096);
auto &dct = g_fxo->get<lv2_memory_container>();

View file

@ -10,9 +10,9 @@
#include "Emu/RSX/Core/RSXReservationLock.hpp"
#include "Emu/RSX/RSXThread.h"
#include "Emu/System.h"
#include "rx/asm.hpp"
#include "sys_event.h"
#include "sys_vm.h"
#include "util/asm.hpp"
LOG_CHANNEL(sys_rsx);
@ -46,7 +46,7 @@ static void set_rsx_dmactl(rsx::thread *render, u64 get_put) {
}
}
utils::pause();
rx::pause();
}
// Schedule FIFO interrupt to deal with this immediately

View file

@ -7,7 +7,7 @@
#include "Emu/Cell/ErrorCodes.h"
#include "Emu/Cell/PPUThread.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
LOG_CHANNEL(sys_rwlock);
@ -151,7 +151,7 @@ error_code sys_rwlock_rlock(ppu_thread &ppu, u32 rw_lock_id, u64 timeout) {
}
for (usz i = 0; cpu_flag::signal - ppu.state && i < 50; i++) {
busy_wait(500);
rx::busy_wait(500);
}
if (ppu.state & cpu_flag::signal) {
@ -355,7 +355,7 @@ error_code sys_rwlock_wlock(ppu_thread &ppu, u32 rw_lock_id, u64 timeout) {
}
for (usz i = 0; cpu_flag::signal - ppu.state && i < 50; i++) {
busy_wait(500);
rx::busy_wait(500);
}
if (ppu.state & cpu_flag::signal) {

View file

@ -7,7 +7,7 @@
#include "Emu/Cell/ErrorCodes.h"
#include "Emu/Cell/PPUThread.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
LOG_CHANNEL(sys_semaphore);
@ -167,7 +167,7 @@ error_code sys_semaphore_wait(ppu_thread &ppu, u32 sem_id, u64 timeout) {
}
for (usz i = 0; cpu_flag::signal - ppu.state && i < 50; i++) {
busy_wait(500);
rx::busy_wait(500);
}
if (ppu.state & cpu_flag::signal) {

View file

@ -21,7 +21,8 @@
#include "sys_mmapper.h"
#include "sys_process.h"
#include "util/asm.hpp"
#include "rx/align.hpp"
#include "rx/asm.hpp"
LOG_CHANNEL(sys_spu);
@ -129,7 +130,7 @@ void sys_spu_image::load(const fs::file &stream) {
this->nsegs = 0;
this->segs = vm::null;
vm::page_protect(segs.addr(), utils::align(mem_size, 4096), 0, 0,
vm::page_protect(segs.addr(), rx::alignUp(mem_size, 4096), 0, 0,
vm::page_writable);
}
@ -196,8 +197,8 @@ void sys_spu_image::deploy(u8 *loc, std::span<const sys_spu_segment> segs,
}
auto mem_translate = [loc](u32 addr, u32 size) {
return utils::add_saturate<u32>(addr, size) <= SPU_LS_SIZE ? loc + addr
: nullptr;
return rx::add_saturate<u32>(addr, size) <= SPU_LS_SIZE ? loc + addr
: nullptr;
};
// Apply the patch
@ -1259,7 +1260,7 @@ error_code sys_spu_thread_group_terminate(ppu_thread &ppu, u32 id, s32 value) {
// termination
auto short_sleep = [](ppu_thread &ppu) {
lv2_obj::sleep(ppu);
busy_wait(3000);
rx::busy_wait(3000);
ppu.check_state();
ppu.state += cpu_flag::wait;
};
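
The saturating helpers used above (rx::add_saturate/rx::sub_saturate, replacing their utils:: equivalents throughout this diff) are assumed to clamp at the type's limits instead of wrapping; a minimal sketch for unsigned types:

#include <cstdint>
#include <limits>

// Clamp to the type's range instead of wrapping (unsigned T assumed).
template <typename T> constexpr T add_saturate(T a, T b) {
  return a > std::numeric_limits<T>::max() - b ? std::numeric_limits<T>::max()
                                               : static_cast<T>(a + b);
}
template <typename T> constexpr T sub_saturate(T a, T b) {
  return a < b ? T{0} : static_cast<T>(a - b);
}
static_assert(add_saturate<std::uint32_t>(0xFFFFFFF0u, 0x100u) == 0xFFFFFFFFu);
static_assert(sub_saturate<std::uint64_t>(5, 9) == 0);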

View file

@ -5,8 +5,8 @@
#include "Emu/Cell/ErrorCodes.h"
#include "Emu/Cell/timers.hpp"
#include "Emu/system_config.h"
#include "rx/tsc.hpp"
#include "sys_process.h"
#include "util/tsc.hpp"
#include "util/sysinfo.hpp"
@ -14,7 +14,7 @@ u64 g_timebase_offs{};
static u64 systemtime_offset;
#ifndef __linux__
#include "util/asm.hpp"
#include "rx/asm.hpp"
#endif
#ifdef _WIN32
@ -151,7 +151,7 @@ u64 convert_to_timebased_time(u64 time) {
u64 get_timebased_time() {
if (u64 freq = utils::get_tsc_freq()) {
const u64 tsc = utils::get_tsc();
const u64 tsc = rx::get_tsc();
#if _MSC_VER
const u64 result =
@ -218,7 +218,7 @@ void initialize_timebased_time(u64 timebased_init, bool reset) {
// Returns some relative time in microseconds, don't change this fact
u64 get_system_time() {
if (u64 freq = utils::get_tsc_freq()) {
const u64 tsc = utils::get_tsc();
const u64 tsc = rx::get_tsc();
#if _MSC_VER
const u64 result = static_cast<u64>(u128_from_mul(tsc, 1000000ull) / freq);
@ -358,7 +358,7 @@ error_code sys_time_get_current_time(vm::ptr<s64> sec, vm::ptr<s64> nsec) {
// Get time difference in nanoseconds (using 128 bit accumulator)
const u64 diff_sl = diff_base * 1000000000ull;
const u64 diff_sh = utils::umulh64(diff_base, 1000000000ull);
const u64 diff_sh = rx::umulh64(diff_base, 1000000000ull);
const u64 diff = utils::udiv128(diff_sh, diff_sl, s_time_aux_info.perf_freq);
// get time since Epoch in nanoseconds
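
The pattern above widens the product to 128 bits before dividing so the multiply cannot overflow; a sketch of the same arithmetic with the helpers this diff uses (rx::umulh64 from the new header, utils::udiv128 unchanged, u64 as in util/types.hpp):

// ticks -> nanoseconds: (ticks * 1'000'000'000) / freq, exact in 128 bits.
u64 ticks_to_ns(u64 ticks, u64 freq) {
  const u64 lo = ticks * 1000000000ull;             // low 64 bits of product
  const u64 hi = rx::umulh64(ticks, 1000000000ull); // high 64 bits of product
  return utils::udiv128(hi, lo, freq);              // 128-bit product / freq
}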

View file

@ -9,9 +9,9 @@
#include "Emu/System.h"
#include "Emu/system_config.h"
#include "rx/asm.hpp"
#include "sys_event.h"
#include "sys_process.h"
#include "util/asm.hpp"
#include <deque>
#include <thread>
@ -77,9 +77,9 @@ u64 lv2_timer::check_unlocked(u64 _now) noexcept {
if (period) {
// Set next expiration time and check again
const u64 expire0 = utils::add_saturate<u64>(next, period);
const u64 expire0 = rx::add_saturate<u64>(next, period);
expire.release(expire0);
return utils::sub_saturate<u64>(expire0, _now);
return rx::sub_saturate<u64>(expire0, _now);
}
// Stop after oneshot
@ -265,11 +265,11 @@ error_code _sys_timer_start(ppu_thread &ppu, u32 timer_id, u64 base_time,
const u64 expire =
period == 0 ? base_time : // oneshot
base_time == 0
? utils::add_saturate(start_time, period)
? rx::add_saturate(start_time, period)
:
// periodic timer with no base (using start time as base)
start_time < utils::add_saturate(base_time, period)
? utils::add_saturate(base_time, period)
start_time < rx::add_saturate(base_time, period)
? rx::add_saturate(base_time, period)
:
// periodic with base time over start time
[&]() -> u64 // periodic timer base before start time (align to
@ -282,10 +282,10 @@ error_code _sys_timer_start(ppu_thread &ppu, u32 timer_id, u64 base_time,
// }
// while (base_time < start_time);
const u64 start_time_with_base_time_reminder = utils::add_saturate(
const u64 start_time_with_base_time_reminder = rx::add_saturate(
start_time - start_time % period, base_time % period);
return utils::add_saturate(
return rx::add_saturate(
start_time_with_base_time_reminder,
start_time_with_base_time_reminder < start_time ? period : 0);
}();
@ -428,10 +428,10 @@ error_code sys_timer_usleep(ppu_thread &ppu, u64 sleep_time) {
// Over/underflow checks
if (add_time >= 0) {
sleep_time = utils::add_saturate<u64>(sleep_time, add_time);
sleep_time = rx::add_saturate<u64>(sleep_time, add_time);
} else {
sleep_time =
std::max<u64>(1, utils::sub_saturate<u64>(sleep_time, -add_time));
std::max<u64>(1, rx::sub_saturate<u64>(sleep_time, -add_time));
}
lv2_obj::sleep(ppu, g_cfg.core.sleep_timers_accuracy <

View file

@ -0,0 +1,139 @@
#pragma once
#include "rx/LinkedNode.hpp"
#include "rx/Serializer.hpp"
#include <cassert>
namespace kernel {
namespace detail {
struct GlobalObjectCtl {
void (*construct)();
void (*destruct)();
void (*serialize)(rx::Serializer &);
void (*deserialize)(rx::Deserializer &);
};
template <typename NamespaceT, typename T> struct GlobalKernelObjectInstance {
static inline T *instance = nullptr;
static inline rx::LinkedNode<GlobalObjectCtl> ctl = {
.object = {
.construct = +[] { instance->construct(); },
.destruct = +[] { instance->destruct(); },
.serialize = +[](rx::Serializer &s) { instance->serialize(s); },
.deserialize = +[](rx::Deserializer &s) { instance->deserialize(s); },
},
};
};
} // namespace detail
template <typename NamespaceT> struct GlobalKernelObjectStorage {
template <typename T> static void AddObject() {
auto node = &detail::GlobalKernelObjectInstance<NamespaceT, T>::ctl;
auto head = GetHead();
if (head) {
head->prev = node;
node->next = head;
}
*GetHeadPtr() = node;
}
static void ConstructAll() {
for (auto it = GetHead(); it != nullptr; it = it->next) {
it->object.construct();
}
}
static void DestructAll() {
for (auto it = GetHead(); it != nullptr; it = it->next) {
it->object.destruct();
}
}
static void SerializeAll(rx::Serializer &s) {
for (auto it = GetHead(); it != nullptr; it = it->next) {
it->object.serialize(s);
}
}
static void DeserializeAll(rx::Deserializer &s) {
for (auto it = GetHead(); it != nullptr; it = it->next) {
it->object.deserialize(s);
}
}
private:
static rx::LinkedNode<detail::GlobalObjectCtl> *GetHead() {
return *GetHeadPtr();
}
static rx::LinkedNode<detail::GlobalObjectCtl> **GetHeadPtr() {
static rx::LinkedNode<detail::GlobalObjectCtl> *registry;
return &registry;
}
};
template <rx::Serializable T, typename NamespaceT>
requires std::is_default_constructible_v<T>
class GlobalKernelObject {
union U {
T object;
U() {}
~U() {}
};
U mHolder;
public:
template <typename = void> GlobalKernelObject() {
auto &instance =
detail::GlobalKernelObjectInstance<NamespaceT,
GlobalKernelObject>::instance;
assert(instance == nullptr);
instance = this;
GlobalKernelObjectStorage<NamespaceT>::template AddObject<
GlobalKernelObject>();
}
T *operator->() { return &mHolder.object; }
const T *operator->() const { return &mHolder.object; }
T &operator*() { return mHolder.object; }
const T &operator*() const { return mHolder.object; }
operator T &() { return mHolder.object; }
operator const T &() const { return mHolder.object; }
void serialize(rx::Serializer &s)
requires rx::Serializable<T>
{
s.serialize(mHolder.object);
}
void deserialize(rx::Deserializer &s)
requires rx::Serializable<T>
{
std::construct_at(&mHolder.object);
s.deserialize(mHolder.object);
}
T &get() { return mHolder.object; }
const T &get() const { return mHolder.object; }
private:
template <typename... Args>
requires(std::is_constructible_v<T, Args && ...>)
void construct(Args &&...args) noexcept(
std::is_nothrow_constructible_v<T, Args &&...>) {
std::construct_at(&mHolder.object, std::forward<Args>(args)...);
}
template <typename... Args>
void destruct() noexcept(std::is_nothrow_destructible_v<T>) {
mHolder.object.~T();
}
friend detail::GlobalKernelObjectInstance<NamespaceT, GlobalKernelObject>;
};
} // namespace kernel
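
A minimal usage sketch for this utility (illustrative only, not part of the diff): a tag type names a registry, each GlobalKernelObject self-registers during static initialization, and ConstructAll/DestructAll drive the payloads' lifetimes. ExampleNamespace and Counters are hypothetical, and the rx::Serializer/rx::Deserializer member calls are assumptions based on how this header uses them.

#include <kernel/GlobalKernelObject.hpp>

struct ExampleNamespace; // registry tag (hypothetical)

struct Counters { // hypothetical payload, assumed to model rx::Serializable
  int value = 0;
  void serialize(rx::Serializer &s) { s.serialize(value); }       // API assumed
  void deserialize(rx::Deserializer &s) { s.deserialize(value); } // API assumed
};

// Self-registers with the ExampleNamespace registry; the payload stays
// unconstructed (it lives in a union) until ConstructAll() runs.
static kernel::GlobalKernelObject<Counters, ExampleNamespace> g_counters;

void kernelBootExample() {
  kernel::GlobalKernelObjectStorage<ExampleNamespace>::ConstructAll();
  g_counters->value += 1; // operator-> forwards to the payload
  kernel::GlobalKernelObjectStorage<ExampleNamespace>::DestructAll();
}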

View file

@ -67,7 +67,7 @@ add_library(obj.orbis-kernel OBJECT
src/utils/Logs.cpp
)
target_link_libraries(obj.orbis-kernel PUBLIC orbis::kernel::config rx)
target_link_libraries(obj.orbis-kernel PUBLIC orbis::kernel::config rx kernel)
target_include_directories(obj.orbis-kernel
PUBLIC

View file

@ -0,0 +1,26 @@
#pragma once
#include <kernel/GlobalKernelObject.hpp>
namespace orbis {
struct OrbisNamespace;
template <rx::Serializable T>
using GlobalKernelObject = kernel::GlobalKernelObject<T, OrbisNamespace>;
template <rx::Serializable T> GlobalKernelObject<T> createGlobalObject() {
return {};
}
inline void constructAllGlobals() {
kernel::GlobalKernelObjectStorage<OrbisNamespace>::ConstructAll();
}
inline void destructAllGlobals() {
kernel::GlobalKernelObjectStorage<OrbisNamespace>::DestructAll();
}
template <typename T> T &getGlobalObject() {
assert((kernel::detail::GlobalKernelObjectInstance<
        OrbisNamespace, GlobalKernelObject<T>>::instance));
return kernel::detail::GlobalKernelObjectInstance<
OrbisNamespace, GlobalKernelObject<T>>::instance->get();
}
} // namespace orbis

View file

@ -8,7 +8,6 @@
#include "osem.hpp"
#include "rx/IdMap.hpp"
#include "rx/LinkedNode.hpp"
#include "rx/SharedCV.hpp"
#include "rx/SharedMutex.hpp"
#include "thread/types.hpp"
@ -22,35 +21,6 @@ namespace orbis {
struct Process;
struct Thread;
struct UmtxKey {
// TODO: may contain a reference to a shared memory
std::uintptr_t addr;
orbis::pid_t pid;
auto operator<=>(const UmtxKey &) const = default;
};
struct UmtxCond {
Thread *thr;
rx::shared_cv cv;
UmtxCond(Thread *thr) : thr(thr) {}
};
struct UmtxChain {
rx::shared_mutex mtx;
using queue_type = utils::kmultimap<UmtxKey, UmtxCond>;
queue_type sleep_queue;
queue_type spare_queue;
std::pair<const UmtxKey, UmtxCond> *enqueue(UmtxKey &key, Thread *thr);
void erase(std::pair<const UmtxKey, UmtxCond> *obj);
queue_type::iterator erase(queue_type::iterator it);
uint notify_one(const UmtxKey &key);
uint notify_all(const UmtxKey &key);
uint notify_n(const UmtxKey &key, sint count);
};
enum class FwType : std::uint8_t {
Unknown,
Ps4,
@ -172,26 +142,6 @@ public:
kenvValue[len] = '0';
}
enum {
c_golden_ratio_prime = 2654404609u,
c_umtx_chains = 512,
c_umtx_shifts = 23,
};
// Use getUmtxChain0 or getUmtxChain1
std::tuple<UmtxChain &, UmtxKey, std::unique_lock<rx::shared_mutex>>
getUmtxChainIndexed(int i, Thread *t, uint32_t flags, void *ptr);
// Internal Umtx: Wait/Cv/Sem
auto getUmtxChain0(Thread *t, uint32_t flags, void *ptr) {
return getUmtxChainIndexed(0, t, flags, ptr);
}
// Internal Umtx: Mutex/Umtx/Rwlock
auto getUmtxChain1(Thread *t, uint32_t flags, void *ptr) {
return getUmtxChainIndexed(1, t, flags, ptr);
}
rx::Ref<EventEmitter> deviceEventEmitter;
rx::Ref<rx::RcBase> shmDevice;
rx::Ref<rx::RcBase> dmemDevice;
@ -235,8 +185,6 @@ private:
utils::kmultimap<std::size_t, void *> m_free_heap;
utils::kmultimap<std::size_t, void *> m_used_node;
UmtxChain m_umtx_chains[2][c_umtx_chains]{};
std::atomic<long> m_tsc_freq{0};
rx::shared_mutex m_thread_id_mtx;

View file

@ -283,24 +283,6 @@ void KernelContext::kfree(void *ptr, std::size_t size) {
}
}
std::tuple<UmtxChain &, UmtxKey, std::unique_lock<rx::shared_mutex>>
KernelContext::getUmtxChainIndexed(int i, Thread *t, uint32_t flags,
void *ptr) {
auto pid = t->tproc->pid;
auto p = reinterpret_cast<std::uintptr_t>(ptr);
if (flags & 1) {
pid = 0; // Process shared (TODO)
ORBIS_LOG_WARNING("Using process-shared umtx", t->tid, ptr, (p % 0x4000));
t->where();
}
auto n = p + pid;
if (flags & 1)
n %= 0x4000;
n = ((n * c_golden_ratio_prime) >> c_umtx_shifts) % c_umtx_chains;
std::unique_lock lock(m_umtx_chains[i][n].mtx);
return {m_umtx_chains[i][n], UmtxKey{p, pid}, std::move(lock)};
}
inline namespace utils {
void kfree(void *ptr, std::size_t size) { return g_context.kfree(ptr, size); }
void *kalloc(std::size_t size, std::size_t align) {

View file

@ -1,11 +1,85 @@
#include "umtx.hpp"
#include "orbis/umtx.hpp"
#include "GlobalKernelObject.hpp"
#include "error.hpp"
#include "orbis/KernelContext.hpp"
#include "orbis-config.hpp"
#include "orbis/thread.hpp"
#include "orbis/utils/Logs.hpp"
#include "rx/Serializer.hpp"
#include <limits>
namespace orbis {
struct UmtxKey {
// TODO: may contain a reference to a shared memory
std::uintptr_t addr;
orbis::pid_t pid;
auto operator<=>(const UmtxKey &) const = default;
};
struct UmtxCond {
Thread *thr;
rx::shared_cv cv;
UmtxCond(Thread *thr) : thr(thr) {}
};
struct UmtxChain {
rx::shared_mutex mtx;
using queue_type = utils::kmultimap<UmtxKey, UmtxCond>;
queue_type sleep_queue;
queue_type spare_queue;
std::pair<const UmtxKey, UmtxCond> *enqueue(UmtxKey &key, Thread *thr);
void erase(std::pair<const UmtxKey, UmtxCond> *obj);
queue_type::iterator erase(queue_type::iterator it);
uint notify_one(const UmtxKey &key);
uint notify_all(const UmtxKey &key);
uint notify_n(const UmtxKey &key, sint count);
};
struct UmtxStorage {
enum {
c_golden_ratio_prime = 2654404609u,
c_umtx_chains = 512,
c_umtx_shifts = 23,
};
UmtxChain m_umtx_chains[2][c_umtx_chains]{};
// Use getUmtxChain0 or getUmtxChain1
std::tuple<UmtxChain &, UmtxKey, std::unique_lock<rx::shared_mutex>>
getUmtxChainIndexed(int i, Thread *t, uint32_t flags, void *ptr) {
auto pid = t->tproc->pid;
auto p = reinterpret_cast<std::uintptr_t>(ptr);
if (flags & 1) {
pid = 0; // Process shared (TODO)
ORBIS_LOG_WARNING("Using process-shared umtx", t->tid, ptr, (p % 0x4000));
t->where();
}
auto n = p + pid;
if (flags & 1)
n %= 0x4000;
n = ((n * c_golden_ratio_prime) >> c_umtx_shifts) % c_umtx_chains;
std::unique_lock lock(m_umtx_chains[i][n].mtx);
return {m_umtx_chains[i][n], UmtxKey{p, pid}, std::move(lock)};
}
// Internal Umtx: Wait/Cv/Sem
auto getUmtxChain0(Thread *t, uint32_t flags, void *ptr) {
return getUmtxChainIndexed(0, t, flags, ptr);
}
// Internal Umtx: Mutex/Umtx/Rwlock
auto getUmtxChain1(Thread *t, uint32_t flags, void *ptr) {
return getUmtxChainIndexed(1, t, flags, ptr);
}
void serialize(rx::Serializer &) const {}
void deserialize(rx::Deserializer &) {}
};
static auto umtxStorage = createGlobalObject<UmtxStorage>();
std::pair<const UmtxKey, UmtxCond> *UmtxChain::enqueue(UmtxKey &key,
Thread *thr) {
if (!spare_queue.empty()) {
@ -80,7 +154,7 @@ orbis::ErrorCode orbis::umtx_unlock_umtx(Thread *thread, ptr<umtx> umtx,
orbis::ErrorCode orbis::umtx_wait(Thread *thread, ptr<void> addr, ulong id,
std::uint64_t ut, bool is32, bool ipc) {
ORBIS_LOG_NOTICE(__FUNCTION__, thread->tid, addr, id, ut, is32);
auto [chain, key, lock] = g_context.getUmtxChain0(thread, ipc, addr);
auto [chain, key, lock] = umtxStorage->getUmtxChain0(thread, ipc, addr);
auto node = chain.enqueue(key, thread);
ErrorCode result = {};
ulong val = 0;
@ -127,7 +201,7 @@ orbis::ErrorCode orbis::umtx_wait(Thread *thread, ptr<void> addr, ulong id,
orbis::ErrorCode orbis::umtx_wake(Thread *thread, ptr<void> addr, sint n_wake) {
ORBIS_LOG_NOTICE(__FUNCTION__, thread->tid, addr, n_wake);
auto [chain, key, lock] = g_context.getUmtxChain0(thread, true, addr);
auto [chain, key, lock] = umtxStorage->getUmtxChain0(thread, true, addr);
if (key.pid == 0) {
// IPC workaround (TODO)
chain.notify_all(key);
@ -162,7 +236,7 @@ static ErrorCode do_lock_normal(Thread *thread, ptr<umutex> m, uint flags,
std::uint64_t ut, umutex_lock_mode mode) {
ORBIS_LOG_TRACE(__FUNCTION__, thread->tid, m, flags, ut, mode);
auto [chain, key, lock] = g_context.getUmtxChain1(thread, flags, m);
auto [chain, key, lock] = umtxStorage->getUmtxChain1(thread, flags, m);
ErrorCode error = {};
while (true) {
int owner = m->owner.load(std::memory_order_acquire);
@ -219,7 +293,7 @@ static ErrorCode do_lock_pp(Thread *thread, ptr<umutex> m, uint flags,
static ErrorCode do_unlock_normal(Thread *thread, ptr<umutex> m, uint flags) {
ORBIS_LOG_TRACE(__FUNCTION__, thread->tid, m, flags);
auto [chain, key, lock] = g_context.getUmtxChain1(thread, flags, m);
auto [chain, key, lock] = umtxStorage->getUmtxChain1(thread, flags, m);
int owner = m->owner.load(std::memory_order_acquire);
if ((owner & ~kUmutexContested) != thread->tid)
@ -344,7 +418,7 @@ orbis::ErrorCode orbis::umtx_cv_wait(Thread *thread, ptr<ucond> cv,
}
}
auto [chain, key, lock] = g_context.getUmtxChain0(thread, cv->flags, cv);
auto [chain, key, lock] = umtxStorage->getUmtxChain0(thread, cv->flags, cv);
auto node = chain.enqueue(key, thread);
if (!cv->has_waiters.load(std::memory_order::relaxed)) {
@ -398,7 +472,7 @@ orbis::ErrorCode orbis::umtx_cv_wait(Thread *thread, ptr<ucond> cv,
orbis::ErrorCode orbis::umtx_cv_signal(Thread *thread, ptr<ucond> cv) {
ORBIS_LOG_TRACE(__FUNCTION__, thread->tid, cv);
auto [chain, key, lock] = g_context.getUmtxChain0(thread, cv->flags, cv);
auto [chain, key, lock] = umtxStorage->getUmtxChain0(thread, cv->flags, cv);
if (key.pid == 0) {
// IPC workaround (TODO)
chain.notify_all(key);
@ -413,7 +487,7 @@ orbis::ErrorCode orbis::umtx_cv_signal(Thread *thread, ptr<ucond> cv) {
orbis::ErrorCode orbis::umtx_cv_broadcast(Thread *thread, ptr<ucond> cv) {
ORBIS_LOG_TRACE(__FUNCTION__, thread->tid, cv);
auto [chain, key, lock] = g_context.getUmtxChain0(thread, cv->flags, cv);
auto [chain, key, lock] = umtxStorage->getUmtxChain0(thread, cv->flags, cv);
chain.notify_all(key);
cv->has_waiters.store(0, std::memory_order::relaxed);
return {};
@ -423,7 +497,8 @@ orbis::ErrorCode orbis::umtx_rw_rdlock(Thread *thread, ptr<urwlock> rwlock,
slong fflag, ulong ut) {
ORBIS_LOG_TRACE(__FUNCTION__, thread->tid, rwlock, fflag, ut);
auto flags = rwlock->flags;
auto [chain, key, lock] = g_context.getUmtxChain1(thread, flags & 1, rwlock);
auto [chain, key, lock] =
umtxStorage->getUmtxChain1(thread, flags & 1, rwlock);
auto wrflags = kUrwLockWriteOwner;
if (!(fflag & kUrwLockPreferReader) && !(flags & kUrwLockPreferReader)) {
@ -521,7 +596,8 @@ orbis::ErrorCode orbis::umtx_rw_wrlock(Thread *thread, ptr<urwlock> rwlock,
ORBIS_LOG_TRACE(__FUNCTION__, thread->tid, rwlock, ut);
auto flags = rwlock->flags;
auto [chain, key, lock] = g_context.getUmtxChain1(thread, flags & 1, rwlock);
auto [chain, key, lock] =
umtxStorage->getUmtxChain1(thread, flags & 1, rwlock);
uint32_t blocked_readers = 0;
ErrorCode error = {};
@ -626,7 +702,8 @@ orbis::ErrorCode orbis::umtx_rw_wrlock(Thread *thread, ptr<urwlock> rwlock,
orbis::ErrorCode orbis::umtx_rw_unlock(Thread *thread, ptr<urwlock> rwlock) {
auto flags = rwlock->flags;
auto [chain, key, lock] = g_context.getUmtxChain1(thread, flags & 1, rwlock);
auto [chain, key, lock] =
umtxStorage->getUmtxChain1(thread, flags & 1, rwlock);
auto state = rwlock->state.load(std::memory_order::relaxed);
if (state & kUrwLockWriteOwner) {
@ -681,7 +758,7 @@ orbis::ErrorCode orbis::umtx_rw_unlock(Thread *thread, ptr<urwlock> rwlock) {
orbis::ErrorCode orbis::umtx_wake_private(Thread *thread, ptr<void> addr,
sint n_wake) {
ORBIS_LOG_TRACE(__FUNCTION__, thread->tid, addr, n_wake);
auto [chain, key, lock] = g_context.getUmtxChain0(thread, false, addr);
auto [chain, key, lock] = umtxStorage->getUmtxChain0(thread, false, addr);
chain.notify_n(key, n_wake);
return {};
}
@ -711,7 +788,7 @@ orbis::ErrorCode orbis::umtx_wake_umutex(Thread *thread, ptr<umutex> m,
if (ErrorCode err = uread(flags, &m->flags); err != ErrorCode{})
return err;
auto [chain, key, lock] = g_context.getUmtxChain1(thread, flags, m);
auto [chain, key, lock] = umtxStorage->getUmtxChain1(thread, flags, m);
int owner = m->owner.load(std::memory_order::acquire);
if ((owner & ~kUmutexContested) != 0)
@ -736,7 +813,7 @@ orbis::ErrorCode orbis::umtx_wake_umutex(Thread *thread, ptr<umutex> m,
orbis::ErrorCode orbis::umtx_sem_wait(Thread *thread, ptr<usem> sem,
std::uint64_t ut) {
ORBIS_LOG_TRACE(__FUNCTION__, sem, ut);
auto [chain, key, lock] = g_context.getUmtxChain0(thread, sem->flags, sem);
auto [chain, key, lock] = umtxStorage->getUmtxChain0(thread, sem->flags, sem);
auto node = chain.enqueue(key, thread);
std::uint32_t has_waiters = sem->has_waiters;
@ -785,7 +862,7 @@ orbis::ErrorCode orbis::umtx_sem_wait(Thread *thread, ptr<usem> sem,
orbis::ErrorCode orbis::umtx_sem_wake(Thread *thread, ptr<usem> sem) {
ORBIS_LOG_TRACE(__FUNCTION__, sem);
auto [chain, key, lock] = g_context.getUmtxChain0(thread, sem->flags, sem);
auto [chain, key, lock] = umtxStorage->getUmtxChain0(thread, sem->flags, sem);
if (key.pid == 0) {
// IPC workaround (TODO)
chain.notify_all(key);
@ -819,7 +896,8 @@ orbis::ErrorCode orbis::umtx_wake2_umutex(Thread *thread, ptr<umutex> m,
if (ErrorCode err = uread(flags, &m->flags); err != ErrorCode{})
return err;
auto [chain, key, lock] = g_context.getUmtxChain1(thread, wakeFlags & 1, m);
auto [chain, key, lock] =
umtxStorage->getUmtxChain1(thread, wakeFlags & 1, m);
int owner = 0;
@ -860,7 +938,8 @@ orbis::ErrorCode orbis::umtx_wake3_umutex(Thread *thread, ptr<umutex> m,
if (ErrorCode err = uread(flags, &m->flags); err != ErrorCode{})
return err;
auto [chain, key, lock] = g_context.getUmtxChain1(thread, wakeFlags & 1, m);
auto [chain, key, lock] =
umtxStorage->getUmtxChain1(thread, wakeFlags & 1, m);
int owner = 0;
std::size_t count = chain.sleep_queue.count(key);
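
For intuition, the chain lookup that moved into UmtxStorage is a multiplicative (golden-ratio) hash over the (address, pid) key; a standalone sketch of the same arithmetic, with the constants copied from above:

#include <cstddef>
#include <cstdint>

// Pick one of the 512 chains for a umtx key (mirrors getUmtxChainIndexed).
std::size_t umtxChainIndex(std::uintptr_t addr, std::uint32_t pid, bool shared) {
  std::uintptr_t n = addr + (shared ? 0 : pid); // shared keys use pid == 0
  if (shared)
    n %= 0x4000; // shared keys hash only the low bits (matches % 0x4000 above)
  return ((n * 2654404609u) >> 23) % 512; // golden-ratio multiplicative hash
}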

View file

@ -1,4 +1,6 @@
#include "stdafx.h"
#include "rx/align.hpp"
#include "Emu/perf_meter.hpp"
#include "Emu/Cell/PPUModule.h"
#include "cellos/sys_sync.h"
@ -9,7 +11,7 @@
#include "cellAdec.h"
#include "util/simd.hpp"
#include "util/asm.hpp"
#include "rx/asm.hpp"
LOG_CHANNEL(cellAdec);
@ -415,7 +417,7 @@ void LpcmDecContext::exec(ppu_thread& ppu)
be_t<f32>* const _output = std::assume_aligned<0x80>(output.get_ptr());
s64 output_size = cmd.au_size;
s32 sample_num = static_cast<s32>(utils::align(+lpcm_param->audioPayloadSize, 0x10));
s32 sample_num = static_cast<s32>(rx::alignUp(+lpcm_param->audioPayloadSize, 0x10));
s32 channel_num = 0;
if (!dvd_packing)
@ -860,11 +862,11 @@ error_code _CellAdecCoreOpGetMemSize_lpcm(vm::ptr<CellAdecAttr> attr)
cellAdec.notice("_CellAdecCoreOpGetMemSize_lpcm(attr=*0x%x)", attr);
constexpr u32 mem_size =
utils::align(static_cast<u32>(sizeof(LpcmDecContext)), 0x80) + utils::align(static_cast<u32>(sizeof(CellAdecParamLpcm)), 0x80) + 0x100 // Command data for Spurs task
+ LPCM_DEC_OUTPUT_BUFFER_SIZE + 0x2900 // sizeof(CellSpurs) + sizeof(CellSpursTaskset)
+ 0x3b400 // Spurs context
+ 0x300 // (sizeof(CellSpursQueue) + 0x80 + queue buffer) * 2
+ 0x855; // Unused
rx::alignUp(static_cast<u32>(sizeof(LpcmDecContext)), 0x80) + rx::alignUp(static_cast<u32>(sizeof(CellAdecParamLpcm)), 0x80) + 0x100 // Command data for Spurs task
+ LPCM_DEC_OUTPUT_BUFFER_SIZE + 0x2900 // sizeof(CellSpurs) + sizeof(CellSpursTaskset)
+ 0x3b400 // Spurs context
+ 0x300 // (sizeof(CellSpursQueue) + 0x80 + queue buffer) * 2
+ 0x855; // Unused
static_assert(mem_size == 0x7ebd5);
@ -883,7 +885,7 @@ error_code _CellAdecCoreOpOpenExt_lpcm(ppu_thread& ppu, vm::ptr<LpcmDecContext>
ensure(handle.aligned(0x80)); // LLE doesn't check the alignment or aligns the address itself
ensure(!!notifyAuDone && !!notifyAuDoneArg && !!notifyPcmOut && !!notifyPcmOutArg && !!notifyError && !!notifyErrorArg && !!notifySeqDone && !!notifySeqDoneArg); // These should always be set
const u32 end_of_context_addr = handle.addr() + utils::align(static_cast<u32>(sizeof(LpcmDecContext)), 0x80);
const u32 end_of_context_addr = handle.addr() + rx::alignUp(static_cast<u32>(sizeof(LpcmDecContext)), 0x80);
handle->cmd_queue.front = 0;
handle->cmd_queue.back = 0;
@ -1587,10 +1589,10 @@ error_code adecOpen(ppu_thread& ppu, vm::ptr<CellAdecType> type, vm::cptr<CellAd
const s32 pcm_handle_num = core_ops->getPcmHandleNum(ppu);
const u32 bitstream_info_size = core_ops->getBsiInfoSize(ppu);
const auto _this = vm::ptr<AdecContext>::make(utils::align(+res->startAddr, 0x80));
const auto _this = vm::ptr<AdecContext>::make(rx::alignUp(+res->startAddr, 0x80));
const auto frames = vm::ptr<AdecFrame>::make(_this.addr() + sizeof(AdecContext));
const u32 bitstream_infos_addr = frames.addr() + pcm_handle_num * sizeof(AdecFrame);
const auto core_handle = vm::ptr<void>::make(utils::align(bitstream_infos_addr + bitstream_info_size * pcm_handle_num, 0x80));
const auto core_handle = vm::ptr<void>::make(rx::alignUp(bitstream_infos_addr + bitstream_info_size * pcm_handle_num, 0x80));
if (type->audioCodecType == CELL_ADEC_TYPE_LPCM_DVD)
{

View file

@ -1,11 +1,13 @@
#include "stdafx.h"
#include "rx/align.hpp"
#include "Emu/perf_meter.hpp"
#include "Emu/Cell/PPUModule.h"
#include "cellos/sys_sync.h"
#include "cellos/sys_ppu_thread.h"
#include "Emu/savestate_utils.hpp"
#include "sysPrxForUser.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "util/media_utils.h"
#include "cellAtracXdec.h"
@ -182,7 +184,7 @@ error_code AtracXdecDecoder::set_config_info(u32 sampling_freq, u32 ch_config_id
this->sampling_freq = sampling_freq;
this->ch_config_idx = ch_config_idx;
this->nbytes = nbytes;
this->nbytes_128_aligned = utils::align(nbytes, 0x80);
this->nbytes_128_aligned = rx::alignUp(nbytes, 0x80);
this->nch_in = ch_config_idx <= 4 ? ch_config_idx : ch_config_idx + 1;
if (ch_config_idx > 7u)
@ -741,7 +743,7 @@ error_code _CellAdecCoreOpGetMemSize_atracx(vm::ptr<CellAdecAttr> attr)
constexpr u32 mem_size =
sizeof(AtracXdecContext) + 0x7f + ATXDEC_SPURS_STRUCTS_SIZE + 0x1d8 + atracXdecGetSpursMemSize(nch_in) + ATXDEC_SAMPLES_PER_FRAME * sizeof(f32) * nch_in;
attr->workMemSize = utils::align(mem_size, 0x80);
attr->workMemSize = rx::alignUp(mem_size, 0x80);
return CELL_OK;
}
@ -765,7 +767,7 @@ error_code _CellAdecCoreOpOpenExt_atracx(ppu_thread& ppu, vm::ptr<AtracXdecConte
ensure(!!notifyAuDone && !!notifyAuDoneArg && !!notifyPcmOut && !!notifyPcmOutArg && !!notifyError && !!notifyErrorArg && !!notifySeqDone && !!notifySeqDoneArg); // These should always be set by cellAdec
write_to_ptr(handle.get_ptr(), AtracXdecContext(notifyAuDone, notifyAuDoneArg, notifyPcmOut, notifyPcmOutArg, notifyError, notifyErrorArg, notifySeqDone, notifySeqDoneArg,
vm::bptr<u8>::make(handle.addr() + utils::align(static_cast<u32>(sizeof(AtracXdecContext)), 0x80) + ATXDEC_SPURS_STRUCTS_SIZE)));
vm::bptr<u8>::make(handle.addr() + rx::alignUp(static_cast<u32>(sizeof(AtracXdecContext)), 0x80) + ATXDEC_SPURS_STRUCTS_SIZE)));
const vm::var<sys_mutex_attribute_t> mutex_attr{{SYS_SYNC_PRIORITY, SYS_SYNC_NOT_RECURSIVE, SYS_SYNC_NOT_PROCESS_SHARED, SYS_SYNC_NOT_ADAPTIVE, 0, 0, 0, {"_atd001"_u64}}};
const vm::var<sys_cond_attribute_t> cond_attr{{SYS_SYNC_NOT_PROCESS_SHARED, 0, 0, {"_atd002"_u64}}};

View file

@ -1,4 +1,6 @@
#include "stdafx.h"
#include "rx/align.hpp"
#include "Emu/System.h"
#include "Emu/IdManager.h"
#include "Emu/Cell/PPUModule.h"
@ -7,7 +9,7 @@
#include "cellPamf.h"
#include "cellDmux.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include <thread>
@ -765,7 +767,7 @@ PesHeader::PesHeader(DemuxerStream& stream)
}
ElementaryStream::ElementaryStream(Demuxer* dmux, u32 addr, u32 size, u32 fidMajor, u32 fidMinor, u32 sup1, u32 sup2, vm::ptr<CellDmuxCbEsMsg> cbFunc, u32 cbArg, u32 spec)
: put(utils::align(addr, 128)), dmux(dmux), memAddr(utils::align(addr, 128)), memSize(size - (addr - memAddr)), fidMajor(fidMajor), fidMinor(fidMinor), sup1(sup1), sup2(sup2), cbFunc(cbFunc), cbArg(cbArg), spec(spec)
: put(rx::alignUp(addr, 128)), dmux(dmux), memAddr(rx::alignUp(addr, 128)), memSize(size - (addr - memAddr)), fidMajor(fidMajor), fidMinor(fidMinor), sup1(sup1), sup2(sup2), cbFunc(cbFunc), cbArg(cbArg), spec(spec)
{
}
@ -849,7 +851,7 @@ void ElementaryStream::push_au(u32 size, u64 dts, u64 pts, u64 userdata, bool ra
addr = put;
put = utils::align(put + 128 + size, 128);
put = rx::alignUp(put + 128 + size, 128);
put_count++;
}

View file

@ -18,7 +18,7 @@
#include "Crypto/utils.h"
#include "Loader/PSF.h"
#include "util/StrUtil.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "util/init_mutex.hpp"
#include <span>
@ -691,7 +691,7 @@ error_code cellHddGameGetSizeKB(ppu_thread& ppu, vm::ptr<u32> size)
// This function is very slow by nature
// TODO: Check if after first use the result is being cached so the sleep can
// be reduced in this case
lv2_sleep(utils::sub_saturate<u64>(dirsz == umax ? 2000 : 200000,
lv2_sleep(rx::sub_saturate<u64>(dirsz == umax ? 2000 : 200000,
get_guest_system_time() - start_sleep),
&ppu);
@ -757,7 +757,7 @@ error_code cellGameDataGetSizeKB(ppu_thread& ppu, vm::ptr<u32> size)
// This function is very slow by nature
// TODO: Check if after first use the result is being cached so the sleep can
// be reduced in this case
lv2_sleep(utils::sub_saturate<u64>(dirsz == umax ? 2000 : 200000,
lv2_sleep(rx::sub_saturate<u64>(dirsz == umax ? 2000 : 200000,
get_guest_system_time() - start_sleep),
&ppu);
@ -1127,7 +1127,7 @@ cellGameContentPermit(ppu_thread& ppu,
}
// This function is very slow by nature
lv2_sleep(utils::sub_saturate<u64>(
lv2_sleep(rx::sub_saturate<u64>(
!perm.temp.empty() || perm.can_create ? 200000 : 2000,
get_guest_system_time() - start_sleep),
&ppu);
@ -1886,7 +1886,7 @@ error_code cellGameGetSizeKB(ppu_thread& ppu, vm::ptr<s32> size)
// This function is very slow by nature
// TODO: Check if after first use the result is being cached so the sleep can
// be reduced in this case
lv2_sleep(utils::sub_saturate<u64>(dirsz == umax ? 1000 : 200000,
lv2_sleep(rx::sub_saturate<u64>(dirsz == umax ? 1000 : 200000,
get_guest_system_time() - start_sleep),
&ppu);

View file

@ -10,7 +10,7 @@
#include "cellGcmSys.h"
#include "sysPrxForUser.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
LOG_CHANNEL(cellGcmSys);
@ -1491,7 +1491,7 @@ s32 cellGcmCallback(ppu_thread& ppu, vm::ptr<CellGcmContextData> context, u32 co
return 0;
}
busy_wait();
rx::busy_wait();
}
return CELL_OK;

View file

@ -1,3 +1,5 @@
#include "stdafx.h"
#include "cellSysutil.h"
#include "cellUserInfo.h"
#include "Emu/Cell/PPUModule.h"
@ -12,7 +14,6 @@
#include "Emu/localized_string.h"
#include "Emu/savestate_utils.hpp"
#include "Emu/system_config.h"
#include "stdafx.h"
#include "cellMsgDialog.h"
#include "cellSaveData.h"
@ -26,7 +27,9 @@
#include <mutex>
#include <span>
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "rx/align.hpp"
#include "rx/types.hpp"
LOG_CHANNEL(cellSaveData);
@ -65,11 +68,11 @@ std::string SaveDataEntry::date() const
std::string SaveDataEntry::data_size() const
{
std::string metric = "KB";
u64 sz = utils::aligned_div(size, 1000);
u64 sz = rx::aligned_div(size, 1000);
if (sz > 1000)
{
metric = "MB";
sz = utils::aligned_div(sz, 1000);
sz = rx::aligned_div(sz, 1000);
}
return fmt::format("%lu %s", sz, metric);
}
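
rx::aligned_div and rx::rounded_div (used here and for the trophy percentage later in this diff) are assumed to keep the semantics of the utils:: helpers they replace: round-up and round-to-nearest integer division. A minimal sketch:

#include <cstdint>

// Assumed semantics, for unsigned operands:
constexpr std::uint64_t aligned_div(std::uint64_t a, std::uint64_t b) {
  return (a + b - 1) / b; // round up
}
constexpr std::uint64_t rounded_div(std::uint64_t a, std::uint64_t b) {
  return (a + b / 2) / b; // round to nearest
}
static_assert(aligned_div(1500, 1000) == 2); // 1500 bytes -> "2 KB"
static_assert(rounded_div(149, 100) == 1 && rounded_div(150, 100) == 2);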
@ -1286,7 +1289,7 @@ savedata_op(ppu_thread& ppu, u32 operation, u32 version, vm::cptr<char> dirName,
{
if (!file.is_directory)
{
size_bytes += utils::align(file.size, 1024);
size_bytes += rx::alignUp(file.size, 1024);
}
}
@ -1728,7 +1731,7 @@ savedata_op(ppu_thread& ppu, u32 operation, u32 version, vm::cptr<char> dirName,
statGet->fileNum++;
size_bytes +=
utils::align(entry.size, 1024); // firmware rounds this value up
rx::alignUp(entry.size, 1024); // firmware rounds this value up
if (statGet->fileListNum >= setBuf->fileListMax)
continue;
@ -2345,7 +2348,7 @@ savedata_op(ppu_thread& ppu, u32 operation, u32 version, vm::cptr<char> dirName,
final_blist = fmt::merge(blist, "/");
psf::assign(
psf, "RPCS3_BLIST",
psf::string(utils::align(::size32(final_blist) + 1, 4), final_blist));
psf::string(rx::alignUp(::size32(final_blist) + 1, 4), final_blist));
// Write all files in temporary directory
auto& fsfo = all_files["PARAM.SFO"];

View file

@ -15,7 +15,7 @@
#include "sysPrxForUser.h"
#include "cellSpurs.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "util/v128.hpp"
#include "util/simd.hpp"
@ -4145,7 +4145,7 @@ s32 _spurs::create_task(vm::ptr<CellSpursTaskset> taskset, vm::ptr<u32> task_id,
{
v128 ls_pattern_128 = v128::from64r(ls_pattern->_u64[0], ls_pattern->_u64[1]);
const u32 ls_blocks = utils::popcnt128(ls_pattern_128._u);
const u32 ls_blocks = rx::popcnt128(ls_pattern_128._u);
if (ls_blocks > alloc_ls_blocks)
{

View file

@ -6,7 +6,7 @@
#include "Emu/Cell/SPURecompiler.h"
#include "cellSpurs.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "util/v128.hpp"
#include "util/simd.hpp"
@ -1446,7 +1446,7 @@ s32 spursTasksetProcessRequest(spu_thread& spu, s32 request, u32* taskId, u32* i
{
v128 newlyReadyTasks = gv_andn(ready, signalled | pready);
numNewlyReadyTasks = utils::popcnt128(newlyReadyTasks._u);
numNewlyReadyTasks = rx::popcnt128(newlyReadyTasks._u);
}
v128 readyButNotRunning;
@ -1701,7 +1701,7 @@ s32 spursTasketSaveTaskContext(spu_thread& spu)
u32 allocLsBlocks = static_cast<u32>(taskInfo->context_save_storage_and_alloc_ls_blocks & 0x7F);
v128 ls_pattern = v128::from64r(taskInfo->ls_pattern._u64[0], taskInfo->ls_pattern._u64[1]);
const u32 lsBlocks = utils::popcnt128(ls_pattern._u);
const u32 lsBlocks = rx::popcnt128(ls_pattern._u);
if (lsBlocks > allocLsBlocks)
{

View file

@ -1,7 +1,9 @@
#include "stdafx.h"
#include "Emu/Cell/PPUModule.h"
#include "Emu/IdManager.h"
#include "util/asm.hpp"
#include "rx/align.hpp"
#include "rx/asm.hpp"
#include "sceNp.h"
#include "sceNp2.h"
@ -946,7 +948,7 @@ error_code cellSysutilAvc2Load_shared(SceNpMatching2ContextId /*ctx_id*/, u32 /*
window_count++;
}
total_bitrate = utils::align<u32>(window_count * bitrate, 0x100000) + 0x100000;
total_bitrate = rx::alignUp<u32>(window_count * bitrate, 0x100000) + 0x100000;
}
settings.video_stream_sharing = init_param->video_param.video_stream_sharing;

View file

@ -1,3 +1,5 @@
#include "stdafx.h"
#include "Emu/Cell/PPUModule.h"
#include "cellos/sys_ppu_thread.h"
#include "cellos/sys_process.h"
@ -5,7 +7,7 @@
#include "Emu/IdManager.h"
#include "Emu/perf_meter.hpp"
#include "Emu/savestate_utils.hpp"
#include "stdafx.h"
#include "rx/align.hpp"
#include "sysPrxForUser.h"
#include "util/media_utils.h"
@ -32,7 +34,7 @@ extern "C"
#include "cellPamf.h"
#include "cellVdec.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "util/lockless.h"
#include <cmath>
#include <mutex>
@ -1660,7 +1662,7 @@ error_code cellVdecGetPicItem(ppu_thread& ppu, u32 handle,
const int buffer_size = av_image_get_buffer_size(
vdec->ctx->pix_fmt, vdec->ctx->width, vdec->ctx->height, 1);
ensure(buffer_size >= 0);
info->size = utils::align<u32>(buffer_size, 128);
info->size = rx::alignUp<u32>(buffer_size, 128);
info->auNum = 1;
info->auPts[0].lower = static_cast<u32>(pts);
info->auPts[0].upper = static_cast<u32>(pts >> 32);

View file

@ -20,7 +20,7 @@
#include "cellos/sys_event.h"
#include "cellos/sys_fs.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include <algorithm>
#include <functional>
#include <shared_mutex>
@ -1490,7 +1490,7 @@ error_code sceNpTrophyGetGameProgress(u32 context, u32 handle,
const u32 trp_count = ctxt->tropusr->GetTrophiesCount();
// Round result to nearest (TODO: Check 0 trophies)
*percentage = trp_count ? utils::rounded_div(unlocked * 100, trp_count) : 0;
*percentage = trp_count ? rx::rounded_div(unlocked * 100, trp_count) : 0;
if (trp_count == 0 || trp_count > 128)
{

View file

@ -7,7 +7,7 @@
#include "cellos/sys_mutex.h"
#include "sysPrxForUser.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
LOG_CHANNEL(sysPrxForUser);
@ -151,7 +151,7 @@ error_code sys_lwmutex_lock(ppu_thread& ppu, vm::ptr<sys_lwmutex_t> lwmutex, u64
for (u32 i = 0; i < 10; i++)
{
busy_wait();
rx::busy_wait();
if (lwmutex->vars.owner.load() == lwmutex_free)
{
@ -210,7 +210,7 @@ error_code sys_lwmutex_lock(ppu_thread& ppu, vm::ptr<sys_lwmutex_t> lwmutex, u64
{
for (u32 i = 0; i < 10; i++)
{
busy_wait();
rx::busy_wait();
if (lwmutex->vars.owner.load() == lwmutex_free)
{

View file

@ -1,4 +1,6 @@
#include "stdafx.h"
#include "rx/align.hpp"
#include "key_vault.h"
#include "unedat.h"
#include "sha1.h"
@ -8,7 +10,7 @@
#include "Emu/system_utils.hpp"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include <algorithm>
#include <span>
@ -233,7 +235,7 @@ s64 decrypt_block(const fs::file* in, u8* out, EDAT_HEADER* edat, NPD_HEADER* np
// Locate the real data.
const usz pad_length = length;
length = utils::align<usz>(pad_length, 0x10);
length = rx::alignUp<usz>(pad_length, 0x10);
// Setup buffers for decryption and read the data.
std::vector<u8> enc_data_buf(is_out_buffer_aligned || length == pad_length ? 0 : length);
@ -432,12 +434,12 @@ bool check_data(u8* key, EDAT_HEADER* edat, NPD_HEADER* npd, const fs::file* f,
return false;
}
const usz block_num = utils::aligned_div<u64>(edat->file_size, edat->block_size);
const usz block_num = rx::aligned_div<u64>(edat->file_size, edat->block_size);
constexpr usz metadata_offset = 0x100;
const usz metadata_size = utils::mul_saturate<u64>(metadata_section_size, block_num);
const usz metadata_size = rx::mul_saturate<u64>(metadata_section_size, block_num);
u64 metadata_section_offset = metadata_offset;
if (utils::add_saturate<u64>(utils::add_saturate<u64>(file_offset, metadata_section_offset), metadata_size) > f->size())
if (rx::add_saturate<u64>(rx::add_saturate<u64>(file_offset, metadata_section_offset), metadata_size) > f->size())
{
return false;
}
@ -860,7 +862,7 @@ bool EDATADecrypter::ReadHeader()
//}
file_size = edatHeader.file_size;
total_blocks = ::narrow<u32>(utils::aligned_div(edatHeader.file_size, edatHeader.block_size));
total_blocks = ::narrow<u32>(rx::aligned_div(edatHeader.file_size, edatHeader.block_size));
// Try decrypting the first block instead
u8 data_sample[1];
@ -886,7 +888,7 @@ u64 EDATADecrypter::ReadData(u64 pos, u8* data, u64 size)
// Now we need to offset things to account for the actual 'range' requested
const u64 startOffset = pos % edatHeader.block_size;
const u64 num_blocks = utils::aligned_div(startOffset + size, edatHeader.block_size);
const u64 num_blocks = rx::aligned_div(startOffset + size, edatHeader.block_size);
// Find and decrypt block range covering pos + size
const u32 starting_block = ::narrow<u32>(pos / edatHeader.block_size);

View file

@ -1,7 +1,7 @@
#include "stdafx.h"
#include "aes.h"
#include "unself.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "Emu/System.h"
#include "Emu/system_utils.hpp"
#include "Crypto/unzip.h"
@ -887,7 +887,7 @@ bool SELFDecrypter::LoadHeaders(bool isElf32, SelfAdditionalInfo* out_info)
m_seg_ext_hdr.back().Load(self_f);
}
if (m_ext_hdr.version_hdr_offset == 0 || utils::add_saturate<u64>(m_ext_hdr.version_hdr_offset, sizeof(version_header)) > self_f.size())
if (m_ext_hdr.version_hdr_offset == 0 || rx::add_saturate<u64>(m_ext_hdr.version_hdr_offset, sizeof(version_header)) > self_f.size())
{
return false;
}

View file

@ -1,4 +1,5 @@
#include "stdafx.h"
#include "CPUThread.h"
#include "CPUDisAsm.h"
@ -14,7 +15,7 @@
#include "Emu/RSX/RSXThread.h"
#include "Emu/perf_meter.hpp"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include <thread>
#include <unordered_map>
#include <map>
@ -64,7 +65,6 @@ void fmt_class_string<cpu_flag>::format(std::string& out, u64 arg)
case cpu_flag::dbg_global_pause: return "G-PAUSE";
case cpu_flag::dbg_pause: return "PAUSE";
case cpu_flag::dbg_step: return "STEP";
case cpu_flag::bitset_last: break;
}
return unknown;
@ -124,7 +124,7 @@ void fmt_class_string<cpu_threads_emulation_info_dump_t>::format(std::string& ou
for (u32 i = 0; !rlock.try_lock() && i < 100; i++)
{
busy_wait();
rx::busy_wait();
}
if (rlock)
@ -533,7 +533,7 @@ namespace cpu_counter
if (ok) [[likely]]
{
// Get actual slot number
id = utils::ctz128(~bits);
id = rx::ctz128(~bits);
// Register thread
if (s_cpu_list[id].compare_and_swap_test(nullptr, _this)) [[likely]]
@ -552,7 +552,7 @@ namespace cpu_counter
return;
}
busy_wait(300);
rx::busy_wait(300);
}
s_tls_thread_slot = id;
@ -599,7 +599,7 @@ namespace cpu_counter
{
for (u128 bits = copy; bits; bits &= bits - 1)
{
const u32 index = utils::ctz128(bits);
const u32 index = rx::ctz128(bits);
if (cpu_thread* cpu = s_cpu_list[index].load())
{
@ -1062,7 +1062,7 @@ bool cpu_thread::check_state() noexcept
{
if (i < 20 || ctr & 1)
{
busy_wait(300);
rx::busy_wait(300);
}
else
{
@ -1404,7 +1404,7 @@ bool cpu_thread::suspend_work::push(cpu_thread* _this) noexcept
{
if (cpu != _this)
{
utils::prefetch_write(&cpu->state);
rx::prefetch_write(&cpu->state);
return true;
}
@ -1446,7 +1446,7 @@ bool cpu_thread::suspend_work::push(cpu_thread* _this) noexcept
break;
}
utils::pause();
rx::pause();
}
// Second increment: all threads paused
@ -1480,13 +1480,13 @@ bool cpu_thread::suspend_work::push(cpu_thread* _this) noexcept
{
for (u32 i = 0; i < work->prf_size; i++)
{
utils::prefetch_write(work->prf_list[0]);
rx::prefetch_write(work->prf_list[0]);
}
}
cpu_counter::for_all_cpu(copy2, [&](cpu_thread* cpu)
{
utils::prefetch_write(&cpu->state);
rx::prefetch_write(&cpu->state);
return true;
});
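
The slot bookkeeping above treats a u128 as a 128-slot occupancy mask, so ctz128(~bits) means "index of the first free slot". rx::ctz128 is assumed to behave like this sketch (GCC/Clang unsigned __int128 assumed):

#include <cstdint>

// Index of the lowest set bit of a 128-bit value; 128 if none is set.
inline int ctz128(unsigned __int128 v) {
  const std::uint64_t lo = static_cast<std::uint64_t>(v);
  if (lo != 0)
    return __builtin_ctzll(lo);
  const std::uint64_t hi = static_cast<std::uint64_t>(v >> 64);
  return hi != 0 ? 64 + __builtin_ctzll(hi) : 128;
}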

View file

@ -8719,10 +8719,22 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
uint8x16_t dest = {
// Undo ShiftRows step from AESE and extract X1 and X3
u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1)
u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1))
u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3)
u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3))
u8[0x4],
u8[0x1],
u8[0xE],
u8[0xB], // SubBytes(X1)
u8[0x1],
u8[0xE],
u8[0xB],
u8[0x4], // ROT(SubBytes(X1))
u8[0xC],
u8[0x9],
u8[0x6],
u8[0x3], // SubBytes(X3)
u8[0x9],
u8[0x6],
u8[0x3],
u8[0xC], // ROT(SubBytes(X3))
};
uint32x4_t r = {0, (unsigned)rcon, 0, (unsigned)rcon};
return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);

View file

@ -1,4 +1,5 @@
#include "stdafx.h"
#include "PPUAnalyser.h"
#include "cellos/sys_sync.h"
@ -8,7 +9,8 @@
#include <unordered_set>
#include "util/yaml.hpp"
#include "util/asm.hpp"
#include "rx/align.hpp"
#include "rx/asm.hpp"
LOG_CHANNEL(ppu_validator);
@ -25,7 +27,6 @@ void fmt_class_string<ppu_attr>::format(std::string& out, u64 arg)
case ppu_attr::no_return: return "no_return";
case ppu_attr::no_size: return "no_size";
case ppu_attr::has_mfvscr: return "has_mfvscr";
case ppu_attr::bitset_last: break;
}
return unknown;
@ -2243,7 +2244,7 @@ bool ppu_module<lv2_obj>::analyse(u32 lib_toc, u32 entry, const u32 sec_end, con
}
}
jt_end = utils::align<u32>(static_cast<u32>(std::min<u64>(jt_end - 1, ctr(maxv) - 1) + 1), 4);
jt_end = rx::alignUp<u32>(static_cast<u32>(std::min<u64>(jt_end - 1, ctr(maxv) - 1) + 1), 4);
get_jumptable_end(jumpatble_off, jumpatble_ptr, false);
@ -2882,7 +2883,7 @@ bool ppu_module<lv2_obj>::analyse(u32 lib_toc, u32 entry, const u32 sec_end, con
block.attr = ppu_attr::no_size;
}
per_instruction_bytes += utils::sub_saturate<u32>(lim, func.addr);
per_instruction_bytes += rx::sub_saturate<u32>(lim, func.addr);
addr_next = std::max<u32>(addr_next, lim);
continue;
}
@ -3291,7 +3292,7 @@ bool ppu_module<lv2_obj>::analyse(u32 lib_toc, u32 entry, const u32 sec_end, con
if (per_instruction_bytes)
{
const bool error = per_instruction_bytes >= 200 && per_instruction_bytes / 4 >= utils::aligned_div<u32>(::size32(funcs), 128);
const bool error = per_instruction_bytes >= 200 && per_instruction_bytes / 4 >= rx::aligned_div<u32>(::size32(funcs), 128);
(error ? ppu_log.error : ppu_log.notice)("%d instructions will be compiled on per-instruction basis in total", per_instruction_bytes / 4);
}

View file

@ -1,11 +1,12 @@
#pragma once
#include <functional>
#include <string>
#include <map>
#include <deque>
#include <span>
#include "util/types.hpp"
#include "util/asm.hpp"
#include "rx/align.hpp"
#include "rx/types.hpp"
#include "util/to_endian.hpp"
#include "rx/EnumBitSet.hpp"
@ -218,7 +219,7 @@ struct ppu_module : public Type
const u32 seg_size = seg.size;
const u32 seg_addr = seg.addr;
if (seg_size >= std::max<usz>(size_bytes, 1) && addr <= utils::align<u32>(seg_addr + seg_size, 0x10000) - size_bytes)
if (seg_size >= std::max<usz>(size_bytes, 1) && addr <= rx::alignUp<u32>(seg_addr + seg_size, 0x10000) - size_bytes)
{
return reinterpret_cast<to_be_t<T>*>(static_cast<u8*>(seg.ptr) + (addr - seg_addr));
}

View file

@ -4,7 +4,7 @@
#include "PPUAnalyser.h"
#include "Emu/IdManager.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include <cmath>
@ -222,7 +222,7 @@ std::pair<PPUDisAsm::const_op, u64> PPUDisAsm::try_get_const_op_gpr_value(u32 re
GET_CONST_REG(reg_rs, op.rs);
return {form, utils::rol64(reg_rs, op.sh64) & (~0ull << (op.mbe64 ^ 63))};
return {form, rx::rol64(reg_rs, op.sh64) & (~0ull << (op.mbe64 ^ 63))};
}
case ppu_itype::OR:
{

View file

@ -15,7 +15,7 @@
#include <cmath>
#include <climits>
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "util/v128.hpp"
#include "util/simd.hpp"
#include "util/sysinfo.hpp"
@ -3509,7 +3509,7 @@ auto RLWIMI()
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op)
{
const u64 mask = ppu_rotate_mask(32 + op.mb32, 32 + op.me32);
ppu.gpr[op.ra] = (ppu.gpr[op.ra] & ~mask) | (dup32(utils::rol32(static_cast<u32>(ppu.gpr[op.rs]), op.sh32)) & mask);
ppu.gpr[op.ra] = (ppu.gpr[op.ra] & ~mask) | (dup32(rx::rol32(static_cast<u32>(ppu.gpr[op.rs]), op.sh32)) & mask);
if constexpr (((Flags == has_rc) || ...))
ppu_cr_set<s64>(ppu, 0, ppu.gpr[op.ra], 0);
};
@ -3524,7 +3524,7 @@ auto RLWINM()
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op)
{
ppu.gpr[op.ra] = dup32(utils::rol32(static_cast<u32>(ppu.gpr[op.rs]), op.sh32)) & ppu_rotate_mask(32 + op.mb32, 32 + op.me32);
ppu.gpr[op.ra] = dup32(rx::rol32(static_cast<u32>(ppu.gpr[op.rs]), op.sh32)) & ppu_rotate_mask(32 + op.mb32, 32 + op.me32);
if constexpr (((Flags == has_rc) || ...))
ppu_cr_set<s64>(ppu, 0, ppu.gpr[op.ra], 0);
};
@ -3539,7 +3539,7 @@ auto RLWNM()
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op)
{
ppu.gpr[op.ra] = dup32(utils::rol32(static_cast<u32>(ppu.gpr[op.rs]), ppu.gpr[op.rb] & 0x1f)) & ppu_rotate_mask(32 + op.mb32, 32 + op.me32);
ppu.gpr[op.ra] = dup32(rx::rol32(static_cast<u32>(ppu.gpr[op.rs]), ppu.gpr[op.rb] & 0x1f)) & ppu_rotate_mask(32 + op.mb32, 32 + op.me32);
if constexpr (((Flags == has_rc) || ...))
ppu_cr_set<s64>(ppu, 0, ppu.gpr[op.ra], 0);
};
@ -3634,7 +3634,7 @@ auto RLDICL()
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op)
{
ppu.gpr[op.ra] = utils::rol64(ppu.gpr[op.rs], op.sh64) & (~0ull >> op.mbe64);
ppu.gpr[op.ra] = rx::rol64(ppu.gpr[op.rs], op.sh64) & (~0ull >> op.mbe64);
if constexpr (((Flags == has_rc) || ...))
ppu_cr_set<s64>(ppu, 0, ppu.gpr[op.ra], 0);
};
@ -3649,7 +3649,7 @@ auto RLDICR()
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op)
{
ppu.gpr[op.ra] = utils::rol64(ppu.gpr[op.rs], op.sh64) & (~0ull << (op.mbe64 ^ 63));
ppu.gpr[op.ra] = rx::rol64(ppu.gpr[op.rs], op.sh64) & (~0ull << (op.mbe64 ^ 63));
if constexpr (((Flags == has_rc) || ...))
ppu_cr_set<s64>(ppu, 0, ppu.gpr[op.ra], 0);
};
@ -3664,7 +3664,7 @@ auto RLDIC()
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op)
{
ppu.gpr[op.ra] = utils::rol64(ppu.gpr[op.rs], op.sh64) & ppu_rotate_mask(op.mbe64, op.sh64 ^ 63);
ppu.gpr[op.ra] = rx::rol64(ppu.gpr[op.rs], op.sh64) & ppu_rotate_mask(op.mbe64, op.sh64 ^ 63);
if constexpr (((Flags == has_rc) || ...))
ppu_cr_set<s64>(ppu, 0, ppu.gpr[op.ra], 0);
};
@ -3680,7 +3680,7 @@ auto RLDIMI()
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op)
{
const u64 mask = ppu_rotate_mask(op.mbe64, op.sh64 ^ 63);
ppu.gpr[op.ra] = (ppu.gpr[op.ra] & ~mask) | (utils::rol64(ppu.gpr[op.rs], op.sh64) & mask);
ppu.gpr[op.ra] = (ppu.gpr[op.ra] & ~mask) | (rx::rol64(ppu.gpr[op.rs], op.sh64) & mask);
if constexpr (((Flags == has_rc) || ...))
ppu_cr_set<s64>(ppu, 0, ppu.gpr[op.ra], 0);
};
@ -3695,7 +3695,7 @@ auto RLDCL()
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op)
{
ppu.gpr[op.ra] = utils::rol64(ppu.gpr[op.rs], ppu.gpr[op.rb] & 0x3f) & (~0ull >> op.mbe64);
ppu.gpr[op.ra] = rx::rol64(ppu.gpr[op.rs], ppu.gpr[op.rb] & 0x3f) & (~0ull >> op.mbe64);
if constexpr (((Flags == has_rc) || ...))
ppu_cr_set<s64>(ppu, 0, ppu.gpr[op.ra], 0);
};
@ -3710,7 +3710,7 @@ auto RLDCR()
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op)
{
ppu.gpr[op.ra] = utils::rol64(ppu.gpr[op.rs], ppu.gpr[op.rb] & 0x3f) & (~0ull << (op.mbe64 ^ 63));
ppu.gpr[op.ra] = rx::rol64(ppu.gpr[op.rs], ppu.gpr[op.rb] & 0x3f) & (~0ull << (op.mbe64 ^ 63));
if constexpr (((Flags == has_rc) || ...))
ppu_cr_set<s64>(ppu, 0, ppu.gpr[op.ra], 0);
};
@ -3842,7 +3842,7 @@ auto MULHDU()
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op)
{
ppu.gpr[op.rd] = utils::umulh64(ppu.gpr[op.ra], ppu.gpr[op.rb]);
ppu.gpr[op.rd] = rx::umulh64(ppu.gpr[op.ra], ppu.gpr[op.rb]);
if constexpr (((Flags == has_rc) || ...))
ppu_cr_set<s64>(ppu, 0, ppu.gpr[op.rd], 0);
};
@ -4243,7 +4243,7 @@ auto MULHD()
static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op)
{
ppu.gpr[op.rd] = utils::mulh64(ppu.gpr[op.ra], ppu.gpr[op.rb]);
ppu.gpr[op.rd] = rx::mulh64(ppu.gpr[op.ra], ppu.gpr[op.rb]);
if constexpr (((Flags == has_rc) || ...))
ppu_cr_set<s64>(ppu, 0, ppu.gpr[op.rd], 0);
};
@ -4675,7 +4675,7 @@ auto MULLD()
ppu.gpr[op.rd] = RA * RB;
if (op.oe) [[unlikely]]
{
const s64 high = utils::mulh64(RA, RB);
const s64 high = rx::mulh64(RA, RB);
ppu_ov_set(ppu, high != s64(ppu.gpr[op.rd]) >> 63);
}
if constexpr (((Flags == has_rc) || ...))
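
Note: the interpreter hunks above lean on rotate and high-multiply helpers. Assuming the rx:: versions keep the old utils:: semantics, they reduce to roughly the following (the __int128 path is a GCC/Clang extension; MSVC would use its __umulh/__mulh intrinsics instead):

#include <bit>
#include <cstdint>

// Rotate left with the count masked to 0..63, matching PPC rotate semantics
inline std::uint64_t rol64(std::uint64_t x, std::uint64_t n)
{
	return std::rotl(x, static_cast<int>(n & 63));
}

// Upper 64 bits of the 128-bit product, as used by MULHDU / MULHD above
inline std::uint64_t umulh64(std::uint64_t a, std::uint64_t b)
{
	return static_cast<std::uint64_t>((static_cast<unsigned __int128>(a) * b) >> 64);
}

inline std::int64_t mulh64(std::int64_t a, std::int64_t b)
{
	return static_cast<std::int64_t>((static_cast<__int128>(a) * b) >> 64);
}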

View file

@ -28,7 +28,8 @@
#include <span>
#include <set>
#include <algorithm>
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "rx/align.hpp"
LOG_CHANNEL(ppu_loader);
@ -341,7 +342,7 @@ static void ppu_initialize_modules(ppu_linkage_info* link, utils::serial* ar = n
if (!hle_funcs_addr)
hle_funcs_addr = vm::alloc(::size32(hle_funcs) * 8, vm::main);
else
vm::page_protect(hle_funcs_addr, utils::align(::size32(hle_funcs) * 8, 0x1000), 0, vm::page_writable);
vm::page_protect(hle_funcs_addr, rx::alignUp(::size32(hle_funcs) * 8, 0x1000), 0, vm::page_writable);
// Initialize as PPU executable code
ppu_register_range(hle_funcs_addr, ::size32(hle_funcs) * 8);
@ -359,7 +360,7 @@ static void ppu_initialize_modules(ppu_linkage_info* link, utils::serial* ar = n
}
// Set memory protection to read-only
vm::page_protect(hle_funcs_addr, utils::align(::size32(hle_funcs) * 8, 0x1000), 0, 0, vm::page_writable);
vm::page_protect(hle_funcs_addr, rx::alignUp(::size32(hle_funcs) * 8, 0x1000), 0, 0, vm::page_writable);
// Initialize function names
const bool is_first = g_ppu_function_names.empty();
@ -489,7 +490,7 @@ static void ppu_initialize_modules(ppu_linkage_info* link, utils::serial* ar = n
}
else
{
const u32 next = utils::align(alloc_addr, variable.second.align);
const u32 next = rx::alignUp(alloc_addr, variable.second.align);
const u32 end = next + variable.second.size - 1;
if (!next || (end >> 16 != alloc_addr >> 16))
@ -1191,7 +1192,7 @@ static void ppu_check_patch_spu_images(const ppu_module<lv2_obj>& mod, const ppu
u32 prev_bound = 0;
for (u32 i = find_first_of_multiple(seg_view, prefixes, 0); i < seg.size; i = find_first_of_multiple(seg_view, prefixes, utils::align<u32>(i + 1, 4)))
for (u32 i = find_first_of_multiple(seg_view, prefixes, 0); i < seg.size; i = find_first_of_multiple(seg_view, prefixes, rx::alignUp<u32>(i + 1, 4)))
{
const auto elf_header = ensure(mod.get_ptr<u8>(seg.addr + i));
@ -1201,7 +1202,7 @@ static void ppu_check_patch_spu_images(const ppu_module<lv2_obj>& mod, const ppu
const u32 old_i = i;
u32 guid_start = umax, guid_end = umax;
for (u32 search = i & -128, tries = 10; tries && search >= prev_bound; tries--, search = utils::sub_saturate<u32>(search, 128))
for (u32 search = i & -128, tries = 10; tries && search >= prev_bound; tries--, search = rx::sub_saturate<u32>(search, 128))
{
if (seg_view[search] != 0x42 && seg_view[search] != 0x43)
{
@ -1271,7 +1272,7 @@ static void ppu_check_patch_spu_images(const ppu_module<lv2_obj>& mod, const ppu
if (addr_last >= 0x80 && valid_count >= 2)
{
const u32 begin = i & -128;
u32 end = std::min<u32>(seg.size, utils::align<u32>(i + addr_last + 256, 128));
u32 end = std::min<u32>(seg.size, rx::alignUp<u32>(i + addr_last + 256, 128));
u32 guessed_ls_addr = 0;
@ -1611,7 +1612,7 @@ shared_ptr<lv2_prx> ppu_load_prx(const ppu_prx_object& elf, bool virtual_load, c
if (virtual_load)
{
addr = std::exchange(allocating_address, allocating_address + utils::align<u32>(mem_size, 0x10000));
addr = std::exchange(allocating_address, allocating_address + rx::alignUp<u32>(mem_size, 0x10000));
}
else
{
@ -1625,7 +1626,7 @@ shared_ptr<lv2_prx> ppu_load_prx(const ppu_prx_object& elf, bool virtual_load, c
// Leave additional room for the analyser so it can safely access a bit beyond the limit
// Because with VM the address space is not really a limit (any u32 address is valid there), whereas here it is UB to create a pointer that goes beyond the boundaries
// TODO: Use make_shared_for_overwrite when all compilers support it
const usz alloc_size = utils::align<usz>(mem_size, 0x10000) + 4096;
const usz alloc_size = rx::alignUp<usz>(mem_size, 0x10000) + 4096;
prx->allocations.push_back(std::shared_ptr<u8[]>(new u8[alloc_size]));
_seg.ptr = prx->allocations.back().get();
std::memset(static_cast<u8*>(_seg.ptr) + prog.bin.size(), 0, alloc_size - 4096 - prog.bin.size());
@ -1725,7 +1726,7 @@ shared_ptr<lv2_prx> ppu_load_prx(const ppu_prx_object& elf, bool virtual_load, c
{
const auto& rel = reinterpret_cast<const ppu_prx_relocation_info&>(prog.bin[i]);
if (rel.offset >= utils::align<u64>(::at32(prx->segs, rel.index_addr).size, 0x100))
if (rel.offset >= rx::alignUp<u64>(::at32(prx->segs, rel.index_addr).size, 0x100))
{
fmt::throw_exception("Relocation offset out of segment memory! (offset=0x%x, index_addr=%u, seg_size=0x%x)", rel.offset, rel.index_addr, prx->segs[rel.index_addr].size);
}
@ -2201,7 +2202,7 @@ bool ppu_load_exec(const ppu_exec_object& elf, bool virtual_load, const std::str
// Leave additional room for the analyser so it can safely access a bit beyond the limit
// Because with VM the address space is not really a limit (any u32 address is valid there), whereas here it is UB to create a pointer that goes beyond the boundaries
// TODO: Use make_shared_for_overwrite when all compilers support it
const usz alloc_size = utils::align<usz>(size, 0x10000) + 4096;
const usz alloc_size = rx::alignUp<usz>(size, 0x10000) + 4096;
_main.allocations.push_back(std::shared_ptr<u8[]>(new u8[alloc_size]));
_seg.ptr = _main.allocations.back().get();
std::memset(static_cast<u8*>(_seg.ptr) + prog.bin.size(), 0, alloc_size - 4096 - prog.bin.size());
@ -2247,7 +2248,7 @@ bool ppu_load_exec(const ppu_exec_object& elf, bool virtual_load, const std::str
else
{
// For backwards compatibility: already loaded memory will always be writable
const u32 size0 = utils::align(size + addr % 0x10000, 0x10000);
const u32 size0 = rx::alignUp(size + addr % 0x10000, 0x10000);
const u32 addr0 = addr & -0x10000;
vm::page_protect(addr0, size0, 0, vm::page_writable | vm::page_readable, vm::page_executable);
}
@ -2721,7 +2722,7 @@ bool ppu_load_exec(const ppu_exec_object& elf, bool virtual_load, const std::str
default:
{
// According to elad335, the min value seems to be 64KB instead of the expected 4KB (SYS_PROCESS_PARAM_STACK_SIZE_MIN)
primary_stacksize = utils::align<u32>(std::clamp<u32>(sz, 0x10000, SYS_PROCESS_PARAM_STACK_SIZE_MAX), 4096);
primary_stacksize = rx::alignUp<u32>(std::clamp<u32>(sz, 0x10000, SYS_PROCESS_PARAM_STACK_SIZE_MAX), 4096);
break;
}
}
@ -2738,29 +2739,29 @@ bool ppu_load_exec(const ppu_exec_object& elf, bool virtual_load, const std::str
if (!Emu.data.empty())
{
std::memcpy(vm::base(ppu->stack_addr + ppu->stack_size - ::size32(Emu.data)), Emu.data.data(), Emu.data.size());
ppu->gpr[1] -= utils::align<u32>(::size32(Emu.data), 0x10);
ppu->gpr[1] -= rx::alignUp<u32>(::size32(Emu.data), 0x10);
}
// Initialize process arguments
// Calculate storage requirements on the stack
const u32 pointers_storage_size = u32{sizeof(u64)} * utils::align<u32>(::size32(Emu.envp) + ::size32(Emu.argv) + 2, 2);
const u32 pointers_storage_size = u32{sizeof(u64)} * rx::alignUp<u32>(::size32(Emu.envp) + ::size32(Emu.argv) + 2, 2);
u32 stack_alloc_size = pointers_storage_size;
for (const auto& arg : Emu.argv)
{
stack_alloc_size += utils::align<u32>(::size32(arg) + 1, 0x10);
stack_alloc_size += rx::alignUp<u32>(::size32(arg) + 1, 0x10);
}
for (const auto& arg : Emu.envp)
{
stack_alloc_size += utils::align<u32>(::size32(arg) + 1, 0x10);
stack_alloc_size += rx::alignUp<u32>(::size32(arg) + 1, 0x10);
}
ensure(ppu->stack_size > stack_alloc_size);
vm::ptr<u64> args = vm::cast(static_cast<u32>(ppu->stack_addr + ppu->stack_size - stack_alloc_size - utils::align<u32>(::size32(Emu.data), 0x10)));
vm::ptr<u64> args = vm::cast(static_cast<u32>(ppu->stack_addr + ppu->stack_size - stack_alloc_size - rx::alignUp<u32>(::size32(Emu.data), 0x10)));
vm::ptr<u8> args_data = vm::cast(args.addr() + pointers_storage_size);
const vm::ptr<u64> argv = args;
@ -2772,7 +2773,7 @@ bool ppu_load_exec(const ppu_exec_object& elf, bool virtual_load, const std::str
std::memcpy(args_data.get_ptr(), arg.data(), arg_size);
*args++ = args_data.addr();
args_data = vm::cast(args_data.addr() + utils::align<u32>(arg_size, 0x10));
args_data = vm::cast(args_data.addr() + rx::alignUp<u32>(arg_size, 0x10));
}
*args++ = 0;
@ -2787,7 +2788,7 @@ bool ppu_load_exec(const ppu_exec_object& elf, bool virtual_load, const std::str
std::memcpy(args_data.get_ptr(), arg.data(), arg_size);
*args++ = args_data.addr();
args_data = vm::cast(args_data.addr() + utils::align<u32>(arg_size, 0x10));
args_data = vm::cast(args_data.addr() + rx::alignUp<u32>(arg_size, 0x10));
}
*args++ = 0;
@ -2855,7 +2856,7 @@ bool ppu_load_exec(const ppu_exec_object& elf, bool virtual_load, const std::str
if (prog.p_type == 0x1u /* LOAD */ && prog.p_memsz && (prog.p_flags & 0x022000002) == 0u /* W */)
{
// Set memory protection to read-only when necessary (only if PPU-W, SPU-W, RSX-W are all disabled)
ensure(vm::page_protect(addr, utils::align(size, 0x1000), 0, 0, vm::page_writable));
ensure(vm::page_protect(addr, rx::alignUp(size, 0x1000), 0, 0, vm::page_writable));
}
}
@ -2934,7 +2935,7 @@ std::pair<shared_ptr<lv2_overlay>, CellError> ppu_load_overlay(const ppu_exec_ob
// Leave additional room for the analyser so it can safely access a bit beyond the limit
// Because with VM the address space is not really a limit (any u32 address is valid there), whereas here it is UB to create a pointer that goes beyond the boundaries
// TODO: Use make_shared_for_overwrite when all compilers support it
const usz alloc_size = utils::align<usz>(size, 0x10000) + 4096;
const usz alloc_size = rx::alignUp<usz>(size, 0x10000) + 4096;
ovlm->allocations.push_back(std::shared_ptr<u8[]>(new u8[alloc_size]));
_seg.ptr = ovlm->allocations.back().get();
std::memset(static_cast<u8*>(_seg.ptr) + prog.bin.size(), 0, alloc_size - 4096 - prog.bin.size());
@ -3230,7 +3231,7 @@ bool ppu_load_rel_exec(const ppu_rel_object& elf)
{
if (s.sh_type != sec_type::sht_progbits)
{
memsize = utils::align<u32>(memsize + vm::cast(s.sh_size), 128);
memsize = rx::alignUp<u32>(memsize + vm::cast(s.sh_size), 128);
}
}
@ -3278,7 +3279,7 @@ bool ppu_load_rel_exec(const ppu_rel_object& elf)
relm.secs.emplace_back(_sec);
std::memcpy(vm::base(addr), s.get_bin().data(), size);
addr = utils::align<u32>(addr + size, 128);
addr = rx::alignUp<u32>(addr + size, 128);
}
}

View file

@ -62,7 +62,8 @@
#include <optional>
#include <charconv>
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "rx/align.hpp"
#include "util/vm.hpp"
#include "util/v128.hpp"
#include "util/simd.hpp"
@ -217,7 +218,7 @@ public:
user acquire(u64 amount)
{
amount = utils::aligned_div<u64>(amount, k_block_size);
amount = rx::aligned_div<u64>(amount, k_block_size);
u32 allocated = 0;
while (!m_free.fetch_op([&, this](u32& value)
@ -225,7 +226,7 @@ public:
if (value >= amount || value == m_total)
{
// Allow at least allocation, make 0 the "memory unavailable" sign value for atomic waiting efficiency
const u32 new_val = static_cast<u32>(utils::sub_saturate<u64>(value, amount));
const u32 new_val = static_cast<u32>(rx::sub_saturate<u64>(value, amount));
allocated = value - new_val;
value = new_val;
return true;
@ -869,7 +870,7 @@ extern void ppu_register_range(u32 addr, u32 size)
return;
}
size = utils::align(size + addr % 0x10000, 0x10000);
size = rx::alignUp(size + addr % 0x10000, 0x10000);
addr &= -0x10000;
// Register executable range at
@ -1816,7 +1817,7 @@ std::vector<std::pair<u32, u32>> ppu_thread::dump_callstack_list() const
if (pos_dist >= inst_pos.size())
{
const u32 inst_bound = utils::align<u32>(pos, 256);
const u32 inst_bound = rx::alignUp<u32>(pos, 256);
const usz old_size = inst_pos.size();
const usz new_size = pos_dist + (inst_bound - pos) / 4 + 1;
@ -1903,7 +1904,7 @@ std::vector<std::pair<u32, u32>> ppu_thread::dump_callstack_list() const
for (u32 back = 1; back < 20; back++)
{
be_t<u32>& opcode = get_inst(utils::sub_saturate<u32>(_cia, back * 4));
be_t<u32>& opcode = get_inst(rx::sub_saturate<u32>(_cia, back * 4));
if (!opcode)
{
@ -3588,11 +3589,11 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
return false;
}
utils::prefetch_read(ppu.rdata);
utils::prefetch_read(ppu.rdata + 64);
rx::prefetch_read(ppu.rdata);
rx::prefetch_read(ppu.rdata + 64);
ppu.last_faddr = addr;
ppu.last_ftime = res.load() & -128;
ppu.last_ftsc = utils::get_tsc();
ppu.last_ftsc = rx::get_tsc();
return false;
}
default:
@ -3699,7 +3700,7 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
ppu.last_faddr = addr;
ppu.last_ftime = old_rtime & -128;
ppu.last_ftsc = utils::get_tsc();
ppu.last_ftsc = rx::get_tsc();
std::memcpy(&ppu.rdata[addr & 0x78], &old_data, 8);
}
@ -3941,7 +3942,7 @@ namespace
fs::stat_t get_stat() override
{
fs::stat_t stat = m_file.get_stat();
stat.size = std::min<u64>(utils::sub_saturate<u64>(stat.size, m_off), m_max_size);
stat.size = std::min<u64>(rx::sub_saturate<u64>(stat.size, m_off), m_max_size);
stat.is_writable = false;
return stat;
}
@ -3960,7 +3961,7 @@ namespace
u64 read_at(u64 offset, void* buffer, u64 size) override
{
return m_file.read_at(offset + m_off, buffer, std::min<u64>(size, utils::sub_saturate<u64>(m_max_size, offset)));
return m_file.read_at(offset + m_off, buffer, std::min<u64>(size, rx::sub_saturate<u64>(m_max_size, offset)));
}
u64 write(const void*, u64) override
@ -3988,7 +3989,7 @@ namespace
u64 size() override
{
return std::min<u64>(utils::sub_saturate<u64>(m_file.size(), m_off), m_max_size);
return std::min<u64>(rx::sub_saturate<u64>(m_file.size(), m_off), m_max_size);
}
};
} // namespace
@ -5624,7 +5625,7 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
}
// Initialize compiler instance
while (jits.size() < utils::aligned_div<u64>(module_counter, c_moudles_per_jit) && is_being_used_in_emulation)
while (jits.size() < rx::aligned_div<u64>(module_counter, c_moudles_per_jit) && is_being_used_in_emulation)
{
jits.emplace_back(std::make_shared<jit_compiler>(s_link_table, g_cfg.core.llvm_cpu, 0, symbols_cement));
@ -5652,7 +5653,7 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
const bool divide_by_twenty = !workload.empty();
const usz increment_link_count_at = (divide_by_twenty ? 20 : 1);
g_progr_ptotal += static_cast<u32>(utils::aligned_div<u64>(link_workload.size(), increment_link_count_at));
g_progr_ptotal += static_cast<u32>(rx::aligned_div<u64>(link_workload.size(), increment_link_count_at));
usz mod_index = umax;
@ -5785,7 +5786,7 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_size)
{
concurent_memory_limit memory_limit(utils::aligned_div<u64>(utils::get_total_memory(), 2));
concurent_memory_limit memory_limit(rx::aligned_div<u64>(utils::get_total_memory(), 2));
return ppu_initialize(info, check_only, file_size, memory_limit);
}
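
Note: two more helpers recur throughout this file. Plausible one-liners matching the old utils:: behaviour for the unsigned uses shown here, assumed from the call sites rather than taken from the real headers:

// Ceiling division, e.g. aligned_div(module_counter, c_moudles_per_jit) above
template <typename T>
constexpr T aligned_div(T value, T divisor)
{
	return static_cast<T>((value + divisor - 1) / divisor);
}

// Subtraction clamped at zero instead of wrapping around
template <typename T>
constexpr T sub_saturate(T lhs, T rhs)
{
	return lhs >= rhs ? static_cast<T>(lhs - rhs) : T{0};
}

static_assert(aligned_div(10, 4) == 3);
static_assert(sub_saturate(2u, 5u) == 0u);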

View file

@ -847,7 +847,7 @@ Value* PPUTranslator::ReadMemory(Value* addr, Type* type, bool is_be, u32 align)
m_may_be_mmio = false;
if (auto ptr = m_info.get_ptr<instructions_to_test>(std::max<u32>(m_info.segs[0].addr, (m_reloc ? m_reloc->addr : 0) + utils::sub_saturate<u32>(::narrow<u32>(m_addr), sizeof(instructions_to_test) / 2))))
if (auto ptr = m_info.get_ptr<instructions_to_test>(std::max<u32>(m_info.segs[0].addr, (m_reloc ? m_reloc->addr : 0) + rx::sub_saturate<u32>(::narrow<u32>(m_addr), sizeof(instructions_to_test) / 2))))
{
if (ppu_test_address_may_be_mmio(std::span(ptr->insts)))
{
@ -920,7 +920,7 @@ void PPUTranslator::WriteMemory(Value* addr, Value* value, bool is_be, u32 align
be_t<u32> insts[128];
};
if (auto ptr = m_info.get_ptr<instructions_to_test>(std::max<u32>(m_info.segs[0].addr, (m_reloc ? m_reloc->addr : 0) + utils::sub_saturate<u32>(::narrow<u32>(m_addr), sizeof(instructions_to_test) / 2))))
if (auto ptr = m_info.get_ptr<instructions_to_test>(std::max<u32>(m_info.segs[0].addr, (m_reloc ? m_reloc->addr : 0) + rx::sub_saturate<u32>(::narrow<u32>(m_addr), sizeof(instructions_to_test) / 2))))
{
if (ppu_test_address_may_be_mmio(std::span(ptr->insts)))
{

View file

@ -1,7 +1,8 @@
#include "stdafx.h"
#include "Emu/IdManager.h"
#include "Loader/ELF.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "rx/align.hpp"
#include "SPUThread.h"
@ -450,7 +451,7 @@ void spu_load_rel_exec(const spu_rel_object& elf)
{
if (shdr.sh_type == sec_type::sht_progbits && shdr.sh_flags().all_of(sh_flag::shf_alloc))
{
total_memsize = utils::align<u32>(total_memsize + shdr.sh_size, 4);
total_memsize = rx::alignUp<u32>(total_memsize + shdr.sh_size, 4);
}
}
@ -462,7 +463,7 @@ void spu_load_rel_exec(const spu_rel_object& elf)
if (shdr.sh_type == sec_type::sht_progbits && shdr.sh_flags().all_of(sh_flag::shf_alloc))
{
std::memcpy(spu->_ptr<void>(offs), shdr.get_bin().data(), shdr.sh_size);
offs = utils::align<u32>(offs + shdr.sh_size, 4);
offs = rx::alignUp<u32>(offs + shdr.sh_size, 4);
}
}

View file

@ -9,7 +9,8 @@
#include "SPUInterpreter.h"
#include "Crypto/sha1.h"
#include "util/asm.hpp"
#include "rx/align.hpp"
#include "rx/asm.hpp"
#include "util/v128.hpp"
#include "util/sysinfo.hpp"
@ -282,7 +283,7 @@ spu_function_t spu_recompiler::compile(spu_program&& _func)
words_align = 64;
const u32 starta = start & -64;
const u32 enda = utils::align(end, 64);
const u32 enda = rx::alignUp(end, 64);
const u32 sizea = (enda - starta) / 64;
ensure(sizea);
@ -363,7 +364,7 @@ spu_function_t spu_recompiler::compile(spu_program&& _func)
words_align = 32;
const u32 starta = start & -32;
const u32 enda = utils::align(end, 32);
const u32 enda = rx::alignUp(end, 32);
const u32 sizea = (enda - starta) / 32;
ensure(sizea);
@ -486,7 +487,7 @@ spu_function_t spu_recompiler::compile(spu_program&& _func)
words_align = 32;
const u32 starta = start & -32;
const u32 enda = utils::align(end, 32);
const u32 enda = rx::alignUp(end, 32);
const u32 sizea = (enda - starta) / 32;
ensure(sizea);
@ -3211,7 +3212,7 @@ void spu_recompiler::ROTQBYI(spu_opcode_t op)
}
else if (s == 4 || s == 8 || s == 12)
{
c->pshufd(va, va, utils::rol8(0xE4, s / 2));
c->pshufd(va, va, rx::rol8(0xE4, s / 2));
}
else if (utils::has_ssse3())
{
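
Aside: the rx::rol8(0xE4, s / 2) in ROTQBYI is worth unpacking. 0xE4 (0b11'10'01'00) is the identity pshufd immediate (dwords 3,2,1,0), and rotating it left by s/2 bits yields the shuffle immediates for the 4-, 8- and 12-byte quadword rotates. A quick check of that reading:

#include <bit>
#include <cstdint>

// Rotating the identity shuffle immediate rotates the dword selector pattern
static_assert(std::rotl(std::uint8_t{0xE4}, 2) == 0x93); // s == 4:  dwords (2,1,0,3)
static_assert(std::rotl(std::uint8_t{0xE4}, 4) == 0x4E); // s == 8:  dwords (1,0,3,2)
static_assert(std::rotl(std::uint8_t{0xE4}, 6) == 0x39); // s == 12: dwords (0,3,2,1)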

View file

@ -25,6 +25,7 @@
#include <optional>
#include <unordered_set>
#include "rx/align.hpp"
#include "util/v128.hpp"
#include "util/simd.hpp"
#include "util/sysinfo.hpp"
@ -658,7 +659,7 @@ std::deque<spu_program> spu_cache::get()
const u32 size = block_info.size;
const u32 addr = block_info.addr;
if (utils::add_saturate<u32>(addr, size * 4) > SPU_LS_SIZE)
if (rx::add_saturate<u32>(addr, size * 4) > SPU_LS_SIZE)
{
break;
}
@ -1253,7 +1254,7 @@ void spu_cache::initialize(bool build_existing_cache)
fmt::append(dump, "\n\t%49s", "");
for (u32 i = 0; i < std::min<usz>(f->data.size(), std::max<usz>(64, utils::aligned_div<u32>(depth_m, 4))); i++)
for (u32 i = 0; i < std::min<usz>(f->data.size(), std::max<usz>(64, rx::aligned_div<u32>(depth_m, 4))); i++)
{
fmt::append(dump, "%-10s", g_spu_iname.decode(std::bit_cast<be_t<u32>>(f->data[i])));
}
@ -2308,12 +2309,12 @@ std::vector<u32> spu_thread::discover_functions(u32 base_addr, std::span<const u
// TODO: Does not detect jumptables or fixed-addr indirect calls
const v128 brasl_mask = is_known_addr ? v128::from32p(0x62u << 23) : v128::from32p(umax);
for (u32 i = utils::align<u32>(base_addr, 0x10); i < std::min<u32>(base_addr + ::size32(ls), 0x3FFF0); i += 0x10)
for (u32 i = rx::alignUp<u32>(base_addr, 0x10); i < std::min<u32>(base_addr + ::size32(ls), 0x3FFF0); i += 0x10)
{
// Search for BRSL LR and BRASL LR or BR
// TODO: BISL
const v128 inst = read_from_ptr<be_t<v128>>(ls.data(), i - base_addr);
const v128 cleared_i16 = gv_and32(inst, v128::from32p(utils::rol32(~0xffff, 7)));
const v128 cleared_i16 = gv_and32(inst, v128::from32p(rx::rol32(~0xffff, 7)));
const v128 eq_brsl = gv_eq32(cleared_i16, v128::from32p(0x66u << 23));
const v128 eq_brasl = gv_eq32(cleared_i16, brasl_mask);
const v128 eq_br = gv_eq32(cleared_i16, v128::from32p(0x64u << 23));
@ -5376,7 +5377,7 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, s
const usz block_tail = duplicate_positions[it_begin - it_tail];
// Check if the distance from the end is precisely two times that of the tail block
if (reg_state_it.size() - block_start != utils::rol64(reg_state_it.size() - block_tail, 1))
if (reg_state_it.size() - block_start != rx::rol64(reg_state_it.size() - block_tail, 1))
{
continue;
}
@ -7143,7 +7144,7 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, s
v_reg2 = 3,
};
for (auto it = infos.lower_bound(utils::sub_saturate<u32>(pattern.put_pc, 512)); it != infos.end() && it->first < pattern.put_pc + 512; it++)
for (auto it = infos.lower_bound(rx::sub_saturate<u32>(pattern.put_pc, 512)); it != infos.end() && it->first < pattern.put_pc + 512; it++)
{
for (auto& state : it->second->end_reg_state)
{
@ -7622,7 +7623,7 @@ struct spu_llvm
// Notify all before queue runs out if there is considerable excess
// Optimized so that if there are many workers, it acts soon
// If there are only a few workers, it postpones notifications until there is some more workload
if (notify_compile_count && std::min<u32>(7, utils::aligned_div<u32>(worker_count * 2, 3) + 2) <= compile_pending)
if (notify_compile_count && std::min<u32>(7, rx::aligned_div<u32>(worker_count * 2, 3) + 2) <= compile_pending)
{
for (usz i = 0; i < worker_count; i++)
{

View file

@ -6,7 +6,7 @@
#include "Emu/Cell/SPUAnalyser.h"
#include "Emu/system_config.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "util/v128.hpp"
#include "util/simd.hpp"
#include "util/sysinfo.hpp"
@ -289,7 +289,7 @@ bool ROT(spu_thread& spu, spu_opcode_t op)
for (u32 i = 0; i < 4; i++)
{
spu.gpr[op.rt]._u32[i] = utils::rol32(a._u32[i], b._u32[i]);
spu.gpr[op.rt]._u32[i] = rx::rol32(a._u32[i], b._u32[i]);
}
return true;
}
@ -344,7 +344,7 @@ bool ROTH(spu_thread& spu, spu_opcode_t op)
for (u32 i = 0; i < 8; i++)
{
spu.gpr[op.rt]._u16[i] = utils::rol16(a._u16[i], b._u16[i]);
spu.gpr[op.rt]._u16[i] = rx::rol16(a._u16[i], b._u16[i]);
}
return true;
}

View file

@ -1215,7 +1215,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
rsx::reservation_lock rsx_lock(raddr, 128);
// Touch memory
utils::trigger_write_page_fault(vm::base(dest ^ (4096 / 2)));
rx::trigger_write_page_fault(vm::base(dest ^ (4096 / 2)));
auto [old_res, ok] = res.fetch_op([&](u64& rval)
{

View file

@ -1,3 +1,4 @@
#include "rx/align.hpp"
#include "stdafx.h"
#include "util/JIT.h"
#include "util/date_time.h"
@ -31,7 +32,7 @@
#include <shared_mutex>
#include <span>
#include "util/vm.hpp"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "util/v128.hpp"
#include "util/simd.hpp"
#include "util/sysinfo.hpp"
@ -448,7 +449,7 @@ mwaitx_func static void __mwaitx(u32 cycles, u32 cstate, const void* cline, cons
// First bit indicates cstate, 0x0 for C.02 state (lower power) or 0x1 for C.01 state (higher power)
waitpkg_func static void __tpause(u32 cycles, u32 cstate)
{
const u64 tsc = utils::get_tsc() + cycles;
const u64 tsc = rx::get_tsc() + cycles;
_tpause(cstate, tsc);
}
#endif
@ -522,7 +523,7 @@ namespace spu
{
// Slight pause if function is overburdened
const auto count = atomic_instruction_table[pc_offset].observe() * 100ull;
busy_wait(count);
rx::busy_wait(count);
}
ensure(!spu.check_state());
@ -1774,7 +1775,7 @@ void spu_thread::cpu_return()
// Wait for all threads to have error codes if exited by sys_spu_thread_exit
for (u32 status; !thread->exit_status.try_read(status) || status != thread->last_exit_status;)
{
utils::pause();
rx::pause();
}
}
}
@ -2307,60 +2308,6 @@ void spu_thread::push_snr(u32 number, u32 value)
const u32 event_bit = SPU_EVENT_S1 >> (number & 1);
const bool bitor_bit = !!((snr_config >> number) & 1);
// Redundant, g_use_rtm is checked inside tx_start now.
if (g_use_rtm && false)
{
bool channel_notify = false;
bool thread_notify = false;
const bool ok = utils::tx_start([&]
{
channel_notify = (channel->data.raw() == spu_channel::bit_wait);
thread_notify = (channel->data.raw() & spu_channel::bit_count) == 0;
if (channel_notify)
{
ensure(channel->jostling_value.raw() == spu_channel::bit_wait);
channel->jostling_value.raw() = value;
channel->data.raw() = 0;
}
else if (bitor_bit)
{
channel->data.raw() &= ~spu_channel::bit_wait;
channel->data.raw() |= spu_channel::bit_count | value;
}
else
{
channel->data.raw() = spu_channel::bit_count | value;
}
if (thread_notify)
{
ch_events.raw().events |= event_bit;
if (ch_events.raw().mask & event_bit)
{
ch_events.raw().count = 1;
thread_notify = ch_events.raw().waiting != 0;
}
else
{
thread_notify = false;
}
}
});
if (ok)
{
if (channel_notify)
channel->data.notify_one();
if (thread_notify)
this->notify();
return;
}
}
// Lock event channel in case it needs event notification
ch_events.atomic_op([](ch_events_t& ev)
{
@ -2527,7 +2474,7 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8*
range_lock = _this->range_lock;
}
utils::prefetch_write(range_lock);
rx::prefetch_write(range_lock);
for (u32 size = args.size, size0; is_get; size -= size0, dst += size0, src += size0, eal += size0)
{
@ -2541,7 +2488,7 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8*
}
else if (++i < 25) [[likely]]
{
busy_wait(300);
rx::busy_wait(300);
}
else
{
@ -2706,7 +2653,7 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8*
if (true || ++i < 10)
{
busy_wait(500);
rx::busy_wait(500);
}
else
{
@ -2947,7 +2894,7 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8*
}
u32 range_addr = eal & -128;
u32 range_end = utils::align(eal + size, 128);
u32 range_end = rx::alignUp(eal + size, 128);
// Handle the case of crossing 64K page borders (TODO: maybe split in 4K fragments?)
if (range_addr >> 16 != (range_end - 1) >> 16)
@ -3131,7 +3078,7 @@ plain_access:
bool spu_thread::do_dma_check(const spu_mfc_cmd& args)
{
const u32 mask = utils::rol32(1, args.tag);
const u32 mask = rx::rol32(1, args.tag);
if (mfc_barrier & mask || (args.cmd & (MFC_BARRIER_MASK | MFC_FENCE_MASK) && mfc_fence & mask)) [[unlikely]]
{
@ -3147,13 +3094,13 @@ bool spu_thread::do_dma_check(const spu_mfc_cmd& args)
if ((mfc_queue[i].cmd & ~0xc) == MFC_BARRIER_CMD)
{
mfc_barrier |= -1;
mfc_fence |= utils::rol32(1, mfc_queue[i].tag);
mfc_fence |= rx::rol32(1, mfc_queue[i].tag);
continue;
}
if (true)
{
const u32 _mask = utils::rol32(1u, mfc_queue[i].tag);
const u32 _mask = rx::rol32(1u, mfc_queue[i].tag);
// A command with barrier hard blocks that tag until it's been dealt with
if (mfc_queue[i].cmd & MFC_BARRIER_MASK)
@ -3258,7 +3205,7 @@ bool spu_thread::do_list_transfer(spu_mfc_cmd& args)
u8* dst = this->ls + arg_lsa;
// Assume success, prepare the next elements
arg_lsa += fetch_size * utils::align<u32>(s_size, 16);
arg_lsa += fetch_size * rx::alignUp<u32>(s_size, 16);
item_ptr += fetch_size;
arg_size -= fetch_size * 8;
@ -3266,11 +3213,11 @@ bool spu_thread::do_list_transfer(spu_mfc_cmd& args)
constexpr usz _128 = 128;
// This whole function relies on many constraints to be met (crashes real MFC); we can have a minor optimization assuming EA alignment to be +16 with +16 byte transfers
#define MOV_T(type, index, _ea) \
{ \
const usz ea = _ea; \
*reinterpret_cast<type*>(dst + index * utils::align<u32>(sizeof(type), 16) + ea % (sizeof(type) < 16 ? 16 : 1)) = *reinterpret_cast<const type*>(src + ea); \
} \
#define MOV_T(type, index, _ea) \
{ \
const usz ea = _ea; \
*reinterpret_cast<type*>(dst + index * rx::alignUp<u32>(sizeof(type), 16) + ea % (sizeof(type) < 16 ? 16 : 1)) = *reinterpret_cast<const type*>(src + ea); \
} \
void()
#define MOV_128(index, ea) mov_rdata(*reinterpret_cast<decltype(rdata)*>(dst + index * _128), *reinterpret_cast<const decltype(rdata)*>(src + (ea)))
@ -3522,7 +3469,7 @@ bool spu_thread::do_list_transfer(spu_mfc_cmd& args)
#undef MOV_T
#undef MOV_128
// Optimization miss, revert changes
arg_lsa -= fetch_size * utils::align<u32>(s_size, 16);
arg_lsa -= fetch_size * rx::alignUp<u32>(s_size, 16);
item_ptr -= fetch_size;
arg_size += fetch_size * 8;
}
@ -3604,7 +3551,7 @@ bool spu_thread::do_list_transfer(spu_mfc_cmd& args)
}
}
arg_lsa += utils::align<u32>(size, 16);
arg_lsa += rx::alignUp<u32>(size, 16);
}
// Avoid inlining huge transfers because it intentionally drops range lock unlock
else if (optimization_compatible == MFC_PUT_CMD && ((addr >> 28 == rsx::constants::local_mem_base >> 28) || (addr < RAW_SPU_BASE_ADDR && size - 1 <= 0x400 - 1 && (addr % 0x10000 + (size - 1)) < 0x10000)))
@ -3615,7 +3562,7 @@ bool spu_thread::do_list_transfer(spu_mfc_cmd& args)
if (!g_use_rtm)
{
vm::range_lock(range_lock, addr & -128, utils::align<u32>(addr + size, 128) - (addr & -128));
vm::range_lock(range_lock, addr & -128, rx::alignUp<u32>(addr + size, 128) - (addr & -128));
}
}
else
@ -3690,7 +3637,7 @@ bool spu_thread::do_list_transfer(spu_mfc_cmd& args)
}
}
arg_lsa += utils::align<u32>(size, 16);
arg_lsa += rx::alignUp<u32>(size, 16);
}
else if (size)
{
@ -3703,7 +3650,7 @@ bool spu_thread::do_list_transfer(spu_mfc_cmd& args)
transfer.lsa = arg_lsa | (addr & 0xf);
transfer.size = size;
arg_lsa += utils::align<u32>(size, 16);
arg_lsa += rx::alignUp<u32>(size, 16);
do_dma_transfer(this, transfer, ls);
}
@ -3721,14 +3668,14 @@ bool spu_thread::do_list_transfer(spu_mfc_cmd& args)
{
range_lock->release(0);
ch_stall_mask |= utils::rol32(1, args.tag);
ch_stall_mask |= rx::rol32(1, args.tag);
if (!ch_stall_stat.get_count())
{
set_events(SPU_EVENT_SN);
}
ch_stall_stat.set_value(utils::rol32(1, args.tag) | ch_stall_stat.get_value());
ch_stall_stat.set_value(rx::rol32(1, args.tag) | ch_stall_stat.get_value());
args.tag |= 0x80; // Set stalled status
args.eal = ::narrow<u32>(reinterpret_cast<const u8*>(item_ptr) - this->ls);
@ -3853,7 +3800,7 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
return false;
});
const u64 count2 = utils::get_tsc() - perf2.get();
const u64 count2 = rx::get_tsc() - perf2.get();
if (count2 > 20000 && g_cfg.core.perf_report) [[unlikely]]
{
@ -3881,11 +3828,11 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
return false;
}
utils::prefetch_read(rdata);
utils::prefetch_read(rdata + 64);
rx::prefetch_read(rdata);
rx::prefetch_read(rdata + 64);
last_faddr = addr;
last_ftime = res.load() & -128;
last_ftsc = utils::get_tsc();
last_ftsc = rx::get_tsc();
return false;
}
default:
@ -3973,7 +3920,7 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
if (!vm::check_addr(addr, vm::page_writable))
{
utils::trigger_write_page_fault(vm::base(addr));
rx::trigger_write_page_fault(vm::base(addr));
}
raddr = 0;
@ -4036,7 +3983,7 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write)
}
else if (k < 15)
{
busy_wait(500);
rx::busy_wait(500);
}
else
{
@ -4053,7 +4000,7 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write)
}
else if (j < 15)
{
busy_wait(500);
rx::busy_wait(500);
}
else
{
@ -4075,7 +4022,7 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write)
else if (!g_use_rtm)
{
// Provoke page fault
utils::trigger_write_page_fault(vm::base(addr));
rx::trigger_write_page_fault(vm::base(addr));
// Hard lock
auto spu = cpu ? cpu->try_get<spu_thread>() : nullptr;
@ -4102,7 +4049,7 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write)
});
vm::reservation_acquire(addr) += 32;
result = utils::get_tsc() - perf0.get();
result = rx::get_tsc() - perf0.get();
}
if (result > 20000 && g_cfg.core.perf_report) [[unlikely]]
@ -4150,7 +4097,7 @@ bool spu_thread::do_mfc(bool can_escape, bool must_finish)
auto process_command = [&](spu_mfc_cmd& args)
{
// Select tag bit in the tag mask or the stall mask
const u32 mask = utils::rol32(1, args.tag);
const u32 mask = rx::rol32(1, args.tag);
if ((args.cmd & ~0xc) == MFC_BARRIER_CMD)
{
@ -4240,7 +4187,7 @@ bool spu_thread::do_mfc(bool can_escape, bool must_finish)
{
// Get commands' execution mask
// Mask bits are always set when mfc_transfers_shuffling is 0
return static_cast<u16>((0 - (1u << std::min<u32>(g_cfg.core.mfc_transfers_shuffling, size))) | utils::get_tsc());
return static_cast<u16>((0 - (1u << std::min<u32>(g_cfg.core.mfc_transfers_shuffling, size))) | rx::get_tsc());
};
// Process enqueued commands
@ -4733,7 +4680,7 @@ bool spu_thread::process_mfc_cmd()
else
#endif
{
busy_wait(300);
rx::busy_wait(300);
}
if (getllar_spin_count == 3)
@ -4875,7 +4822,7 @@ bool spu_thread::process_mfc_cmd()
if (i < 24) [[likely]]
{
i++;
busy_wait(300);
rx::busy_wait(300);
}
else
{
@ -5159,7 +5106,7 @@ bool spu_thread::process_mfc_cmd()
std::memcpy(dump.data, _ptr<u8>(ch_mfc_cmd.lsa & 0x3ff80), 128);
}
const u32 mask = utils::rol32(1, ch_mfc_cmd.tag);
const u32 mask = rx::rol32(1, ch_mfc_cmd.tag);
if ((mfc_barrier | mfc_fence) & mask) [[unlikely]]
{
@ -5214,11 +5161,11 @@ bool spu_thread::process_mfc_cmd()
}
mfc_queue[mfc_size++] = ch_mfc_cmd;
mfc_fence |= utils::rol32(1, ch_mfc_cmd.tag);
mfc_fence |= rx::rol32(1, ch_mfc_cmd.tag);
if (ch_mfc_cmd.cmd & MFC_BARRIER_MASK)
{
mfc_barrier |= utils::rol32(1, ch_mfc_cmd.tag);
mfc_barrier |= rx::rol32(1, ch_mfc_cmd.tag);
}
return true;
@ -5267,11 +5214,11 @@ bool spu_thread::process_mfc_cmd()
}
mfc_size++;
mfc_fence |= utils::rol32(1, cmd.tag);
mfc_fence |= rx::rol32(1, cmd.tag);
if (cmd.cmd & MFC_BARRIER_MASK)
{
mfc_barrier |= utils::rol32(1, cmd.tag);
mfc_barrier |= rx::rol32(1, cmd.tag);
}
if (check_mfc_interrupts(pc + 4))
@ -5297,7 +5244,7 @@ bool spu_thread::process_mfc_cmd()
{
mfc_queue[mfc_size++] = ch_mfc_cmd;
mfc_barrier |= -1;
mfc_fence |= utils::rol32(1, ch_mfc_cmd.tag);
mfc_fence |= rx::rol32(1, ch_mfc_cmd.tag);
}
return true;
@ -5592,7 +5539,7 @@ retry:
if (reading && res.locks && mask_hint & (SPU_EVENT_S1 | SPU_EVENT_S2))
{
busy_wait(100);
rx::busy_wait(100);
goto retry;
}
@ -5899,7 +5846,7 @@ s64 spu_thread::get_ch_value(u32 ch)
}
}
const usz seed = (utils::get_tsc() >> 8) % 100;
const usz seed = (rx::get_tsc() >> 8) % 100;
#ifdef __linux__
const bool reservation_busy_waiting = false;
@ -5998,7 +5945,7 @@ s64 spu_thread::get_ch_value(u32 ch)
{
if (u32 work_count = g_spu_work_count)
{
const u32 true_free = utils::sub_saturate<u32>(utils::get_thread_count(), 10);
const u32 true_free = rx::sub_saturate<u32>(utils::get_thread_count(), 10);
if (work_count > true_free)
{
@ -6123,7 +6070,7 @@ s64 spu_thread::get_ch_value(u32 ch)
}
else
{
busy_wait();
rx::busy_wait();
}
continue;
@ -6490,7 +6437,7 @@ bool spu_thread::set_ch_value(u32 ch, u32 value)
value &= 0x1f;
// Reset stall status for specified tag
const u32 tag_mask = utils::rol32(1, value);
const u32 tag_mask = rx::rol32(1, value);
if (ch_stall_mask & tag_mask)
{
@ -7320,7 +7267,7 @@ bool spu_thread::try_load_debug_capture()
void spu_thread::wakeup_delay(u32 div) const
{
if (g_cfg.core.spu_wakeup_delay_mask & (1u << index))
thread_ctrl::wait_for_accurate(utils::aligned_div(+g_cfg.core.spu_wakeup_delay, div));
thread_ctrl::wait_for_accurate(rx::aligned_div(+g_cfg.core.spu_wakeup_delay, div));
}
spu_function_logger::spu_function_logger(spu_thread& spu, const char* func) noexcept
@ -7397,7 +7344,7 @@ s64 spu_channel::pop_wait(cpu_thread& spu, bool pop)
for (int i = 0; i < 10; i++)
{
busy_wait();
rx::busy_wait();
if (!(data & bit_wait))
{
@ -7473,7 +7420,7 @@ bool spu_channel::push_wait(cpu_thread& spu, u32 value, bool push)
return true;
}
busy_wait();
rx::busy_wait();
state = data;
}
@ -7528,7 +7475,7 @@ std::pair<u32, u32> spu_channel_4_t::pop_wait(cpu_thread& spu, bool pop_value)
for (int i = 0; i < 10; i++)
{
busy_wait();
rx::busy_wait();
if (!atomic_storage<u8>::load(values.raw().waiting))
{
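
Aside: throughout the MFC hunks above, tag masks are built with rx::rol32(1, tag) rather than 1u << tag. Assuming the rotate masks its count the way std::rotl does, the expression stays well defined for any count, whereas a plain shift by 32 or more is undefined behaviour:

#include <bit>

static_assert(std::rotl(1u, 5) == 0x20u);      // equal to 1u << 5 for tags 0..31
static_assert(std::rotl(1u, 35) == (1u << 3)); // rotate wraps; the shift would be UB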

View file

@ -1,4 +1,6 @@
#include "stdafx.h"
#include "rx/align.hpp"
#include "vm_locking.h"
#include "vm_ptr.h"
#include "vm_ref.h"
@ -14,7 +16,8 @@
#include <span>
#include "util/vm.hpp"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "rx/align.hpp"
#include "util/simd.hpp"
#include "util/serialization.hpp"
@ -245,7 +248,7 @@ namespace vm
// Try triggering a page fault (write)
// TODO: Read memory if needed
utils::trigger_write_page_fault(vm::base(test / 4096 == begin / 4096 ? begin : test));
rx::trigger_write_page_fault(vm::base(test / 4096 == begin / 4096 ? begin : test));
continue;
}
}
@ -258,7 +261,7 @@ namespace vm
perf0.restart();
}
busy_wait(200);
rx::busy_wait(200);
if (i >= 2 && !_cpu)
{
@ -339,9 +342,9 @@ namespace vm
auto range_lock = &*std::prev(std::end(vm::g_range_lock_set));
*range_lock = addr | u64{size} << 32 | flags;
utils::prefetch_read(g_range_lock_set + 0);
utils::prefetch_read(g_range_lock_set + 2);
utils::prefetch_read(g_range_lock_set + 4);
rx::prefetch_read(g_range_lock_set + 0);
rx::prefetch_read(g_range_lock_set + 2);
rx::prefetch_read(g_range_lock_set + 4);
const auto range = utils::address_range::start_length(addr, size);
@ -364,7 +367,7 @@ namespace vm
break;
}
utils::pause();
rx::pause();
}
return range_lock;
@ -407,7 +410,7 @@ namespace vm
}
if (i < 100)
busy_wait(200);
rx::busy_wait(200);
else
std::this_thread::yield();
@ -516,12 +519,12 @@ namespace vm
if (to_prepare_memory)
{
// We have some spare time, prepare cache lines (todo: reservation tests here)
utils::prefetch_write(vm::get_super_ptr(addr));
utils::prefetch_write(vm::get_super_ptr(addr) + 64);
rx::prefetch_write(vm::get_super_ptr(addr));
rx::prefetch_write(vm::get_super_ptr(addr) + 64);
to_prepare_memory = false;
}
busy_wait(200);
rx::busy_wait(200);
}
else
{
@ -552,9 +555,9 @@ namespace vm
addr1 = static_cast<u16>(addr) | is_shared;
}
utils::prefetch_read(g_range_lock_set + 0);
utils::prefetch_read(g_range_lock_set + 2);
utils::prefetch_read(g_range_lock_set + 4);
rx::prefetch_read(g_range_lock_set + 0);
rx::prefetch_read(g_range_lock_set + 2);
rx::prefetch_read(g_range_lock_set + 4);
u64 to_clear = get_range_lock_bits(false);
@ -568,7 +571,7 @@ namespace vm
for (u64 hi = addr2 >> 16, max = (addr2 + size2 - 1) >> 16; hi <= max; hi++)
{
u64 addr3 = addr2;
u64 size3 = std::min<u64>(addr2 + size2, utils::align(addr2, 0x10000)) - addr2;
u64 size3 = std::min<u64>(addr2 + size2, rx::alignUp(addr2, 0x10000)) - addr2;
if (u64 is_shared = g_shmem[hi]) [[unlikely]]
{
@ -594,12 +597,12 @@ namespace vm
if (to_prepare_memory)
{
utils::prefetch_write(vm::get_super_ptr(addr));
utils::prefetch_write(vm::get_super_ptr(addr) + 64);
rx::prefetch_write(vm::get_super_ptr(addr));
rx::prefetch_write(vm::get_super_ptr(addr) + 64);
to_prepare_memory = false;
}
utils::pause();
rx::pause();
}
for (auto lock = g_locks.cbegin(), end = lock + g_cfg.core.ppu_threads; lock != end; lock++)
@ -610,12 +613,12 @@ namespace vm
{
if (to_prepare_memory)
{
utils::prefetch_write(vm::get_super_ptr(addr));
utils::prefetch_write(vm::get_super_ptr(addr) + 64);
rx::prefetch_write(vm::get_super_ptr(addr));
rx::prefetch_write(vm::get_super_ptr(addr) + 64);
to_prepare_memory = false;
}
utils::pause();
rx::pause();
}
}
}
@ -642,7 +645,7 @@ namespace vm
}
else if (i < 15)
{
busy_wait(500);
rx::busy_wait(500);
}
else
{
@ -683,7 +686,7 @@ namespace vm
}
else if (i < 15)
{
busy_wait(500);
rx::busy_wait(500);
}
else
{
@ -1078,13 +1081,13 @@ namespace vm
if (state & page_1m_size)
{
i = utils::align(i + 1, 0x100000 / 4096);
i = rx::alignUp(i + 1, 0x100000 / 4096);
continue;
}
if (state & page_64k_size)
{
i = utils::align(i + 1, 0x10000 / 4096);
i = rx::alignUp(i + 1, 0x10000 / 4096);
continue;
}
@ -1359,7 +1362,7 @@ namespace vm
const u32 min_page_size = flags & page_size_4k ? 0x1000 : 0x10000;
// Align to minimal page size
const u32 size = utils::align(orig_size, min_page_size) + (flags & stack_guarded ? 0x2000 : 0);
const u32 size = rx::alignUp(orig_size, min_page_size) + (flags & stack_guarded ? 0x2000 : 0);
// Check alignment (it's page allocation, so passing small values there is just silly)
if (align < min_page_size || align != (0x80000000u >> std::countl_zero(align)))
@ -1387,7 +1390,7 @@ namespace vm
const u32 max = (this->addr + this->size - size) & (0 - align);
u32 addr = utils::align(this->addr, align);
u32 addr = rx::alignUp(this->addr, align);
if (this->addr > max || addr > max)
{
@ -1434,7 +1437,7 @@ namespace vm
const u32 size0 = orig_size + addr % min_page_size;
// Align to minimal page size
const u32 size = utils::align(size0, min_page_size);
const u32 size = rx::alignUp(size0, min_page_size);
// Return if addr or size is invalid
// If shared memory is provided, addr/size must be aligned
@ -1870,7 +1873,7 @@ namespace vm
return nullptr;
}
for (u32 addr = utils::align<u32>(0x10000000, align);; addr += align)
for (u32 addr = rx::alignUp<u32>(0x10000000, align);; addr += align)
{
if (_test_map(addr, size))
{
@ -1950,7 +1953,7 @@ namespace vm
vm::writer_lock lock;
// Align to minimal page size
const u32 size = utils::align(orig_size, 0x10000);
const u32 size = rx::alignUp(orig_size, 0x10000);
// Check alignment
if (align < 0x10000 || align != (0x80000000u >> std::countl_zero(align)))
@ -2178,7 +2181,7 @@ namespace vm
// Wait a bit before accessing global lock
range_lock->release(0);
busy_wait(200);
rx::busy_wait(200);
}
const bool result = try_access_internal(begin, ptr, size, is_write);
@ -2399,7 +2402,7 @@ namespace vm
// Prevent overflow
const u32 size = 0 - max_size < addr ? (0 - addr) : max_size;
for (u32 i = addr, end = utils::align(addr + size, 4096) - 1; i <= end;)
for (u32 i = addr, end = rx::alignUp(addr + size, 4096) - 1; i <= end;)
{
if (check_pages && !vm::check_addr(i, vm::page_readable))
{
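
Note: the busy_wait/pause calls migrated above are spin-wait primitives. A rough sketch of their likely shape follows; the real rx:: versions are presumably TSC-calibrated rather than this fixed loop:

#include <cstddef>
#if defined(__x86_64__) || defined(_M_X64)
#include <immintrin.h>
#endif

// One architectural spin hint
inline void pause()
{
#if defined(__x86_64__) || defined(_M_X64)
	_mm_pause();
#elif defined(__aarch64__)
	__asm__ volatile("yield");
#endif
}

// Spin for roughly the requested number of cycles
inline void busy_wait(std::size_t cycles = 3000)
{
	for (std::size_t i = 0; i <= cycles / 300; i++)
		pause();
}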

View file

@ -3,7 +3,7 @@
#include "vm.h"
#include "vm_locking.h"
#include "util/atomic.hpp"
#include "util/tsc.hpp"
#include "rx/tsc.hpp"
#include <functional>
extern bool g_use_rtm;
@ -209,7 +209,7 @@ namespace vm
unsigned status = -1;
u64 _old = 0;
auto stamp0 = utils::get_tsc(), stamp1 = stamp0, stamp2 = stamp0;
auto stamp0 = rx::get_tsc(), stamp1 = stamp0, stamp2 = stamp0;
#ifndef _MSC_VER
__asm__ goto("xbegin %l[stage2];" ::: "memory" : stage2);
@ -271,16 +271,16 @@ namespace vm
#ifndef _MSC_VER
__asm__ volatile("mov %%eax, %0;" : "=r"(status)::"memory");
#endif
stamp1 = utils::get_tsc();
stamp1 = rx::get_tsc();
// Stage 2: try to lock reservation first
_old = res.fetch_add(1);
// Compute stamps excluding memory touch
stamp2 = utils::get_tsc() - (stamp1 - stamp0);
stamp2 = rx::get_tsc() - (stamp1 - stamp0);
// Start lightened transaction
for (; !(_old & vm::rsrv_unique_lock) && stamp2 - stamp0 <= g_rtm_tx_limit2; stamp2 = utils::get_tsc())
for (; !(_old & vm::rsrv_unique_lock) && stamp2 - stamp0 <= g_rtm_tx_limit2; stamp2 = rx::get_tsc())
{
if (cpu.has_pause_flag())
{
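
Note: the stamp0/stamp1/stamp2 bookkeeping above is in TSC units. rx::get_tsc presumably wraps the hardware counter, roughly as below; the arm64 Android builds in this PR would read CNTVCT_EL0 rather than RDTSC (an assumption, not quoted code):

#include <cstdint>
#if defined(_M_X64)
#include <intrin.h>
#elif defined(__x86_64__)
#include <x86intrin.h>
#endif

inline std::uint64_t get_tsc()
{
#if defined(__x86_64__) || defined(_M_X64)
	return __rdtsc();
#elif defined(__aarch64__)
	std::uint64_t cnt;
	__asm__ volatile("mrs %0, cntvct_el0" : "=r"(cnt)); // virtual counter
	return cnt;
#else
	return 0; // placeholder for other targets in this sketch
#endif
}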

View file

@ -4,7 +4,8 @@
#include "Emu/Memory/vm_ptr.h"
#include "util/mutex.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "rx/align.hpp"
#include "util/logs.hpp"
LOG_CHANNEL(np_mem_allocator);
@ -52,7 +53,7 @@ namespace np
}
// Align allocs
const u32 alloc_size = utils::align(size, 4);
const u32 alloc_size = rx::alignUp(size, 4);
if (alloc_size > m_avail)
{
np_mem_allocator.error("Not enough memory available in NP pool!");

View file

@ -1,7 +1,8 @@
#pragma once
#include "Emu/Memory/vm_ptr.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "rx/align.hpp"
namespace np
{
@ -9,7 +10,7 @@ namespace np
{
public:
event_data(u32 vm_addr, u32 initial_size, u32 max_size)
: m_max_size(max_size), m_cur_size(utils::align(initial_size, 4))
: m_max_size(max_size), m_cur_size(rx::alignUp(initial_size, 4))
{
m_data_ptr.set(vm_addr);
}
@ -50,7 +51,7 @@ namespace np
template <typename T>
T* allocate(u32 size, vm::bptr<T>& dest)
{
const u32 to_alloc = utils::align(size, 4);
const u32 to_alloc = rx::alignUp(size, 4);
ensure((m_cur_size + to_alloc) <= m_max_size, "event_data::allocate: size would overflow the allocated buffer!");
u8* dest_ptr = reinterpret_cast<u8*>(&dest);

View file

@ -1,5 +1,6 @@
#include "stdafx.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "rx/align.hpp"
#include "np_gui_cache.h"
LOG_CHANNEL(np_gui_cache);
@ -72,7 +73,7 @@ namespace np
const auto& room = ::at32(rooms, room_id);
const u32 room_size = ::narrow<u32>(utils::align(sizeof(SceNpMatchingRoomStatus), 8) + (utils::align(sizeof(SceNpMatchingRoomMember), 8) * room.members.size()));
const u32 room_size = ::narrow<u32>(rx::alignUp(sizeof(SceNpMatchingRoomStatus), 8) + (rx::alignUp(sizeof(SceNpMatchingRoomMember), 8) * room.members.size()));
if (!data)
return not_an_error(room_size);
@ -94,12 +95,12 @@ namespace np
{
if (!cur_member_ptr)
{
room_status->members = vm::cast(data.addr() + utils::align(sizeof(SceNpMatchingRoomStatus), 8));
room_status->members = vm::cast(data.addr() + rx::alignUp(sizeof(SceNpMatchingRoomStatus), 8));
cur_member_ptr = room_status->members;
}
else
{
cur_member_ptr->next = vm::cast(cur_member_ptr.addr() + utils::align(sizeof(SceNpMatchingRoomMember), 8));
cur_member_ptr->next = vm::cast(cur_member_ptr.addr() + rx::alignUp(sizeof(SceNpMatchingRoomMember), 8));
cur_member_ptr = cur_member_ptr->next;
}

View file

@ -7,7 +7,8 @@
#include "cellos/sys_memory.h"
#include "Emu/RSX/RSXThread.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "rx/align.hpp"
#include <thread>
@ -26,7 +27,7 @@ namespace rsx
}
// User memory + fifo size
buffer_size = utils::align<u32>(buffer_size, 0x100000) + 0x10000000;
buffer_size = rx::alignUp<u32>(buffer_size, 0x100000) + 0x10000000;
// We are not allowed to drain all memory so add a little
g_fxo->init<lv2_memory_container>(buffer_size + 0x1000000);

View file

@ -5,7 +5,8 @@
#include "../rsx_utils.h"
#include "3rdparty/bcdec/bcdec.hpp"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "rx/align.hpp"
namespace utils
{
@ -661,13 +662,13 @@ namespace
}
else if constexpr (block_edge_in_texel == 4)
{
current_subresource_layout.width_in_block = utils::aligned_div(miplevel_width_in_texel, block_edge_in_texel);
current_subresource_layout.height_in_block = utils::aligned_div(miplevel_height_in_texel, block_edge_in_texel);
current_subresource_layout.width_in_block = rx::aligned_div(miplevel_width_in_texel, block_edge_in_texel);
current_subresource_layout.height_in_block = rx::aligned_div(miplevel_height_in_texel, block_edge_in_texel);
}
else
{
// Only the width is compressed
current_subresource_layout.width_in_block = utils::aligned_div(miplevel_width_in_texel, block_edge_in_texel);
current_subresource_layout.width_in_block = rx::aligned_div(miplevel_width_in_texel, block_edge_in_texel);
current_subresource_layout.height_in_block = miplevel_height_in_texel;
}
@ -699,7 +700,7 @@ namespace
if (!padded_row) // Only swizzled textures obey this restriction
{
offset_in_src = utils::align(offset_in_src, 128);
offset_in_src = rx::alignUp(offset_in_src, 128);
}
}
@ -1429,8 +1430,8 @@ namespace rsx
usz result = 0;
for (u16 i = 0; i < mipmap; ++i)
{
usz rowPitch = utils::align(block_size_in_byte * width_in_blocks, row_pitch_alignment);
result += utils::align(rowPitch * height_in_blocks * depth, mipmap_alignment);
usz rowPitch = rx::alignUp(block_size_in_byte * width_in_blocks, row_pitch_alignment);
result += rx::alignUp(rowPitch * height_in_blocks * depth, mipmap_alignment);
height_in_blocks = std::max<usz>(height_in_blocks / 2, 1);
width_in_blocks = std::max<usz>(width_in_blocks / 2, 1);
}
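
A worked example of the linear-size loop above, with illustrative numbers not taken from the source: a 512x512 DXT5 texture (16-byte 4x4 blocks, so 128x128 blocks at mip 0), depth 1, rows aligned to 256 bytes and mip levels to 512:

// mip 0: rowPitch = alignUp(16 * 128, 256) = 2048; result += alignUp(2048 * 128, 512) = 262144
// mip 1: rowPitch = alignUp(16 *  64, 256) = 1024; result += alignUp(1024 *  64, 512) =  65536
// mip 2: rowPitch = alignUp(16 *  32, 256) =  512; result += alignUp( 512 *  32, 512) =  16384
// total for three mip levels: 262144 + 65536 + 16384 = 344064 bytes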

View file

@ -1,7 +1,8 @@
#pragma once
#include "util/StrFmt.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "rx/align.hpp"
/**
* Ring buffer memory helper:
@ -20,8 +21,8 @@ protected:
template <int Alignment>
bool can_alloc(usz size) const
{
usz alloc_size = utils::align(size, Alignment);
usz aligned_put_pos = utils::align(m_put_pos, Alignment);
usz alloc_size = rx::alignUp(size, Alignment);
usz aligned_put_pos = rx::alignUp(m_put_pos, Alignment);
if (aligned_put_pos + alloc_size < m_size)
{
// range before get
@ -85,8 +86,8 @@ public:
template <int Alignment>
usz alloc(usz size)
{
const usz alloc_size = utils::align(size, Alignment);
const usz aligned_put_pos = utils::align(m_put_pos, Alignment);
const usz alloc_size = rx::alignUp(size, Alignment);
const usz aligned_put_pos = rx::alignUp(m_put_pos, Alignment);
if (!can_alloc<Alignment>(size) && !grow(alloc_size))
{

View file

@ -1,7 +1,8 @@
#include "stdafx.h"
#include "surface_store.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "rx/align.hpp"
namespace rsx
{
@ -39,20 +40,20 @@ namespace rsx
{
switch (format)
{
case surface_color_format::b8: return utils::align(width, 256);
case surface_color_format::b8: return rx::alignUp(width, 256);
case surface_color_format::g8b8:
case surface_color_format::x1r5g5b5_o1r5g5b5:
case surface_color_format::x1r5g5b5_z1r5g5b5:
case surface_color_format::r5g6b5: return utils::align(width * 2, 256);
case surface_color_format::r5g6b5: return rx::alignUp(width * 2, 256);
case surface_color_format::a8b8g8r8:
case surface_color_format::x8b8g8r8_o8b8g8r8:
case surface_color_format::x8b8g8r8_z8b8g8r8:
case surface_color_format::x8r8g8b8_o8r8g8b8:
case surface_color_format::x8r8g8b8_z8r8g8b8:
case surface_color_format::x32:
case surface_color_format::a8r8g8b8: return utils::align(width * 4, 256);
case surface_color_format::w16z16y16x16: return utils::align(width * 8, 256);
case surface_color_format::w32z32y32x32: return utils::align(width * 16, 256);
case surface_color_format::a8r8g8b8: return rx::alignUp(width * 4, 256);
case surface_color_format::w16z16y16x16: return rx::alignUp(width * 8, 256);
case surface_color_format::w32z32y32x32: return rx::alignUp(width * 16, 256);
}
fmt::throw_exception("Unknown color surface format");
}
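
Two spot checks of the pitch table above, which rounds every row up to a 256-byte boundary (widths are illustrative):

#include <cstdint>

constexpr std::uint32_t pitch256(std::uint32_t width_bytes)
{
	return (width_bytes + 255u) & ~255u; // same shape as rx::alignUp(width_bytes, 256)
}

static_assert(pitch256(1280 * 4) == 5120); // a8r8g8b8, width 1280: already a multiple
static_assert(pitch256(1366) == 1536);     // b8, width 1366: rounded up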

View file

@ -8,7 +8,8 @@
#include "../rsx_utils.h"
#include <list>
#include "util/asm.hpp"
#include "rx/align.hpp"
#include "rx/asm.hpp"
namespace rsx
{
@ -806,7 +807,7 @@ namespace rsx
continue;
}
num_rows = utils::aligned_div(this_range.length(), rsx_pitch);
num_rows = rx::aligned_div(this_range.length(), rsx_pitch);
}
for (u32 row = 0, offset = (this_range.start - range.start), section_len = (this_range.end - range.start + 1);
@ -1186,7 +1187,7 @@ namespace rsx
{
// Width is calculated in the coordinate-space of the requester; normalize
info.src_area.x = (info.src_area.x * required_bpp) / surface_bpp;
info.src_area.width = utils::align(width * required_bpp, surface_bpp) / surface_bpp;
info.src_area.width = rx::alignUp(width * required_bpp, surface_bpp) / surface_bpp;
}
else
{

View file

@ -1,4 +1,4 @@
#pragma once
#include <util/asm.hpp>
#include <rx/asm.hpp>
#include <util/sysinfo.hpp>

View file

@ -1,6 +1,7 @@
#include "GLCompute.h"
#include "GLTexture.h"
#include "util/StrUtil.h"
#include "rx/align.hpp"
namespace gl
{
@ -196,7 +197,7 @@ namespace gl
m_data_length = data_length;
const auto num_bytes_per_invocation = optimal_group_size * kernel_size * 4;
const auto num_bytes_to_process = utils::align(data_length, num_bytes_per_invocation);
const auto num_bytes_to_process = rx::alignUp(data_length, num_bytes_per_invocation);
const auto num_invocations = num_bytes_to_process / num_bytes_per_invocation;
if ((num_bytes_to_process + data_offset) > data->size())
@ -364,7 +365,7 @@ namespace gl
dst->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(2), out_offset, row_pitch * 4 * region.height);
const int num_invocations = utils::aligned_div(region.width * region.height, optimal_kernel_size * optimal_group_size);
const int num_invocations = rx::aligned_div(region.width * region.height, optimal_kernel_size * optimal_group_size);
compute_task::run(cmd, num_invocations);
}
@ -411,7 +412,7 @@ namespace gl
dst->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(1), out_offset, row_pitch * 4 * region.height);
const int num_invocations = utils::aligned_div(region.width * region.height, optimal_kernel_size * optimal_group_size);
const int num_invocations = rx::aligned_div(region.width * region.height, optimal_kernel_size * optimal_group_size);
compute_task::run(cmd, num_invocations);
}
@ -437,7 +438,7 @@ namespace gl
void cs_ssbo_to_color_image::run(gl::command_context& cmd, const buffer* src, const texture_view* dst, const u32 src_offset, const coordu& dst_region, const pixel_buffer_layout& layout)
{
const u32 bpp = dst->image()->pitch() / dst->image()->width();
const u32 row_length = utils::align(dst_region.width * bpp, std::max<int>(layout.alignment, 1)) / bpp;
const u32 row_length = rx::alignUp(dst_region.width * bpp, std::max<int>(layout.alignment, 1)) / bpp;
m_program.uniforms["swap_bytes"] = layout.swap_bytes;
m_program.uniforms["src_pitch"] = row_length;
@ -448,7 +449,7 @@ namespace gl
src->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), src_offset, row_length * bpp * dst_region.height);
glBindImageTexture(GL_COMPUTE_IMAGE_SLOT(0), dst->id(), 0, GL_FALSE, 0, GL_WRITE_ONLY, dst->view_format());
const int num_invocations = utils::aligned_div(dst_region.width * dst_region.height, optimal_kernel_size * optimal_group_size);
const int num_invocations = rx::aligned_div(dst_region.width * dst_region.height, optimal_kernel_size * optimal_group_size);
compute_task::run(cmd, num_invocations);
}
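
The dispatch above issues one invocation per optimal_kernel_size * optimal_group_size elements. With illustrative numbers (a 1920x1080 region and 512 elements per invocation, e.g. kernel size 8, group size 64):

static_assert((1920u * 1080u + 511u) / 512u == 4050u); // aligned_div(2073600, 512)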

View file

@ -337,7 +337,7 @@ namespace gl
set_parameters(cmd);
const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size);
const u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation);
const u32 linear_invocations = rx::aligned_div(data_length, num_bytes_per_invocation);
compute_task::run(cmd, linear_invocations);
}
};

View file

@ -12,6 +12,8 @@
#include "Emu/RSX/Host/RSXDMAWriter.h"
#include "Emu/RSX/NV47/HW/context_accessors.define.h"
#include "rx/align.hpp"
[[noreturn]] extern void report_fatal_error(std::string_view _text, bool is_html = false, bool include_help_text = true);
namespace
@ -895,7 +897,7 @@ void GLGSRender::load_program_env()
if (update_fragment_texture_env)
m_texture_parameters_buffer->reserve_storage_on_heap(256);
if (update_fragment_constants)
m_fragment_constants_buffer->reserve_storage_on_heap(utils::align(fragment_constants_size, 256));
m_fragment_constants_buffer->reserve_storage_on_heap(rx::alignUp(fragment_constants_size, 256));
if (update_transform_constants)
m_transform_constants_buffer->reserve_storage_on_heap(8192);
if (update_raster_env)

View file

@ -15,7 +15,7 @@
#include "util/geometry.h"
#include "util/File.h"
#include "util/logs.hpp"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "glutils/common.h"
// TODO: Include on use

View file

@ -4,6 +4,8 @@
#include "../Program/RSXOverlay.h"
#include "Emu/Cell/timers.hpp"
#include "rx/align.hpp"
namespace gl
{
// Lame
@ -544,7 +546,7 @@ namespace gl
const pixel_buffer_layout& layout)
{
const u32 bpp = dst->image()->pitch() / dst->image()->width();
const u32 row_length = utils::align(dst_region.width * bpp, std::max<int>(layout.alignment, 1)) / bpp;
const u32 row_length = rx::alignUp(dst_region.width * bpp, std::max<int>(layout.alignment, 1)) / bpp;
program_handle.uniforms["src_pitch"] = row_length;
program_handle.uniforms["swap_bytes"] = layout.swap_bytes;

View file

@ -2,6 +2,8 @@
#include "GLResolveHelper.h"
#include "GLTexture.h"
#include "rx/align.hpp"
#include <unordered_map>
#include <stack>
@ -225,8 +227,8 @@ namespace gl
multisampled = msaa_image;
resolve = resolve_image;
const u32 invocations_x = utils::align(resolve_image->width(), cs_wave_x) / cs_wave_x;
const u32 invocations_y = utils::align(resolve_image->height(), cs_wave_y) / cs_wave_y;
const u32 invocations_x = rx::alignUp(resolve_image->width(), cs_wave_x) / cs_wave_x;
const u32 invocations_y = rx::alignUp(resolve_image->height(), cs_wave_y) / cs_wave_y;
compute_task::run(cmd, invocations_x, invocations_y);
}

View file

@ -9,7 +9,8 @@
#include "../RSXThread.h"
#include "util/asm.hpp"
#include "rx/align.hpp"
#include "rx/asm.hpp"
namespace gl
{
@ -664,7 +665,7 @@ namespace gl
u8 block_size_in_bytes = rsx::get_format_block_size_in_bytes(format);
u64 image_linear_size = staging_buffer.size();
const auto min_required_buffer_size = std::max<u64>(utils::align(image_linear_size * 4, 0x100000), 16 * 0x100000);
const auto min_required_buffer_size = std::max<u64>(rx::alignUp(image_linear_size * 4, 0x100000), 16 * 0x100000);
if (driver_caps.ARB_compute_shader_supported)
{
@ -825,7 +826,7 @@ namespace gl
}
else
{
const auto aligned_pitch = utils::align<u32>(dst->pitch(), 4);
const auto aligned_pitch = rx::alignUp<u32>(dst->pitch(), 4);
const u32 texture_data_sz = dst->depth() * dst->height() * aligned_pitch;
data_upload_buf.resize(texture_data_sz);
}
@ -1002,7 +1003,7 @@ namespace gl
u32 scratch_offset = 0;
const u64 min_storage_requirement = src_mem.image_size_in_bytes + dst_mem.image_size_in_bytes;
const u64 min_required_buffer_size = utils::align(min_storage_requirement, 256);
const u64 min_required_buffer_size = rx::alignUp(min_storage_requirement, 256);
if (g_typeless_transfer_buffer.size() >= min_required_buffer_size) [[likely]]
{

View file

@ -3,7 +3,8 @@
#include "GLTextureCache.h"
#include "../Common/BufferUtils.h"
#include "util/asm.hpp"
#include "rx/align.hpp"
#include "rx/asm.hpp"
namespace gl
{
@ -82,7 +83,7 @@ namespace gl
}
else
{
const u32 num_rows = utils::align(valid_length, rsx_pitch) / rsx_pitch;
const u32 num_rows = rx::alignUp(valid_length, rsx_pitch) / rsx_pitch;
u32* data = static_cast<u32*>(dst);
for (u32 row = 0; row < num_rows; ++row)
{
@ -212,7 +213,7 @@ namespace gl
// Dimensions were given in 'dst' space. Work out the real source coordinates
const auto src_bpp = slice.src->pitch() / slice.src->width();
src_x = (src_x * dst_bpp) / src_bpp;
src_w = utils::aligned_div<u16>(src_w * dst_bpp, src_bpp);
src_w = rx::aligned_div<u16>(src_w * dst_bpp, src_bpp);
}
if (auto surface = dynamic_cast<gl::render_target*>(slice.src))

View file

@ -7,6 +7,8 @@
#include "../Common/texture_cache.h"
#include "rx/align.hpp"
#include <memory>
#include <vector>
@ -49,7 +51,7 @@ namespace gl
void init_buffer(const gl::texture* src)
{
const u32 vram_size = src->pitch() * src->height();
const u32 buffer_size = utils::align(vram_size, 4096);
const u32 buffer_size = rx::alignUp(vram_size, 4096);
if (pbo)
{

View file

@ -2,7 +2,7 @@
#include "../OpenGL.h"
#include <util/types.hpp>
#include <util/asm.hpp>
#include <rx/asm.hpp>
#include <util/logs.hpp>
namespace gl

View file

@ -3,6 +3,7 @@
#include "buffer_object.h"
#include "state_tracker.hpp"
#include "pixel_settings.hpp"
#include "rx/align.hpp"
namespace gl
{
@ -119,14 +120,14 @@ namespace gl
case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
{
m_compressed = true;
m_pitch = utils::align(width, 4) / 2;
m_pitch = rx::alignUp(width, 4) / 2;
break;
}
case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
{
m_compressed = true;
m_pitch = utils::align(width, 4);
m_pitch = rx::alignUp(width, 4);
break;
}
default:

View file

@ -1,6 +1,8 @@
#include "stdafx.h"
#include "ring_buffer.h"
#include "rx/align.hpp"
namespace gl
{
void ring_buffer::recreate(GLsizeiptr size, const void* data)
@ -37,7 +39,7 @@ namespace gl
{
u32 offset = m_data_loc;
if (m_data_loc)
offset = utils::align(offset, alignment);
offset = rx::alignUp(offset, alignment);
if ((offset + alloc_size) > m_size)
{
@ -56,7 +58,7 @@ namespace gl
}
// Align data loc to 256; allows some "guard" region so we dont trample our own data inadvertently
m_data_loc = utils::align(offset + alloc_size, 256);
m_data_loc = rx::alignUp(offset + alloc_size, 256);
return std::make_pair(static_cast<char*>(m_memory_mapping) + offset, offset);
}
@ -108,9 +110,9 @@ namespace gl
u32 offset = m_data_loc;
if (m_data_loc)
offset = utils::align(offset, 256);
offset = rx::alignUp(offset, 256);
const u32 block_size = utils::align(alloc_size + 16, 256); // Overallocate just in case we need to realign base
const u32 block_size = rx::alignUp(alloc_size + 16, 256); // Overallocate just in case we need to realign base
if ((offset + block_size) > m_size)
{
@ -144,10 +146,10 @@ namespace gl
{
u32 offset = m_data_loc;
if (m_data_loc)
offset = utils::align(offset, alignment);
offset = rx::alignUp(offset, alignment);
u32 padding = (offset - m_data_loc);
u32 real_size = utils::align(padding + alloc_size, alignment); // Ensures we leave the loc pointer aligned after we exit
u32 real_size = rx::alignUp(padding + alloc_size, alignment); // Ensures we leave the loc pointer aligned after we exit
if (real_size > m_mapped_bytes)
{
@ -158,10 +160,10 @@ namespace gl
offset = m_data_loc;
if (m_data_loc)
offset = utils::align(offset, alignment);
offset = rx::alignUp(offset, alignment);
padding = (offset - m_data_loc);
real_size = utils::align(padding + alloc_size, alignment);
real_size = rx::alignUp(padding + alloc_size, alignment);
}
m_data_loc = offset + real_size;
@ -270,7 +272,7 @@ namespace gl
u32 scratch_ring_buffer::alloc(u32 size, u32 alignment)
{
u64 start = utils::align(m_alloc_pointer, alignment);
u64 start = rx::alignUp(m_alloc_pointer, alignment);
m_alloc_pointer = (start + size);
if (static_cast<GLsizeiptr>(m_alloc_pointer) > m_storage.size())
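
The ring_buffer changes keep the allocation pattern intact: align the write cursor, reserve the block, then re-align the cursor past the allocation. A standalone sketch of that pattern (hypothetical helper, not the real gl::ring_buffer):

// Sketch of the sub-allocation pattern used by gl::ring_buffer above.
#include <cstdint>
#include <utility>

constexpr std::uint32_t align_up(std::uint32_t v, std::uint32_t a)
{
    return (v + (a - 1)) & ~(a - 1); // power-of-two alignment assumed
}

// Returns {offset of the allocation, new cursor position}.
std::pair<std::uint32_t, std::uint32_t> suballoc(std::uint32_t cursor, std::uint32_t size, std::uint32_t alignment)
{
    const std::uint32_t offset = cursor ? align_up(cursor, alignment) : 0;
    // Re-align the cursor to 256 afterwards, leaving a "guard" region so the
    // next allocation can't trample this one (mirrors the comment above).
    const std::uint32_t next = align_up(offset + size, 256);
    return {offset, next};
}

int main()
{
    auto [offset, next] = suballoc(300, 128, 256);
    // 300 aligns up to 512; 512 + 128 = 640 re-aligns to 768 for the next caller.
    return (offset == 512 && next == 768) ? 0 : 1;
}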

View file

@ -58,7 +58,7 @@ namespace gl
m_src = fmt::replace_all(m_src, replacement_table);
// Fill with 0 to avoid sending incomplete/unused variables to the GPU
m_constants_buf.resize(utils::rounded_div(push_constants_size, 4), 0);
m_constants_buf.resize(rx::rounded_div(push_constants_size, 4), 0);
create();
@ -106,8 +106,8 @@ namespace gl
glBindImageTexture(GL_COMPUTE_IMAGE_SLOT(0), dst->id(), 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA8);
constexpr auto wg_size = 16;
const auto invocations_x = utils::aligned_div(output_size.width, wg_size);
const auto invocations_y = utils::aligned_div(output_size.height, wg_size);
const auto invocations_x = rx::aligned_div(output_size.width, wg_size);
const auto invocations_y = rx::aligned_div(output_size.height, wg_size);
ensure(invocations_x == (output_size.width + (wg_size - 1)) / wg_size);
ensure(invocations_y == (output_size.height + (wg_size - 1)) / wg_size);
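
Note the two different helpers in this file: rx::rounded_div (assumed round-to-nearest, matching the old utils::rounded_div) sizes the constants buffer, while rx::aligned_div rounds up, which the ensure() lines above re-derive explicitly. A small sketch of the difference:

// Sketch: assumed rx::rounded_div (round-to-nearest) vs. aligned_div (round-up).
#include <cassert>

constexpr unsigned rounded_div(unsigned value, unsigned divisor)
{
    return (value + divisor / 2) / divisor; // rounds to the nearest integer
}

int main()
{
    assert(rounded_div(10, 4) == 3); // 2.5 rounds up to 3
    assert(rounded_div(9, 4) == 2);  // 2.25 rounds down to 2; aligned_div would give 3
}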

View file

@ -2,7 +2,7 @@
#include "RSXDMAWriter.h"
#include "util//Thread.h"
#include <util/asm.hpp>
#include <rx/asm.hpp>
namespace rsx
{
@ -56,7 +56,7 @@ namespace rsx
// FIXME: This is a busy wait, consider yield to improve responsiveness on weak devices.
while (!m_host_context_ptr->in_flight_commands_completed())
{
utils::pause();
rx::pause();
if (thread_ctrl::state() == thread_state::aborting)
{
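
utils::pause becomes rx::pause in each of these spin loops. A sketch of what such a spin-wait hint typically compiles to (assumed implementation; the real rx/asm.hpp may differ):

// Sketch: a spin-wait hint in the style of rx::pause (assumed implementation).
#if defined(__x86_64__) || defined(_M_X64)
#include <immintrin.h>
#endif

inline void pause_hint()
{
#if defined(__x86_64__) || defined(_M_X64)
    _mm_pause(); // de-pipelines the spin loop and saves power on x86
#elif defined(__aarch64__)
    __asm__ volatile("yield"); // AArch64 equivalent hint
#endif
}

// Usage mirrors the loops above:
//   while (!m_host_context_ptr->in_flight_commands_completed()) pause_hint();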

View file

@ -3,7 +3,7 @@
#include "Emu/System.h"
#include "rpcsx/fw/ps3/cellMsgDialog.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
namespace rsx
{
@ -36,7 +36,7 @@ namespace rsx
while (ref_cnt.load() && !Emu.IsStopped())
{
utils::pause();
rx::pause();
}
}
@ -112,7 +112,7 @@ namespace rsx
{
while (ref_cnt.load() && !Emu.IsStopped())
{
utils::pause();
rx::pause();
}
}
} // namespace rsx

View file

@ -1,7 +1,7 @@
#include "stdafx.h"
#include "overlay_manager.h"
#include "Emu/System.h"
#include <util/asm.hpp>
#include <rx/asm.hpp>
namespace rsx
{
@ -37,7 +37,7 @@ namespace rsx
*m_input_thread = thread_state::aborting;
while (*m_input_thread <= thread_state::aborting)
{
utils::pause();
rx::pause();
}
}
}

View file

@ -9,7 +9,8 @@
#include "cellos/sys_rsx.h"
#include "NV47/HW/context.h"
#include "util/asm.hpp"
#include "rx/align.hpp"
#include "rx/asm.hpp"
#include <thread>
#include <bitset>
@ -139,7 +140,7 @@ namespace rsx
u32 bytes_read = 0;
// Find the next set bit after every iteration
for (int i = 0;; i = (std::countr_zero<u32>(utils::rol8(to_fetch, 0 - i - 1)) + i + 1) % 8)
for (int i = 0;; i = (std::countr_zero<u32>(rx::rol8(to_fetch, 0 - i - 1)) + i + 1) % 8)
{
// If a reservation is being updated, try to load another
const auto& res = vm::reservation_acquire(addr1 + i * 128);
@ -193,7 +194,7 @@ namespace rsx
}
else
{
busy_wait(200);
rx::busy_wait(200);
}
if (strict_fetch_ordering)
@ -247,7 +248,7 @@ namespace rsx
for (u32 remaining = size, addr = m_internal_get, ptr = from; remaining > 0;)
{
const u32 next_block = utils::align(addr + 1, _1M);
const u32 next_block = rx::alignUp(addr + 1, _1M);
const u32 available = (next_block - addr);
if (remaining <= available)
{
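
The fetch loop above uses rx::rol8, an 8-bit rotate-left (assumed to match the old utils::rol8). Rotating the mask right by i + 1 and counting trailing zeros finds the next set bit cyclically; a sketch:

// Sketch: assumed semantics of rx::rol8, as used by the "find the next set
// bit" loop above.
#include <bit>
#include <cstdint>

constexpr std::uint8_t rol8(std::uint8_t v, int s)
{
    return std::rotl(v, s); // std::rotl treats a negative shift as rotate-right
}

// Rotating the mask right by (i + 1) and counting trailing zeros gives the
// distance to the next set bit after position i, modulo 8.
constexpr int next_set_bit(std::uint8_t mask, int i)
{
    return (std::countr_zero<std::uint8_t>(rol8(mask, 0 - i - 1)) + i + 1) % 8;
}

static_assert(next_set_bit(0b0000'1010, 1) == 3); // bit 3 follows bit 1

int main() {}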

View file

@ -9,7 +9,7 @@
#include "util/lockless.h"
#include <thread>
#include "util/asm.hpp"
#include "rx/asm.hpp"
namespace rsx
{
@ -181,13 +181,13 @@ namespace rsx
while (_thr.m_enqueued_count.load() > _thr.m_processed_count.load())
{
rsxthr->on_semaphore_acquire_wait();
utils::pause();
rx::pause();
}
}
else
{
while (_thr.m_enqueued_count.load() > _thr.m_processed_count.load())
utils::pause();
rx::pause();
}
return true;

View file

@ -27,7 +27,8 @@
#include "util/date_time.h"
#include "util/asm.hpp"
#include "rx/align.hpp"
#include "rx/asm.hpp"
#include <span>
#include <thread>
@ -332,11 +333,11 @@ namespace rsx
{
// Division operator
_min_index = std::min(_min_index, first / attrib.frequency);
_max_index = std::max<u32>(_max_index, utils::aligned_div(max_index, attrib.frequency));
_max_index = std::max<u32>(_max_index, rx::aligned_div(max_index, attrib.frequency));
if (freq_count > 0 && freq_count != umax)
{
const u32 max = utils::aligned_div(max_index, attrib.frequency);
const u32 max = rx::aligned_div(max_index, attrib.frequency);
max_result_by_division = std::max<u32>(max_result_by_division, max);
// Discard lower frequencies because it has been proven that there are indices higher than them
@ -365,7 +366,7 @@ namespace rsx
// The alternative would be re-iterating again over all of them
if (get_location(real_offset_address) == CELL_GCM_LOCATION_LOCAL)
{
if (utils::add_saturate<u32>(real_offset_address - rsx::constants::local_mem_base, (_max_index + 1) * attribute_stride) <= render->local_mem_size)
if (rx::add_saturate<u32>(real_offset_address - rsx::constants::local_mem_base, (_max_index + 1) * attribute_stride) <= render->local_mem_size)
{
break;
}
@ -734,7 +735,7 @@ namespace rsx
{
// Be compatible with previous bitwise serialization
ar(std::span<u8>(reinterpret_cast<u8*>(this), OFFSET_OF(avconf, scan_mode)));
ar.pos += utils::align<usz>(OFFSET_OF(avconf, scan_mode), alignof(avconf)) - OFFSET_OF(avconf, scan_mode);
ar.pos += rx::alignUp<usz>(OFFSET_OF(avconf, scan_mode), alignof(avconf)) - OFFSET_OF(avconf, scan_mode);
return;
}
@ -1169,7 +1170,7 @@ namespace rsx
for (; t == now; now = get_time_ns())
{
utils::pause();
rx::pause();
}
timestamp_ctrl = now;
@ -2590,7 +2591,7 @@ namespace rsx
{
if (u32 advance = disasm.disasm(pcs_of_valid_cmds.back()))
{
pcs_of_valid_cmds.push_back(utils::add_saturate<u32>(pcs_of_valid_cmds.back(), advance));
pcs_of_valid_cmds.push_back(rx::add_saturate<u32>(pcs_of_valid_cmds.back(), advance));
}
else
{
@ -2722,7 +2723,7 @@ namespace rsx
}
// Some cases do not need full delay
remaining = utils::aligned_div(remaining, div);
remaining = rx::aligned_div(remaining, div);
const u64 until = get_system_time() + remaining;
while (true)
@ -2751,7 +2752,7 @@ namespace rsx
}
else
{
busy_wait(100);
rx::busy_wait(100);
}
const u64 current = get_system_time();
@ -2862,7 +2863,7 @@ namespace rsx
for (u32 ea = address >> 20, end = ea + (size >> 20); ea < end; ea++)
{
const u32 io = utils::rol32(iomap_table.io[ea], 32 - 20);
const u32 io = rx::rol32(iomap_table.io[ea], 32 - 20);
if (io + 1)
{
@ -2892,7 +2893,7 @@ namespace rsx
while (to_unmap)
{
bit = (std::countr_zero<u64>(utils::rol64(to_unmap, 0 - bit)) + bit);
bit = (std::countr_zero<u64>(rx::rol64(to_unmap, 0 - bit)) + bit);
to_unmap &= ~(1ull << bit);
constexpr u16 null_entry = 0xFFFF;
@ -2998,7 +2999,7 @@ namespace rsx
while (!external_interrupt_ack && !is_stopped())
{
utils::pause();
rx::pause();
}
}
@ -3022,7 +3023,7 @@ namespace rsx
while (external_interrupt_lock && (cpu_flag::ret - state))
{
// TODO: Investigate non busy-spinning method
utils::pause();
rx::pause();
}
external_interrupt_ack.store(false);
@ -3364,7 +3365,7 @@ namespace rsx
}
const u64 current_time = get_system_time();
const u64 current_tsc = utils::get_tsc();
const u64 current_tsc = rx::get_tsc();
u64 preempt_count = 0;
if (frame_times.size() >= 60)
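
rx::add_saturate<u32> (assumed to clamp at the type's maximum instead of wrapping, like the old utils::add_saturate) is what keeps the local-memory bounds check and the command-stream PC advance above safe near the top of the 32-bit address space. A sketch:

// Sketch: assumed semantics of rx::add_saturate (clamps instead of wrapping).
#include <cassert>
#include <cstdint>
#include <limits>

template <typename T>
constexpr T add_saturate(T a, T b)
{
    // If a + b would overflow, return the maximum representable value.
    return (a > std::numeric_limits<T>::max() - b) ? std::numeric_limits<T>::max()
                                                   : static_cast<T>(a + b);
}

int main()
{
    assert(add_saturate<std::uint32_t>(0xFFFF'FFF0u, 0x20u) == 0xFFFF'FFFFu);
}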

View file

@ -4,6 +4,8 @@
#include "vkutils/buffer_object.h"
#include "VKPipelineCompiler.h"
#include "rx/align.hpp"
#define VK_MAX_COMPUTE_TASKS 8192 // Max number of jobs per frame
namespace vk
@ -219,7 +221,7 @@ namespace vk
#include "../Program/GLSLSnippets/ShuffleBytes.glsl"
;
const auto parameters_size = utils::align(push_constants_size, 16) / 16;
const auto parameters_size = rx::alignUp(push_constants_size, 16) / 16;
const std::pair<std::string_view, std::string> syntax_replace[] =
{
{"%loc", "0"},
@ -387,7 +389,7 @@ namespace vk
word_count = num_words;
block_length = num_words * 4;
const u32 linear_invocations = utils::aligned_div(word_count, optimal_group_size);
const u32 linear_invocations = rx::aligned_div(word_count, optimal_group_size);
compute_task::run(cmd, linear_invocations);
}
} // namespace vk

View file

@ -6,7 +6,8 @@
#include "Emu/IdManager.h"
#include "util/StrUtil.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "rx/align.hpp"
#include <unordered_map>
@ -484,7 +485,7 @@ namespace vk
set_parameters(cmd);
const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size);
const u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation);
const u32 linear_invocations = rx::aligned_div(data_length, num_bytes_per_invocation);
compute_task::run(cmd, linear_invocations);
}
};
@ -602,8 +603,8 @@ namespace vk
this->out_offset = config.dst_offset;
const auto tile_aligned_height = std::min(
utils::align<u32>(config.image_height, 64),
utils::aligned_div(config.tile_size - config.tile_base_offset, config.tile_pitch));
rx::alignUp<u32>(config.image_height, 64),
rx::aligned_div(config.tile_size - config.tile_base_offset, config.tile_pitch));
if constexpr (Op == RSX_detiler_op::decode)
{
@ -656,7 +657,7 @@ namespace vk
const u32 subtexels_per_invocation = (config.image_bpp < 4) ? (4 / config.image_bpp) : 1;
const u32 virtual_width = config.image_width / subtexels_per_invocation;
const u32 invocations_x = utils::aligned_div(virtual_width, optimal_group_size);
const u32 invocations_x = rx::aligned_div(virtual_width, optimal_group_size);
compute_task::run(cmd, invocations_x, config.image_height, 1);
}
};

View file

@ -7,7 +7,9 @@
#include "Emu/RSX/RSXThread.h"
#include "util/mutex.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "rx/align.hpp"
#include <unordered_map>
namespace vk
@ -413,7 +415,7 @@ namespace vk
std::lock_guard lock(g_dma_mutex);
const u32 start = (local_address & s_dma_block_mask);
const u32 end = utils::align(local_address + length, static_cast<u32>(s_dma_block_length));
const u32 end = rx::alignUp(local_address + length, static_cast<u32>(s_dma_block_length));
for (u32 block = start; block < end;)
{
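
The block loop above rounds the requested range out to whole DMA blocks: start rounds down via the block mask, end rounds up via rx::alignUp. A worked sketch with a hypothetical 64 KiB block size (the real s_dma_block_length may differ):

// Sketch: covering [address, address + length) with fixed power-of-two blocks,
// as in the DMA mapping loop above (block size hypothetical).
#include <cassert>
#include <cstdint>

constexpr std::uint32_t block_length = 0x10000;           // hypothetical 64 KiB blocks
constexpr std::uint32_t block_mask = ~(block_length - 1); // start-of-block mask

int main()
{
    const std::uint32_t address = 0x12345, length = 0x2000;
    const std::uint32_t start = address & block_mask;                             // round down
    const std::uint32_t end = (address + length + block_length - 1) & block_mask; // round up
    assert(start == 0x10000 && end == 0x20000); // one block covers the whole range
}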

View file

@ -22,7 +22,8 @@
#include "../Program/SPIRVCommon.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "rx/align.hpp"
namespace vk
{
@ -919,7 +920,7 @@ bool VKGSRender::on_access_violation(u32 address, bool is_writing)
// Wait for deadlock to clear
while (m_queue_status & flush_queue_state::deadlock)
{
utils::pause();
rx::pause();
}
g_fxo->get<rsx::dma_manager>().clear_mem_fault_flag();
@ -2081,13 +2082,13 @@ void VKGSRender::load_program_env()
rsx::io_buffer indirection_table_buf([&](usz size) -> std::pair<void*, usz>
{
indirection_table_offset = m_instancing_buffer_ring_info.alloc<1>(utils::align(size, alignment));
indirection_table_offset = m_instancing_buffer_ring_info.alloc<1>(rx::alignUp(size, alignment));
return std::make_pair(m_instancing_buffer_ring_info.map(indirection_table_offset, size), size);
});
rsx::io_buffer constants_array_buf([&](usz size) -> std::pair<void*, usz>
{
constants_data_table_offset = m_instancing_buffer_ring_info.alloc<1>(utils::align(size, alignment));
constants_data_table_offset = m_instancing_buffer_ring_info.alloc<1>(rx::alignUp(size, alignment));
return std::make_pair(m_instancing_buffer_ring_info.map(constants_data_table_offset, size), size);
});
@ -2105,7 +2106,7 @@ void VKGSRender::load_program_env()
auto alloc_storage = [&](usz size) -> std::pair<void*, usz>
{
const auto alignment = m_device->gpu().get_limits().minUniformBufferOffsetAlignment;
mem_offset = m_transform_constants_ring_info.alloc<1>(utils::align(size, alignment));
mem_offset = m_transform_constants_ring_info.alloc<1>(rx::alignUp(size, alignment));
return std::make_pair(m_transform_constants_ring_info.map(mem_offset, size), size);
};
@ -2921,7 +2922,7 @@ void VKGSRender::get_occlusion_query_result(rsx::reports::occlusion_query_info*
}
rsx_log.warning("[Performance warning] Unexpected ZCULL read caused a hard sync");
busy_wait();
rx::busy_wait();
}
data.sync();
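
rx::busy_wait takes an approximate cycle budget and is assumed to default it when called bare, as in the hard-sync path above (the old utils::busy_wait defaulted to roughly 3000 cycles; the real constant may differ). A sketch:

// Sketch: assumed shape of rx::busy_wait (spin for ~'cycles' before re-checking).
#if defined(__x86_64__) || defined(_M_X64)
#include <immintrin.h>
#endif

inline void busy_wait(unsigned cycles = 3000)
{
    // Each pause covers a few dozen cycles; iterate until the budget is spent.
    // On non-x86 targets this degenerates to a plain counted loop.
    for (unsigned spent = 0; spent < cycles; spent += 32)
    {
#if defined(__x86_64__) || defined(_M_X64)
        _mm_pause();
#endif
    }
}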

View file

@ -8,7 +8,7 @@
#include "Emu/RSX/rsx_utils.h"
#include "Emu/RSX/rsx_cache.h"
#include "util/mutex.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include <optional>
#include <thread>
@ -289,7 +289,7 @@ namespace vk
{
while (num_waiters.load() != 0)
{
utils::pause();
rx::pause();
}
}

View file

@ -8,7 +8,8 @@
#include "upscalers/bilinear_pass.hpp"
#include "upscalers/fsr_pass.h"
#include "upscalers/nearest_pass.hpp"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "rx/align.hpp"
#include "util/video_provider.h"
extern atomic_t<bool> g_user_asked_for_screenshot;
@ -762,7 +763,7 @@ void VKGSRender::flip(const rsx::display_flip_info_t& info)
{
const usz sshot_size = buffer_height * buffer_width * 4;
vk::buffer sshot_vkbuf(*m_device, utils::align(sshot_size, 0x100000), m_device->get_memory_mapping().host_visible_coherent,
vk::buffer sshot_vkbuf(*m_device, rx::alignUp(sshot_size, 0x100000), m_device->get_memory_mapping().host_visible_coherent,
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0, VMM_ALLOCATION_POOL_UNDEFINED);
VkBufferImageCopy copy_info;

View file

@ -4,7 +4,7 @@
#include "VKQueryPool.h"
#include "VKRenderPass.h"
#include "VKResourceManager.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "VKGSRender.h"
namespace vk
@ -172,7 +172,7 @@ namespace vk
while (!query_info.ready)
{
utils::pause();
rx::pause();
poke_query(query_info, index, result_flags);
}
}

View file

@ -4,6 +4,7 @@
#include "VKOverlays.h"
#include "vkutils/image.h"
#include "rx/align.hpp"
namespace vk
{
@ -65,8 +66,8 @@ namespace vk
multisampled = msaa_image;
resolve = resolve_image;
const u32 invocations_x = utils::align(resolve_image->width(), cs_wave_x) / cs_wave_x;
const u32 invocations_y = utils::align(resolve_image->height(), cs_wave_y) / cs_wave_y;
const u32 invocations_x = rx::alignUp(resolve_image->width(), cs_wave_x) / cs_wave_x;
const u32 invocations_y = rx::alignUp(resolve_image->height(), cs_wave_y) / cs_wave_y;
compute_task::run(cmd, invocations_x, invocations_y, 1);
}

View file

@ -13,7 +13,8 @@
#include "../GCM.h"
#include "../rsx_utils.h"
#include "util/asm.hpp"
#include "rx/align.hpp"
#include "rx/asm.hpp"
namespace vk
{
@ -94,7 +95,7 @@ namespace vk
ensure(dst->size() >= allocation_end);
const auto data_offset = u32(region.bufferOffset);
const auto z32_offset = utils::align<u32>(data_offset + packed16_length, 256);
const auto z32_offset = rx::alignUp<u32>(data_offset + packed16_length, 256);
// 1. Copy the depth to buffer
VkBufferImageCopy region2;
@ -148,8 +149,8 @@ namespace vk
ensure(dst->size() >= allocation_end);
const auto data_offset = u32(region.bufferOffset);
const auto z_offset = utils::align<u32>(data_offset + packed_length, 256);
const auto s_offset = utils::align<u32>(z_offset + in_depth_size, 256);
const auto z_offset = rx::alignUp<u32>(data_offset + packed_length, 256);
const auto s_offset = rx::alignUp<u32>(z_offset + in_depth_size, 256);
// 1. Copy the depth and stencil blocks to separate banks
VkBufferImageCopy sub_regions[2];
@ -246,7 +247,7 @@ namespace vk
ensure(src->size() >= allocation_end);
const auto data_offset = u32(region.bufferOffset);
const auto z32_offset = utils::align<u32>(data_offset + packed16_length, 256);
const auto z32_offset = rx::alignUp<u32>(data_offset + packed16_length, 256);
// 1. Pre-compute barrier
vk::insert_buffer_memory_barrier(cmd, src->value, z32_offset, packed32_length,
@ -281,11 +282,11 @@ namespace vk
ensure(src->size() >= allocation_end); // "Out of memory (compute heap). Lower your resolution scale setting."
const auto data_offset = u32(region.bufferOffset);
const auto z_offset = utils::align<u32>(data_offset + packed_length, 256);
const auto s_offset = utils::align<u32>(z_offset + in_depth_size, 256);
const auto z_offset = rx::alignUp<u32>(data_offset + packed_length, 256);
const auto s_offset = rx::alignUp<u32>(z_offset + in_depth_size, 256);
// Zero out the stencil block
VK_GET_SYMBOL(vkCmdFillBuffer)(cmd, src->value, s_offset, utils::align(in_stencil_size, 4), 0);
VK_GET_SYMBOL(vkCmdFillBuffer)(cmd, src->value, s_offset, rx::alignUp(in_stencil_size, 4), 0);
vk::insert_buffer_memory_barrier(cmd, src->value, s_offset, in_stencil_size,
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
@ -848,7 +849,7 @@ namespace vk
const auto src_offset = section.bufferOffset;
// Align output to 128-byte boundary to keep some drivers happy
dst_offset = utils::align(dst_offset, 128);
dst_offset = rx::alignUp(dst_offset, 128);
u32 data_length = 0;
for (unsigned i = 0, j = packet.first; i < packet.second; ++i, ++j)
@ -1124,7 +1125,7 @@ namespace vk
if (layout.level == 0)
{
// Align mip0 on a 128-byte boundary
scratch_offset = utils::align(scratch_offset, 128);
scratch_offset = rx::alignUp(scratch_offset, 128);
}
// Copy from upload heap to scratch mem
@ -1254,7 +1255,7 @@ namespace vk
{
// Calculate the true length of the usable memory section
const auto available_tile_size = tiled_region.tile->size - (range.start - tiled_region.base_address);
const auto max_content_size = tiled_region.tile->pitch * utils::align<u32>(height, 64);
const auto max_content_size = tiled_region.tile->pitch * rx::alignUp<u32>(height, 64);
const auto section_length = std::min(max_content_size, available_tile_size);
// Sync the DMA layer

View file

@ -4,7 +4,7 @@
#include "VKCompute.h"
#include "VKAsyncScheduler.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
namespace vk
{
@ -450,7 +450,7 @@ namespace vk
// Dimensions were given in 'dst' space. Work out the real source coordinates
const auto src_bpp = vk::get_format_texel_width(section.src->format());
src_x = (src_x * dst_bpp) / src_bpp;
src_w = utils::aligned_div<u16>(src_w * dst_bpp, src_bpp);
src_w = rx::aligned_div<u16>(src_w * dst_bpp, src_bpp);
transform &= ~(rsx::surface_transform::coordinate_transform);
}

View file

@ -4,11 +4,14 @@
#include "VKRenderTargets.h"
#include "VKResourceManager.h"
#include "VKRenderPass.h"
#include "VKGSRenderTypes.hpp"
#include "vkutils/image_helpers.h"
#include "../Common/texture_cache.h"
#include "../Common/tiled_dma_copy.hpp"
#include "rx/align.hpp"
#include <memory>
#include <vector>
@ -289,7 +292,7 @@ namespace vk
if (tiled_region)
{
const auto available_tile_size = tiled_region.tile->size - (range.start - tiled_region.base_address);
const auto max_content_size = tiled_region.tile->pitch * utils::align(height, 64);
const auto max_content_size = tiled_region.tile->pitch * rx::alignUp(height, 64);
flush_length = std::min(max_content_size, available_tile_size);
}

View file

@ -117,8 +117,8 @@ namespace vk
configure(cmd);
constexpr auto wg_size = 16;
const auto invocations_x = utils::aligned_div(output_size.width, wg_size);
const auto invocations_y = utils::aligned_div(output_size.height, wg_size);
const auto invocations_x = rx::aligned_div(output_size.width, wg_size);
const auto invocations_y = rx::aligned_div(output_size.height, wg_size);
ensure(invocations_x == (output_size.width + (wg_size - 1)) / wg_size);
ensure(invocations_y == (output_size.height + (wg_size - 1)) / wg_size);

View file

@ -6,6 +6,7 @@
#include "../VKHelpers.h"
#include "../VKResourceManager.h"
#include "Emu/IdManager.h"
#include "rx/align.hpp"
#include <memory>
@ -60,7 +61,7 @@ namespace vk
// Create new heap. All sizes are aligned up by 64M, up to 1GiB
const usz size_limit = 1024 * 0x100000;
usz aligned_new_size = utils::align(m_size + size, 64 * 0x100000);
usz aligned_new_size = rx::alignUp(m_size + size, 64 * 0x100000);
if (aligned_new_size >= size_limit)
{

View file

@ -4,7 +4,8 @@
#include "../VKResourceManager.h"
#include <util/asm.hpp>
#include <rx/align.hpp>
#include <rx/asm.hpp>
namespace vk
{
@ -123,8 +124,8 @@ namespace vk
{
auto create_texture = [&]()
{
u32 new_width = utils::align(requested_width, 256u);
u32 new_height = utils::align(requested_height, 256u);
u32 new_width = rx::alignUp(requested_width, 256u);
u32 new_height = rx::alignUp(requested_height, 256u);
return new vk::image(*g_render_device, g_render_device->get_memory_mapping().device_local, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
VK_IMAGE_TYPE_2D, format, new_width, new_height, 1, 1, 1, VK_SAMPLE_COUNT_1_BIT, VK_IMAGE_LAYOUT_UNDEFINED,
@ -165,7 +166,7 @@ namespace vk
if (!scratch_buffer)
{
// Choose optimal size
const u64 alloc_size = utils::align(min_required_size, 0x100000);
const u64 alloc_size = rx::alignUp(min_required_size, 0x100000);
scratch_buffer = std::make_unique<vk::buffer>(*g_render_device, alloc_size,
g_render_device->get_memory_mapping().device_local, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
@ -184,7 +185,7 @@ namespace vk
if (init_mem || zero_memory)
{
// Zero-initialize the allocated VRAM
const u64 zero_length = init_mem ? buf->size() : utils::align(min_required_size, 4);
const u64 zero_length = init_mem ? buf->size() : rx::alignUp(min_required_size, 4);
VK_GET_SYMBOL(vkCmdFillBuffer)(cmd, buf->value, 0, zero_length, 0);
insert_buffer_memory_barrier(cmd, buf->value, 0, zero_length,

View file

@ -9,7 +9,7 @@
#include "Emu/Cell/timers.hpp"
#include "util/sysinfo.hpp"
#include "util/asm.hpp"
#include "rx/asm.hpp"
namespace vk
{
@ -170,7 +170,7 @@ namespace vk
{
while (!flushed)
{
utils::pause();
rx::pause();
}
}
@ -553,7 +553,7 @@ namespace vk
switch (status)
{
case VK_NOT_READY:
utils::pause();
rx::pause();
continue;
default:
die_with_error(status);
@ -592,7 +592,7 @@ namespace vk
if (timeout)
{
const auto now = freq ? utils::get_tsc() : get_system_time();
const auto now = freq ? rx::get_tsc() : get_system_time();
if (!start)
{
@ -608,7 +608,7 @@ namespace vk
}
}
utils::pause();
rx::pause();
}
}
} // namespace vk
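
rx::get_tsc is assumed to be a raw timestamp-counter read, as utils::get_tsc was, which is why the wait loop above only uses it when a TSC frequency is known and otherwise falls back to get_system_time(). A sketch:

// Sketch: assumed implementation of rx::get_tsc (raw timestamp-counter read).
#include <cstdint>

#if defined(__x86_64__)
#include <x86intrin.h>
inline std::uint64_t get_tsc() { return __rdtsc(); }
#elif defined(__aarch64__)
inline std::uint64_t get_tsc()
{
    std::uint64_t cnt;
    __asm__ volatile("mrs %0, cntvct_el0" : "=r"(cnt)); // generic-timer count as a TSC stand-in
    return cnt;
}
#endif

// Raw ticks are only comparable against a deadline when the tick frequency is
// known, hence the `freq ? rx::get_tsc() : get_system_time()` pattern above.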

Some files were not shown because too many files have changed in this diff.