diff --git a/kernel/include/kernel/MemoryResource.hpp b/kernel/include/kernel/MemoryResource.hpp
index 2a78eabdc..adac526c4 100644
--- a/kernel/include/kernel/MemoryResource.hpp
+++ b/kernel/include/kernel/MemoryResource.hpp
@@ -317,8 +317,7 @@ struct AllocableResource : Resource {
       } else {
         if (flags & AllocationFlags::Stack) {
           fixedRange = rx::AddressRange::fromBeginSize(
-              rx::alignDown(it.endAddress() - size, alignment),
-              size);
+              rx::alignDown(it.endAddress() - size, alignment), size);
         } else {
           fixedRange = rx::AddressRange::fromBeginSize(
               rx::alignUp(it.beginAddress(), alignment), size);
@@ -353,6 +352,10 @@ struct AllocableResource : Resource {
       return {it, {}, fixedRange};
     }
 
+    return {merge(it), {}, fixedRange};
+  }
+
+  iterator merge(iterator it) {
     if (it != begin()) {
       // try to merge with previous node
       iterator prevIt = it;
@@ -393,7 +396,7 @@ struct AllocableResource : Resource {
       }
     }
 
-    return {it, {}, fixedRange};
+    return it;
   }
 
   void destroy() {
diff --git a/kernel/orbis/CMakeLists.txt b/kernel/orbis/CMakeLists.txt
index 98eab3cb4..e7981e1e6 100644
--- a/kernel/orbis/CMakeLists.txt
+++ b/kernel/orbis/CMakeLists.txt
@@ -78,6 +78,7 @@ add_library(obj.orbis-kernel OBJECT
 )
 
 target_link_libraries(obj.orbis-kernel PUBLIC orbis::kernel::config rx kernel)
+target_compile_options(obj.orbis-kernel PRIVATE "-mfsgsbase")
 
 target_include_directories(obj.orbis-kernel
     PUBLIC
diff --git a/kernel/orbis/include/orbis/MemoryType.hpp b/kernel/orbis/include/orbis/MemoryType.hpp
index fae3dd018..886dcafe5 100644
--- a/kernel/orbis/include/orbis/MemoryType.hpp
+++ b/kernel/orbis/include/orbis/MemoryType.hpp
@@ -6,7 +6,7 @@ namespace orbis {
 enum class MemoryType : std::uint32_t {
   Invalid = -1u,
   WbOnion = 0,   // write back, CPU bus
-  WCGarlic = 3,  // combining, GPU bus
+  WcGarlic = 3,  // combining, GPU bus
   WbGarlic = 10, // write back, GPU bus
 };
 }
diff --git a/kernel/orbis/include/orbis/dmem.hpp b/kernel/orbis/include/orbis/dmem.hpp
index 3dd91c639..65caf883c 100644
--- a/kernel/orbis/include/orbis/dmem.hpp
+++ b/kernel/orbis/include/orbis/dmem.hpp
@@ -35,6 +35,10 @@ allocate(unsigned dmemIndex, rx::AddressRange searchRange, std::uint64_t len,
          MemoryType memoryType, std::uint64_t alignment = kPageSize,
          bool pooled = false);
 
+std::pair<std::uint64_t, ErrorCode>
+allocateSystem(unsigned dmemIndex, std::uint64_t len, MemoryType memoryType,
+               std::uint64_t alignment = kPageSize);
+
 ErrorCode release(unsigned dmemIndex, rx::AddressRange range,
                   bool pooled = false);
 
@@ -54,9 +58,17 @@ std::pair<rx::AddressRange, ErrorCode>
 getAvailSize(unsigned dmemIndex, rx::AddressRange searchRange,
              std::uint64_t alignment);
 
-ErrorCode map(unsigned dmemIndex, rx::AddressRange range, std::uint64_t offset,
+ErrorCode map(orbis::Process *process, unsigned dmemIndex,
+              rx::AddressRange range, std::uint64_t offset,
               rx::EnumBitSet<vmem::Protection> protection);
 
+ErrorCode notifyUnmap(orbis::Process *process, unsigned dmemIndex,
+                      std::uint64_t offset, rx::AddressRange range);
+
+ErrorCode protect(orbis::Process *process, unsigned dmemIndex,
+                  rx::AddressRange range,
+                  rx::EnumBitSet<vmem::Protection> prot);
+
 std::pair<std::uint64_t, ErrorCode> getPmemOffset(unsigned dmemIndex,
                                                   std::uint64_t dmemOffset);
 } // namespace orbis::dmem
diff --git a/kernel/orbis/include/orbis/module/Module.hpp b/kernel/orbis/include/orbis/module/Module.hpp
index 4b17537a3..31edb2aae 100644
--- a/kernel/orbis/include/orbis/module/Module.hpp
+++ b/kernel/orbis/include/orbis/module/Module.hpp
@@ -105,6 +105,7 @@ struct Module final {
 
   DynType dynType = DynType::None;
 
+  uint32_t refCount{};
   uint32_t phNum{};
   uint64_t phdrAddress{};
 
diff --git a/kernel/orbis/include/orbis/sys/sysproto.hpp b/kernel/orbis/include/orbis/sys/sysproto.hpp
index d2f0b8ff9..e40e6b912 100644
--- a/kernel/orbis/include/orbis/sys/sysproto.hpp
+++ b/kernel/orbis/include/orbis/sys/sysproto.hpp
@@ -786,7 +786,7 @@ SysResult sys_localtime_to_utc(Thread *thread, int64_t time, uint unk,
 SysResult sys_set_uevt(Thread *thread /* TODO */);
 SysResult sys_get_cpu_usage_proc(Thread *thread /* TODO */);
 SysResult sys_get_map_statistics(Thread *thread /* TODO */);
-SysResult sys_set_chicken_switches(Thread *thread /* TODO */);
+SysResult sys_set_chicken_switches(Thread *thread, sint flags);
 SysResult sys_extend_page_table_pool(Thread *thread);
 SysResult sys_extend_page_table_pool2(Thread *thread);
 SysResult sys_get_kernel_mem_statistics(Thread *thread /* TODO */);
diff --git a/kernel/orbis/include/orbis/thread/Process.hpp b/kernel/orbis/include/orbis/thread/Process.hpp
index 6a8d8d19f..b9b546430 100644
--- a/kernel/orbis/include/orbis/thread/Process.hpp
+++ b/kernel/orbis/include/orbis/thread/Process.hpp
@@ -20,6 +20,7 @@
 #include "rx/Serializer.hpp"
 #include "rx/SharedMutex.hpp"
 #include <optional>
+#include <type_traits>
 
 namespace orbis {
 class KernelContext;
@@ -93,6 +94,7 @@ struct Process final {
   std::optional<sint> exitStatus;
 
   std::uint32_t sdkVersion = 0;
+  bool allowDmemAliasing = false;
   std::uint64_t nextTlsSlot = 1;
   std::uint64_t lastTlsOffset = 0;
 
@@ -126,6 +128,50 @@ struct Process final {
   }
 
   Budget *getBudget() const;
+
+  template <typename Cb> // runs `fn` through invokeImpl and returns its result
+    requires(alignof(Cb) <= 8 && sizeof(Cb) <= 64) &&
+            (std::is_same_v<std::invoke_result_t<Cb>, void> ||
+             (alignof(std::invoke_result_t<Cb>) <= 8 &&
+              sizeof(std::invoke_result_t<Cb>) <= 64))
+  std::invoke_result_t<Cb> invoke(Cb &&fn) {
+    auto constructObject = [](void *to, void *from) {
+      new (to) Cb(std::move(*reinterpret_cast<Cb *>(from)));
+    };
+    // Type-erased move-construct / destroy helpers for the callable, passed to invokeImpl.
+    auto destroyObject = [](void *object) {
+      reinterpret_cast<Cb *>(object)->~Cb();
+    };
+    // NOTE(review): an lvalue argument deduces Cb as a reference, so `Cb *` would be ill-formed — confirm only rvalues are passed.
+    if constexpr (std::is_same_v<std::invoke_result_t<Cb>, void>) {
+      invokeImpl(
+          nullptr, nullptr, &fn, constructObject, destroyObject,
+          [](void *, void *fnPtr) { (*reinterpret_cast<Cb *>(fnPtr))(); });
+    } else {
+      alignas(std::invoke_result_t<Cb>) char
+          result[sizeof(std::invoke_result_t<Cb>)];
+      invokeImpl(
+          &result,
+          [](void *to, void *from) {
+            new (to) std::invoke_result_t<Cb>(
+                std::move(*reinterpret_cast<std::invoke_result_t<Cb> *>(from)));
+          },
+          &fn, constructObject, destroyObject,
+          [](void *result, void *fnPtr) {
+            new (result)
+                std::invoke_result_t<Cb>((*reinterpret_cast<Cb *>(fnPtr))());
+          });
+      return std::move(*reinterpret_cast<std::invoke_result_t<Cb> *>(result)); // NOTE(review): the object left in `result` is never destroyed
+    }
+  }
+
+  void invokeAsync(void (*fn)());
+
+private:
+  void invokeImpl(void *returnValue, void (*copyResult)(void *to, void *from),
+                  void *fnPtr, void (*constructObject)(void *to, void *from),
+                  void (*destroyObject)(void *to),
+                  void (*invokeImpl)(void *returnValue, void *fnPtr));
 };
 
 pid_t allocatePid();
diff --git a/kernel/orbis/include/orbis/vmem.hpp b/kernel/orbis/include/orbis/vmem.hpp
index 461eb274c..6beeae0ac 100644
--- a/kernel/orbis/include/orbis/vmem.hpp
+++ b/kernel/orbis/include/orbis/vmem.hpp
@@ -44,6 +44,7 @@ enum class BlockFlagsEx : std::uint8_t {
   Private,
   Shared,
   PoolControl,
+  Void,
   Reserved,
 
   bitset_last = Reserved
@@ -149,6 +150,31 @@ toGpuProtection(rx::EnumBitSet<Protection> prot) {
   return result;
 }
 
+inline bool validateProtection(rx::EnumBitSet<Protection> &prot) {
+  // Normalizes `prot` in place: keep the low 8 raw bits, then validate.
+  using ProtSet = rx::EnumBitSet<Protection>;
+  prot = ProtSet::fromUnderlying(prot.toUnderlying() & 0xff);
+  auto supported = kProtCpuAll | kProtGpuAll;
+  if (prot & ~supported) {
+    return false; // a bit outside the CPU/GPU masks is set
+  }
+  if (prot & Protection::CpuWrite) {
+    prot |= Protection::CpuRead; // CPU write always comes with CPU read
+  }
+  return true;
+}
+
+inline bool validateMemoryType(MemoryType type,
+                               rx::EnumBitSet<Protection> prot) {
+  // Only WbGarlic is restricted: it rejects CPU or GPU write protection.
+  // Every other memory type accepts any protection set.
+  if (type != MemoryType::WbGarlic) {
+    return true;
+  }
+  const auto anyWrite = Protection::CpuWrite | Protection::GpuWrite;
+  return !(prot & anyWrite);
+}
+
 void initialize(Process *process, bool force = false);
 void fork(Process *process, Process *parentThread);
 
@@ -164,22 +190,27 @@ mapFile(Process *process, std::uint64_t addressHint, std::uint64_t size,
         rx::EnumBitSet<Protection> prot, rx::EnumBitSet<BlockFlags> blockFlags,
         rx::EnumBitSet<BlockFlagsEx> blockFlagsEx, File *file,
         std::uint64_t fileOffset, std::string_view name = {},
-        std::uint64_t alignment = kPageSize,
+        std::uint64_t alignment = kPageSize, std::uint64_t callerAddress = 0,
         MemoryType type = MemoryType::Invalid);
 
-std::pair<rx::AddressRange, ErrorCode>
-mapDirect(Process *process, std::uint64_t addressHint,
-          rx::AddressRange directRange, rx::EnumBitSet<Protection> prot,
-          rx::EnumBitSet<AllocationFlags> allocFlags,
-          std::string_view name = {}, std::uint64_t alignment = kPageSize,
-          MemoryType type = MemoryType::Invalid);
+std::pair<rx::AddressRange, ErrorCode> mapDirect(
+    Process *process, std::uint64_t addressHint, rx::AddressRange directRange,
+    rx::EnumBitSet<Protection> prot, rx::EnumBitSet<AllocationFlags> allocFlags,
+    std::string_view name = {}, std::uint64_t alignment = kPageSize,
+    std::uint64_t callerAddress = 0, MemoryType type = MemoryType::Invalid);
 
 std::pair<rx::AddressRange, ErrorCode>
 mapFlex(Process *process, std::uint64_t size, rx::EnumBitSet<Protection> prot,
         std::uint64_t addressHint = 0,
         rx::EnumBitSet<AllocationFlags> allocFlags = {},
         rx::EnumBitSet<BlockFlags> blockFlags = {}, std::string_view name = {},
-        std::uint64_t alignment = kPageSize);
+        std::uint64_t alignment = kPageSize, std::uint64_t callerAddress = 0);
+
+std::pair<rx::AddressRange, ErrorCode>
+mapVoid(Process *process, std::uint64_t size, std::uint64_t addressHint = 0,
+        rx::EnumBitSet<AllocationFlags> allocFlags = {},
+        std::string_view name = {}, std::uint64_t alignment = kPageSize,
+        std::uint64_t callerAddress = 0);
 
 std::pair<rx::AddressRange, ErrorCode>
 commitPooled(Process *process, rx::AddressRange addressRange, MemoryType type,
@@ -191,7 +222,6 @@ ErrorCode protect(Process *process, rx::AddressRange range,
 ErrorCode unmap(Process *process, rx::AddressRange range);
 ErrorCode setName(Process *process, rx::AddressRange range,
                   std::string_view name);
-ErrorCode setType(Process *process, rx::AddressRange range, MemoryType type);
 ErrorCode setTypeAndProtect(Process *process, rx::AddressRange range,
                             MemoryType type, rx::EnumBitSet<Protection> prot);
 std::optional<QueryResult> query(Process *process, std::uint64_t address,
diff --git a/kernel/orbis/src/KernelAllocator.cpp b/kernel/orbis/src/KernelAllocator.cpp
index dd6e6d0e3..ff361bcc1 100644
--- a/kernel/orbis/src/KernelAllocator.cpp
+++ b/kernel/orbis/src/KernelAllocator.cpp
@@ -76,8 +76,6 @@ void initializeAllocator() {
   sMemoryResource->m_heap_next = ptr + sizeof(KernelMemoryResource);
   sMemoryResource->m_heap = std::move(heap);
 
-  rx::print(stderr, "global: size {}, alignment {}\n", GlobalStorage::GetSize(),
-            GlobalStorage::GetAlignment());
   // allocate whole global storage
   g_globalStorage = (std::byte *)sMemoryResource->kalloc(
       GlobalStorage::GetSize(), GlobalStorage::GetAlignment());
diff --git a/kernel/orbis/src/dmem.cpp b/kernel/orbis/src/dmem.cpp
index 2fab16b05..3664722fc 100644
--- a/kernel/orbis/src/dmem.cpp
+++ b/kernel/orbis/src/dmem.cpp
@@ -1,5 +1,6 @@
 #include "dmem.hpp"
 #include "KernelAllocator.hpp"
+#include "KernelContext.hpp"
 #include "KernelObject.hpp"
 #include "error.hpp"
 #include "kernel/KernelObject.hpp"
@@ -14,6 +15,7 @@
 #include <array>
 #include <rx/format.hpp>
 #include <string_view>
+#include <thread/Process.hpp>
 #include <utility>
 
 struct DirectMemoryAllocation {
@@ -21,6 +23,35 @@ struct DirectMemoryAllocation {
   static constexpr std::uint32_t kPooledBit = 1 << 30;
   std::uint32_t type = 0;
 
+  struct Mapping { // a process and the virtual range where it mapped this allocation
+    orbis::Process *process;
+    rx::AddressRange vmRange;
+    // The process is written as its pid; deserialize looks it up again.
+    void serialize(rx::Serializer &s) const {
+      s.serialize(process->pid);
+      s.serialize(vmRange);
+    }
+    // Flags the deserializer as failed when the saved pid has no live process.
+    void deserialize(rx::Deserializer &d) {
+      auto pid = d.deserialize<orbis::pid_t>();
+      if (d.failure()) {
+        return;
+      }
+      auto foundProcess = orbis::findProcessById(pid);
+      if (foundProcess == nullptr) {
+        d.setFailure();
+        return;
+      }
+
+      process = foundProcess;
+      d.deserialize(vmRange);
+    }
+
+    bool operator==(const Mapping &) const = default;
+  };
+
+  orbis::kvector<Mapping> mappings;
+
   [[nodiscard]] bool isPooled() const { return type & kPooledBit; }
   void markAsPooled() { type |= kPooledBit | kAllocatedBit; }
 
@@ -42,7 +73,9 @@ struct DirectMemoryAllocation {
   [[nodiscard]] DirectMemoryAllocation
   merge(const DirectMemoryAllocation &other, rx::AddressRange,
         rx::AddressRange) const {
-    return other;
+    auto result = *this;
+    result.mappings.insert_range(result.mappings.end(), other.mappings);
+    return result;
   }
 
   bool operator==(const DirectMemoryAllocation &) const = default;
@@ -60,6 +93,8 @@ struct DirectMemoryResource
   using BaseResource =
       kernel::AllocableResource<DirectMemoryAllocation, orbis::kallocator>;
 
+  BaseResource systemResource;
+
   void create(std::size_t size) {
     auto [pmemRange, errc] = orbis::pmem::allocate(0, size, {}, 64 * 1024);
 
@@ -86,6 +121,8 @@ struct DirectMemoryResource
     if (result != std::errc{}) {
       return orbis::toErrorCode(result);
     }
+
+    systemResource.destroy();
     dmemReservedSize = 0;
     return {};
   }
@@ -99,6 +136,22 @@ struct DirectMemoryResource
     d.deserialize(static_cast<DirectMemoryResourceState &>(*this));
     BaseResource::deserialize(d);
   }
+
+  std::pair<std::uint64_t, orbis::ErrorCode> reserveSystem(std::uint64_t size) {
+    DirectMemoryAllocation alloc;
+    alloc.setMemoryType(orbis::MemoryType::WbOnion);
+    auto result = map(0, size, alloc, orbis::AllocationFlags::Stack,
+                      orbis::dmem::kPageSize);
+    // Purpose: find a `size`-byte range in this pool, then move it into systemResource.
+    if (result.errc != std::errc{}) {
+      return {{}, orbis::toErrorCode(result.errc)};
+    }
+    // Remove the range from this resource and register it in systemResource with a default payload.
+    allocations.unmap(result.range);
+    dmemReservedSize += size;
+    systemResource.allocations.map(result.range, {});
+    return {result.range.beginAddress(), {}}; // NOTE(review): the removed dmem::reserveSystem body OR'ed 0x4000000000 into this address — confirm callers
+  }
 };
 
 static std::array g_dmemPools = {
@@ -111,10 +164,12 @@ static std::array g_dmemPools = {
 };
 
 orbis::ErrorCode orbis::dmem::initialize() {
-  g_dmemPools[0]->create(0x120000000);
+  g_dmemPools[0]->create(0x180000000);
   g_dmemPools[1]->create(0x1000000);
   g_dmemPools[2]->create(0x1000000);
 
+  g_dmemPools[0]->reserveSystem(0x60000000);
+
   return {};
 }
 
@@ -222,6 +277,43 @@ orbis::dmem::allocate(unsigned dmemIndex, rx::AddressRange searchRange,
   return {result, {}};
 }
 
+std::pair<std::uint64_t, orbis::ErrorCode>
+orbis::dmem::allocateSystem(unsigned dmemIndex, std::uint64_t len,
+                            MemoryType memoryType, std::uint64_t alignment) {
+  if (dmemIndex >= std::size(g_dmemPools)) {
+    return {{}, ErrorCode::INVAL};
+  }
+  // Picks an address via the pool's systemResource and records the allocation in the pool.
+  auto dmem = g_dmemPools[dmemIndex];
+  std::lock_guard lock(*dmem);
+  // Zero alignment falls back to kPageSize; len is rounded up to a dmem page.
+  alignment = alignment == 0 ? kPageSize : alignment;
+  len = rx::alignUp(len, dmem::kPageSize);
+
+  DirectMemoryAllocation allocation;
+  allocation.setMemoryType(memoryType);
+  // NOTE(review): presumably Dry only searches systemResource without committing — confirm the range gets marked used somewhere.
+  auto allocResult = dmem->systemResource.map(0, len, allocation,
+                                              AllocationFlags::Dry, alignment);
+  if (allocResult.errc != std::errc{}) {
+    return {{}, ErrorCode::AGAIN};
+  }
+  // Record the allocation in the pool itself at the address found above; failure is fatal.
+  auto result = allocResult.range.beginAddress();
+
+  auto commitResult =
+      dmem->map(result, len, allocation, AllocationFlags::Fixed, alignment);
+
+  rx::dieIf(commitResult.errc != std::errc{},
+            "dmem: failed to commit main memory, error {}", commitResult.errc);
+
+  dmemDump(dmemIndex, rx::format("allocated main {:x}-{:x}",
+                                 allocResult.range.beginAddress(),
+                                 allocResult.range.endAddress()));
+  // The returned value is the range's begin address with bit 0x4000000000 set.
+  return {allocResult.range.beginAddress() | 0x4000000000, {}};
+}
+
 orbis::ErrorCode orbis::dmem::release(unsigned dmemIndex,
                                       rx::AddressRange range, bool pooled) {
   if (dmemIndex >= std::size(g_dmemPools)) {
@@ -253,6 +345,13 @@ orbis::ErrorCode orbis::dmem::release(unsigned dmemIndex,
     return ErrorCode::NOENT;
   }
 
+  for (auto mapping : it->mappings) {
+    mapping.process->invoke(
+        [=] { vmem::unmap(mapping.process, mapping.vmRange); });
+  }
+
+  it->mappings.clear();
+
   DirectMemoryAllocation allocation{};
   auto result = dmem->map(range.beginAddress(), range.size(), allocation,
                           AllocationFlags::Fixed, vmem::kPageSize);
@@ -261,6 +360,8 @@ orbis::ErrorCode orbis::dmem::release(unsigned dmemIndex,
     return toErrorCode(result.errc);
   }
 
+  dmemDump(dmemIndex, rx::format("released {:x}-{:x}", range.beginAddress(),
+                                 range.endAddress()));
   return {};
 }
 
@@ -281,16 +382,7 @@ orbis::dmem::reserveSystem(unsigned dmemIndex, std::uint64_t size) {
     return {{}, ErrorCode::NOMEM};
   }
 
-  DirectMemoryAllocation alloc;
-  alloc.setMemoryType(MemoryType::WbOnion);
-  auto result = dmem->map(0, size, alloc, AllocationFlags::Stack, kPageSize);
-
-  if (result.errc != std::errc{}) {
-    return {{}, toErrorCode(result.errc)};
-  }
-
-  dmem->dmemReservedSize += size;
-  return {result.range.beginAddress() | 0x4000000000, {}};
+  return dmem->reserveSystem(size);
 }
 
 std::pair<std::uint64_t, orbis::ErrorCode>
@@ -465,7 +557,7 @@ orbis::dmem::getAvailSize(unsigned dmemIndex, rx::AddressRange searchRange,
 
   alignment = alignment == 0 ? kPageSize : alignment;
 
-  if (alignment % kPageSize) {
+  if (alignment % vmem::kPageSize) {
     return {{}, ErrorCode::INVAL};
   }
 
@@ -474,7 +566,9 @@ orbis::dmem::getAvailSize(unsigned dmemIndex, rx::AddressRange searchRange,
   if (searchRange.endAddress() > dmem->dmemTotalSize) {
     ORBIS_LOG_ERROR(__FUNCTION__, "out of direct memory size",
                     searchRange.endAddress(), dmem->dmemTotalSize);
-    return {{}, orbis::ErrorCode::INVAL};
+    // return {{}, orbis::ErrorCode::INVAL};
+    searchRange = rx::AddressRange::fromBeginEnd(searchRange.beginAddress(),
+                                                 dmem->dmemTotalSize);
   }
 
   if (!searchRange.isValid() &&
@@ -502,7 +596,11 @@ orbis::dmem::getAvailSize(unsigned dmemIndex, rx::AddressRange searchRange,
     ORBIS_LOG_ERROR(__FUNCTION__, "out of direct memory size",
                     searchRange.endAddress(), dmem->dmemTotalSize,
                     dmem->dmemReservedSize);
-    return {{}, orbis::ErrorCode::INVAL};
+    searchRange = rx::AddressRange::fromBeginEnd(searchRange.beginAddress(),
+                                                 dmem->dmemTotalSize -
+                                                     dmem->dmemReservedSize);
+
+    // return {{}, orbis::ErrorCode::INVAL};
   }
 
   auto it = dmem->lowerBound(searchRange.beginAddress());
@@ -529,8 +627,8 @@ orbis::dmem::getAvailSize(unsigned dmemIndex, rx::AddressRange searchRange,
   return {result, {}};
 }
 
-orbis::ErrorCode orbis::dmem::map(unsigned dmemIndex, rx::AddressRange range,
-                                  std::uint64_t offset,
+orbis::ErrorCode orbis::dmem::map(orbis::Process *process, unsigned dmemIndex,
+                                  rx::AddressRange range, std::uint64_t offset,
                                   rx::EnumBitSet<vmem::Protection> protection) {
   if (dmemIndex >= std::size(g_dmemPools)) {
     return ErrorCode::INVAL;
@@ -569,6 +667,15 @@ orbis::ErrorCode orbis::dmem::map(unsigned dmemIndex, rx::AddressRange range,
     return orbis::ErrorCode::ACCES;
   }
 
+  if (!vmem::validateMemoryType(allocationInfoIt->getMemoryType(),
+                                protection)) {
+    return ErrorCode::ACCES;
+  }
+
+  if (!allocationInfoIt->mappings.empty() && !process->allowDmemAliasing) {
+    return ErrorCode::INVAL;
+  }
+
   auto directRange = rx::AddressRange::fromBeginSize(offset, range.size())
                          .intersection(allocationInfoIt.range());
 
@@ -579,8 +686,96 @@ orbis::ErrorCode orbis::dmem::map(unsigned dmemIndex, rx::AddressRange range,
   auto physicalRange =
       rx::AddressRange::fromBeginSize(dmem->pmemOffset + offset, range.size());
 
-  return orbis::pmem::map(range.beginAddress(), physicalRange,
-                          vmem::toCpuProtection(protection));
+  auto result = process->invoke([=] {
+    return orbis::pmem::map(range.beginAddress(), physicalRange,
+                            vmem::toCpuProtection(protection));
+  });
+
+  if (result == ErrorCode{}) {
+    allocationInfoIt->mappings.push_back({
+        .process = process,
+        .vmRange = range,
+    });
+  }
+
+  return result;
+}
+
+orbis::ErrorCode orbis::dmem::notifyUnmap(orbis::Process *process,
+                                          unsigned dmemIndex,
+                                          std::uint64_t offset,
+                                          rx::AddressRange range) {
+  // Forgets the part of `process`'s recorded mappings that the unmapped
+  // virtual `range` covers, for the allocation found at dmem `offset`.
+  if (dmemIndex >= std::size(g_dmemPools)) {
+    return ErrorCode::INVAL;
+  }
+
+  auto dmem = g_dmemPools[dmemIndex];
+  std::lock_guard lock(*dmem);
+
+  auto it = dmem->query(offset);
+  if (it == dmem->end()) {
+    return ErrorCode::INVAL;
+  }
+
+  for (auto mapIt = it->mappings.begin(); mapIt != it->mappings.end();) {
+    if (mapIt->process != process || !mapIt->vmRange.intersects(range)) {
+      ++mapIt;
+      continue;
+    }
+
+    // Clamp the unmapped range to this mapping: an unmap that reaches past
+    // either end must not produce an inverted remainder range.
+    const auto cut = mapIt->vmRange.intersection(range);
+    const auto lo = mapIt->vmRange.beginAddress();
+    const auto hi = mapIt->vmRange.endAddress();
+    const bool keepHead = lo != cut.beginAddress();
+    const bool keepTail = hi != cut.endAddress();
+
+    if (!keepHead && !keepTail) {
+      mapIt = it->mappings.erase(mapIt); // nothing of the mapping is left
+      continue;
+    }
+
+    if (keepHead && keepTail) {
+      auto tail = rx::AddressRange::fromBeginEnd(cut.endAddress(), hi);
+      mapIt->vmRange = rx::AddressRange::fromBeginEnd(lo, cut.beginAddress());
+      it->mappings.push_back({.process = process, .vmRange = tail});
+      break; // hole in the middle; push_back may have invalidated mapIt
+    }
+    mapIt->vmRange =
+        keepHead ? rx::AddressRange::fromBeginEnd(lo, cut.beginAddress())
+                 : rx::AddressRange::fromBeginEnd(cut.endAddress(), hi);
+    ++mapIt;
+  }
+
+  return {};
+}
+
+orbis::ErrorCode orbis::dmem::protect(orbis::Process *process,
+                                      unsigned dmemIndex,
+                                      rx::AddressRange range,
+                                      rx::EnumBitSet<vmem::Protection> prot) {
+  // Re-protects the recorded mappings of the allocation holding `range`.
+  if (dmemIndex >= std::size(g_dmemPools)) {
+    return ErrorCode::INVAL; // bad pool index, never index g_dmemPools with it
+  }
+  auto dmem = g_dmemPools[dmemIndex];
+  std::lock_guard lock(*dmem);
+
+  auto it = dmem->query(range.beginAddress());
+  if (it == dmem->end() || !it.range().contains(range)) {
+    return ErrorCode::INVAL; // `range` must lie inside one allocation
+  }
+
+  // A null `process` selects the mappings of every process.
+  for (auto mapping : it->mappings) {
+    if (process == nullptr || process == mapping.process) {
+      vmem::protect(mapping.process, mapping.vmRange, prot);
+    }
+  }
+  return {};
+}
 
 std::pair<std::uint64_t, orbis::ErrorCode>
diff --git a/kernel/orbis/src/module.cpp b/kernel/orbis/src/module.cpp
index f1266d1db..bc97d13f2 100644
--- a/kernel/orbis/src/module.cpp
+++ b/kernel/orbis/src/module.cpp
@@ -119,7 +119,7 @@ static orbis::SysResult doPltRelocation(orbis::Process *process,
       foundInLibs.emplace_back(std::string_view(defLib.name));
     }
 
-    for (auto nsDefModule : defModule->namespaceModules) {
+    for (auto &nsDefModule : defModule->namespaceModules) {
       for (auto defSym : nsDefModule->symbols) {
         if (defSym.id != symbol.id || defSym.bind == orbis::SymbolBind::Local) {
           continue;
diff --git a/kernel/orbis/src/sys/sys_sce.cpp b/kernel/orbis/src/sys/sys_sce.cpp
index 6184db6af..fa7dd85ef 100644
--- a/kernel/orbis/src/sys/sys_sce.cpp
+++ b/kernel/orbis/src/sys/sys_sce.cpp
@@ -78,6 +78,10 @@ orbis::sys_mtypeprotect(Thread *thread, uintptr_t addr, size_t len,
                         MemoryType type,
                         rx::EnumBitSet<vmem::Protection> prot) {
   auto range = rx::AddressRange::fromBeginSize(addr, len);
+  if (!range.isValid() || static_cast<unsigned>(type) > 10) {
+    return ErrorCode::INVAL;
+  }
+
   return vmem::setTypeAndProtect(thread->tproc, range, type, prot);
 }
 orbis::SysResult orbis::sys_regmgr_call(Thread *thread, uint32_t op,
@@ -1289,7 +1293,7 @@ orbis::sys_dynlib_get_info_ex(Thread *thread, SceKernelModule handle,
   result.ehFrameSize = module->ehFrameSize;
   std::memcpy(result.segments, module->segments, sizeof(ModuleSegment) * 2);
   result.segmentCount = 2;
-  result.refCount = 1;
+  result.refCount = module->refCount;
   ORBIS_LOG_WARNING(__FUNCTION__, result.id, result.name, result.tlsIndex,
                     result.tlsInit, result.tlsInitSize, result.tlsSize,
                     result.tlsOffset, result.tlsAlign, result.initProc,
@@ -1512,6 +1516,10 @@ orbis::SysResult orbis::sys_mmap_dmem(Thread *thread, uintptr_t addr,
   auto callerAddress = getCallerAddress(thread);
   auto alignment = dmem::kPageSize;
 
+  if (static_cast<unsigned>(memoryType) > 10) {
+    return ErrorCode::INVAL;
+  }
+
   {
     auto unpacked = unpackMapFlags(flags, dmem::kPageSize);
     alignment = unpacked.first;
@@ -1522,13 +1530,6 @@ orbis::SysResult orbis::sys_mmap_dmem(Thread *thread, uintptr_t addr,
 
   rx::EnumBitSet<AllocationFlags> allocFlags{};
 
-  if (!prot) {
-    // HACK
-    // FIXME: implement protect for pid
-    prot = vmem::Protection::CpuRead | vmem::Protection::CpuWrite |
-           vmem::Protection::GpuRead | vmem::Protection::GpuWrite;
-  }
-
   if (prot & vmem::Protection::CpuExec) {
     return ErrorCode::INVAL;
   }
@@ -1548,7 +1549,7 @@ orbis::SysResult orbis::sys_mmap_dmem(Thread *thread, uintptr_t addr,
   auto [range, errc] =
       vmem::mapDirect(thread->tproc, addr,
                       rx::AddressRange::fromBeginSize(directMemoryStart, len),
-                      prot, allocFlags, name, alignment, memoryType);
+                      prot, allocFlags, name, alignment, callerAddress, memoryType);
 
   if (errc != ErrorCode{}) {
     return errc;
@@ -1696,7 +1697,7 @@ orbis::SysResult orbis::sys_get_cpu_usage_proc(Thread *thread /* TODO */) {
 orbis::SysResult orbis::sys_get_map_statistics(Thread *thread /* TODO */) {
   return ErrorCode::NOSYS;
 }
-orbis::SysResult orbis::sys_set_chicken_switches(Thread *thread /* TODO */) {
+orbis::SysResult orbis::sys_set_chicken_switches(Thread *thread, sint flags) {
   return ErrorCode::NOSYS;
 }
 orbis::SysResult orbis::sys_extend_page_table_pool(Thread *thread) {
diff --git a/kernel/orbis/src/thread/Process.cpp b/kernel/orbis/src/thread/Process.cpp
index 9d1bebdec..1a83e0c43 100644
--- a/kernel/orbis/src/thread/Process.cpp
+++ b/kernel/orbis/src/thread/Process.cpp
@@ -4,10 +4,16 @@
 #include "KernelObject.hpp"
 #include "kernel/KernelObject.hpp"
 #include "rx/LinkedNode.hpp"
+#include "rx/Process.hpp"
 #include "rx/Serializer.hpp"
+#include "rx/SharedMutex.hpp"
 #include "rx/align.hpp"
+#include "rx/die.hpp"
 #include "thread/Thread.hpp"
 #include <algorithm>
+#include <bit>
+#include <csignal>
+#include <mutex>
 
 struct ProcessIdList {
   rx::OwningIdMap<std::uint8_t, orbis::pid_t, 256, 0> pidMap;
@@ -132,6 +138,148 @@ orbis::Process *orbis::findProcessByHostId(std::uint64_t pid) {
   return nullptr;
 }
 
+struct InvokeDeliveryState { // slot holding one pending Process::invokeImpl request
+  void (*invokeCb)(void *returnValue, void *fnPtr) = nullptr; // non-null while a request is pending
+  alignas(8) std::byte returnData[64];   // storage the callback writes its result into
+  alignas(8) std::byte callerObject[64]; // storage for the moved-in callable
+  // Runs the pending request for the current thread's process, then unlocks the slot.
+  static void invoke();
+  // Both serialization hooks are empty: nothing is persisted.
+  void serialize(rx::Serializer &) const {}
+  void deserialize(rx::Deserializer &) {}
+};
+
+struct AsyncInvokeDeliveryState { // slot holding one pending Process::invokeAsync request
+  void (*invokeCb)() = nullptr; // non-null while a request is pending
+  // Takes the pending callback, frees the slot, then calls the callback.
+  static void invoke();
+  // Both serialization hooks are empty: nothing is persisted.
+  void serialize(rx::Serializer &) const {}
+  void deserialize(rx::Deserializer &) {}
+};
+
+auto g_invokeDelivery = orbis::createProcessLocalObject<
+    kernel::LockableKernelObject<InvokeDeliveryState>>(); // slot used by Process::invokeImpl
+auto g_asyncInvokeDelivery = orbis::createProcessLocalObject<
+    kernel::LockableKernelObject<AsyncInvokeDeliveryState>>(); // slot used by Process::invokeAsync
+
+void InvokeDeliveryState::invoke() { // queued through SIGUSR2 by Process::invokeImpl
+  auto state = orbis::g_currentThread->tproc->get(g_invokeDelivery);
+  state->invokeCb(state->returnData, state->callerObject);
+  state->invokeCb = {}; // invokeImpl treats a null callback as "slot free"
+  state->unlock();      // lets the second lock() in invokeImpl proceed
+}
+
+void AsyncInvokeDeliveryState::invoke() { // queued through SIGUSR2 by Process::invokeAsync
+  auto state = orbis::g_currentThread->tproc->get(g_asyncInvokeDelivery);
+  auto cb = state->invokeCb;
+  state->invokeCb = {};
+  state->unlock();
+  // The callback runs after the unlock, so the requester does not wait for it.
+  cb();
+}
+
+struct SignalHandlerObject { // installs the SIGUSR2 handler on construction, resets it on destruction
+  SignalHandlerObject() {
+    struct sigaction act{};
+    act.sa_sigaction = handleSignal;
+    act.sa_flags = SA_SIGINFO | SA_ONSTACK;
+    // Failing to install the handler is fatal.
+    if (sigaction(SIGUSR2, &act, nullptr)) {
+      rx::die("SignalHandlerObject: failed to setup signal handler");
+    }
+  }
+  // Restores the default SIGUSR2 disposition.
+  ~SignalHandlerObject() {
+    struct sigaction act{};
+    act.sa_handler = SIG_DFL;
+
+    sigaction(SIGUSR2, &act, nullptr);
+  }
+  // NOTE(review): _readgsbase_u64/_writefsbase_u64 need -mfsgsbase and <immintrin.h>; that include is not visible in this chunk.
+  [[gnu::no_stack_protector]] static void handleSignal(int sig, siginfo_t *info,
+                                                       void *context) {
+    if (auto hostFs = _readgsbase_u64()) { // non-zero GS base is copied into FS (presumably the host FS base — confirm)
+      _writefsbase_u64(hostFs);
+    }
+    // SIGUSR2 carries the function to run in its sigval pointer.
+    if (sig == SIGUSR2) {
+      std::bit_cast<void (*)()>(info->si_value.sival_ptr)();
+    }
+
+    auto ctx = reinterpret_cast<ucontext_t *>(context);
+    // When the interrupted RIP is below orbis::kMaxAddress, write the current thread's fsBase back.
+    if (ctx->uc_mcontext.gregs[REG_RIP] < orbis::kMaxAddress) {
+      if (auto thread = orbis::g_currentThread) {
+        _writefsbase_u64(thread->fsBase);
+      }
+    }
+  }
+  // Both serialization hooks are empty: nothing is persisted.
+  void serialize(rx::Serializer &) const {}
+  void deserialize(rx::Deserializer &) {}
+};
+
+[[maybe_unused, gnu::used]] static auto g_signalHandler =
+    orbis::createGlobalObject<SignalHandlerObject>();
+
+void orbis::Process::invokeImpl(
+    void *returnValue, void (*copyResult)(void *to, void *from), void *fnPtr,
+    void (*constructObject)(void *to, void *from),
+    void (*destroyObject)(void *to),
+    void (*invokeCb)(void *returnValue, void *fnPtr)) {
+  if (rx::getCurrentProcessId() == hostPid) {
+    invokeCb(returnValue, fnPtr); // current process is the target: call directly
+    return;
+  }
+  // Otherwise post the request to the target through SIGUSR2 and wait for it.
+  auto invoker = get(g_invokeDelivery);
+  while (true) {
+    std::lock_guard lock(*invoker);
+    if (invoker->invokeCb != nullptr) {
+      continue; // a previous request is still pending, retry
+    }
+
+    invoker->invokeCb = invokeCb;
+    constructObject(invoker->callerObject, fnPtr);
+    // A failed sigqueue would make the lock() below block forever.
+    if (sigqueue(hostPid, SIGUSR2,
+                 sigval{.sival_ptr = std::bit_cast<void *>(
+                            &InvokeDeliveryState::invoke)}) != 0) {
+      rx::die("Process::invoke: failed to queue SIGUSR2");
+    }
+    invoker->lock(); // released by InvokeDeliveryState::invoke when done
+    destroyObject(invoker->callerObject);
+    if (returnValue != nullptr) {
+      copyResult(returnValue, invoker->returnData);
+    }
+    break;
+  }
+}
+
+void orbis::Process::invokeAsync(void (*fn)()) {
+  if (rx::getCurrentProcessId() == hostPid) {
+    fn(); // current process is the target: call directly
+    return;
+  }
+  // Otherwise post `fn` to the target through SIGUSR2; only pickup is awaited.
+  auto invoker = get(g_asyncInvokeDelivery);
+  while (true) {
+    std::lock_guard lock(*invoker);
+    if (invoker->invokeCb != nullptr) {
+      continue; // previous request not picked up yet, retry
+    }
+    invoker->invokeCb = fn;
+    if (sigqueue(hostPid, SIGUSR2,
+                 sigval{.sival_ptr = std::bit_cast<void *>(
+                            &AsyncInvokeDeliveryState::invoke)}) != 0) {
+      rx::die("Process::invokeAsync: failed to queue SIGUSR2");
+    }
+    invoker->lock(); // released by the handler before it runs fn
+    break;
+  }
+}
+
 void orbis::Process::serialize(rx::Serializer &s) const {
   Process::Storage::SerializeAll(storage, s);
 }
diff --git a/kernel/orbis/src/vmem.cpp b/kernel/orbis/src/vmem.cpp
index 9643c5774..e2505aa2f 100644
--- a/kernel/orbis/src/vmem.cpp
+++ b/kernel/orbis/src/vmem.cpp
@@ -45,6 +45,7 @@ struct VirtualMemoryAllocation {
   orbis::MemoryType type = orbis::MemoryType::Invalid;
   rx::Ref<orbis::IoDevice> device;
   std::uint64_t deviceOffset = 0;
+  std::uint64_t callerAddress = 0;
   rx::StaticString<31> name;
 
   [[nodiscard]] bool isAllocated() const {
@@ -56,7 +57,8 @@ struct VirtualMemoryAllocation {
   isRelated(const VirtualMemoryAllocation &other, rx::AddressRange selfRange,
             [[maybe_unused]] rx::AddressRange rightRange) const {
     if (flags != other.flags || flagsEx != other.flagsEx ||
-        prot != other.prot || type != other.type || device != other.device) {
+        prot != other.prot || type != other.type || device != other.device ||
+        callerAddress != other.callerAddress) {
       return false;
     }
 
@@ -64,13 +66,7 @@ struct VirtualMemoryAllocation {
       return true;
     }
 
-    bool isAnon = std::string_view(name).starts_with("anon:");
-
-    if (isAnon) {
-      if (!std::string_view(other.name).starts_with("anon:")) {
-        return false;
-      }
-    } else if (name != other.name) {
+    if (name != other.name) {
       return false;
     }
 
@@ -117,6 +113,40 @@ struct VirtualMemoryAllocation {
     }
   }
 
+  [[nodiscard]] bool isFlex() const { // FlexibleMemory bit set in `flags`
+    const auto flexBit = orbis::vmem::BlockFlags::FlexibleMemory;
+    return (flags & flexBit) == flexBit;
+  }
+  // True when `flags` has the DirectMemory bit set.
+  [[nodiscard]] bool isDirect() const {
+    const auto directBit = orbis::vmem::BlockFlags::DirectMemory;
+    return (flags & directBit) == directBit;
+  }
+  // True when `flags` has the PooledMemory bit set.
+  [[nodiscard]] bool isPooled() const {
+    const auto pooledBit = orbis::vmem::BlockFlags::PooledMemory;
+    return (flags & pooledBit) == pooledBit;
+  }
+  // True when `flags` is exactly PooledMemory | Commited (no other bits).
+  [[nodiscard]] bool isPoolCommited() const {
+    using orbis::vmem::BlockFlags;
+    return flags == (BlockFlags::PooledMemory | BlockFlags::Commited);
+  }
+  // True when `flags` is PooledMemory alone (so the Commited bit is clear).
+  [[nodiscard]] bool isPoolReserved() const {
+    return flags == orbis::vmem::BlockFlags::PooledMemory;
+  }
+  // True for a PooledMemory-only block whose flagsEx has PoolControl set.
+  [[nodiscard]] bool isPoolControl() const {
+    const bool isBarePool = flags == orbis::vmem::BlockFlags::PooledMemory;
+    return isBarePool && (flagsEx & orbis::vmem::BlockFlagsEx::PoolControl);
+  }
+  // True when `flagsEx` has the Void bit set.
+  [[nodiscard]] bool isVoid() const {
+    const auto voidBit = orbis::vmem::BlockFlagsEx::Void;
+    return (flagsEx & voidBit) == voidBit;
+  }
+
   void serialize(rx::Serializer &s) const {}
   void deserialize(rx::Deserializer &d) {}
 };
@@ -239,6 +269,7 @@ static void release(orbis::Process *process, decltype(g_vmInstance)::type *vmem,
     }
 
     if (it->flags & orbis::vmem::BlockFlags::DirectMemory) {
+      orbis::dmem::notifyUnmap(process, 0, it->deviceOffset, range);
       budget->release(orbis::BudgetResource::Dmem, blockRange.size());
     }
 
@@ -258,12 +289,15 @@ static void release(orbis::Process *process, decltype(g_vmInstance)::type *vmem,
   }
 }
 
-static orbis::ErrorCode validateRange(
-    decltype(g_vmInstance)::type *vmem,
-    decltype(g_vmInstance)::type::iterator it, rx::AddressRange range,
-    rx::FunctionRef<orbis::ErrorCode(const VirtualMemoryAllocation &)> cb) {
+static orbis::ErrorCode
+validateRange(decltype(g_vmInstance)::type *vmem,
+              decltype(g_vmInstance)::type::iterator it, rx::AddressRange range,
+              rx::FunctionRef<orbis::ErrorCode(const VirtualMemoryAllocation &,
+                                               rx::AddressRange)>
+                  cb) {
   while (it != vmem->end() && it.beginAddress() < range.endAddress()) {
-    if (auto errc = cb(it.get()); errc != orbis::ErrorCode{}) {
+    if (auto errc = cb(it.get(), it.range().intersection(range));
+        errc != orbis::ErrorCode{}) {
       return errc;
     }
 
@@ -273,25 +307,45 @@ static orbis::ErrorCode validateRange(
   return {};
 }
 
-static void modifyRange(
+static decltype(g_vmInstance)::type::iterator modifyRange(
     decltype(g_vmInstance)::type *vmem,
     decltype(g_vmInstance)::type::iterator it, rx::AddressRange range,
+    rx::EnumBitSet<orbis::AllocationFlags> allocFlags,
     rx::FunctionRef<void(VirtualMemoryAllocation &, rx::AddressRange)> cb) {
+  // Applies cb to each block overlapping range, splitting partial overlaps.
+  auto returnIt = it; // replaced if the first block is re-mapped
   while (it != vmem->end() && it.beginAddress() < range.endAddress()) {
-    auto mapRange = range.intersection(it.range());
-    auto allocInfo = it.get();
+    auto itRange = it.range();
+    auto mapRange = range.intersection(itRange);
+    if (mapRange == itRange) { // block fully covered: edit it in place
+      cb(it.get(), mapRange);
 
-    if (allocInfo.device != nullptr &&
-        !(allocInfo.flags & orbis::vmem::BlockFlags::PooledMemory)) {
-      allocInfo.deviceOffset += mapRange.beginAddress() - it.beginAddress();
+      if (!(allocFlags & orbis::AllocationFlags::NoMerge)) {
+        it = vmem->merge(it);
+      }
+    } else {
+      // Partial overlap: edit a copy, then re-map only the overlapping slice.
+      auto allocInfo = it.get();
+      // Non-pooled device blocks: advance deviceOffset to the slice start.
+      if (allocInfo.device != nullptr &&
+          !(allocInfo.flags & orbis::vmem::BlockFlags::PooledMemory)) {
+        allocInfo.deviceOffset += mapRange.beginAddress() - it.beginAddress();
+      }
+
+      cb(allocInfo, mapRange);
+      auto result = vmem->map(
+          mapRange.beginAddress(), mapRange.size(), allocInfo,
+          orbis::AllocationFlags::Fixed | allocFlags, orbis::vmem::kPageSize);
+      if (returnIt == it) {
+        returnIt = result.it;
+      }
+      it = result.it;
     }
 
-    cb(allocInfo, mapRange);
-    vmem->map(mapRange.beginAddress(), mapRange.size(), allocInfo,
-              orbis::AllocationFlags::Fixed, orbis::vmem::kPageSize);
-
     ++it;
   }
+  // NOTE(review): returnIt is not updated after merge(); check validity.
+  return returnIt;
 }
 
 void orbis::vmem::initialize(Process *process, bool force) {
@@ -336,6 +390,7 @@ void orbis::vmem::initialize(Process *process, bool force) {
   for (auto usedRange : rx::mem::query(range)) {
     reserveRangeImpl(
         rx::AddressRange::fromBeginEnd(address, usedRange.beginAddress()));
+    vmem->allocations.map(usedRange, {.flagsEx = BlockFlagsEx::Reserved});
 
     address = usedRange.endAddress();
   }
@@ -373,17 +428,21 @@ std::pair<rx::AddressRange, orbis::ErrorCode> orbis::vmem::mapFile(
     rx::EnumBitSet<BlockFlags> blockFlags,
     rx::EnumBitSet<BlockFlagsEx> blockFlagsEx, File *file,
     std::uint64_t fileOffset, std::string_view name, std::uint64_t alignment,
-    MemoryType type) {
+    std::uint64_t callerAddress, MemoryType type) {
   blockFlags |= file->device->blockFlags;
 
+  if (!validateProtection(prot)) {
+    return {{}, ErrorCode::INVAL};
+  }
+
   if (blockFlags & BlockFlags::PooledMemory) {
     if (size < dmem::kPageSize * 2 || size % dmem::kPageSize) {
-      return {{}, orbis::ErrorCode::INVAL};
+      return {{}, ErrorCode::INVAL};
     }
   }
 
   if (blockFlags & (BlockFlags::DirectMemory | BlockFlags::PooledMemory)) {
-    if (prot & vmem::Protection::CpuExec) {
+    if (prot & Protection::CpuExec) {
       return {{}, ErrorCode::ACCES};
     }
 
@@ -394,7 +453,7 @@ std::pair<rx::AddressRange, orbis::ErrorCode> orbis::vmem::mapFile(
 
   if (allocFlags & AllocationFlags::Fixed) {
     if (addressHint % alignment) {
-      return {{}, orbis::ErrorCode::INVAL};
+      return {{}, ErrorCode::INVAL};
     }
   }
 
@@ -402,6 +461,14 @@ std::pair<rx::AddressRange, orbis::ErrorCode> orbis::vmem::mapFile(
                       !(allocFlags & AllocationFlags::NoOverwrite);
 
   VirtualMemoryAllocation allocationInfo;
+  if (process->sdkVersion < 0x5500000) {
+    allocationInfo.callerAddress = callerAddress;
+  }
+  if (process->sdkVersion < 0x2000000 &&
+      (blockFlags & (BlockFlags::DirectMemory | BlockFlags::PooledMemory))) {
+    allocFlags |= AllocationFlags::NoMerge;
+  }
+
   allocationInfo.flagsEx = blockFlagsEx | BlockFlagsEx::Allocated;
   allocationInfo.device = file->device;
   allocationInfo.prot = prot;
@@ -444,13 +511,13 @@ std::pair<rx::AddressRange, orbis::ErrorCode> orbis::vmem::mapFile(
   }
 
   if (blockFlags & BlockFlags::FlexibleMemory) {
-    if (prot) {
-      if (!budget->acquire(BudgetResource::Fmem, size)) {
-        rx::println(stderr, "map: fmem budget: failed to allocate {:#x} bytes",
-                    size);
-        return {{}, ErrorCode::INVAL};
-      }
+    if (!budget->acquire(BudgetResource::Fmem, size)) {
+      rx::println(stderr, "map: fmem budget: failed to allocate {:#x} bytes",
+                  size);
+      return {{}, ErrorCode::INVAL};
+    }
 
+    if (prot) {
       blockFlags |= BlockFlags::Commited;
     }
   }
@@ -458,35 +525,38 @@ std::pair<rx::AddressRange, orbis::ErrorCode> orbis::vmem::mapFile(
   allocFlags = AllocationFlags::Fixed | (allocFlags & AllocationFlags::NoMerge);
 
   if (blockFlags & BlockFlags::DirectMemory) {
-    if (prot) {
-      if (!budget->acquire(BudgetResource::Dmem, size)) {
-        rx::println(stderr, "map: dmem budget: failed to allocate {:#x} bytes",
-                    size);
-        return {{}, ErrorCode::INVAL};
-      }
+    if (!budget->acquire(BudgetResource::Dmem, size)) {
+      rx::println(stderr, "map: dmem budget: failed to allocate {:#x} bytes",
+                  size);
+      return {{}, ErrorCode::INVAL};
+    }
 
+    if (prot) {
       blockFlags |= BlockFlags::Commited;
     }
   }
 
   if (blockFlags & BlockFlags::PooledMemory) {
-    if (auto errc = blockpool::allocateControlBlock();
-        errc != orbis::ErrorCode{}) {
+    if (auto errc = blockpool::allocateControlBlock(); errc != ErrorCode{}) {
       return {{}, errc};
     }
     allocationInfo.flagsEx |= BlockFlagsEx::PoolControl;
   }
 
-  if (auto error = file->device->map(range, fileOffset, prot, file, process);
-      error != ErrorCode{}) {
-    if (prot) {
-      if (blockFlags & BlockFlags::FlexibleMemory) {
-        budget->release(BudgetResource::Fmem, size);
-      }
+  if (type != MemoryType::Invalid && !validateMemoryType(type, prot)) {
+    return {{}, ErrorCode::ACCES};
+  }
 
-      if (blockFlags & BlockFlags::DirectMemory) {
-        budget->release(BudgetResource::Dmem, size);
-      }
+  if (auto error = process->invoke([=] {
+        return file->device->map(range, fileOffset, prot, file, process);
+      });
+      error != ErrorCode{}) {
+    if (blockFlags & BlockFlags::FlexibleMemory) {
+      budget->release(BudgetResource::Fmem, size);
+    }
+
+    if (blockFlags & BlockFlags::DirectMemory) {
+      budget->release(BudgetResource::Dmem, size);
     }
 
     if (blockFlags & BlockFlags::PooledMemory) {
@@ -531,8 +601,8 @@ std::pair<rx::AddressRange, orbis::ErrorCode> orbis::vmem::mapFile(
     rx::dieIf(errc != std::errc{}, "failed to commit virtual memory {}", errc);
   }
 
-  // vmemDump(process, rx::format("mapped {:x}-{:x} {}", range.beginAddress(),
-  //                              range.endAddress(), prot));
+  vmemDump(process, rx::format("mapped {:x}-{:x} {}", range.beginAddress(),
+                               range.endAddress(), prot));
 
   return {range, {}};
 }
@@ -540,9 +610,14 @@ std::pair<rx::AddressRange, orbis::ErrorCode> orbis::vmem::mapFile(
 std::pair<rx::AddressRange, orbis::ErrorCode> orbis::vmem::mapDirect(
     Process *process, std::uint64_t addressHint, rx::AddressRange directRange,
     rx::EnumBitSet<Protection> prot, rx::EnumBitSet<AllocationFlags> allocFlags,
-    std::string_view name, std::uint64_t alignment, MemoryType type) {
+    std::string_view name, std::uint64_t alignment, std::uint64_t callerAddress,
+    MemoryType type) {
   ScopedBudgetAcquire dmemResource;
 
+  if (!validateProtection(prot)) {
+    return {{}, ErrorCode::INVAL};
+  }
+
   if (prot) {
     dmemResource = ScopedBudgetAcquire(
         process->getBudget(), BudgetResource::Dmem, directRange.size());
@@ -557,7 +632,13 @@ std::pair<rx::AddressRange, orbis::ErrorCode> orbis::vmem::mapDirect(
   }
 
   VirtualMemoryAllocation allocationInfo;
-  allocationInfo.flags = orbis::vmem::BlockFlags::DirectMemory;
+  if (process->sdkVersion < 0x5500000) {
+    allocationInfo.callerAddress = callerAddress;
+  }
+  if (process->sdkVersion < 0x2000000) {
+    allocFlags |= AllocationFlags::NoMerge;
+  }
+  allocationInfo.flags = BlockFlags::DirectMemory;
   allocationInfo.flagsEx = BlockFlagsEx::Allocated;
   allocationInfo.prot = prot;
   allocationInfo.device = g_context->dmem->device;
@@ -566,7 +647,7 @@ std::pair<rx::AddressRange, orbis::ErrorCode> orbis::vmem::mapDirect(
   allocationInfo.setName(process, name);
 
   if (prot) {
-    allocationInfo.flags |= orbis::vmem::BlockFlags::Commited;
+    allocationInfo.flags |= BlockFlags::Commited;
   }
 
   bool canOverwrite = (allocFlags & AllocationFlags::Fixed) &&
@@ -594,13 +675,14 @@ std::pair<rx::AddressRange, orbis::ErrorCode> orbis::vmem::mapDirect(
 
   if (canOverwrite) {
     if (auto errc = validateOverwrite(vmem, range, false);
-        errc != orbis::ErrorCode{}) {
+        errc != ErrorCode{}) {
       return {{}, errc};
     }
   }
 
-  if (auto errc = dmem::map(0, range, directRange.beginAddress(), prot);
-      errc != orbis::ErrorCode{}) {
+  if (auto errc =
+          dmem::map(process, 0, range, directRange.beginAddress(), prot);
+      errc != ErrorCode{}) {
     return {{}, errc};
   }
 
@@ -630,39 +712,40 @@ std::pair<rx::AddressRange, orbis::ErrorCode> orbis::vmem::mapDirect(
 
   amdgpu::mapMemory(process->pid, range, type, prot, pmemOffset);
 
-  // vmemDump(process, rx::format("mapped dmem {:x}-{:x}", range.beginAddress(),
-  //                              range.endAddress()));
+  vmemDump(process, rx::format("mapped dmem {:x}-{:x}", range.beginAddress(),
+                               range.endAddress()));
 
   return {range, {}};
 }
 
-std::pair<rx::AddressRange, orbis::ErrorCode>
-orbis::vmem::mapFlex(Process *process, std::uint64_t size,
-                     rx::EnumBitSet<Protection> prot, std::uint64_t addressHint,
-                     rx::EnumBitSet<AllocationFlags> allocFlags,
-                     rx::EnumBitSet<BlockFlags> blockFlags,
-                     std::string_view name, std::uint64_t alignment) {
-  ScopedBudgetAcquire fmemResource;
+std::pair<rx::AddressRange, orbis::ErrorCode> orbis::vmem::mapFlex(
+    Process *process, std::uint64_t size, rx::EnumBitSet<Protection> prot,
+    std::uint64_t addressHint, rx::EnumBitSet<AllocationFlags> allocFlags,
+    rx::EnumBitSet<BlockFlags> blockFlags, std::string_view name,
+    std::uint64_t alignment, std::uint64_t callerAddress) {
 
-  if (prot) {
-    fmemResource =
-        ScopedBudgetAcquire(process->getBudget(), BudgetResource::Fmem, size);
+  if (!validateProtection(prot)) {
+    return {{}, ErrorCode::INVAL};
+  }
 
-    if (!fmemResource) {
-      rx::println(stderr,
-                  "mapFlex: fmem budget: failed to allocate {:#x} bytes", size);
+  ScopedBudgetAcquire fmemResource(process->getBudget(), BudgetResource::Fmem,
+                                   size);
 
-      return {{}, ErrorCode::INVAL};
-    }
+  if (!fmemResource) {
+    rx::println(stderr, "mapFlex: fmem budget: failed to allocate {:#x} bytes",
+                size);
 
-    blockFlags |= orbis::vmem::BlockFlags::Commited;
+    return {{}, ErrorCode::INVAL};
   }
 
   bool canOverwrite = (allocFlags & AllocationFlags::Fixed) &&
                       !(allocFlags & AllocationFlags::NoOverwrite);
 
   VirtualMemoryAllocation allocationInfo;
-  allocationInfo.flags = orbis::vmem::BlockFlags::FlexibleMemory | blockFlags;
+  if (process->sdkVersion < 0x5500000) {
+    allocationInfo.callerAddress = callerAddress;
+  }
+  allocationInfo.flags = blockFlags | BlockFlags::FlexibleMemory;
   allocationInfo.flagsEx = BlockFlagsEx::Allocated;
   allocationInfo.prot = prot;
   allocationInfo.type = MemoryType::WbOnion;
@@ -690,40 +773,49 @@ orbis::vmem::mapFlex(Process *process, std::uint64_t size,
 
   if (canOverwrite) {
     if (auto errc = validateOverwrite(vmem, vmemRange, false);
-        errc != orbis::ErrorCode{}) {
+        errc != ErrorCode{}) {
       return {{}, errc};
     }
   }
 
-  rx::AddressRange flexRange;
+  if (prot) {
+    rx::AddressRange flexRange;
+    {
+      auto [range, errc] = fmem::allocate(size);
 
-  {
-    auto [range, errc] = fmem::allocate(size);
+      if (errc != ErrorCode{}) {
+        return {{}, errc};
+      }
 
-    if (errc != orbis::ErrorCode{}) {
-      return {{}, errc};
+      flexRange = range;
     }
 
-    flexRange = range;
-  }
+    allocationInfo.flags |= BlockFlags::Commited;
+    allocationInfo.deviceOffset = flexRange.beginAddress();
 
-  allocationInfo.deviceOffset = flexRange.beginAddress();
-
-  if (auto errc =
-          pmem::map(vmemRange.beginAddress(), flexRange, toCpuProtection(prot));
-      errc != orbis::ErrorCode{}) {
-    fmem::deallocate(flexRange);
-    return {{}, errc};
+    if (auto errc = process->invoke([=] {
+          return pmem::map(vmemRange.beginAddress(), flexRange,
+                           toCpuProtection(prot));
+        });
+        errc != ErrorCode{}) {
+      fmem::deallocate(flexRange);
+      return {{}, errc};
+    }
+  } else {
+    if (auto errc =
+            process->invoke([=] { return rx::mem::protect(vmemRange, {}); });
+        errc != std::errc{}) {
+      return {{}, toErrorCode(errc)};
+    }
   }
 
   if (canOverwrite) {
     release(process, vmem, process->getBudget(), vmemRange);
   }
 
-  auto [it, _errc, _range] = vmem->map(
-      vmemRange.beginAddress(), vmemRange.size(), allocationInfo,
-      AllocationFlags::Fixed | (allocFlags & AllocationFlags::NoMerge),
-      alignment);
+  vmem->map(vmemRange.beginAddress(), vmemRange.size(), allocationInfo,
+            AllocationFlags::Fixed | (allocFlags & AllocationFlags::NoMerge),
+            alignment);
 
   // vmemDump(process, rx::format("mapFlex {:x}-{:x}", vmemRange.beginAddress(),
   //                              vmemRange.endAddress()));
@@ -731,9 +823,78 @@ orbis::vmem::mapFlex(Process *process, std::uint64_t size,
   return {vmemRange, {}};
 }
 
+std::pair<rx::AddressRange, orbis::ErrorCode> orbis::vmem::mapVoid(
+    Process *process, std::uint64_t size, std::uint64_t addressHint,
+    rx::EnumBitSet<AllocationFlags> allocFlags, std::string_view name,
+    std::uint64_t alignment, std::uint64_t callerAddress) {
+  // Records a Void block and applies an empty protection set to its pages.
+  bool canOverwrite = (allocFlags & AllocationFlags::Fixed) &&
+                      !(allocFlags & AllocationFlags::NoOverwrite);
+  VirtualMemoryAllocation allocationInfo;
+  if (process->sdkVersion < 0x5500000) {
+    allocationInfo.callerAddress = callerAddress;
+  }
+  allocationInfo.flagsEx = BlockFlagsEx::Void | BlockFlagsEx::Allocated;
+  allocationInfo.type = MemoryType::WbOnion;
+  allocationInfo.setName(process, name);
+
+  auto vmem = process->get(g_vmInstance);
+  std::lock_guard lock(*vmem);
+
+  rx::AddressRange vmemRange;
+  // Dry pass: presumably only resolves the range; the real map() comes later.
+  {
+    auto [_, errc, range] =
+        vmem->map(addressHint, size, allocationInfo,
+                  allocFlags | AllocationFlags::Dry, alignment);
+    if (errc != std::errc{}) {
+      // The dry map's file_exists is reported to the caller as NOMEM.
+      if (errc == std::errc::file_exists) {
+        return {{}, ErrorCode::NOMEM};
+      }
+      return {{}, toErrorCode(errc)};
+    }
+
+    vmemRange = range;
+  }
+  // Fixed without NoOverwrite: existing blocks must pass validateOverwrite.
+  if (canOverwrite) {
+    if (auto errc = validateOverwrite(vmem, vmemRange, false);
+        errc != ErrorCode{}) {
+      return {{}, errc};
+    }
+  }
+  // Apply the empty protection through process->invoke before recording.
+  if (auto errc =
+          process->invoke([=] { return rx::mem::protect(vmemRange, {}); });
+      errc != std::errc{}) {
+    return {{}, toErrorCode(errc)};
+  }
+  // Release what the range held before (presumably the overwritten blocks).
+  if (canOverwrite) {
+    release(process, vmem, process->getBudget(), vmemRange);
+  }
+  // Record the Void block; of the caller's flags only NoMerge is forwarded.
+  vmem->map(vmemRange.beginAddress(), vmemRange.size(), allocationInfo,
+            AllocationFlags::Fixed | (allocFlags & AllocationFlags::NoMerge),
+            alignment);
+
+  // vmemDump(process, rx::format("mapVoid {:x}-{:x}", vmemRange.beginAddress(),
+  //                              vmemRange.endAddress()));
+  return {vmemRange, {}};
+}
+
 std::pair<rx::AddressRange, orbis::ErrorCode>
 orbis::vmem::commitPooled(Process *process, rx::AddressRange range,
                           MemoryType type, rx::EnumBitSet<Protection> prot) {
+  if (!validateProtection(prot)) {
+    return {{}, ErrorCode::INVAL};
+  }
+
+  if (!validateMemoryType(type, prot)) {
+    return {{}, ErrorCode::ACCES};
+  }
+
   VirtualMemoryAllocation allocationInfo;
   allocationInfo.flags = BlockFlags::PooledMemory | BlockFlags::Commited;
   allocationInfo.flagsEx = BlockFlagsEx::Allocated;
@@ -755,10 +916,8 @@ orbis::vmem::commitPooled(Process *process, rx::AddressRange range,
   auto controlBlockIt = it;
 
   while (controlBlockIt != vmem->end() && controlBlockIt->isAllocated() &&
-         (controlBlockIt->flags & BlockFlags::PooledMemory)) {
-    if (!controlBlockIt->prot &&
-        controlBlockIt->flags == BlockFlags::PooledMemory &&
-        (controlBlockIt->flagsEx & orbis::vmem::BlockFlagsEx::PoolControl)) {
+         controlBlockIt->isPooled()) {
+    if (controlBlockIt->isPoolControl()) {
       break;
     }
 
@@ -781,7 +940,7 @@ orbis::vmem::commitPooled(Process *process, rx::AddressRange range,
     }
 
     if (auto errc = blockpool::commit(process, range, type, prot);
-        errc != orbis::ErrorCode{}) {
+        errc != ErrorCode{}) {
       return {{}, errc};
     }
 
@@ -796,7 +955,7 @@ orbis::vmem::commitPooled(Process *process, rx::AddressRange range,
               controlAllocationInfo, AllocationFlags::Fixed, kPageSize);
   } else {
     if (auto errc = blockpool::commit(process, range, type, prot);
-        errc != orbis::ErrorCode{}) {
+        errc != ErrorCode{}) {
       return {{}, errc};
     }
   }
@@ -887,68 +1046,148 @@ orbis::ErrorCode orbis::vmem::decommitPooled(Process *process,
 
 orbis::ErrorCode orbis::vmem::protect(Process *process, rx::AddressRange range,
                                       rx::EnumBitSet<Protection> prot) {
+  // Re-protects every block overlapping `range` (page-aligned below).
+  prot &= kProtCpuAll | kProtGpuAll;
+  if (!validateProtection(prot)) {
+    return ErrorCode::INVAL;
+  }
+
   auto vmem = process->get(g_vmInstance);
 
   range = rx::AddressRange::fromBeginEnd(
       rx::alignDown(range.beginAddress(), kPageSize),
       rx::alignUp(range.endAddress(), kPageSize));
+
   {
     std::lock_guard lock(*vmem);
-    auto it = vmem->query(range.beginAddress());
+    std::size_t fmemSize = 0; // flex bytes whose commit state flips
+    auto it = vmem->lowerBound(range.beginAddress());
 
     if (it == vmem->end()) {
       rx::println(stderr,
                   "vmem: attempt to set protection of invalid address range: "
                   "{:x}-{:x}",
                   range.beginAddress(), range.endAddress());
-      return orbis::ErrorCode::INVAL;
+      return ErrorCode::INVAL;
     }
 
-    auto errc = validateRange(vmem, it, range,
-                              [](const VirtualMemoryAllocation &alloc) {
-                                if (alloc.flags == BlockFlags::PooledMemory) {
-                                  return ErrorCode::ACCES;
-                                }
+    auto errc =
+        validateRange(vmem, it, range,
+                      [&fmemSize, prot](const VirtualMemoryAllocation &alloc,
+                                        rx::AddressRange range) {
+                        if (alloc.isPoolReserved()) {
+                          return ErrorCode::ACCES;
+                        }
 
-                                if (alloc.flagsEx & BlockFlagsEx::Reserved) {
-                                  return ErrorCode::ACCES;
-                                }
+                        if (!validateMemoryType(alloc.type, prot)) {
+                          return ErrorCode::ACCES;
+                        }
 
-                                return ErrorCode{};
-                              });
+                        if (alloc.isFlex()) {
+                          if ((prot && !alloc.prot) || (!prot && alloc.prot)) {
+                            fmemSize += range.size();
+                          }
+                        }
+
+                        return ErrorCode{};
+                      });
 
     if (errc != ErrorCode{}) {
       return errc;
     }
 
-    if (auto errc =
-            toErrorCode(rx::mem::protect(range, vmem::toCpuProtection(prot)));
-        errc != ErrorCode{}) {
-      return errc;
+    if (fmemSize && prot) {
+      if (!process->getBudget()->acquire(BudgetResource::Fmem, fmemSize)) {
+        return ErrorCode::INVAL;
+      }
     }
 
-    modifyRange(vmem, it, range,
-                [prot](VirtualMemoryAllocation &alloc, rx::AddressRange) {
-                  if (!alloc.isAllocated()) {
-                    return;
-                  }
+    rx::EnumBitSet<AllocationFlags> modifyFlags{};
 
-                  if (alloc.device != nullptr &&
-                      alloc.flags & BlockFlags::FlexibleMemory) {
-                    alloc.prot = prot & ~Protection::CpuExec;
-                  }
+    modifyRange(
+        vmem, it, range, modifyFlags,
+        [process, prot, sdkVersion = process->sdkVersion](
+            VirtualMemoryAllocation &alloc, rx::AddressRange range) {
+          if (!alloc.isAllocated()) {
+            return;
+          }
 
-                  alloc.flags = alloc.prot
-                                    ? alloc.flags | BlockFlags::Commited
-                                    : alloc.flags & ~BlockFlags::Commited;
-                });
+          if (alloc.flagsEx & BlockFlagsEx::Reserved) {
+            return;
+          }
+
+          auto blockProt = prot;
+          // WbGarlic blocks never receive CPU or GPU write access.
+          if (alloc.type == MemoryType::WbGarlic) {
+            blockProt &= ~(Protection::CpuWrite | Protection::GpuWrite);
+          }
+
+          if (!alloc.isVoid()) {
+            if (alloc.isFlex() && alloc.device == nullptr) {
+              if (blockProt && !alloc.prot) {
+                auto [pmemRange, errc] = fmem::allocate(range.size());
+                rx::dieIf(errc != ErrorCode{},
+                          "failed to allocate flexible memory");
+                // Record the backing pages: decommit frees from deviceOffset.
+                alloc.deviceOffset = pmemRange.beginAddress();
+                errc = pmem::map(range.beginAddress(), pmemRange,
+                                 toCpuProtection(blockProt));
+                rx::dieIf(errc != ErrorCode{}, "failed to map flexible memory");
+              } else if (!blockProt && alloc.prot) {
+                auto errc = fmem::deallocate(rx::AddressRange::fromBeginSize(
+                    alloc.deviceOffset, range.size()));
+
+                rx::dieIf(errc != ErrorCode{},
+                          "failed to deallocate flexible memory {:x}-{:x}",
+                          alloc.deviceOffset,
+                          alloc.deviceOffset + range.size());
+              }
+            }
+            // SDK > 0x1500000: strip CpuExec before applying host protection.
+            if (sdkVersion > 0x1500000) {
+              if (alloc.isDirect() || alloc.isPooled() ||
+                  (alloc.isFlex() && alloc.device != nullptr)) {
+                blockProt &= ~Protection::CpuExec;
+              }
+            }
+
+            auto cpuBlockProt = toCpuProtection(blockProt);
+            // Touch host page protection only when the CPU-visible bits change.
+            if (cpuBlockProt != toCpuProtection(alloc.prot)) {
+              auto errc = process->invoke(
+                  [=] { return rx::mem::protect(range, cpuBlockProt); });
+
+              rx::dieIf(errc != std::errc{},
+                        "failed to protect region {:x}-{:x}, prot {}, error {}",
+                        range.beginAddress(), range.endAddress(), cpuBlockProt,
+                        errc);
+            }
+          } else {
+            blockProt = {};
+          }
+          // The recorded protection never keeps CpuExec for these block kinds.
+          if (alloc.isDirect() || alloc.isPooled() ||
+              (alloc.isFlex() && alloc.device != nullptr)) {
+            blockProt &= ~Protection::CpuExec;
+          }
+
+          if (alloc.isVoid() && sdkVersion <= 0x1500000) {
+            alloc.prot = prot;
+          } else {
+            alloc.prot = blockProt;
+          }
+          alloc.flags = blockProt ? alloc.flags | BlockFlags::Commited
+                                  : alloc.flags & ~BlockFlags::Commited;
+        });
+    if (fmemSize && !prot) {
+      process->getBudget()->release(BudgetResource::Fmem, fmemSize);
+    }
   }
 
   amdgpu::protectMemory(process->pid, range, prot);
 
-  // vmemDump(process, rx::format("protected {:x}-{:x} {}",
-  // range.beginAddress(),
-  //                              range.endAddress(), prot));
+  vmemDump(process, rx::format("protected {:x}-{:x} {}", range.beginAddress(),
+                               range.endAddress(), prot));
 
   return {};
 }
@@ -965,10 +1204,16 @@ orbis::ErrorCode orbis::vmem::setName(Process *process, rx::AddressRange range,
                 "vmem: attempt to set name of invalid address range: "
                 "{:x}-{:x}, name: {}",
                 range.beginAddress(), range.endAddress(), name);
-    return orbis::ErrorCode::INVAL;
+    return ErrorCode::INVAL;
   }
 
-  modifyRange(vmem, it, range,
+  rx::EnumBitSet<AllocationFlags> modifyFlags{};
+
+  if (process->sdkVersion <= 0x1500000) {
+    modifyFlags |= AllocationFlags::NoMerge;
+  }
+
+  modifyRange(vmem, it, range, modifyFlags,
               [name](VirtualMemoryAllocation &alloc, rx::AddressRange) {
                 if (alloc.isAllocated()) {
                   alloc.name = name;
@@ -978,93 +1223,84 @@ orbis::ErrorCode orbis::vmem::setName(Process *process, rx::AddressRange range,
   return {};
 }
 
-orbis::ErrorCode orbis::vmem::setType(Process *process, rx::AddressRange range,
-                                      MemoryType type) {
+orbis::ErrorCode
+orbis::vmem::setTypeAndProtect(Process *process, rx::AddressRange range,
+                               MemoryType type,
+                               rx::EnumBitSet<Protection> prot) {
+  if (!validateProtection(prot)) {
+    return ErrorCode::INVAL;
+  }
+
+  if (!validateMemoryType(type, prot)) {
+    return ErrorCode::ACCES;
+  }
+
+  prot &= ~Protection::CpuExec;
+
   auto vmem = process->get(g_vmInstance);
 
   std::lock_guard lock(*vmem);
   auto it = vmem->query(range.beginAddress());
 
   if (it == vmem->end()) {
-    return orbis::ErrorCode::INVAL;
+    return ErrorCode::INVAL;
+  }
+
+  auto errc =
+      validateRange(vmem, it, range,
+                    [](const VirtualMemoryAllocation &alloc, rx::AddressRange) {
+                      if (alloc.flags == BlockFlags::PooledMemory) {
+                        return ErrorCode::ACCES;
+                      }
+
+                      if (alloc.flagsEx & BlockFlagsEx::Reserved) {
+                        return ErrorCode::ACCES;
+                      }
+
+                      if (!(alloc.flags & (BlockFlags::PooledMemory |
+                                           BlockFlags::DirectMemory))) {
+                        return ErrorCode::OPNOTSUPP;
+                      }
+
+                      return ErrorCode{};
+                    });
+
+  if (errc != ErrorCode{}) {
+    return errc;
+  }
+
+  if (auto errc = process->invoke([=] {
+        return toErrorCode(rx::mem::protect(range, toCpuProtection(prot)));
+      });
+      errc != ErrorCode{}) {
+    return errc;
+  }
+
+  rx::EnumBitSet<AllocationFlags> modifyFlags{};
+
+  if (process->sdkVersion < 0x1700000) {
+    modifyFlags |= AllocationFlags::NoMerge;
   }
 
   modifyRange(
-      vmem, it, range,
-      [type](VirtualMemoryAllocation &alloc, rx::AddressRange range) {
+      vmem, it, range, modifyFlags,
+      [type, prot](VirtualMemoryAllocation &alloc, rx::AddressRange range) {
         if (!alloc.isAllocated()) {
           return;
         }
 
+        alloc.type = type;
+        alloc.prot = prot;
+        alloc.flags = prot ? alloc.flags | BlockFlags::Commited
+                           : alloc.flags & ~BlockFlags::Commited;
+
         if (alloc.flags & BlockFlags::DirectMemory) {
           dmem::setType(
               0,
               rx::AddressRange::fromBeginSize(alloc.deviceOffset, range.size()),
               type);
-          alloc.type = type;
           return;
         }
-
-        if (alloc.flags != (BlockFlags::PooledMemory | BlockFlags::Commited)) {
-          alloc.type = type;
-          return;
-        }
-      });
-
-  return {};
-}
-
-orbis::ErrorCode
-orbis::vmem::setTypeAndProtect(Process *process, rx::AddressRange range,
-                               MemoryType type,
-                               rx::EnumBitSet<Protection> prot) {
-  auto vmem = process->get(g_vmInstance);
-
-  std::lock_guard lock(*vmem);
-  auto it = vmem->query(range.beginAddress());
-
-  if (it == vmem->end()) {
-    return orbis::ErrorCode::INVAL;
-  }
-
-  auto errc =
-      validateRange(vmem, it, range, [](const VirtualMemoryAllocation &alloc) {
-        if (alloc.flags == BlockFlags::PooledMemory) {
-          return ErrorCode::ACCES;
-        }
-
-        if (alloc.flagsEx & BlockFlagsEx::Reserved) {
-          return ErrorCode::ACCES;
-        }
-
-        return ErrorCode{};
-      });
-
-  if (errc != ErrorCode{}) {
-    return errc;
-  }
-
-  if (auto errc =
-          toErrorCode(rx::mem::protect(range, vmem::toCpuProtection(prot)));
-      errc != ErrorCode{}) {
-    return errc;
-  }
-
-  modifyRange(
-      vmem, it, range,
-      [type, prot](VirtualMemoryAllocation &alloc, rx::AddressRange range) {
-        if (alloc.isAllocated()) {
-          alloc.type = type;
-          alloc.prot = prot;
-
-          if (alloc.flags & BlockFlags::DirectMemory) {
-            dmem::setType(0,
-                          rx::AddressRange::fromBeginSize(alloc.deviceOffset,
-                                                          range.size()),
-                          type);
-            return;
-          }
-        }
       });
 
   amdgpu::protectMemory(process->pid, range, prot);
@@ -1080,12 +1316,11 @@ orbis::ErrorCode orbis::vmem::unmap(Process *process, rx::AddressRange range) {
       rx::alignDown(range.beginAddress(), kPageSize),
       rx::alignUp(range.endAddress(), kPageSize));
 
-  orbis::ErrorCode result;
+  ErrorCode result;
   {
     std::lock_guard lock(*vmem);
 
-    if (auto errc = validateOverwrite(vmem, range, true);
-        errc != orbis::ErrorCode{}) {
+    if (auto errc = validateOverwrite(vmem, range, true); errc != ErrorCode{}) {
       return errc;
     }
 
@@ -1098,7 +1333,7 @@ orbis::ErrorCode orbis::vmem::unmap(Process *process, rx::AddressRange range) {
     result = toErrorCode(errc);
   }
 
-  rx::mem::release(range, kPageSize);
+  process->invoke([=] { rx::mem::release(range, kPageSize); });
   amdgpu::unmapMemory(process->pid, range);
 
   // vmemDump(process, rx::format("unmap {:x}-{:x}", range.beginAddress(),
@@ -1134,7 +1369,7 @@ orbis::vmem::query(Process *process, std::uint64_t address, bool lowerBound) {
     return {};
   }
 
-  orbis::vmem::QueryResult result{};
+  QueryResult result{};
   result.start = it.beginAddress();
   result.end = it.endAddress();
 
@@ -1148,7 +1383,9 @@ orbis::vmem::query(Process *process, std::uint64_t address, bool lowerBound) {
     result.memoryType = it->type;
   }
 
-  result.protection = it->prot;
+  if (!it->isVoid()) {
+    result.protection = it->prot;
+  }
   result.flags = it->flags;
   result.name = it->name;
 
@@ -1184,7 +1421,7 @@ orbis::vmem::queryProtection(Process *process, std::uint64_t address,
     return {};
   }
 
-  orbis::vmem::MemoryProtection result{};
+  MemoryProtection result{};
   result.startAddress = it.beginAddress();
   result.endAddress = it.endAddress();
   result.prot = it->prot;
diff --git a/rpcsx/iodev/dce.cpp b/rpcsx/iodev/dce.cpp
index da2f1c01d..ca9a079ff 100644
--- a/rpcsx/iodev/dce.cpp
+++ b/rpcsx/iodev/dce.cpp
@@ -21,6 +21,7 @@
 #include <cstring>
 #include <mutex>
 #include <sys/mman.h>
+#include <thread>
 
 static constexpr auto kDceControlMemoryOffset = 0;
 static constexpr auto kDceControlMemorySize = 0x10000;
@@ -578,10 +579,10 @@ DceDevice::map(rx::AddressRange range, std::int64_t offset,
               range.endAddress(), offset, protection);
 
   auto result =
-      orbis::dmem::map(0, range, dmemRange.beginAddress() + offset, protection);
+      orbis::dmem::map(process, 0, range, dmemRange.beginAddress() + offset, protection);
 
   if (result == orbis::ErrorCode{}) {
-    amdgpu::mapMemory(process->pid, range, orbis::MemoryType::WbGarlic,
+    amdgpu::mapMemory(process->pid, range, orbis::MemoryType::WcGarlic,
                       protection, dmemRange.beginAddress() + offset);
   }
 
@@ -603,6 +604,7 @@ static void createGpu() {
   }
 
   while (orbis::g_context->gpuDevice == nullptr) {
+    std::this_thread::yield();
   }
 }
 
@@ -642,7 +644,7 @@ orbis::IoDevice *createDceCharacterDevice(orbis::Process *process) {
       0,
       rx::AddressRange::fromBeginEnd(dmemSize - orbis::dmem::kPageSize * 2,
                                      dmemSize),
-      orbis::dmem::kPageSize, orbis::MemoryType::WbGarlic);
+      orbis::dmem::kPageSize, orbis::MemoryType::WcGarlic);
 
   rx::dieIf(errc != orbis::ErrorCode{},
             "failed to allocate DCE memory, error {}", errc);
diff --git a/rpcsx/iodev/dmem.cpp b/rpcsx/iodev/dmem.cpp
index 81634be1f..7460e3a98 100644
--- a/rpcsx/iodev/dmem.cpp
+++ b/rpcsx/iodev/dmem.cpp
@@ -84,7 +84,7 @@ struct DmemIoctlControlRelease {
   orbis::uint64_t unk2;
 };
 struct DmemIoctlSetPidAndProtect {
-  orbis::uintptr_t address;
+  orbis::uintptr_t offset;
   orbis::size_t size;
   orbis::pid_t pid; // 0 if all
   rx::EnumBitSet<orbis::vmem::Protection> prot;
@@ -206,15 +206,32 @@ dmem_ioctl_control_release(orbis::Thread *thread, DmemDevice *device,
 static orbis::ErrorCode
 dmem_ioctl_set_pid_and_protect(orbis::Thread *thread, DmemDevice *device,
                                DmemIoctlSetPidAndProtect &args) {
-  ORBIS_LOG_WARNING(__FUNCTION__, args.pid, args.address, args.size,
+  ORBIS_LOG_WARNING(__FUNCTION__, args.pid, args.offset, args.size,
                     args.prot.toUnderlying());
-  return {};
+  // Resolve the process named by args.pid, then forward to dmem::protect.
+  orbis::Process *process = nullptr;
+  // pid 0 keeps process null; -1 or the caller's own pid means thread->tproc.
+  if (args.pid != 0) {
+    process = args.pid == -1 || args.pid == thread->tproc->pid
+                  ? thread->tproc
+                  : orbis::findProcessById(args.pid);
+    if (process == nullptr) {
+      return orbis::ErrorCode::SRCH;
+    }
+  }
+  // NOTE(review): passes literal 0 where device->index may be meant; confirm.
+  return orbis::dmem::protect(
+      process, 0, rx::AddressRange::fromBeginSize(args.offset, args.size),
+      args.prot);
 }
 
 static orbis::ErrorCode
 dmem_ioctl_allocate_for_mini_app(orbis::Thread *thread, DmemDevice *device,
                                  DmemIoctlAllocate &args) {
   // FIXME: implement
+  ORBIS_LOG_WARNING(__FUNCTION__, args.searchStart, args.searchEnd,
+                    args.alignment, args.len, (int)args.memoryType);
+  // Placeholder: log the request, then defer to the generic allocate ioctl.
   return dmem_ioctl_allocate(thread, device, args);
 }
 
@@ -222,6 +239,9 @@ static orbis::ErrorCode dmem_ioctl_allocate_main(orbis::Thread *thread,
                                                  DmemDevice *device,
                                                  DmemIoctlAllocate &args) {
   // FIXME: implement
+  ORBIS_LOG_WARNING(__FUNCTION__, args.searchStart, args.searchEnd,
+                    args.alignment, args.len, (int)args.memoryType);
+
   return dmem_ioctl_allocate(thread, device, args);
 }
 
@@ -303,7 +323,7 @@ static orbis::ErrorCode dmem_ioctl_reserve(orbis::Thread *thread,
   auto [offset, errc] = orbis::dmem::reserveSystem(device->index, args.size);
 
   if (errc == orbis::ErrorCode{}) {
-    args.size = offset | 0x4000000000;
+    args.size = offset;
   }
   return errc;
 }
@@ -332,7 +352,7 @@ orbis::ErrorCode
 DmemDevice::map(rx::AddressRange range, std::int64_t offset,
                 rx::EnumBitSet<orbis::vmem::Protection> protection,
                 orbis::File *, orbis::Process *process) {
-  auto result = orbis::dmem::map(index, range, offset, protection);
+  auto result = orbis::dmem::map(process, index, range, offset, protection);
 
   if (result == orbis::ErrorCode{}) {
     if (auto dmemType = orbis::dmem::query(0, offset)) {
diff --git a/rpcsx/iodev/gc.cpp b/rpcsx/iodev/gc.cpp
index b3caf1b49..9bad3835a 100644
--- a/rpcsx/iodev/gc.cpp
+++ b/rpcsx/iodev/gc.cpp
@@ -15,6 +15,7 @@
 #include "rx/die.hpp"
 #include "rx/format.hpp"
 #include "rx/print.hpp"
+#include "rx/watchdog.hpp"
 #include <cstdio>
 #include <mutex>
 #include <sys/mman.h>
@@ -83,14 +84,14 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request,
 
   auto gcFile = static_cast<GcFile *>(file);
   auto device = file->device.rawStaticCast<GcDevice>();
-  // std::lock_guard lock(device->mtx);
+  std::lock_guard lock(device->mtx);
 
   switch (request) {
   case 0xc008811b: // get submit done flag ptr?
     if (device->submitArea == 0) {
       auto [dmemOffset, dmemErrc] = orbis::dmem::allocate(
           0, rx::AddressRange::fromBeginEnd(0, 0), orbis::dmem::kPageSize,
-          orbis::MemoryType::WbGarlic);
+          orbis::MemoryType::WcGarlic);
 
       if (dmemErrc != orbis::ErrorCode{}) {
         return dmemErrc;
@@ -108,7 +109,7 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request,
 
       if (vmemErrc != orbis::ErrorCode{}) {
         orbis::dmem::release(0, directRange);
-        return dmemErrc;
+        return vmemErrc;
       }
 
       device->submitArea = vmemRange.beginAddress();
@@ -441,6 +442,7 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request,
   case 0xc004811f: {
     ORBIS_LOG_WARNING("Unknown gc ioctl", request,
                       (unsigned long)*(std::uint32_t *)argp);
+    *(std::uint32_t *)argp = 0;
     break;
   }
 
@@ -481,9 +483,26 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request,
 
 static const orbis::FileOps ops = {.ioctl = gc_ioctl};
 
+static void createGpu() {
+  {
+    std::lock_guard lock(orbis::g_context->gpuDeviceMtx);
+    if (orbis::g_context->gpuDevice != nullptr) {
+      return;
+    }
+    // No device yet: request creation while still holding gpuDeviceMtx.
+    rx::createGpuDevice();
+  }
+  // Lock released; yield until gpuDevice is observed non-null.
+  while (orbis::g_context->gpuDevice == nullptr) {
+    std::this_thread::yield();
+  }
+}
+
 orbis::ErrorCode GcDevice::open(rx::Ref<orbis::File> *file, const char *path,
                                 std::uint32_t flags, std::uint32_t mode,
                                 orbis::Thread *thread) {
+  createGpu();
+
   auto newFile = orbis::knew<GcFile>();
   newFile->device = this;
   newFile->ops = &ops;
@@ -520,7 +539,7 @@ orbis::IoDevice *createGcCharacterDevice() {
       0,
       rx::AddressRange::fromBeginEnd(dmemSize - orbis::dmem::kPageSize * 2,
                                      dmemSize),
-      orbis::dmem::kPageSize, orbis::MemoryType::WbGarlic);
+      orbis::dmem::kPageSize, orbis::MemoryType::WcGarlic);
 
   rx::dieIf(errc != orbis::ErrorCode{},
             "failed to allocate GC memory, error {}", errc);
diff --git a/rpcsx/ipmi.cpp b/rpcsx/ipmi.cpp
index 3b109e14f..b8fe62448 100644
--- a/rpcsx/ipmi.cpp
+++ b/rpcsx/ipmi.cpp
@@ -7,6 +7,7 @@
 #include "orbis/pmem.hpp"
 #include "orbis/thread/Process.hpp"
 #include "orbis/thread/Thread.hpp"
+#include "orbis/uio.hpp"
 #include "orbis/utils/Logs.hpp"
 #include "orbis/vmem.hpp"
 #include "rx/AddressRange.hpp"
@@ -206,12 +207,14 @@ orbis::EventFlag *ipmi::createEventFlag(std::string_view name, uint32_t attrs,
       .first;
 }
 
-void ipmi::createShm(const char *name, uint32_t flags, uint32_t mode,
-                     uint64_t size) {
+rx::Ref<orbis::File> ipmi::createShm(const char *name, uint32_t flags,
+                                     uint32_t mode, uint64_t size) {
   rx::Ref<orbis::File> shm;
   auto shmDevice = orbis::g_context->shmDevice.staticCast<orbis::IoDevice>();
   shmDevice->open(&shm, name, flags, mode, nullptr);
   shm->ops->truncate(shm.get(), size, nullptr);
+
+  return shm;
 }
 
 orbis::ErrorCode
@@ -610,7 +613,49 @@ void ipmi::createShellCoreObjects(orbis::Process *process) {
       lnsStatusServer = 0x30010;
     }
   }
+
+  int lnsLoadExec;
+
+  if (orbis::g_context->fwType == orbis::FwType::Ps5) {
+    lnsLoadExec = 0x30013;
+  } else {
+    if (orbis::g_context->fwSdkVersion > 0x6000000) {
+      lnsLoadExec = 0x30010;
+    } else {
+      lnsLoadExec = 0x30013;
+    }
+  }
+
   createIpmiServer(process, "SceLncService")
+      .addSyncMethod(
+          lnsLoadExec,
+          [](std::vector<std::vector<std::byte>> &outData,
+             const std::vector<std::span<std::byte>> &inData) {
+            std::println(stderr,
+                         "SceLncService::loadExec(inBufCount={}, "
+                         "outBufCount={})",
+                         inData.size(), outData.size());
+            // Log the size and a hexdump of every input buffer.
+            for (auto in : inData) {
+              std::println(stderr, "in {}", in.size());
+              rx::hexdump(in);
+            }
+            // Skip empty action buffers: size() - 1 below would wrap around.
+            if (inData.size() == 3 && !inData[1].empty()) {
+              auto action = inData[1];
+              if (std::string_view((const char *)action.data(),
+                                   action.size() - 1) == "EXIT") {
+                orbis::uint32_t status = 1;
+                if (inData[0].size() == sizeof(orbis::uint32_t)) {
+                  std::memcpy(&status, inData[0].data(), sizeof(status));
+                }
+                // status stays 1 unless inData[0] is exactly one uint32_t
+                rx::println(stderr, "LNC exit request, status {}", status);
+                std::exit(status);
+              }
+            }
+            return 0;
+          })
       .addSyncMethod(lnsStatusServer,
                      [](void *out, std::uint64_t &size) -> std::int32_t {
                        struct SceLncServiceAppStatus {
@@ -1112,17 +1157,34 @@ void ipmi::createShellCoreObjects(orbis::Process *process) {
   createEventFlag("SceDataTransfer", 0x120, 0);
 
   createEventFlag("SceLncUtilAppStatus00000000", 0x100, 0);
+  createEventFlag("SceLncUtilAppStatus0", 0x100, 0);
   createEventFlag("SceLncUtilAppStatus1", 0x100, 0);
+  createEventFlag("SceAppMessaging0", 0x120, 1);
   createEventFlag("SceAppMessaging1", 0x120, 1);
+  createEventFlag("SceShellCoreUtil0", 0x120, 0x3f8c);
   createEventFlag("SceShellCoreUtil1", 0x120, 0x3f8c);
   createEventFlag("SceNpScoreIpc_" + fmtHex(process->pid), 0x120, 0);
   createEventFlag("/vmicDdEvfAin", 0x120, 0);
 
+  createSemaphore("SceAppMessaging0", 0x101, 1, 0x7fffffff);
   createSemaphore("SceAppMessaging1", 0x101, 1, 0x7fffffff);
   createSemaphore("SceLncSuspendBlock1", 0x101, 1, 10000);
 
   createShm("SceGlsSharedMemory", 0x202, 0x1a4, 262144);
-  createShm("SceShellCoreUtil", 0x202, 0x1a4, 16384);
+  {
+    auto util = createShm("SceShellCoreUtil", 0x202, 0x1a4, 16384);
+    orbis::uint64_t header = 0x6ed81ede6df17259;
+    orbis::IoVec vec{
+        .base = &header,
+        .len = sizeof(header),
+    };
+    orbis::Uio uio{
+        .iov = &vec,
+        .iovcnt = 1,
+    };
+
+    util->ops->write(util.get(), &uio, process->threadsMap.get(0).get());
+  }
   createShm("SceNpTpip", 0x202, 0x1ff, 43008);
 
   createShm("vmicDdShmAin", 0x202, 0x1b6, 43008);
diff --git a/rpcsx/ipmi.hpp b/rpcsx/ipmi.hpp
index 85afe2ea5..ce976dcbd 100644
--- a/rpcsx/ipmi.hpp
+++ b/rpcsx/ipmi.hpp
@@ -266,7 +266,8 @@ orbis::EventFlag *createEventFlag(std::string_view name, uint32_t attrs,
                                   uint64_t initPattern);
 orbis::Semaphore *createSemaphore(std::string_view name, uint32_t attrs,
                                   uint64_t initCount, uint64_t maxCount);
-void createShm(const char *name, uint32_t flags, uint32_t mode, uint64_t size);
+rx::Ref<orbis::File> createShm(const char *name, uint32_t flags, uint32_t mode,
+                               uint64_t size);
 
 void createMiniSysCoreObjects(orbis::Process *process);
 void createSysAvControlObjects(orbis::Process *process);
diff --git a/rpcsx/main.cpp b/rpcsx/main.cpp
index 627cd3119..4e3a17aa3 100644
--- a/rpcsx/main.cpp
+++ b/rpcsx/main.cpp
@@ -57,7 +57,6 @@
 #include <cstdint>
 #include <filesystem>
 
-static int g_gpuPid;
 extern bool allowMonoDebug;
 
 __attribute__((no_stack_protector)) static void
@@ -147,11 +146,6 @@ handle_signal(int sig, siginfo_t *info, void *ucontext) {
                                               orbis::kNoteExit, sig);
   }
 
-  if (g_gpuPid > 0) {
-    // stop gpu thread
-    // ::kill(g_gpuPid, SIGINT);
-  }
-
   allowMonoDebug = true;
   if (sig != SIGINT) {
     char buf[128] = "";
@@ -361,15 +355,15 @@ static void onSysExit(orbis::Thread *thread, int id, uint64_t *args,
   funlockfile(stderr);
 }
 
-static void guestInitDev(orbis::Thread *thread) {
+static void guestInitDev(orbis::Thread *thread, int stdinFd, int stdoutFd,
+                         int stderrFd) {
   auto dmem0 = createDmemCharacterDevice(0);
   dmem0->open(&orbis::g_context->dmem, "", 0, 0, thread);
 
   auto dce = createDceCharacterDevice(thread->tproc);
   orbis::g_context->dceDevice = dce;
 
-  auto ttyFd = ::open("tty.txt", O_CREAT | O_TRUNC | O_WRONLY, 0666);
-  auto consoleDev = createConsoleCharacterDevice(STDIN_FILENO, ttyFd);
+  auto consoleDev = createConsoleCharacterDevice(stdinFd, stdoutFd);
   auto mbus = static_cast<MBusDevice *>(createMBusCharacterDevice());
   auto mbusAv = static_cast<MBusAVDevice *>(createMBusAVCharacterDevice());
 
@@ -461,7 +455,7 @@ static void guestInitDev(orbis::Thread *thread) {
   vfs::addDevice("devctl", createDevCtlCharacterDevice());
   vfs::addDevice("uvd", createUVDCharacterDevice());
   vfs::addDevice("vce", createVCECharacterDevice());
-  vfs::addDevice("evlg1", createEvlgCharacterDevice(ttyFd));
+  vfs::addDevice("evlg1", createEvlgCharacterDevice(stderrFd));
   vfs::addDevice("srtc", createSrtcCharacterDevice());
   vfs::addDevice("sshot", createScreenShotCharacterDevice());
   vfs::addDevice("lvdctl", createLvdCtlCharacterDevice());
@@ -926,8 +920,6 @@ int main(int argc, const char *argv[]) {
         return 1;
       }
 
-      rx::println("mounting '{}' to virtual '{}'", argv[argIndex + 1],
-                  argv[argIndex + 2]);
       if (!std::filesystem::is_directory(argv[argIndex + 1])) {
         rx::println(stderr, "Directory '{}' not exists", argv[argIndex + 1]);
         return 1;
@@ -945,8 +937,6 @@ int main(int argc, const char *argv[]) {
         return 1;
       }
 
-      rx::println("mounting firmware '{}'", argv[argIndex + 1]);
-
       vfs::mount("/", createHostIoDevice(argv[argIndex + 1], "/"));
 
       argIndex += 2;
@@ -1032,6 +1022,18 @@ int main(int argc, const char *argv[]) {
     break;
   }
 
+  setvbuf(stdout, nullptr, _IONBF, 0);
+  auto stdinFd = dup(STDIN_FILENO);
+  auto stdoutFd = dup(STDOUT_FILENO);
+  auto stderrFd = dup(STDERR_FILENO);
+
+  auto logFd = ::open("log-init.txt", O_CREAT | O_TRUNC | O_WRONLY, 0666);
+  dup2(logFd, STDOUT_FILENO);
+  dup2(logFd, STDERR_FILENO);
+  close(logFd);
+
+  rx::println(stderr, "RPCSX v{}", rx::getVersion().toString());
+
   setupSigHandlers();
   orbis::constructAllGlobals();
   orbis::g_context->deviceEventEmitter = orbis::knew<orbis::EventEmitter>();
@@ -1042,13 +1044,8 @@ int main(int argc, const char *argv[]) {
   orbis::fmem::initialize(2ull * 1024 * 1024 * 1024);
 
   rx::startWatchdog();
-  rx::createGpuDevice();
   vfs::initialize();
 
-  while (orbis::g_context->gpuDevice == nullptr) {
-    std::this_thread::yield();
-  }
-
   std::vector<std::string> guestArgv(argv + argIndex, argv + argc);
   if (guestArgv.empty()) {
     guestArgv.emplace_back("/mini-syscore.elf");
@@ -1097,7 +1094,7 @@ int main(int argc, const char *argv[]) {
           .flags = 0,
           .item =
               {
-                  .total = 2ul * 1024 * 1024 * 1024,
+                  .total = 0x1C000000,
               },
       },
       {
@@ -1260,7 +1257,7 @@ int main(int argc, const char *argv[]) {
     executableModule->dynType = orbis::DynType::Ps5;
   }
 
-  guestInitDev(mainThread);
+  guestInitDev(mainThread, stdinFd, stdoutFd, stderrFd);
   guestInitFd(mainThread);
 
   // data transfer mode
diff --git a/rpcsx/ops.cpp b/rpcsx/ops.cpp
index b9a1d4a7e..886a24da5 100644
--- a/rpcsx/ops.cpp
+++ b/rpcsx/ops.cpp
@@ -29,10 +29,10 @@
 #include <string_view>
 
 #ifdef __linux
+#include <csignal>
 #include <sys/prctl.h>
 #include <sys/socket.h>
 #include <unistd.h>
-#include <csignal>
 #endif
 
 #include <thread>
@@ -106,7 +106,7 @@ loadPrx(orbis::Thread *thread, std::string_view name, bool relocate,
   module->importedModules.clear();
   module->importedModules.reserve(module->neededModules.size());
 
-  for (auto mod : module->neededModules) {
+  for (const auto &mod : module->neededModules) {
     if (auto it = loadedModules.find(std::string_view(mod.name));
         it != loadedModules.end()) {
       module->importedModules.emplace_back(it->second);
@@ -329,7 +329,7 @@ orbis::SysResult dynlib_load_prx(orbis::Thread *thread,
     }
   }
 
-  auto [result, module] = loadPrx(thread, path, true);
+  auto [result, resultModule] = loadPrx(thread, path, true);
   if (result.isError()) {
     return result;
   }
@@ -342,14 +342,23 @@ orbis::SysResult dynlib_load_prx(orbis::Thread *thread,
       loadedModules[module->moduleName] = module;
     }
 
+    bool wasNeeded = false;
+
     for (auto [id, module] : thread->tproc->modulesMap) {
       module->importedModules.clear();
       module->importedModules.reserve(module->neededModules.size());
 
-      for (auto mod : module->neededModules) {
+      for (auto &mod : module->neededModules) {
         if (auto it = loadedModules.find(std::string_view(mod.name));
             it != loadedModules.end()) {
           module->importedModules.emplace_back(it->second);
+          if (id != ModuleHandle{1}) {
+            it->second->refCount = 1;
+          }
+
+          if (it->second == resultModule) {
+            wasNeeded = true;
+          }
           continue;
         }
 
@@ -358,9 +367,13 @@ orbis::SysResult dynlib_load_prx(orbis::Thread *thread,
 
       module->relocate(thread->tproc);
     }
+
+    if (!wasNeeded) {
+      resultModule->refCount = 1;
+    }
   }
 
-  *pHandle = module->id;
+  *pHandle = resultModule->id;
   return {};
 }
 orbis::SysResult dynlib_unload_prx(orbis::Thread *thread,
@@ -593,6 +606,7 @@ SysResult processNeeded(Thread *thread) {
       module->importedModules.push_back({});
     }
 
+    module->refCount = 1;
     module->relocate(thread->tproc);
   }
 
@@ -788,6 +802,7 @@ void block(Thread *thread) {
   sigset_t set;
   sigemptyset(&set);
   sigaddset(&set, SIGUSR1);
+  sigaddset(&set, SIGUSR2);
   sigaddset(&set, SIGSYS);
   if (pthread_sigmask(SIG_BLOCK, &set, nullptr)) {
     perror("pthread_sigmask block");
@@ -819,6 +834,7 @@ void unblock(Thread *thread) {
   sigset_t set;
   sigemptyset(&set);
   sigaddset(&set, SIGUSR1);
+  sigaddset(&set, SIGUSR2);
   sigaddset(&set, SIGSYS);
   if (pthread_sigmask(SIG_UNBLOCK, &set, nullptr)) {
     perror("pthread_sigmask unblock");