diff --git a/.ci/build-linux-aarch64.sh b/.ci/build-linux-aarch64.sh
index 54b2e49d3..0ee670ea3 100755
--- a/.ci/build-linux-aarch64.sh
+++ b/.ci/build-linux-aarch64.sh
@@ -1,7 +1,5 @@
 #!/bin/sh -ex
 
-cd rpcs3/ || exit 1
-
 git config --global --add safe.directory '*'
 
 # Pull all the submodules except llvm and opencv
@@ -9,33 +7,30 @@ git config --global --add safe.directory '*'
 git submodule -q update --init $(awk '/path/ && !/llvm/ && !/opencv/ { print $3 }' .gitmodules)
 
 if [ "$COMPILER" = "gcc" ]; then
-    # These are set in the dockerfile
-    export CC="${GCC_BINARY}"
-    export CXX="${GXX_BINARY}"
-    export LINKER=gold
+    export CC=gcc-14
+    export CXX=g++-14
 else
-    export CC="${CLANG_BINARY}"
-    export CXX="${CLANGXX_BINARY}"
-    export LINKER="${LLD_BINARY}"
+    export CC=clang
+    export CXX=clang++
+    export CFLAGS="$CFLAGS -fuse-ld=lld"
 fi
 
-export CFLAGS="$CFLAGS -fuse-ld=${LINKER}"
-export CXXFLAGS="$CXXFLAGS -fuse-ld=${LINKER}"
 cmake -B build \
     -DCMAKE_INSTALL_PREFIX=/usr \
-    -DUSE_NATIVE_INSTRUCTIONS=OFF \
-    -DUSE_PRECOMPILED_HEADERS=OFF \
     -DCMAKE_C_FLAGS="$CFLAGS" \
     -DCMAKE_CXX_FLAGS="$CFLAGS" \
+    -DUSE_NATIVE_INSTRUCTIONS=OFF \
+    -DUSE_PRECOMPILED_HEADERS=OFF \
     -DUSE_SYSTEM_CURL=ON \
-    -DUSE_SDL=ON \
-    -DUSE_SYSTEM_SDL=ON \
+    -DUSE_SDL=OFF \
+    -DUSE_SYSTEM_FFMPEG=OFF \
+    -DUSE_SYSTEM_CURL=OFF \
+    -DUSE_SYSTEM_OPENAL=OFF \
     -DUSE_SYSTEM_FFMPEG=OFF \
-    -DUSE_SYSTEM_OPENCV=ON \
     -DUSE_DISCORD_RPC=ON \
     -DOpenGL_GL_PREFERENCE=LEGACY \
-    -DLLVM_DIR=/opt/llvm/lib/cmake/llvm \
     -DSTATIC_LINK_LLVM=ON \
+    -DBUILD_LLVM=on \
     -DWITH_RPCSX=off \
     -DWITH_RPCS3=on \
     -DWITH_RPCS3_QT_UI=on \
diff --git a/.ci/build-linux.sh b/.ci/build-linux.sh
index 15c76cac7..54b5adc83 100755
--- a/.ci/build-linux.sh
+++ b/.ci/build-linux.sh
@@ -1,7 +1,5 @@
 #!/bin/sh -ex
 
-cd rpcs3/ || exit 1
-
 git config --global --add safe.directory '*'
 
 # Pull all the submodules except llvm and opencv
@@ -11,40 +9,29 @@ git submodule -q update --init $(awk '/path/ && !/llvm/ && !/opencv/ { print $3
 
 if [ "$COMPILER" = "gcc" ]; then
     # These are set in the dockerfile
-    export CC="${GCC_BINARY}"
-    export CXX="${GXX_BINARY}"
-    export LINKER=gold
-    # We need to set the following variables for LTO to link properly
-    export AR=/usr/bin/gcc-ar-"$GCCVER"
-    export RANLIB=/usr/bin/gcc-ranlib-"$GCCVER"
-    export CFLAGS="-fuse-linker-plugin"
+    export CC=gcc-14
+    export CXX=g++-14
 else
-    export CC="${CLANG_BINARY}"
-    export CXX="${CLANGXX_BINARY}"
-    export LINKER=lld
-    export AR=/usr/bin/llvm-ar-"$LLVMVER"
-    export RANLIB=/usr/bin/llvm-ranlib-"$LLVMVER"
+    export CC=clang
+    export CXX=clang++
+    export LD=clang
+    export CFLAGS="$CFLAGS -fuse-ld=lld"
 fi
 
-export CFLAGS="$CFLAGS -fuse-ld=${LINKER}"
-
 cmake -B build \
     -DCMAKE_INSTALL_PREFIX=/usr \
-    -DUSE_NATIVE_INSTRUCTIONS=OFF \
-    -DUSE_PRECOMPILED_HEADERS=OFF \
     -DCMAKE_C_FLAGS="$CFLAGS" \
     -DCMAKE_CXX_FLAGS="$CFLAGS" \
-    -DCMAKE_AR="$AR" \
-    -DCMAKE_RANLIB="$RANLIB" \
-    -DUSE_SYSTEM_CURL=ON \
-    -DUSE_SDL=ON \
-    -DUSE_SYSTEM_SDL=ON \
+    -DUSE_NATIVE_INSTRUCTIONS=OFF \
+    -DUSE_PRECOMPILED_HEADERS=OFF \
+    -DUSE_SDL=OFF \
+    -DUSE_SYSTEM_CURL=OFF \
+    -DUSE_SYSTEM_OPENAL=OFF \
+    -DUSE_SYSTEM_FFMPEG=OFF \
-    -DUSE_SYSTEM_OPENCV=ON \
     -DUSE_DISCORD_RPC=ON \
     -DOpenGL_GL_PREFERENCE=LEGACY \
-    -DLLVM_DIR=/opt/llvm/lib/cmake/llvm \
     -DSTATIC_LINK_LLVM=ON \
+    -DBUILD_LLVM=on \
     -DWITH_RPCSX=off \
     -DWITH_RPCS3=on \
     -DWITH_RPCS3_QT_UI=on \
diff --git a/.ci/deploy-linux.sh b/.ci/deploy-linux.sh
index e95c25a97..48a94c0d4 100755
--- a/.ci/deploy-linux.sh
+++ b/.ci/deploy-linux.sh
@@ -7,10 +7,10 @@ CPU_ARCH="${1:-x86_64}"
 if [ "$DEPLOY_APPIMAGE" = "true" ]; then
     DESTDIR=AppDir ninja install
 
-    curl -fsSLo /usr/bin/linuxdeploy "https://github.com/linuxdeploy/linuxdeploy/releases/download/continuous/linuxdeploy-$CPU_ARCH.AppImage"
-    chmod +x /usr/bin/linuxdeploy
-    curl -fsSLo /usr/bin/linuxdeploy-plugin-qt "https://github.com/linuxdeploy/linuxdeploy-plugin-qt/releases/download/continuous/linuxdeploy-plugin-qt-$CPU_ARCH.AppImage"
-    chmod +x /usr/bin/linuxdeploy-plugin-qt
+    sudo curl -fsSLo /usr/bin/linuxdeploy "https://github.com/linuxdeploy/linuxdeploy/releases/download/continuous/linuxdeploy-$CPU_ARCH.AppImage"
+    sudo chmod a+x /usr/bin/linuxdeploy
+    sudo curl -fsSLo /usr/bin/linuxdeploy-plugin-qt "https://github.com/linuxdeploy/linuxdeploy-plugin-qt/releases/download/continuous/linuxdeploy-plugin-qt-$CPU_ARCH.AppImage"
+    sudo chmod a+x /usr/bin/linuxdeploy-plugin-qt
 
     curl -fsSLo linuxdeploy-plugin-checkrt.sh https://github.com/darealshinji/linuxdeploy-plugin-checkrt/releases/download/continuous/linuxdeploy-plugin-checkrt.sh
     chmod +x ./linuxdeploy-plugin-checkrt.sh
diff --git a/.clangd b/.clangd
index f7e545dc3..b9be9e0aa 100644
--- a/.clangd
+++ b/.clangd
@@ -1,2 +1,3 @@
 CompileFlags:
   Add: [-Wall, -Wextra, -Wno-missing-designated-field-initializers]
+  Remove: [ -fno-lifetime-dse ]
diff --git a/.github/workflows/rpcs3.yml b/.github/workflows/rpcs3.yml
index 6ca24bc05..d09e10357 100644
--- a/.github/workflows/rpcs3.yml
+++ b/.github/workflows/rpcs3.yml
@@ -29,16 +29,13 @@ jobs:
       matrix:
         include:
           - os: ubuntu-24.04
-            docker_img: "rpcs3/rpcs3-ci-jammy:1.4"
-            build_sh: "rpcs3/.ci/build-linux.sh"
+            build_sh: ".ci/build-linux.sh"
             compiler: clang
           - os: ubuntu-24.04
-            docker_img: "rpcs3/rpcs3-ci-jammy:1.4"
-            build_sh: "rpcs3/.ci/build-linux.sh"
+            build_sh: ".ci/build-linux.sh"
             compiler: gcc
           - os: ubuntu-24.04-arm
-            docker_img: "rpcs3/rpcs3-ci-jammy-aarch64:1.4"
-            build_sh: "rpcs3/.ci/build-linux-aarch64.sh"
+            build_sh: ".ci/build-linux-aarch64.sh"
             compiler: clang
     name: RPCS3 Qt UI (Legacy) for Linux ${{ matrix.os }} ${{ matrix.compiler }}
     runs-on: ${{ matrix.os }}
@@ -46,9 +43,8 @@ jobs:
       CCACHE_DIR: ${{ github.workspace }}/ccache
       CI_HAS_ARTIFACTS: true
      DEPLOY_APPIMAGE: true
-      APPDIR: "/rpcs3/build/appdir"
-      ARTDIR: "/root/artifacts"
-      RELEASE_MESSAGE: "/rpcs3/GitHubReleaseMessage.txt"
+      APPDIR: "./appdir"
+      ARTDIR: "./artifacts"
       COMPILER: ${{ matrix.compiler }}
       RX_VERSION: "Unknown"
       RX_SHA: "Unknown"
@@ -66,16 +62,26 @@ jobs:
           restore-keys: |
             ${{ runner.os }}-ccache-${{ matrix.compiler }}-${{ runner.arch }}-
 
-      - name: Docker setup and build
+      - name: Setup dependencies
         run: |
-          docker pull --quiet ${{ matrix.docker_img }}
-          docker run \
-            -v $PWD:/rpcs3 \
-            --env-file .ci/docker.env \
-            -v ${{ env.CCACHE_DIR }}:/root/.ccache \
-            -v ${{ github.workspace }}/artifacts:/root/artifacts \
-            ${{ matrix.docker_img }} \
-            ${{ matrix.build_sh }}
+          echo "Types: deb" | sudo tee -a /etc/apt/sources.list.d/ubuntu.sources
+          echo "URIs: ${{ matrix.os == 'ubuntu-24.04-arm' && 'http://ports.ubuntu.com/ubuntu-ports' || 'http://azure.archive.ubuntu.com/ubuntu/' }}" | sudo tee -a /etc/apt/sources.list.d/ubuntu.sources
+          echo "Suites: plucky plucky-updates plucky-security" | sudo tee -a /etc/apt/sources.list.d/ubuntu.sources
+          echo "Components: main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/ubuntu.sources
+          echo "Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg" | sudo tee -a /etc/apt/sources.list.d/ubuntu.sources
+
+          sudo apt update
+          sudo apt install -y cmake build-essential libunwind-dev \
+            libvulkan-dev
vulkan-validationlayers \ + libsox-dev ninja-build libasound2-dev libglfw3-dev nasm libudev-dev \ + libpulse-dev libopenal-dev libglew-dev zlib1g-dev libedit-dev \ + libevdev-dev libjack-dev libsndio-dev libglvnd-dev \ + qt6-base-dev qt6-svg-dev qt6-base-private-dev qt6-multimedia-dev \ + clang lld gcc-14 g++-14 \ + + - name: Build + run: | + ${{ matrix.build_sh }} RX_VERSION=`cat .rx.version | awk -F'-' '{print $1}'` RX_SHA=`cat .rx.version | awk -F'-' '{print $5}'` diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt index 01f2b8938..28c39d893 100644 --- a/3rdparty/CMakeLists.txt +++ b/3rdparty/CMakeLists.txt @@ -104,6 +104,9 @@ add_subdirectory(zlib EXCLUDE_FROM_ALL) # ZSTD add_subdirectory(zstd EXCLUDE_FROM_ALL) +# workaround for LLVM +add_library(zstd::libzstd_static ALIAS libzstd_static) + # 7zip sdk add_subdirectory(7zip EXCLUDE_FROM_ALL) @@ -342,10 +345,6 @@ if(NOT MSVC AND NOT ANDROID AND NOT WITHOUT_OPENGLEW) target_link_libraries(3rdparty_glew INTERFACE GLEW::GLEW) endif() - -# LLVM -add_subdirectory(llvm EXCLUDE_FROM_ALL) - # WOLFSSL add_subdirectory(wolfssl EXCLUDE_FROM_ALL) diff --git a/3rdparty/llvm/CMakeLists.txt b/3rdparty/llvm/CMakeLists.txt index b2880f7b9..0e11aac59 100644 --- a/3rdparty/llvm/CMakeLists.txt +++ b/3rdparty/llvm/CMakeLists.txt @@ -1,6 +1,6 @@ if(WITH_LLVM) - set(USE_LLVM_VERSION 19.1.7) + set(USE_LLVM_VERSION 20.1.3) if (NOT MSVC) check_cxx_compiler_flag("-msse -msse2 -mcx16" COMPILER_X86) check_cxx_compiler_flag("-march=armv8-a+lse" COMPILER_ARM) @@ -42,7 +42,18 @@ if(WITH_LLVM) set(LLVM_DOWNLOAD_BINARY "") - if ((WIN32 AND MSVC) OR (LINUX AND NOT ANDROID)) + if (ANDROID) + string(APPEND LLVM_DOWNLOAD_BINARY llvm-android-) + + if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + string(APPEND LLVM_DOWNLOAD_BINARY arm64-v8a) + else() + string(APPEND LLVM_DOWNLOAD_BINARY x64) + endif() + + string(APPEND LLVM_DOWNLOAD_BINARY .7z) + elseif ((WIN32 AND MSVC) OR LINUX) + string(APPEND LLVM_DOWNLOAD_BINARY llvm-) if (WIN32) string(APPEND LLVM_DOWNLOAD_BINARY windows-) else() @@ -62,6 +73,8 @@ if(WITH_LLVM) string(APPEND LLVM_DOWNLOAD_BINARY MD) endif() endif() + + string(APPEND LLVM_DOWNLOAD_BINARY .7z) endif() if(CMAKE_SYSTEM_NAME STREQUAL "Linux") @@ -79,55 +92,76 @@ if(WITH_LLVM) # LLVM needs to be built out-of-tree add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/llvm/llvm ${CMAKE_CURRENT_BINARY_DIR}/llvm_build EXCLUDE_FROM_ALL) set(LLVM_DIR "${CMAKE_CURRENT_BINARY_DIR}/llvm_build/lib/cmake/llvm/") + set(MLIR_DIR "${CMAKE_CURRENT_BINARY_DIR}/llvm_build/lib/cmake/mlir/") else() set(LLVM_DOWNLOAD_LINK https://github.com/RPCSX/llvm-build/releases/download/${USE_LLVM_VERSION}) - if (NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.7z" AND + if (NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}" AND NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.unpacked") message(STATUS "Downloading LLVM") file(DOWNLOAD ${LLVM_DOWNLOAD_LINK}/${LLVM_DOWNLOAD_BINARY} - "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.7z.tmp" SHOW_PROGRESS + "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.tmp" SHOW_PROGRESS STATUS FILE_STATUS) list(GET FILE_STATUS 0 STATUS_CODE) if (NOT STATUS_CODE EQUAL 0) - file(REMOVE "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.7z.tmp") + file(REMOVE "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.tmp") message(FATAL_ERROR "Failed to download LLVM") 
endif() file(RENAME - "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.7z.tmp" - "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.7z" + "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.tmp" + "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}" ) endif() if(NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.unpacked") - file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}") - execute_process(COMMAND ${CMAKE_COMMAND} -E tar xzf "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.7z" - WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}" RESULT_VARIABLE STATUS_CODE) + file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.dir") + execute_process(COMMAND ${CMAKE_COMMAND} -E tar xzf "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}" + WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.dir" RESULT_VARIABLE STATUS_CODE) if (NOT STATUS_CODE EQUAL 0) message(FATAL_ERROR "Failed to unpack LLVM") endif() file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.unpacked") - file(REMOVE "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.7z") + file(REMOVE "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}") endif() - file(GLOB LLVM_ROOT_DIR_LIST LIST_DIRECTORIES true "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}/*") + file(GLOB LLVM_ROOT_DIR_LIST LIST_DIRECTORIES true "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.dir/*") list(GET LLVM_ROOT_DIR_LIST 0 LLVM_ROOT_DIR) - set(LLVM_DIR "${LLVM_ROOT_DIR}/lib/cmake/llvm") + set(LLVM_DIR "${LLVM_ROOT_DIR}/lib/cmake/llvm") + set(MLIR_DIR "${LLVM_ROOT_DIR}/lib/cmake/mlir") if (NOT EXISTS "${LLVM_DIR}") message(FATAL_ERROR "Failed to locate LLVM: ${LLVM_ROOT_DIR}") endif() + + if (NOT EXISTS "${MLIR_DIR}") + message(FATAL_ERROR "Failed to locate MLIR: ${LLVM_ROOT_DIR}") + endif() + + if (NOT ANDROID) + set(Clang_DIR "${LLVM_ROOT_DIR}/lib/cmake/clang") + if (NOT EXISTS "${Clang_DIR}") + message(FATAL_ERROR "Failed to locate Clang: ${LLVM_ROOT_DIR}") + endif() + endif() endif() set(STATIC_LINK_LLVM ON CACHE BOOL "Link against LLVM statically. This will get set to ON if you build LLVM from the submodule." FORCE) find_package(LLVM ${USE_LLVM_VERSION} CONFIG) + find_package(MLIR ${USE_LLVM_VERSION} CONFIG) - if(NOT LLVM_FOUND) + if(NOT LLVM_FOUND OR NOT MLIR_FOUND) message(FATAL_ERROR "Couldn't build LLVM from the submodule. You might need to run `git submodule update --init`") endif() + + if (NOT ANDROID) + find_package(Clang ${USE_LLVM_VERSION} CONFIG) + if(NOT Clang_FOUND) + message(FATAL_ERROR "Couldn't build Clang from the submodule. 
You might need to run `git submodule update --init`") + endif() + endif() else() message(STATUS "Using prebuilt or system LLVM") @@ -136,15 +170,36 @@ if(WITH_LLVM) set(LLVM_DIR ${CMAKE_SOURCE_DIR}/${LLVM_DIR}) endif() + if (MLIR_DIR AND NOT IS_ABSOLUTE "${MLIR_DIR}") + set(MLIR_DIR ${CMAKE_SOURCE_DIR}/${MLIR_DIR}) + endif() + + if (Clang_DIR AND NOT IS_ABSOLUTE "${Clang_DIR}") + set(Clang_DIR ${CMAKE_SOURCE_DIR}/${Clang_DIR}) + endif() + find_package(LLVM CONFIG) + find_package(MLIR CONFIG) if (NOT LLVM_FOUND) message(FATAL_ERROR "Can't find LLVM libraries from the CMAKE_PREFIX_PATH path or LLVM_DIR. \ Enable BUILD_LLVM option to build LLVM from included as a git submodule.") endif() if (LLVM_VERSION VERSION_LESS 18) - message(FATAL_ERROR "Found LLVM version ${LLVM_VERSION}. Required version 18 or above. \ - Enable BUILD_LLVM option to build LLVM from included as a git submodule.") + message(FATAL_ERROR "Found LLVM version ${LLVM_VERSION}. Required version 18 or above.") + endif() + + if (NOT MLIR_FOUND) + message(FATAL_ERROR "Can't find MLIR libraries from the CMAKE_PREFIX_PATH path or MLIR_DIR") + endif() + + + if (NOT ANDROID) + find_package(Clang CONFIG) + + if (NOT Clang_FOUND) + message(FATAL_ERROR "Can't find Clang from the CMAKE_PREFIX_PATH path or Clang_DIR.") + endif() endif() endif() @@ -164,9 +219,9 @@ if(WITH_LLVM) endif() # For Linux even if BUILD_LLVM is disabled (precompiled llvm used) - if(CMAKE_SYSTEM_NAME STREQUAL "Linux") - list (APPEND LLVM_ADDITIONAL_LIBS PerfJITEvents) - endif() + # if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + # list (APPEND LLVM_ADDITIONAL_LIBS PerfJITEvents) + # endif() llvm_map_components_to_libnames(LLVM_LIBS ${LLVM_TARGETS_TO_BUILD} @@ -176,17 +231,42 @@ if(WITH_LLVM) MCJIT Passes ) + + set(MLIR_LIBS MLIRIR MLIRInferTypeOpInterface MLIRFuncDialect MLIRSCFDialect MLIRSCFToControlFlow MLIRAffineAnalysis MLIRAsyncToLLVM) else() - set(LLVM_LIBS LLVM) + set(LLVM_LIBS LLVM MLIR) endif() + list(APPEND CMAKE_MODULE_PATH "${MLIR_CMAKE_DIR}") + list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}") + + include(TableGen) + include(AddLLVM) + include(AddMLIR) + + if (NOT ANDROID) + list(APPEND CMAKE_MODULE_PATH "${CLANG_CMAKE_DIR}") + include(AddClang) + + get_target_property(CLANG_EXECUTABLE clang LOCATION) + endif() + # include(HandleLLVMOptions) + add_library(3rdparty_llvm INTERFACE) target_link_libraries(3rdparty_llvm INTERFACE ${LLVM_LIBS}) target_include_directories(3rdparty_llvm INTERFACE ${LLVM_INCLUDE_DIRS}) separate_arguments(LLVM_DEFINITIONS_LIST NATIVE_COMMAND ${LLVM_DEFINITIONS}) target_compile_definitions(3rdparty_llvm INTERFACE ${LLVM_DEFINITIONS_LIST} LLVM_AVAILABLE) + add_library(3rdparty_mlir INTERFACE) + target_link_libraries(3rdparty_mlir INTERFACE 3rdparty_llvm ${MLIR_LIBS}) + target_include_directories(3rdparty_mlir INTERFACE ${MLIR_INCLUDE_DIRS}) + separate_arguments(MLIR_DEFINITIONS_LIST NATIVE_COMMAND ${MLIR_DEFINITIONS}) + target_compile_definitions(3rdparty_mlir INTERFACE ${MLIR_DEFINITIONS_LIST} MLIR_AVAILABLE) + add_library(3rdparty::llvm ALIAS 3rdparty_llvm) + add_library(3rdparty::mlir ALIAS 3rdparty_mlir) else() add_library(3rdparty::llvm ALIAS 3rdparty_dummy_lib) + add_library(3rdparty::mlir ALIAS 3rdparty_dummy_lib) endif() diff --git a/CMakeLists.txt b/CMakeLists.txt index 600486065..7d4fa1202 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -121,6 +121,8 @@ endif() add_subdirectory(3rdparty EXCLUDE_FROM_ALL) add_subdirectory(rx EXCLUDE_FROM_ALL) +include(3rdparty/llvm/CMakeLists.txt) + if (NOT RX_TAG) set(RX_TAG 0) endif() @@ 
-183,9 +185,10 @@ if (WITH_RPCSX) add_subdirectory(tools) add_subdirectory(orbis-kernel) - add_subdirectory(rpcsx) endif() +add_subdirectory(rpcsx) + if (WITH_RPCS3) include(ConfigureCompiler) include(CheckFunctionExists) diff --git a/android/CMakeLists.txt b/android/CMakeLists.txt index f062e405f..f8e93167c 100644 --- a/android/CMakeLists.txt +++ b/android/CMakeLists.txt @@ -5,7 +5,7 @@ set(CMAKE_CXX_STANDARD 20) set(CMAKE_POSITION_INDEPENDENT_CODE on) set(FFMPEG_VERSION 5.1) -set(LLVM_VERSION 19.1) +set(LLVM_VERSION 20.1.2) option(USE_ARCH "Specify arch to build" "") @@ -88,26 +88,6 @@ target_link_libraries(3rdparty_ffmpeg INTERFACE add_dependencies(3rdparty_ffmpeg ffmpeg-unpack) - -if(NOT EXISTS ${CMAKE_BINARY_DIR}/llvm-${LLVM_VERSION}.tar.gz) - message(STATUS "Downloading llvm-${LLVM_VERSION}") - file(DOWNLOAD - https://github.com/RPCS3-Android/llvm-android/releases/download/${LLVM_VERSION}/llvm-${RPCS3_DOWNLOAD_ARCH}-Android.tar.gz - ${CMAKE_BINARY_DIR}/llvm-${LLVM_VERSION}.tar.gz - SHOW_PROGRESS - ) -endif() - -set(LLVM_DIR ${CMAKE_BINARY_DIR}/llvm-${LLVM_VERSION}.7-Android/lib/cmake/llvm) - -if (NOT EXISTS ${LLVM_DIR}) - message(STATUS "Unpacking llvm-${LLVM_VERSION}") - execute_process( - COMMAND ${CMAKE_COMMAND} -E tar xzf ${CMAKE_BINARY_DIR}/llvm-${LLVM_VERSION}.tar.gz - WORKING_DIRECTORY ${CMAKE_BINARY_DIR} - ) -endif() - set(WITH_RPCSX off) set(WITH_RPCS3 on) set(USE_SYSTEM_LIBUSB off) @@ -117,10 +97,10 @@ set(USE_SYSTEM_OPENCV off) set(USE_SYSTEM_FFMPEG off) set(USE_FAUDIO off) set(USE_SDL2 off) -set(BUILD_LLVM off) +set(BUILD_LLVM on) set(STATIC_LINK_LLVM on) -set(DISABLE_LTO on) -set(USE_LTO off) +set(DISABLE_LTO off) +set(USE_LTO on) set(USE_OPENSL off) set(ASMJIT_NO_SHM_OPEN on) set(USE_SYSTEM_ZLIB on) diff --git a/ps3fw/cellGcmSys.cpp b/ps3fw/cellGcmSys.cpp index c8ecfa215..31b914a76 100644 --- a/ps3fw/cellGcmSys.cpp +++ b/ps3fw/cellGcmSys.cpp @@ -152,7 +152,7 @@ vm::ptr cellGcmGetReportDataAddressLocation(u32 index, u32 lo cellGcmSys.error("cellGcmGetReportDataAddressLocation: Wrong local index (%d)", index); } - return vm::cast(rsx::get_current_renderer()->label_addr + ::offset32(&RsxReports::report) + index * 0x10); + return vm::cast(rsx::get_current_renderer()->label_addr + OFFSET_OF(RsxReports, report) + index * 0x10); } u64 cellGcmGetTimeStamp(u32 index) @@ -164,7 +164,7 @@ u64 cellGcmGetTimeStamp(u32 index) cellGcmSys.error("cellGcmGetTimeStamp: Wrong local index (%d)", index); } - const u32 address = rsx::get_current_renderer()->label_addr + ::offset32(&RsxReports::report) + index * 0x10; + const u32 address = rsx::get_current_renderer()->label_addr + OFFSET_OF(RsxReports, report) + index * 0x10; return *vm::get_super_ptr(address); } @@ -193,7 +193,7 @@ u32 cellGcmGetNotifyDataAddress(u32 index) */ vm::ptr _cellGcmFunc12() { - return vm::ptr::make(rsx::get_current_renderer()->label_addr + ::offset32(&RsxReports::report)); // TODO + return vm::ptr::make(rsx::get_current_renderer()->label_addr + OFFSET_OF(RsxReports, report)); // TODO } u32 cellGcmGetReport(u32 type, u32 index) @@ -223,7 +223,7 @@ u32 cellGcmGetReportDataAddress(u32 index) cellGcmSys.error("cellGcmGetReportDataAddress: Wrong local index (%d)", index); } - return rsx::get_current_renderer()->label_addr + ::offset32(&RsxReports::report) + index * 0x10; + return rsx::get_current_renderer()->label_addr + OFFSET_OF(RsxReports, report) + index * 0x10; } u32 cellGcmGetReportDataLocation(u32 index, u32 location) diff --git a/ps3fw/cellGem.cpp b/ps3fw/cellGem.cpp index 2da34a643..15cfbd9cf 100644 --- 
a/ps3fw/cellGem.cpp +++ b/ps3fw/cellGem.cpp @@ -574,20 +574,7 @@ public: for (gem_controller& c : controllers) { ar(c.status, c.ext_status, c.ext_id, c.port, c.enabled_magnetometer, c.calibrated_magnetometer, c.enabled_filtering, c.enabled_tracking, c.enabled_LED, c.hue_set, c.rumble); - - // We need to add padding because we used bitwise serialization in version 1 - if (version < 2) - { - ar.add_padding(&gem_controller::rumble, &gem_controller::sphere_rgb); - } - ar(c.sphere_rgb, c.hue, c.distance_mm, c.radius, c.radius_valid, c.is_calibrating); - - if (version < 2) - { - ar.add_padding(&gem_controller::is_calibrating, &gem_controller::calibration_start_us); - } - ar(c.calibration_start_us); if (ar.is_writing() || version >= 2) diff --git a/ps3fw/cellSaveData.cpp b/ps3fw/cellSaveData.cpp index 1937f84bc..a2c5b5bda 100644 --- a/ps3fw/cellSaveData.cpp +++ b/ps3fw/cellSaveData.cpp @@ -1022,7 +1022,7 @@ savedata_op(ppu_thread& ppu, u32 operation, u32 version, vm::cptr dirName, listSet->focusPosition = CELL_SAVEDATA_FOCUSPOS_LISTHEAD; std::memset(result.get_ptr(), 0, - ::offset32(&CellSaveDataCBResult::userdata)); + OFFSET_OF(CellSaveDataCBResult, userdata)); // List Callback funcList(ppu, result, listGet, listSet); @@ -1313,7 +1313,7 @@ savedata_op(ppu_thread& ppu, u32 operation, u32 version, vm::cptr dirName, } std::memset(result.get_ptr(), 0, - ::offset32(&CellSaveDataCBResult::userdata)); + OFFSET_OF(CellSaveDataCBResult, userdata)); if (!funcDone) { @@ -1436,8 +1436,7 @@ savedata_op(ppu_thread& ppu, u32 operation, u32 version, vm::cptr dirName, { lv2_sleep(ppu, 250); - std::memset(result.get_ptr(), 0, - ::offset32(&CellSaveDataCBResult::userdata)); + std::memset(result.get_ptr(), 0, OFFSET_OF(CellSaveDataCBResult, userdata)); // Fixed Callback funcFixed(ppu, result, listGet, fixedSet); @@ -1780,7 +1779,7 @@ savedata_op(ppu_thread& ppu, u32 operation, u32 version, vm::cptr dirName, !save_entry.isNew ? 
::narrow((size_bytes / 1024) + statGet->sysSizeKB) : 0; std::memset(result.get_ptr(), 0, - ::offset32(&CellSaveDataCBResult::userdata)); + OFFSET_OF(CellSaveDataCBResult, userdata)); // Stat Callback funcStat(ppu, result, statGet, statSet); @@ -2036,7 +2035,7 @@ savedata_op(ppu_thread& ppu, u32 operation, u32 version, vm::cptr dirName, std::memset(fileSet.get_ptr(), 0, fileSet.size()); std::memset(fileGet->reserved, 0, sizeof(fileGet->reserved)); std::memset(result.get_ptr(), 0, - ::offset32(&CellSaveDataCBResult::userdata)); + OFFSET_OF(CellSaveDataCBResult, userdata)); funcFile(ppu, result, fileGet, fileSet); ppu.state += cpu_flag::wait; diff --git a/ps3fw/cellSpursSpu.cpp b/ps3fw/cellSpursSpu.cpp index 3bfa3d749..d68cf93f3 100644 --- a/ps3fw/cellSpursSpu.cpp +++ b/ps3fw/cellSpursSpu.cpp @@ -1215,7 +1215,7 @@ void spursSysServiceTraceUpdate(spu_thread& spu, SpursKernelContext* ctxt, u32 a if (((sysSrvMsgUpdateTrace & (1 << ctxt->spuNum)) != 0) || (arg3 != 0)) { // vm::reservation_acquire(ctxt->spurs.ptr(&CellSpurs::traceBuffer).addr()); - auto spurs = spu._ptr(0x80 - offset32(&CellSpurs::traceBuffer)); + auto spurs = spu._ptr(0x80 - OFFSET_OF(CellSpurs, traceBuffer)); if (ctxt->traceMsgCount != 0xffu || spurs->traceBuffer.addr() == 0u) { @@ -1238,7 +1238,7 @@ void spursSysServiceTraceUpdate(spu_thread& spu, SpursKernelContext* ctxt, u32 a if (notify) { - auto spurs = spu._ptr(0x2D80 - offset32(&CellSpurs::wklState1)); + auto spurs = spu._ptr(0x2D80 - OFFSET_OF(CellSpurs, wklState1)); sys_spu_thread_send_event(spu, spurs->spuPort, 2, 0); } } @@ -1427,12 +1427,12 @@ s32 spursTasksetProcessRequest(spu_thread& spu, s32 request, u32* taskId, u32* i // vm::reservation_op(vm::cast(ctxt->taskset.addr()), 128, [&]() { auto taskset = ctxt->taskset; - v128 waiting = vm::_ref(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::waiting)); - v128 running = vm::_ref(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::running)); - v128 ready = vm::_ref(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::ready)); - v128 pready = vm::_ref(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::pending_ready)); - v128 enabled = vm::_ref(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::enabled)); - v128 signalled = vm::_ref(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::signalled)); + v128 waiting = vm::_ref(ctxt->taskset.addr() + OFFSET_OF(CellSpursTaskset, waiting)); + v128 running = vm::_ref(ctxt->taskset.addr() + OFFSET_OF(CellSpursTaskset, running)); + v128 ready = vm::_ref(ctxt->taskset.addr() + OFFSET_OF(CellSpursTaskset, ready)); + v128 pready = vm::_ref(ctxt->taskset.addr() + OFFSET_OF(CellSpursTaskset, pending_ready)); + v128 enabled = vm::_ref(ctxt->taskset.addr() + OFFSET_OF(CellSpursTaskset, enabled)); + v128 signalled = vm::_ref(ctxt->taskset.addr() + OFFSET_OF(CellSpursTaskset, signalled)); // Verify taskset state is valid if ((waiting & running) != v128{} || (ready & pready) != v128{} || @@ -1599,12 +1599,12 @@ s32 spursTasksetProcessRequest(spu_thread& spu, s32 request, u32* taskId, u32* i spursHalt(spu); } - vm::_ref(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::waiting)) = waiting; - vm::_ref(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::running)) = running; - vm::_ref(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::ready)) = ready; - vm::_ref(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::pending_ready)) = v128{}; - vm::_ref(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::enabled)) = enabled; - vm::_ref(ctxt->taskset.addr() + 
::offset32(&CellSpursTaskset::signalled)) = signalled; + vm::_ref(ctxt->taskset.addr() + OFFSET_OF(CellSpursTaskset, waiting)) = waiting; + vm::_ref(ctxt->taskset.addr() + OFFSET_OF(CellSpursTaskset, running)) = running; + vm::_ref(ctxt->taskset.addr() + OFFSET_OF(CellSpursTaskset, ready)) = ready; + vm::_ref(ctxt->taskset.addr() + OFFSET_OF(CellSpursTaskset, pending_ready)) = v128{}; + vm::_ref(ctxt->taskset.addr() + OFFSET_OF(CellSpursTaskset, enabled)) = enabled; + vm::_ref(ctxt->taskset.addr() + OFFSET_OF(CellSpursTaskset, signalled)) = signalled; std::memcpy(spu._ptr(0x2700), spu._ptr(0x100), 128); // Copy data } //); diff --git a/ps3fw/cellVdec.cpp b/ps3fw/cellVdec.cpp index ce4e07b11..c4f679688 100644 --- a/ps3fw/cellVdec.cpp +++ b/ps3fw/cellVdec.cpp @@ -1675,7 +1675,7 @@ error_code cellVdecGetPicItem(ppu_thread& ppu, u32 handle, info->status = CELL_OK; info->attr = attr; - const vm::addr_t picinfo_addr{info.addr() + ::offset32(&all_info_t::picInfo)}; + const vm::addr_t picinfo_addr{info.addr() + OFFSET_OF(all_info_t, picInfo)}; info->picInfo_addr = picinfo_addr; if (vdec->type == CELL_VDEC_CODEC_TYPE_AVC) diff --git a/rpcs3/Emu/CMakeLists.txt b/rpcs3/Emu/CMakeLists.txt index bfcf682a2..04a9d41da 100644 --- a/rpcs3/Emu/CMakeLists.txt +++ b/rpcs3/Emu/CMakeLists.txt @@ -445,6 +445,8 @@ target_link_libraries(rpcs3_emu 3rdparty::libusb 3rdparty::wolfssl Vulkan::Headers rpcsx::fw::ps3::api + rpcsx::cpu::cell::ppu + rpcsx::cpu::cell::ppu::semantic PRIVATE 3rdparty::glslang diff --git a/rpcs3/Emu/Cell/PPUFunction.cpp b/rpcs3/Emu/Cell/PPUFunction.cpp index df7eaa76f..0c767090b 100644 --- a/rpcs3/Emu/Cell/PPUFunction.cpp +++ b/rpcs3/Emu/Cell/PPUFunction.cpp @@ -1908,7 +1908,7 @@ auto gen_ghc_cpp_trampoline(ppu_intrp_func_t fn_target) // Take second ghc arg c.mov(args[0], x86::rbp); - c.mov(args[2].r32(), x86::dword_ptr(args[0], ::offset32(&ppu_thread::cia))); + c.mov(args[2].r32(), x86::dword_ptr(args[0], OFFSET_OF(ppu_thread, cia))); c.add(args[2], x86::qword_ptr(reinterpret_cast(&vm::g_base_addr))); c.jmp(fn_target); }; @@ -1942,7 +1942,7 @@ auto gen_ghc_cpp_trampoline(ppu_intrp_func_t fn_target) c.bind(base_addr); c.embedUInt64(reinterpret_cast(&vm::g_base_addr)); c.bind(cia_offset); - c.embedUInt64(static_cast(::offset32(&ppu_thread::cia))); + c.embedUInt64(static_cast(OFFSET_OF(ppu_thread, cia))); c.bind(jmp_target); c.embedUInt64(reinterpret_cast(fn_target)); }; diff --git a/rpcs3/Emu/Cell/PPUInterpreter.cpp b/rpcs3/Emu/Cell/PPUInterpreter.cpp index 77b7fd785..a84bf21c4 100644 --- a/rpcs3/Emu/Cell/PPUInterpreter.cpp +++ b/rpcs3/Emu/Cell/PPUInterpreter.cpp @@ -1,4 +1,5 @@ #include "stdafx.h" +#include "rx/cpu/cell/ppu/Decoder.hpp" #include "PPUInterpreter.h" #include "Emu/Memory/vm_reservation.h" @@ -189,13 +190,11 @@ namespace asmjit } // Indexed offset to ppu.member - template ().*MPtr)[0]), uint I, uint N> - x86::Mem ppu_mem(const bf_t&, bool last = false) + template + x86::Mem ppu_mem(const bf_t&, std::size_t offset, std::size_t size, std::size_t elemSize, bool last = false) { // Required index shift for array indexing - constexpr u32 Shift = std::countr_zero(sizeof((std::declval().*MPtr)[0])); - - const u32 offset = ::offset32(MPtr); + u32 Shift = std::countr_zero(elemSize); auto tmp_r32 = x86::eax; auto reg_ppu = arg_ppu; @@ -222,13 +221,13 @@ namespace asmjit } // Use max possible index shift - constexpr u32 X86Shift = Shift > 3 ? 3 : Shift; - constexpr u32 AddShift = Shift - X86Shift; - constexpr u32 AndMask = (1u << N) - 1; + u32 X86Shift = Shift > 3 ? 
3 : Shift; + u32 AddShift = Shift - X86Shift; + u32 AndMask = (1u << N) - 1; - if constexpr (I >= AddShift) + if (I >= AddShift) { - if constexpr (I != AddShift) + if (I != AddShift) base::shr(tmp_r32, I - AddShift); base::and_(tmp_r32, AndMask << AddShift); } @@ -238,25 +237,24 @@ namespace asmjit base::shl(tmp_r32, I + AddShift); } - return x86::ptr(reg_ppu, tmp_r32.r64(), X86Shift, static_cast(offset - ppu_base), Size); + return x86::ptr(reg_ppu, tmp_r32.r64(), X86Shift, static_cast(offset - ppu_base), size); } // Generic offset to ppu.member - template ().*MPtr)> - x86::Mem ppu_mem() + x86::Mem ppu_mem(std::uint32_t offset, std::size_t size) { - return x86::ptr(arg_ppu, static_cast(::offset32(MPtr)), Size); + return x86::ptr(arg_ppu, offset, size); } template x86::Mem ppu_vr(const bf_t& bf, bool last = false) { - return ppu_mem<&ppu_thread::vr, Size>(bf, last); + return ppu_mem(bf, OFFSET_OF(ppu_thread, vr), Size, sizeof(ppu_thread::vr[0]), last); } x86::Mem ppu_sat() { - return ppu_mem<&ppu_thread::sat>(); + return ppu_mem(OFFSET_OF(ppu_thread, sat), sizeof(ppu_thread::sat)); } void ppu_ret(bool last = true) @@ -265,7 +263,7 @@ namespace asmjit base::mov(x86::rax, x86::qword_ptr(arg_next_fn)); base::add(arg_this_op, 4); if (is_debugger_present()) - base::mov(ppu_mem<&ppu_thread::cia>(), arg_this_op.r32()); + base::mov(ppu_mem(OFFSET_OF(ppu_thread, cia), sizeof(ppu_thread::cia)), arg_this_op.r32()); base::mov(arg_op, x86::dword_ptr(arg_this_op)); base::bswap(arg_op); base::add(arg_next_fn, 8); @@ -377,7 +375,7 @@ inline void ppu_cr_set(ppu_thread& ppu, u32 field, bool le, bool gt, bool eq, bo template inline void ppu_cr_set(ppu_thread& ppu, u32 field, const T& a, const T& b) { - ppu_cr_set(ppu, field, a b, a == b, ppu.xer.so); + ppu_cr_set(ppu, field, (a < b), (a > b), a == b, ppu.xer_so); } // TODO @@ -398,8 +396,8 @@ void ppu_set_cr(ppu_thread& ppu, u32 field, bool le, bool gt, bool eq, bool so) // Set XER.OV bit (overflow) inline void ppu_ov_set(ppu_thread& ppu, bool bit) { - ppu.xer.ov = bit; - ppu.xer.so |= bit; + ppu.xer_ov = bit; + ppu.xer_so |= bit; } // Write comparison results to FPCC field with optional CR field update @@ -428,7 +426,7 @@ void ppu_set_fpcc(ppu_thread& ppu, f64 a, f64 b, u64 cr_field = 1) fpcc[3] = cmp == std::partial_ordering::unordered; #endif - const u32 data = std::bit_cast(fpcc); + auto data = std::bit_cast(fpcc); // Write FPCC ppu.fpscr.fields[4] = data; @@ -440,7 +438,7 @@ void ppu_set_fpcc(ppu_thread& ppu, f64 a, f64 b, u64 cr_field = 1) if (g_cfg.core.ppu_debug) [[unlikely]] { - *reinterpret_cast(vm::g_stat_addr + ppu.cia) |= data; + *reinterpret_cast(vm::g_stat_addr + ppu.cia) |= std::bit_cast(data); } } } @@ -608,7 +606,7 @@ inline v128 ppu_select_vnan(v128 a, v128 b) return gv_selectfs(gv_eqfs(a, a), b, a | gv_bcst32(0x7fc00000u)); } -inline v128 ppu_select_vnan(v128 a, v128 b, Vector128 auto... args) +inline v128 ppu_select_vnan(v128 a, v128 b, rx::Vector128 auto... args) { return ppu_select_vnan(a, ppu_select_vnan(b, args...)); } @@ -633,7 +631,7 @@ inline v128 ppu_fix_vnan(v128 r) } template -inline v128 ppu_set_vnan(v128 r, Vector128 auto... args) +inline v128 ppu_set_vnan(v128 r, rx::Vector128 auto... args) { if constexpr (((Flags == set_vnan) || ...) 
&& sizeof...(args) > 0) { @@ -712,7 +710,7 @@ auto VADDFP() static const auto exec = [](auto&& d, auto&& a_, auto&& b_, auto&& jm_mask) { - auto m = gv_bcst32(jm_mask, &ppu_thread::jm_mask); + auto m = gv_bcst32(jm_mask); auto a = ppu_flush_denormal(m, std::move(a_)); auto b = ppu_flush_denormal(m, std::move(b_)); d = ppu_flush_denormal(std::move(m), ppu_set_vnan(gv_addfs(a, b), a, b)); @@ -1359,7 +1357,7 @@ auto VMADDFP() static const auto exec = [](auto&& d, auto&& a_, auto&& b_, auto&& c_, auto&& jm_mask) { - auto m = gv_bcst32(jm_mask, &ppu_thread::jm_mask); + auto m = gv_bcst32(jm_mask); auto a = ppu_flush_denormal(m, std::move(a_)); auto b = ppu_flush_denormal(m, std::move(b_)); auto c = ppu_flush_denormal(m, std::move(c_)); @@ -1377,7 +1375,7 @@ auto VMAXFP() static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& jm_mask) { - d = ppu_flush_denormal(gv_bcst32(jm_mask, &ppu_thread::jm_mask), ppu_set_vnan(gv_maxfs(a, b), a, b)); + d = ppu_flush_denormal(gv_bcst32(jm_mask), ppu_set_vnan(gv_maxfs(a, b), a, b)); }; RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.jm_mask); @@ -1524,7 +1522,7 @@ auto VMINFP() static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& jm_mask) { - d = ppu_flush_denormal(gv_bcst32(jm_mask, &ppu_thread::jm_mask), ppu_set_vnan(gv_minfs(a, b), a, b)); + d = ppu_flush_denormal(gv_bcst32(jm_mask), ppu_set_vnan(gv_minfs(a, b), a, b)); }; RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.jm_mask); @@ -1931,7 +1929,7 @@ auto VNMSUBFP() { // An odd case with (FLT_MIN, FLT_MIN, FLT_MIN) produces FLT_MIN instead of 0 auto s = gv_bcstfs(-0.0f); - auto m = gv_bcst32(jm_mask, &ppu_thread::jm_mask); + auto m = gv_bcst32(jm_mask); auto a = ppu_flush_denormal(m, std::move(a_)); auto b = ppu_flush_denormal(m, std::move(b_)); auto c = ppu_flush_denormal(m, std::move(c_)); @@ -2177,7 +2175,7 @@ auto VREFP() static const auto exec = [](auto&& d, auto&& b_, auto&& jm_mask) { - auto m = gv_bcst32(jm_mask, &ppu_thread::jm_mask); + auto m = gv_bcst32(jm_mask); auto b = ppu_flush_denormal(m, std::move(b_)); d = ppu_flush_denormal(std::move(m), ppu_set_vnan(gv_divfs(gv_bcstfs(1.0f), b), b)); }; @@ -2193,7 +2191,7 @@ auto VRFIM() static const auto exec = [](auto&& d, auto&& b_, auto&& jm_mask) { - auto m = gv_bcst32(jm_mask, &ppu_thread::jm_mask); + auto m = gv_bcst32(jm_mask); auto b = ppu_flush_denormal(m, std::move(b_)); d = ppu_flush_denormal(std::move(m), ppu_set_vnan(gv_roundfs_floor(b), b)); }; @@ -2209,7 +2207,7 @@ auto VRFIN() static const auto exec = [](auto&& d, auto&& b, auto&& jm_mask) { - auto m = gv_bcst32(jm_mask, &ppu_thread::jm_mask); + auto m = gv_bcst32(jm_mask); d = ppu_flush_denormal(std::move(m), ppu_set_vnan(gv_roundfs_even(b), b)); }; @@ -2224,7 +2222,7 @@ auto VRFIP() static const auto exec = [](auto&& d, auto&& b_, auto&& jm_mask) { - auto m = gv_bcst32(jm_mask, &ppu_thread::jm_mask); + auto m = gv_bcst32(jm_mask); auto b = ppu_flush_denormal(m, std::move(b_)); d = ppu_flush_denormal(std::move(m), ppu_set_vnan(gv_roundfs_ceil(b), b)); }; @@ -2240,7 +2238,7 @@ auto VRFIZ() static const auto exec = [](auto&& d, auto&& b, auto&& jm_mask) { - auto m = gv_bcst32(jm_mask, &ppu_thread::jm_mask); + auto m = gv_bcst32(jm_mask); d = ppu_flush_denormal(std::move(m), ppu_set_vnan(gv_roundfs_trunc(b), b)); }; @@ -2297,7 +2295,7 @@ auto VRSQRTEFP() static const auto exec = [](auto&& d, auto&& b_, auto&& jm_mask) { - auto m = gv_bcst32(jm_mask, &ppu_thread::jm_mask); + auto m = gv_bcst32(jm_mask); auto b = ppu_flush_denormal(m, std::move(b_)); 
d = ppu_flush_denormal(std::move(m), ppu_set_vnan(gv_divfs(gv_bcstfs(1.0f), gv_sqrtfs(b)), b)); }; @@ -2629,7 +2627,7 @@ auto VSUBFP() static const auto exec = [](auto&& d, auto&& a_, auto&& b_, auto&& jm_mask) { - auto m = gv_bcst32(jm_mask, &ppu_thread::jm_mask); + auto m = gv_bcst32(jm_mask); auto a = ppu_flush_denormal(m, std::move(a_)); auto b = ppu_flush_denormal(m, std::move(b_)); d = ppu_flush_denormal(std::move(m), ppu_set_vnan(gv_subfs(a, b), a, b)); @@ -3113,7 +3111,7 @@ auto SUBFIC() const s64 i = op.simm16; const auto r = add64_flags(~a, i, 1); ppu.gpr[op.rd] = r.result; - ppu.xer.ca = r.carry; + ppu.xer_ca = r.carry; }; RETURN_(ppu, op); } @@ -3170,7 +3168,7 @@ auto ADDIC() const s64 i = op.simm16; const auto r = add64_flags(a, i); ppu.gpr[op.rd] = r.result; - ppu.xer.ca = r.carry; + ppu.xer_ca = r.carry; if (op.main & 1) [[unlikely]] ppu_cr_set(ppu, 0, r.result, 0); }; @@ -3827,7 +3825,7 @@ auto SUBFC() const u64 RB = ppu.gpr[op.rb]; const auto r = add64_flags(~RA, RB, 1); ppu.gpr[op.rd] = r.result; - ppu.xer.ca = r.carry; + ppu.xer_ca = r.carry; if constexpr (((Flags == has_oe) || ...)) ppu_ov_set(ppu, (~RA >> 63 == RB >> 63) && (~RA >> 63 != ppu.gpr[op.rd] >> 63)); if constexpr (((Flags == has_rc) || ...)) @@ -3863,7 +3861,7 @@ auto ADDC() const u64 RB = ppu.gpr[op.rb]; const auto r = add64_flags(RA, RB); ppu.gpr[op.rd] = r.result; - ppu.xer.ca = r.carry; + ppu.xer_ca = r.carry; if constexpr (((Flags == has_oe) || ...)) ppu_ov_set(ppu, (RA >> 63 == RB >> 63) && (RA >> 63 != ppu.gpr[op.rd] >> 63)); if constexpr (((Flags == has_rc) || ...)) @@ -4394,9 +4392,9 @@ auto SUBFE() { const u64 RA = ppu.gpr[op.ra]; const u64 RB = ppu.gpr[op.rb]; - const auto r = add64_flags(~RA, RB, ppu.xer.ca); + const auto r = add64_flags(~RA, RB, ppu.xer_ca); ppu.gpr[op.rd] = r.result; - ppu.xer.ca = r.carry; + ppu.xer_ca = r.carry; if constexpr (((Flags == has_oe) || ...)) ppu_ov_set(ppu, (~RA >> 63 == RB >> 63) && (~RA >> 63 != ppu.gpr[op.rd] >> 63)); if constexpr (((Flags == has_rc) || ...)) @@ -4415,9 +4413,9 @@ auto ADDE() { const u64 RA = ppu.gpr[op.ra]; const u64 RB = ppu.gpr[op.rb]; - const auto r = add64_flags(RA, RB, ppu.xer.ca); + const auto r = add64_flags(RA, RB, ppu.xer_ca); ppu.gpr[op.rd] = r.result; - ppu.xer.ca = r.carry; + ppu.xer_ca = r.carry; if constexpr (((Flags == has_oe) || ...)) ppu_ov_set(ppu, (RA >> 63 == RB >> 63) && (RA >> 63 != ppu.gpr[op.rd] >> 63)); if constexpr (((Flags == has_rc) || ...)) @@ -4434,23 +4432,23 @@ auto MTOCRF() static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - alignas(4) static const u8 s_table[16][4]{ - {0, 0, 0, 0}, - {0, 0, 0, 1}, - {0, 0, 1, 0}, - {0, 0, 1, 1}, - {0, 1, 0, 0}, - {0, 1, 0, 1}, - {0, 1, 1, 0}, - {0, 1, 1, 1}, - {1, 0, 0, 0}, - {1, 0, 0, 1}, - {1, 0, 1, 0}, - {1, 0, 1, 1}, - {1, 1, 0, 0}, - {1, 1, 0, 1}, - {1, 1, 1, 0}, - {1, 1, 1, 1}, + alignas(4) static const CrField s_table[16]{ + CrField::From(false, false, false, false), + CrField::From(false, false, false, true), + CrField::From(false, false, true, false), + CrField::From(false, false, true, true), + CrField::From(false, true, false, false), + CrField::From(false, true, false, true), + CrField::From(false, true, true, false), + CrField::From(false, true, true, true), + CrField::From(true, false, false, false), + CrField::From(true, false, false, true), + CrField::From(true, false, true, false), + CrField::From(true, false, true, true), + CrField::From(true, true, false, false), + CrField::From(true, true, false, true), + CrField::From(true, true, true, 
false), + CrField::From(true, true, true, true), }; const u64 s = ppu.gpr[op.rs]; @@ -4461,7 +4459,7 @@ auto MTOCRF() const u32 n = std::countl_zero(op.crm) & 7; const u64 v = (s >> ((n * 4) ^ 0x1c)) & 0xf; - ppu.cr.fields[n] = *reinterpret_cast(s_table + v); + ppu.cr.fields[n] = s_table[v]; } else { @@ -4472,7 +4470,7 @@ auto MTOCRF() if (op.crm & (128 >> i)) { const u64 v = (s >> ((i * 4) ^ 0x1c)) & 0xf; - ppu.cr.fields[i] = *reinterpret_cast(s_table + v); + ppu.cr.fields[i] = s_table[v]; } } } @@ -4503,7 +4501,7 @@ auto STWCX() static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; - ppu_cr_set(ppu, 0, false, false, ppu_stwcx(ppu, vm::cast(addr), static_cast(ppu.gpr[op.rs])), ppu.xer.so); + ppu_cr_set(ppu, 0, false, false, ppu_stwcx(ppu, vm::cast(addr), static_cast(ppu.gpr[op.rs])), ppu.xer_so); }; RETURN_(ppu, op); } @@ -4591,9 +4589,9 @@ auto SUBFZE() static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 RA = ppu.gpr[op.ra]; - const auto r = add64_flags(~RA, 0, ppu.xer.ca); + const auto r = add64_flags(~RA, 0, ppu.xer_ca); ppu.gpr[op.rd] = r.result; - ppu.xer.ca = r.carry; + ppu.xer_ca = r.carry; if constexpr (((Flags == has_oe) || ...)) ppu_ov_set(ppu, (~RA >> 63 == 0) && (~RA >> 63 != ppu.gpr[op.rd] >> 63)); if constexpr (((Flags == has_rc) || ...)) @@ -4611,9 +4609,9 @@ auto ADDZE() static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 RA = ppu.gpr[op.ra]; - const auto r = add64_flags(RA, 0, ppu.xer.ca); + const auto r = add64_flags(RA, 0, ppu.xer_ca); ppu.gpr[op.rd] = r.result; - ppu.xer.ca = r.carry; + ppu.xer_ca = r.carry; if constexpr (((Flags == has_oe) || ...)) ppu_ov_set(ppu, (RA >> 63 == 0) && (RA >> 63 != ppu.gpr[op.rd] >> 63)); if constexpr (((Flags == has_rc) || ...)) @@ -4631,7 +4629,7 @@ auto STDCX() static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? 
ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; - ppu_cr_set(ppu, 0, false, false, ppu_stdcx(ppu, vm::cast(addr), ppu.gpr[op.rs]), ppu.xer.so); + ppu_cr_set(ppu, 0, false, false, ppu_stdcx(ppu, vm::cast(addr), ppu.gpr[op.rs]), ppu.xer_so); }; RETURN_(ppu, op); } @@ -4695,9 +4693,9 @@ auto SUBFME() static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 RA = ppu.gpr[op.ra]; - const auto r = add64_flags(~RA, ~0ull, ppu.xer.ca); + const auto r = add64_flags(~RA, ~0ull, ppu.xer_ca); ppu.gpr[op.rd] = r.result; - ppu.xer.ca = r.carry; + ppu.xer_ca = r.carry; if constexpr (((Flags == has_oe) || ...)) ppu_ov_set(ppu, (~RA >> 63 == 1) && (~RA >> 63 != ppu.gpr[op.rd] >> 63)); if constexpr (((Flags == has_rc) || ...)) @@ -4715,9 +4713,9 @@ auto ADDME() static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const s64 RA = ppu.gpr[op.ra]; - const auto r = add64_flags(RA, ~0ull, ppu.xer.ca); + const auto r = add64_flags(RA, ~0ull, ppu.xer_ca); ppu.gpr[op.rd] = r.result; - ppu.xer.ca = r.carry; + ppu.xer_ca = r.carry; if constexpr (((Flags == has_oe) || ...)) ppu_ov_set(ppu, (u64(RA) >> 63 == 1) && (u64(RA) >> 63 != ppu.gpr[op.rd] >> 63)); if constexpr (((Flags == has_rc) || ...)) @@ -4881,7 +4879,7 @@ auto MFSPR() switch (n) { - case 0x001: ppu.gpr[op.rd] = u32{ppu.xer.so} << 31 | ppu.xer.ov << 30 | ppu.xer.ca << 29 | ppu.xer.cnt; break; + case 0x001: ppu.gpr[op.rd] = u32{ppu.xer_so} << 31 | ppu.xer_ov << 30 | ppu.xer_ca << 29 | ppu.xer_cnt; break; case 0x008: ppu.gpr[op.rd] = ppu.lr; break; case 0x009: ppu.gpr[op.rd] = ppu.ctr; break; case 0x100: ppu.gpr[op.rd] = ppu.vrsave; break; @@ -5131,10 +5129,10 @@ auto MTSPR() case 0x001: { const u64 value = ppu.gpr[op.rs]; - ppu.xer.so = (value & 0x80000000) != 0; - ppu.xer.ov = (value & 0x40000000) != 0; - ppu.xer.ca = (value & 0x20000000) != 0; - ppu.xer.cnt = value & 0x7f; + ppu.xer_so = (value & 0x80000000) != 0; + ppu.xer_ov = (value & 0x40000000) != 0; + ppu.xer_ca = (value & 0x20000000) != 0; + ppu.xer_cnt = value & 0x7f; break; } case 0x008: ppu.lr = ppu.gpr[op.rs]; break; @@ -5264,7 +5262,7 @@ auto LSWX() static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; - u32 count = ppu.xer.cnt & 0x7f; + u32 count = ppu.xer_cnt & 0x7f; for (; count >= 4; count -= 4, addr += 4, op.rd = (op.rd + 1) & 31) { ppu.gpr[op.rd] = ppu_feed_data(ppu, addr); @@ -5497,7 +5495,7 @@ auto STSWX() static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { u64 addr = op.ra ? 
ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; - u32 count = ppu.xer.cnt & 0x7F; + u32 count = ppu.xer_cnt & 0x7F; for (; count >= 4; count -= 4, addr += 4, op.rs = (op.rs + 1) & 31) { PPU_WRITE_32(vm::cast(addr), static_cast(ppu.gpr[op.rs])); @@ -5675,12 +5673,12 @@ auto SRAW() if (shift > 31) { ppu.gpr[op.ra] = 0 - (RS < 0); - ppu.xer.ca = (RS < 0); + ppu.xer_ca = (RS < 0); } else { ppu.gpr[op.ra] = RS >> shift; - ppu.xer.ca = (RS < 0) && ((ppu.gpr[op.ra] << shift) != static_cast(RS)); + ppu.xer_ca = (RS < 0) && ((ppu.gpr[op.ra] << shift) != static_cast(RS)); } if constexpr (((Flags == has_rc) || ...)) @@ -5702,12 +5700,12 @@ auto SRAD() if (shift > 63) { ppu.gpr[op.ra] = 0 - (RS < 0); - ppu.xer.ca = (RS < 0); + ppu.xer_ca = (RS < 0); } else { ppu.gpr[op.ra] = RS >> shift; - ppu.xer.ca = (RS < 0) && ((ppu.gpr[op.ra] << shift) != static_cast(RS)); + ppu.xer_ca = (RS < 0) && ((ppu.gpr[op.ra] << shift) != static_cast(RS)); } if constexpr (((Flags == has_rc) || ...)) @@ -5742,7 +5740,7 @@ auto SRAWI() { s32 RS = static_cast(ppu.gpr[op.rs]); ppu.gpr[op.ra] = RS >> op.sh32; - ppu.xer.ca = (RS < 0) && (static_cast(ppu.gpr[op.ra] << op.sh32) != static_cast(RS)); + ppu.xer_ca = (RS < 0) && (static_cast(ppu.gpr[op.ra] << op.sh32) != static_cast(RS)); if constexpr (((Flags == has_rc) || ...)) ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); @@ -5761,7 +5759,7 @@ auto SRADI() auto sh = op.sh64; s64 RS = ppu.gpr[op.rs]; ppu.gpr[op.ra] = RS >> sh; - ppu.xer.ca = (RS < 0) && ((ppu.gpr[op.ra] << sh) != static_cast(RS)); + ppu.xer_ca = (RS < 0) && ((ppu.gpr[op.ra] << sh) != static_cast(RS)); if constexpr (((Flags == has_rc) || ...)) ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); @@ -6562,9 +6560,9 @@ auto MTFSFI() } else { - static constexpr std::array all_values = []() -> std::array + static constexpr auto all_values = [] { - std::array values{}; + std::array values{}; for (u32 i = 0; i < values.size(); i++) { @@ -6576,7 +6574,7 @@ auto MTFSFI() value |= (im & 1) << (8 * 1); im >>= 1; value |= (im & 1) << (8 * 0); - values[i] = value; + values[i] = std::bit_cast(value); } return values; @@ -8085,3 +8083,1179 @@ ppu_intrp_func_t ppu_interpreter_rt::decode(u32 opv) const noexcept return table.decode(opv); } + +using isel_type = void (*)(PPUContext&, rx::cell::ppu::Instruction); + +#define IMPORT_DECODER(x) extern "C" isel_type ISEL_PPU_##x##_DEC +#define IMPORT_DECODER_ALIAS(x, name) \ + extern "C" isel_type ISEL_PPU_##name##_DEC; \ + inline isel_type ISEL_PPU_##x##_DEC = ISEL_PPU_##name##_DEC; + +IMPORT_DECODER(MFVSCR); +IMPORT_DECODER(MTVSCR); +IMPORT_DECODER(VADDCUW); +IMPORT_DECODER(VADDFP); +IMPORT_DECODER(VADDSBS); +IMPORT_DECODER(VADDSHS); +IMPORT_DECODER(VADDSWS); +IMPORT_DECODER(VADDUBM); +IMPORT_DECODER(VADDUBS); +IMPORT_DECODER(VADDUHM); +IMPORT_DECODER(VADDUHS); +IMPORT_DECODER(VADDUWM); +IMPORT_DECODER(VADDUWS); +IMPORT_DECODER(VAND); +IMPORT_DECODER(VANDC); +IMPORT_DECODER(VAVGSB); +IMPORT_DECODER(VAVGSH); +IMPORT_DECODER(VAVGSW); +IMPORT_DECODER(VAVGUB); +IMPORT_DECODER(VAVGUH); +IMPORT_DECODER(VAVGUW); +IMPORT_DECODER(VCFSX); +IMPORT_DECODER(VCFUX); +IMPORT_DECODER(VCMPBFP); +IMPORT_DECODER_ALIAS(VCMPBFP_, VCMPBFP); +IMPORT_DECODER(VCMPEQFP); +IMPORT_DECODER_ALIAS(VCMPEQFP_, VCMPEQFP); +IMPORT_DECODER(VCMPEQUB); +IMPORT_DECODER_ALIAS(VCMPEQUB_, VCMPEQUB); +IMPORT_DECODER(VCMPEQUH); +IMPORT_DECODER_ALIAS(VCMPEQUH_, VCMPEQUH); +IMPORT_DECODER(VCMPEQUW); +IMPORT_DECODER_ALIAS(VCMPEQUW_, VCMPEQUW); +IMPORT_DECODER(VCMPGEFP); +IMPORT_DECODER_ALIAS(VCMPGEFP_, VCMPGEFP); +IMPORT_DECODER(VCMPGTFP); 
+IMPORT_DECODER_ALIAS(VCMPGTFP_, VCMPGTFP); +IMPORT_DECODER(VCMPGTSB); +IMPORT_DECODER_ALIAS(VCMPGTSB_, VCMPGTSB); +IMPORT_DECODER(VCMPGTSH); +IMPORT_DECODER_ALIAS(VCMPGTSH_, VCMPGTSH); +IMPORT_DECODER(VCMPGTSW); +IMPORT_DECODER_ALIAS(VCMPGTSW_, VCMPGTSW); +IMPORT_DECODER(VCMPGTUB); +IMPORT_DECODER_ALIAS(VCMPGTUB_, VCMPGTUB); +IMPORT_DECODER(VCMPGTUH); +IMPORT_DECODER_ALIAS(VCMPGTUH_, VCMPGTUH); +IMPORT_DECODER(VCMPGTUW); +IMPORT_DECODER_ALIAS(VCMPGTUW_, VCMPGTUW); +IMPORT_DECODER(VCTSXS); +IMPORT_DECODER(VCTUXS); +IMPORT_DECODER(VEXPTEFP); +IMPORT_DECODER(VLOGEFP); +IMPORT_DECODER(VMADDFP); +IMPORT_DECODER(VMAXFP); +IMPORT_DECODER(VMAXSB); +IMPORT_DECODER(VMAXSH); +IMPORT_DECODER(VMAXSW); +IMPORT_DECODER(VMAXUB); +IMPORT_DECODER(VMAXUH); +IMPORT_DECODER(VMAXUW); +IMPORT_DECODER(VMHADDSHS); +IMPORT_DECODER(VMHRADDSHS); +IMPORT_DECODER(VMINFP); +IMPORT_DECODER(VMINSB); +IMPORT_DECODER(VMINSH); +IMPORT_DECODER(VMINSW); +IMPORT_DECODER(VMINUB); +IMPORT_DECODER(VMINUH); +IMPORT_DECODER(VMINUW); +IMPORT_DECODER(VMLADDUHM); +IMPORT_DECODER(VMRGHB); +IMPORT_DECODER(VMRGHH); +IMPORT_DECODER(VMRGHW); +IMPORT_DECODER(VMRGLB); +IMPORT_DECODER(VMRGLH); +IMPORT_DECODER(VMRGLW); +IMPORT_DECODER(VMSUMMBM); +IMPORT_DECODER(VMSUMSHM); +IMPORT_DECODER(VMSUMSHS); +IMPORT_DECODER(VMSUMUBM); +IMPORT_DECODER(VMSUMUHM); +IMPORT_DECODER(VMSUMUHS); +IMPORT_DECODER(VMULESB); +IMPORT_DECODER(VMULESH); +IMPORT_DECODER(VMULEUB); +IMPORT_DECODER(VMULEUH); +IMPORT_DECODER(VMULOSB); +IMPORT_DECODER(VMULOSH); +IMPORT_DECODER(VMULOUB); +IMPORT_DECODER(VMULOUH); +IMPORT_DECODER(VNMSUBFP); +IMPORT_DECODER(VNOR); +IMPORT_DECODER(VOR); +IMPORT_DECODER(VPERM); +IMPORT_DECODER(VPKPX); +IMPORT_DECODER(VPKSHSS); +IMPORT_DECODER(VPKSHUS); +IMPORT_DECODER(VPKSWSS); +IMPORT_DECODER(VPKSWUS); +IMPORT_DECODER(VPKUHUM); +IMPORT_DECODER(VPKUHUS); +IMPORT_DECODER(VPKUWUM); +IMPORT_DECODER(VPKUWUS); +IMPORT_DECODER(VREFP); +IMPORT_DECODER(VRFIM); +IMPORT_DECODER(VRFIN); +IMPORT_DECODER(VRFIP); +IMPORT_DECODER(VRFIZ); +IMPORT_DECODER(VRLB); +IMPORT_DECODER(VRLH); +IMPORT_DECODER(VRLW); +IMPORT_DECODER(VRSQRTEFP); +IMPORT_DECODER(VSEL); +IMPORT_DECODER(VSL); +IMPORT_DECODER(VSLB); +IMPORT_DECODER(VSLDOI); +IMPORT_DECODER(VSLH); +IMPORT_DECODER(VSLO); +IMPORT_DECODER(VSLW); +IMPORT_DECODER(VSPLTB); +IMPORT_DECODER(VSPLTH); +IMPORT_DECODER(VSPLTISB); +IMPORT_DECODER(VSPLTISH); +IMPORT_DECODER(VSPLTISW); +IMPORT_DECODER(VSPLTW); +IMPORT_DECODER(VSR); +IMPORT_DECODER(VSRAB); +IMPORT_DECODER(VSRAH); +IMPORT_DECODER(VSRAW); +IMPORT_DECODER(VSRB); +IMPORT_DECODER(VSRH); +IMPORT_DECODER(VSRO); +IMPORT_DECODER(VSRW); +IMPORT_DECODER(VSUBCUW); +IMPORT_DECODER(VSUBFP); +IMPORT_DECODER(VSUBSBS); +IMPORT_DECODER(VSUBSHS); +IMPORT_DECODER(VSUBSWS); +IMPORT_DECODER(VSUBUBM); +IMPORT_DECODER(VSUBUBS); +IMPORT_DECODER(VSUBUHM); +IMPORT_DECODER(VSUBUHS); +IMPORT_DECODER(VSUBUWM); +IMPORT_DECODER(VSUBUWS); +IMPORT_DECODER(VSUMSWS); +IMPORT_DECODER(VSUM2SWS); +IMPORT_DECODER(VSUM4SBS); +IMPORT_DECODER(VSUM4SHS); +IMPORT_DECODER(VSUM4UBS); +IMPORT_DECODER(VUPKHPX); +IMPORT_DECODER(VUPKHSB); +IMPORT_DECODER(VUPKHSH); +IMPORT_DECODER(VUPKLPX); +IMPORT_DECODER(VUPKLSB); +IMPORT_DECODER(VUPKLSH); +IMPORT_DECODER(VXOR); +IMPORT_DECODER(TDI); +IMPORT_DECODER(TWI); +IMPORT_DECODER(MULLI); +IMPORT_DECODER(SUBFIC); +IMPORT_DECODER(CMPLI); +IMPORT_DECODER(CMPI); +IMPORT_DECODER(ADDIC); +IMPORT_DECODER(ADDI); +IMPORT_DECODER(ADDIS); +IMPORT_DECODER(BC); +IMPORT_DECODER(SC); +IMPORT_DECODER(B); +IMPORT_DECODER(MCRF); +IMPORT_DECODER(BCLR); +IMPORT_DECODER(RFID); 
+IMPORT_DECODER(CRNOR); +IMPORT_DECODER(RFSCV); +IMPORT_DECODER(CRANDC); +IMPORT_DECODER(ISYNC); +IMPORT_DECODER(CRXOR); +IMPORT_DECODER(CRNAND); +IMPORT_DECODER(CRAND); +IMPORT_DECODER(HRFID); +IMPORT_DECODER(CREQV); +IMPORT_DECODER(URFID); +IMPORT_DECODER(STOP); +IMPORT_DECODER(CRORC); +IMPORT_DECODER(CROR); +IMPORT_DECODER(BCCTR); +IMPORT_DECODER(RLWIMI); +IMPORT_DECODER(RLWINM); +IMPORT_DECODER(RLWNM); +IMPORT_DECODER(ORI); +IMPORT_DECODER(ORIS); +IMPORT_DECODER(XORI); +IMPORT_DECODER(XORIS); +IMPORT_DECODER(ANDI); +IMPORT_DECODER(ANDIS); +IMPORT_DECODER(RLDICL); +IMPORT_DECODER(RLDICR); +IMPORT_DECODER(RLDIC); +IMPORT_DECODER(RLDIMI); +IMPORT_DECODER(RLDCL); +IMPORT_DECODER(RLDCR); +IMPORT_DECODER(CMP); +IMPORT_DECODER(TW); +IMPORT_DECODER(LVSL); +IMPORT_DECODER(LVEBX); +IMPORT_DECODER(SUBFC); +IMPORT_DECODER(MULHDU); +IMPORT_DECODER(ADDC); +IMPORT_DECODER(MULHWU); +IMPORT_DECODER(MFOCRF); +IMPORT_DECODER(LWARX); +IMPORT_DECODER(LDX); +IMPORT_DECODER(LWZX); +IMPORT_DECODER(SLW); +IMPORT_DECODER(CNTLZW); +IMPORT_DECODER(SLD); +IMPORT_DECODER(AND); +IMPORT_DECODER(CMPL); +IMPORT_DECODER(LVSR); +IMPORT_DECODER(LVEHX); +IMPORT_DECODER(SUBF); +IMPORT_DECODER(LDUX); +IMPORT_DECODER(DCBST); +IMPORT_DECODER(LWZUX); +IMPORT_DECODER(CNTLZD); +IMPORT_DECODER(ANDC); +IMPORT_DECODER(TD); +IMPORT_DECODER(LVEWX); +IMPORT_DECODER(MULHD); +IMPORT_DECODER(MULHW); +IMPORT_DECODER(LDARX); +IMPORT_DECODER(DCBF); +IMPORT_DECODER(LBZX); +IMPORT_DECODER(LVX); +IMPORT_DECODER(NEG); +IMPORT_DECODER(LBZUX); +IMPORT_DECODER(NOR); +IMPORT_DECODER(STVEBX); +IMPORT_DECODER(SUBFE); +IMPORT_DECODER(ADDE); +IMPORT_DECODER(MTOCRF); +IMPORT_DECODER(STDX); +IMPORT_DECODER(STWCX); +IMPORT_DECODER(STWX); +IMPORT_DECODER(STVEHX); +IMPORT_DECODER(STDUX); +IMPORT_DECODER(STWUX); +IMPORT_DECODER(STVEWX); +IMPORT_DECODER(SUBFZE); +IMPORT_DECODER(ADDZE); +IMPORT_DECODER(STDCX); +IMPORT_DECODER(STBX); +IMPORT_DECODER(STVX); +IMPORT_DECODER(MULLD); +IMPORT_DECODER(SUBFME); +IMPORT_DECODER(ADDME); +IMPORT_DECODER(MULLW); +IMPORT_DECODER(DCBTST); +IMPORT_DECODER(STBUX); +IMPORT_DECODER(ADD); +IMPORT_DECODER(DCBT); +IMPORT_DECODER(LHZX); +IMPORT_DECODER(EQV); +IMPORT_DECODER(ECIWX); +IMPORT_DECODER(LHZUX); +IMPORT_DECODER(XOR); +IMPORT_DECODER(MFSPR); +IMPORT_DECODER(LWAX); +IMPORT_DECODER(DST); +IMPORT_DECODER(LHAX); +IMPORT_DECODER(LVXL); +IMPORT_DECODER(MFTB); +IMPORT_DECODER(LWAUX); +IMPORT_DECODER(DSTST); +IMPORT_DECODER(LHAUX); +IMPORT_DECODER(STHX); +IMPORT_DECODER(ORC); +IMPORT_DECODER(ECOWX); +IMPORT_DECODER(STHUX); +IMPORT_DECODER(OR); +IMPORT_DECODER(DIVDU); +IMPORT_DECODER(DIVWU); +IMPORT_DECODER(MTSPR); +IMPORT_DECODER(DCBI); +IMPORT_DECODER(NAND); +IMPORT_DECODER(STVXL); +IMPORT_DECODER(DIVD); +IMPORT_DECODER(DIVW); +IMPORT_DECODER(LVLX); +IMPORT_DECODER(LDBRX); +IMPORT_DECODER(LSWX); +IMPORT_DECODER(LWBRX); +IMPORT_DECODER(LFSX); +IMPORT_DECODER(SRW); +IMPORT_DECODER(SRD); +IMPORT_DECODER(LVRX); +IMPORT_DECODER(LSWI); +IMPORT_DECODER(LFSUX); +IMPORT_DECODER(SYNC); +IMPORT_DECODER(LFDX); +IMPORT_DECODER(LFDUX); +IMPORT_DECODER(STVLX); +IMPORT_DECODER(STDBRX); +IMPORT_DECODER(STSWX); +IMPORT_DECODER(STWBRX); +IMPORT_DECODER(STFSX); +IMPORT_DECODER(STVRX); +IMPORT_DECODER(STFSUX); +IMPORT_DECODER(STSWI); +IMPORT_DECODER(STFDX); +IMPORT_DECODER(STFDUX); +IMPORT_DECODER(LVLXL); +IMPORT_DECODER(LHBRX); +IMPORT_DECODER(SRAW); +IMPORT_DECODER(SRAD); +IMPORT_DECODER(LVRXL); +IMPORT_DECODER(DSS); +IMPORT_DECODER(SRAWI); +IMPORT_DECODER(SRADI); +IMPORT_DECODER(EIEIO); +IMPORT_DECODER(STVLXL); +IMPORT_DECODER(STHBRX); 
+IMPORT_DECODER(EXTSH); +IMPORT_DECODER(STVRXL); +IMPORT_DECODER(EXTSB); +IMPORT_DECODER(STFIWX); +IMPORT_DECODER(EXTSW); +IMPORT_DECODER(ICBI); +IMPORT_DECODER(DCBZ); +IMPORT_DECODER(LWZ); +IMPORT_DECODER(LWZU); +IMPORT_DECODER(LBZ); +IMPORT_DECODER(LBZU); +IMPORT_DECODER(STW); +IMPORT_DECODER(STWU); +IMPORT_DECODER(STB); +IMPORT_DECODER(STBU); +IMPORT_DECODER(LHZ); +IMPORT_DECODER(LHZU); +IMPORT_DECODER(LHA); +IMPORT_DECODER(LHAU); +IMPORT_DECODER(STH); +IMPORT_DECODER(STHU); +IMPORT_DECODER(LMW); +IMPORT_DECODER(STMW); +IMPORT_DECODER(LFS); +IMPORT_DECODER(LFSU); +IMPORT_DECODER(LFD); +IMPORT_DECODER(LFDU); +IMPORT_DECODER(STFS); +IMPORT_DECODER(STFSU); +IMPORT_DECODER(STFD); +IMPORT_DECODER(STFDU); +IMPORT_DECODER(LD); +IMPORT_DECODER(LDU); +IMPORT_DECODER(LWA); +IMPORT_DECODER(STD); +IMPORT_DECODER(STDU); +IMPORT_DECODER(FDIVS); +IMPORT_DECODER(FSUBS); +IMPORT_DECODER(FADDS); +IMPORT_DECODER(FSQRTS); +IMPORT_DECODER(FRES); +IMPORT_DECODER(FMULS); +IMPORT_DECODER(FMADDS); +IMPORT_DECODER(FMSUBS); +IMPORT_DECODER(FNMSUBS); +IMPORT_DECODER(FNMADDS); +IMPORT_DECODER(MTFSB1); +IMPORT_DECODER(MCRFS); +IMPORT_DECODER(MTFSB0); +IMPORT_DECODER(MTFSFI); +IMPORT_DECODER(MFFS); +IMPORT_DECODER(MTFSF); +IMPORT_DECODER(FCMPU); +IMPORT_DECODER(FRSP); +IMPORT_DECODER(FCTIW); +IMPORT_DECODER(FCTIWZ); +IMPORT_DECODER(FDIV); +IMPORT_DECODER(FSUB); +IMPORT_DECODER(FADD); +IMPORT_DECODER(FSQRT); +IMPORT_DECODER(FSEL); +IMPORT_DECODER(FMUL); +IMPORT_DECODER(FRSQRTE); +IMPORT_DECODER(FMSUB); +IMPORT_DECODER(FMADD); +IMPORT_DECODER(FNMSUB); +IMPORT_DECODER(FNMADD); +IMPORT_DECODER(FCMPO); +IMPORT_DECODER(FNEG); +IMPORT_DECODER(FMR); +IMPORT_DECODER(FNABS); +IMPORT_DECODER(FABS); +IMPORT_DECODER(FCTID); +IMPORT_DECODER(FCTIDZ); +IMPORT_DECODER(FCFID); +IMPORT_DECODER(UNK); +IMPORT_DECODER(SUBFCO); +IMPORT_DECODER(ADDCO); +IMPORT_DECODER(SUBFO); +IMPORT_DECODER(NEGO); +IMPORT_DECODER(SUBFEO); +IMPORT_DECODER(ADDEO); +IMPORT_DECODER(SUBFZEO); +IMPORT_DECODER(ADDZEO); +IMPORT_DECODER(SUBFMEO); +IMPORT_DECODER(MULLDO); +IMPORT_DECODER(ADDMEO); +IMPORT_DECODER(MULLWO); +IMPORT_DECODER(ADDO); +IMPORT_DECODER(DIVDUO); +IMPORT_DECODER(DIVWUO); +IMPORT_DECODER(DIVDO); +IMPORT_DECODER(DIVWO); +IMPORT_DECODER_ALIAS(SUBFCO_, SUBFCO); +IMPORT_DECODER_ALIAS(ADDCO_, ADDCO); +IMPORT_DECODER_ALIAS(SUBFO_, SUBFO); +IMPORT_DECODER_ALIAS(NEGO_, NEGO); +IMPORT_DECODER_ALIAS(SUBFEO_, SUBFEO); +IMPORT_DECODER_ALIAS(ADDEO_, ADDEO); +IMPORT_DECODER_ALIAS(SUBFZEO_, SUBFZEO); +IMPORT_DECODER_ALIAS(ADDZEO_, ADDZEO); +IMPORT_DECODER_ALIAS(SUBFMEO_, SUBFMEO); +IMPORT_DECODER_ALIAS(MULLDO_, MULLDO); +IMPORT_DECODER_ALIAS(ADDMEO_, ADDMEO); +IMPORT_DECODER_ALIAS(MULLWO_, MULLWO); +IMPORT_DECODER_ALIAS(ADDO_, ADDO); +IMPORT_DECODER_ALIAS(DIVDUO_, DIVDUO); +IMPORT_DECODER_ALIAS(DIVWUO_, DIVWUO); +IMPORT_DECODER_ALIAS(DIVDO_, DIVDO); +IMPORT_DECODER_ALIAS(DIVWO_, DIVWO); +IMPORT_DECODER_ALIAS(RLWIMI_, RLWIMI); +IMPORT_DECODER_ALIAS(RLWINM_, RLWINM); +IMPORT_DECODER_ALIAS(RLWNM_, RLWNM); +IMPORT_DECODER_ALIAS(RLDICL_, RLDICL); +IMPORT_DECODER_ALIAS(RLDICR_, RLDICR); +IMPORT_DECODER_ALIAS(RLDIC_, RLDIC); +IMPORT_DECODER_ALIAS(RLDIMI_, RLDIMI); +IMPORT_DECODER_ALIAS(RLDCL_, RLDCL); +IMPORT_DECODER_ALIAS(RLDCR_, RLDCR); +IMPORT_DECODER_ALIAS(SUBFC_, SUBFC); +IMPORT_DECODER_ALIAS(MULHDU_, MULHDU); +IMPORT_DECODER_ALIAS(ADDC_, ADDC); +IMPORT_DECODER_ALIAS(MULHWU_, MULHWU); +IMPORT_DECODER_ALIAS(SLW_, SLW); +IMPORT_DECODER_ALIAS(CNTLZW_, CNTLZW); +IMPORT_DECODER_ALIAS(SLD_, SLD); +IMPORT_DECODER_ALIAS(AND_, AND); +IMPORT_DECODER_ALIAS(SUBF_, SUBF); 
+IMPORT_DECODER_ALIAS(CNTLZD_, CNTLZD); +IMPORT_DECODER_ALIAS(ANDC_, ANDC); +IMPORT_DECODER_ALIAS(MULHD_, MULHD); +IMPORT_DECODER_ALIAS(MULHW_, MULHW); +IMPORT_DECODER_ALIAS(NEG_, NEG); +IMPORT_DECODER_ALIAS(NOR_, NOR); +IMPORT_DECODER_ALIAS(SUBFE_, SUBFE); +IMPORT_DECODER_ALIAS(ADDE_, ADDE); +IMPORT_DECODER_ALIAS(SUBFZE_, SUBFZE); +IMPORT_DECODER_ALIAS(ADDZE_, ADDZE); +IMPORT_DECODER_ALIAS(MULLD_, MULLD); +IMPORT_DECODER_ALIAS(SUBFME_, SUBFME); +IMPORT_DECODER_ALIAS(ADDME_, ADDME); +IMPORT_DECODER_ALIAS(MULLW_, MULLW); +IMPORT_DECODER_ALIAS(ADD_, ADD); +IMPORT_DECODER_ALIAS(EQV_, EQV); +IMPORT_DECODER_ALIAS(XOR_, XOR); +IMPORT_DECODER_ALIAS(ORC_, ORC); +IMPORT_DECODER_ALIAS(OR_, OR); +IMPORT_DECODER_ALIAS(DIVDU_, DIVDU); +IMPORT_DECODER_ALIAS(DIVWU_, DIVWU); +IMPORT_DECODER_ALIAS(NAND_, NAND); +IMPORT_DECODER_ALIAS(DIVD_, DIVD); +IMPORT_DECODER_ALIAS(DIVW_, DIVW); +IMPORT_DECODER_ALIAS(SRW_, SRW); +IMPORT_DECODER_ALIAS(SRD_, SRD); +IMPORT_DECODER_ALIAS(SRAW_, SRAW); +IMPORT_DECODER_ALIAS(SRAD_, SRAD); +IMPORT_DECODER_ALIAS(SRAWI_, SRAWI); +IMPORT_DECODER_ALIAS(SRADI_, SRADI); +IMPORT_DECODER_ALIAS(EXTSH_, EXTSH); +IMPORT_DECODER_ALIAS(EXTSB_, EXTSB); +IMPORT_DECODER_ALIAS(EXTSW_, EXTSW); +IMPORT_DECODER_ALIAS(FDIVS_, FDIVS); +IMPORT_DECODER_ALIAS(FSUBS_, FSUBS); +IMPORT_DECODER_ALIAS(FADDS_, FADDS); +IMPORT_DECODER_ALIAS(FSQRTS_, FSQRTS); +IMPORT_DECODER_ALIAS(FRES_, FRES); +IMPORT_DECODER_ALIAS(FMULS_, FMULS); +IMPORT_DECODER_ALIAS(FMADDS_, FMADDS); +IMPORT_DECODER_ALIAS(FMSUBS_, FMSUBS); +IMPORT_DECODER_ALIAS(FNMSUBS_, FNMSUBS); +IMPORT_DECODER_ALIAS(FNMADDS_, FNMADDS); +IMPORT_DECODER_ALIAS(MTFSB1_, MTFSB1); +IMPORT_DECODER_ALIAS(MTFSB0_, MTFSB0); +IMPORT_DECODER_ALIAS(MTFSFI_, MTFSFI); +IMPORT_DECODER_ALIAS(MFFS_, MFFS); +IMPORT_DECODER_ALIAS(MTFSF_, MTFSF); +IMPORT_DECODER_ALIAS(FRSP_, FRSP); +IMPORT_DECODER_ALIAS(FCTIW_, FCTIW); +IMPORT_DECODER_ALIAS(FCTIWZ_, FCTIWZ); +IMPORT_DECODER_ALIAS(FDIV_, FDIV); +IMPORT_DECODER_ALIAS(FSUB_, FSUB); +IMPORT_DECODER_ALIAS(FADD_, FADD); +IMPORT_DECODER_ALIAS(FSQRT_, FSQRT); +IMPORT_DECODER_ALIAS(FSEL_, FSEL); +IMPORT_DECODER_ALIAS(FMUL_, FMUL); +IMPORT_DECODER_ALIAS(FRSQRTE_, FRSQRTE); +IMPORT_DECODER_ALIAS(FMSUB_, FMSUB); +IMPORT_DECODER_ALIAS(FMADD_, FMADD); +IMPORT_DECODER_ALIAS(FNMSUB_, FNMSUB); +IMPORT_DECODER_ALIAS(FNMADD_, FNMADD); +IMPORT_DECODER_ALIAS(FNEG_, FNEG); +IMPORT_DECODER_ALIAS(FMR_, FMR); +IMPORT_DECODER_ALIAS(FNABS_, FNABS); +IMPORT_DECODER_ALIAS(FABS_, FABS); +IMPORT_DECODER_ALIAS(FCTID_, FCTID); +IMPORT_DECODER_ALIAS(FCTIDZ_, FCTIDZ); +IMPORT_DECODER_ALIAS(FCFID_, FCFID); +#undef IMPORT_DECODER +#undef IMPORT_DECODER_ALIAS + +PPUInterpreter::PPUInterpreter() +{ + for (auto& isel : impl) + { + isel = [](PPUContext&, rx::cell::ppu::Instruction) + { + fmt::throw_exception("PPU Invalid Instruction"); + }; + } +#define DEFINE_DECODER(x) \ + impl[static_cast(rx::cell::ppu::Opcode::x)] = ISEL_PPU_##x##_DEC + + DEFINE_DECODER(MFVSCR); + DEFINE_DECODER(MTVSCR); + DEFINE_DECODER(VADDCUW); + DEFINE_DECODER(VADDFP); + DEFINE_DECODER(VADDSBS); + DEFINE_DECODER(VADDSHS); + DEFINE_DECODER(VADDSWS); + DEFINE_DECODER(VADDUBM); + DEFINE_DECODER(VADDUBS); + DEFINE_DECODER(VADDUHM); + DEFINE_DECODER(VADDUHS); + DEFINE_DECODER(VADDUWM); + DEFINE_DECODER(VADDUWS); + DEFINE_DECODER(VAND); + DEFINE_DECODER(VANDC); + DEFINE_DECODER(VAVGSB); + DEFINE_DECODER(VAVGSH); + DEFINE_DECODER(VAVGSW); + DEFINE_DECODER(VAVGUB); + DEFINE_DECODER(VAVGUH); + DEFINE_DECODER(VAVGUW); + DEFINE_DECODER(VCFSX); + DEFINE_DECODER(VCFUX); + DEFINE_DECODER(VCMPBFP); + 
DEFINE_DECODER(VCMPBFP_); + DEFINE_DECODER(VCMPEQFP); + DEFINE_DECODER(VCMPEQFP_); + DEFINE_DECODER(VCMPEQUB); + DEFINE_DECODER(VCMPEQUB_); + DEFINE_DECODER(VCMPEQUH); + DEFINE_DECODER(VCMPEQUH_); + DEFINE_DECODER(VCMPEQUW); + DEFINE_DECODER(VCMPEQUW_); + DEFINE_DECODER(VCMPGEFP); + DEFINE_DECODER(VCMPGEFP_); + DEFINE_DECODER(VCMPGTFP); + DEFINE_DECODER(VCMPGTFP_); + DEFINE_DECODER(VCMPGTSB); + DEFINE_DECODER(VCMPGTSB_); + DEFINE_DECODER(VCMPGTSH); + DEFINE_DECODER(VCMPGTSH_); + DEFINE_DECODER(VCMPGTSW); + DEFINE_DECODER(VCMPGTSW_); + DEFINE_DECODER(VCMPGTUB); + DEFINE_DECODER(VCMPGTUB_); + DEFINE_DECODER(VCMPGTUH); + DEFINE_DECODER(VCMPGTUH_); + DEFINE_DECODER(VCMPGTUW); + DEFINE_DECODER(VCMPGTUW_); + DEFINE_DECODER(VCTSXS); + DEFINE_DECODER(VCTUXS); + DEFINE_DECODER(VEXPTEFP); + DEFINE_DECODER(VLOGEFP); + DEFINE_DECODER(VMADDFP); + DEFINE_DECODER(VMAXFP); + DEFINE_DECODER(VMAXSB); + DEFINE_DECODER(VMAXSH); + DEFINE_DECODER(VMAXSW); + DEFINE_DECODER(VMAXUB); + DEFINE_DECODER(VMAXUH); + DEFINE_DECODER(VMAXUW); + DEFINE_DECODER(VMHADDSHS); + DEFINE_DECODER(VMHRADDSHS); + DEFINE_DECODER(VMINFP); + DEFINE_DECODER(VMINSB); + DEFINE_DECODER(VMINSH); + DEFINE_DECODER(VMINSW); + DEFINE_DECODER(VMINUB); + DEFINE_DECODER(VMINUH); + DEFINE_DECODER(VMINUW); + DEFINE_DECODER(VMLADDUHM); + DEFINE_DECODER(VMRGHB); + DEFINE_DECODER(VMRGHH); + DEFINE_DECODER(VMRGHW); + DEFINE_DECODER(VMRGLB); + DEFINE_DECODER(VMRGLH); + DEFINE_DECODER(VMRGLW); + DEFINE_DECODER(VMSUMMBM); + DEFINE_DECODER(VMSUMSHM); + DEFINE_DECODER(VMSUMSHS); + DEFINE_DECODER(VMSUMUBM); + DEFINE_DECODER(VMSUMUHM); + DEFINE_DECODER(VMSUMUHS); + DEFINE_DECODER(VMULESB); + DEFINE_DECODER(VMULESH); + DEFINE_DECODER(VMULEUB); + DEFINE_DECODER(VMULEUH); + DEFINE_DECODER(VMULOSB); + DEFINE_DECODER(VMULOSH); + DEFINE_DECODER(VMULOUB); + DEFINE_DECODER(VMULOUH); + DEFINE_DECODER(VNMSUBFP); + DEFINE_DECODER(VNOR); + DEFINE_DECODER(VOR); + DEFINE_DECODER(VPERM); + DEFINE_DECODER(VPKPX); + DEFINE_DECODER(VPKSHSS); + DEFINE_DECODER(VPKSHUS); + DEFINE_DECODER(VPKSWSS); + DEFINE_DECODER(VPKSWUS); + DEFINE_DECODER(VPKUHUM); + DEFINE_DECODER(VPKUHUS); + DEFINE_DECODER(VPKUWUM); + DEFINE_DECODER(VPKUWUS); + DEFINE_DECODER(VREFP); + DEFINE_DECODER(VRFIM); + DEFINE_DECODER(VRFIN); + DEFINE_DECODER(VRFIP); + DEFINE_DECODER(VRFIZ); + DEFINE_DECODER(VRLB); + DEFINE_DECODER(VRLH); + DEFINE_DECODER(VRLW); + DEFINE_DECODER(VRSQRTEFP); + DEFINE_DECODER(VSEL); + DEFINE_DECODER(VSL); + DEFINE_DECODER(VSLB); + DEFINE_DECODER(VSLDOI); + DEFINE_DECODER(VSLH); + DEFINE_DECODER(VSLO); + DEFINE_DECODER(VSLW); + DEFINE_DECODER(VSPLTB); + DEFINE_DECODER(VSPLTH); + DEFINE_DECODER(VSPLTISB); + DEFINE_DECODER(VSPLTISH); + DEFINE_DECODER(VSPLTISW); + DEFINE_DECODER(VSPLTW); + DEFINE_DECODER(VSR); + DEFINE_DECODER(VSRAB); + DEFINE_DECODER(VSRAH); + DEFINE_DECODER(VSRAW); + DEFINE_DECODER(VSRB); + DEFINE_DECODER(VSRH); + DEFINE_DECODER(VSRO); + DEFINE_DECODER(VSRW); + DEFINE_DECODER(VSUBCUW); + DEFINE_DECODER(VSUBFP); + DEFINE_DECODER(VSUBSBS); + DEFINE_DECODER(VSUBSHS); + DEFINE_DECODER(VSUBSWS); + DEFINE_DECODER(VSUBUBM); + DEFINE_DECODER(VSUBUBS); + DEFINE_DECODER(VSUBUHM); + DEFINE_DECODER(VSUBUHS); + DEFINE_DECODER(VSUBUWM); + DEFINE_DECODER(VSUBUWS); + DEFINE_DECODER(VSUMSWS); + DEFINE_DECODER(VSUM2SWS); + DEFINE_DECODER(VSUM4SBS); + DEFINE_DECODER(VSUM4SHS); + DEFINE_DECODER(VSUM4UBS); + DEFINE_DECODER(VUPKHPX); + DEFINE_DECODER(VUPKHSB); + DEFINE_DECODER(VUPKHSH); + DEFINE_DECODER(VUPKLPX); + DEFINE_DECODER(VUPKLSB); + DEFINE_DECODER(VUPKLSH); + DEFINE_DECODER(VXOR); + 
DEFINE_DECODER(TDI); + DEFINE_DECODER(TWI); + DEFINE_DECODER(MULLI); + DEFINE_DECODER(SUBFIC); + DEFINE_DECODER(CMPLI); + DEFINE_DECODER(CMPI); + DEFINE_DECODER(ADDIC); + DEFINE_DECODER(ADDI); + DEFINE_DECODER(ADDIS); + DEFINE_DECODER(BC); + DEFINE_DECODER(SC); + DEFINE_DECODER(B); + DEFINE_DECODER(MCRF); + DEFINE_DECODER(BCLR); + DEFINE_DECODER(RFID); + DEFINE_DECODER(CRNOR); + DEFINE_DECODER(RFSCV); + DEFINE_DECODER(CRANDC); + DEFINE_DECODER(ISYNC); + DEFINE_DECODER(CRXOR); + DEFINE_DECODER(CRNAND); + DEFINE_DECODER(CRAND); + DEFINE_DECODER(HRFID); + DEFINE_DECODER(CREQV); + DEFINE_DECODER(URFID); + DEFINE_DECODER(STOP); + DEFINE_DECODER(CRORC); + DEFINE_DECODER(CROR); + DEFINE_DECODER(BCCTR); + DEFINE_DECODER(RLWIMI); + DEFINE_DECODER(RLWINM); + DEFINE_DECODER(RLWNM); + DEFINE_DECODER(ORI); + DEFINE_DECODER(ORIS); + DEFINE_DECODER(XORI); + DEFINE_DECODER(XORIS); + DEFINE_DECODER(ANDI); + DEFINE_DECODER(ANDIS); + DEFINE_DECODER(RLDICL); + DEFINE_DECODER(RLDICR); + DEFINE_DECODER(RLDIC); + DEFINE_DECODER(RLDIMI); + DEFINE_DECODER(RLDCL); + DEFINE_DECODER(RLDCR); + DEFINE_DECODER(CMP); + DEFINE_DECODER(TW); + DEFINE_DECODER(LVSL); + DEFINE_DECODER(LVEBX); + DEFINE_DECODER(SUBFC); + DEFINE_DECODER(MULHDU); + DEFINE_DECODER(ADDC); + DEFINE_DECODER(MULHWU); + DEFINE_DECODER(MFOCRF); + DEFINE_DECODER(LWARX); + DEFINE_DECODER(LDX); + DEFINE_DECODER(LWZX); + DEFINE_DECODER(SLW); + DEFINE_DECODER(CNTLZW); + DEFINE_DECODER(SLD); + DEFINE_DECODER(AND); + DEFINE_DECODER(CMPL); + DEFINE_DECODER(LVSR); + DEFINE_DECODER(LVEHX); + DEFINE_DECODER(SUBF); + DEFINE_DECODER(LDUX); + DEFINE_DECODER(DCBST); + DEFINE_DECODER(LWZUX); + DEFINE_DECODER(CNTLZD); + DEFINE_DECODER(ANDC); + DEFINE_DECODER(TD); + DEFINE_DECODER(LVEWX); + DEFINE_DECODER(MULHD); + DEFINE_DECODER(MULHW); + DEFINE_DECODER(LDARX); + DEFINE_DECODER(DCBF); + DEFINE_DECODER(LBZX); + DEFINE_DECODER(LVX); + DEFINE_DECODER(NEG); + DEFINE_DECODER(LBZUX); + DEFINE_DECODER(NOR); + DEFINE_DECODER(STVEBX); + DEFINE_DECODER(SUBFE); + DEFINE_DECODER(ADDE); + DEFINE_DECODER(MTOCRF); + DEFINE_DECODER(STDX); + DEFINE_DECODER(STWCX); + DEFINE_DECODER(STWX); + DEFINE_DECODER(STVEHX); + DEFINE_DECODER(STDUX); + DEFINE_DECODER(STWUX); + DEFINE_DECODER(STVEWX); + DEFINE_DECODER(SUBFZE); + DEFINE_DECODER(ADDZE); + DEFINE_DECODER(STDCX); + DEFINE_DECODER(STBX); + DEFINE_DECODER(STVX); + DEFINE_DECODER(MULLD); + DEFINE_DECODER(SUBFME); + DEFINE_DECODER(ADDME); + DEFINE_DECODER(MULLW); + DEFINE_DECODER(DCBTST); + DEFINE_DECODER(STBUX); + DEFINE_DECODER(ADD); + DEFINE_DECODER(DCBT); + DEFINE_DECODER(LHZX); + DEFINE_DECODER(EQV); + DEFINE_DECODER(ECIWX); + DEFINE_DECODER(LHZUX); + DEFINE_DECODER(XOR); + DEFINE_DECODER(MFSPR); + DEFINE_DECODER(LWAX); + DEFINE_DECODER(DST); + DEFINE_DECODER(LHAX); + DEFINE_DECODER(LVXL); + DEFINE_DECODER(MFTB); + DEFINE_DECODER(LWAUX); + DEFINE_DECODER(DSTST); + DEFINE_DECODER(LHAUX); + DEFINE_DECODER(STHX); + DEFINE_DECODER(ORC); + DEFINE_DECODER(ECOWX); + DEFINE_DECODER(STHUX); + DEFINE_DECODER(OR); + DEFINE_DECODER(DIVDU); + DEFINE_DECODER(DIVWU); + DEFINE_DECODER(MTSPR); + DEFINE_DECODER(DCBI); + DEFINE_DECODER(NAND); + DEFINE_DECODER(STVXL); + DEFINE_DECODER(DIVD); + DEFINE_DECODER(DIVW); + DEFINE_DECODER(LVLX); + DEFINE_DECODER(LDBRX); + DEFINE_DECODER(LSWX); + DEFINE_DECODER(LWBRX); + DEFINE_DECODER(LFSX); + DEFINE_DECODER(SRW); + DEFINE_DECODER(SRD); + DEFINE_DECODER(LVRX); + DEFINE_DECODER(LSWI); + DEFINE_DECODER(LFSUX); + DEFINE_DECODER(SYNC); + DEFINE_DECODER(LFDX); + DEFINE_DECODER(LFDUX); + DEFINE_DECODER(STVLX); + 
DEFINE_DECODER(STDBRX); + DEFINE_DECODER(STSWX); + DEFINE_DECODER(STWBRX); + DEFINE_DECODER(STFSX); + DEFINE_DECODER(STVRX); + DEFINE_DECODER(STFSUX); + DEFINE_DECODER(STSWI); + DEFINE_DECODER(STFDX); + DEFINE_DECODER(STFDUX); + DEFINE_DECODER(LVLXL); + DEFINE_DECODER(LHBRX); + DEFINE_DECODER(SRAW); + DEFINE_DECODER(SRAD); + DEFINE_DECODER(LVRXL); + DEFINE_DECODER(DSS); + DEFINE_DECODER(SRAWI); + DEFINE_DECODER(SRADI); + DEFINE_DECODER(EIEIO); + DEFINE_DECODER(STVLXL); + DEFINE_DECODER(STHBRX); + DEFINE_DECODER(EXTSH); + DEFINE_DECODER(STVRXL); + DEFINE_DECODER(EXTSB); + DEFINE_DECODER(STFIWX); + DEFINE_DECODER(EXTSW); + DEFINE_DECODER(ICBI); + DEFINE_DECODER(DCBZ); + DEFINE_DECODER(LWZ); + DEFINE_DECODER(LWZU); + DEFINE_DECODER(LBZ); + DEFINE_DECODER(LBZU); + DEFINE_DECODER(STW); + DEFINE_DECODER(STWU); + DEFINE_DECODER(STB); + DEFINE_DECODER(STBU); + DEFINE_DECODER(LHZ); + DEFINE_DECODER(LHZU); + DEFINE_DECODER(LHA); + DEFINE_DECODER(LHAU); + DEFINE_DECODER(STH); + DEFINE_DECODER(STHU); + DEFINE_DECODER(LMW); + DEFINE_DECODER(STMW); + DEFINE_DECODER(LFS); + DEFINE_DECODER(LFSU); + DEFINE_DECODER(LFD); + DEFINE_DECODER(LFDU); + DEFINE_DECODER(STFS); + DEFINE_DECODER(STFSU); + DEFINE_DECODER(STFD); + DEFINE_DECODER(STFDU); + DEFINE_DECODER(LD); + DEFINE_DECODER(LDU); + DEFINE_DECODER(LWA); + DEFINE_DECODER(STD); + DEFINE_DECODER(STDU); + DEFINE_DECODER(FDIVS); + DEFINE_DECODER(FSUBS); + DEFINE_DECODER(FADDS); + DEFINE_DECODER(FSQRTS); + DEFINE_DECODER(FRES); + DEFINE_DECODER(FMULS); + DEFINE_DECODER(FMADDS); + DEFINE_DECODER(FMSUBS); + DEFINE_DECODER(FNMSUBS); + DEFINE_DECODER(FNMADDS); + DEFINE_DECODER(MTFSB1); + DEFINE_DECODER(MCRFS); + DEFINE_DECODER(MTFSB0); + DEFINE_DECODER(MTFSFI); + DEFINE_DECODER(MFFS); + DEFINE_DECODER(MTFSF); + DEFINE_DECODER(FCMPU); + DEFINE_DECODER(FRSP); + DEFINE_DECODER(FCTIW); + DEFINE_DECODER(FCTIWZ); + DEFINE_DECODER(FDIV); + DEFINE_DECODER(FSUB); + DEFINE_DECODER(FADD); + DEFINE_DECODER(FSQRT); + DEFINE_DECODER(FSEL); + DEFINE_DECODER(FMUL); + DEFINE_DECODER(FRSQRTE); + DEFINE_DECODER(FMSUB); + DEFINE_DECODER(FMADD); + DEFINE_DECODER(FNMSUB); + DEFINE_DECODER(FNMADD); + DEFINE_DECODER(FCMPO); + DEFINE_DECODER(FNEG); + DEFINE_DECODER(FMR); + DEFINE_DECODER(FNABS); + DEFINE_DECODER(FABS); + DEFINE_DECODER(FCTID); + DEFINE_DECODER(FCTIDZ); + DEFINE_DECODER(FCFID); + DEFINE_DECODER(UNK); + DEFINE_DECODER(SUBFCO); + DEFINE_DECODER(ADDCO); + DEFINE_DECODER(SUBFO); + DEFINE_DECODER(NEGO); + DEFINE_DECODER(SUBFEO); + DEFINE_DECODER(ADDEO); + DEFINE_DECODER(SUBFZEO); + DEFINE_DECODER(ADDZEO); + DEFINE_DECODER(SUBFMEO); + DEFINE_DECODER(MULLDO); + DEFINE_DECODER(ADDMEO); + DEFINE_DECODER(MULLWO); + DEFINE_DECODER(ADDO); + DEFINE_DECODER(DIVDUO); + DEFINE_DECODER(DIVWUO); + DEFINE_DECODER(DIVDO); + DEFINE_DECODER(DIVWO); + DEFINE_DECODER(SUBFCO_); + DEFINE_DECODER(ADDCO_); + DEFINE_DECODER(SUBFO_); + DEFINE_DECODER(NEGO_); + DEFINE_DECODER(SUBFEO_); + DEFINE_DECODER(ADDEO_); + DEFINE_DECODER(SUBFZEO_); + DEFINE_DECODER(ADDZEO_); + DEFINE_DECODER(SUBFMEO_); + DEFINE_DECODER(MULLDO_); + DEFINE_DECODER(ADDMEO_); + DEFINE_DECODER(MULLWO_); + DEFINE_DECODER(ADDO_); + DEFINE_DECODER(DIVDUO_); + DEFINE_DECODER(DIVWUO_); + DEFINE_DECODER(DIVDO_); + DEFINE_DECODER(DIVWO_); + DEFINE_DECODER(RLWIMI_); + DEFINE_DECODER(RLWINM_); + DEFINE_DECODER(RLWNM_); + DEFINE_DECODER(RLDICL_); + DEFINE_DECODER(RLDICR_); + DEFINE_DECODER(RLDIC_); + DEFINE_DECODER(RLDIMI_); + DEFINE_DECODER(RLDCL_); + DEFINE_DECODER(RLDCR_); + DEFINE_DECODER(SUBFC_); + DEFINE_DECODER(MULHDU_); + 
DEFINE_DECODER(ADDC_); + DEFINE_DECODER(MULHWU_); + DEFINE_DECODER(SLW_); + DEFINE_DECODER(CNTLZW_); + DEFINE_DECODER(SLD_); + DEFINE_DECODER(AND_); + DEFINE_DECODER(SUBF_); + DEFINE_DECODER(CNTLZD_); + DEFINE_DECODER(ANDC_); + DEFINE_DECODER(MULHD_); + DEFINE_DECODER(MULHW_); + DEFINE_DECODER(NEG_); + DEFINE_DECODER(NOR_); + DEFINE_DECODER(SUBFE_); + DEFINE_DECODER(ADDE_); + DEFINE_DECODER(SUBFZE_); + DEFINE_DECODER(ADDZE_); + DEFINE_DECODER(MULLD_); + DEFINE_DECODER(SUBFME_); + DEFINE_DECODER(ADDME_); + DEFINE_DECODER(MULLW_); + DEFINE_DECODER(ADD_); + DEFINE_DECODER(EQV_); + DEFINE_DECODER(XOR_); + DEFINE_DECODER(ORC_); + DEFINE_DECODER(OR_); + DEFINE_DECODER(DIVDU_); + DEFINE_DECODER(DIVWU_); + DEFINE_DECODER(NAND_); + DEFINE_DECODER(DIVD_); + DEFINE_DECODER(DIVW_); + DEFINE_DECODER(SRW_); + DEFINE_DECODER(SRD_); + DEFINE_DECODER(SRAW_); + DEFINE_DECODER(SRAD_); + DEFINE_DECODER(SRAWI_); + DEFINE_DECODER(SRADI_); + DEFINE_DECODER(EXTSH_); + DEFINE_DECODER(EXTSB_); + DEFINE_DECODER(EXTSW_); + DEFINE_DECODER(FDIVS_); + DEFINE_DECODER(FSUBS_); + DEFINE_DECODER(FADDS_); + DEFINE_DECODER(FSQRTS_); + DEFINE_DECODER(FRES_); + DEFINE_DECODER(FMULS_); + DEFINE_DECODER(FMADDS_); + DEFINE_DECODER(FMSUBS_); + DEFINE_DECODER(FNMSUBS_); + DEFINE_DECODER(FNMADDS_); + DEFINE_DECODER(MTFSB1_); + DEFINE_DECODER(MTFSB0_); + DEFINE_DECODER(MTFSFI_); + DEFINE_DECODER(MFFS_); + DEFINE_DECODER(MTFSF_); + DEFINE_DECODER(FRSP_); + DEFINE_DECODER(FCTIW_); + DEFINE_DECODER(FCTIWZ_); + DEFINE_DECODER(FDIV_); + DEFINE_DECODER(FSUB_); + DEFINE_DECODER(FADD_); + DEFINE_DECODER(FSQRT_); + DEFINE_DECODER(FSEL_); + DEFINE_DECODER(FMUL_); + DEFINE_DECODER(FRSQRTE_); + DEFINE_DECODER(FMSUB_); + DEFINE_DECODER(FMADD_); + DEFINE_DECODER(FNMSUB_); + DEFINE_DECODER(FNMADD_); + DEFINE_DECODER(FNEG_); + DEFINE_DECODER(FMR_); + DEFINE_DECODER(FNABS_); + DEFINE_DECODER(FABS_); + DEFINE_DECODER(FCTID_); + DEFINE_DECODER(FCTIDZ_); + DEFINE_DECODER(FCFID_); +#undef DEFINE_DECODER +} + +static ppu_intrp_func ppu_ret = {[](ppu_thread& ppu, ppu_opcode_t, be_t<u32>* this_op, ppu_intrp_func*) + { + // Fix PC and return (step execution) + ppu.cia = vm::get_addr(this_op); + return; + }}; + +void PPUInterpreter::interpret(PPUContext& context, std::uint32_t inst) +{ + auto op = rx::cell::ppu::getOpcode(inst); + auto instructionAddress = context.cia; + + auto this_op = reinterpret_cast<be_t<u32>*>(vm::g_base_addr + instructionAddress); + + const auto fn = *reinterpret_cast<ppu_intrp_func_t*>(vm::g_exec_addr + u64{instructionAddress} * 2); + + if (fn) + { + fn(static_cast<ppu_thread&>(context), std::bit_cast<ppu_opcode_t>(inst), this_op, &ppu_ret); + return; + } + + // if (op == rx::cell::ppu::Opcode::Invalid) + { + if (g_fxo->get<ppu_function_manager>().is_func(context.cia)) + { + ppu_intrp_func_t hle_function = nullptr; + auto hle_addr = g_fxo->get<ppu_function_manager>().addr; + // HLE function index + const u32 index = (context.cia - hle_addr) / 8; + + if (context.cia % 8 == 4 && index < ppu_function_manager::get().size()) + { + // HLE function placement + hle_function = ppu_function_manager::get()[index]; + } + + if (hle_function) + { + hle_function(static_cast<ppu_thread&>(context), std::bit_cast<ppu_opcode_t>(inst), this_op, nullptr); + return; + } + } + } + + // std::fprintf(stderr, "%08x: %s\n", instructionAddress, std::format("{}", op).c_str()); + impl[static_cast<std::size_t>(op)](context, std::bit_cast<rx::cell::ppu::Instruction>(inst)); + + if (context.cia == instructionAddress && + op != rx::cell::ppu::Opcode::B && + op != rx::cell::ppu::Opcode::BC && + op != rx::cell::ppu::Opcode::BCLR && + op != rx::cell::ppu::Opcode::BCCTR) + { + context.cia += sizeof(std::uint32_t); + } +} + +extern "C" +{ + 
[[noreturn]] void rpcsx_trap() + { + fmt::throw_exception("PPU Trap"); + } + [[noreturn]] void rpcsx_invalid_instruction() + { + fmt::throw_exception("PPU Invalid Instruction"); + } + [[noreturn]] void rpcsx_unimplemented_instruction() + { + fmt::throw_exception("PPU Unimplemented Instruction"); + } + + void rpcsx_vm_read(std::uint64_t vaddr, void* dest, std::size_t size) + { + std::memcpy(dest, vm::g_base_addr + vaddr, size); + } + void rpcsx_vm_write(std::uint64_t vaddr, const void* src, std::size_t size) + { + std::memcpy(vm::g_base_addr + vaddr, src, size); + } + + std::uint64_t rpcsx_get_tb() + { + return get_timebased_time(); + } +} + +void ppu_execute_syscall(PPUContext& context, u64 code) +{ + return ppu_execute_syscall(static_cast<ppu_thread&>(context), code); +} +u32 ppu_lwarx(PPUContext& context, u32 addr) +{ + return ppu_lwarx(static_cast<ppu_thread&>(context), addr); +} +u64 ppu_ldarx(PPUContext& context, u32 addr) +{ + return ppu_ldarx(static_cast<ppu_thread&>(context), addr); +} +bool ppu_stwcx(PPUContext& context, u32 addr, u32 reg_value) +{ + return ppu_stwcx(static_cast<ppu_thread&>(context), addr, reg_value); +} +bool ppu_stdcx(PPUContext& context, u32 addr, u64 reg_value) +{ + return ppu_stdcx(static_cast<ppu_thread&>(context), addr, reg_value); +} +void ppu_trap(PPUContext& context, u64 addr) +{ + return ppu_trap(static_cast<ppu_thread&>(context), addr); +} diff --git a/rpcs3/Emu/Cell/PPUInterpreter.h b/rpcs3/Emu/Cell/PPUInterpreter.h index ca8d03db5..64703e5f8 100644 --- a/rpcs3/Emu/Cell/PPUInterpreter.h +++ b/rpcs3/Emu/Cell/PPUInterpreter.h @@ -1,6 +1,11 @@ #pragma once #include "PPUOpcodes.h" +#include "rx/cpu/cell/ppu/Instruction.hpp" +#include "rx/cpu/cell/ppu/Opcode.hpp" +#include "rx/cpu/cell/ppu/PPUContext.hpp" +#include "rx/refl.hpp" +#include class ppu_thread; @@ -42,3 +47,12 @@ struct ppu_interpreter_rt : ppu_interpreter_rt_base private: ppu_decoder, ppu_intrp_func_t> table; }; + +struct PPUContext; + +struct PPUInterpreter +{ + std::array> impl; + PPUInterpreter(); + void interpret(PPUContext& context, std::uint32_t inst); +}; diff --git a/rpcs3/Emu/Cell/PPUModule.cpp b/rpcs3/Emu/Cell/PPUModule.cpp index 05e5a1a5e..4fca8778f 100644 --- a/rpcs3/Emu/Cell/PPUModule.cpp +++ b/rpcs3/Emu/Cell/PPUModule.cpp @@ -333,7 +333,7 @@ static void ppu_initialize_modules(ppu_linkage_info* link, utils::serial* ar = n }; // Initialize double-purpose fake OPD array for HLE functions - const auto& hle_funcs = ppu_function_manager::get(g_cfg.core.ppu_decoder != ppu_decoder_type::_static); + const auto& hle_funcs = ppu_function_manager::get(g_cfg.core.ppu_decoder == ppu_decoder_type::llvm_legacy); u32& hle_funcs_addr = g_fxo->get<ppu_function_manager>().addr; diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp index bb3e7bb8d..34d6c13d4 100644 --- a/rpcs3/Emu/Cell/PPUThread.cpp +++ b/rpcs3/Emu/Cell/PPUThread.cpp @@ -1,4 +1,5 @@ #include "stdafx.h" +#include "rx/cpu/cell/ppu/Decoder.hpp" #include "util/JIT.h" #include "util/StrUtil.h" #include "util/serialization.hpp" @@ -27,6 +28,9 @@ #include "lv2/sys_overlay.h" #include "lv2/sys_process.h" #include "lv2/sys_spu.h" +#include +#include +#include #ifdef LLVM_AVAILABLE #ifdef _MSC_VER @@ -317,12 +321,12 @@ const auto ppu_gateway = build_function_asm("ppu_gateway" #endif // Save native stack pointer for longjmp emulation - c.mov(x86::qword_ptr(args[0], ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs)), x86::rsp); + c.mov(x86::qword_ptr(args[0], OFFSET_OF(ppu_thread, hv_ctx.regs)), x86::rsp); // Initialize args c.mov(x86::r13, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_exec_addr))); 
c.mov(x86::rbp, args[0]); - c.mov(x86::edx, x86::dword_ptr(x86::rbp, ::offset32(&ppu_thread::cia))); // Load PC + c.mov(x86::edx, x86::dword_ptr(x86::rbp, OFFSET_OF(ppu_thread, cia))); // Load PC c.mov(x86::rax, x86::qword_ptr(x86::r13, x86::edx, 1, 0)); // Load call target c.mov(x86::rdx, x86::rax); @@ -333,9 +337,9 @@ const auto ppu_gateway = build_function_asm("ppu_gateway" c.mov(x86::r12d, x86::edx); // Load relocation base c.mov(x86::rbx, x86::qword_ptr(reinterpret_cast(&vm::g_base_addr))); - c.mov(x86::r14, x86::qword_ptr(x86::rbp, ::offset32(&ppu_thread::gpr, 0))); // Load some registers - c.mov(x86::rsi, x86::qword_ptr(x86::rbp, ::offset32(&ppu_thread::gpr, 1))); - c.mov(x86::rdi, x86::qword_ptr(x86::rbp, ::offset32(&ppu_thread::gpr, 2))); + c.mov(x86::r14, x86::qword_ptr(x86::rbp, OFFSET_OF(ppu_thread, gpr[0]))); // Load some registers + c.mov(x86::rsi, x86::qword_ptr(x86::rbp, OFFSET_OF(ppu_thread, gpr[1]))); + c.mov(x86::rdi, x86::qword_ptr(x86::rbp, OFFSET_OF(ppu_thread, gpr[2]))); if (utils::has_avx()) { @@ -403,7 +407,7 @@ const auto ppu_gateway = build_function_asm("ppu_gateway" // pc, sp // x18, x19...x30 // NOTE: Do not touch x19..x30 before saving the registers! - const u64 hv_register_array_offset = ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs); + const u64 hv_register_array_offset = OFFSET_OF(ppu_thread, hv_ctx.regs); Label hv_ctx_pc = c.newLabel(); // Used to hold the far jump return address // Sanity @@ -434,7 +438,7 @@ const auto ppu_gateway = build_function_asm("ppu_gateway" const arm::GpX pc = a64::x15; const arm::GpX cia_addr_reg = a64::x11; // Load offset value - c.mov(cia_addr_reg, Imm(static_cast(::offset32(&ppu_thread::cia)))); + c.mov(cia_addr_reg, Imm(static_cast(OFFSET_OF(ppu_thread, cia)))); // Load cia c.ldr(pc.w(), arm::Mem(ppu_t_base, cia_addr_reg)); @@ -459,7 +463,7 @@ const auto ppu_gateway = build_function_asm("ppu_gateway" c.ldr(a64::x22, arm::Mem(a64::x22)); const arm::GpX gpr_addr_reg = a64::x9; - c.mov(gpr_addr_reg, Imm(static_cast(::offset32(&ppu_thread::gpr)))); + c.mov(gpr_addr_reg, Imm(static_cast(OFFSET_OF(ppu_thread, gpr)))); c.add(gpr_addr_reg, gpr_addr_reg, ppu_t_base); c.ldr(a64::x23, arm::Mem(gpr_addr_reg)); c.ldr(a64::x24, arm::Mem(gpr_addr_reg, 8)); @@ -514,7 +518,7 @@ const extern auto ppu_escape = build_function_asm("ppu_es #if defined(ARCH_X64) // Restore native stack pointer (longjmp emulation) - c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs))); + c.mov(x86::rsp, x86::qword_ptr(args[0], OFFSET_OF(ppu_thread, hv_ctx.regs))); // Return to the return location c.sub(x86::rsp, 8); @@ -523,7 +527,7 @@ const extern auto ppu_escape = build_function_asm("ppu_es // We really shouldn't be using this, but an implementation shoudln't hurt // Far jump return. Only clobbers x30. 
const arm::GpX ppu_t_base = a64::x20; - const u64 hv_register_array_offset = ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs); + const u64 hv_register_array_offset = OFFSET_OF(ppu_thread, hv_ctx.regs); c.mov(ppu_t_base, args[0]); c.mov(a64::x30, Imm(hv_register_array_offset)); c.ldr(a64::x30, arm::Mem(ppu_t_base, a64::x30)); @@ -581,7 +585,7 @@ static inline ppu_intrp_func_t ppu_read(u32 addr) // Get interpreter cache value static ppu_intrp_func_t ppu_cache(u32 addr) { - if (g_cfg.core.ppu_decoder != ppu_decoder_type::_static) + if (g_cfg.core.ppu_decoder == ppu_decoder_type::llvm_legacy) { fmt::throw_exception("Invalid PPU decoder"); } @@ -882,7 +886,7 @@ extern void ppu_register_range(u32 addr, u32 size) while (size) { - if (g_cfg.core.ppu_decoder == ppu_decoder_type::llvm) + if (g_cfg.core.ppu_decoder == ppu_decoder_type::llvm_legacy) { // Assume addr is the start of first segment of PRX const uptr entry_value = reinterpret_cast(ppu_recompiler_fallback_ghc) | (seg_base << (32 + 3)); @@ -919,7 +923,7 @@ extern void ppu_register_function_at(u32 addr, u32 size, ppu_intrp_func_t ptr = return; } - if (g_cfg.core.ppu_decoder == ppu_decoder_type::llvm) + if (g_cfg.core.ppu_decoder != ppu_decoder_type::_static) { return; } @@ -1097,14 +1101,14 @@ struct ppu_far_jumps_t #ifdef ARCH_X64 c.mov(args[0], x86::rbp); - c.mov(x86::dword_ptr(args[0], ::offset32(&ppu_thread::cia)), pc); + c.mov(x86::dword_ptr(args[0], OFFSET_OF(ppu_thread, cia)), pc); c.jmp(ppu_far_jump); #else Label jmp_address = c.newLabel(); Label imm_address = c.newLabel(); c.ldr(args[1].w(), arm::ptr(imm_address)); - c.str(args[1].w(), arm::Mem(args[0], ::offset32(&ppu_thread::cia))); + c.str(args[1].w(), arm::Mem(args[0], OFFSET_OF(ppu_thread, cia))); c.ldr(args[1], arm::ptr(jmp_address)); c.br(args[1]); @@ -1204,7 +1208,7 @@ bool ppu_form_branch_to_code(u32 entry, u32 target, bool link, bool with_toc, st std::lock_guard lock(jumps.mutex); jumps.vals.insert_or_assign(entry, ppu_far_jumps_t::all_info_t{target, link, with_toc, std::move(module_name)}); - ppu_register_function_at(entry, 4, g_cfg.core.ppu_decoder == ppu_decoder_type::_static ? &ppu_far_jump : ensure(g_fxo->get().gen_jump(entry))); + ppu_register_function_at(entry, 4, g_cfg.core.ppu_decoder != ppu_decoder_type::llvm_legacy ? 
&ppu_far_jump : ensure(g_fxo->get().gen_jump(entry))); return true; } @@ -1288,7 +1292,7 @@ static void ppu_break(ppu_thread& ppu, ppu_opcode_t, be_t* this_op, ppu_int // Set or remove breakpoint extern bool ppu_breakpoint(u32 addr, bool is_adding) { - if (addr % 4 || !vm::check_addr(addr, vm::page_executable) || g_cfg.core.ppu_decoder == ppu_decoder_type::llvm) + if (addr % 4 || !vm::check_addr(addr, vm::page_executable) || g_cfg.core.ppu_decoder == ppu_decoder_type::llvm_legacy) { return false; } @@ -1359,7 +1363,7 @@ extern bool ppu_patch(u32 addr, u32 value) const bool is_exec = vm::check_addr(addr, vm::page_executable); - if (is_exec && g_cfg.core.ppu_decoder == ppu_decoder_type::llvm && !Emu.IsReady()) + if (is_exec && g_cfg.core.ppu_decoder == ppu_decoder_type::llvm_legacy && !Emu.IsReady()) { // TODO: support recompilers ppu_log.fatal("Patch failed at 0x%x: LLVM recompiler is used.", addr); @@ -1648,7 +1652,7 @@ void ppu_thread::dump_regs(std::string& ret, std::any& custom_data) const fmt::append(ret, "LR: 0x%llx\n", lr); fmt::append(ret, "CTR: 0x%llx\n", ctr); fmt::append(ret, "VRSAVE: 0x%08x\n", vrsave); - fmt::append(ret, "XER: [CA=%u | OV=%u | SO=%u | CNT=%u]\n", xer.ca, xer.ov, xer.so, xer.cnt); + fmt::append(ret, "XER: [CA=%u | OV=%u | SO=%u | CNT=%u]\n", xer_ca, xer_ov, xer_so, xer_cnt); fmt::append(ret, "VSCR: [SAT=%u | NJ=%u]\n", sat, nj); fmt::append(ret, "FPSCR: [FL=%u | FG=%u | FE=%u | FU=%u]\n", fpscr.fl, fpscr.fg, fpscr.fe, fpscr.fu); @@ -2441,9 +2445,10 @@ void ppu_thread::cpu_wait(bs_t old) state.wait(old); } +// static_assert(offsetof(ppu_thread, gpr[0]) == 24); void ppu_thread::exec_task() { - if (g_cfg.core.ppu_decoder != ppu_decoder_type::_static) + if (g_cfg.core.ppu_decoder == ppu_decoder_type::llvm_legacy) { // HVContext push to allow recursion. This happens with guest callback invocations. 
const auto old_hv_ctx = hv_ctx; @@ -2464,9 +2469,28 @@ void ppu_thread::exec_task() return; } - const auto cache = vm::g_exec_addr; const auto mem_ = vm::g_base_addr; + if (g_cfg.core.ppu_decoder == ppu_decoder_type::interpreter) + { + static PPUInterpreter interpreter; + + while (true) + { + if (test_stopped()) [[unlikely]] + { + return; + } + + std::uint32_t inst = *reinterpret_cast*>(mem_ + std::uint64_t{cia}); + interpreter.interpret(*this, inst); + } + + return; + } + + const auto cache = vm::g_exec_addr; + while (true) { if (test_stopped()) [[unlikely]] @@ -2556,7 +2580,7 @@ void ppu_thread::serialize_common(utils::serial& ar) { [[maybe_unused]] const s32 version = GET_OR_USE_SERIALIZATION_VERSION(ar.is_writing(), ppu); - ar(gpr, fpr, cr, fpscr.bits, lr, ctr, vrsave, cia, xer, sat, nj, prio.raw().all); + // ar(gpr, fpr, cr, fpscr.bits, lr, ctr, vrsave, cia, xer, sat, nj, prio.raw().all); if (cia % 4 || (cia >> 28) >= 0xCu) { @@ -3309,7 +3333,7 @@ const auto ppu_stcx_accurate_tx = build_function_asm(cpu_flag::pause)); + c.bt(x86::dword_ptr(args[2], OFFSET_OF(ppu_thread, state) - OFFSET_OF(ppu_thread, rdata)), static_cast(cpu_flag::pause)); c.jc(fall); c.xbegin(tx1); @@ -3410,7 +3434,7 @@ const auto ppu_stcx_accurate_tx = build_function_asm& info, bool force_mem_release } } - if (g_cfg.core.ppu_decoder != ppu_decoder_type::llvm) + if (g_cfg.core.ppu_decoder != ppu_decoder_type::llvm_legacy) { return; } @@ -4034,7 +4058,7 @@ extern void ppu_finalize(const ppu_module& info, bool force_mem_release extern void ppu_precompile(std::vector& dir_queue, std::vector*>* loaded_modules) { - if (g_cfg.core.ppu_decoder != ppu_decoder_type::llvm) + if (g_cfg.core.ppu_decoder != ppu_decoder_type::llvm_legacy) { return; } @@ -4744,7 +4768,7 @@ extern void ppu_initialize() bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_size, concurent_memory_limit& memory_limit) { - if (g_cfg.core.ppu_decoder != ppu_decoder_type::llvm) + if (g_cfg.core.ppu_decoder != ppu_decoder_type::llvm_legacy) { if (check_only || vm::base(info.segs[0].addr) != info.segs[0].ptr) { @@ -5106,7 +5130,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_s c.add(x86::edx, seg0); c.mov(x86::rax, x86::qword_ptr(reinterpret_cast(&vm::g_exec_addr))); - c.mov(x86::dword_ptr(x86::rbp, ::offset32(&ppu_thread::cia)), x86::edx); + c.mov(x86::dword_ptr(x86::rbp, OFFSET_OF(ppu_thread, cia)), x86::edx); c.mov(x86::rax, x86::qword_ptr(x86::rax, x86::rdx, 1, 0)); // Load call target c.mov(x86::rdx, x86::rax); @@ -5137,7 +5161,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_s code_size_until_jump = buf_end - buf_start; // Load offset value - c.mov(cia_addr_reg, static_cast(::offset32(&ppu_thread::cia))); + c.mov(cia_addr_reg, static_cast(OFFSET_OF(ppu_thread, cia))); // Update CIA c.str(pc.w(), arm::Mem(ppu_t_base, cia_addr_reg)); diff --git a/rpcs3/Emu/Cell/PPUThread.h b/rpcs3/Emu/Cell/PPUThread.h index 83c30a7b1..341a89dd9 100644 --- a/rpcs3/Emu/Cell/PPUThread.h +++ b/rpcs3/Emu/Cell/PPUThread.h @@ -3,6 +3,7 @@ #include "../CPU/CPUThread.h" #include "../CPU/Hypervisor.h" #include "../Memory/vm_ptr.h" +#include "rx/cpu/cell/ppu/PPUContext.hpp" #include "util/lockless.h" #include "util/BitField.h" @@ -134,7 +135,7 @@ enum class ppu_debugger_mode : u32 max_mode, }; -class ppu_thread : public cpu_thread +class ppu_thread : public cpu_thread, public PPUContext { public: static const u32 id_base = 0x01000000; // TODO (used to determine thread type) @@ -165,107 +166,6 @@ public: using 
cpu_thread::operator=; - u64 gpr[32] = {}; // General-Purpose Registers - f64 fpr[32] = {}; // Floating Point Registers - v128 vr[32] = {}; // Vector Registers - - union alignas(16) cr_bits - { - u8 bits[32]; - u32 fields[8]; - - u8& operator[](usz i) - { - return bits[i]; - } - - // Pack CR bits - u32 pack() const - { - u32 result{}; - - for (u32 bit : bits) - { - result <<= 1; - result |= bit; - } - - return result; - } - - // Unpack CR bits - void unpack(u32 value) - { - for (u8& b : bits) - { - b = !!(value & (1u << 31)); - value <<= 1; - } - } - }; - - cr_bits cr{}; // Condition Registers (unpacked) - - // Floating-Point Status and Control Register (unpacked) - union - { - struct - { - // TODO - bool _start[16]; - bool fl; // FPCC.FL - bool fg; // FPCC.FG - bool fe; // FPCC.FE - bool fu; // FPCC.FU - bool _end[12]; - }; - - u32 fields[8]; - cr_bits bits; - } fpscr{}; - - u64 lr{}; // Link Register - u64 ctr{}; // Counter Register - u32 vrsave{0xffffffff}; // VR Save Register - u32 cia{}; // Current Instruction Address - - // Fixed-Point Exception Register (abstract representation) - struct - { - ENABLE_BITWISE_SERIALIZATION; - - bool so{}; // Summary Overflow - bool ov{}; // Overflow - bool ca{}; // Carry - u8 cnt{}; // 0..6 - } xer; - - /* - Non-Java. A mode control bit that determines whether vector floating-point operations will be performed - in a Java-IEEE-C9X-compliant mode or a possibly faster non-Java/non-IEEE mode. - 0 The Java-IEEE-C9X-compliant mode is selected. Denormalized values are handled as specified - by Java, IEEE, and C9X standard. - 1 The non-Java/non-IEEE-compliant mode is selected. If an element in a source vector register - contains a denormalized value, the value '0' is used instead. If an instruction causes an underflow - exception, the corresponding element in the target vr is cleared to '0'. In both cases, the '0' - has the same sign as the denormalized or underflowing value. 
- */ - bool nj = true; - - // Sticky saturation bit - v128 sat{}; - - // Optimization: precomputed java-mode mask for handling denormals - u32 jm_mask = 0x7f80'0000; - - u32 raddr{0}; // Reservation addr - u64 rtime{0}; - alignas(64) std::byte rdata[128]{}; // Reservation data - bool use_full_rdata{}; - u32 res_cached{0}; // Reservation "cached" addresss - u32 res_notify{0}; - u64 res_notify_time{0}; - union ppu_prio_t { u64 all; diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp index 46d28c14e..819498292 100644 --- a/rpcs3/Emu/Cell/PPUTranslator.cpp +++ b/rpcs3/Emu/Cell/PPUTranslator.cpp @@ -60,7 +60,7 @@ PPUTranslator::PPUTranslator(LLVMContext& context, Module* _module, const ppu_mo { .debug_info = false, // Set to "true" to insert debug frames on x27 .use_stack_frames = false, // We don't need this since the PPU GW allocates global scratch on the stack - .hypervisor_context_offset = ::offset32(&ppu_thread::hv_ctx), + .hypervisor_context_offset = OFFSET_OF(ppu_thread, hv_ctx), .exclusion_callback = {}, // Unused, we don't have special exclusion functions on PPU .base_register_lookup = base_reg_lookup, .faux_function_list = std::move(faux_functions_list)}; @@ -76,8 +76,8 @@ PPUTranslator::PPUTranslator(LLVMContext& context, Module* _module, const ppu_mo reset_transforms(); // Thread context struct (TODO: safer member access) - const u32 off0 = offset32(&ppu_thread::state); - const u32 off1 = offset32(&ppu_thread::gpr); + const u32 off0 = OFFSET_OF(ppu_thread, state); + const u32 off1 = OFFSET_OF(ppu_thread, gpr); std::vector thread_struct; thread_struct.emplace_back(ArrayType::get(GetType(), off0)); thread_struct.emplace_back(GetType()); // state diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp index 94397eb76..b57a07b53 100644 --- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp @@ -16,11 +16,11 @@ #include #include -#define SPU_OFF_128(x, ...) asmjit::x86::oword_ptr(*cpu, offset32(&spu_thread::x, ##__VA_ARGS__)) -#define SPU_OFF_64(x, ...) asmjit::x86::qword_ptr(*cpu, offset32(&spu_thread::x, ##__VA_ARGS__)) -#define SPU_OFF_32(x, ...) asmjit::x86::dword_ptr(*cpu, offset32(&spu_thread::x, ##__VA_ARGS__)) -#define SPU_OFF_16(x, ...) asmjit::x86::word_ptr(*cpu, offset32(&spu_thread::x, ##__VA_ARGS__)) -#define SPU_OFF_8(x, ...) 
asmjit::x86::byte_ptr(*cpu, offset32(&spu_thread::x, ##__VA_ARGS__)) +#define SPU_OFF_128(x) asmjit::x86::oword_ptr(*cpu, OFFSET_OF(spu_thread, x)) +#define SPU_OFF_64(x) asmjit::x86::qword_ptr(*cpu, OFFSET_OF(spu_thread, x)) +#define SPU_OFF_32(x) asmjit::x86::dword_ptr(*cpu, OFFSET_OF(spu_thread, x)) +#define SPU_OFF_16(x) asmjit::x86::word_ptr(*cpu, OFFSET_OF(spu_thread, x)) +#define SPU_OFF_8(x) asmjit::x86::byte_ptr(*cpu, OFFSET_OF(spu_thread, x)) const spu_decoder s_spu_decoder; @@ -945,9 +945,9 @@ spu_recompiler::XmmLink spu_recompiler::XmmGet(s8 reg, XmmType type) // get xmm switch (type) { - case XmmType::Int: c->movdqa(result, SPU_OFF_128(gpr, reg)); break; - case XmmType::Float: c->movaps(result, SPU_OFF_128(gpr, reg)); break; - case XmmType::Double: c->movapd(result, SPU_OFF_128(gpr, reg)); break; + case XmmType::Int: c->movdqa(result, SPU_OFF_128(gpr[reg])); break; + case XmmType::Float: c->movaps(result, SPU_OFF_128(gpr[reg])); break; + case XmmType::Double: c->movapd(result, SPU_OFF_128(gpr[reg])); break; default: fmt::throw_exception("Invalid XmmType"); } @@ -1117,9 +1117,9 @@ void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt, bool ret) { // Get stack pointer, try to use native return address (check SPU return address) Label fail = c->newLabel(); - c->mov(qw1->r32(), SPU_OFF_32(gpr, 1, &v128::_u32, 3)); + c->mov(qw1->r32(), SPU_OFF_32(gpr[1]._u32[3])); c->and_(qw1->r32(), 0x3fff0); - c->lea(*qw1, x86::qword_ptr(*cpu, *qw1, 0, ::offset32(&spu_thread::stack_mirror))); + c->lea(*qw1, x86::qword_ptr(*cpu, *qw1, 0, OFFSET_OF(spu_thread, stack_mirror))); c->cmp(x86::dword_ptr(*qw1, 8), *addr); c->jne(fail); c->mov(pc0->r32(), x86::dword_ptr(*qw1, 12)); @@ -1179,9 +1179,9 @@ void spu_recompiler::branch_set_link(u32 target) Label ret = c->newLabel(); // Get stack pointer, write native and SPU return addresses into the stack mirror - c->mov(qw1->r32(), SPU_OFF_32(gpr, 1, &v128::_u32, 3)); + c->mov(qw1->r32(), SPU_OFF_32(gpr[1]._u32[3])); c->and_(qw1->r32(), 0x3fff0); - c->lea(*qw1, x86::qword_ptr(*cpu, *qw1, 0, ::offset32(&spu_thread::stack_mirror))); + c->lea(*qw1, x86::qword_ptr(*cpu, *qw1, 0, OFFSET_OF(spu_thread, stack_mirror))); c->lea(x86::r10, x86::qword_ptr(ret)); c->mov(x86::qword_ptr(*qw1, 0), x86::r10); c->lea(x86::r10, get_pc(target)); @@ -1194,10 +1194,10 @@ void spu_recompiler::branch_set_link(u32 target) // Clear return info after use c->align(AlignMode::kCode, 16); c->bind(ret); - c->mov(qw1->r32(), SPU_OFF_32(gpr, 1, &v128::_u32, 3)); + c->mov(qw1->r32(), SPU_OFF_32(gpr[1]._u32[3])); c->and_(qw1->r32(), 0x3fff0); c->pcmpeqd(x86::xmm0, x86::xmm0); - c->movdqa(x86::dqword_ptr(*cpu, *qw1, 0, ::offset32(&spu_thread::stack_mirror)), x86::xmm0); + c->movdqa(x86::dqword_ptr(*cpu, *qw1, 0, OFFSET_OF(spu_thread, stack_mirror)), x86::xmm0); // Set block hash for profiling (if enabled) if (g_cfg.core.spu_prof) @@ -1319,7 +1319,7 @@ void spu_recompiler::MFSPR(spu_opcode_t op) // Check SPUInterpreter for notes. 
const XmmLink& vr = XmmAlloc(); c->pxor(vr, vr); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); } static u32 spu_rdch(spu_thread* _spu, u32 ch) @@ -1383,7 +1383,7 @@ void spu_recompiler::RDCH(spu_opcode_t op) c->bind(ret); c->movd(x86::xmm0, *addr); c->pslldq(x86::xmm0, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), x86::xmm0); + c->movdqa(SPU_OFF_128(gpr[op.rt]), x86::xmm0); }; switch (op.ra) @@ -1393,7 +1393,7 @@ void spu_recompiler::RDCH(spu_opcode_t op) const XmmLink& vr = XmmAlloc(); c->movd(vr, SPU_OFF_32(srr0)); c->pslldq(vr, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); return; } case SPU_RdInMbox: @@ -1411,7 +1411,7 @@ void spu_recompiler::RDCH(spu_opcode_t op) const XmmLink& vr = XmmAlloc(); c->movd(vr, SPU_OFF_32(ch_tag_mask)); c->pslldq(vr, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); return; } case SPU_RdSigNotify1: @@ -1467,7 +1467,7 @@ void spu_recompiler::RDCH(spu_opcode_t op) c->lea(addr->r64(), get_pc(m_pos)); c->and_(*addr, 0x3fffc); c->mov(SPU_OFF_32(pc), *addr); - c->lea(*arg1, SPU_OFF_128(gpr, op.rt)); + c->lea(*arg1, SPU_OFF_128(gpr[op.rt])); c->mov(*arg0, *cpu); c->call(g_cfg.core.spu_loop_detection ? +sub1 : +sub2); return; @@ -1478,7 +1478,7 @@ void spu_recompiler::RDCH(spu_opcode_t op) c->movq(vr, SPU_OFF_64(ch_events)); c->psrldq(vr, 4); c->pslldq(vr, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); return; } case SPU_RdEventStat: @@ -1495,7 +1495,7 @@ void spu_recompiler::RDCH(spu_opcode_t op) c->or_(addr->r32(), arg1->r32()); c->movd(vr, *addr); c->pslldq(vr, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); return; } default: break; @@ -1509,7 +1509,7 @@ void spu_recompiler::RDCH(spu_opcode_t op) c->call(spu_rdch); c->movd(x86::xmm0, *addr); c->pslldq(x86::xmm0, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), x86::xmm0); + c->movdqa(SPU_OFF_128(gpr[op.rt]), x86::xmm0); } static u32 spu_rchcnt(spu_thread* _spu, u32 ch) @@ -1530,7 +1530,7 @@ void spu_recompiler::RCHCNT(spu_opcode_t op) if (inv) c->pxor(vr, XmmConst(v128::from32p(1))); c->pslldq(vr, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); }; switch (op.ra) @@ -1549,7 +1549,7 @@ void spu_recompiler::RCHCNT(spu_opcode_t op) c->mov(addr->r32(), 1); c->movd(vr, addr->r32()); c->pslldq(vr, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); return; } @@ -1561,7 +1561,7 @@ void spu_recompiler::RCHCNT(spu_opcode_t op) c->movd(v1, SPU_OFF_32(mfc_size)); c->psubd(vr, v1); c->pslldq(vr, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); return; } @@ -1571,7 +1571,7 @@ void spu_recompiler::RCHCNT(spu_opcode_t op) c->movdqa(vr, SPU_OFF_128(ch_in_mbox)); c->pslldq(vr, 14); c->psrldq(vr, 3); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); return; } // Channels with a constant count of 1: @@ -1599,7 +1599,7 @@ void spu_recompiler::RCHCNT(spu_opcode_t op) c->mov(addr->r32(), 1); c->movd(vr, addr->r32()); c->pslldq(vr, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); return; } case SPU_RdEventStat: @@ -1622,22 +1622,22 @@ void spu_recompiler::RCHCNT(spu_opcode_t op) // Use result from the third argument c->movd(x86::xmm0, *addr); c->pslldq(x86::xmm0, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), x86::xmm0); + c->movdqa(SPU_OFF_128(gpr[op.rt]), 
x86::xmm0); } void spu_recompiler::SF(spu_opcode_t op) { // sub from const XmmLink& vb = XmmGet(op.rb, XmmType::Int); - c->psubd(vb, SPU_OFF_128(gpr, op.ra)); - c->movdqa(SPU_OFF_128(gpr, op.rt), vb); + c->psubd(vb, SPU_OFF_128(gpr[op.ra])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vb); } void spu_recompiler::OR(spu_opcode_t op) { const XmmLink& vb = XmmGet(op.rb, XmmType::Int); - c->por(vb, SPU_OFF_128(gpr, op.ra)); - c->movdqa(SPU_OFF_128(gpr, op.rt), vb); + c->por(vb, SPU_OFF_128(gpr[op.ra])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vb); } void spu_recompiler::BG(spu_opcode_t op) @@ -1652,24 +1652,24 @@ void spu_recompiler::BG(spu_opcode_t op) c->vpsubd(vi, vb, va); c->vpternlogd(va, vb, vi, 0x4d /* B?nandAC:norAC */); c->psrld(va, 31); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); return; } c->movdqa(vi, XmmConst(v128::from32p(0x80000000))); c->pxor(va, vi); - c->pxor(vi, SPU_OFF_128(gpr, op.rb)); + c->pxor(vi, SPU_OFF_128(gpr[op.rb])); c->pcmpgtd(va, vi); c->paddd(va, XmmConst(v128::from32p(1))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::SFH(spu_opcode_t op) { // sub from (halfword) const XmmLink& vb = XmmGet(op.rb, XmmType::Int); - c->psubw(vb, SPU_OFF_128(gpr, op.ra)); - c->movdqa(SPU_OFF_128(gpr, op.rt), vb); + c->psubw(vb, SPU_OFF_128(gpr[op.ra])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vb); } void spu_recompiler::NOR(spu_opcode_t op) @@ -1678,14 +1678,14 @@ void spu_recompiler::NOR(spu_opcode_t op) if (utils::has_avx512()) { - c->vpternlogd(va, va, SPU_OFF_128(gpr, op.rb), 0x11 /* norCB */); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->vpternlogd(va, va, SPU_OFF_128(gpr[op.rb]), 0x11 /* norCB */); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); return; } - c->por(va, SPU_OFF_128(gpr, op.rb)); + c->por(va, SPU_OFF_128(gpr[op.rb])); c->pxor(va, XmmConst(v128::from32p(0xffffffff))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ABSDB(spu_opcode_t op) @@ -1697,7 +1697,7 @@ void spu_recompiler::ABSDB(spu_opcode_t op) c->pmaxub(va, vb); c->pminub(vb, vm); c->psubb(va, vb); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ROT(spu_opcode_t op) @@ -1708,7 +1708,7 @@ void spu_recompiler::ROT(spu_opcode_t op) const XmmLink& vb = XmmGet(op.rb, XmmType::Int); const XmmLink& vt = XmmAlloc(); c->vprolvd(vt, va, vb); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } @@ -1725,7 +1725,7 @@ void spu_recompiler::ROT(spu_opcode_t op) c->pandn(vb, v4); c->vpsrlvd(va, va, vb); c->por(vt, va); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } @@ -1735,16 +1735,16 @@ void spu_recompiler::ROT(spu_opcode_t op) const XmmLink& vb = XmmGet(op.rb, XmmType::Int); const XmmLink& vt = XmmAlloc(); c->vprotd(vt, va, vb); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } for (u32 i = 0; i < 4; i++) // unrolled loop { - c->mov(qw0->r32(), SPU_OFF_32(gpr, op.ra, &v128::_u32, i)); - c->mov(asmjit::x86::ecx, SPU_OFF_32(gpr, op.rb, &v128::_u32, i)); + c->mov(qw0->r32(), SPU_OFF_32(gpr[op.ra]._u32[i])); + c->mov(asmjit::x86::ecx, SPU_OFF_32(gpr[op.rb]._u32[i])); c->rol(qw0->r32(), asmjit::x86::cl); - c->mov(SPU_OFF_32(gpr, op.rt, &v128::_u32, i), qw0->r32()); + c->mov(SPU_OFF_32(gpr[op.rt]._u32[i]), qw0->r32()); } } @@ -1758,7 +1758,7 @@ void spu_recompiler::ROTM(spu_opcode_t op) 
c->psubd(vb, XmmConst(v128::from32p(1))); c->pandn(vb, XmmConst(v128::from32p(0x3f))); c->vpsrlvd(vt, va, vb); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } @@ -1774,17 +1774,17 @@ void spu_recompiler::ROTM(spu_opcode_t op) c->pcmpgtd(vb, XmmConst(v128::from32p(31))); c->vpshld(vt, va, vt); c->vpandn(vt, vb, vt); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } for (u32 i = 0; i < 4; i++) // unrolled loop { - c->mov(qw0->r32(), SPU_OFF_32(gpr, op.ra, &v128::_u32, i)); - c->mov(asmjit::x86::ecx, SPU_OFF_32(gpr, op.rb, &v128::_u32, i)); + c->mov(qw0->r32(), SPU_OFF_32(gpr[op.ra]._u32[i])); + c->mov(asmjit::x86::ecx, SPU_OFF_32(gpr[op.rb]._u32[i])); c->neg(asmjit::x86::ecx); c->shr(*qw0, asmjit::x86::cl); - c->mov(SPU_OFF_32(gpr, op.rt, &v128::_u32, i), qw0->r32()); + c->mov(SPU_OFF_32(gpr[op.rt]._u32[i]), qw0->r32()); } } @@ -1798,7 +1798,7 @@ void spu_recompiler::ROTMA(spu_opcode_t op) c->psubd(vb, XmmConst(v128::from32p(1))); c->pandn(vb, XmmConst(v128::from32p(0x3f))); c->vpsravd(vt, va, vb); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } @@ -1813,17 +1813,17 @@ void spu_recompiler::ROTMA(spu_opcode_t op) c->pminud(vb, XmmConst(v128::from32p(31))); c->psubd(vt, vb); c->vpshad(vt, va, vt); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } for (u32 i = 0; i < 4; i++) // unrolled loop { - c->movsxd(*qw0, SPU_OFF_32(gpr, op.ra, &v128::_u32, i)); - c->mov(asmjit::x86::ecx, SPU_OFF_32(gpr, op.rb, &v128::_u32, i)); + c->movsxd(*qw0, SPU_OFF_32(gpr[op.ra]._u32[i])); + c->mov(asmjit::x86::ecx, SPU_OFF_32(gpr[op.rb]._u32[i])); c->neg(asmjit::x86::ecx); c->sar(*qw0, asmjit::x86::cl); - c->mov(SPU_OFF_32(gpr, op.rt, &v128::_u32, i), qw0->r32()); + c->mov(SPU_OFF_32(gpr[op.rt]._u32[i]), qw0->r32()); } } @@ -1836,7 +1836,7 @@ void spu_recompiler::SHL(spu_opcode_t op) const XmmLink& vt = XmmAlloc(); c->pand(vb, XmmConst(v128::from32p(0x3f))); c->vpsllvd(vt, va, vb); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } @@ -1849,16 +1849,16 @@ void spu_recompiler::SHL(spu_opcode_t op) c->vpcmpgtd(vt, vb, XmmConst(v128::from32p(31))); c->vpshld(vb, va, vb); c->pandn(vt, vb); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } for (u32 i = 0; i < 4; i++) // unrolled loop { - c->mov(qw0->r32(), SPU_OFF_32(gpr, op.ra, &v128::_u32, i)); - c->mov(asmjit::x86::ecx, SPU_OFF_32(gpr, op.rb, &v128::_u32, i)); + c->mov(qw0->r32(), SPU_OFF_32(gpr[op.ra]._u32[i])); + c->mov(asmjit::x86::ecx, SPU_OFF_32(gpr[op.rb]._u32[i])); c->shl(*qw0, asmjit::x86::cl); - c->mov(SPU_OFF_32(gpr, op.rt, &v128::_u32, i), qw0->r32()); + c->mov(SPU_OFF_32(gpr[op.rt]._u32[i]), qw0->r32()); } } @@ -1878,7 +1878,7 @@ void spu_recompiler::ROTH(spu_opcode_t op) // nf c->vprolvd(va, va, v4); c->vprolvd(vb, vt, vb); c->vpblendw(vt, vb, va, 0xaa); - c->vmovdqa(SPU_OFF_128(gpr, op.rt), vt); + c->vmovdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } @@ -1888,16 +1888,16 @@ void spu_recompiler::ROTH(spu_opcode_t op) // nf const XmmLink& vb = XmmGet(op.rb, XmmType::Int); const XmmLink& vt = XmmAlloc(); c->vprotw(vt, va, vb); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } for (u32 i = 0; i < 8; i++) // unrolled loop { - c->movzx(qw0->r32(), SPU_OFF_16(gpr, op.ra, &v128::_u16, i)); - c->movzx(asmjit::x86::ecx, SPU_OFF_16(gpr, op.rb, &v128::_u16, i)); 
+ c->movzx(qw0->r32(), SPU_OFF_16(gpr[op.ra]._u16[i])); + c->movzx(asmjit::x86::ecx, SPU_OFF_16(gpr[op.rb]._u16[i])); c->rol(qw0->r16(), asmjit::x86::cl); - c->mov(SPU_OFF_16(gpr, op.rt, &v128::_u16, i), qw0->r16()); + c->mov(SPU_OFF_16(gpr[op.rt]._u16[i]), qw0->r16()); } } @@ -1911,7 +1911,7 @@ void spu_recompiler::ROTHM(spu_opcode_t op) c->psubw(vb, XmmConst(v128::from16p(1))); c->pandn(vb, XmmConst(v128::from16p(0x1f))); c->vpsrlvw(vt, va, vb); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } @@ -1931,7 +1931,7 @@ void spu_recompiler::ROTHM(spu_opcode_t op) c->vpsrlvd(va, va, v4); c->vpsrlvd(vb, vb, v5); c->vpblendw(vt, vb, va, 0xaa); // can use vpblendvb with 0xffff0000 mask (vt) - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } @@ -1947,17 +1947,17 @@ void spu_recompiler::ROTHM(spu_opcode_t op) c->pcmpgtw(vb, XmmConst(v128::from16p(15))); c->vpshlw(vt, va, vt); c->vpandn(vt, vb, vt); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } for (u32 i = 0; i < 8; i++) // unrolled loop { - c->movzx(qw0->r32(), SPU_OFF_16(gpr, op.ra, &v128::_u16, i)); - c->movzx(asmjit::x86::ecx, SPU_OFF_16(gpr, op.rb, &v128::_u16, i)); + c->movzx(qw0->r32(), SPU_OFF_16(gpr[op.ra]._u16[i])); + c->movzx(asmjit::x86::ecx, SPU_OFF_16(gpr[op.rb]._u16[i])); c->neg(asmjit::x86::ecx); c->shr(qw0->r32(), asmjit::x86::cl); - c->mov(SPU_OFF_16(gpr, op.rt, &v128::_u16, i), qw0->r16()); + c->mov(SPU_OFF_16(gpr[op.rt]._u16[i]), qw0->r16()); } } @@ -1971,7 +1971,7 @@ void spu_recompiler::ROTMAH(spu_opcode_t op) c->psubw(vb, XmmConst(v128::from16p(1))); c->pandn(vb, XmmConst(v128::from16p(0x1f))); c->vpsravw(vt, va, vb); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } @@ -1993,7 +1993,7 @@ void spu_recompiler::ROTMAH(spu_opcode_t op) c->vpsravd(va, va, v4); c->vpsravd(vb, vb, v5); c->vpblendw(vt, vb, va, 0xaa); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } @@ -2008,17 +2008,17 @@ void spu_recompiler::ROTMAH(spu_opcode_t op) c->pminuw(vb, XmmConst(v128::from16p(15))); c->psubw(vt, vb); c->vpshaw(vt, va, vt); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } for (u32 i = 0; i < 8; i++) // unrolled loop { - c->movsx(qw0->r32(), SPU_OFF_16(gpr, op.ra, &v128::_u16, i)); - c->movzx(asmjit::x86::ecx, SPU_OFF_16(gpr, op.rb, &v128::_u16, i)); + c->movsx(qw0->r32(), SPU_OFF_16(gpr[op.ra]._u16[i])); + c->movzx(asmjit::x86::ecx, SPU_OFF_16(gpr[op.rb]._u16[i])); c->neg(asmjit::x86::ecx); c->sar(qw0->r32(), asmjit::x86::cl); - c->mov(SPU_OFF_16(gpr, op.rt, &v128::_u16, i), qw0->r16()); + c->mov(SPU_OFF_16(gpr[op.rt]._u16[i]), qw0->r16()); } } @@ -2031,7 +2031,7 @@ void spu_recompiler::SHLH(spu_opcode_t op) const XmmLink& vt = XmmAlloc(); c->pand(vb, XmmConst(v128::from16p(0x1f))); c->vpsllvw(vt, va, vb); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } @@ -2050,7 +2050,7 @@ void spu_recompiler::SHLH(spu_opcode_t op) c->vpsllvd(va, va, v5); c->vpsllvd(vb, vb, v4); c->vpblendw(vt, vb, va, 0x55); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } @@ -2063,16 +2063,16 @@ void spu_recompiler::SHLH(spu_opcode_t op) c->vpcmpgtw(vt, vb, XmmConst(v128::from16p(15))); c->vpshlw(vb, va, vb); c->pandn(vt, vb); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); 
return; } for (u32 i = 0; i < 8; i++) // unrolled loop { - c->movzx(qw0->r32(), SPU_OFF_16(gpr, op.ra, &v128::_u16, i)); - c->movzx(asmjit::x86::ecx, SPU_OFF_16(gpr, op.rb, &v128::_u16, i)); + c->movzx(qw0->r32(), SPU_OFF_16(gpr[op.ra]._u16[i])); + c->movzx(asmjit::x86::ecx, SPU_OFF_16(gpr[op.rb]._u16[i])); c->shl(qw0->r32(), asmjit::x86::cl); - c->mov(SPU_OFF_16(gpr, op.rt, &v128::_u16, i), qw0->r16()); + c->mov(SPU_OFF_16(gpr[op.rt]._u16[i]), qw0->r16()); } } @@ -2085,7 +2085,7 @@ void spu_recompiler::ROTI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->vprold(va, va, s); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); return; } @@ -2093,7 +2093,7 @@ void spu_recompiler::ROTI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->vprotd(va, va, s); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); return; } @@ -2103,7 +2103,7 @@ void spu_recompiler::ROTI(spu_opcode_t op) c->pslld(va, s); c->psrld(v1, 32 - s); c->por(va, v1); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ROTMI(spu_opcode_t op) @@ -2112,7 +2112,7 @@ void spu_recompiler::ROTMI(spu_opcode_t op) const int s = (0 - op.i7) & 0x3f; const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->psrld(va, s); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ROTMAI(spu_opcode_t op) @@ -2121,7 +2121,7 @@ void spu_recompiler::ROTMAI(spu_opcode_t op) const int s = (0 - op.i7) & 0x3f; const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->psrad(va, s); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::SHLI(spu_opcode_t op) @@ -2130,7 +2130,7 @@ void spu_recompiler::SHLI(spu_opcode_t op) const int s = op.i7 & 0x3f; const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pslld(va, s); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ROTHI(spu_opcode_t op) @@ -2143,7 +2143,7 @@ void spu_recompiler::ROTHI(spu_opcode_t op) c->psllw(va, s); c->psrlw(v1, 16 - s); c->por(va, v1); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ROTHMI(spu_opcode_t op) @@ -2152,7 +2152,7 @@ void spu_recompiler::ROTHMI(spu_opcode_t op) const int s = (0 - op.i7) & 0x1f; const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->psrlw(va, s); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ROTMAHI(spu_opcode_t op) @@ -2161,7 +2161,7 @@ void spu_recompiler::ROTMAHI(spu_opcode_t op) const int s = (0 - op.i7) & 0x1f; const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->psraw(va, s); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::SHLHI(spu_opcode_t op) @@ -2170,22 +2170,22 @@ void spu_recompiler::SHLHI(spu_opcode_t op) const int s = op.i7 & 0x1f; const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->psllw(va, s); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::A(spu_opcode_t op) { const XmmLink& vb = XmmGet(op.rb, XmmType::Int); - c->paddd(vb, SPU_OFF_128(gpr, op.ra)); - c->movdqa(SPU_OFF_128(gpr, op.rt), vb); + c->paddd(vb, SPU_OFF_128(gpr[op.ra])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vb); } void spu_recompiler::AND(spu_opcode_t op) { // and const XmmLink& vb = XmmGet(op.rb, XmmType::Int); - c->pand(vb, SPU_OFF_128(gpr, 
op.ra)); - c->movdqa(SPU_OFF_128(gpr, op.rt), vb); + c->pand(vb, SPU_OFF_128(gpr[op.ra])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vb); } void spu_recompiler::CG(spu_opcode_t op) @@ -2199,7 +2199,7 @@ void spu_recompiler::CG(spu_opcode_t op) c->vpaddd(vi, vb, va); c->vpternlogd(vi, va, vb, 0x8e /* A?andBC:orBC */); c->psrld(vi, 31); - c->movdqa(SPU_OFF_128(gpr, op.rt), vi); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vi); return; } @@ -2209,14 +2209,14 @@ void spu_recompiler::CG(spu_opcode_t op) c->pxor(vb, vi); c->pcmpgtd(va, vb); c->psrld(va, 31); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::AH(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->paddw(va, SPU_OFF_128(gpr, op.rb)); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->paddw(va, SPU_OFF_128(gpr[op.rb])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::NAND(spu_opcode_t op) @@ -2226,21 +2226,21 @@ void spu_recompiler::NAND(spu_opcode_t op) if (utils::has_avx512()) { - c->vpternlogd(va, va, SPU_OFF_128(gpr, op.rb), 0x77 /* nandCB */); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->vpternlogd(va, va, SPU_OFF_128(gpr[op.rb]), 0x77 /* nandCB */); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); return; } - c->pand(va, SPU_OFF_128(gpr, op.rb)); + c->pand(va, SPU_OFF_128(gpr[op.rb])); c->pxor(va, XmmConst(v128::from32p(0xffffffff))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::AVGB(spu_opcode_t op) { const XmmLink& vb = XmmGet(op.rb, XmmType::Int); - c->pavgb(vb, SPU_OFF_128(gpr, op.ra)); - c->movdqa(SPU_OFF_128(gpr, op.rt), vb); + c->pavgb(vb, SPU_OFF_128(gpr[op.ra])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vb); } void spu_recompiler::MTSPR(spu_opcode_t) @@ -2284,7 +2284,7 @@ void spu_recompiler::WRCH(spu_opcode_t op) { case SPU_WrSRR0: { - c->mov(*addr, SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.rt]._u32[3])); c->and_(*addr, 0x3fffc); c->mov(SPU_OFF_32(srr0), *addr); return; @@ -2299,7 +2299,7 @@ void spu_recompiler::WRCH(spu_opcode_t op) Label wait = c->newLabel(); Label again = c->newLabel(); Label ret = c->newLabel(); - c->mov(qw0->r32(), SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); + c->mov(qw0->r32(), SPU_OFF_32(gpr[op.rt]._u32[3])); c->mov(addr->r64(), SPU_OFF_64(ch_out_mbox)); c->align(AlignMode::kCode, 16); c->bind(again); @@ -2329,7 +2329,7 @@ void spu_recompiler::WRCH(spu_opcode_t op) { Label upd = c->newLabel(); Label ret = c->newLabel(); - c->mov(qw0->r32(), SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); + c->mov(qw0->r32(), SPU_OFF_32(gpr[op.rt]._u32[3])); c->mov(SPU_OFF_32(ch_tag_mask), qw0->r32()); c->cmp(SPU_OFF_32(ch_tag_upd), MFC_TAG_UPDATE_IMMEDIATE); c->jnz(upd); @@ -2354,7 +2354,7 @@ void spu_recompiler::WRCH(spu_opcode_t op) Label fail = c->newLabel(); Label zero = c->newLabel(); Label ret = c->newLabel(); - c->mov(qw0->r32(), SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); + c->mov(qw0->r32(), SPU_OFF_32(gpr[op.rt]._u32[3])); c->cmp(qw0->r32(), 2); c->ja(fail); @@ -2394,40 +2394,40 @@ void spu_recompiler::WRCH(spu_opcode_t op) } case MFC_LSA: { - c->mov(*addr, SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); - c->mov(SPU_OFF_32(ch_mfc_cmd, &spu_mfc_cmd::lsa), *addr); + c->mov(*addr, SPU_OFF_32(gpr[op.rt]._u32[3])); + c->mov(SPU_OFF_32(ch_mfc_cmd.lsa), *addr); return; } case MFC_EAH: { - c->mov(*addr, SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); - c->mov(SPU_OFF_32(ch_mfc_cmd, &spu_mfc_cmd::eah), *addr); + c->mov(*addr, SPU_OFF_32(gpr[op.rt]._u32[3])); + 
c->mov(SPU_OFF_32(ch_mfc_cmd.eah), *addr); return; } case MFC_EAL: { - c->mov(*addr, SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); - c->mov(SPU_OFF_32(ch_mfc_cmd, &spu_mfc_cmd::eal), *addr); + c->mov(*addr, SPU_OFF_32(gpr[op.rt]._u32[3])); + c->mov(SPU_OFF_32(ch_mfc_cmd.eal), *addr); return; } case MFC_Size: { - c->mov(*addr, SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.rt]._u32[3])); c->and_(*addr, 0x7fff); - c->mov(SPU_OFF_16(ch_mfc_cmd, &spu_mfc_cmd::size), addr->r16()); + c->mov(SPU_OFF_16(ch_mfc_cmd.size), addr->r16()); return; } case MFC_TagID: { - c->mov(*addr, SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.rt]._u32[3])); c->and_(*addr, 0x1f); - c->mov(SPU_OFF_8(ch_mfc_cmd, &spu_mfc_cmd::tag), addr->r8()); + c->mov(SPU_OFF_8(ch_mfc_cmd.tag), addr->r8()); return; } case MFC_Cmd: { - c->mov(*addr, SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); - c->mov(SPU_OFF_8(ch_mfc_cmd, &spu_mfc_cmd::cmd), addr->r8()); + c->mov(*addr, SPU_OFF_32(gpr[op.rt]._u32[3])); + c->mov(SPU_OFF_8(ch_mfc_cmd.cmd), addr->r8()); c->lea(addr->r64(), get_pc(m_pos)); c->and_(*addr, 0x3fffc); c->mov(SPU_OFF_32(pc), *addr); @@ -2452,7 +2452,7 @@ void spu_recompiler::WRCH(spu_opcode_t op) }; Label ret = c->newLabel(); - c->mov(arg1->r32(), SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); + c->mov(arg1->r32(), SPU_OFF_32(gpr[op.rt]._u32[3])); c->and_(arg1->r32(), 0x1f); c->btr(SPU_OFF_32(ch_stall_mask), arg1->r32()); c->jnc(ret); @@ -2471,7 +2471,7 @@ void spu_recompiler::WRCH(spu_opcode_t op) c->mov(*arg0, *cpu); c->call(+sub); - c->mov(qw0->r32(), SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); + c->mov(qw0->r32(), SPU_OFF_32(gpr[op.rt]._u32[3])); c->mov(SPU_OFF_32(ch_dec_value), qw0->r32()); c->mov(SPU_OFF_8(is_dec_frozen), 0); return; @@ -2499,7 +2499,7 @@ void spu_recompiler::WRCH(spu_opcode_t op) c->and_(*addr, 0x3fffc); c->mov(SPU_OFF_32(pc), *addr); c->mov(arg1->r32(), +op.ra); - c->mov(qw0->r32(), SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); + c->mov(qw0->r32(), SPU_OFF_32(gpr[op.rt]._u32[3])); c->mov(*arg0, *cpu); c->call(spu_wrch); } @@ -2507,14 +2507,14 @@ void spu_recompiler::WRCH(spu_opcode_t op) void spu_recompiler::BIZ(spu_opcode_t op) { asmjit::Label branch_label = c->newLabel(); - c->cmp(SPU_OFF_32(gpr, op.rt, &v128::_u32, 3), 0); + c->cmp(SPU_OFF_32(gpr[op.rt]._u32[3]), 0); c->je(branch_label); after.emplace_back([=, this, jt = m_targets[m_pos].size() > 1] { c->align(asmjit::AlignMode::kCode, 16); c->bind(branch_label); - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); c->and_(*addr, 0x3fffc); branch_indirect(op, jt); }); @@ -2523,14 +2523,14 @@ void spu_recompiler::BIZ(spu_opcode_t op) void spu_recompiler::BINZ(spu_opcode_t op) { asmjit::Label branch_label = c->newLabel(); - c->cmp(SPU_OFF_32(gpr, op.rt, &v128::_u32, 3), 0); + c->cmp(SPU_OFF_32(gpr[op.rt]._u32[3]), 0); c->jne(branch_label); after.emplace_back([=, this, jt = m_targets[m_pos].size() > 1] { c->align(asmjit::AlignMode::kCode, 16); c->bind(branch_label); - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); c->and_(*addr, 0x3fffc); branch_indirect(op, jt); }); @@ -2539,14 +2539,14 @@ void spu_recompiler::BINZ(spu_opcode_t op) void spu_recompiler::BIHZ(spu_opcode_t op) { asmjit::Label branch_label = c->newLabel(); - c->cmp(SPU_OFF_16(gpr, op.rt, &v128::_u16, 6), 0); + c->cmp(SPU_OFF_16(gpr[op.rt]._u16[6]), 0); c->je(branch_label); after.emplace_back([=, this, jt = m_targets[m_pos].size() > 1] { 
c->align(asmjit::AlignMode::kCode, 16); c->bind(branch_label); - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); c->and_(*addr, 0x3fffc); branch_indirect(op, jt); }); @@ -2555,14 +2555,14 @@ void spu_recompiler::BIHZ(spu_opcode_t op) void spu_recompiler::BIHNZ(spu_opcode_t op) { asmjit::Label branch_label = c->newLabel(); - c->cmp(SPU_OFF_16(gpr, op.rt, &v128::_u16, 6), 0); + c->cmp(SPU_OFF_16(gpr[op.rt]._u16[6]), 0); c->jne(branch_label); after.emplace_back([=, this, jt = m_targets[m_pos].size() > 1] { c->align(asmjit::AlignMode::kCode, 16); c->bind(branch_label); - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); c->and_(*addr, 0x3fffc); branch_indirect(op, jt); }); @@ -2575,8 +2575,8 @@ void spu_recompiler::STOPD(spu_opcode_t) void spu_recompiler::STQX(spu_opcode_t op) { - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); - c->add(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); + c->add(*addr, SPU_OFF_32(gpr[op.rb]._u32[3])); c->and_(*addr, 0x3fff0); if (utils::has_ssse3()) @@ -2587,8 +2587,8 @@ void spu_recompiler::STQX(spu_opcode_t op) } else { - c->mov(*qw0, SPU_OFF_64(gpr, op.rt, &v128::_u64, 0)); - c->mov(*qw1, SPU_OFF_64(gpr, op.rt, &v128::_u64, 1)); + c->mov(*qw0, SPU_OFF_64(gpr[op.rt]._u64[0])); + c->mov(*qw1, SPU_OFF_64(gpr[op.rt]._u64[1])); c->bswap(*qw0); c->bswap(*qw1); c->mov(asmjit::x86::qword_ptr(*ls, addr->r64(), 0, 0), *qw1); @@ -2612,7 +2612,7 @@ void spu_recompiler::BI(spu_opcode_t op) return; } - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); c->and_(*addr, 0x3fffc); branch_indirect(op, is_jt, !is_jt); m_pos = -1; @@ -2620,14 +2620,14 @@ void spu_recompiler::BI(spu_opcode_t op) void spu_recompiler::BISL(spu_opcode_t op) { - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); c->and_(*addr, 0x3fffc); const XmmLink& vr = XmmAlloc(); c->lea(*qw0, get_pc(m_pos + 4)); c->and_(qw0->r32(), 0x3fffc); c->movd(vr, qw0->r32()); c->pslldq(vr, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); branch_set_link(m_pos + 4); branch_indirect(op, true, false); m_pos = -1; @@ -2647,14 +2647,14 @@ void spu_recompiler::BISLED(spu_opcode_t op) return _spu->get_events(_spu->ch_events.load().mask).count; }; - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); const XmmLink& vr = XmmAlloc(); c->lea(*qw0, get_pc(m_pos + 4)); c->movd(vr, qw0->r32()); c->pand(vr, XmmConst(v128::from32p(0x3fffc))); c->pslldq(vr, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); asmjit::Label branch_label = c->newLabel(); c->mov(*arg0, *cpu); @@ -2682,7 +2682,7 @@ void spu_recompiler::GB(spu_opcode_t op) c->movmskps(*addr, va); c->pxor(va, va); c->pinsrw(va, *addr, 6); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::GBH(spu_opcode_t op) @@ -2693,7 +2693,7 @@ void spu_recompiler::GBH(spu_opcode_t op) c->pmovmskb(*addr, va); c->pxor(va, va); c->pinsrw(va, *addr, 6); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::GBB(spu_opcode_t op) @@ -2703,7 +2703,7 @@ void spu_recompiler::GBB(spu_opcode_t op) c->pmovmskb(*addr, va); c->pxor(va, va); c->pinsrw(va, *addr, 6); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + 
c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::FSM(spu_opcode_t op) @@ -2714,7 +2714,7 @@ void spu_recompiler::FSM(spu_opcode_t op) c->movdqa(vm, XmmConst(v128::from32r(8, 4, 2, 1))); c->pand(va, vm); c->pcmpeqd(va, vm); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::FSMH(spu_opcode_t op) @@ -2726,7 +2726,7 @@ void spu_recompiler::FSMH(spu_opcode_t op) c->movdqa(vm, XmmConst(v128::from64r(0x0080004000200010, 0x0008000400020001))); c->pand(va, vm); c->pcmpeqw(va, vm); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::FSMB(spu_opcode_t op) @@ -2748,7 +2748,7 @@ void spu_recompiler::FSMB(spu_opcode_t op) c->movdqa(vm, XmmConst(v128::from64p(0x8040201008040201))); c->pand(va, vm); c->pcmpeqb(va, vm); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::FREST(spu_opcode_t op) @@ -2791,7 +2791,7 @@ void spu_recompiler::FREST(spu_opcode_t op) c->orps(v_fraction, v_exponent); c->orps(v_sign, v_fraction); - c->movaps(SPU_OFF_128(gpr, op.rt), v_sign); + c->movaps(SPU_OFF_128(gpr[op.rt]), v_sign); } void spu_recompiler::FRSQEST(spu_opcode_t op) @@ -2824,13 +2824,13 @@ void spu_recompiler::FRSQEST(spu_opcode_t op) c->orps(v_fraction, v_exponent); - c->movaps(SPU_OFF_128(gpr, op.rt), v_fraction); + c->movaps(SPU_OFF_128(gpr[op.rt]), v_fraction); } void spu_recompiler::LQX(spu_opcode_t op) { - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); - c->add(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); + c->add(*addr, SPU_OFF_32(gpr[op.rb]._u32[3])); c->and_(*addr, 0x3fff0); if (utils::has_ssse3()) @@ -2838,7 +2838,7 @@ void spu_recompiler::LQX(spu_opcode_t op) const XmmLink& vt = XmmAlloc(); c->movdqa(vt, asmjit::x86::oword_ptr(*ls, addr->r64())); c->pshufb(vt, XmmConst(v128::from32r(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); } else { @@ -2846,8 +2846,8 @@ void spu_recompiler::LQX(spu_opcode_t op) c->mov(*qw1, asmjit::x86::qword_ptr(*ls, addr->r64(), 0, 8)); c->bswap(*qw0); c->bswap(*qw1); - c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw1); - c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw0); + c->mov(SPU_OFF_64(gpr[op.rt]._u64[0]), *qw1); + c->mov(SPU_OFF_64(gpr[op.rt]._u64[1]), *qw0); } } @@ -2860,10 +2860,10 @@ void spu_recompiler::ROTQBYBI(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->mov(*qw0, +g_spu_imm.rldq_pshufb); - c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.rb]._u32[3])); c->and_(*addr, 0xf << 3); c->pshufb(va, asmjit::x86::oword_ptr(*qw0, addr->r64(), 1)); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ROTQMBYBI(spu_opcode_t op) @@ -2875,10 +2875,10 @@ void spu_recompiler::ROTQMBYBI(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->mov(*qw0, +g_spu_imm.srdq_pshufb); - c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.rb]._u32[3])); c->and_(*addr, 0x1f << 3); c->pshufb(va, asmjit::x86::oword_ptr(*qw0, addr->r64(), 1)); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::SHLQBYBI(spu_opcode_t op) @@ -2890,63 +2890,63 @@ void spu_recompiler::SHLQBYBI(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->mov(*qw0, 
+g_spu_imm.sldq_pshufb); - c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.rb]._u32[3])); c->and_(*addr, 0x1f << 3); c->pshufb(va, asmjit::x86::oword_ptr(*qw0, addr->r64(), 1)); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CBX(spu_opcode_t op) { - c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); - c->add(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.rb]._u32[3])); + c->add(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); c->not_(*addr); c->and_(*addr, 0xf); const XmmLink& vr = XmmAlloc(); c->movdqa(vr, XmmConst(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); - c->mov(asmjit::x86::byte_ptr(*cpu, addr->r64(), 0, offset32(&spu_thread::gpr, op.rt)), 0x03); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); + c->mov(asmjit::x86::byte_ptr(*cpu, addr->r64(), 0, OFFSET_OF(spu_thread, gpr[op.rt])), 0x03); } void spu_recompiler::CHX(spu_opcode_t op) { - c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); - c->add(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.rb]._u32[3])); + c->add(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); c->not_(*addr); c->and_(*addr, 0xe); const XmmLink& vr = XmmAlloc(); c->movdqa(vr, XmmConst(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); - c->mov(asmjit::x86::word_ptr(*cpu, addr->r64(), 0, offset32(&spu_thread::gpr, op.rt)), 0x0203); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); + c->mov(asmjit::x86::word_ptr(*cpu, addr->r64(), 0, OFFSET_OF(spu_thread, gpr[op.rt])), 0x0203); } void spu_recompiler::CWX(spu_opcode_t op) { - c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); - c->add(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.rb]._u32[3])); + c->add(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); c->not_(*addr); c->and_(*addr, 0xc); const XmmLink& vr = XmmAlloc(); c->movdqa(vr, XmmConst(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); - c->mov(asmjit::x86::dword_ptr(*cpu, addr->r64(), 0, offset32(&spu_thread::gpr, op.rt)), 0x00010203); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); + c->mov(asmjit::x86::dword_ptr(*cpu, addr->r64(), 0, OFFSET_OF(spu_thread, gpr[op.rt])), 0x00010203); } void spu_recompiler::CDX(spu_opcode_t op) { - c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); - c->add(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.rb]._u32[3])); + c->add(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); c->not_(*addr); c->and_(*addr, 0x8); const XmmLink& vr = XmmAlloc(); c->movdqa(vr, XmmConst(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); c->mov(*qw0, asmjit::Imm(0x0001020304050607ull)); - c->mov(asmjit::x86::qword_ptr(*cpu, addr->r64(), 0, offset32(&spu_thread::gpr, op.rt)), *qw0); + c->mov(asmjit::x86::qword_ptr(*cpu, addr->r64(), 0, OFFSET_OF(spu_thread, gpr[op.rt])), *qw0); } void spu_recompiler::ROTQBI(spu_opcode_t op) @@ -2963,7 +2963,7 @@ void spu_recompiler::ROTQBI(spu_opcode_t op) c->psllq(va, vb); c->psrlq(vt, v4); c->por(vt, va); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); } void spu_recompiler::ROTQMBI(spu_opcode_t op) @@ -2983,7 +2983,7 @@ void spu_recompiler::ROTQMBI(spu_opcode_t op) c->psrlq(va, vb); c->psllq(vt, v4); c->por(vt, va); 
- c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); } void spu_recompiler::SHLQBI(spu_opcode_t op) @@ -3001,7 +3001,7 @@ void spu_recompiler::SHLQBI(spu_opcode_t op) c->psllq(va, vb); c->psrlq(vt, v4); c->por(vt, va); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); } void spu_recompiler::ROTQBY(spu_opcode_t op) @@ -3013,11 +3013,11 @@ void spu_recompiler::ROTQBY(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->mov(*qw0, +g_spu_imm.rldq_pshufb); - c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.rb]._u32[3])); c->and_(*addr, 0xf); c->shl(*addr, 4); c->pshufb(va, asmjit::x86::oword_ptr(*qw0, addr->r64())); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ROTQMBY(spu_opcode_t op) @@ -3029,11 +3029,11 @@ void spu_recompiler::ROTQMBY(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->mov(*qw0, +g_spu_imm.srdq_pshufb); - c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.rb]._u32[3])); c->and_(*addr, 0x1f); c->shl(*addr, 4); c->pshufb(va, asmjit::x86::oword_ptr(*qw0, addr->r64())); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::SHLQBY(spu_opcode_t op) @@ -3045,11 +3045,11 @@ void spu_recompiler::SHLQBY(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->mov(*qw0, +g_spu_imm.sldq_pshufb); - c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.rb]._u32[3])); c->and_(*addr, 0x1f); c->shl(*addr, 4); c->pshufb(va, asmjit::x86::oword_ptr(*qw0, addr->r64())); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ORX(spu_opcode_t op) @@ -3061,7 +3061,7 @@ void spu_recompiler::ORX(spu_opcode_t op) c->pshufd(v1, va, 0x4e); c->por(va, v1); c->pslldq(va, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CBD(spu_opcode_t op) @@ -3073,11 +3073,11 @@ void spu_recompiler::CBD(spu_opcode_t op) // v128 value = v128::fromV(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f)); // value.u8r[op.i7 & 0xf] = 0x03; // c->movdqa(vr, XmmConst(value)); - // c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + // c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); // return; // } - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); if (op.i7) c->add(*addr, +op.i7); c->not_(*addr); @@ -3085,8 +3085,8 @@ void spu_recompiler::CBD(spu_opcode_t op) const XmmLink& vr = XmmAlloc(); c->movdqa(vr, XmmConst(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); - c->mov(asmjit::x86::byte_ptr(*cpu, addr->r64(), 0, offset32(&spu_thread::gpr, op.rt)), 0x03); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); + c->mov(asmjit::x86::byte_ptr(*cpu, addr->r64(), 0, OFFSET_OF(spu_thread, gpr[op.rt])), 0x03); } void spu_recompiler::CHD(spu_opcode_t op) @@ -3098,11 +3098,11 @@ void spu_recompiler::CHD(spu_opcode_t op) // v128 value = v128::fromV(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f)); // value.u16r[(op.i7 >> 1) & 0x7] = 0x0203; // c->movdqa(vr, XmmConst(value)); - // c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + // c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); // return; // } - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); if 
(op.i7) c->add(*addr, +op.i7); c->not_(*addr); @@ -3110,8 +3110,8 @@ void spu_recompiler::CHD(spu_opcode_t op) const XmmLink& vr = XmmAlloc(); c->movdqa(vr, XmmConst(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); - c->mov(asmjit::x86::word_ptr(*cpu, addr->r64(), 0, offset32(&spu_thread::gpr, op.rt)), 0x0203); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); + c->mov(asmjit::x86::word_ptr(*cpu, addr->r64(), 0, OFFSET_OF(spu_thread, gpr[op.rt])), 0x0203); } void spu_recompiler::CWD(spu_opcode_t op) @@ -3123,11 +3123,11 @@ void spu_recompiler::CWD(spu_opcode_t op) // v128 value = v128::fromV(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f)); // value.u32r[(op.i7 >> 2) & 0x3] = 0x00010203; // c->movdqa(vr, XmmConst(value)); - // c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + // c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); // return; // } - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); if (op.i7) c->add(*addr, +op.i7); c->not_(*addr); @@ -3135,8 +3135,8 @@ void spu_recompiler::CWD(spu_opcode_t op) const XmmLink& vr = XmmAlloc(); c->movdqa(vr, XmmConst(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); - c->mov(asmjit::x86::dword_ptr(*cpu, addr->r64(), 0, offset32(&spu_thread::gpr, op.rt)), 0x00010203); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); + c->mov(asmjit::x86::dword_ptr(*cpu, addr->r64(), 0, OFFSET_OF(spu_thread, gpr[op.rt])), 0x00010203); } void spu_recompiler::CDD(spu_opcode_t op) @@ -3148,11 +3148,11 @@ void spu_recompiler::CDD(spu_opcode_t op) // v128 value = v128::fromV(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f)); // value.u64r[(op.i7 >> 3) & 0x1] = 0x0001020304050607ull; // c->movdqa(vr, XmmConst(value)); - // c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + // c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); // return; // } - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); if (op.i7) c->add(*addr, +op.i7); c->not_(*addr); @@ -3160,9 +3160,9 @@ void spu_recompiler::CDD(spu_opcode_t op) const XmmLink& vr = XmmAlloc(); c->movdqa(vr, XmmConst(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); c->mov(*qw0, asmjit::Imm(0x0001020304050607ull)); - c->mov(asmjit::x86::qword_ptr(*cpu, addr->r64(), 0, offset32(&spu_thread::gpr, op.rt)), *qw0); + c->mov(asmjit::x86::qword_ptr(*cpu, addr->r64(), 0, OFFSET_OF(spu_thread, gpr[op.rt])), *qw0); } void spu_recompiler::ROTQBII(spu_opcode_t op) @@ -3173,7 +3173,7 @@ void spu_recompiler::ROTQBII(spu_opcode_t op) c->psllq(va, (op.i7 & 0x7)); c->psrlq(vt, 64 - (op.i7 & 0x7)); c->por(vt, va); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); } void spu_recompiler::ROTQMBII(spu_opcode_t op) @@ -3185,7 +3185,7 @@ void spu_recompiler::ROTQMBII(spu_opcode_t op) c->psrlq(va, ((0 - op.i7) & 0x7)); c->psllq(vt, 64 - ((0 - op.i7) & 0x7)); c->por(vt, va); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); } void spu_recompiler::SHLQBII(spu_opcode_t op) @@ -3197,7 +3197,7 @@ void spu_recompiler::SHLQBII(spu_opcode_t op) c->psllq(va, (op.i7 & 0x7)); c->psrlq(vt, 64 - (op.i7 & 0x7)); c->por(vt, va); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); } void spu_recompiler::ROTQBYI(spu_opcode_t op) @@ -3225,7 +3225,7 @@ void 
spu_recompiler::ROTQBYI(spu_opcode_t op) c->por(va, v2); } - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ROTQMBYI(spu_opcode_t op) @@ -3233,7 +3233,7 @@ void spu_recompiler::ROTQMBYI(spu_opcode_t op) const int s = (0 - op.i7) & 0x1f; const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->psrldq(va, s); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::SHLQBYI(spu_opcode_t op) @@ -3241,7 +3241,7 @@ void spu_recompiler::SHLQBYI(spu_opcode_t op) const int s = op.i7 & 0x1f; const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pslldq(va, s); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::NOP(spu_opcode_t) @@ -3251,23 +3251,23 @@ void spu_recompiler::NOP(spu_opcode_t) void spu_recompiler::CGT(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->pcmpgtd(va, SPU_OFF_128(gpr, op.rb)); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->pcmpgtd(va, SPU_OFF_128(gpr[op.rb])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::XOR(spu_opcode_t op) { // xor const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->pxor(va, SPU_OFF_128(gpr, op.rb)); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->pxor(va, SPU_OFF_128(gpr[op.rb])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CGTH(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->pcmpgtw(va, SPU_OFF_128(gpr, op.rb)); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->pcmpgtw(va, SPU_OFF_128(gpr[op.rb])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::EQV(spu_opcode_t op) @@ -3276,21 +3276,21 @@ void spu_recompiler::EQV(spu_opcode_t op) if (utils::has_avx512()) { - c->vpternlogd(vb, vb, SPU_OFF_128(gpr, op.ra), 0x99 /* xnorCB */); - c->movdqa(SPU_OFF_128(gpr, op.rt), vb); + c->vpternlogd(vb, vb, SPU_OFF_128(gpr[op.ra]), 0x99 /* xnorCB */); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vb); return; } c->pxor(vb, XmmConst(v128::from32p(0xffffffff))); - c->pxor(vb, SPU_OFF_128(gpr, op.ra)); - c->movdqa(SPU_OFF_128(gpr, op.rt), vb); + c->pxor(vb, SPU_OFF_128(gpr[op.ra])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vb); } void spu_recompiler::CGTB(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->pcmpgtb(va, SPU_OFF_128(gpr, op.rb)); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->pcmpgtb(va, SPU_OFF_128(gpr[op.rb])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::SUMB(spu_opcode_t op) @@ -3316,13 +3316,13 @@ void spu_recompiler::SUMB(spu_opcode_t op) c->paddw(va, v1); c->paddw(vb, v2); c->por(va, vb); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::HGT(spu_opcode_t op) { - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_s32, 3)); - c->cmp(*addr, SPU_OFF_32(gpr, op.rb, &v128::_s32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._s32[3])); + c->cmp(*addr, SPU_OFF_32(gpr[op.rb]._s32[3])); asmjit::Label label = c->newLabel(); asmjit::Label ret = c->newLabel(); @@ -3347,26 +3347,26 @@ void spu_recompiler::CLZ(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); const XmmLink& vt = XmmAlloc(); c->vplzcntd(vt, va); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } c->mov(qw0->r32(), 32 + 31); for (u32 i = 0; i < 4; i++) // unrolled loop { - c->bsr(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, i)); + c->bsr(*addr, SPU_OFF_32(gpr[op.ra]._u32[i])); c->cmovz(*addr, qw0->r32()); c->xor_(*addr, 
31); - c->mov(SPU_OFF_32(gpr, op.rt, &v128::_u32, i), *addr); + c->mov(SPU_OFF_32(gpr[op.rt]._u32[i]), *addr); } } void spu_recompiler::XSWD(spu_opcode_t op) { - c->movsxd(*qw0, SPU_OFF_32(gpr, op.ra, &v128::_s32, 0)); - c->movsxd(*qw1, SPU_OFF_32(gpr, op.ra, &v128::_s32, 2)); - c->mov(SPU_OFF_64(gpr, op.rt, &v128::_s64, 0), *qw0); - c->mov(SPU_OFF_64(gpr, op.rt, &v128::_s64, 1), *qw1); + c->movsxd(*qw0, SPU_OFF_32(gpr[op.ra]._s32[0])); + c->movsxd(*qw1, SPU_OFF_32(gpr[op.ra]._s32[2])); + c->mov(SPU_OFF_64(gpr[op.rt]._s64[0]), *qw0); + c->mov(SPU_OFF_64(gpr[op.rt]._s64[1]), *qw1); } void spu_recompiler::XSHW(spu_opcode_t op) @@ -3374,7 +3374,7 @@ void spu_recompiler::XSHW(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pslld(va, 16); c->psrad(va, 16); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CNTB(spu_opcode_t op) @@ -3400,7 +3400,7 @@ void spu_recompiler::CNTB(spu_opcode_t op) c->psrlq(v1, 4); c->pand(v1, vm); c->paddb(va, v1); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::XSBH(spu_opcode_t op) @@ -3408,7 +3408,7 @@ void spu_recompiler::XSBH(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->psllw(va, 8); c->psraw(va, 8); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CLGT(spu_opcode_t op) @@ -3418,17 +3418,17 @@ void spu_recompiler::CLGT(spu_opcode_t op) const XmmLink& vi = XmmAlloc(); c->movdqa(vi, XmmConst(v128::from32p(0x80000000))); c->pxor(va, vi); - c->pxor(vi, SPU_OFF_128(gpr, op.rb)); + c->pxor(vi, SPU_OFF_128(gpr[op.rb])); c->pcmpgtd(va, vi); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ANDC(spu_opcode_t op) { // and not const XmmLink& vb = XmmGet(op.rb, XmmType::Int); - c->pandn(vb, SPU_OFF_128(gpr, op.ra)); - c->movdqa(SPU_OFF_128(gpr, op.rt), vb); + c->pandn(vb, SPU_OFF_128(gpr[op.ra])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vb); } void spu_recompiler::FCGT(spu_opcode_t op) @@ -3444,31 +3444,31 @@ void spu_recompiler::FCGT(spu_opcode_t op) c->pxor(tmp0, tmp0); c->pxor(tmp1, tmp1); - c->cmpps(tmp0, SPU_OFF_128(gpr, op.ra), 3); // tmp0 is true if a is extended (nan/inf) - c->cmpps(tmp1, SPU_OFF_128(gpr, op.rb), 3); // tmp1 is true if b is extended (nan/inf) + c->cmpps(tmp0, SPU_OFF_128(gpr[op.ra]), 3); // tmp0 is true if a is extended (nan/inf) + c->cmpps(tmp1, SPU_OFF_128(gpr[op.rb]), 3); // tmp1 is true if b is extended (nan/inf) // compute lower a and b c->movaps(tmp2, last_exp_bit); c->movaps(tmp3, last_exp_bit); - c->pandn(tmp2, SPU_OFF_128(gpr, op.ra)); // tmp2 = lowered_a - c->pandn(tmp3, SPU_OFF_128(gpr, op.rb)); // tmp3 = lowered_b + c->pandn(tmp2, SPU_OFF_128(gpr[op.ra])); // tmp2 = lowered_a + c->pandn(tmp3, SPU_OFF_128(gpr[op.rb])); // tmp3 = lowered_b // lower a if extended c->movaps(tmpv, tmp0); c->pand(tmpv, tmp2); - c->pandn(tmp0, SPU_OFF_128(gpr, op.ra)); + c->pandn(tmp0, SPU_OFF_128(gpr[op.ra])); c->orps(tmp0, tmpv); // lower b if extended c->movaps(tmpv, tmp1); c->pand(tmpv, tmp3); - c->pandn(tmp1, SPU_OFF_128(gpr, op.rb)); + c->pandn(tmp1, SPU_OFF_128(gpr[op.rb])); c->orps(tmp1, tmpv); // flush to 0 if denormalized c->pxor(tmpv, tmpv); - c->movaps(tmp2, SPU_OFF_128(gpr, op.ra)); - c->movaps(tmp3, SPU_OFF_128(gpr, op.rb)); + c->movaps(tmp2, SPU_OFF_128(gpr[op.ra])); + c->movaps(tmp3, SPU_OFF_128(gpr[op.rb])); c->andps(tmp2, all_exp_bits); c->andps(tmp3, all_exp_bits); 
c->cmpps(tmp2, tmpv, 0); @@ -3477,7 +3477,7 @@ void spu_recompiler::FCGT(spu_opcode_t op) c->pandn(tmp3, tmp1); c->cmpps(tmp3, tmp2, 1); - c->movaps(SPU_OFF_128(gpr, op.rt), tmp3); + c->movaps(SPU_OFF_128(gpr[op.rt]), tmp3); } void spu_recompiler::DFCGT(spu_opcode_t op) @@ -3488,15 +3488,15 @@ void spu_recompiler::DFCGT(spu_opcode_t op) void spu_recompiler::FA(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Float); - c->addps(va, SPU_OFF_128(gpr, op.rb)); - c->movaps(SPU_OFF_128(gpr, op.rt), va); + c->addps(va, SPU_OFF_128(gpr[op.rb])); + c->movaps(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::FS(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Float); - c->subps(va, SPU_OFF_128(gpr, op.rb)); - c->movaps(SPU_OFF_128(gpr, op.rt), va); + c->subps(va, SPU_OFF_128(gpr[op.rb])); + c->movaps(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::FM(spu_opcode_t op) @@ -3536,8 +3536,8 @@ void spu_recompiler::FM(spu_opcode_t op) c->movaps(tmp4, sign_bits); c->movaps(tmp5, sign_bits); c->movaps(tmp0, sign_bits); - c->andps(tmp4, SPU_OFF_128(gpr, op.ra)); - c->andps(tmp5, SPU_OFF_128(gpr, op.rb)); + c->andps(tmp4, SPU_OFF_128(gpr[op.ra])); + c->andps(tmp5, SPU_OFF_128(gpr[op.rb])); c->xorps(tmp4, tmp5); // sign mask c->pandn(tmp0, tmp2); c->orps(tmp4, tmp0); // add result sign back to original extended value @@ -3549,7 +3549,7 @@ void spu_recompiler::FM(spu_opcode_t op) c->andnps(tmp0, tmp3); c->andps(tmp2, tmp5); c->orps(tmp0, tmp2); - c->movaps(SPU_OFF_128(gpr, op.rt), tmp0); + c->movaps(SPU_OFF_128(gpr[op.rt]), tmp0); } void spu_recompiler::CLGTH(spu_opcode_t op) @@ -3559,9 +3559,9 @@ void spu_recompiler::CLGTH(spu_opcode_t op) const XmmLink& vi = XmmAlloc(); c->movdqa(vi, XmmConst(v128::from16p(0x8000))); c->pxor(va, vi); - c->pxor(vi, SPU_OFF_128(gpr, op.rb)); + c->pxor(vi, SPU_OFF_128(gpr[op.rb])); c->pcmpgtw(va, vi); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ORC(spu_opcode_t op) @@ -3570,14 +3570,14 @@ void spu_recompiler::ORC(spu_opcode_t op) if (utils::has_avx512()) { - c->vpternlogd(vb, vb, SPU_OFF_128(gpr, op.ra), 0xbb /* orC!B */); - c->movdqa(SPU_OFF_128(gpr, op.rt), vb); + c->vpternlogd(vb, vb, SPU_OFF_128(gpr[op.ra]), 0xbb /* orC!B */); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vb); return; } c->pxor(vb, XmmConst(v128::from32p(0xffffffff))); - c->por(vb, SPU_OFF_128(gpr, op.ra)); - c->movdqa(SPU_OFF_128(gpr, op.rt), vb); + c->por(vb, SPU_OFF_128(gpr[op.ra])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vb); } void spu_recompiler::FCMGT(spu_opcode_t op) @@ -3596,19 +3596,19 @@ void spu_recompiler::FCMGT(spu_opcode_t op) c->pxor(tmp0, tmp0); c->pxor(tmp1, tmp1); - c->cmpps(tmp0, SPU_OFF_128(gpr, op.ra), 3); // tmp0 is true if a is extended (nan/inf) - c->cmpps(tmp1, SPU_OFF_128(gpr, op.rb), 3); // tmp1 is true if b is extended (nan/inf) + c->cmpps(tmp0, SPU_OFF_128(gpr[op.ra]), 3); // tmp0 is true if a is extended (nan/inf) + c->cmpps(tmp1, SPU_OFF_128(gpr[op.rb]), 3); // tmp1 is true if b is extended (nan/inf) // flush to 0 if denormalized c->pxor(tmpv, tmpv); - c->movaps(tmp2, SPU_OFF_128(gpr, op.ra)); - c->movaps(tmp3, SPU_OFF_128(gpr, op.rb)); + c->movaps(tmp2, SPU_OFF_128(gpr[op.ra])); + c->movaps(tmp3, SPU_OFF_128(gpr[op.rb])); c->andps(tmp2, all_exp_bits); c->andps(tmp3, all_exp_bits); c->cmpps(tmp2, tmpv, 0); c->cmpps(tmp3, tmpv, 0); - c->pandn(tmp2, SPU_OFF_128(gpr, op.ra)); - c->pandn(tmp3, SPU_OFF_128(gpr, op.rb)); + c->pandn(tmp2, SPU_OFF_128(gpr[op.ra])); + c->pandn(tmp3, 
SPU_OFF_128(gpr[op.rb])); // Set tmp1 to true where a is extended but b is not extended // This is a simplification since absolute values remove necessity of lowering @@ -3619,7 +3619,7 @@ void spu_recompiler::FCMGT(spu_opcode_t op) c->andps(tmp3, remove_sign_bits); c->cmpps(tmp3, tmp2, 1); c->orps(tmp3, tmp1); // Force result to all true if a is extended but b is not - c->movaps(SPU_OFF_128(gpr, op.rt), tmp3); + c->movaps(SPU_OFF_128(gpr[op.rt]), tmp3); } void spu_recompiler::DFCMGT(spu_opcode_t op) @@ -3630,22 +3630,22 @@ void spu_recompiler::DFCMGT(spu_opcode_t op) void spu_recompiler::DFA(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Double); - c->addpd(va, SPU_OFF_128(gpr, op.rb)); - c->movapd(SPU_OFF_128(gpr, op.rt), va); + c->addpd(va, SPU_OFF_128(gpr[op.rb])); + c->movapd(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::DFS(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Double); - c->subpd(va, SPU_OFF_128(gpr, op.rb)); - c->movapd(SPU_OFF_128(gpr, op.rt), va); + c->subpd(va, SPU_OFF_128(gpr[op.rb])); + c->movapd(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::DFM(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Double); - c->mulpd(va, SPU_OFF_128(gpr, op.rb)); - c->movapd(SPU_OFF_128(gpr, op.rt), va); + c->mulpd(va, SPU_OFF_128(gpr[op.rb])); + c->movapd(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CLGTB(spu_opcode_t op) @@ -3655,15 +3655,15 @@ void spu_recompiler::CLGTB(spu_opcode_t op) const XmmLink& vi = XmmAlloc(); c->movdqa(vi, XmmConst(v128::from8p(0x80))); c->pxor(va, vi); - c->pxor(vi, SPU_OFF_128(gpr, op.rb)); + c->pxor(vi, SPU_OFF_128(gpr[op.rb])); c->pcmpgtb(va, vi); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::HLGT(spu_opcode_t op) { - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); - c->cmp(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); + c->cmp(*addr, SPU_OFF_32(gpr[op.rb]._u32[3])); asmjit::Label label = c->newLabel(); asmjit::Label ret = c->newLabel(); @@ -3685,44 +3685,44 @@ void spu_recompiler::DFMA(spu_opcode_t op) { const XmmLink& vr = XmmGet(op.rt, XmmType::Double); const XmmLink& va = XmmGet(op.ra, XmmType::Double); - c->mulpd(va, SPU_OFF_128(gpr, op.rb)); + c->mulpd(va, SPU_OFF_128(gpr[op.rb])); c->addpd(vr, va); - c->movapd(SPU_OFF_128(gpr, op.rt), vr); + c->movapd(SPU_OFF_128(gpr[op.rt]), vr); } void spu_recompiler::DFMS(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Double); const XmmLink& vt = XmmGet(op.rt, XmmType::Double); - c->mulpd(va, SPU_OFF_128(gpr, op.rb)); + c->mulpd(va, SPU_OFF_128(gpr[op.rb])); c->subpd(va, vt); - c->movapd(SPU_OFF_128(gpr, op.rt), va); + c->movapd(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::DFNMS(spu_opcode_t op) { const XmmLink& vr = XmmGet(op.rt, XmmType::Double); const XmmLink& va = XmmGet(op.ra, XmmType::Double); - c->mulpd(va, SPU_OFF_128(gpr, op.rb)); + c->mulpd(va, SPU_OFF_128(gpr[op.rb])); c->subpd(vr, va); - c->movapd(SPU_OFF_128(gpr, op.rt), vr); + c->movapd(SPU_OFF_128(gpr[op.rt]), vr); } void spu_recompiler::DFNMA(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Double); const XmmLink& vt = XmmGet(op.rt, XmmType::Double); - c->mulpd(va, SPU_OFF_128(gpr, op.rb)); + c->mulpd(va, SPU_OFF_128(gpr[op.rb])); c->addpd(va, vt); c->xorpd(va, XmmConst(v128::from64p(0x8000000000000000))); - c->movapd(SPU_OFF_128(gpr, op.rt), va); + c->movapd(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CEQ(spu_opcode_t op) 
{ const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->pcmpeqd(va, SPU_OFF_128(gpr, op.rb)); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->pcmpeqd(va, SPU_OFF_128(gpr[op.rb])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::MPYHHU(spu_opcode_t op) @@ -3736,16 +3736,16 @@ void spu_recompiler::MPYHHU(spu_opcode_t op) c->pand(va, XmmConst(v128::from32p(0xffff0000))); c->psrld(va2, 16); c->por(va, va2); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ADDX(spu_opcode_t op) { const XmmLink& vt = XmmGet(op.rt, XmmType::Int); c->pand(vt, XmmConst(v128::from32p(1))); - c->paddd(vt, SPU_OFF_128(gpr, op.ra)); - c->paddd(vt, SPU_OFF_128(gpr, op.rb)); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->paddd(vt, SPU_OFF_128(gpr[op.ra])); + c->paddd(vt, SPU_OFF_128(gpr[op.rb])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); } void spu_recompiler::SFX(spu_opcode_t op) @@ -3753,9 +3753,9 @@ void spu_recompiler::SFX(spu_opcode_t op) const XmmLink& vt = XmmGet(op.rt, XmmType::Int); const XmmLink& vb = XmmGet(op.rb, XmmType::Int); c->pandn(vt, XmmConst(v128::from32p(1))); - c->psubd(vb, SPU_OFF_128(gpr, op.ra)); + c->psubd(vb, SPU_OFF_128(gpr[op.ra])); c->psubd(vb, vt); - c->movdqa(SPU_OFF_128(gpr, op.rt), vb); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vb); } void spu_recompiler::CGX(spu_opcode_t op) // nf @@ -3788,7 +3788,7 @@ void spu_recompiler::CGX(spu_opcode_t op) // nf c->pand(res, vt); c->por(res, va); c->psrld(res, 31); - c->movdqa(SPU_OFF_128(gpr, op.rt), res); + c->movdqa(SPU_OFF_128(gpr[op.rt]), res); } void spu_recompiler::BGX(spu_opcode_t op) // nf @@ -3818,7 +3818,7 @@ void spu_recompiler::BGX(spu_opcode_t op) // nf c->pcmpgtd(vb, va); c->por(vt, vb); c->psrld(vt, 31); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); } void spu_recompiler::MPYHHA(spu_opcode_t op) @@ -3830,7 +3830,7 @@ void spu_recompiler::MPYHHA(spu_opcode_t op) c->psrld(vb, 16); c->pmaddwd(va, vb); c->paddd(vt, va); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); } void spu_recompiler::MPYHHAU(spu_opcode_t op) @@ -3846,7 +3846,7 @@ void spu_recompiler::MPYHHAU(spu_opcode_t op) c->psrld(va2, 16); c->paddd(vt, va); c->paddd(vt, va2); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); } void spu_recompiler::FSCRRD(spu_opcode_t op) @@ -3854,7 +3854,7 @@ void spu_recompiler::FSCRRD(spu_opcode_t op) // zero (hack) const XmmLink& v0 = XmmAlloc(); c->pxor(v0, v0); - c->movdqa(SPU_OFF_128(gpr, op.rt), v0); + c->movdqa(SPU_OFF_128(gpr[op.rt]), v0); } void spu_recompiler::FESD(spu_opcode_t op) @@ -3862,7 +3862,7 @@ void spu_recompiler::FESD(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Float); c->shufps(va, va, 0x8d); // _f[0] = _f[1]; _f[1] = _f[3]; c->cvtps2pd(va, va); - c->movapd(SPU_OFF_128(gpr, op.rt), va); + c->movapd(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::FRDS(spu_opcode_t op) @@ -3870,7 +3870,7 @@ void spu_recompiler::FRDS(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Double); c->cvtpd2ps(va, va); c->shufps(va, va, 0x72); // _f[1] = _f[0]; _f[3] = _f[1]; _f[0] = _f[2] = 0; - c->movaps(SPU_OFF_128(gpr, op.rt), va); + c->movaps(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::FSCRWR(spu_opcode_t /*op*/) @@ -3887,8 +3887,8 @@ void spu_recompiler::FCEQ(spu_opcode_t op) { // compare equal const XmmLink& vb = XmmGet(op.rb, XmmType::Float); - c->cmpps(vb, SPU_OFF_128(gpr, op.ra), 0); - c->movaps(SPU_OFF_128(gpr, op.rt), vb); 
+ c->cmpps(vb, SPU_OFF_128(gpr[op.ra]), 0); + c->movaps(SPU_OFF_128(gpr[op.rt]), vb); } void spu_recompiler::DFCEQ(spu_opcode_t op) @@ -3905,7 +3905,7 @@ void spu_recompiler::MPY(spu_opcode_t op) c->pand(va, vi); c->pand(vb, vi); c->pmaddwd(va, vb); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::MPYH(spu_opcode_t op) @@ -3915,7 +3915,7 @@ void spu_recompiler::MPYH(spu_opcode_t op) c->psrld(va, 16); c->pmullw(va, vb); c->pslld(va, 16); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::MPYHH(spu_opcode_t op) @@ -3925,7 +3925,7 @@ void spu_recompiler::MPYHH(spu_opcode_t op) c->psrld(va, 16); c->psrld(vb, 16); c->pmaddwd(va, vb); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::MPYS(spu_opcode_t op) @@ -3935,14 +3935,14 @@ void spu_recompiler::MPYS(spu_opcode_t op) c->pmulhw(va, vb); c->pslld(va, 16); c->psrad(va, 16); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CEQH(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->pcmpeqw(va, SPU_OFF_128(gpr, op.rb)); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->pcmpeqw(va, SPU_OFF_128(gpr[op.rb])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::FCMEQ(spu_opcode_t op) @@ -3951,9 +3951,9 @@ void spu_recompiler::FCMEQ(spu_opcode_t op) const XmmLink& vi = XmmAlloc(); c->movaps(vi, XmmConst(v128::from32p(0x7fffffff))); c->andps(vb, vi); // abs - c->andps(vi, SPU_OFF_128(gpr, op.ra)); + c->andps(vi, SPU_OFF_128(gpr[op.ra])); c->cmpps(vb, vi, 0); // == - c->movaps(SPU_OFF_128(gpr, op.rt), vb); + c->movaps(SPU_OFF_128(gpr[op.rt]), vb); } void spu_recompiler::DFCMEQ(spu_opcode_t op) @@ -3972,14 +3972,14 @@ void spu_recompiler::MPYU(spu_opcode_t op) c->pslld(va, 16); c->pand(va2, XmmConst(v128::from32p(0xffff))); c->por(va, va2); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CEQB(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->pcmpeqb(va, SPU_OFF_128(gpr, op.rb)); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->pcmpeqb(va, SPU_OFF_128(gpr[op.rb])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::FI(spu_opcode_t op) @@ -4028,13 +4028,13 @@ void spu_recompiler::FI(spu_opcode_t op) c->pand(ymul, XmmConst(v128::from32p(1 << 23))); c->psubd(vb, ymul); - c->movaps(SPU_OFF_128(gpr, op.rt), vb); + c->movaps(SPU_OFF_128(gpr[op.rt]), vb); } void spu_recompiler::HEQ(spu_opcode_t op) { - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_s32, 3)); - c->cmp(*addr, SPU_OFF_32(gpr, op.rb, &v128::_s32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._s32[3])); + c->cmp(*addr, SPU_OFF_32(gpr[op.rb]._s32[3])); asmjit::Label label = c->newLabel(); asmjit::Label ret = c->newLabel(); @@ -4062,7 +4062,7 @@ void spu_recompiler::CFLTS(spu_opcode_t op) c->cmpps(vi, va, 2); c->cvttps2dq(va, va); // convert to ints with truncation c->pxor(va, vi); // fix result saturation (0x80000000 -> 0x7fffffff) - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CFLTU(spu_opcode_t op) @@ -4079,7 +4079,7 @@ void spu_recompiler::CFLTU(spu_opcode_t op) c->vcvttps2udq(vs, va); c->psrad(va, 31); c->pandn(va, vs); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); return; } @@ -4097,7 +4097,7 @@ void spu_recompiler::CFLTU(spu_opcode_t op) c->cvttps2dq(vs2, 
vs2); c->por(va, vs); c->por(va, vs2); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CSFLT(spu_opcode_t op) @@ -4106,7 +4106,7 @@ void spu_recompiler::CSFLT(spu_opcode_t op) c->cvtdq2ps(va, va); // convert to floats if (op.i8 != 155) c->mulps(va, XmmConst(v128::fromf32p(std::exp2(static_cast(static_cast(op.i8 - 155)))))); // scale - c->movaps(SPU_OFF_128(gpr, op.rt), va); + c->movaps(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CUFLT(spu_opcode_t op) @@ -4130,7 +4130,7 @@ void spu_recompiler::CUFLT(spu_opcode_t op) if (op.i8 != 155) c->mulps(va, XmmConst(v128::fromf32p(std::exp2(static_cast(static_cast(op.i8 - 155)))))); // scale - c->movaps(SPU_OFF_128(gpr, op.rt), va); + c->movaps(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::BRZ(spu_opcode_t op) @@ -4143,7 +4143,7 @@ void spu_recompiler::BRZ(spu_opcode_t op) } asmjit::Label branch_label = c->newLabel(); - c->cmp(SPU_OFF_32(gpr, op.rt, &v128::_u32, 3), 0); + c->cmp(SPU_OFF_32(gpr[op.rt]._u32[3]), 0); c->je(branch_label); after.emplace_back([=, this]() @@ -4164,8 +4164,8 @@ void spu_recompiler::STQA(spu_opcode_t op) } else { - c->mov(*qw0, SPU_OFF_64(gpr, op.rt, &v128::_u64, 0)); - c->mov(*qw1, SPU_OFF_64(gpr, op.rt, &v128::_u64, 1)); + c->mov(*qw0, SPU_OFF_64(gpr[op.rt]._u64[0])); + c->mov(*qw1, SPU_OFF_64(gpr[op.rt]._u64[1])); c->bswap(*qw0); c->bswap(*qw1); c->mov(asmjit::x86::qword_ptr(*ls, spu_ls_target(0, op.i16) + 0), *qw1); @@ -4183,7 +4183,7 @@ void spu_recompiler::BRNZ(spu_opcode_t op) } asmjit::Label branch_label = c->newLabel(); - c->cmp(SPU_OFF_32(gpr, op.rt, &v128::_u32, 3), 0); + c->cmp(SPU_OFF_32(gpr[op.rt]._u32[3]), 0); c->jne(branch_label); after.emplace_back([=, this]() @@ -4204,7 +4204,7 @@ void spu_recompiler::BRHZ(spu_opcode_t op) } asmjit::Label branch_label = c->newLabel(); - c->cmp(SPU_OFF_16(gpr, op.rt, &v128::_u16, 6), 0); + c->cmp(SPU_OFF_16(gpr[op.rt]._u16[6]), 0); c->je(branch_label); after.emplace_back([=, this]() @@ -4225,7 +4225,7 @@ void spu_recompiler::BRHNZ(spu_opcode_t op) } asmjit::Label branch_label = c->newLabel(); - c->cmp(SPU_OFF_16(gpr, op.rt, &v128::_u16, 6), 0); + c->cmp(SPU_OFF_16(gpr[op.rt]._u16[6]), 0); c->jne(branch_label); after.emplace_back([=, this]() @@ -4249,8 +4249,8 @@ void spu_recompiler::STQR(spu_opcode_t op) } else { - c->mov(*qw0, SPU_OFF_64(gpr, op.rt, &v128::_u64, 0)); - c->mov(*qw1, SPU_OFF_64(gpr, op.rt, &v128::_u64, 1)); + c->mov(*qw0, SPU_OFF_64(gpr[op.rt]._u64[0])); + c->mov(*qw1, SPU_OFF_64(gpr[op.rt]._u64[1])); c->bswap(*qw0); c->bswap(*qw1); c->mov(asmjit::x86::qword_ptr(*ls, addr->r64(), 0, 0), *qw1); @@ -4273,7 +4273,7 @@ void spu_recompiler::LQA(spu_opcode_t op) const XmmLink& vt = XmmAlloc(); c->movdqa(vt, asmjit::x86::oword_ptr(*ls, spu_ls_target(0, op.i16))); c->pshufb(vt, XmmConst(v128::from32r(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); } else { @@ -4281,8 +4281,8 @@ void spu_recompiler::LQA(spu_opcode_t op) c->mov(*qw1, asmjit::x86::qword_ptr(*ls, spu_ls_target(0, op.i16) + 8)); c->bswap(*qw0); c->bswap(*qw1); - c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw1); - c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw0); + c->mov(SPU_OFF_64(gpr[op.rt]._u64[0]), *qw1); + c->mov(SPU_OFF_64(gpr[op.rt]._u64[1]), *qw0); } } @@ -4295,7 +4295,7 @@ void spu_recompiler::BRASL(spu_opcode_t op) c->and_(*addr, 0x3fffc); c->movd(vr, *addr); c->pslldq(vr, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + 
c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); branch_set_link(m_pos + 4); branch_fixed(target, true); @@ -4321,7 +4321,7 @@ void spu_recompiler::FSMBI(spu_opcode_t op) const XmmLink& vr = XmmAlloc(); c->movdqa(vr, XmmConst(data)); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); } void spu_recompiler::BRSL(spu_opcode_t op) @@ -4333,7 +4333,7 @@ void spu_recompiler::BRSL(spu_opcode_t op) c->and_(*addr, 0x3fffc); c->movd(vr, *addr); c->pslldq(vr, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); if (target != m_pos + 4) { @@ -4353,7 +4353,7 @@ void spu_recompiler::LQR(spu_opcode_t op) const XmmLink& vt = XmmAlloc(); c->movdqa(vt, asmjit::x86::oword_ptr(*ls, addr->r64())); c->pshufb(vt, XmmConst(v128::from32r(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); } else { @@ -4361,8 +4361,8 @@ void spu_recompiler::LQR(spu_opcode_t op) c->mov(*qw1, asmjit::x86::qword_ptr(*ls, addr->r64(), 0, 8)); c->bswap(*qw0); c->bswap(*qw1); - c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw1); - c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw0); + c->mov(SPU_OFF_64(gpr[op.rt]._u64[0]), *qw1); + c->mov(SPU_OFF_64(gpr[op.rt]._u64[1]), *qw0); } } @@ -4370,28 +4370,28 @@ void spu_recompiler::IL(spu_opcode_t op) { const XmmLink& vr = XmmAlloc(); c->movdqa(vr, XmmConst(v128::from32p(op.si16))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); } void spu_recompiler::ILHU(spu_opcode_t op) { const XmmLink& vr = XmmAlloc(); c->movdqa(vr, XmmConst(v128::from32p(op.i16 << 16))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); } void spu_recompiler::ILH(spu_opcode_t op) { const XmmLink& vr = XmmAlloc(); c->movdqa(vr, XmmConst(v128::from16p(op.i16))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); } void spu_recompiler::IOHL(spu_opcode_t op) { const XmmLink& vt = XmmGet(op.rt, XmmType::Int); c->por(vt, XmmConst(v128::from32p(op.i16))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); } void spu_recompiler::ORI(spu_opcode_t op) @@ -4399,58 +4399,58 @@ void spu_recompiler::ORI(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); if (op.si10) c->por(va, XmmConst(v128::from32p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ORHI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->por(va, XmmConst(v128::from16p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ORBI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->por(va, XmmConst(v128::from8p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::SFI(spu_opcode_t op) { const XmmLink& vr = XmmAlloc(); c->movdqa(vr, XmmConst(v128::from32p(op.si10))); - c->psubd(vr, SPU_OFF_128(gpr, op.ra)); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->psubd(vr, SPU_OFF_128(gpr[op.ra])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); } void spu_recompiler::SFHI(spu_opcode_t op) { const XmmLink& vr = XmmAlloc(); c->movdqa(vr, XmmConst(v128::from16p(op.si10))); - c->psubw(vr, SPU_OFF_128(gpr, op.ra)); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->psubw(vr, SPU_OFF_128(gpr[op.ra])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); } void 
spu_recompiler::ANDI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pand(va, XmmConst(v128::from32p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ANDHI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pand(va, XmmConst(v128::from16p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ANDBI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pand(va, XmmConst(v128::from8p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::AI(spu_opcode_t op) @@ -4458,7 +4458,7 @@ void spu_recompiler::AI(spu_opcode_t op) // add const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->paddd(va, XmmConst(v128::from32p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::AHI(spu_opcode_t op) @@ -4466,12 +4466,12 @@ void spu_recompiler::AHI(spu_opcode_t op) // add const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->paddw(va, XmmConst(v128::from16p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::STQD(spu_opcode_t op) { - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); if (op.si10) c->add(*addr, op.si10 * 16); c->and_(*addr, 0x3fff0); @@ -4484,8 +4484,8 @@ void spu_recompiler::STQD(spu_opcode_t op) } else { - c->mov(*qw0, SPU_OFF_64(gpr, op.rt, &v128::_u64, 0)); - c->mov(*qw1, SPU_OFF_64(gpr, op.rt, &v128::_u64, 1)); + c->mov(*qw0, SPU_OFF_64(gpr[op.rt]._u64[0])); + c->mov(*qw1, SPU_OFF_64(gpr[op.rt]._u64[1])); c->bswap(*qw0); c->bswap(*qw1); c->mov(asmjit::x86::qword_ptr(*ls, addr->r64(), 0, 0), *qw1); @@ -4495,7 +4495,7 @@ void spu_recompiler::STQD(spu_opcode_t op) void spu_recompiler::LQD(spu_opcode_t op) { - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); if (op.si10) c->add(*addr, op.si10 * 16); c->and_(*addr, 0x3fff0); @@ -4505,7 +4505,7 @@ void spu_recompiler::LQD(spu_opcode_t op) const XmmLink& vt = XmmAlloc(); c->movdqa(vt, asmjit::x86::oword_ptr(*ls, addr->r64())); c->pshufb(vt, XmmConst(v128::from32r(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); } else { @@ -4513,8 +4513,8 @@ void spu_recompiler::LQD(spu_opcode_t op) c->mov(*qw1, asmjit::x86::qword_ptr(*ls, addr->r64(), 0, 8)); c->bswap(*qw0); c->bswap(*qw1); - c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw1); - c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw0); + c->mov(SPU_OFF_64(gpr[op.rt]._u64[0]), *qw1); + c->mov(SPU_OFF_64(gpr[op.rt]._u64[1]), *qw0); } } @@ -4522,47 +4522,47 @@ void spu_recompiler::XORI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pxor(va, XmmConst(v128::from32p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::XORHI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pxor(va, XmmConst(v128::from16p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::XORBI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pxor(va, XmmConst(v128::from8p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } 
void spu_recompiler::CGTI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pcmpgtd(va, XmmConst(v128::from32p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CGTHI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pcmpgtw(va, XmmConst(v128::from16p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CGTBI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pcmpgtb(va, XmmConst(v128::from8p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::HGTI(spu_opcode_t op) { - c->cmp(SPU_OFF_32(gpr, op.ra, &v128::_s32, 3), +op.si10); + c->cmp(SPU_OFF_32(gpr[op.ra]._s32[3]), +op.si10); asmjit::Label label = c->newLabel(); asmjit::Label ret = c->newLabel(); @@ -4585,7 +4585,7 @@ void spu_recompiler::CLGTI(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pxor(va, XmmConst(v128::from32p(0x80000000))); c->pcmpgtd(va, XmmConst(v128::from32p(op.si10 - 0x80000000))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CLGTHI(spu_opcode_t op) @@ -4593,7 +4593,7 @@ void spu_recompiler::CLGTHI(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pxor(va, XmmConst(v128::from16p(0x8000))); c->pcmpgtw(va, XmmConst(v128::from16p(op.si10 - 0x8000))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CLGTBI(spu_opcode_t op) @@ -4601,12 +4601,12 @@ void spu_recompiler::CLGTBI(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->psubb(va, XmmConst(v128::from8p(0x80))); c->pcmpgtb(va, XmmConst(v128::from8p(op.si10 - 0x80))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::HLGTI(spu_opcode_t op) { - c->cmp(SPU_OFF_32(gpr, op.ra, &v128::_u32, 3), +op.si10); + c->cmp(SPU_OFF_32(gpr[op.ra]._u32[3]), +op.si10); asmjit::Label label = c->newLabel(); asmjit::Label ret = c->newLabel(); @@ -4628,7 +4628,7 @@ void spu_recompiler::MPYI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pmaddwd(va, XmmConst(v128::from32p(op.si10 & 0xffff))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::MPYUI(spu_opcode_t op) @@ -4642,33 +4642,33 @@ void spu_recompiler::MPYUI(spu_opcode_t op) c->pmullw(va2, vi); c->pslld(va, 16); c->por(va, va2); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CEQI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pcmpeqd(va, XmmConst(v128::from32p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CEQHI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pcmpeqw(va, XmmConst(v128::from16p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CEQBI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pcmpeqb(va, XmmConst(v128::from8p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::HEQI(spu_opcode_t op) { - c->cmp(SPU_OFF_32(gpr, op.ra, &v128::_u32, 3), +op.si10); + c->cmp(SPU_OFF_32(gpr[op.ra]._u32[3]), +op.si10); asmjit::Label label = 
c->newLabel(); asmjit::Label ret = c->newLabel(); @@ -4698,7 +4698,7 @@ void spu_recompiler::ILA(spu_opcode_t op) { const XmmLink& vr = XmmAlloc(); c->movdqa(vr, XmmConst(v128::from32p(op.i18))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); } void spu_recompiler::SELB(spu_opcode_t op) @@ -4708,22 +4708,22 @@ void spu_recompiler::SELB(spu_opcode_t op) if (utils::has_avx512()) { - c->vpternlogd(vc, vb, SPU_OFF_128(gpr, op.ra), 0xca /* A?B:C */); - c->movdqa(SPU_OFF_128(gpr, op.rt4), vc); + c->vpternlogd(vc, vb, SPU_OFF_128(gpr[op.ra]), 0xca /* A?B:C */); + c->movdqa(SPU_OFF_128(gpr[op.rt4]), vc); return; } if (utils::has_xop()) { - c->vpcmov(vc, vb, SPU_OFF_128(gpr, op.ra), vc); - c->movdqa(SPU_OFF_128(gpr, op.rt4), vc); + c->vpcmov(vc, vb, SPU_OFF_128(gpr[op.ra]), vc); + c->movdqa(SPU_OFF_128(gpr[op.rt4]), vc); return; } c->pand(vb, vc); - c->pandn(vc, SPU_OFF_128(gpr, op.ra)); + c->pandn(vc, SPU_OFF_128(gpr[op.ra])); c->por(vb, vc); - c->movdqa(SPU_OFF_128(gpr, op.rt4), vb); + c->movdqa(SPU_OFF_128(gpr[op.rt4]), vb); } void spu_recompiler::SHUFB(spu_opcode_t op) @@ -4748,7 +4748,7 @@ void spu_recompiler::SHUFB(spu_opcode_t op) c->setExtraReg(asmjit::x86::k1); c->vpshufb(vt, vb, vm); // {k1} c->vpternlogd(vt, va, vc, 0xf6 /* orAxorBC */); - c->movdqa(SPU_OFF_128(gpr, op.rt4), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt4]), vt); return; } @@ -4800,7 +4800,7 @@ void spu_recompiler::SHUFB(spu_opcode_t op) c->por(vt, vc); } - c->movdqa(SPU_OFF_128(gpr, op.rt4), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt4]), vt); } void spu_recompiler::MPYA(spu_opcode_t op) @@ -4812,8 +4812,8 @@ void spu_recompiler::MPYA(spu_opcode_t op) c->pand(va, vi); c->pand(vb, vi); c->pmaddwd(va, vb); - c->paddd(va, SPU_OFF_128(gpr, op.rc)); - c->movdqa(SPU_OFF_128(gpr, op.rt4), va); + c->paddd(va, SPU_OFF_128(gpr[op.rc])); + c->movdqa(SPU_OFF_128(gpr[op.rt4]), va); } void spu_recompiler::FNMS(spu_opcode_t op) @@ -4834,9 +4834,9 @@ void spu_recompiler::FNMS(spu_opcode_t op) c->andps(vb, v2); // vb = rb & ~rb_extended c->mulps(va, vb); - c->movaps(vb, SPU_OFF_128(gpr, op.rc)); + c->movaps(vb, SPU_OFF_128(gpr[op.rc])); c->subps(vb, va); - c->movaps(SPU_OFF_128(gpr, op.rt4), vb); + c->movaps(SPU_OFF_128(gpr[op.rt4]), vb); } void spu_recompiler::FMA(spu_opcode_t op) @@ -4857,8 +4857,8 @@ void spu_recompiler::FMA(spu_opcode_t op) c->andps(vb, v2); // vb = rb & ~rb_extended c->mulps(va, vb); - c->addps(va, SPU_OFF_128(gpr, op.rc)); - c->movaps(SPU_OFF_128(gpr, op.rt4), va); + c->addps(va, SPU_OFF_128(gpr[op.rc])); + c->movaps(SPU_OFF_128(gpr[op.rt4]), va); } void spu_recompiler::FMS(spu_opcode_t op) @@ -4879,6 +4879,6 @@ void spu_recompiler::FMS(spu_opcode_t op) c->andps(vb, v2); // vb = rb & ~rb_extended c->mulps(va, vb); - c->subps(va, SPU_OFF_128(gpr, op.rc)); - c->movaps(SPU_OFF_128(gpr, op.rt4), va); + c->subps(va, SPU_OFF_128(gpr[op.rc])); + c->movaps(SPU_OFF_128(gpr[op.rt4]), va); } diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h index 622ce7f6d..410de7b37 100644 --- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h +++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h @@ -5,8 +5,6 @@ #include -union v128; - // SPU ASMJIT Recompiler class spu_recompiler : public spu_recompiler_base { diff --git a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp index 74ae4055a..0a26b572d 100644 --- a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp @@ -203,7 +203,7 @@ DECLARE(spu_runtime::tr_all) = [] *raw++ = 0x41; *raw++ = 
0x8b; *raw++ = 0x45; - *raw++ = ::narrow(::offset32(&spu_thread::pc)); + *raw++ = ::narrow(OFFSET_OF(spu_thread, pc)); // Get LS address starting from PC: lea rcx, [rbp + rax] *raw++ = 0x48; @@ -233,7 +233,7 @@ DECLARE(spu_runtime::tr_all) = [] *raw++ = 0x49; *raw++ = 0xc7; *raw++ = 0x45; - *raw++ = ::narrow(::offset32(&spu_thread::block_hash)); + *raw++ = ::narrow(OFFSET_OF(spu_thread, block_hash)); *raw++ = 0x00; *raw++ = 0x00; *raw++ = 0x00; @@ -259,11 +259,11 @@ DECLARE(spu_runtime::tr_all) = [] // x19 = m_thread a.k.a arg[0] // x20 = ls_base // x21 - x22 = args[2 - 3] - // ensure(::offset32(&spu_thread::pc) <= 32760); - // ensure(::offset32(&spu_thread::block_hash) <= 32760); + // ensure(OFFSET_OF(spu_thread, pc) <= 32760); + // ensure(OFFSET_OF(spu_thread, block_hash) <= 32760); // Load PC - c.ldr(a64::w1, arm::Mem(a64::x19, ::offset32(&spu_thread::pc))); // REG_Base + offset(spu_thread::pc) + c.ldr(a64::w1, arm::Mem(a64::x19, OFFSET_OF(spu_thread, pc))); // REG_Base + offset(spu_thread::pc) // Compute LS address = REG_Sp + PC, store into x7 (use later) c.add(a64::x7, a64::x20, a64::x1); // Load 32b from LS address @@ -274,7 +274,7 @@ DECLARE(spu_runtime::tr_all) = [] c.mov(a64::x4, Imm(reinterpret_cast(g_dispatcher))); // Update block hash c.mov(a64::x5, Imm(0)); - c.str(a64::x5, arm::Mem(a64::x19, ::offset32(&spu_thread::block_hash))); // REG_Base + offset(spu_thread::block_hash) + c.str(a64::x5, arm::Mem(a64::x19, OFFSET_OF(spu_thread, block_hash))); // REG_Base + offset(spu_thread::block_hash) // Jump to [g_dispatcher + idx * 8] c.mov(a64::x6, Imm(8)); c.mul(a64::x6, a64::x3, a64::x6); @@ -327,7 +327,7 @@ DECLARE(spu_runtime::g_gateway) = build_function_asm("spu_gatewa #endif // Save native stack pointer for longjmp emulation - c.mov(x86::qword_ptr(args[0], ::offset32(&spu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs)), x86::rsp); + c.mov(x86::qword_ptr(args[0], OFFSET_OF(spu_thread, hv_ctx.regs)), x86::rsp); // Move 4 args (despite spu_function_t def) c.mov(x86::r13, args[0]); @@ -381,7 +381,7 @@ DECLARE(spu_runtime::g_gateway) = build_function_asm("spu_gatewa #elif defined(ARCH_ARM64) // Save non-volatile regs. We do this within the thread context instead of normal stack - const u32 hv_regs_base = ::offset32(&spu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs); + const u32 hv_regs_base = OFFSET_OF(spu_thread, hv_ctx.regs); // NOTE: A64 gp-gp-imm add only takes immediates of upto 4095. 
Larger numbers can work, but need to be multiples of 2 for lowering to replace the instruction correctly // Unfortunately asmjit fails silently on these patterns which can generate incorrect code c.mov(a64::x15, args[0]); @@ -447,14 +447,14 @@ DECLARE(spu_runtime::g_escape) = build_function_asm("spu_ #if defined(ARCH_X64) // Restore native stack pointer (longjmp emulation) - c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&spu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs))); + c.mov(x86::rsp, x86::qword_ptr(args[0], OFFSET_OF(spu_thread, hv_ctx.regs))); // Return to the return location c.sub(x86::rsp, 8); c.ret(); #elif defined(ARCH_ARM64) // Far ret, jumps to gateway epilogue - const u32 reg_base = ::offset32(&spu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs); + const u32 reg_base = OFFSET_OF(spu_thread, hv_ctx.regs); c.mov(a64::x19, args[0]); c.mov(a64::x15, Imm(reg_base)); c.add(a64::x15, a64::x15, args[0]); @@ -471,28 +471,28 @@ DECLARE(spu_runtime::g_tail_escape) = build_function_asm(::offset32(&spu_thread::block_hash)); + *raw++ = ::narrow(OFFSET_OF(spu_thread, block_hash)); // Load PC: mov eax, [r13 + spu_thread::pc] *raw++ = 0x41; *raw++ = 0x8b; *raw++ = 0x45; - *raw++ = ::narrow(::offset32(&spu_thread::pc)); + *raw++ = ::narrow(OFFSET_OF(spu_thread, pc)); // Get LS address starting from PC: lea rcx, [rbp + rax] *raw++ = 0x48; @@ -7824,18 +7824,18 @@ struct spu_fast : public spu_recompiler_base *raw++ = 0x48; *raw++ = 0x8d; *raw++ = 0x7d; - *raw++ = ::narrow(::offset32(&spu_thread::gpr)); + *raw++ = ::narrow(OFFSET_OF(spu_thread, gpr)); // Save base pc: mov [rbp + spu_thread::base_pc], eax *raw++ = 0x89; *raw++ = 0x45; - *raw++ = ::narrow(::offset32(&spu_thread::base_pc)); + *raw++ = ::narrow(OFFSET_OF(spu_thread, base_pc)); // inc block_counter *raw++ = 0x48; *raw++ = 0xff; *raw++ = 0x85; - const u32 blc_off = ::offset32(&spu_thread::block_counter); + const u32 blc_off = OFFSET_OF(spu_thread, block_counter); std::memcpy(raw, &blc_off, 4); raw += 4; @@ -7858,7 +7858,7 @@ struct spu_fast : public spu_recompiler_base *raw++ = 0x44; *raw++ = 0x89; *raw++ = 0x65; - *raw++ = ::narrow(::offset32(&spu_thread::pc)); + *raw++ = ::narrow(OFFSET_OF(spu_thread, pc)); // Epilogue: add rsp,0x28 *raw++ = 0x48; @@ -7890,7 +7890,7 @@ struct spu_fast : public spu_recompiler_base *raw++ = type == spu_itype::BRHZ || type == spu_itype::BRHNZ ? 
0x66 : 0x90; *raw++ = 0x83; *raw++ = 0xbd; - const u32 off = ::offset32(&spu_thread::gpr, op.rt) + 12; + const u32 off = OFFSET_OF(spu_thread, gpr[op.rt]) + 12; std::memcpy(raw, &off, 4); raw += 4; *raw++ = 0x00; @@ -7957,7 +7957,7 @@ struct spu_fast : public spu_recompiler_base // sub eax, [rbp + spu_thread::base_pc] *raw++ = 0x2b; *raw++ = 0x45; - *raw++ = ::narrow(::offset32(&spu_thread::base_pc)); + *raw++ = ::narrow(OFFSET_OF(spu_thread, base_pc)); // cmp eax, (0 - size) *raw++ = 0x3d; @@ -7992,7 +7992,7 @@ struct spu_fast : public spu_recompiler_base *raw++ = 0x44; *raw++ = 0x89; *raw++ = 0x65; - *raw++ = ::narrow(::offset32(&spu_thread::pc)); + *raw++ = ::narrow(OFFSET_OF(spu_thread, pc)); // Epilogue: add rsp,0x28 ; ret *raw++ = 0x48; diff --git a/rpcs3/Emu/Cell/SPUInterpreter.cpp b/rpcs3/Emu/Cell/SPUInterpreter.cpp index 537196b88..b1e76b01b 100644 --- a/rpcs3/Emu/Cell/SPUInterpreter.cpp +++ b/rpcs3/Emu/Cell/SPUInterpreter.cpp @@ -99,7 +99,7 @@ namespace asmjit c.shl(x86::eax, I + 4); } - const auto ptr = x86::oword_ptr(spu, x86::rax, 0, ::offset32(&spu_thread::gpr)); + const auto ptr = x86::oword_ptr(spu, x86::rax, 0, OFFSET_OF(spu_thread, gpr)); if (utils::has_avx()) { diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index eb92feb68..dfd1243f3 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -329,9 +329,9 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (!m_finfo->fn && !m_block) { - lr = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::gpr, +s_reg_lr, &v128::_u32, 3)); - sp = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::gpr, +s_reg_sp)); - r3 = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::gpr, 3)); + lr = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, gpr[+s_reg_lr]._u32[3]))); + sp = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, gpr[+s_reg_sp]))); + r3 = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, gpr[3]))); } else { @@ -348,8 +348,8 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (!m_finfo->fn) { lr = m_ir->CreateAnd(lr, 0x3fffc); - m_ir->CreateStore(lr, spu_ptr(&spu_thread::pc)); - m_ir->CreateStore(_call, spu_ptr(&spu_thread::gpr, 3)); + m_ir->CreateStore(lr, spu_ptr(OFFSET_OF(spu_thread, pc))); + m_ir->CreateStore(_call, spu_ptr(OFFSET_OF(spu_thread, gpr[3]))); m_ir->CreateBr(add_block_indirect({}, value(lr))); } else if (tail) @@ -392,7 +392,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator m_blocks.clear(); m_block_queue.clear(); m_ir->SetInsertPoint(llvm::BasicBlock::Create(m_context, "", m_function)); - m_memptr = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::memory_base_addr)); + m_memptr = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, memory_base_addr))); } // Add block with current block as a predecessor @@ -415,7 +415,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator m_lsptr = fn->getArg(1); m_base_pc = fn->getArg(2); m_ir->SetInsertPoint(llvm::BasicBlock::Create(m_context, "", fn)); - m_memptr = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::memory_base_addr)); + m_memptr = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, memory_base_addr))); // Load registers at the entry chunk for (u32 i = 0; i < s_reg_max; i++) @@ -452,7 +452,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator const auto fail = llvm::BasicBlock::Create(m_context, 
"", m_function); m_ir->CreateCondBr(m_ir->CreateICmpEQ(m_base_pc, m_ir->getInt32(m_base)), next, fail); m_ir->SetInsertPoint(fail); - m_ir->CreateStore(m_ir->getInt32(target), spu_ptr(&spu_thread::pc)); + m_ir->CreateStore(m_ir->getInt32(target), spu_ptr(OFFSET_OF(spu_thread, pc))); tail_chunk(nullptr); m_ir->SetInsertPoint(next); } @@ -490,7 +490,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator { ensure(!m_finfo->fn); - m_ir->CreateStore(m_ir->getInt32(target), spu_ptr(&spu_thread::pc)); + m_ir->CreateStore(m_ir->getInt32(target), spu_ptr(OFFSET_OF(spu_thread, pc))); } else { @@ -539,16 +539,16 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator return m_ir->CreateGEP(get_type(), base, offset); } - template - llvm::Value* spu_ptr(Args... offset_args) + template + llvm::Value* spu_ptr(std::uint32_t offset) { - return _ptr(m_thread, ::offset32(offset_args...)); + return _ptr(m_thread, offset); } - template - llvm::Value* spu_ptr(value_t add, Args... offset_args) + template + llvm::Value* spu_ptr(value_t add, std::uint32_t offset) { - const auto off = m_ir->CreateGEP(get_type(), m_thread, m_ir->getInt64(::offset32(offset_args...))); + const auto off = m_ir->CreateGEP(get_type(), m_thread, m_ir->getInt64(offset)); return m_ir->CreateAdd(off, add.value); } @@ -578,15 +578,15 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator { if (index < 128) { - return ::offset32(&spu_thread::gpr, index); + return OFFSET_OF(spu_thread, gpr[index]); } switch (index) { - case s_reg_mfc_eal: return ::offset32(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::eal); - case s_reg_mfc_lsa: return ::offset32(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::lsa); - case s_reg_mfc_tag: return ::offset32(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::tag); - case s_reg_mfc_size: return ::offset32(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::size); + case s_reg_mfc_eal: return OFFSET_OF(spu_thread, ch_mfc_cmd.eal); + case s_reg_mfc_lsa: return OFFSET_OF(spu_thread, ch_mfc_cmd.lsa); + case s_reg_mfc_tag: return OFFSET_OF(spu_thread, ch_mfc_cmd.tag); + case s_reg_mfc_size: return OFFSET_OF(spu_thread, ch_mfc_cmd.size); default: fmt::throw_exception("get_reg_offset(%u): invalid register index", index); } @@ -1049,13 +1049,13 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator // Update PC for current or explicitly specified instruction address void update_pc(u32 target = -1) { - m_ir->CreateStore(m_ir->CreateAnd(get_pc(target + 1 ? target : m_pos), 0x3fffc), spu_ptr(&spu_thread::pc))->setVolatile(true); + m_ir->CreateStore(m_ir->CreateAnd(get_pc(target + 1 ? 
target : m_pos), 0x3fffc), spu_ptr(OFFSET_OF(spu_thread, pc)))->setVolatile(true); } // Call cpu_thread::check_state if necessary and return or continue (full check) void check_state(u32 addr, bool may_be_unsafe_for_savestate = true) { - const auto pstate = spu_ptr(&spu_thread::state); + const auto pstate = spu_ptr(OFFSET_OF(spu_thread, state)); const auto _body = llvm::BasicBlock::Create(m_context, "", m_function); const auto check = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->CreateCondBr(m_ir->CreateICmpEQ(m_ir->CreateLoad(get_type(), pstate, true), m_ir->getInt32(0)), _body, check, m_md_likely); @@ -1069,14 +1069,14 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (may_be_unsafe_for_savestate) { - m_ir->CreateStore(m_ir->getInt8(1), spu_ptr(&spu_thread::unsavable))->setVolatile(true); + m_ir->CreateStore(m_ir->getInt8(1), spu_ptr(OFFSET_OF(spu_thread, unsavable)))->setVolatile(true); } m_ir->CreateCall(m_test_state, {m_thread}); if (may_be_unsafe_for_savestate) { - m_ir->CreateStore(m_ir->getInt8(0), spu_ptr(&spu_thread::unsavable))->setVolatile(true); + m_ir->CreateStore(m_ir->getInt8(0), spu_ptr(OFFSET_OF(spu_thread, unsavable)))->setVolatile(true); } m_ir->CreateBr(_body); @@ -1145,7 +1145,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator const auto _final = llvm::BasicBlock::Create(m_context, "__putllc16_final", m_function); const auto _eal = (get_reg_fixed(s_reg_mfc_eal) & -128).eval(m_ir); - const auto _raddr = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::raddr)); + const auto _raddr = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, raddr))); m_ir->CreateCondBr(m_ir->CreateAnd(m_ir->CreateICmpEQ(_eal, _raddr), m_ir->CreateIsNotNull(_raddr)), _raddr_match, _fail, m_md_likely); m_ir->SetInsertPoint(_raddr_match); @@ -1259,7 +1259,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator m_ir->SetInsertPoint(_fail); call("PUTLLC16_fail", +on_fail, m_thread, _eal); - m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_FAILURE), spu_ptr(&spu_thread::ch_atomic_stat)); + m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_FAILURE), spu_ptr(OFFSET_OF(spu_thread, ch_atomic_stat))); m_ir->CreateBr(_final); m_ir->SetInsertPoint(_final); @@ -1269,7 +1269,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator const auto diff = m_ir->CreateZExt(m_ir->CreateSub(dest, _lsa), get_type()); const auto _new = m_ir->CreateAlignedLoad(get_type(), _ptr(m_lsptr, dest), llvm::MaybeAlign{16}); - const auto _rdata = m_ir->CreateAlignedLoad(get_type(), _ptr(spu_ptr(&spu_thread::rdata), m_ir->CreateAnd(diff, 0x70)), llvm::MaybeAlign{16}); + const auto _rdata = m_ir->CreateAlignedLoad(get_type(), _ptr(spu_ptr(OFFSET_OF(spu_thread, rdata)), m_ir->CreateAnd(diff, 0x70)), llvm::MaybeAlign{16}); const bool is_accurate_op = !!g_cfg.core.spu_accurate_reservations; @@ -1289,8 +1289,8 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator // Touch memory (on the opposite side of the page) m_ir->CreateAtomicRMW(llvm::AtomicRMWInst::Or, _ptr(m_memptr, m_ir->CreateXor(_eal, 4096 / 2)), m_ir->getInt8(0), llvm::MaybeAlign{16}, llvm::AtomicOrdering::SequentiallyConsistent); - const auto rptr = _ptr(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::reserv_base_addr)), ((eal_val & 0xff80) >> 1).eval(m_ir)); - const auto rtime = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::rtime)); + const auto rptr = 
_ptr(m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, reserv_base_addr))), ((eal_val & 0xff80) >> 1).eval(m_ir)); + const auto rtime = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, rtime))); m_ir->CreateBr(_repeat_lock); m_ir->SetInsertPoint(_repeat_lock); @@ -1313,7 +1313,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator m_ir->SetInsertPoint(_lock_success); // Commit 16 bytes compare-exchange - const auto sudo_ptr = _ptr(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::memory_sudo_addr)), _eal); + const auto sudo_ptr = _ptr(m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, memory_sudo_addr))), _eal); m_ir->CreateCondBr( m_ir->CreateExtractValue(m_ir->CreateAtomicCmpXchg(_ptr(sudo_ptr, diff), _rdata, _new, llvm::MaybeAlign{16}, llvm::AtomicOrdering::SequentiallyConsistent, llvm::AtomicOrdering::SequentiallyConsistent), 1), _success_and_unlock, _fail_and_unlock); @@ -1331,13 +1331,13 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator // Perform unlocked vm::reservation_update if no physical memory changes needed m_ir->SetInsertPoint(_inc_res); - const auto rptr2 = _ptr(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::reserv_base_addr)), ((eal_val & 0xff80) >> 1).eval(m_ir)); + const auto rptr2 = _ptr(m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, reserv_base_addr))), ((eal_val & 0xff80) >> 1).eval(m_ir)); llvm::Value* old_val{}; if (true || is_accurate_op) { - old_val = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::rtime)); + old_val = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, rtime))); } else { @@ -1358,8 +1358,8 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator } m_ir->SetInsertPoint(_success); - m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_SUCCESS), spu_ptr(&spu_thread::ch_atomic_stat)); - m_ir->CreateStore(m_ir->getInt32(0), spu_ptr(&spu_thread::raddr)); + m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_SUCCESS), spu_ptr(OFFSET_OF(spu_thread, ch_atomic_stat))); + m_ir->CreateStore(m_ir->getInt32(0), spu_ptr(OFFSET_OF(spu_thread, raddr))); m_ir->CreateBr(_final); m_ir->SetInsertPoint(_fail_and_unlock); @@ -1368,7 +1368,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator m_ir->SetInsertPoint(_fail); call("PUTLLC16_fail", +on_fail, m_thread, _eal); - m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_FAILURE), spu_ptr(&spu_thread::ch_atomic_stat)); + m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_FAILURE), spu_ptr(OFFSET_OF(spu_thread, ch_atomic_stat))); m_ir->CreateBr(_final); m_ir->SetInsertPoint(_final); @@ -1408,7 +1408,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator const auto _final = llvm::BasicBlock::Create(m_context, "", m_function); const auto _eal = (get_reg_fixed(s_reg_mfc_eal) & -128).eval(m_ir); - const auto _raddr = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::raddr)); + const auto _raddr = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, raddr))); m_ir->CreateCondBr(m_ir->CreateAnd(m_ir->CreateICmpEQ(_eal, _raddr), m_ir->CreateIsNotNull(_raddr)), _next, _fail, m_md_likely); m_ir->SetInsertPoint(_next); @@ -1416,23 +1416,23 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator value_t eal_val; eal_val.value = _eal; - const auto rptr = _ptr(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::reserv_base_addr)), ((eal_val & 0xff80) 
>> 1).eval(m_ir)); - const auto rval = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::rtime)); + const auto rptr = _ptr(m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, reserv_base_addr))), ((eal_val & 0xff80) >> 1).eval(m_ir)); + const auto rval = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, rtime))); m_ir->CreateCondBr( m_ir->CreateExtractValue(m_ir->CreateAtomicCmpXchg(rptr, rval, m_ir->CreateAdd(rval, m_ir->getInt64(128)), llvm::MaybeAlign{16}, llvm::AtomicOrdering::SequentiallyConsistent, llvm::AtomicOrdering::SequentiallyConsistent), 1), _next0, g_cfg.core.spu_accurate_reservations ? _fail : _next0); // Succeed unconditionally m_ir->SetInsertPoint(_next0); // call("atomic_wait_engine::notify_all", static_cast(atomic_wait_engine::notify_all), rptr); - m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_SUCCESS), spu_ptr(&spu_thread::ch_atomic_stat)); + m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_SUCCESS), spu_ptr(OFFSET_OF(spu_thread, ch_atomic_stat))); m_ir->CreateBr(_final); m_ir->SetInsertPoint(_fail); call("PUTLLC0_fail", +on_fail, m_thread, _eal); - m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_FAILURE), spu_ptr(&spu_thread::ch_atomic_stat)); + m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_FAILURE), spu_ptr(OFFSET_OF(spu_thread, ch_atomic_stat))); m_ir->CreateBr(_final); m_ir->SetInsertPoint(_final); - m_ir->CreateStore(m_ir->getInt32(0), spu_ptr(&spu_thread::raddr)); + m_ir->CreateStore(m_ir->getInt32(0), spu_ptr(OFFSET_OF(spu_thread, raddr))); } public: @@ -1470,7 +1470,7 @@ public: { .debug_info = false, // Set to "true" to insert debug frames on x27 .use_stack_frames = false, // We don't need this since the SPU GW allocates global scratch on the stack - .hypervisor_context_offset = ::offset32(&spu_thread::hv_ctx), + .hypervisor_context_offset = OFFSET_OF(spu_thread, hv_ctx), .exclusion_callback = should_exclude_function, .base_register_lookup = {} // Unused, always x19 on SPU }; @@ -1618,10 +1618,10 @@ public: const auto label_stop = BasicBlock::Create(m_context, "", m_function); // Load PC, which will be the actual value of 'm_base' - m_base_pc = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::pc)); + m_base_pc = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, pc))); // Emit state check - const auto pstate = spu_ptr(&spu_thread::state); + const auto pstate = spu_ptr(OFFSET_OF(spu_thread, state)); m_ir->CreateCondBr(m_ir->CreateICmpNE(m_ir->CreateLoad(get_type(), pstate), m_ir->getInt32(0)), label_stop, label_test, m_md_unlikely); // Emit code check @@ -1630,7 +1630,7 @@ public: // Set block hash for profiling (if enabled) if (g_cfg.core.spu_prof && g_cfg.core.spu_verification) - m_ir->CreateStore(m_ir->getInt64((m_hash_start & -65536)), spu_ptr(&spu_thread::block_hash)); + m_ir->CreateStore(m_ir->getInt64((m_hash_start & -65536)), spu_ptr(OFFSET_OF(spu_thread, block_hash))); if (!g_cfg.core.spu_verification) { @@ -1893,7 +1893,7 @@ public: // Increase block counter with statistics m_ir->SetInsertPoint(label_body); - const auto pbcount = spu_ptr(&spu_thread::block_counter); + const auto pbcount = spu_ptr(OFFSET_OF(spu_thread, block_counter)); m_ir->CreateStore(m_ir->CreateAdd(m_ir->CreateLoad(get_type(), pbcount), m_ir->getInt64(check_iterations)), pbcount); // Call the entry function chunk @@ -1927,7 +1927,7 @@ public: if (g_cfg.core.spu_verification) { - const auto pbfail = spu_ptr(&spu_thread::block_failure); + const auto pbfail = 
spu_ptr(OFFSET_OF(spu_thread, block_failure)); m_ir->CreateStore(m_ir->CreateAdd(m_ir->CreateLoad(get_type(), pbfail), m_ir->getInt64(1)), pbfail); const auto dispci = call("spu_dispatch", spu_runtime::tr_dispatch, m_thread, m_lsptr, main_arg2); dispci->setCallingConv(CallingConv::GHC); @@ -1987,7 +1987,7 @@ public: // Set block hash for profiling (if enabled) if (g_cfg.core.spu_prof) - m_ir->CreateStore(m_ir->getInt64((m_hash_start & -65536) | (m_entry >> 2)), spu_ptr(&spu_thread::block_hash)); + m_ir->CreateStore(m_ir->getInt64((m_hash_start & -65536) | (m_entry >> 2)), spu_ptr(OFFSET_OF(spu_thread, block_hash))); m_finfo = &m_functions[m_entry]; m_ir->CreateBr(add_block(m_entry)); @@ -2918,7 +2918,7 @@ public: set_function(main_func); // Load pc and opcode - m_interp_pc = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::pc)); + m_interp_pc = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, pc))); m_interp_op = m_ir->CreateLoad(get_type(), m_ir->CreateGEP(get_type(), m_lsptr, m_ir->CreateZExt(m_interp_pc, get_type()))); m_interp_op = m_ir->CreateCall(get_intrinsic(Intrinsic::bswap), {m_interp_op}); @@ -2932,7 +2932,7 @@ public: m_interp_regs = _ptr(m_thread, get_reg_offset(0)); // Save host thread's stack pointer - const auto native_sp = spu_ptr(&spu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs); + const auto native_sp = spu_ptr(OFFSET_OF(spu_thread, hv_ctx.regs)); #if defined(ARCH_X64) const auto rsp_name = MetadataAsValue::get(m_context, MDNode::get(m_context, {MDString::get(m_context, "rsp")})); #elif defined(ARCH_ARM64) @@ -3018,7 +3018,7 @@ public: m_interp_regs = f->getArg(6); m_ir->SetInsertPoint(BasicBlock::Create(m_context, "", f)); - m_memptr = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::memory_base_addr)); + m_memptr = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, memory_base_addr))); switch (itype) { @@ -3034,7 +3034,7 @@ public: case spu_itype::WRCH: { // Invalid or abortable instruction. Save current address. - m_ir->CreateStore(m_interp_pc, spu_ptr(&spu_thread::pc)); + m_ir->CreateStore(m_interp_pc, spu_ptr(OFFSET_OF(spu_thread, pc))); [[fallthrough]]; } default: @@ -3078,7 +3078,7 @@ public: { if (check) { - m_ir->CreateStore(m_interp_pc, spu_ptr(&spu_thread::pc)); + m_ir->CreateStore(m_interp_pc, spu_ptr(OFFSET_OF(spu_thread, pc))); } // Decode next instruction. @@ -3115,9 +3115,9 @@ public: { const auto _stop = BasicBlock::Create(m_context, "", f); const auto _next = BasicBlock::Create(m_context, "", f); - m_ir->CreateCondBr(m_ir->CreateIsNotNull(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::state))), _stop, _next, m_md_unlikely); + m_ir->CreateCondBr(m_ir->CreateIsNotNull(m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, state)))), _stop, _next, m_md_unlikely); m_ir->SetInsertPoint(_stop); - m_ir->CreateStore(m_interp_pc, spu_ptr(&spu_thread::pc)); + m_ir->CreateStore(m_interp_pc, spu_ptr(OFFSET_OF(spu_thread, pc))); const auto escape_yes = BasicBlock::Create(m_context, "", f); const auto escape_no = BasicBlock::Create(m_context, "", f); @@ -3171,7 +3171,7 @@ public: // Call next instruction. 
const auto _stop = BasicBlock::Create(m_context, "", f); const auto _next = BasicBlock::Create(m_context, "", f); - m_ir->CreateCondBr(m_ir->CreateIsNotNull(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::state))), _stop, _next, m_md_unlikely); + m_ir->CreateCondBr(m_ir->CreateIsNotNull(m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, state)))), _stop, _next, m_md_unlikely); m_ir->SetInsertPoint(_next); if (itype == spu_itype::WRCH || @@ -3189,7 +3189,7 @@ public: ncall->setTailCall(); m_ir->CreateRetVoid(); m_ir->SetInsertPoint(_stop); - m_ir->CreateStore(m_interp_pc, spu_ptr(&spu_thread::pc)); + m_ir->CreateStore(m_interp_pc, spu_ptr(OFFSET_OF(spu_thread, pc))); call("spu_escape", spu_runtime::g_escape, m_thread)->setTailCall(); m_ir->CreateRetVoid(); } @@ -3314,7 +3314,7 @@ public: { if (m_interp_magn) { - m_ir->CreateStore(m_interp_pc, spu_ptr(&spu_thread::pc)); + m_ir->CreateStore(m_interp_pc, spu_ptr(OFFSET_OF(spu_thread, pc))); call("spu_unknown", &exec_unk, m_thread, m_ir->getInt32(op_unk.opcode)); return; } @@ -3469,7 +3469,7 @@ public: { case SPU_RdSRR0: { - res.value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::srr0)); + res.value = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, srr0))); break; } case SPU_RdInMbox: @@ -3481,36 +3481,36 @@ public: } case MFC_RdTagStat: { - res.value = get_rdch(op, ::offset32(&spu_thread::ch_tag_stat), false); + res.value = get_rdch(op, OFFSET_OF(spu_thread, ch_tag_stat), false); break; } case MFC_RdTagMask: { - res.value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_tag_mask)); + res.value = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, ch_tag_mask))); break; } case SPU_RdSigNotify1: { update_pc(); ensure_gpr_stores(); - res.value = get_rdch(op, ::offset32(&spu_thread::ch_snr1), true); + res.value = get_rdch(op, OFFSET_OF(spu_thread, ch_snr1), true); break; } case SPU_RdSigNotify2: { update_pc(); ensure_gpr_stores(); - res.value = get_rdch(op, ::offset32(&spu_thread::ch_snr2), true); + res.value = get_rdch(op, OFFSET_OF(spu_thread, ch_snr2), true); break; } case MFC_RdAtomicStat: { - res.value = get_rdch(op, ::offset32(&spu_thread::ch_atomic_stat), false); + res.value = get_rdch(op, OFFSET_OF(spu_thread, ch_atomic_stat), false); break; } case MFC_RdListStallStat: { - res.value = get_rdch(op, ::offset32(&spu_thread::ch_stall_stat), false); + res.value = get_rdch(op, OFFSET_OF(spu_thread, ch_stall_stat), false); break; } case SPU_RdDec: @@ -3519,13 +3519,13 @@ public: if (utils::get_tsc_freq() && !(g_cfg.core.spu_loop_detection) && (g_cfg.core.clocks_scale == 100)) { const auto timebase_offs = m_ir->CreateLoad(get_type(), m_ir->CreateIntToPtr(m_ir->getInt64(reinterpret_cast(&g_timebase_offs)), get_type())); - const auto timestamp = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_dec_start_timestamp)); - const auto dec_value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_dec_value)); + const auto timestamp = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, ch_dec_start_timestamp))); + const auto dec_value = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, ch_dec_value))); const auto tsc = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::x86_rdtsc)); const auto tscx = m_ir->CreateMul(m_ir->CreateUDiv(tsc, m_ir->getInt64(utils::get_tsc_freq())), m_ir->getInt64(80000000)); const auto tscm = m_ir->CreateUDiv(m_ir->CreateMul(m_ir->CreateURem(tsc, m_ir->getInt64(utils::get_tsc_freq())), m_ir->getInt64(80000000)), m_ir->getInt64(utils::get_tsc_freq())); const auto tsctb = 
m_ir->CreateSub(m_ir->CreateAdd(tscx, tscm), timebase_offs); - const auto frz = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::is_dec_frozen)); + const auto frz = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, is_dec_frozen))); const auto frzev = m_ir->CreateICmpEQ(frz, m_ir->getInt8(0)); const auto delta = m_ir->CreateTrunc(m_ir->CreateSub(tsctb, timestamp), get_type()); @@ -3539,7 +3539,7 @@ public: } case SPU_RdEventMask: { - const auto value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_events)); + const auto value = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, ch_events))); value->setAtomic(llvm::AtomicOrdering::Acquire); res.value = m_ir->CreateTrunc(m_ir->CreateLShr(value, 32), get_type()); break; @@ -3554,22 +3554,22 @@ public: } else { - m_ir->CreateStore(m_ir->getInt8(1), spu_ptr(&spu_thread::unsavable)); + m_ir->CreateStore(m_ir->getInt8(1), spu_ptr(OFFSET_OF(spu_thread, unsavable))); } res.value = call("spu_read_events", &exec_read_events, m_thread); if (!g_cfg.savestate.compatible_mode) { - m_ir->CreateStore(m_ir->getInt8(0), spu_ptr(&spu_thread::unsavable)); + m_ir->CreateStore(m_ir->getInt8(0), spu_ptr(OFFSET_OF(spu_thread, unsavable))); } break; } case SPU_RdMachStat: { - res.value = m_ir->CreateZExt(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::interrupts_enabled)), get_type()); - res.value = m_ir->CreateOr(res.value, m_ir->CreateAnd(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::thread_type)), m_ir->getInt32(2))); + res.value = m_ir->CreateZExt(m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, interrupts_enabled))), get_type()); + res.value = m_ir->CreateOr(res.value, m_ir->CreateAnd(m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, thread_type))), m_ir->getInt32(2))); break; } @@ -3673,22 +3673,22 @@ public: { case SPU_WrOutMbox: { - res.value = wait_rchcnt(::offset32(&spu_thread::ch_out_mbox), true); + res.value = wait_rchcnt(OFFSET_OF(spu_thread, ch_out_mbox), true); break; } case SPU_WrOutIntrMbox: { - res.value = wait_rchcnt(::offset32(&spu_thread::ch_out_intr_mbox), true); + res.value = wait_rchcnt(OFFSET_OF(spu_thread, ch_out_intr_mbox), true); break; } case SPU_RdSigNotify1: { - res.value = wait_rchcnt(::offset32(&spu_thread::ch_snr1)); + res.value = wait_rchcnt(OFFSET_OF(spu_thread, ch_snr1)); break; } case SPU_RdSigNotify2: { - res.value = wait_rchcnt(::offset32(&spu_thread::ch_snr2)); + res.value = wait_rchcnt(OFFSET_OF(spu_thread, ch_snr2)); break; } case SPU_RdInMbox: @@ -3698,7 +3698,7 @@ public: return ch->pop_wait(*_spu, false), ch->get_count(); }; - res.value = call("wait_spu_inbox", +wait_inbox, m_thread, spu_ptr(&spu_thread::ch_in_mbox)); + res.value = call("wait_spu_inbox", +wait_inbox, m_thread, spu_ptr(OFFSET_OF(spu_thread, ch_in_mbox))); break; } default: break; @@ -3715,37 +3715,37 @@ public: { case SPU_WrOutMbox: { - res.value = get_rchcnt(::offset32(&spu_thread::ch_out_mbox), true); + res.value = get_rchcnt(OFFSET_OF(spu_thread, ch_out_mbox), true); break; } case SPU_WrOutIntrMbox: { - res.value = get_rchcnt(::offset32(&spu_thread::ch_out_intr_mbox), true); + res.value = get_rchcnt(OFFSET_OF(spu_thread, ch_out_intr_mbox), true); break; } case MFC_RdTagStat: { - res.value = get_rchcnt(::offset32(&spu_thread::ch_tag_stat)); + res.value = get_rchcnt(OFFSET_OF(spu_thread, ch_tag_stat)); break; } case MFC_RdListStallStat: { - res.value = get_rchcnt(::offset32(&spu_thread::ch_stall_stat)); + res.value = get_rchcnt(OFFSET_OF(spu_thread, ch_stall_stat)); break; } case SPU_RdSigNotify1: { - 
res.value = get_rchcnt(::offset32(&spu_thread::ch_snr1)); + res.value = get_rchcnt(OFFSET_OF(spu_thread, ch_snr1)); break; } case SPU_RdSigNotify2: { - res.value = get_rchcnt(::offset32(&spu_thread::ch_snr2)); + res.value = get_rchcnt(OFFSET_OF(spu_thread, ch_snr2)); break; } case MFC_RdAtomicStat: { - res.value = get_rchcnt(::offset32(&spu_thread::ch_atomic_stat)); + res.value = get_rchcnt(OFFSET_OF(spu_thread, ch_atomic_stat)); break; } case MFC_WrTagUpdate: @@ -3755,13 +3755,13 @@ public: } case MFC_Cmd: { - res.value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::mfc_size)); + res.value = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, mfc_size))); res.value = m_ir->CreateSub(m_ir->getInt32(16), res.value); break; } case SPU_RdInMbox: { - const auto value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_in_mbox)); + const auto value = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, ch_in_mbox))); value->setAtomic(llvm::AtomicOrdering::Acquire); res.value = value; res.value = m_ir->CreateLShr(res.value, 8); @@ -3770,7 +3770,7 @@ public: } case SPU_RdEventStat: { - const auto mask = m_ir->CreateTrunc(m_ir->CreateLShr(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_events)), 32), get_type()); + const auto mask = m_ir->CreateTrunc(m_ir->CreateLShr(m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, ch_events))), 32), get_type()); res.value = call("spu_get_events", &exec_get_events, m_thread, mask); break; } @@ -3868,7 +3868,7 @@ public: { case SPU_WrSRR0: { - m_ir->CreateStore(eval(val & 0x3fffc).value, spu_ptr(&spu_thread::srr0)); + m_ir->CreateStore(eval(val & 0x3fffc).value, spu_ptr(OFFSET_OF(spu_thread, srr0))); return; } case SPU_WrOutIntrMbox: @@ -3884,10 +3884,10 @@ public: case MFC_WrTagMask: { // TODO - m_ir->CreateStore(val.value, spu_ptr(&spu_thread::ch_tag_mask)); + m_ir->CreateStore(val.value, spu_ptr(OFFSET_OF(spu_thread, ch_tag_mask))); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); const auto _mfc = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(m_ir->CreateICmpNE(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_tag_upd)), m_ir->getInt32(MFC_TAG_UPDATE_IMMEDIATE)), _mfc, next); + m_ir->CreateCondBr(m_ir->CreateICmpNE(m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, ch_tag_upd))), m_ir->getInt32(MFC_TAG_UPDATE_IMMEDIATE)), _mfc, next); m_ir->SetInsertPoint(_mfc); update_pc(); call("spu_write_channel", &exec_wrch, m_thread, m_ir->getInt32(op.ra), val.value); @@ -3899,11 +3899,11 @@ public: { if (true) { - const auto tag_mask = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_tag_mask)); - const auto mfc_fence = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::mfc_fence)); + const auto tag_mask = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, ch_tag_mask))); + const auto mfc_fence = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, mfc_fence))); const auto completed = m_ir->CreateAnd(tag_mask, m_ir->CreateNot(mfc_fence)); - const auto upd_ptr = spu_ptr(&spu_thread::ch_tag_upd); - const auto stat_ptr = spu_ptr(&spu_thread::ch_tag_stat); + const auto upd_ptr = spu_ptr(OFFSET_OF(spu_thread, ch_tag_upd)); + const auto stat_ptr = spu_ptr(OFFSET_OF(spu_thread, ch_tag_stat)); const auto stat_val = m_ir->CreateOr(m_ir->CreateZExt(completed, get_type()), s64{smin}); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); @@ -3955,7 +3955,7 @@ public: } spu_log.warning("[0x%x] MFC_EAH: $%u is not a zero constant", m_pos, +op.rt); - // 
m_ir->CreateStore(val.value, spu_ptr(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::eah)); + // m_ir->CreateStore(val.value, spu_ptr(OFFSET_OF(spu_thread, ch_mfc_cmd.eah))); return; } case MFC_EAL: @@ -4009,8 +4009,8 @@ public: const auto fail = llvm::BasicBlock::Create(m_context, "", m_function); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto pf = spu_ptr(&spu_thread::mfc_fence); - const auto pb = spu_ptr(&spu_thread::mfc_barrier); + const auto pf = spu_ptr(OFFSET_OF(spu_thread, mfc_fence)); + const auto pb = spu_ptr(OFFSET_OF(spu_thread, mfc_barrier)); switch (u64 cmd = ci->getZExtValue()) { @@ -4035,7 +4035,7 @@ public: m_ir->SetInsertPoint(fail); m_ir->CreateUnreachable(); m_ir->SetInsertPoint(next); - m_ir->CreateStore(ci, spu_ptr(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::cmd)); + m_ir->CreateStore(ci, spu_ptr(OFFSET_OF(spu_thread, ch_mfc_cmd.cmd))); update_pc(); ensure_gpr_stores(); call("spu_exec_mfc_cmd_saveable", &exec_mfc_cmd, m_thread); @@ -4054,7 +4054,7 @@ public: m_ir->SetInsertPoint(fail); m_ir->CreateUnreachable(); m_ir->SetInsertPoint(next); - m_ir->CreateStore(ci, spu_ptr(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::cmd)); + m_ir->CreateStore(ci, spu_ptr(OFFSET_OF(spu_thread, ch_mfc_cmd.cmd))); update_pc(); call("spu_exec_mfc_cmd", &exec_mfc_cmd, m_thread); return; @@ -4114,7 +4114,7 @@ public: m_ir->SetInsertPoint(mmio); } - m_ir->CreateStore(ci, spu_ptr(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::cmd)); + m_ir->CreateStore(ci, spu_ptr(OFFSET_OF(spu_thread, ch_mfc_cmd.cmd))); call("spu_exec_mfc_cmd", &exec_mfc_cmd, m_thread); m_ir->CreateBr(next); m_ir->SetInsertPoint(copy); @@ -4206,7 +4206,7 @@ public: } // Disable certain thing - m_ir->CreateStore(m_ir->getInt32(0), spu_ptr(&spu_thread::last_faddr)); + m_ir->CreateStore(m_ir->getInt32(0), spu_ptr(OFFSET_OF(spu_thread, last_faddr))); m_ir->CreateBr(next); break; } @@ -4214,7 +4214,7 @@ public: case MFC_EIEIO_CMD: case MFC_SYNC_CMD: { - const auto cond = m_ir->CreateIsNull(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::mfc_size))); + const auto cond = m_ir->CreateIsNull(m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, mfc_size)))); m_ir->CreateCondBr(cond, exec, fail, m_md_likely); m_ir->SetInsertPoint(exec); m_ir->CreateFence(llvm::AtomicOrdering::SequentiallyConsistent); @@ -4236,12 +4236,12 @@ public: m_ir->SetInsertPoint(fail); // Get MFC slot, redirect to invalid memory address - const auto slot = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::mfc_size)); - const auto off0 = m_ir->CreateAdd(m_ir->CreateMul(slot, m_ir->getInt32(sizeof(spu_mfc_cmd))), m_ir->getInt32(::offset32(&spu_thread::mfc_queue))); + const auto slot = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, mfc_size))); + const auto off0 = m_ir->CreateAdd(m_ir->CreateMul(slot, m_ir->getInt32(sizeof(spu_mfc_cmd))), m_ir->getInt32(OFFSET_OF(spu_thread, mfc_queue))); const auto ptr0 = m_ir->CreateGEP(get_type(), m_thread, m_ir->CreateZExt(off0, get_type())); const auto ptr1 = m_ir->CreateGEP(get_type(), m_memptr, m_ir->getInt64(0xffdeadf0)); const auto pmfc = m_ir->CreateSelect(m_ir->CreateICmpULT(slot, m_ir->getInt32(16)), ptr0, ptr1); - m_ir->CreateStore(ci, _ptr(pmfc, ::offset32(&spu_mfc_cmd::cmd))); + m_ir->CreateStore(ci, _ptr(pmfc, OFFSET_OF(spu_mfc_cmd, cmd))); switch (u64 cmd = ci->getZExtValue()) { @@ -4281,10 +4281,10 @@ public: case MFC_GETB_CMD: case MFC_GETF_CMD: { - m_ir->CreateStore(tag.value, _ptr(pmfc, ::offset32(&spu_mfc_cmd::tag))); - m_ir->CreateStore(size.value, _ptr(pmfc, 
::offset32(&spu_mfc_cmd::size))); - m_ir->CreateStore(lsa.value, _ptr(pmfc, ::offset32(&spu_mfc_cmd::lsa))); - m_ir->CreateStore(eal.value, _ptr(pmfc, ::offset32(&spu_mfc_cmd::eal))); + m_ir->CreateStore(tag.value, _ptr(pmfc, OFFSET_OF(spu_mfc_cmd, tag))); + m_ir->CreateStore(size.value, _ptr(pmfc, OFFSET_OF(spu_mfc_cmd, size))); + m_ir->CreateStore(lsa.value, _ptr(pmfc, OFFSET_OF(spu_mfc_cmd, lsa))); + m_ir->CreateStore(eal.value, _ptr(pmfc, OFFSET_OF(spu_mfc_cmd, eal))); m_ir->CreateStore(m_ir->CreateOr(m_ir->CreateLoad(get_type(), pf), mask), pf); if (cmd & MFC_BARRIER_MASK) m_ir->CreateStore(m_ir->CreateOr(m_ir->CreateLoad(get_type(), pb), mask), pb); @@ -4305,7 +4305,7 @@ public: } } - m_ir->CreateStore(m_ir->CreateAdd(slot, m_ir->getInt32(1)), spu_ptr(&spu_thread::mfc_size)); + m_ir->CreateStore(m_ir->CreateAdd(slot, m_ir->getInt32(1)), spu_ptr(OFFSET_OF(spu_thread, mfc_size))); m_ir->CreateBr(next); m_ir->SetInsertPoint(next); return; @@ -4318,7 +4318,7 @@ public: case MFC_WrListStallAck: { const auto mask = eval(splat(1) << (val & 0x1f)); - const auto _ptr = spu_ptr(&spu_thread::ch_stall_mask); + const auto _ptr = spu_ptr(OFFSET_OF(spu_thread, ch_stall_mask)); const auto _old = m_ir->CreateLoad(get_type(), _ptr); const auto _new = m_ir->CreateAnd(_old, m_ir->CreateNot(mask.value)); m_ir->CreateStore(_new, _ptr); @@ -4345,16 +4345,16 @@ public: const auto tscx = m_ir->CreateMul(m_ir->CreateUDiv(tsc, m_ir->getInt64(utils::get_tsc_freq())), m_ir->getInt64(80000000)); const auto tscm = m_ir->CreateUDiv(m_ir->CreateMul(m_ir->CreateURem(tsc, m_ir->getInt64(utils::get_tsc_freq())), m_ir->getInt64(80000000)), m_ir->getInt64(utils::get_tsc_freq())); const auto tsctb = m_ir->CreateSub(m_ir->CreateAdd(tscx, tscm), timebase_offs); - m_ir->CreateStore(tsctb, spu_ptr(&spu_thread::ch_dec_start_timestamp)); + m_ir->CreateStore(tsctb, spu_ptr(OFFSET_OF(spu_thread, ch_dec_start_timestamp))); } else #endif { - m_ir->CreateStore(call("get_timebased_time", &get_timebased_time), spu_ptr(&spu_thread::ch_dec_start_timestamp)); + m_ir->CreateStore(call("get_timebased_time", &get_timebased_time), spu_ptr(OFFSET_OF(spu_thread, ch_dec_start_timestamp))); } - m_ir->CreateStore(val.value, spu_ptr(&spu_thread::ch_dec_value)); - m_ir->CreateStore(m_ir->getInt8(0), spu_ptr(&spu_thread::is_dec_frozen)); + m_ir->CreateStore(val.value, spu_ptr(OFFSET_OF(spu_thread, ch_dec_value))); + m_ir->CreateStore(m_ir->getInt8(0), spu_ptr(OFFSET_OF(spu_thread, is_dec_frozen))); return; } case SPU_Set_Bkmk_Tag: @@ -7641,7 +7641,7 @@ public: m_ir->CreateCondBr(cond.value, halt, next, m_md_unlikely); m_ir->SetInsertPoint(halt); if (m_interp_magn) - m_ir->CreateStore(m_function->getArg(2), spu_ptr(&spu_thread::pc)); + m_ir->CreateStore(m_function->getArg(2), spu_ptr(OFFSET_OF(spu_thread, pc))); else update_pc(); const auto ptr = _ptr(m_memptr, 0xffdead00); @@ -7748,7 +7748,7 @@ public: target->addIncoming(e_addr, e_exec); m_ir->CreateCondBr(get_imm(op.d).value, d_exec, d_done, m_md_unlikely); m_ir->SetInsertPoint(d_exec); - m_ir->CreateStore(m_ir->getFalse(), spu_ptr(&spu_thread::interrupts_enabled)); + m_ir->CreateStore(m_ir->getFalse(), spu_ptr(OFFSET_OF(spu_thread, interrupts_enabled))); m_ir->CreateBr(d_done); m_ir->SetInsertPoint(d_done); m_ir->CreateBr(m_interp_bblock); @@ -7784,7 +7784,7 @@ public: } else { - sp.value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::gpr, 1, &v128::_u32, 3)); + sp.value = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, gpr[1]._u32[3]))); } } @@ -7799,15 +7799,15 @@ public: if 
(op.d) { - m_ir->CreateStore(m_ir->getFalse(), spu_ptr(&spu_thread::interrupts_enabled)); + m_ir->CreateStore(m_ir->getFalse(), spu_ptr(OFFSET_OF(spu_thread, interrupts_enabled))); } - m_ir->CreateStore(addr.value, spu_ptr(&spu_thread::pc)); + m_ir->CreateStore(addr.value, spu_ptr(OFFSET_OF(spu_thread, pc))); if (ret && g_cfg.core.spu_block_size >= spu_block_size_type::mega) { // Compare address stored in stack mirror with addr - const auto stack0 = eval(zext(sp) + ::offset32(&spu_thread::stack_mirror)); + const auto stack0 = eval(zext(sp) + OFFSET_OF(spu_thread, stack_mirror)); const auto stack1 = eval(stack0 + 8); const auto _ret = m_ir->CreateLoad(get_type(), m_ir->CreateGEP(get_type(), m_thread, stack0.value)); const auto link = m_ir->CreateLoad(get_type(), m_ir->CreateGEP(get_type(), m_thread, stack1.value)); @@ -8070,7 +8070,7 @@ public: if (op.d && tfound != m_targets.end() && tfound->second.size() == 1 && tfound->second[0] == spu_branch_target(m_pos, 1)) { // Interrupts-disable pattern - m_ir->CreateStore(m_ir->getFalse(), spu_ptr(&spu_thread::interrupts_enabled)); + m_ir->CreateStore(m_ir->getFalse(), spu_ptr(OFFSET_OF(spu_thread, interrupts_enabled))); return; } @@ -8130,7 +8130,7 @@ public: // Exit function on unexpected target m_ir->SetInsertPoint(sw->getDefaultDest()); - m_ir->CreateStore(addr.value, spu_ptr(&spu_thread::pc)); + m_ir->CreateStore(addr.value, spu_ptr(OFFSET_OF(spu_thread, pc))); if (m_finfo && m_finfo->fn) { @@ -8165,7 +8165,7 @@ public: if (m_block) m_block->block_end = m_ir->GetInsertBlock(); value_t srr0; - srr0.value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::srr0)); + srr0.value = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, srr0))); m_ir->CreateBr(add_block_indirect(op, srr0)); } @@ -8175,7 +8175,7 @@ public: m_block->block_end = m_ir->GetInsertBlock(); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); set_link(op); - const auto mask = m_ir->CreateTrunc(m_ir->CreateLShr(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_events), true), 32), get_type()); + const auto mask = m_ir->CreateTrunc(m_ir->CreateLShr(m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, ch_events)), true), 32), get_type()); const auto res = call("spu_get_events", &exec_get_events, m_thread, mask); const auto target = add_block_indirect(op, addr); m_ir->CreateCondBr(m_ir->CreateICmpNE(res, m_ir->getInt32(0)), target, add_block_next()); @@ -8507,7 +8507,7 @@ public: { // Store the return function chunk address at the stack mirror const auto pfunc = add_function(m_pos + 4); - const auto stack0 = eval(zext(extract(get_reg_fixed(1), 3) & 0x3fff0) + ::offset32(&spu_thread::stack_mirror)); + const auto stack0 = eval(zext(extract(get_reg_fixed(1), 3) & 0x3fff0) + OFFSET_OF(spu_thread, stack_mirror)); const auto stack1 = eval(stack0 + 8); const auto rel_ptr = m_ir->CreateSub(m_ir->CreatePtrToInt(pfunc->chunk, get_type()), get_segment_base()); const auto ptr_plus_op = m_ir->CreateOr(m_ir->CreateShl(rel_ptr, 32), m_ir->getInt64(m_next_op)); diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index 27f1434dc..e6d8c0a70 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -695,7 +695,7 @@ const auto spu_putllc_tx = build_function_asm(&g_rtm_tx_limit2))); @@ -703,7 +703,7 @@ const auto spu_putllc_tx = build_function_asm(cpu_flag::pause)); + c.bt(x86::dword_ptr(args[2], OFFSET_OF(spu_thread, state) - OFFSET_OF(spu_thread, rdata)), static_cast(cpu_flag::pause)); c.jc(fall); c.xbegin(tx1); @@ -761,7 +761,7 @@ 
const auto spu_putllc_tx = build_function_asm(&g_rtm_tx_limit1))); @@ -1039,7 +1039,7 @@ const auto spu_getllar_tx = build_function_asm(cpu_flag::pause)); + c.bt(x86::dword_ptr(args[2], OFFSET_OF(spu_thread, state)), static_cast(cpu_flag::pause)); c.jc(fall); c.mov(x86::rax, x86::qword_ptr(x86::r11)); c.and_(x86::rax, -128); @@ -1068,7 +1068,7 @@ const auto spu_getllar_tx = build_function_asm lv2_cond::load(utils::serial& ar) { - return load_func(make_shared(stx::exact_t(ar))); + return load_func(make_shared(exact_t(ar))); } void lv2_cond::save(utils::serial& ar) diff --git a/rpcs3/Emu/Cell/lv2/sys_event.cpp b/rpcs3/Emu/Cell/lv2/sys_event.cpp index 946bbd137..c65efd62c 100644 --- a/rpcs3/Emu/Cell/lv2/sys_event.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_event.cpp @@ -27,7 +27,7 @@ lv2_event_queue::lv2_event_queue(utils::serial& ar) noexcept std::function lv2_event_queue::load(utils::serial& ar) { - auto queue = make_shared(stx::exact_t(ar)); + auto queue = make_shared(exact_t(ar)); return [ptr = lv2_obj::load(queue->key, queue)](void* storage) { *static_cast*>(storage) = ptr; diff --git a/rpcs3/Emu/Cell/lv2/sys_event_flag.cpp b/rpcs3/Emu/Cell/lv2/sys_event_flag.cpp index 74cbb34d6..30c81ebf2 100644 --- a/rpcs3/Emu/Cell/lv2/sys_event_flag.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_event_flag.cpp @@ -18,7 +18,7 @@ lv2_event_flag::lv2_event_flag(utils::serial& ar) std::function lv2_event_flag::load(utils::serial& ar) { - return load_func(make_shared(stx::exact_t(ar))); + return load_func(make_shared(exact_t(ar))); } void lv2_event_flag::save(utils::serial& ar) diff --git a/rpcs3/Emu/Cell/lv2/sys_memory.cpp b/rpcs3/Emu/Cell/lv2/sys_memory.cpp index dbb513f9e..f5fc74321 100644 --- a/rpcs3/Emu/Cell/lv2/sys_memory.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_memory.cpp @@ -27,7 +27,7 @@ lv2_memory_container::lv2_memory_container(utils::serial& ar, bool from_idm) noe std::function lv2_memory_container::load(utils::serial& ar) { // Use idm::last_id() only for the instances at IDM - return [ptr = make_shared(stx::exact_t(ar), true)](void* storage) + return [ptr = make_shared(exact_t(ar), true)](void* storage) { *static_cast*>(storage) = ptr; }; diff --git a/rpcs3/Emu/Cell/lv2/sys_mmapper.cpp b/rpcs3/Emu/Cell/lv2/sys_mmapper.cpp index c7069a332..c71fac708 100644 --- a/rpcs3/Emu/Cell/lv2/sys_mmapper.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_mmapper.cpp @@ -72,7 +72,7 @@ CellError lv2_memory::on_id_create() std::function lv2_memory::load(utils::serial& ar) { - auto mem = make_shared(stx::exact_t(ar)); + auto mem = make_shared(exact_t(ar)); mem->exists++; // Disable on_id_create() auto func = load_func(mem, +mem->pshared); mem->exists--; diff --git a/rpcs3/Emu/Cell/lv2/sys_mutex.cpp b/rpcs3/Emu/Cell/lv2/sys_mutex.cpp index 8103d9595..28460f76c 100644 --- a/rpcs3/Emu/Cell/lv2/sys_mutex.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_mutex.cpp @@ -22,7 +22,7 @@ lv2_mutex::lv2_mutex(utils::serial& ar) std::function lv2_mutex::load(utils::serial& ar) { - return load_func(make_shared(stx::exact_t(ar))); + return load_func(make_shared(exact_t(ar))); } void lv2_mutex::save(utils::serial& ar) diff --git a/rpcs3/Emu/Cell/lv2/sys_net/lv2_socket_native.cpp b/rpcs3/Emu/Cell/lv2/sys_net/lv2_socket_native.cpp index 032ffeb0d..2f23375b9 100644 --- a/rpcs3/Emu/Cell/lv2/sys_net/lv2_socket_native.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_net/lv2_socket_native.cpp @@ -20,7 +20,7 @@ lv2_socket_native::lv2_socket_native(lv2_socket_family family, lv2_socket_type t } lv2_socket_native::lv2_socket_native(utils::serial& ar, lv2_socket_type type) - : lv2_socket(stx::make_exact(ar), type) + : 
lv2_socket(make_exact(ar), type) { [[maybe_unused]] const s32 version = GET_SERIALIZATION_VERSION(lv2_net); diff --git a/rpcs3/Emu/Cell/lv2/sys_net/lv2_socket_p2p.cpp b/rpcs3/Emu/Cell/lv2/sys_net/lv2_socket_p2p.cpp index 50b45c65c..157f5c4ec 100644 --- a/rpcs3/Emu/Cell/lv2/sys_net/lv2_socket_p2p.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_net/lv2_socket_p2p.cpp @@ -17,7 +17,7 @@ lv2_socket_p2p::lv2_socket_p2p(lv2_socket_family family, lv2_socket_type type, l } lv2_socket_p2p::lv2_socket_p2p(utils::serial& ar, lv2_socket_type type) - : lv2_socket(stx::make_exact(ar), type) + : lv2_socket(make_exact(ar), type) { ar(port, vport, bound_addr); diff --git a/rpcs3/Emu/Cell/lv2/sys_net/lv2_socket_raw.cpp b/rpcs3/Emu/Cell/lv2/sys_net/lv2_socket_raw.cpp index 6e74bd512..3dd109ca0 100644 --- a/rpcs3/Emu/Cell/lv2/sys_net/lv2_socket_raw.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_net/lv2_socket_raw.cpp @@ -27,7 +27,7 @@ lv2_socket_raw::lv2_socket_raw(lv2_socket_family family, lv2_socket_type type, l } lv2_socket_raw::lv2_socket_raw(utils::serial& ar, lv2_socket_type type) - : lv2_socket(stx::make_exact(ar), type) + : lv2_socket(make_exact(ar), type) { } diff --git a/rpcs3/Emu/Cell/lv2/sys_rwlock.cpp b/rpcs3/Emu/Cell/lv2/sys_rwlock.cpp index bdf9456a4..da47702cb 100644 --- a/rpcs3/Emu/Cell/lv2/sys_rwlock.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_rwlock.cpp @@ -18,7 +18,7 @@ lv2_rwlock::lv2_rwlock(utils::serial& ar) std::function lv2_rwlock::load(utils::serial& ar) { - return load_func(make_shared(stx::exact_t(ar))); + return load_func(make_shared(exact_t(ar))); } void lv2_rwlock::save(utils::serial& ar) diff --git a/rpcs3/Emu/Cell/lv2/sys_semaphore.cpp b/rpcs3/Emu/Cell/lv2/sys_semaphore.cpp index d34f056d0..dc685c02d 100644 --- a/rpcs3/Emu/Cell/lv2/sys_semaphore.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_semaphore.cpp @@ -18,7 +18,7 @@ lv2_sema::lv2_sema(utils::serial& ar) std::function lv2_sema::load(utils::serial& ar) { - return load_func(make_shared(stx::exact_t(ar))); + return load_func(make_shared(exact_t(ar))); } void lv2_sema::save(utils::serial& ar) diff --git a/rpcs3/Emu/GDB.cpp b/rpcs3/Emu/GDB.cpp index 702e4a28c..4cdff6437 100644 --- a/rpcs3/Emu/GDB.cpp +++ b/rpcs3/Emu/GDB.cpp @@ -1,6 +1,7 @@ #include "stdafx.h" #include "GDB.h" +#include "util/bit_set.h" #include "util/logs.hpp" #include "util/StrUtil.h" #include "Emu/Memory/vm.h" diff --git a/rpcs3/Emu/IdManager.h b/rpcs3/Emu/IdManager.h index bf0cb9ac5..9d981b02b 100644 --- a/rpcs3/Emu/IdManager.h +++ b/rpcs3/Emu/IdManager.h @@ -26,7 +26,7 @@ template concept IdmBaseCompatible = (std::is_final_v ? 
IdmCompatible : !!(requires() { u32{T::id_step}, u32{T::id_count}; })); template -concept IdmSavable = IdmBaseCompatible && T::savestate_init_pos != 0 && (requires(T& t, utils::serial& ar) { t.save(stx::exact_t(ar)); }); +concept IdmSavable = IdmBaseCompatible && T::savestate_init_pos != 0 && (requires(T& t, utils::serial& ar) { t.save(exact_t(ar)); }); // If id_base is declared in base type, than storage type must declare id_type template @@ -113,13 +113,13 @@ namespace id_manager static constexpr pointer_keeper (*load)(utils::serial&) = [](utils::serial& ar) -> pointer_keeper { stx::shared_ptr ptr; - if constexpr (std::is_constructible_v, stx::exact_t>) + if constexpr (std::is_constructible_v, exact_t>) { - ptr = stx::make_shared(stx::launch_retainer{}, stx::exact_t(ar)); + ptr = stx::make_shared(stx::launch_retainer{}, exact_t(ar)); } else { - ptr = stx::make_shared(stx::exact_t(ar)); + ptr = stx::make_shared(exact_t(ar)); } return [ptr](void* storage) @@ -134,7 +134,7 @@ namespace id_manager struct id_traits_load_func { static constexpr pointer_keeper (*load)(utils::serial&) = [](utils::serial& ar) -> pointer_keeper { - return T::load(stx::exact_t(ar)); + return T::load(exact_t(ar)); }; }; diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp index 1b7a0752a..c6b7ab50e 100644 --- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp +++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp @@ -1399,7 +1399,7 @@ bool GLGSRender::release_GCM_label(u32 address, u32 args) // Now write to DMA and then to host context m_enqueued_host_write_buffer->get().copy_to(mapping.second, host_read_offset, mapping.first, 4); - m_enqueued_host_write_buffer->get().copy_to(m_host_gpu_context_data.get(), host_read_offset + 8, ::offset32(&rsx::host_gpu_context_t::commands_complete_event), 8); + m_enqueued_host_write_buffer->get().copy_to(m_host_gpu_context_data.get(), host_read_offset + 8, OFFSET_OF(rsx::host_gpu_context_t, commands_complete_event), 8); m_enqueued_host_write_buffer->push_barrier(host_read_offset, 16); host_ctx->on_label_release(); @@ -1425,7 +1425,7 @@ void GLGSRender::on_guest_texture_read() // Tag the read as being in progress u64 event_id = m_host_dma_ctrl->host_ctx()->inc_counter(); m_host_dma_ctrl->host_ctx()->texture_load_request_event = event_id; - enqueue_host_context_write(::offset32(&rsx::host_gpu_context_t::texture_load_complete_event), 8, &event_id); + enqueue_host_context_write(OFFSET_OF(rsx::host_gpu_context_t, texture_load_complete_event), 8, &event_id); } void GLGSRender::begin_occlusion_query(rsx::reports::occlusion_query_info* query) diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp index 0b297f2a9..8c494fbd8 100644 --- a/rpcs3/Emu/RSX/RSXThread.cpp +++ b/rpcs3/Emu/RSX/RSXThread.cpp @@ -165,7 +165,7 @@ namespace rsx { if (offset < sizeof(RsxReports::report) /*&& (offset % 0x10) == 0*/) { - return render->label_addr + ::offset32(&RsxReports::report) + offset; + return render->label_addr + OFFSET_OF(RsxReports, report) + offset; } msg = "Local RSX REPORT offset out of range!"sv; @@ -733,8 +733,8 @@ namespace rsx if (!ar.is_writing() && version < 3) { // Be compatible with previous bitwise serialization - ar(std::span(reinterpret_cast(this), ::offset32(&avconf::scan_mode))); - ar.pos += utils::align(::offset32(&avconf::scan_mode), alignof(avconf)) - ::offset32(&avconf::scan_mode); + ar(std::span(reinterpret_cast(this), OFFSET_OF(avconf, scan_mode))); + ar.pos += utils::align(OFFSET_OF(avconf, scan_mode), alignof(avconf)) - OFFSET_OF(avconf, scan_mode); return; } @@ 
-1209,7 +1209,7 @@ namespace rsx if (const u64 get_put = new_get_put.exchange(u64{umax}); get_put != umax) { - vm::_ref>(dma_address + ::offset32(&RsxDmaControl::put)).release(get_put); + vm::_ref>(dma_address + OFFSET_OF(RsxDmaControl, put)).release(get_put); fifo_ctrl->set_get(static_cast(get_put)); fifo_ctrl->abort(); fifo_ret_addr = RSX_CALL_STACK_EMPTY; diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp index 90ddb5c9a..f54e503d3 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp +++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp @@ -1717,7 +1717,7 @@ bool VKGSRender::release_GCM_label(u32 address, u32 args) auto cmd = m_secondary_cb_list.next(); cmd->begin(); VK_GET_SYMBOL(vkCmdUpdateBuffer)(*cmd, mapping.second->value, mapping.first, 4, &write_data); - VK_GET_SYMBOL(vkCmdUpdateBuffer)(*cmd, m_host_object_data->value, ::offset32(&vk::host_data_t::commands_complete_event), 8, &release_event_id); + VK_GET_SYMBOL(vkCmdUpdateBuffer)(*cmd, m_host_object_data->value, OFFSET_OF(vk::host_data_t, commands_complete_event), 8, &release_event_id); cmd->end(); vk::queue_submit_t submit_info = {m_device->get_graphics_queue(), nullptr}; @@ -1739,7 +1739,7 @@ void VKGSRender::on_guest_texture_read(const vk::command_buffer& cmd) // Queue a sync update on the CB doing the load auto host_ctx = ensure(m_host_dma_ctrl->host_ctx()); const auto event_id = host_ctx->on_texture_load_acquire(); - VK_GET_SYMBOL(vkCmdUpdateBuffer)(cmd, m_host_object_data->value, ::offset32(&vk::host_data_t::texture_load_complete_event), sizeof(u64), &event_id); + VK_GET_SYMBOL(vkCmdUpdateBuffer)(cmd, m_host_object_data->value, OFFSET_OF(vk::host_data_t, texture_load_complete_event), sizeof(u64), &event_id); } void VKGSRender::sync_hint(rsx::FIFO::interrupt_hint hint, rsx::reports::sync_hint_payload_t payload) @@ -2520,7 +2520,7 @@ void VKGSRender::close_and_submit_command_buffer(vk::fence* pFence, VkSemaphore { VK_GET_SYMBOL(vkCmdUpdateBuffer)(*m_current_command_buffer, m_host_object_data->value, - ::offset32(&vk::host_data_t::commands_complete_event), + OFFSET_OF(vk::host_data_t, commands_complete_event), sizeof(u64), const_cast(&m_host_dma_ctrl->host_ctx()->last_label_acquire_event)); diff --git a/rpcs3/Emu/System.cpp b/rpcs3/Emu/System.cpp index 85f4c6523..19d1b8ca8 100644 --- a/rpcs3/Emu/System.cpp +++ b/rpcs3/Emu/System.cpp @@ -318,7 +318,7 @@ void init_fxo_for_exec(utils::serial* ar, bool full = false) // Some settings are not allowed in certain PPU decoders static void fixup_settings(const psf::registry* _psf) { - if (g_cfg.core.ppu_decoder != ppu_decoder_type::_static) + if (g_cfg.core.ppu_decoder == ppu_decoder_type::llvm_legacy) { if (g_cfg.core.ppu_use_nj_bit) { diff --git a/rpcs3/Emu/system_config.h b/rpcs3/Emu/system_config.h index 1245148f5..cd6299903 100644 --- a/rpcs3/Emu/system_config.h +++ b/rpcs3/Emu/system_config.h @@ -21,7 +21,7 @@ struct cfg_root : cfg::node public: node_core(cfg::node* _this) : cfg::node(_this, "Core") {} - cfg::_enum ppu_decoder{this, "PPU Decoder", ppu_decoder_type::llvm}; + cfg::_enum ppu_decoder{this, "PPU Decoder", ppu_decoder_type::llvm_legacy}; cfg::_int<1, 8> ppu_threads{this, "PPU Threads", 2}; // Amount of PPU threads running simultaneously (must be 2) cfg::_bool ppu_debug{this, "PPU Debug"}; cfg::_bool ppu_call_history{this, "PPU Calling History"}; // Enable PPU calling history recording diff --git a/rpcs3/Emu/system_config_types.cpp b/rpcs3/Emu/system_config_types.cpp index 734042e1c..edf5e9474 100644 --- a/rpcs3/Emu/system_config_types.cpp +++ 
b/rpcs3/Emu/system_config_types.cpp @@ -520,8 +520,9 @@ void fmt_class_string::format(std::string& out, u64 arg) { switch (type) { - case ppu_decoder_type::_static: return "Interpreter (static)"; - case ppu_decoder_type::llvm: return "Recompiler (LLVM)"; + case ppu_decoder_type::_static: return "Interpreter (Legacy)"; + case ppu_decoder_type::llvm_legacy: return "LLVM Recompiler (Legacy)"; + case ppu_decoder_type::interpreter: return "Interpreter"; } return unknown; diff --git a/rpcs3/Emu/system_config_types.h b/rpcs3/Emu/system_config_types.h index ee9da8b0f..c7610a442 100644 --- a/rpcs3/Emu/system_config_types.h +++ b/rpcs3/Emu/system_config_types.h @@ -3,7 +3,8 @@ enum class ppu_decoder_type : unsigned { _static, - llvm, + llvm_legacy, + interpreter, }; enum class spu_decoder_type : unsigned diff --git a/rpcs3/Loader/ELF.h b/rpcs3/Loader/ELF.h index 56da9f464..6ccd4bbd6 100644 --- a/rpcs3/Loader/ELF.h +++ b/rpcs3/Loader/ELF.h @@ -3,6 +3,7 @@ #include "util/types.hpp" #include "util/File.h" #include "util/bit_set.h" +#include "util/endian.hpp" #include diff --git a/rpcs3/util/emu_utils.cpp b/rpcs3/util/emu_utils.cpp index 667133dff..8b7145d0c 100644 --- a/rpcs3/util/emu_utils.cpp +++ b/rpcs3/util/emu_utils.cpp @@ -14,7 +14,7 @@ bool is_using_interpreter(thread_class t_class) { switch (t_class) { - case thread_class::ppu: return g_cfg.core.ppu_decoder != ppu_decoder_type::llvm; + case thread_class::ppu: return g_cfg.core.ppu_decoder != ppu_decoder_type::llvm_legacy; case thread_class::spu: return g_cfg.core.spu_decoder != spu_decoder_type::asmjit && g_cfg.core.spu_decoder != spu_decoder_type::llvm; default: return true; } diff --git a/rpcs3/util/fixed_typemap.hpp b/rpcs3/util/fixed_typemap.hpp index 80cad14a4..0857bb5e7 100644 --- a/rpcs3/util/fixed_typemap.hpp +++ b/rpcs3/util/fixed_typemap.hpp @@ -146,10 +146,10 @@ namespace stx } template - requires requires(T& a, utils::serial& ar) { a.save(stx::exact_t(ar)); } + requires requires(T& a, utils::serial& ar) { a.save(exact_t(ar)); } static void call_save(void* ptr, utils::serial& ar) noexcept { - std::launder(static_cast(ptr))->save(stx::exact_t(ar)); + std::launder(static_cast(ptr))->save(exact_t(ar)); } template @@ -173,7 +173,7 @@ namespace stx r.thread_op = &call_thread_op; } - if constexpr (!!(requires(T& a, utils::serial& ar) { a.save(stx::exact_t(ar)); })) + if constexpr (!!(requires(T& a, utils::serial& ar) { a.save(exact_t(ar)); })) { r.save = &call_save; } diff --git a/rpcs3/util/serialization.hpp b/rpcs3/util/serialization.hpp index fddfc646a..4b3261396 100644 --- a/rpcs3/util/serialization.hpp +++ b/rpcs3/util/serialization.hpp @@ -98,24 +98,6 @@ namespace utils pos += padding; } - // Add padding needed between two members - template - void add_padding(T T2::* const first, T3 T2::* const second) - { - if (m_is_writing) - return; - - const u32 offset1 = ::offset32(first) + sizeof(T); - const u32 offset2 = ::offset32(second); - - AUDIT(::offset32(first) <= ::offset32(second)); - - if (offset2 > offset1) - { - pos += offset2 - offset1; - } - } - void set_expect_little_data(bool value) { m_expect_little_data = value; @@ -437,7 +419,7 @@ namespace utils } template - requires requires(T& obj, utils::serial& ar) { (obj.*(&T::operator()))(stx::exact_t(ar)); } + requires requires(T& obj, utils::serial& ar) { (obj.*(&T::operator()))(exact_t(ar)); } bool serialize(T& obj) { obj(*this); @@ -565,7 +547,7 @@ namespace utils template requires(std::is_copy_constructible_v>) && (std::is_constructible_v> || Bitcopy> || - 
std::is_constructible_v, stx::exact_t> || TupleAlike>) + std::is_constructible_v, exact_t> || TupleAlike>) operator T() noexcept { AUDIT(!is_writing()); @@ -604,9 +586,9 @@ namespace utils return type{std::move(first), this->operator second_t()}; } } - else if constexpr (std::is_constructible_v>) + else if constexpr (std::is_constructible_v>) { - return not_tuple_t(stx::exact_t(*this)); + return not_tuple_t(exact_t(*this)); } else if constexpr (std::is_constructible_v) { diff --git a/rpcs3/util/simd.hpp b/rpcs3/util/simd.hpp index 4a1bdb1af..e0b082fff 100644 --- a/rpcs3/util/simd.hpp +++ b/rpcs3/util/simd.hpp @@ -6,6 +6,7 @@ #include "util/sysinfo.hpp" #include "util/asm.hpp" #include "util/JIT.h" +#include #if defined(ARCH_X64) #ifdef _MSC_VER @@ -34,6 +35,8 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #endif +using namespace rx; + namespace asmjit { struct vec_builder; @@ -565,2774 +568,445 @@ namespace asmjit #endif } // namespace asmjit -inline v128 gv_select8(const v128& _cmp, const v128& _true, const v128& _false); -inline v128 gv_signselect8(const v128& bits, const v128& _true, const v128& _false); -inline v128 gv_select16(const v128& _cmp, const v128& _true, const v128& _false); -inline v128 gv_select32(const v128& _cmp, const v128& _true, const v128& _false); -inline v128 gv_selectfs(const v128& _cmp, const v128& _true, const v128& _false); - -template - requires(asmjit::any_operand_v) -inline asmjit::vec_type gv_gts32(A&&, B&&); - -inline void gv_set_zeroing_denormals() +namespace rx { -#if defined(ARCH_X64) - u32 cr = _mm_getcsr(); - cr = (cr & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_ZERO_ON; - cr = (cr & ~_MM_DENORMALS_ZERO_MASK) | _MM_DENORMALS_ZERO_ON; - cr = (cr | _MM_MASK_INVALID); - _mm_setcsr(cr); -#elif defined(ARCH_ARM64) - u64 cr; - __asm__ volatile("mrs %0, FPCR" : "=r"(cr)); - cr |= 0x1000000ull; - __asm__ volatile("msr FPCR, %0" ::"r"(cr)); -#else -#error "Not implemented" -#endif -} + inline bool g_use_avx = utils::has_avx(); -inline void gv_unset_zeroing_denormals() -{ + inline void gv_zeroupper() + { #if defined(ARCH_X64) - u32 cr = _mm_getcsr(); - cr = (cr & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_ZERO_OFF; - cr = (cr & ~_MM_DENORMALS_ZERO_MASK) | _MM_DENORMALS_ZERO_OFF; - cr = (cr | _MM_MASK_INVALID); - _mm_setcsr(cr); -#elif defined(ARCH_ARM64) - u64 cr; - __asm__ volatile("mrs %0, FPCR" : "=r"(cr)); - cr &= ~0x1000000ull; - __asm__ volatile("msr FPCR, %0" ::"r"(cr)); -#else -#error "Not implemented" -#endif -} - -inline bool g_use_avx = utils::has_avx(); - -inline void gv_zeroupper() -{ -#if defined(ARCH_X64) - if (!g_use_avx) - return; + if (!g_use_avx) + return; #if defined(_M_X64) && defined(_MSC_VER) - _mm256_zeroupper(); + _mm256_zeroupper(); #else - __asm__ volatile("vzeroupper;"); + __asm__ volatile("vzeroupper;"); #endif #endif -} - -inline v128 gv_bcst8(u8 value) -{ -#if defined(ARCH_X64) - return _mm_set1_epi8(value); -#elif defined(ARCH_ARM64) - return vdupq_n_s8(value); -#endif -} - -inline v128 gv_bcst16(u16 value) -{ -#if defined(ARCH_X64) - return _mm_set1_epi16(value); -#elif defined(ARCH_ARM64) - return vdupq_n_s16(value); -#endif -} - -// Optimized broadcast using constant offset assumption -inline v128 gv_bcst16(const u16& value, auto mptr, auto... 
args) -{ -#if defined(ARCH_X64) - const u32 offset = ::offset32(mptr, args...); - [[maybe_unused]] const __m128i* ptr = reinterpret_cast<__m128i*>(uptr(&value) - offset % 16); -#if !defined(__AVX2__) - if (offset % 16 == 0) - return _mm_shuffle_epi32(_mm_shufflelo_epi16(*ptr, 0), 0); - if (offset % 16 == 2) - return _mm_shuffle_epi32(_mm_shufflelo_epi16(*ptr, 0b01010101), 0); - if (offset % 16 == 4) - return _mm_shuffle_epi32(_mm_shufflelo_epi16(*ptr, 0b10101010), 0); - if (offset % 16 == 6) - return _mm_shuffle_epi32(_mm_shufflelo_epi16(*ptr, 0xff), 0); - if (offset % 16 == 8) - return _mm_shuffle_epi32(_mm_shufflehi_epi16(*ptr, 0), 0xff); - if (offset % 16 == 10) - return _mm_shuffle_epi32(_mm_shufflehi_epi16(*ptr, 0b01010101), 0xff); - if (offset % 16 == 12) - return _mm_shuffle_epi32(_mm_shufflehi_epi16(*ptr, 0b10101010), 0xff); - if (offset % 16 == 14) - return _mm_shuffle_epi32(_mm_shufflehi_epi16(*ptr, 0xff), 0xff); -#endif - return _mm_set1_epi16(value); -#else - static_cast(mptr); - return gv_bcst16(value); -#endif -} - -inline v128 gv_bcst32(u32 value) -{ -#if defined(ARCH_X64) - return _mm_set1_epi32(value); -#elif defined(ARCH_ARM64) - return vdupq_n_s32(value); -#endif -} - -// Optimized broadcast using constant offset assumption -inline v128 gv_bcst32(const u32& value, auto mptr, auto... args) -{ -#if defined(ARCH_X64) - const u32 offset = ::offset32(mptr, args...); - [[maybe_unused]] const __m128i* ptr = reinterpret_cast<__m128i*>(uptr(&value) - offset % 16); -#if !defined(__AVX__) - if (offset % 16 == 0) - return _mm_shuffle_epi32(*ptr, 0); - if (offset % 16 == 4) - return _mm_shuffle_epi32(*ptr, 0b01010101); - if (offset % 16 == 8) - return _mm_shuffle_epi32(*ptr, 0b10101010); - if (offset % 16 == 12) - return _mm_shuffle_epi32(*ptr, 0xff); -#endif - return _mm_set1_epi32(value); -#else - static_cast(mptr); - return gv_bcst32(value); -#endif -} - -inline v128 gv_bcst64(u64 value) -{ -#if defined(ARCH_X64) - return _mm_set1_epi64x(value); -#elif defined(ARCH_ARM64) - return vdupq_n_s64(value); -#endif -} - -// Optimized broadcast using constant offset assumption -inline v128 gv_bcst64(const u64& value, auto mptr, auto... 
args) -{ -#if defined(ARCH_X64) - const u32 offset = ::offset32(mptr, args...); - [[maybe_unused]] const __m128i* ptr = reinterpret_cast<__m128i*>(uptr(&value) - offset % 16); -#if !defined(__AVX__) - if (offset % 16 == 0) - return _mm_shuffle_epi32(*ptr, 0b00010001); - if (offset % 16 == 8) - return _mm_shuffle_epi32(*ptr, 0b10111011); -#endif - return _mm_set1_epi64x(value); -#else - static_cast(mptr); - return gv_bcst64(value); -#endif -} - -inline v128 gv_bcstfs(f32 value) -{ -#if defined(ARCH_X64) - return _mm_set1_ps(value); -#elif defined(ARCH_ARM64) - return vdupq_n_f32(value); -#endif -} - -inline v128 gv_and32(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_and_si128(a, b); -#elif defined(ARCH_ARM64) - return vandq_s32(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_and32(A&& a, B&& b) -{ - FOR_X64(binary_op, 4, kIdMovdqa, kIdPand, kIdVpand, kIdVpandd, std::forward(a), std::forward(b)); -} - -inline v128 gv_andfs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_and_ps(a, b); -#elif defined(ARCH_ARM64) - return vandq_s32(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_andfs(A&& a, B&& b) -{ - FOR_X64(binary_op, 4, kIdMovaps, kIdAndps, kIdVandps, kIdVandps, std::forward(a), std::forward(b)); -} - -inline v128 gv_andn32(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_andnot_si128(a, b); -#elif defined(ARCH_ARM64) - return vbicq_s32(b, a); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_andn32(A&& a, B&& b) -{ - FOR_X64(binary_op, 4, kIdMovdqa, kIdPandn, kIdVpandn, kIdVpandnd, std::forward(a), std::forward(b)); -} - -inline v128 gv_andnfs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_andnot_ps(a, b); -#elif defined(ARCH_ARM64) - return vbicq_s32(b, a); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_andnfs(A&& a, B&& b) -{ - FOR_X64(binary_op, 4, kIdMovaps, kIdAndnps, kIdVandnps, kIdVandnps, std::forward(a), std::forward(b)); -} - -inline v128 gv_or32(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_or_si128(a, b); -#elif defined(ARCH_ARM64) - return vorrq_s32(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_or32(A&& a, B&& b) -{ - FOR_X64(binary_op, 4, kIdMovdqa, kIdPor, kIdVpor, kIdVpord, std::forward(a), std::forward(b)); -} - -inline v128 gv_orfs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_or_ps(a, b); -#elif defined(ARCH_ARM64) - return vorrq_s32(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_orfs(A&& a, B&& b) -{ - FOR_X64(binary_op, 4, kIdMovaps, kIdOrps, kIdVorps, kIdVorps, std::forward(a), std::forward(b)); -} - -inline v128 gv_xor32(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_xor_si128(a, b); -#elif defined(ARCH_ARM64) - return veorq_s32(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_xor32(A&& a, B&& b) -{ - FOR_X64(binary_op, 4, kIdMovdqa, kIdPxor, kIdVpxor, kIdVpxord, std::forward(a), std::forward(b)); -} - -inline v128 gv_xorfs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_xor_ps(a, b); -#elif defined(ARCH_ARM64) - return veorq_s32(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_xorfs(A&& a, B&& b) -{ - FOR_X64(binary_op, 4, kIdMovaps, kIdXorps, kIdVxorps, kIdVxorps, std::forward(a), std::forward(b)); -} - -inline v128 gv_not32(const v128& a) -{ -#if 
defined(ARCH_X64) - return _mm_xor_si128(a, _mm_set1_epi32(-1)); -#elif defined(ARCH_ARM64) - return vmvnq_u32(a); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_not32(A&& a) -{ -#if defined(ARCH_X64) - asmjit::vec_type ones = g_vc->vec_alloc(); - g_vc->pcmpeqd(ones, ones); - FOR_X64(binary_op, 4, kIdMovdqa, kIdPxor, kIdVpxor, kIdVpxord, std::move(ones), std::forward(a)); -#endif -} - -inline v128 gv_notfs(const v128& a) -{ -#if defined(ARCH_X64) - return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(-1))); -#elif defined(ARCH_ARM64) - return vmvnq_u32(a); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_notfs(A&& a) -{ -#if defined(ARCH_X64) - asmjit::vec_type ones = g_vc->vec_alloc(); - g_vc->pcmpeqd(ones, ones); - FOR_X64(binary_op, 4, kIdMovaps, kIdXorps, kIdVxorps, kIdVxorps, std::move(ones), std::forward(a)); -#endif -} - -inline v128 gv_shl16(const v128& a, u32 count) -{ - if (count >= 16) - return v128{}; -#if defined(ARCH_X64) - return _mm_slli_epi16(a, count); -#elif defined(ARCH_ARM64) - return vshlq_s16(a, vdupq_n_s16(count)); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_shl16(A&& a, u32 count) -{ - FOR_X64(unary_op, kIdPsllw, kIdVpsllw, std::forward(a), count); -} - -inline v128 gv_shl32(const v128& a, u32 count) -{ - if (count >= 32) - return v128{}; -#if defined(ARCH_X64) - return _mm_slli_epi32(a, count); -#elif defined(ARCH_ARM64) - return vshlq_s32(a, vdupq_n_s32(count)); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_shl32(A&& a, u32 count) -{ - FOR_X64(unary_op, kIdPslld, kIdVpslld, std::forward(a), count); -} - -inline v128 gv_shl64(const v128& a, u32 count) -{ - if (count >= 64) - return v128{}; -#if defined(ARCH_X64) - return _mm_slli_epi64(a, count); -#elif defined(ARCH_ARM64) - return vshlq_s64(a, vdupq_n_s64(count)); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_shl64(A&& a, u32 count) -{ - FOR_X64(unary_op, kIdPsllq, kIdVpsllq, std::forward(a), count); -} - -inline v128 gv_shr16(const v128& a, u32 count) -{ - if (count >= 16) - return v128{}; -#if defined(ARCH_X64) - return _mm_srli_epi16(a, count); -#elif defined(ARCH_ARM64) - return vshlq_u16(a, vdupq_n_s16(0 - count)); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_shr16(A&& a, u32 count) -{ - FOR_X64(unary_op, kIdPsrlw, kIdVpsrlw, std::forward(a), count); -} - -inline v128 gv_shr32(const v128& a, u32 count) -{ - if (count >= 32) - return v128{}; -#if defined(ARCH_X64) - return _mm_srli_epi32(a, count); -#elif defined(ARCH_ARM64) - return vshlq_u32(a, vdupq_n_s32(0 - count)); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_shr32(A&& a, u32 count) -{ - FOR_X64(unary_op, kIdPsrld, kIdVpsrld, std::forward(a), count); -} - -inline v128 gv_shr64(const v128& a, u32 count) -{ - if (count >= 64) - return v128{}; -#if defined(ARCH_X64) - return _mm_srli_epi64(a, count); -#elif defined(ARCH_ARM64) - return vshlq_u64(a, vdupq_n_s64(0 - count)); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_shr64(A&& a, u32 count) -{ - FOR_X64(unary_op, kIdPsrlq, kIdVpsrlq, std::forward(a), count); -} - -inline v128 gv_sar16(const v128& a, u32 count) -{ - if (count >= 16) - count = 15; -#if defined(ARCH_X64) - return _mm_srai_epi16(a, count); -#elif defined(ARCH_ARM64) - return vshlq_s16(a, vdupq_n_s16(0 - count)); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_sar16(A&& a, u32 count) -{ - FOR_X64(unary_op, 
kIdPsraw, kIdVpsraw, std::forward(a), count); -} - -inline v128 gv_sar32(const v128& a, u32 count) -{ - if (count >= 32) - count = 31; -#if defined(ARCH_X64) - return _mm_srai_epi32(a, count); -#elif defined(ARCH_ARM64) - return vshlq_s32(a, vdupq_n_s32(0 - count)); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_sar32(A&& a, u32 count) -{ - FOR_X64(unary_op, kIdPsrad, kIdVpsrad, std::forward(a), count); -} - -inline v128 gv_sar64(const v128& a, u32 count) -{ - if (count >= 64) - count = 63; -#if defined(__AVX512VL__) - return _mm_srai_epi64(a, count); -#elif defined(__SSE2__) && !defined(_M_X64) - return static_cast<__v2di>(a) >> count; -#elif defined(ARCH_ARM64) - return vshlq_s64(a, vdupq_n_s64(0 - count)); -#else - v128 r; - r._s64[0] = a._s64[0] >> count; - r._s64[1] = a._s64[1] >> count; - return r; -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_sar64(A&& a, u32 count) -{ - if (count >= 64) - count = 63; -#if defined(ARCH_X64) - using enum asmjit::x86::Inst::Id; - if (utils::has_avx512()) - return asmjit::unary_op(kIdNone, kIdVpsraq, std::forward(a), count); - g_vc->fail_flag = true; - return std::forward(a); -#endif -} - -inline v128 gv_add8(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_add_epi8(a, b); -#elif defined(ARCH_ARM64) - return vaddq_s8(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_add8(A&& a, B&& b) -{ - FOR_X64(binary_op, 1, kIdMovdqa, kIdPaddb, kIdVpaddb, kIdNone, std::forward(a), std::forward(b)); -} - -inline v128 gv_add16(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_add_epi16(a, b); -#elif defined(ARCH_ARM64) - return vaddq_s16(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_add16(A&& a, B&& b) -{ - FOR_X64(binary_op, 2, kIdMovdqa, kIdPaddw, kIdVpaddw, kIdNone, std::forward(a), std::forward(b)); -} - -inline v128 gv_add32(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_add_epi32(a, b); -#elif defined(ARCH_ARM64) - return vaddq_s32(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_add32(A&& a, B&& b) -{ - FOR_X64(binary_op, 4, kIdMovdqa, kIdPaddd, kIdVpaddd, kIdVpaddd, std::forward(a), std::forward(b)); -} - -inline v128 gv_add64(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_add_epi64(a, b); -#elif defined(ARCH_ARM64) - return vaddq_s64(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_add64(A&& a, B&& b) -{ - FOR_X64(binary_op, 8, kIdMovdqa, kIdPaddq, kIdVpaddq, kIdVpaddq, std::forward(a), std::forward(b)); -} - -inline v128 gv_adds_s8(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_adds_epi8(a, b); -#elif defined(ARCH_ARM64) - return vqaddq_s8(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_adds_s8(A&& a, B&& b) -{ - FOR_X64(binary_op, 1, kIdMovdqa, kIdPaddsb, kIdVpaddsb, kIdNone, std::forward(a), std::forward(b)); -} - -inline v128 gv_adds_s16(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_adds_epi16(a, b); -#elif defined(ARCH_ARM64) - return vqaddq_s16(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_adds_s16(A&& a, B&& b) -{ - FOR_X64(binary_op, 2, kIdMovdqa, kIdPaddsw, kIdVpaddsw, kIdNone, std::forward(a), std::forward(b)); -} - -inline v128 gv_adds_s32(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - const v128 s = _mm_add_epi32(a, b); - const v128 m = (a ^ s) & (b ^ s); // overflow 
bit - const v128 x = _mm_srai_epi32(m, 31); // saturation mask - const v128 y = _mm_srai_epi32(_mm_and_si128(s, m), 31); // positive saturation mask - return _mm_xor_si128(_mm_xor_si128(_mm_srli_epi32(x, 1), y), _mm_or_si128(s, x)); -#elif defined(ARCH_ARM64) - return vqaddq_s32(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_adds_s32(A&& a, B&& b) -{ -#if defined(ARCH_X64) - auto s = gv_add32(a, b); - auto m = gv_and32(gv_xor32(std::forward(a), s), gv_xor32(std::forward(b), s)); - auto x = gv_sar32(m, 31); - auto y = gv_sar32(gv_and32(s, std::move(m)), 31); - auto z = gv_xor32(gv_shr32(x, 1), std::move(y)); - return gv_xor32(std::move(z), gv_or32(std::move(s), std::move(x))); -#endif -} - -inline v128 gv_addus_u8(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_adds_epu8(a, b); -#elif defined(ARCH_ARM64) - return vqaddq_u8(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_addus_u8(A&& a, B&& b) -{ - FOR_X64(binary_op, 1, kIdMovdqa, kIdPaddusb, kIdVpaddusb, kIdNone, std::forward(a), std::forward(b)); -} - -inline v128 gv_addus_u16(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_adds_epu16(a, b); -#elif defined(ARCH_ARM64) - return vqaddq_u16(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_addus_u16(A&& a, B&& b) -{ - FOR_X64(binary_op, 2, kIdMovdqa, kIdPaddusw, kIdVpaddusw, kIdNone, std::forward(a), std::forward(b)); -} - -inline v128 gv_addus_u32(const v128& a, const v128& b) -{ -#if defined(__SSE4_1__) - return _mm_add_epi32(a, _mm_min_epu32(~a, b)); -#elif defined(ARCH_X64) - const v128 s = _mm_add_epi32(a, b); - return _mm_or_si128(s, _mm_cmpgt_epi32(_mm_xor_si128(b, _mm_set1_epi32(smin)), _mm_xor_si128(a, _mm_set1_epi32(smax)))); -#elif defined(ARCH_ARM64) - return vqaddq_u32(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline asmjit::vec_type gv_addus_u32(A&& a, B&& b) -{ -#if defined(ARCH_X64) - if (utils::has_sse41()) - return gv_add32(gv_minu32(std::forward(b), gv_not32(a)), std::forward(a)); - auto s = gv_add32(a, b); - auto x = gv_xor32(std::forward(b), gv_bcst32(0x80000000)); - auto y = gv_xor32(std::forward(a), gv_bcst32(0x7fffffff)); - return gv_or32(std::move(s), gv_gts32(std::move(x), std::move(y))); -#endif - return {}; -} - -inline v128 gv_addfs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_add_ps(a, b); -#elif defined(ARCH_ARM64) - return vaddq_f32(a, b); -#endif -} - -inline v128 gv_addfd(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_add_pd(a, b); -#elif defined(ARCH_ARM64) - return vaddq_f64(a, b); -#endif -} - -inline v128 gv_sub8(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_sub_epi8(a, b); -#elif defined(ARCH_ARM64) - return vsubq_s8(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_sub8(A&& a, B&& b) -{ - FOR_X64(binary_op, 1, kIdMovdqa, kIdPsubb, kIdVpsubb, kIdNone, std::forward(a), std::forward(b)); -} - -inline v128 gv_sub16(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_sub_epi16(a, b); -#elif defined(ARCH_ARM64) - return vsubq_s16(a, b); -#endif -} - -inline v128 gv_sub32(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_sub_epi32(a, b); -#elif defined(ARCH_ARM64) - return vsubq_s32(a, b); -#endif -} - -inline v128 gv_sub64(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_sub_epi64(a, b); -#elif defined(ARCH_ARM64) - return vsubq_s64(a, b); -#endif -} 
- -inline v128 gv_subs_s8(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_subs_epi8(a, b); -#elif defined(ARCH_ARM64) - return vqsubq_s8(a, b); -#endif -} - -inline v128 gv_subs_s16(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_subs_epi16(a, b); -#elif defined(ARCH_ARM64) - return vqsubq_s16(a, b); -#endif -} - -inline v128 gv_subs_s32(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - const v128 d = _mm_sub_epi32(a, b); - const v128 m = (a ^ b) & (a ^ d); // overflow bit - const v128 x = _mm_srai_epi32(m, 31); - return _mm_or_si128(_mm_andnot_si128(x, d), _mm_and_si128(x, _mm_xor_si128(_mm_srli_epi32(x, 1), _mm_srai_epi32(a, 31)))); -#elif defined(ARCH_ARM64) - return vqsubq_s32(a, b); -#endif -} - -inline v128 gv_subus_u8(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_subs_epu8(a, b); -#elif defined(ARCH_ARM64) - return vqsubq_u8(a, b); -#endif -} - -inline v128 gv_subus_u16(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_subs_epu16(a, b); -#elif defined(ARCH_ARM64) - return vqsubq_u16(a, b); -#endif -} - -inline v128 gv_subus_u32(const v128& a, const v128& b) -{ -#if defined(__SSE4_1__) - return _mm_sub_epi32(a, _mm_min_epu32(a, b)); -#elif defined(ARCH_X64) - const auto sign = _mm_set1_epi32(smin); - return _mm_andnot_si128(_mm_cmpgt_epi32(_mm_xor_si128(b, sign), _mm_xor_si128(a, sign)), _mm_sub_epi32(a, b)); -#elif defined(ARCH_ARM64) - return vqsubq_u32(a, b); -#endif -} - -inline v128 gv_subfs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_sub_ps(a, b); -#elif defined(ARCH_ARM64) - return vsubq_f32(a, b); -#endif -} - -inline v128 gv_subfd(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_sub_pd(a, b); -#elif defined(ARCH_ARM64) - return vsubq_f64(a, b); -#endif -} - -inline v128 gv_maxu8(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_max_epu8(a, b); -#elif defined(ARCH_ARM64) - return vmaxq_u8(a, b); -#endif -} - -inline v128 gv_maxu16(const v128& a, const v128& b) -{ -#if defined(__SSE4_1__) - return _mm_max_epu16(a, b); -#elif defined(ARCH_X64) - return _mm_add_epi16(_mm_subs_epu16(a, b), b); -#elif defined(ARCH_ARM64) - return vmaxq_u16(a, b); -#endif -} - -inline v128 gv_maxu32(const v128& a, const v128& b) -{ -#if defined(__SSE4_1__) - return _mm_max_epu32(a, b); -#elif defined(ARCH_X64) - const __m128i s = _mm_set1_epi32(smin); - const __m128i m = _mm_cmpgt_epi32(_mm_xor_si128(a, s), _mm_xor_si128(b, s)); - return _mm_or_si128(_mm_and_si128(m, a), _mm_andnot_si128(m, b)); -#elif defined(ARCH_ARM64) - return vmaxq_u32(a, b); -#endif -} - -inline v128 gv_maxs8(const v128& a, const v128& b) -{ -#if defined(__SSE4_1__) - return _mm_max_epi8(a, b); -#elif defined(ARCH_X64) - const __m128i m = _mm_cmpgt_epi8(a, b); - return _mm_or_si128(_mm_and_si128(m, a), _mm_andnot_si128(m, b)); -#elif defined(ARCH_ARM64) - return vmaxq_s8(a, b); -#endif -} - -inline v128 gv_maxs16(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_max_epi16(a, b); -#elif defined(ARCH_ARM64) - return vmaxq_s16(a, b); -#endif -} - -inline v128 gv_maxs32(const v128& a, const v128& b) -{ -#if defined(__SSE4_1__) - return _mm_max_epi32(a, b); -#elif defined(ARCH_X64) - const __m128i m = _mm_cmpgt_epi32(a, b); - return _mm_or_si128(_mm_and_si128(m, a), _mm_andnot_si128(m, b)); -#elif defined(ARCH_ARM64) - return vmaxq_s32(a, b); -#endif -} - -inline v128 gv_maxfs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return 
_mm_and_ps(_mm_max_ps(a, b), _mm_max_ps(b, a)); -#elif defined(ARCH_ARM64) - return vmaxq_f32(a, b); -#endif -} - -inline v128 gv_minu8(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_min_epu8(a, b); -#elif defined(ARCH_ARM64) - return vminq_u8(a, b); -#endif -} - -inline v128 gv_minu16(const v128& a, const v128& b) -{ -#if defined(__SSE4_1__) - return _mm_min_epu16(a, b); -#elif defined(ARCH_X64) - return _mm_sub_epi16(a, _mm_subs_epu16(a, b)); -#elif defined(ARCH_ARM64) - return vminq_u16(a, b); -#endif -} - -inline v128 gv_minu32(const v128& a, const v128& b) -{ -#if defined(__SSE4_1__) - return _mm_min_epu32(a, b); -#elif defined(ARCH_X64) - const __m128i s = _mm_set1_epi32(smin); - const __m128i m = _mm_cmpgt_epi32(_mm_xor_si128(a, s), _mm_xor_si128(b, s)); - return _mm_or_si128(_mm_andnot_si128(m, a), _mm_and_si128(m, b)); -#elif defined(ARCH_ARM64) - return vminq_u32(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline asmjit::vec_type gv_minu32(A&& a, B&& b) -{ -#if defined(ARCH_X64) - if (utils::has_sse41()) - FOR_X64(binary_op, 4, kIdMovdqa, kIdPminud, kIdVpminud, kIdVpminud, std::forward(a), std::forward(b)); - auto s = gv_bcst32(0x80000000); - auto x = gv_xor32(a, s); - auto m = gv_gts32(std::move(x), gv_xor32(std::move(s), b)); - auto z = gv_and32(m, std::move(b)); - return gv_or32(std::move(z), gv_andn32(std::move(m), std::move(a))); -#endif - return {}; -} - -inline v128 gv_mins8(const v128& a, const v128& b) -{ -#if defined(__SSE4_1__) - return _mm_min_epi8(a, b); -#elif defined(ARCH_X64) - const __m128i m = _mm_cmpgt_epi8(a, b); - return _mm_or_si128(_mm_andnot_si128(m, a), _mm_and_si128(m, b)); -#elif defined(ARCH_ARM64) - return vminq_s8(a, b); -#endif -} - -inline v128 gv_mins16(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_min_epi16(a, b); -#elif defined(ARCH_ARM64) - return vminq_s16(a, b); -#endif -} - -inline v128 gv_mins32(const v128& a, const v128& b) -{ -#if defined(__SSE4_1__) - return _mm_min_epi32(a, b); -#elif defined(ARCH_X64) - const __m128i m = _mm_cmpgt_epi32(a, b); - return _mm_or_si128(_mm_andnot_si128(m, a), _mm_and_si128(m, b)); -#elif defined(ARCH_ARM64) - return vminq_s32(a, b); -#endif -} - -inline v128 gv_minfs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_or_ps(_mm_min_ps(a, b), _mm_min_ps(b, a)); -#elif defined(ARCH_ARM64) - return vminq_f32(a, b); -#endif -} - -inline v128 gv_eq8(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_cmpeq_epi8(a, b); -#elif defined(ARCH_ARM64) - return vceqq_s8(a, b); -#endif -} - -inline v128 gv_eq16(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_cmpeq_epi16(a, b); -#elif defined(ARCH_ARM64) - return vceqq_s16(a, b); -#endif -} - -inline v128 gv_eq32(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_cmpeq_epi32(a, b); -#elif defined(ARCH_ARM64) - return vceqq_s32(a, b); -#endif -} - -// Ordered and equal -inline v128 gv_eqfs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_cmpeq_ps(a, b); -#elif defined(ARCH_ARM64) - return vceqq_f32(a, b); -#endif -} - -// Unordered or not equal -inline v128 gv_neqfs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_cmpneq_ps(a, b); -#elif defined(ARCH_ARM64) - return ~vceqq_f32(a, b); -#endif -} - -inline v128 gv_gtu8(const v128& a, const v128& b) -{ -#if defined(__AVX512VL__) && defined(__AVX512BW__) - return _mm_movm_epi8(_mm_cmpgt_epu8_mask(a, b)); -#elif defined(ARCH_X64) - return 
_mm_cmpeq_epi8(_mm_cmpeq_epi8(a, _mm_min_epu8(a, b)), _mm_setzero_si128()); -#elif defined(ARCH_ARM64) - return vcgtq_u8(a, b); -#endif -} - -inline v128 gv_gtu16(const v128& a, const v128& b) -{ -#if defined(__AVX512VL__) && defined(__AVX512BW__) - return _mm_movm_epi16(_mm_cmpgt_epu16_mask(a, b)); -#elif defined(__SSE4_1__) - return _mm_cmpeq_epi16(_mm_cmpeq_epi16(a, _mm_min_epu16(a, b)), _mm_setzero_si128()); -#elif defined(ARCH_X64) - return _mm_cmpeq_epi16(_mm_cmpeq_epi16(_mm_subs_epu16(a, b), _mm_setzero_si128()), _mm_setzero_si128()); -#elif defined(ARCH_ARM64) - return vcgtq_u16(a, b); -#endif -} - -inline v128 gv_gtu32(const v128& a, const v128& b) -{ -#if defined(__AVX512VL__) && defined(__AVX512DQ__) - return _mm_movm_epi32(_mm_cmpgt_epu32_mask(a, b)); -#elif defined(__SSE4_1__) - return _mm_cmpeq_epi32(_mm_cmpeq_epi32(a, _mm_min_epu32(a, b)), _mm_setzero_si128()); -#elif defined(ARCH_X64) - const auto sign = _mm_set1_epi32(smin); - return _mm_cmpgt_epi32(_mm_xor_si128(a, sign), _mm_xor_si128(b, sign)); -#elif defined(ARCH_ARM64) - return vcgtq_u32(a, b); -#endif -} - -// Ordered and greater than -inline v128 gv_gtfs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_cmpgt_ps(a, b); -#elif defined(ARCH_ARM64) - return vcgtq_f32(a, b); -#endif -} - -// Ordered and less than -inline v128 gv_ltfs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_cmplt_ps(a, b); -#elif defined(ARCH_ARM64) - return vcltq_f32(a, b); -#endif -} - -// Unordered or less or equal -inline v128 gv_ngtfs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_cmpngt_ps(a, b); -#elif defined(ARCH_ARM64) - return ~vcgtq_f32(a, b); -#endif -} - -// Unordered or greater or equal -inline v128 gv_nlefs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_cmpnle_ps(a, b); -#elif defined(ARCH_ARM64) - return ~vcleq_f32(a, b); -#endif -} - -inline v128 gv_geu8(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_cmpeq_epi8(b, _mm_min_epu8(a, b)); -#elif defined(ARCH_ARM64) - return vcgeq_u8(a, b); -#endif -} - -inline v128 gv_geu16(const v128& a, const v128& b) -{ -#if defined(__SSE4_1__) - return _mm_cmpeq_epi16(b, _mm_min_epu16(a, b)); -#elif defined(ARCH_X64) - return _mm_cmpeq_epi16(_mm_subs_epu16(b, a), _mm_setzero_si128()); -#elif defined(ARCH_ARM64) - return vcgeq_u16(a, b); -#endif -} - -inline v128 gv_geu32(const v128& a, const v128& b) -{ -#if defined(__SSE4_1__) - return _mm_cmpeq_epi32(b, _mm_min_epu32(a, b)); -#elif defined(ARCH_X64) - const auto sign = _mm_set1_epi32(smin); - return _mm_cmpeq_epi32(_mm_cmpgt_epi32(_mm_xor_si128(b, sign), _mm_xor_si128(a, sign)), _mm_setzero_si128()); -#elif defined(ARCH_ARM64) - return vcgeq_u32(a, b); -#endif -} - -// Ordered and not less than -inline v128 gv_gefs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_cmpge_ps(a, b); -#elif defined(ARCH_ARM64) - return vcgeq_f32(a, b); -#endif -} - -// Unordered or less than -inline v128 gv_ngefs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_cmpnge_ps(a, b); -#elif defined(ARCH_ARM64) - return ~vcgeq_f32(a, b); -#endif -} - -inline v128 gv_gts8(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_cmpgt_epi8(a, b); -#elif defined(ARCH_ARM64) - return vcgtq_s8(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline asmjit::vec_type gv_gts8(A&& a, B&& b) -{ - FOR_X64(binary_op, 1, kIdMovdqa, kIdPcmpgtb, kIdVpcmpgtb, kIdNone, std::forward(a), std::forward(b)); - return {}; 
-} - -inline v128 gv_gts16(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_cmpgt_epi16(a, b); -#elif defined(ARCH_ARM64) - return vcgtq_s16(a, b); -#endif -} - -inline v128 gv_gts32(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_cmpgt_epi32(a, b); -#elif defined(ARCH_ARM64) - return vcgtq_s32(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline asmjit::vec_type gv_gts32(A&& a, B&& b) -{ - FOR_X64(binary_op, 4, kIdMovdqa, kIdPcmpgtd, kIdVpcmpgtd, kIdNone, std::forward(a), std::forward(b)); - return {}; -} - -inline v128 gv_avgu8(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_avg_epu8(a, b); -#elif defined(ARCH_ARM64) - return vrhaddq_u8(a, b); -#endif -} - -inline v128 gv_avgu16(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_avg_epu16(a, b); -#elif defined(ARCH_ARM64) - return vrhaddq_u16(a, b); -#endif -} - -inline v128 gv_avgu32(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - const auto ones = _mm_set1_epi32(-1); - const auto summ = gv_sub32(gv_add32(a, b), ones); - const auto carry = _mm_slli_epi32(gv_geu32(a, summ), 31); - return _mm_or_si128(carry, _mm_srli_epi32(summ, 1)); -#elif defined(ARCH_ARM64) - return vrhaddq_u32(a, b); -#endif -} - -inline v128 gv_avgs8(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - const v128 sign = _mm_set1_epi8(smin); - return gv_avgu8(a ^ sign, b ^ sign) ^ sign; -#elif defined(ARCH_ARM64) - return vrhaddq_s8(a, b); -#endif -} - -inline v128 gv_avgs16(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - const v128 sign = _mm_set1_epi16(smin); - return gv_avgu16(a ^ sign, b ^ sign) ^ sign; -#elif defined(ARCH_ARM64) - return vrhaddq_s16(a, b); -#endif -} - -inline v128 gv_avgs32(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - const v128 sign = _mm_set1_epi32(smin); - return gv_avgu32(a ^ sign, b ^ sign) ^ sign; -#elif defined(ARCH_ARM64) - return vrhaddq_s32(a, b); -#endif -} - -inline v128 gv_divfs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_div_ps(a, b); -#elif defined(ARCH_ARM64) - return vdivq_f32(a, b); -#endif -} - -inline v128 gv_sqrtfs(const v128& a) -{ -#if defined(ARCH_X64) - return _mm_sqrt_ps(a); -#elif defined(ARCH_ARM64) - return vsqrtq_f32(a); -#endif -} - -inline v128 gv_fmafs(const v128& a, const v128& b, const v128& c) -{ -#if defined(ARCH_X64) && defined(__FMA__) - return _mm_fmadd_ps(a, b, c); -#elif defined(__FMA4__) - return _mm_macc_ps(a, b, c); -#elif defined(ARCH_X64) - // This is inaccurate implementation -#ifdef __AVX__ - const __m128 r = _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(_mm256_cvtps_pd(a), _mm256_cvtps_pd(b)), _mm256_cvtps_pd(c))); -#else - const __m128d a0 = _mm_cvtps_pd(a); - const __m128d a1 = _mm_cvtps_pd(_mm_movehl_ps(a, a)); - const __m128d b0 = _mm_cvtps_pd(b); - const __m128d b1 = _mm_cvtps_pd(_mm_movehl_ps(b, b)); - const __m128d c0 = _mm_cvtps_pd(c); - const __m128d c1 = _mm_cvtps_pd(_mm_movehl_ps(c, c)); - const __m128d m0 = _mm_mul_pd(a0, b0); - const __m128d m1 = _mm_mul_pd(a1, b1); - const __m128d r0 = _mm_add_pd(m0, c0); - const __m128d r1 = _mm_add_pd(m1, c1); - const __m128 r = _mm_movelh_ps(_mm_cvtpd_ps(r0), _mm_cvtpd_ps(r1)); -#endif - return r; -#elif defined(ARCH_ARM64) - return vfmaq_f32(c, a, b); -#else - v128 r; - for (int i = 0; i < 4; i++) - { - r._f[i] = std::fmaf(a._f[i], b._f[i], c._f[i]); } - return r; -#endif -} -inline v128 gv_muladdfs(const v128& a, const v128& b, const v128& c) -{ -#if defined(ARCH_X64) && defined(__FMA__) 
- return _mm_fmadd_ps(a, b, c); -#elif defined(__FMA4__) - return _mm_macc_ps(a, b, c); -#elif defined(ARCH_ARM64) - return vfmaq_f32(c, a, b); -#elif defined(ARCH_X64) - return _mm_add_ps(_mm_mul_ps(a, b), c); -#endif -} + template + requires(asmjit::any_operand_v) + inline asmjit::vec_type gv_gts32(A&&, B&&); -// -> ssat((a * b * 2 + (c << 16) + 0x8000) >> 16) -inline v128 gv_rmuladds_hds16(const v128& a, const v128& b, const v128& c) -{ -#if defined(ARCH_ARM64) -#if defined(ANDROID) - // This function used in optimized PPU interpreter only, we do not use interperters in android - return a; -#else - return vqrdmlahq_s16(c, a, b); -#endif -#elif defined(ARCH_X64) - const auto x80 = _mm_set1_epi16(0x80); // 0x80 * 0x80 = 0x4000, add this to the product - const auto al = _mm_unpacklo_epi16(a, x80); - const auto ah = _mm_unpackhi_epi16(a, x80); - const auto bl = _mm_unpacklo_epi16(b, x80); - const auto bh = _mm_unpackhi_epi16(b, x80); - const auto ml = _mm_srai_epi32(_mm_madd_epi16(al, bl), 15); - const auto mh = _mm_srai_epi32(_mm_madd_epi16(ah, bh), 15); - const auto cl = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), c), 16); - const auto ch = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), c), 16); - const auto sl = _mm_add_epi32(ml, cl); - const auto sh = _mm_add_epi32(mh, ch); - return _mm_packs_epi32(sl, sh); -#endif -} - -// -> ssat((a * b * 2 + 0x8000) >> 16) -inline v128 gv_rmuls_hds16(const v128& a, const v128& b) -{ -#if defined(ARCH_ARM64) - return vqrdmulhq_s16(a, b); -#elif defined(ARCH_X64) - const auto x80 = _mm_set1_epi16(0x80); // 0x80 * 0x80 = 0x4000, add this to the product - const auto al = _mm_unpacklo_epi16(a, x80); - const auto ah = _mm_unpackhi_epi16(a, x80); - const auto bl = _mm_unpacklo_epi16(b, x80); - const auto bh = _mm_unpackhi_epi16(b, x80); - const auto ml = _mm_srai_epi32(_mm_madd_epi16(al, bl), 15); - const auto mh = _mm_srai_epi32(_mm_madd_epi16(ah, bh), 15); - return _mm_packs_epi32(ml, mh); -#endif -} - -// -> ssat((a * b * 2) >> 16) -inline v128 gv_muls_hds16(const v128& a, const v128& b) -{ -#if defined(ARCH_ARM64) - return vqdmulhq_s16(a, b); -#elif defined(ARCH_X64) - const auto m = _mm_or_si128(_mm_srli_epi16(_mm_mullo_epi16(a, b), 15), _mm_slli_epi16(_mm_mulhi_epi16(a, b), 1)); - const auto s = _mm_cmpeq_epi16(m, _mm_set1_epi16(-0x8000)); // detect special case (positive 0x8000) - return _mm_xor_si128(m, s); -#endif -} - -inline v128 gv_muladd16(const v128& a, const v128& b, const v128& c) -{ -#if defined(ARCH_X64) - return _mm_add_epi16(_mm_mullo_epi16(a, b), c); -#elif defined(ARCH_ARM64) - return vmlaq_s16(c, a, b); -#endif -} - -inline v128 gv_mul16(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_mullo_epi16(a, b); -#elif defined(ARCH_ARM64) - return vmulq_s16(a, b); -#endif -} - -inline v128 gv_mul32(const v128& a, const v128& b) -{ -#if defined(__SSE4_1__) - return _mm_mullo_epi32(a, b); -#elif defined(ARCH_X64) - const __m128i lows = _mm_shuffle_epi32(_mm_mul_epu32(a, b), 8); - const __m128i highs = _mm_shuffle_epi32(_mm_mul_epu32(_mm_srli_epi64(a, 32), _mm_srli_epi64(b, 32)), 8); - return _mm_unpacklo_epi32(lows, highs); -#elif defined(ARCH_ARM64) - return vmulq_s32(a, b); -#endif -} - -inline v128 gv_mulfs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_mul_ps(a, b); -#elif defined(ARCH_ARM64) - return vmulq_f32(a, b); -#endif -} - -inline v128 gv_mulfs(const v128& a, f32 b) -{ -#if defined(ARCH_X64) - return _mm_mul_ps(a, _mm_set_ps1(b)); -#elif defined(ARCH_ARM64) - return 
vmulq_n_f32(a, b); -#endif -} - -inline v128 gv_hadds8x2(const v128& a) -{ -#if defined(__SSSE3__) - return _mm_maddubs_epi16(_mm_set1_epi8(1), a); -#elif defined(ARCH_X64) - return _mm_add_epi16(_mm_srai_epi16(a, 8), _mm_srai_epi16(_mm_slli_epi16(a, 8), 8)); -#elif defined(ARCH_ARM64) - return vpaddlq_s8(a); -#endif -} - -inline v128 gv_hadds8x4(const v128& a, const v128& c) -{ -#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) - return _mm_dpbusd_epi32(c, _mm_set1_epi8(1), a); -#elif defined(__SSSE3__) - return _mm_add_epi32(_mm_madd_epi16(_mm_maddubs_epi16(_mm_set1_epi8(1), a), _mm_set1_epi16(1)), c); -#elif defined(ARCH_X64) - return _mm_add_epi32(_mm_madd_epi16(_mm_add_epi16(_mm_srai_epi16(a, 8), _mm_srai_epi16(_mm_slli_epi16(a, 8), 8)), _mm_set1_epi16(1)), c); -#elif defined(ARCH_ARM64) - return vaddq_s32(vpaddlq_s16(vpaddlq_s8(a)), c); -#endif -} - -inline v128 gv_haddu8x2(const v128& a) -{ -#if defined(__SSSE3__) - return _mm_maddubs_epi16(a, _mm_set1_epi8(1)); -#elif defined(ARCH_X64) - return _mm_add_epi16(_mm_srli_epi16(a, 8), _mm_and_si128(a, _mm_set1_epi16(0x00ff))); -#elif defined(ARCH_ARM64) - return vpaddlq_u8(a); -#endif -} - -inline v128 gv_haddu8x4(const v128& a) -{ -#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) - return _mm_dpbusd_epi32(_mm_setzero_si128(), a, _mm_set1_epi8(1)); -#elif defined(__SSSE3__) - return _mm_madd_epi16(_mm_maddubs_epi16(a, _mm_set1_epi8(1)), _mm_set1_epi16(1)); -#elif defined(ARCH_X64) - return _mm_madd_epi16(_mm_add_epi16(_mm_srli_epi16(a, 8), _mm_and_si128(a, _mm_set1_epi16(0x00ff))), _mm_set1_epi16(1)); -#elif defined(ARCH_ARM64) - return vpaddlq_u16(vpaddlq_u8(a)); -#endif -} - -inline v128 gv_hadds16x2(const v128& a, const v128& c) -{ -#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) - return _mm_dpwssd_epi32(c, a, _mm_set1_epi8(1)); -#elif defined(ARCH_X64) - return _mm_add_epi32(_mm_madd_epi16(a, _mm_set1_epi16(1)), c); -#elif defined(ARCH_ARM64) - return vaddq_s32(vpaddlq_s16(a), c); -#endif -} - -// Unsigned bytes from a, signed bytes from b, 32-bit accumulator c -inline v128 gv_dotu8s8x4(const v128& a, const v128& b, const v128& c) -{ -#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) - return _mm_dpbusd_epi32(c, a, b); -#elif defined(ARCH_X64) - const __m128i ah = _mm_srli_epi16(a, 8); - const __m128i al = _mm_and_si128(a, _mm_set1_epi16(0x00ff)); - const __m128i bh = _mm_srai_epi16(b, 8); - const __m128i bl = _mm_srai_epi16(_mm_slli_epi16(b, 8), 8); - const __m128i mh = _mm_madd_epi16(ah, bh); - const __m128i ml = _mm_madd_epi16(al, bl); - const __m128i x = _mm_add_epi32(mh, ml); - return _mm_add_epi32(c, x); -#elif defined(__ARM_FEATURE_MATMUL_INT8) - return vusdotq_s32(c, a, b); -#elif defined(ARCH_ARM64) - const auto l = vpaddlq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), vmovl_s8(vget_low_s8(b)))); - const auto h = vpaddlq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))), vmovl_s8(vget_high_s8(b)))); - return vaddq_s32(c, vaddq_s32(vuzp1q_s32(l, h), vuzp2q_s32(l, h))); -#endif -} - -inline v128 gv_dotu8x4(const v128& a, const v128& b, const v128& c) -{ -#if defined(ARCH_X64) - const __m128i ah = _mm_srli_epi16(a, 8); - const __m128i al = _mm_and_si128(a, _mm_set1_epi16(0x00ff)); - const __m128i bh = _mm_srli_epi16(b, 8); - const __m128i bl = _mm_and_si128(b, _mm_set1_epi16(0x00ff)); - const __m128i mh = _mm_madd_epi16(ah, bh); - const __m128i ml = _mm_madd_epi16(al, bl); - 
const __m128i x = _mm_add_epi32(mh, ml); - return _mm_add_epi32(c, x); -#elif defined(__ARM_FEATURE_DOTPROD) - return vdotq_u32(c, a, b); -#elif defined(ARCH_ARM64) - const auto l = vpaddlq_u16(vmulq_u16(vmovl_u8(vget_low_u8(a)), vmovl_u8(vget_low_u8(b)))); - const auto h = vpaddlq_u16(vmulq_u16(vmovl_u8(vget_high_u8(a)), vmovl_u8(vget_high_u8(b)))); - return vaddq_u32(c, vaddq_u32(vuzp1q_u32(l, h), vuzp2q_u32(l, h))); -#endif -} - -inline v128 gv_dots16x2(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_madd_epi16(a, b); -#elif defined(ARCH_ARM64) - const auto ml = vmull_s16(vget_low_s16(a), vget_low_s16(b)); - const auto mh = vmull_s16(vget_high_s16(a), vget_high_s16(b)); - const auto sl = vpadd_s32(vget_low_s32(ml), vget_high_s32(ml)); - const auto sh = vpadd_s32(vget_low_s32(mh), vget_high_s32(mh)); - return vcombine_s32(sl, sh); -#endif -} - -// Signed s16 from a and b, 32-bit accumulator c -inline v128 gv_dots16x2(const v128& a, const v128& b, const v128& c) -{ -#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) - return _mm_dpwssd_epi32(c, a, b); -#else - return gv_add32(c, gv_dots16x2(a, b)); -#endif -} - -inline v128 gv_dotu16x2(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - const auto ml = _mm_mullo_epi16(a, b); // low results - const auto mh = _mm_mulhi_epu16(a, b); // high results - const auto ls = _mm_add_epi32(_mm_srli_epi32(ml, 16), _mm_and_si128(ml, _mm_set1_epi32(0x0000ffff))); - const auto hs = _mm_add_epi32(_mm_slli_epi32(mh, 16), _mm_and_si128(mh, _mm_set1_epi32(0xffff0000))); - return _mm_add_epi32(ls, hs); -#elif defined(ARCH_ARM64) - const auto ml = vmull_u16(vget_low_u16(a), vget_low_u16(b)); - const auto mh = vmull_u16(vget_high_u16(a), vget_high_u16(b)); - const auto sl = vpadd_u32(vget_low_u32(ml), vget_high_u32(ml)); - const auto sh = vpadd_u32(vget_low_u32(mh), vget_high_u32(mh)); - return vcombine_u32(sl, sh); -#endif -} - -// Unsigned bytes from a, signed bytes from b, 32-bit accumulator c -inline v128 gv_dots_u8s8x4(const v128& a, const v128& b, const v128& c) -{ -#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) - return _mm_dpbusds_epi32(c, a, b); -#elif defined(ARCH_X64) - const __m128i ah = _mm_srli_epi16(a, 8); - const __m128i al = _mm_and_si128(a, _mm_set1_epi16(0x00ff)); - const __m128i bh = _mm_srai_epi16(b, 8); - const __m128i bl = _mm_srai_epi16(_mm_slli_epi16(b, 8), 8); - const __m128i mh = _mm_madd_epi16(ah, bh); - const __m128i ml = _mm_madd_epi16(al, bl); - return gv_adds_s32(c, _mm_add_epi32(mh, ml)); -#elif defined(ARCH_ARM64) - const auto l = vpaddlq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), vmovl_s8(vget_low_s8(b)))); - const auto h = vpaddlq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))), vmovl_s8(vget_high_s8(b)))); - return vqaddq_s32(c, vaddq_s32(vuzp1q_s32(l, h), vuzp2q_s32(l, h))); -#endif -} - -// Signed s16 from a and b, 32-bit accumulator c; signed saturation -inline v128 gv_dots_s16x2(const v128& a, const v128& b, const v128& c) -{ -#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) - return _mm_dpwssds_epi32(c, a, b); -#else - const auto ab = gv_dots16x2(a, b); - const auto s0 = gv_adds_s32(ab, c); - const auto s1 = gv_eq32(ab, gv_bcst32(0x80000000)); // +0x80000000, negative c -> c^0x80000000; otherwise 0x7fffffff - const auto s2 = gv_select32(gv_gts32(gv_bcst32(0), c), gv_xor32(c, gv_bcst32(0x80000000)), gv_bcst32(0x7fffffff)); - return gv_select32(s1, s2, s0); -#endif -} - -// 
Multiply s16 elements 0, 2, 4, 6 to produce s32 results in corresponding lanes -inline v128 gv_mul_even_s16(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - const auto c = _mm_set1_epi32(0x0000ffff); - return _mm_madd_epi16(_mm_and_si128(a, c), _mm_and_si128(b, c)); -#else - // TODO - return gv_mul32(gv_sar32(gv_shl32(a, 16), 16), gv_sar32(gv_shl32(b, 16), 16)); -#endif -} - -// Multiply u16 elements 0, 2, 4, 6 to produce u32 results in corresponding lanes -inline v128 gv_mul_even_u16(const v128& a, const v128& b) -{ -#if defined(__SSE4_1__) || defined(ARCH_ARM64) - const auto c = gv_bcst32(0x0000ffff); - return gv_mul32(a & c, b & c); -#elif defined(ARCH_X64) - const auto ml = _mm_mullo_epi16(a, b); - const auto mh = _mm_mulhi_epu16(a, b); - return _mm_or_si128(_mm_and_si128(ml, _mm_set1_epi32(0x0000ffff)), _mm_slli_epi32(mh, 16)); -#endif -} - -// Multiply s16 elements 1, 3, 5, 7 to produce s32 results in corresponding lanes -inline v128 gv_mul_odds_s16(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_madd_epi16(_mm_srli_epi32(a, 16), _mm_srli_epi32(b, 16)); -#else - return gv_mul32(gv_sar32(a, 16), gv_sar32(b, 16)); -#endif -} - -// Multiply u16 elements 1, 3, 5, 7 to produce u32 results in corresponding lanes -inline v128 gv_mul_odds_u16(const v128& a, const v128& b) -{ -#if defined(__SSE4_1__) || defined(ARCH_ARM64) - return gv_mul32(gv_shr32(a, 16), gv_shr32(b, 16)); -#elif defined(ARCH_X64) - const auto ml = _mm_mullo_epi16(a, b); - const auto mh = _mm_mulhi_epu16(a, b); - return _mm_or_si128(_mm_and_si128(mh, _mm_set1_epi32(0xffff0000)), _mm_srli_epi32(ml, 16)); -#endif -} - -inline v128 gv_cvts32_tofs(const v128& src) -{ -#if defined(ARCH_X64) - return _mm_cvtepi32_ps(src); -#elif defined(ARCH_ARM64) - return vcvtq_f32_s32(src); -#endif -} - -inline v128 gv_cvtu32_tofs(const v128& src) -{ -#if defined(__AVX512VL__) - return _mm_cvtepu32_ps(src); -#elif defined(ARCH_X64) - const auto fix = _mm_and_ps(_mm_castsi128_ps(_mm_srai_epi32(src, 31)), _mm_set1_ps(0x80000000)); - return _mm_add_ps(_mm_cvtepi32_ps(_mm_and_si128(src, _mm_set1_epi32(0x7fffffff))), fix); -#elif defined(ARCH_ARM64) - return vcvtq_f32_u32(src); -#endif -} - -inline v128 gv_cvtfs_tos32(const v128& src) -{ -#if defined(ARCH_X64) - return _mm_cvttps_epi32(src); -#elif defined(ARCH_ARM64) - return vcvtq_s32_f32(src); -#endif -} - -inline v128 gv_cvtfs_tou32(const v128& src) -{ -#if defined(__AVX512VL__) - return _mm_cvttps_epu32(src); -#elif defined(ARCH_X64) - const auto c1 = _mm_cvttps_epi32(src); - const auto s1 = _mm_srai_epi32(c1, 31); - const auto c2 = _mm_cvttps_epi32(_mm_sub_ps(src, _mm_set1_ps(2147483648.))); - return _mm_or_si128(c1, _mm_and_si128(c2, s1)); -#elif defined(ARCH_ARM64) - return vcvtq_u32_f32(src); -#endif -} - -namespace utils -{ - inline f32 roundevenf32(f32 arg) + template + requires(asmjit::any_operand_v) + inline auto gv_and32(A&& a, B&& b) { - u32 val = std::bit_cast(arg); - u32 exp = (val >> 23) & 0xff; - u32 abs = val & 0x7fffffff; - - if (exp >= 127 + 23) - { - // Big enough, NaN or INF - return arg; - } - else if (exp >= 127) - { - u32 int_pos = (127 + 23) - exp; - u32 half_pos = int_pos - 1; - u32 half_bit = 1u << half_pos; - u32 int_bit = 1u << int_pos; - if (val & (int_bit | (half_bit - 1))) - val += half_bit; - val &= ~(int_bit - 1); - } - else if (exp == 126 && abs > 0x3f000000) - { - val &= 0x80000000; - val |= 0x3f800000; - } - else - { - val &= 0x80000000; - } - - return std::bit_cast(val); + FOR_X64(binary_op, 4, kIdMovdqa, kIdPand, kIdVpand, 
kIdVpandd, std::forward(a), std::forward(b)); } -} // namespace utils -#if defined(ARCH_X64) -template -const auto sse41_roundf = build_function_asm<__m128 (*)(__m128)>("sse41_roundf", [](native_asm& c, native_args&) + template + requires(asmjit::any_operand_v) + inline auto gv_andfs(A&& a, B&& b) + { + FOR_X64(binary_op, 4, kIdMovaps, kIdAndps, kIdVandps, kIdVandps, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_andn32(A&& a, B&& b) + { + FOR_X64(binary_op, 4, kIdMovdqa, kIdPandn, kIdVpandn, kIdVpandnd, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_andnfs(A&& a, B&& b) + { + FOR_X64(binary_op, 4, kIdMovaps, kIdAndnps, kIdVandnps, kIdVandnps, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_or32(A&& a, B&& b) + { + FOR_X64(binary_op, 4, kIdMovdqa, kIdPor, kIdVpor, kIdVpord, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_orfs(A&& a, B&& b) + { + FOR_X64(binary_op, 4, kIdMovaps, kIdOrps, kIdVorps, kIdVorps, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_xor32(A&& a, B&& b) + { + FOR_X64(binary_op, 4, kIdMovdqa, kIdPxor, kIdVpxor, kIdVpxord, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_xorfs(A&& a, B&& b) + { + FOR_X64(binary_op, 4, kIdMovaps, kIdXorps, kIdVxorps, kIdVxorps, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_not32(A&& a) + { +#if defined(ARCH_X64) + asmjit::vec_type ones = g_vc->vec_alloc(); + g_vc->pcmpeqd(ones, ones); + FOR_X64(binary_op, 4, kIdMovdqa, kIdPxor, kIdVpxor, kIdVpxord, std::move(ones), std::forward(a)); +#endif + } + + template + requires(asmjit::any_operand_v) + inline auto gv_notfs(A&& a) + { +#if defined(ARCH_X64) + asmjit::vec_type ones = g_vc->vec_alloc(); + g_vc->pcmpeqd(ones, ones); + FOR_X64(binary_op, 4, kIdMovaps, kIdXorps, kIdVxorps, kIdVxorps, std::move(ones), std::forward(a)); +#endif + } + + template + requires(asmjit::any_operand_v) + inline auto gv_shl16(A&& a, u32 count) + { + FOR_X64(unary_op, kIdPsllw, kIdVpsllw, std::forward(a), count); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_shl32(A&& a, u32 count) + { + FOR_X64(unary_op, kIdPslld, kIdVpslld, std::forward(a), count); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_shl64(A&& a, u32 count) + { + FOR_X64(unary_op, kIdPsllq, kIdVpsllq, std::forward(a), count); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_shr16(A&& a, u32 count) + { + FOR_X64(unary_op, kIdPsrlw, kIdVpsrlw, std::forward(a), count); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_shr32(A&& a, u32 count) + { + FOR_X64(unary_op, kIdPsrld, kIdVpsrld, std::forward(a), count); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_shr64(A&& a, u32 count) + { + FOR_X64(unary_op, kIdPsrlq, kIdVpsrlq, std::forward(a), count); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_sar16(A&& a, u32 count) + { + FOR_X64(unary_op, kIdPsraw, kIdVpsraw, std::forward(a), count); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_sar32(A&& a, u32 count) + { + FOR_X64(unary_op, kIdPsrad, kIdVpsrad, std::forward(a), count); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_sar64(A&& a, u32 count) + { + if (count >= 
64) + count = 63; +#if defined(ARCH_X64) + using enum asmjit::x86::Inst::Id; + if (utils::has_avx512()) + return asmjit::unary_op(kIdNone, kIdVpsraq, std::forward(a), count); + g_vc->fail_flag = true; + return std::forward(a); +#endif + } + + template + requires(asmjit::any_operand_v) + inline auto gv_add8(A&& a, B&& b) + { + FOR_X64(binary_op, 1, kIdMovdqa, kIdPaddb, kIdVpaddb, kIdNone, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_add16(A&& a, B&& b) + { + FOR_X64(binary_op, 2, kIdMovdqa, kIdPaddw, kIdVpaddw, kIdNone, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_add32(A&& a, B&& b) + { + FOR_X64(binary_op, 4, kIdMovdqa, kIdPaddd, kIdVpaddd, kIdVpaddd, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_add64(A&& a, B&& b) + { + FOR_X64(binary_op, 8, kIdMovdqa, kIdPaddq, kIdVpaddq, kIdVpaddq, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_adds_s8(A&& a, B&& b) + { + FOR_X64(binary_op, 1, kIdMovdqa, kIdPaddsb, kIdVpaddsb, kIdNone, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_adds_s16(A&& a, B&& b) + { + FOR_X64(binary_op, 2, kIdMovdqa, kIdPaddsw, kIdVpaddsw, kIdNone, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_adds_s32(A&& a, B&& b) + { +#if defined(ARCH_X64) + auto s = gv_add32(a, b); + auto m = gv_and32(gv_xor32(std::forward(a), s), gv_xor32(std::forward(b), s)); + auto x = gv_sar32(m, 31); + auto y = gv_sar32(gv_and32(s, std::move(m)), 31); + auto z = gv_xor32(gv_shr32(x, 1), std::move(y)); + return gv_xor32(std::move(z), gv_or32(std::move(s), std::move(x))); +#endif + } + + template + requires(asmjit::any_operand_v) + inline auto gv_addus_u8(A&& a, B&& b) + { + FOR_X64(binary_op, 1, kIdMovdqa, kIdPaddusb, kIdVpaddusb, kIdNone, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_addus_u16(A&& a, B&& b) + { + FOR_X64(binary_op, 2, kIdMovdqa, kIdPaddusw, kIdVpaddusw, kIdNone, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline asmjit::vec_type gv_minu32(A&& a, B&& b) + { +#if defined(ARCH_X64) + if (utils::has_sse41()) + FOR_X64(binary_op, 4, kIdMovdqa, kIdPminud, kIdVpminud, kIdVpminud, std::forward(a), std::forward(b)); + auto s = gv_bcst32(0x80000000); + auto x = gv_xor32(a, s); + auto m = gv_gts32(std::move(x), gv_xor32(std::move(s), b)); + auto z = gv_and32(m, std::move(b)); + return gv_or32(std::move(z), gv_andn32(std::move(m), std::move(a))); +#endif + return {}; + } + + template + requires(asmjit::any_operand_v) + inline asmjit::vec_type gv_addus_u32(A&& a, B&& b) + { +#if defined(ARCH_X64) + if (utils::has_sse41()) + return gv_add32(gv_minu32(std::forward(b), gv_not32(a)), std::forward(a)); + auto s = gv_add32(a, b); + auto x = gv_xor32(std::forward(b), gv_bcst32(0x80000000)); + auto y = gv_xor32(std::forward(a), gv_bcst32(0x7fffffff)); + return gv_or32(std::move(s), gv_gts32(std::move(x), std::move(y))); +#endif + return {}; + } + + template + requires(asmjit::any_operand_v) + inline auto gv_sub8(A&& a, B&& b) + { + FOR_X64(binary_op, 1, kIdMovdqa, kIdPsubb, kIdVpsubb, kIdNone, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline asmjit::vec_type gv_gts8(A&& a, B&& b) + { + FOR_X64(binary_op, 1, kIdMovdqa, 
kIdPcmpgtb, kIdVpcmpgtb, kIdNone, std::forward(a), std::forward(b)); + return {}; + } + + template + requires(asmjit::any_operand_v) + inline asmjit::vec_type gv_gts32(A&& a, B&& b) + { + FOR_X64(binary_op, 4, kIdMovdqa, kIdPcmpgtd, kIdVpcmpgtd, kIdNone, std::forward(a), std::forward(b)); + return {}; + } + + template + requires(asmjit::any_operand_v) + inline asmjit::vec_type gv_signselect8(A&& bits, B&& _true, C&& _false) { - static_assert(Mode < 4); using namespace asmjit; +#if defined(ARCH_X64) if (utils::has_avx()) - c.vroundps(x86::xmm0, x86::xmm0, 8 + Mode); - else if (utils::has_sse41()) - c.roundps(x86::xmm0, x86::xmm0, 8 + Mode); - else - c.jmp(+[](__m128 a) -> __m128 - { - v128 r = a; - for (u32 i = 0; i < 4; i++) - if constexpr (Mode == 0) - r._f[i] = utils::roundevenf32(r._f[i]); - else if constexpr (Mode == 1) - r._f[i] = ::floorf(r._f[i]); - else if constexpr (Mode == 2) - r._f[i] = ::ceilf(r._f[i]); - else if constexpr (Mode == 3) - r._f[i] = ::truncf(r._f[i]); - return r; - }); - c.ret(); - }); + { + Operand arg0{}; + Operand arg1 = arg_eval(std::forward(bits), 16); + Operand arg2 = arg_eval(std::forward(_true), 16); + Operand arg3 = arg_eval(std::forward(_false), 16); + if constexpr (!std::is_reference_v) + arg0.isReg() ? arg_free(bits) : arg0.copyFrom(arg1); + if constexpr (!std::is_reference_v) + arg0.isReg() ? arg_free(_true) : arg0.copyFrom(arg2); + if constexpr (!std::is_reference_v) + arg0.isReg() ? arg_free(_false) : arg0.copyFrom(arg3); + if (arg0.isNone()) + arg0 = g_vc->vec_alloc(); + g_vc->emit(x86::Inst::kIdVpblendvb, arg0, arg3, arg2, arg1); + vec_type r; + r.copyFrom(arg0); + return r; + } #endif - -inline v128 gv_roundfs_even(const v128& a) -{ -#if defined(__SSE4_1__) - return _mm_round_ps(a, 8 + 0); -#elif defined(ARCH_ARM64) - return vrndnq_f32(a); -#elif defined(ARCH_X64) - return sse41_roundf<0>(a); -#else - v128 r; - for (u32 i = 0; i < 4; i++) - r._f[i] = utils::roundevenf32(a._f[i]); - return r; -#endif -} - -inline v128 gv_roundfs_ceil(const v128& a) -{ -#if defined(__SSE4_1__) - return _mm_round_ps(a, 8 + 2); -#elif defined(ARCH_ARM64) - return vrndpq_f32(a); -#elif defined(ARCH_X64) - return sse41_roundf<2>(a); -#else - v128 r; - for (u32 i = 0; i < 4; i++) - r._f[i] = ::ceilf(a._f[i]); - return r; -#endif -} - -inline v128 gv_roundfs_floor(const v128& a) -{ -#if defined(__SSE4_1__) - return _mm_round_ps(a, 8 + 1); -#elif defined(ARCH_ARM64) - return vrndmq_f32(a); -#elif defined(ARCH_X64) - return sse41_roundf<1>(a); -#else - v128 r; - for (u32 i = 0; i < 4; i++) - r._f[i] = ::floorf(a._f[i]); - return r; -#endif -} - -inline v128 gv_roundfs_trunc(const v128& a) -{ -#if defined(__SSE4_1__) - return _mm_round_ps(a, 8 + 3); -#elif defined(ARCH_ARM64) - return vrndq_f32(a); -#elif defined(ARCH_X64) - return sse41_roundf<3>(a); -#else - v128 r; - for (u32 i = 0; i < 4; i++) - r._f[i] = ::truncf(a._f[i]); - return r; -#endif -} - -inline bool gv_testz(const v128& a) -{ -#if defined(__SSE4_1__) - return !!_mm_testz_si128(a, a); -#elif defined(ARCH_X64) - return _mm_cvtsi128_si64(_mm_packs_epi32(a, a)) == 0; -#elif defined(ARCH_ARM64) - return std::bit_cast(vqmovn_s32(a)) == 0; -#else - return !(a._u64[0] | a._u64[1]); -#endif -} - -// Same as gv_testz but tuned for pairing with gv_testall1 -inline bool gv_testall0(const v128& a) -{ -#if defined(__SSE4_1__) - return !!_mm_testz_si128(a, _mm_set1_epi32(-1)); -#elif defined(ARCH_X64) - return _mm_cvtsi128_si64(_mm_packs_epi32(a, a)) == 0; -#elif defined(ARCH_ARM64) - return std::bit_cast(vqmovn_s32(a)) == 
0; -#else - return !(a._u64[0] | a._u64[1]); -#endif -} - -inline bool gv_testall1(const v128& a) -{ -#if defined(__SSE4_1__) - return !!_mm_test_all_ones(a); -#elif defined(ARCH_X64) - return _mm_cvtsi128_si64(_mm_packs_epi32(a, a)) == -1; -#elif defined(ARCH_ARM64) - return std::bit_cast(vqmovn_s32(a)) == -1; -#else - return (a._u64[0] & a._u64[1]) == UINT64_MAX; -#endif -} - -// result = (~a) & (b) -inline v128 gv_andn(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_andnot_si128(a, b); -#elif defined(ARCH_ARM64) - return vbicq_s32(b, a); -#endif -} - -// Select elements; _cmp must be result of SIMD comparison; undefined otherwise -FORCE_INLINE v128 gv_select8(const v128& _cmp, const v128& _true, const v128& _false) -{ -#if defined(__SSE4_1__) - return _mm_blendv_epi8(_false, _true, _cmp); -#elif defined(ARCH_ARM64) - return vbslq_u8(_cmp, _true, _false); -#else - return (_cmp & _true) | gv_andn(_cmp, _false); -#endif -} - -// Select elements using sign bit only -FORCE_INLINE v128 gv_signselect8(const v128& bits, const v128& _true, const v128& _false) -{ -#if defined(__SSE4_1__) - return _mm_blendv_epi8(_false, _true, bits); -#else - return gv_select8(gv_gts8(gv_bcst8(0), bits), _true, _false); -#endif -} - -template - requires(asmjit::any_operand_v) -inline asmjit::vec_type gv_signselect8(A&& bits, B&& _true, C&& _false) -{ - using namespace asmjit; -#if defined(ARCH_X64) - if (utils::has_avx()) - { - Operand arg0{}; - Operand arg1 = arg_eval(std::forward(bits), 16); - Operand arg2 = arg_eval(std::forward(_true), 16); - Operand arg3 = arg_eval(std::forward(_false), 16); - if constexpr (!std::is_reference_v) - arg0.isReg() ? arg_free(bits) : arg0.copyFrom(arg1); - if constexpr (!std::is_reference_v) - arg0.isReg() ? arg_free(_true) : arg0.copyFrom(arg2); - if constexpr (!std::is_reference_v) - arg0.isReg() ? 
arg_free(_false) : arg0.copyFrom(arg3); - if (arg0.isNone()) - arg0 = g_vc->vec_alloc(); - g_vc->emit(x86::Inst::kIdVpblendvb, arg0, arg3, arg2, arg1); - vec_type r; - r.copyFrom(arg0); - return r; + g_vc->fail_flag = true; + return vec_type{0}; } -#endif - g_vc->fail_flag = true; - return vec_type{0}; -} -// Select elements; _cmp must be result of SIMD comparison; undefined otherwise -inline v128 gv_select16(const v128& _cmp, const v128& _true, const v128& _false) -{ -#if defined(__SSE4_1__) - return _mm_blendv_epi8(_false, _true, _cmp); -#elif defined(ARCH_ARM64) - return vbslq_u16(_cmp, _true, _false); -#else - return (_cmp & _true) | gv_andn(_cmp, _false); -#endif -} - -// Select elements; _cmp must be result of SIMD comparison; undefined otherwise -inline v128 gv_select32(const v128& _cmp, const v128& _true, const v128& _false) -{ -#if defined(__SSE4_1__) - return _mm_blendv_epi8(_false, _true, _cmp); -#elif defined(ARCH_ARM64) - return vbslq_u32(_cmp, _true, _false); -#else - return (_cmp & _true) | gv_andn(_cmp, _false); -#endif -} - -// Select elements; _cmp must be result of SIMD comparison; undefined otherwise -inline v128 gv_selectfs(const v128& _cmp, const v128& _true, const v128& _false) -{ -#if defined(__SSE4_1__) - return _mm_blendv_ps(_false, _true, _cmp); -#elif defined(ARCH_ARM64) - return vbslq_f32(_cmp, _true, _false); -#else - return _mm_or_ps(_mm_and_ps(_cmp, _true), _mm_andnot_ps(_cmp, _false)); -#endif -} - -inline v128 gv_packss_s16(const v128& low, const v128& high) -{ + template + requires(asmjit::any_operand_v) + inline auto gv_extend_lo_s8(A&& a) + { #if defined(ARCH_X64) - return _mm_packs_epi16(low, high); -#elif defined(ARCH_ARM64) - return vcombine_s8(vqmovn_s16(low), vqmovn_s16(high)); + using enum asmjit::x86::Inst::Id; + if (utils::has_sse41()) + return asmjit::unary_op(kIdNone, kIdPmovsxbw, std::forward(a)); + return asmjit::unary_op(kIdPsraw, kIdVpsraw, asmjit::unary_op(kIdNone, kIdPunpcklbw, std::forward(a)), 8); #endif -} + } -inline v128 gv_packus_s16(const v128& low, const v128& high) -{ + template + requires(asmjit::any_operand_v) + inline auto gv_extend_hi_s8(A&& a) + { #if defined(ARCH_X64) - return _mm_packus_epi16(low, high); -#elif defined(ARCH_ARM64) - return vcombine_u8(vqmovun_s16(low), vqmovun_s16(high)); + using enum asmjit::x86::Inst::Id; + return asmjit::unary_op(kIdPsraw, kIdVpsraw, asmjit::unary_op(kIdNone, kIdPunpckhbw, std::forward(a)), 8); #endif -} + } -inline v128 gv_packus_u16(const v128& low, const v128& high) -{ -#if defined(__SSE4_1__) - return _mm_packus_epi16(_mm_min_epu16(low, _mm_set1_epi16(0xff)), _mm_min_epu16(high, _mm_set1_epi16(0xff))); -#elif defined(ARCH_X64) - return _mm_packus_epi16(_mm_sub_epi16(low, _mm_subs_epu16(low, _mm_set1_epi16(0xff))), _mm_sub_epi16(high, _mm_subs_epu16(high, _mm_set1_epi16(0xff)))); -#elif defined(ARCH_ARM64) - return vcombine_u8(vqmovn_u16(low), vqmovn_u16(high)); -#endif -} - -inline v128 gv_packtu16(const v128& low, const v128& high) -{ + template + requires(asmjit::any_operand_v) + inline auto gv_extend_lo_s16(A&& a) + { #if defined(ARCH_X64) - return _mm_packus_epi16(low & _mm_set1_epi16(0xff), high & _mm_set1_epi16(0xff)); -#elif defined(ARCH_ARM64) - return vuzp1q_s8(low, high); + using enum asmjit::x86::Inst::Id; + if (utils::has_sse41()) + return asmjit::unary_op(kIdNone, kIdPmovsxwd, std::forward(a)); + return asmjit::unary_op(kIdPsrad, kIdVpsrad, asmjit::unary_op(kIdNone, kIdPunpcklwd, std::forward(a)), 16); #endif -} + } -inline v128 gv_packss_s32(const v128& low, const 
v128& high) -{ + template + requires(asmjit::any_operand_v) + inline auto gv_extend_hi_s16(A&& a) + { #if defined(ARCH_X64) - return _mm_packs_epi32(low, high); -#elif defined(ARCH_ARM64) - return vcombine_s16(vqmovn_s32(low), vqmovn_s32(high)); + using enum asmjit::x86::Inst::Id; + return asmjit::unary_op(kIdPsrad, kIdVpsrad, asmjit::unary_op(kIdNone, kIdPunpckhwd, std::forward(a)), 16); #endif -} + } -inline v128 gv_packus_s32(const v128& low, const v128& high) -{ -#if defined(__SSE4_1__) - return _mm_packus_epi32(low, high); -#elif defined(ARCH_X64) - const auto s = _mm_srai_epi16(_mm_packs_epi32(low, high), 15); - const auto r = gv_add16(_mm_packs_epi32(gv_sub32(low, gv_bcst32(0x8000)), gv_sub32(high, gv_bcst32(0x8000))), gv_bcst16(0x8000)); - return gv_andn(s, r); -#elif defined(ARCH_ARM64) - return vcombine_u16(vqmovun_s32(low), vqmovun_s32(high)); -#endif -} + template + requires(asmjit::any_operand_v) + inline auto gv_shuffle_left(A&& a) + { + FOR_X64(unary_op, kIdPslldq, kIdVpslldq, std::forward(a), Count); + } -inline v128 gv_packus_u32(const v128& low, const v128& high) -{ -#if defined(__SSE4_1__) - return _mm_packus_epi32(_mm_min_epu32(low, _mm_set1_epi32(0xffff)), _mm_min_epu32(high, _mm_set1_epi32(0xffff))); -#elif defined(ARCH_X64) - const v128 s = _mm_cmpgt_epi16(_mm_packs_epi32(_mm_srli_epi32(low, 16), _mm_srli_epi32(high, 16)), _mm_setzero_si128()); - const v128 r = _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(low, 16), 16), _mm_srai_epi32(_mm_slli_epi32(high, 16), 16)); - return _mm_or_si128(r, s); -#elif defined(ARCH_ARM64) - return vcombine_u16(vqmovn_u32(low), vqmovn_u32(high)); -#endif -} + template + requires(asmjit::any_operand_v) + inline auto gv_shuffle_right(A&& a) + { + FOR_X64(unary_op, kIdPsrldq, kIdVpsrldq, std::forward(a), Count); + } -inline v128 gv_packtu32(const v128& low, const v128& high) -{ -#if defined(__SSE4_1__) - return _mm_packus_epi32(low & _mm_set1_epi32(0xffff), high & _mm_set1_epi32(0xffff)); -#elif defined(ARCH_X64) - return _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(low, 16), 16), _mm_srai_epi32(_mm_slli_epi32(high, 16), 16)); -#elif defined(ARCH_ARM64) - return vuzp1q_s16(low, high); -#endif -} - -inline v128 gv_unpacklo8(const v128& lows, const v128& highs) -{ -#if defined(ARCH_X64) - return _mm_unpacklo_epi8(lows, highs); -#elif defined(ARCH_ARM64) - return vzip1q_s8(lows, highs); -#endif -} - -inline v128 gv_extend_lo_s8(const v128& vec) -{ -#if defined(__SSE4_1__) - return _mm_cvtepi8_epi16(vec); -#elif defined(ARCH_X64) - return _mm_srai_epi16(_mm_unpacklo_epi8(vec, vec), 8); -#elif defined(ARCH_ARM64) - return int16x8_t(vmovl_s8(vget_low_s8(vec))); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_extend_lo_s8(A&& a) -{ -#if defined(ARCH_X64) - using enum asmjit::x86::Inst::Id; - if (utils::has_sse41()) - return asmjit::unary_op(kIdNone, kIdPmovsxbw, std::forward(a)); - return asmjit::unary_op(kIdPsraw, kIdVpsraw, asmjit::unary_op(kIdNone, kIdPunpcklbw, std::forward(a)), 8); -#endif -} - -inline v128 gv_extend_hi_s8(const v128& vec) -{ -#if defined(__SSE4_1__) - return _mm_cvtepi8_epi16(_mm_loadu_si64(vec._bytes + 8)); -#elif defined(ARCH_X64) - return _mm_srai_epi16(_mm_unpackhi_epi8(vec, vec), 8); -#elif defined(ARCH_ARM64) - return int16x8_t(vmovl_s8(vget_high_s8(vec))); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_extend_hi_s8(A&& a) -{ -#if defined(ARCH_X64) - using enum asmjit::x86::Inst::Id; - return asmjit::unary_op(kIdPsraw, kIdVpsraw, asmjit::unary_op(kIdNone, 
kIdPunpckhbw, std::forward(a)), 8); -#endif -} - -inline v128 gv_unpacklo16(const v128& lows, const v128& highs) -{ -#if defined(ARCH_X64) - return _mm_unpacklo_epi16(lows, highs); -#elif defined(ARCH_ARM64) - return vzip1q_s16(lows, highs); -#endif -} - -inline v128 gv_extend_lo_s16(const v128& vec) -{ -#if defined(__SSE4_1__) - return _mm_cvtepi16_epi32(vec); -#elif defined(ARCH_X64) - return _mm_srai_epi32(_mm_unpacklo_epi16(vec, vec), 16); -#elif defined(ARCH_ARM64) - return int32x4_t(vmovl_s16(vget_low_s16(vec))); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_extend_lo_s16(A&& a) -{ -#if defined(ARCH_X64) - using enum asmjit::x86::Inst::Id; - if (utils::has_sse41()) - return asmjit::unary_op(kIdNone, kIdPmovsxwd, std::forward(a)); - return asmjit::unary_op(kIdPsrad, kIdVpsrad, asmjit::unary_op(kIdNone, kIdPunpcklwd, std::forward(a)), 16); -#endif -} - -inline v128 gv_extend_hi_s16(const v128& vec) -{ -#if defined(__SSE4_1__) - return _mm_cvtepi16_epi32(_mm_loadu_si64(vec._bytes + 8)); -#elif defined(ARCH_X64) - return _mm_srai_epi32(_mm_unpackhi_epi16(vec, vec), 16); -#elif defined(ARCH_ARM64) - return int32x4_t(vmovl_s16(vget_high_s16(vec))); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_extend_hi_s16(A&& a) -{ -#if defined(ARCH_X64) - using enum asmjit::x86::Inst::Id; - return asmjit::unary_op(kIdPsrad, kIdVpsrad, asmjit::unary_op(kIdNone, kIdPunpckhwd, std::forward(a)), 16); -#endif -} - -inline v128 gv_unpacklo32(const v128& lows, const v128& highs) -{ -#if defined(ARCH_X64) - return _mm_unpacklo_epi32(lows, highs); -#elif defined(ARCH_ARM64) - return vzip1q_s32(lows, highs); -#endif -} - -inline v128 gv_unpackhi8(const v128& lows, const v128& highs) -{ -#if defined(ARCH_X64) - return _mm_unpackhi_epi8(lows, highs); -#elif defined(ARCH_ARM64) - return vzip2q_s8(lows, highs); -#endif -} - -inline v128 gv_unpackhi16(const v128& lows, const v128& highs) -{ -#if defined(ARCH_X64) - return _mm_unpackhi_epi16(lows, highs); -#elif defined(ARCH_ARM64) - return vzip2q_s16(lows, highs); -#endif -} - -inline v128 gv_unpackhi32(const v128& lows, const v128& highs) -{ -#if defined(ARCH_X64) - return _mm_unpackhi_epi32(lows, highs); -#elif defined(ARCH_ARM64) - return vzip2q_s32(lows, highs); -#endif -} - -inline bool v128::operator==(const v128& b) const -{ -#if defined(ARCH_X64) - return gv_testz(_mm_xor_si128(*this, b)); -#else - return gv_testz(*this ^ b); -#endif -} - -inline v128 v128::operator|(const v128& rhs) const -{ -#if defined(ARCH_X64) - return _mm_or_si128(*this, rhs); -#elif defined(ARCH_ARM64) - return vorrq_s32(*this, rhs); -#endif -} - -inline v128 v128::operator&(const v128& rhs) const -{ -#if defined(ARCH_X64) - return _mm_and_si128(*this, rhs); -#elif defined(ARCH_ARM64) - return vandq_s32(*this, rhs); -#endif -} - -inline v128 v128::operator^(const v128& rhs) const -{ -#if defined(ARCH_X64) - return _mm_xor_si128(*this, rhs); -#elif defined(ARCH_ARM64) - return veorq_s32(*this, rhs); -#endif -} - -inline v128 v128::operator~() const -{ -#if defined(ARCH_X64) - return _mm_xor_si128(*this, _mm_set1_epi32(-1)); -#elif defined(ARCH_ARM64) - return vmvnq_u32(*this); -#endif -} - -inline v128 gv_exp2_approxfs(const v128& a) -{ - // TODO -#if 0 - const auto x0 = _mm_max_ps(_mm_min_ps(a, _mm_set1_ps(127.4999961f)), _mm_set1_ps(-127.4999961f)); - const auto x1 = _mm_add_ps(x0, _mm_set1_ps(0.5f)); - const auto x2 = _mm_sub_epi32(_mm_cvtps_epi32(x1), _mm_and_si128(_mm_castps_si128(_mm_cmpnlt_ps(_mm_setzero_ps(), x1)), 
_mm_set1_epi32(1))); - const auto x3 = _mm_sub_ps(x0, _mm_cvtepi32_ps(x2)); - const auto x4 = _mm_mul_ps(x3, x3); - const auto x5 = _mm_mul_ps(x3, _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(x4, _mm_set1_ps(0.023093347705f)), _mm_set1_ps(20.20206567f)), x4), _mm_set1_ps(1513.906801f))); - const auto x6 = _mm_mul_ps(x5, _mm_rcp_ps(_mm_sub_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(233.1842117f), x4), _mm_set1_ps(4368.211667f)), x5))); - return _mm_mul_ps(_mm_add_ps(_mm_add_ps(x6, x6), _mm_set1_ps(1.0f)), _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(x2, _mm_set1_epi32(127)), 23))); -#else - v128 r; - for (u32 i = 0; i < 4; i++) - r._f[i] = std::exp2f(a._f[i]); - return r; -#endif -} - -inline v128 gv_log2_approxfs(const v128& a) -{ - // TODO -#if 0 - const auto _1 = _mm_set1_ps(1.0f); - const auto _c = _mm_set1_ps(1.442695040f); - const auto x0 = _mm_max_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x00800000))); - const auto x1 = _mm_or_ps(_mm_and_ps(x0, _mm_castsi128_ps(_mm_set1_epi32(0x807fffff))), _1); - const auto x2 = _mm_rcp_ps(_mm_add_ps(x1, _1)); - const auto x3 = _mm_mul_ps(_mm_sub_ps(x1, _1), x2); - const auto x4 = _mm_add_ps(x3, x3); - const auto x5 = _mm_mul_ps(x4, x4); - const auto x6 = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(-0.7895802789f), x5), _mm_set1_ps(16.38666457f)), x5), _mm_set1_ps(-64.1409953f)); - const auto x7 = _mm_rcp_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(-35.67227983f), x5), _mm_set1_ps(312.0937664f)), x5), _mm_set1_ps(-769.6919436f))); - const auto x8 = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_castps_si128(x0), 23), _mm_set1_epi32(127))); - return _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_mul_ps(_mm_mul_ps(x5, x6), x7), x4), _c), _mm_add_ps(_mm_mul_ps(x4, _c), x8)); -#else - v128 r; - for (u32 i = 0; i < 4; i++) - r._f[i] = std::log2f(a._f[i]); - return r; -#endif -} - -// For each 8-bit element, r = a << (b & 7) -inline v128 gv_shl8(const v128& a, const v128& b) -{ + // For each 8-bit element, r = (a << (c & 7)) | (b >> (~c & 7) >> 1) + template + inline auto gv_fshl8(A&& a, B&& b, C&& c) + { #if defined(ARCH_ARM64) - return vshlq_u8(a, vandq_s8(b, gv_bcst8(7))); + const auto amt1 = vandq_s8(c, gv_bcst8(7)); + const auto amt2 = vsubq_s8(amt1, gv_bcst8(8)); + return v128(vorrq_u8(vshlq_u8(a, amt1), vshlq_u8(b, amt2))); #else - const v128 x1 = gv_add8(a, a); // shift left by 1 - const v128 r1 = gv_signselect8(gv_shl64(b, 7), x1, a); - const v128 x2 = gv_and32(gv_shl64(r1, 2), gv_bcst8(0xfc)); // shift by 2 - const v128 r2 = gv_signselect8(gv_shl64(b, 6), x2, r1); - const v128 x3 = gv_and32(gv_shl64(r2, 4), gv_bcst8(0xf0)); // shift by 4 - return gv_signselect8(gv_shl64(b, 5), x3, r2); + auto x1 = gv_sub8(gv_add8(a, a), gv_gts8(gv_bcst8(0), b)); + auto s1 = gv_shl64(c, 7); + auto r1 = gv_signselect8(s1, std::move(x1), std::forward(a)); + auto b1 = gv_signselect8(std::move(s1), gv_shl64(b, 1), std::forward(b)); + auto c2 = gv_bcst8(0x3); + auto x2 = gv_and32(gv_shr64(b1, 6), c2); + x2 = gv_or32(std::move(x2), gv_andn32(std::move(c2), gv_shl64(r1, 2))); + auto s2 = gv_shl64(c, 6); + auto r2 = gv_signselect8(s2, std::move(x2), std::move(r1)); + auto b2 = gv_signselect8(std::move(s2), gv_shl64(b1, 2), std::move(b1)); + auto c3 = gv_bcst8(0xf); + auto x3 = gv_and32(gv_shr64(std::move(b2), 4), c3); + x3 = gv_or32(std::move(x3), gv_andn32(std::move(c3), gv_shl64(r2, 4))); + return gv_signselect8(gv_shl64(std::move(c), 5), std::move(x3), + std::move(r2)); #endif -} + } -// For each 16-bit element, r = a << (b & 15) -inline v128 gv_shl16(const 
v128& a, const v128& b) -{ -#if defined(__AVX512VL__) && defined(__AVX512BW__) - return _mm_sllv_epi16(a, _mm_and_si128(b, _mm_set1_epi16(15))); -#elif defined(ARCH_ARM64) - return vshlq_u16(a, vandq_s16(b, gv_bcst8(15))); -#else - v128 r; - for (u32 i = 0; i < 8; i++) - r._u16[i] = a._u16[i] << (b._u16[i] & 15); - return r; -#endif -} - -// For each 32-bit element, r = a << (b & 31) -inline v128 gv_shl32(const v128& a, const v128& b) -{ -#if defined(__AVX2__) - return _mm_sllv_epi32(a, _mm_and_si128(b, _mm_set1_epi32(31))); -#elif defined(ARCH_ARM64) - return vshlq_u32(a, vandq_s32(b, gv_bcst8(31))); -#else - v128 r; - for (u32 i = 0; i < 4; i++) - r._u32[i] = a._u32[i] << (b._u32[i] & 31); - return r; -#endif -} - -// For each unsigned 8-bit element, r = a >> (b & 7) -inline v128 gv_shr8(const v128& a, const v128& b) -{ + // For each 8-bit element, r = (b >> (c & 7)) | (a << (~c & 7) << 1) + template + inline auto gv_fshr8(A&& a, B&& b, C&& c) + { #if defined(ARCH_ARM64) - return vshlq_u8(a, vnegq_s8(vandq_s8(b, gv_bcst8(7)))); + const auto amt1 = vandq_s8(c, gv_bcst8(7)); + const auto amt2 = vsubq_s8(gv_bcst8(8), amt1); + return vorrq_u8(vshlq_u8(b, vnegq_s8(amt1)), vshlq_u8(a, amt2)); #else - const v128 x1 = gv_and32(gv_shr64(a, 1), gv_bcst8(0x7f)); // shift right by 1 - const v128 r1 = gv_signselect8(gv_shl64(b, 7), x1, a); - const v128 x2 = gv_and32(gv_shr64(r1, 2), gv_bcst8(0x3f)); // shift by 2 - const v128 r2 = gv_signselect8(gv_shl64(b, 6), x2, r1); - const v128 x3 = gv_and32(gv_shr64(r2, 4), gv_bcst8(0x0f)); // shift by 4 - return gv_signselect8(gv_shl64(b, 5), x3, r2); + auto c1 = gv_bcst8(0x7f); + auto x1 = gv_and32(gv_shr64(b, 1), c1); + x1 = gv_or32(std::move(x1), gv_andn32(std::move(c1), gv_shl64(a, 7))); + auto s1 = gv_shl64(c, 7); + auto r1 = gv_signselect8(s1, std::move(x1), std::move(b)); + auto a1 = gv_signselect8(std::move(s1), gv_shr64(a, 1), std::move(a)); + auto c2 = gv_bcst8(0x3f); + auto x2 = gv_and32(gv_shr64(r1, 2), c2); + x2 = gv_or32(std::move(x2), gv_andn32(std::move(c2), gv_shl64(a1, 6))); + auto s2 = gv_shl64(c, 6); + auto r2 = gv_signselect8(s2, std::move(x2), std::move(r1)); + auto a2 = gv_signselect8(std::move(s2), gv_shr64(a1, 2), std::move(a1)); + auto c3 = gv_bcst8(0x0f); + auto x3 = gv_and32(gv_shr64(r2, 4), c3); + x3 = gv_or32(std::move(x3), + gv_andn32(std::move(c3), gv_shl64(std::move(a2), 4))); + return gv_signselect8(gv_shl64(std::move(c), 5), std::move(x3), + std::move(r2)); #endif -} - -// For each unsigned 16-bit element, r = a >> (b & 15) -inline v128 gv_shr16(const v128& a, const v128& b) -{ -#if defined(__AVX512VL__) && defined(__AVX512BW__) - return _mm_srlv_epi16(a, _mm_and_si128(b, _mm_set1_epi16(15))); -#elif defined(ARCH_ARM64) - return vshlq_u16(a, vnegq_s16(vandq_s16(b, gv_bcst8(15)))); -#else - v128 r; - for (u32 i = 0; i < 8; i++) - r._u16[i] = a._u16[i] >> (b._u16[i] & 15); - return r; -#endif -} - -// For each unsigned 32-bit element, r = a >> (b & 31) -inline v128 gv_shr32(const v128& a, const v128& b) -{ -#if defined(__AVX2__) - return _mm_srlv_epi32(a, _mm_and_si128(b, _mm_set1_epi32(31))); -#elif defined(ARCH_ARM64) - return vshlq_u32(a, vnegq_s32(vandq_s32(b, gv_bcst8(31)))); -#else - v128 r; - for (u32 i = 0; i < 4; i++) - r._u32[i] = a._u32[i] >> (b._u32[i] & 31); - return r; -#endif -} - -// For each signed 8-bit element, r = a >> (b & 7) -inline v128 gv_sar8(const v128& a, const v128& b) -{ -#if defined(ARCH_ARM64) - return vshlq_s8(a, vnegq_s8(vandq_s8(b, gv_bcst8(7)))); -#else - v128 r; - for (u32 i = 0; i < 16; i++) 
-		r._s8[i] = a._s8[i] >> (b._s8[i] & 7);
-	return r;
-#endif
-}
-
-// For each signed 16-bit element, r = a >> (b & 15)
-inline v128 gv_sar16(const v128& a, const v128& b)
-{
-#if defined(__AVX512VL__) && defined(__AVX512BW__)
-	return _mm_srav_epi16(a, _mm_and_si128(b, _mm_set1_epi16(15)));
-#elif defined(ARCH_ARM64)
-	return vshlq_s16(a, vnegq_s16(vandq_s16(b, gv_bcst8(15))));
-#else
-	v128 r;
-	for (u32 i = 0; i < 8; i++)
-		r._s16[i] = a._s16[i] >> (b._s16[i] & 15);
-	return r;
-#endif
-}
-
-// For each signed 32-bit element, r = a >> (b & 31)
-inline v128 gv_sar32(const v128& a, const v128& b)
-{
-#if defined(__AVX2__)
-	return _mm_srav_epi32(a, _mm_and_si128(b, _mm_set1_epi32(31)));
-#elif defined(ARCH_ARM64)
-	return vshlq_s32(a, vnegq_s32(vandq_s32(b, gv_bcst8(31))));
-#else
-	v128 r;
-	for (u32 i = 0; i < 4; i++)
-		r._s32[i] = a._s32[i] >> (b._s32[i] & 31);
-	return r;
-#endif
-}
-
-// For each 8-bit element, r = rotate a by b
-inline v128 gv_rol8(const v128& a, const v128& b)
-{
-#if defined(ARCH_ARM64)
-	const auto amt1 = vandq_s8(b, gv_bcst8(7));
-	const auto amt2 = vsubq_s8(amt1, gv_bcst8(8));
-	return vorrq_u8(vshlq_u8(a, amt1), vshlq_u8(a, amt2));
-#else
-	const v128 x1 = gv_sub8(gv_add8(a, a), gv_gts8(gv_bcst8(0), a)); // rotate left by 1
-	const v128 r1 = gv_signselect8(gv_shl64(b, 7), x1, a);
-	const v128 c2 = gv_bcst8(0x3);
-	const v128 x2 = gv_or32(gv_and32(gv_shr64(r1, 6), c2), gv_andn32(c2, gv_shl64(r1, 2))); // rotate by 2
-	const v128 r2 = gv_signselect8(gv_shl64(b, 6), x2, r1);
-	const v128 c3 = gv_bcst8(0xf);
-	const v128 x3 = gv_or32(gv_and32(gv_shr64(r2, 4), c3), gv_andn32(c3, gv_shl64(r2, 4))); // rotate by 4
-	return gv_signselect8(gv_shl64(b, 5), x3, r2);
-#endif
-}
-
-// For each 16-bit element, r = rotate a by b
-inline v128 gv_rol16(const v128& a, const v128& b)
-{
-#if defined(ARCH_ARM64)
-	const auto amt1 = vandq_s16(b, gv_bcst16(15));
-	const auto amt2 = vsubq_s16(amt1, gv_bcst16(16));
-	return vorrq_u16(vshlq_u16(a, amt1), vshlq_u16(a, amt2));
-#else
-	v128 r;
-	for (u32 i = 0; i < 8; i++)
-		r._u16[i] = utils::rol16(a._u16[i], b._u16[i]);
-	return r;
-#endif
-}
-
-// For each 16-bit element, r = rotate a by count
-template <u32 Count>
-inline v128 gv_rol16(const v128& a)
-{
-	constexpr u8 count = Count & 0xf;
-#if defined(ARCH_X64)
-	return _mm_or_si128(_mm_srli_epi16(a, 16 - count), _mm_slli_epi16(a, count));
-#elif defined(ARCH_ARM64)
-	return vorrq_u16(vshrq_n_u16(a, 16 - count), vshlq_n_u16(a, count));
-#else
-	v128 r;
-	for (u32 i = 0; i < 8; i++)
-		r._u16[i] = std::rotl(a._u16[i], count);
-	return r;
-#endif
-}
-
-// For each 32-bit element, r = rotate a by b
-inline v128 gv_rol32(const v128& a, const v128& b)
-{
-#if defined(__AVX512VL__)
-	return _mm_rolv_epi32(a, b);
-#elif defined(ARCH_ARM64)
-	const auto amt1 = vandq_s32(b, gv_bcst32(31));
-	const auto amt2 = vsubq_s32(amt1, gv_bcst32(32));
-	return vorrq_u32(vshlq_u32(a, amt1), vshlq_u32(a, amt2));
-#else
-	v128 r;
-	for (u32 i = 0; i < 4; i++)
-		r._u32[i] = utils::rol32(a._u32[i], b._u32[i]);
-	return r;
-#endif
-}
-
-// For each 32-bit element, r = rotate a by count
-template <u32 Count>
-inline v128 gv_rol32(const v128& a)
-{
-	constexpr u8 count = Count & 0x1f;
-#if defined(__AVX512VL__)
-	return _mm_rol_epi32(a, count);
-#elif defined(ARCH_X64)
-	return _mm_or_si128(_mm_srli_epi32(a, 32 - count), _mm_slli_epi32(a, count));
-#elif defined(ARCH_ARM64)
-	return vorrq_u32(vshrq_n_u32(a, 32 - count), vshlq_n_u32(a, count));
-#else
-	v128 r;
-	for (u32 i = 0; i < 4; i++)
-		r._u32[i] = utils::rol32(a._u32[i], count);
-	return r;
-#endif
-}
-
-// For each 8-bit element, r = (a << (c & 7)) | (b >> (~c & 7) >> 1)
-template <typename A, typename B, typename C>
-inline auto gv_fshl8(A&& a, B&& b, C&& c)
-{
-#if defined(ARCH_ARM64)
-	const auto amt1 = vandq_s8(c, gv_bcst8(7));
-	const auto amt2 = vsubq_s8(amt1, gv_bcst8(8));
-	return v128(vorrq_u8(vshlq_u8(a, amt1), vshlq_u8(b, amt2)));
-#else
-	auto x1 = gv_sub8(gv_add8(a, a), gv_gts8(gv_bcst8(0), b));
-	auto s1 = gv_shl64(c, 7);
-	auto r1 = gv_signselect8(s1, std::move(x1), std::forward<A>(a));
-	auto b1 = gv_signselect8(std::move(s1), gv_shl64(b, 1), std::forward<B>(b));
-	auto c2 = gv_bcst8(0x3);
-	auto x2 = gv_and32(gv_shr64(b1, 6), c2);
-	x2 = gv_or32(std::move(x2), gv_andn32(std::move(c2), gv_shl64(r1, 2)));
-	auto s2 = gv_shl64(c, 6);
-	auto r2 = gv_signselect8(s2, std::move(x2), std::move(r1));
-	auto b2 = gv_signselect8(std::move(s2), gv_shl64(b1, 2), std::move(b1));
-	auto c3 = gv_bcst8(0xf);
-	auto x3 = gv_and32(gv_shr64(std::move(b2), 4), c3);
-	x3 = gv_or32(std::move(x3), gv_andn32(std::move(c3), gv_shl64(r2, 4)));
-	return gv_signselect8(gv_shl64(std::move(c), 5), std::move(x3), std::move(r2));
-#endif
-}
-
-// For each 8-bit element, r = (b >> (c & 7)) | (a << (~c & 7) << 1)
-template <typename A, typename B, typename C>
-inline auto gv_fshr8(A&& a, B&& b, C&& c)
-{
-#if defined(ARCH_ARM64)
-	const auto amt1 = vandq_s8(c, gv_bcst8(7));
-	const auto amt2 = vsubq_s8(gv_bcst8(8), amt1);
-	return vorrq_u8(vshlq_u8(b, vnegq_s8(amt1)), vshlq_u8(a, amt2));
-#else
-	auto c1 = gv_bcst8(0x7f);
-	auto x1 = gv_and32(gv_shr64(b, 1), c1);
-	x1 = gv_or32(std::move(x1), gv_andn32(std::move(c1), gv_shl64(a, 7)));
-	auto s1 = gv_shl64(c, 7);
-	auto r1 = gv_signselect8(s1, std::move(x1), std::move(b));
-	auto a1 = gv_signselect8(std::move(s1), gv_shr64(a, 1), std::move(a));
-	auto c2 = gv_bcst8(0x3f);
-	auto x2 = gv_and32(gv_shr64(r1, 2), c2);
-	x2 = gv_or32(std::move(x2), gv_andn32(std::move(c2), gv_shl64(a1, 6)));
-	auto s2 = gv_shl64(c, 6);
-	auto r2 = gv_signselect8(s2, std::move(x2), std::move(r1));
-	auto a2 = gv_signselect8(std::move(s2), gv_shr64(a1, 2), std::move(a1));
-	auto c3 = gv_bcst8(0x0f);
-	auto x3 = gv_and32(gv_shr64(r2, 4), c3);
-	x3 = gv_or32(std::move(x3), gv_andn32(std::move(c3), gv_shl64(std::move(a2), 4)));
-	return gv_signselect8(gv_shl64(std::move(c), 5), std::move(x3), std::move(r2));
-#endif
-}
-
-// Shift left by byte amount
-template <u32 Count>
-inline v128 gv_shuffle_left(const v128& a)
-{
-	if (Count > 15)
-		return {};
-#if defined(ARCH_X64)
-	return _mm_slli_si128(a, Count);
-#elif defined(ARCH_ARM64)
-	v128 idx;
-	for (u32 i = 0; i < 16; i++)
-		idx._u8[i] = u8(i - Count);
-	return vqtbl1q_u8(a, idx);
-#endif
-}
-
-template <u32 Count, typename A>
-	requires(asmjit::any_operand_v<A>)
-inline auto gv_shuffle_left(A&& a)
-{
-	FOR_X64(unary_op, kIdPslldq, kIdVpslldq, std::forward<A>(a), Count);
-}
-
-// Shift right by byte amount
-template <u32 Count>
-inline v128 gv_shuffle_right(const v128& a)
-{
-	if (Count > 15)
-		return {};
-#if defined(ARCH_X64)
-	return _mm_srli_si128(a, Count);
-#elif defined(ARCH_ARM64)
-	v128 idx;
-	for (u32 i = 0; i < 16; i++)
-		idx._u8[i] = u8(i + Count);
-	return vqtbl1q_u8(a, idx);
-#endif
-}
-
-template <u32 Count, typename A>
-	requires(asmjit::any_operand_v<A>)
-inline auto gv_shuffle_right(A&& a)
-{
-	FOR_X64(unary_op, kIdPsrldq, kIdVpsrldq, std::forward<A>(a), Count);
-}
-
-// Load 32-bit integer into the first element of a new vector, set other elements to zero
-inline v128 gv_loadu32(const void* ptr)
-{
-#if defined(ARCH_X64)
-	return _mm_loadu_si32(ptr);
-#elif defined(ARCH_ARM64)
-	return vld1q_lane_u32(static_cast<const u32*>(ptr), vdupq_n_u32(0), 0);
-#endif
-}
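// For reference, the per-lane semantics of the two funnel shifts above can be
// modelled with plain scalar code. This is a minimal illustrative sketch only:
// the function names and the driver below are not part of the emulator. The
// vector helpers apply the same formula to all 16 byte lanes of a v128 at
// once; in the generic (#else) path they compose the per-lane shift out of
// 1-, 2- and 4-bit steps selected by gv_signselect8 on bits of the shift count.

#include <cstdint>
#include <cstdio>

// One 8-bit lane of gv_fshl8: r = (a << (c & 7)) | (b >> (~c & 7) >> 1).
// The right shift is split as (~c & 7) then 1 so the amount never reaches 8.
static std::uint8_t fshl8_lane(std::uint8_t a, std::uint8_t b, std::uint8_t c)
{
	return static_cast<std::uint8_t>((a << (c & 7)) | (b >> (~c & 7) >> 1));
}

// One 8-bit lane of gv_fshr8: r = (b >> (c & 7)) | (a << (~c & 7) << 1).
static std::uint8_t fshr8_lane(std::uint8_t a, std::uint8_t b, std::uint8_t c)
{
	return static_cast<std::uint8_t>((b >> (c & 7)) | (a << (~c & 7) << 1));
}

int main()
{
	// Treat a:b as the 16-bit value 0xABCD. Funnel-left by 3 keeps the high
	// byte of (0xABCD << 3) & 0xffff = 0x5e68, and funnel-right by 3 keeps
	// the low byte of 0xABCD >> 3 = 0x1579, so this prints "5e 79".
	std::printf("%02x %02x\n", fshl8_lane(0xAB, 0xCD, 3), fshr8_lane(0xAB, 0xCD, 3));
}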
- -// Load 16-bit integer into an existing vector at the position specified by Index -template -inline v128 gv_insert16(const v128& vec, u16 value) -{ -#if defined(ARCH_X64) - return _mm_insert_epi16(vec, value, Index); -#elif defined(ARCH_ARM64) - return vsetq_lane_u16(value, vec, Index & 0x7); -#endif -} - -// For each 8-bit element, -// if ctrl >= 0 && ctrl < 16 then r = vec[ctrl], -// else if ctrl < 0 then r = 0 -inline v128 gv_shuffle8(const v128& vec, const v128& ctrl) -{ - AUDIT(std::ranges::none_of(ctrl._chars, [](s8 i) - { - return i >= static_cast(sizeof(v128)); - }), - "All indices must be in the range [0, 15] or negative, since PSHUFB and TBL behave differently otherwise"); -#if defined(__SSSE3__) - return _mm_shuffle_epi8(vec, ctrl); -#elif defined(ARCH_ARM64) - return vqtbl1q_s8(vec, ctrl); -#else - v128 r; - for (s32 i = 0; i < 16; i++) - r._s8[i] = ctrl._s8[i] < 0 ? 0 : vec._s8[ctrl._s8[i] & 0xf]; - return r; -#endif -} - -// For each 2-bit index in Control, r = vec[index] -template -inline v128 gv_shuffle32(const v128& vec) -{ -#if defined(ARCH_X64) - return _mm_shuffle_epi32(vec, Control); -#elif defined(ARCH_ARM64) - constexpr u8 idx0 = (Control & 3) * sizeof(s32); - constexpr u8 idx1 = (Control >> 2 & 3) * sizeof(s32); - constexpr u8 idx2 = (Control >> 4 & 3) * sizeof(s32); - constexpr u8 idx3 = (Control >> 6 & 3) * sizeof(s32); - - constexpr uint8x16_t idx_vec = {idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3}; - - return vqtbl1q_s8(vec, idx_vec); -#endif -} - -// For each index, r = vec[index & 3] -template -inline v128 gv_shuffle32(const v128& vec) -{ -#if defined(ARCH_X64) - return _mm_shuffle_epi32(vec, (Index0 & 3) | (Index1 & 3) << 2 | (Index2 & 3) << 4 | (Index3 & 3) << 6); -#elif defined(ARCH_ARM64) - constexpr u8 idx0 = (Index0 & 3) * sizeof(s32); - constexpr u8 idx1 = (Index1 & 3) * sizeof(s32); - constexpr u8 idx2 = (Index2 & 3) * sizeof(s32); - constexpr u8 idx3 = (Index3 & 3) * sizeof(s32); - - constexpr uint8x16_t idx_vec = {idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3}; - - return vqtbl1q_s8(vec, idx_vec); -#endif -} - -// For the first two 2-bit indices in Control, r = a[index], -// for the last two indices, r = b[index] -template -inline v128 gv_shufflefs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_shuffle_ps(a, b, Control); -#elif defined(ARCH_ARM64) - constexpr u8 idx0 = (Control & 3) * sizeof(s32); - constexpr u8 idx1 = (Control >> 2 & 3) * sizeof(s32); - constexpr u8 idx2 = (Control >> 4 & 3) * sizeof(s32) + sizeof(v128); - constexpr u8 idx3 = (Control >> 6 & 3) * sizeof(s32) + sizeof(v128); - - constexpr uint8x16_t idx_vec = {idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3}; - - return vqtbl2q_s8({a, b}, idx_vec); -#endif -} - -// For the first two indices, r = a[index & 3], -// for the last two indices, r = b[index & 3] -template -inline v128 gv_shufflefs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_shuffle_ps(a, b, (Index0 & 3) | (Index1 & 3) << 2 | (Index2 & 3) << 4 | (Index3 & 3) << 6); -#elif defined(ARCH_ARM64) - constexpr u8 idx0 = (Index0 & 3) * sizeof(s32); - constexpr u8 idx1 = (Index1 & 3) * sizeof(s32); - constexpr u8 idx2 = (Index2 & 3) * sizeof(s32) + sizeof(v128); - constexpr u8 idx3 = (Index3 & 3) * 
sizeof(s32) + sizeof(v128); - - constexpr uint8x16_t idx_vec = {idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3}; - - return vqtbl2q_s8({a, b}, idx_vec); -#endif -} - -// For each 32-bit element, reverse byte order -inline v128 gv_rev32(const v128& vec) -{ -#if defined(__SSSE3__) - return _mm_shuffle_epi8(vec, _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12)); -#elif defined(ARCH_ARM64) - return vrev32q_u8(vec); -#else - return gv_rol32<16>(gv_rol16<8>(vec)); -#endif -} - -// For each 32-bit element, convert between big-endian and native-endian -inline v128 gv_to_be32(const v128& vec) -{ - if constexpr (std::endian::native == std::endian::little) - return gv_rev32(vec); - return vec; -} + } +} // namespace rx #if defined(__clang__) #pragma clang diagnostic pop diff --git a/rpcs3/util/to_endian.hpp b/rpcs3/util/to_endian.hpp index f4ec045d9..15480792b 100644 --- a/rpcs3/util/to_endian.hpp +++ b/rpcs3/util/to_endian.hpp @@ -3,7 +3,11 @@ #include "util/types.hpp" #include "util/endian.hpp" -union v128; +namespace rx +{ + union v128; +} +using rx::v128; // Type converter: converts native endianness arithmetic/enum types to appropriate se_t<> type template diff --git a/rpcs3/util/types.hpp b/rpcs3/util/types.hpp index 7462d1a93..15b1a8422 100644 --- a/rpcs3/util/types.hpp +++ b/rpcs3/util/types.hpp @@ -1,195 +1,5 @@ -#pragma once // No BOM and only basic ASCII in this header, or a neko will die - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(__SSE2__) || defined(_M_X64) || defined(_M_AMD64) || defined(__x86_64__) || defined(__amd64__) -#define ARCH_X64 1 -#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) -#define ARCH_ARM64 1 -// v8.4a+ gives us atomic 16 byte ld/st -// See Arm C Language Extensions Documentation -// Currently there is no feature macro for LSE2 specifically so we define it ourself -// Unfortunately the __ARM_ARCH integer macro isn't universally defined so we use this hack instead -#if defined(__ARM_ARCH_8_4__) || defined(__ARM_ARCH_8_5__) || defined(__ARM_ARCH_8_6__) || defined(__ARM_ARCH_9__) -#define ARM_FEATURE_LSE2 1 -#endif -#endif - -using std::chrono::steady_clock; - -using namespace std::literals; - -#ifndef __has_builtin -#define __has_builtin(x) 0 -#endif - -#ifdef _MSC_VER -#define SAFE_BUFFERS(...) __declspec(safebuffers) __VA_ARGS__ -#define NEVER_INLINE __declspec(noinline) -#define FORCE_INLINE __forceinline -#else // not _MSC_VER -#ifdef __clang__ -#define SAFE_BUFFERS(...) __attribute__((no_stack_protector)) __VA_ARGS__ -#else -#define SAFE_BUFFERS(...) __VA_ARGS__ __attribute__((__optimize__("no-stack-protector"))) -#endif -#define NEVER_INLINE __attribute__((noinline)) inline -#define FORCE_INLINE __attribute__((always_inline)) inline -#endif // _MSC_VER - -#define CHECK_SIZE(type, size) static_assert(sizeof(type) == size, "Invalid " #type " type size") -#define CHECK_ALIGN(type, align) static_assert(alignof(type) == align, "Invalid " #type " type alignment") -#define CHECK_MAX_SIZE(type, size) static_assert(sizeof(type) <= size, #type " type size is too big") -#define CHECK_SIZE_ALIGN(type, size, align) \ - CHECK_SIZE(type, size); \ - CHECK_ALIGN(type, align) - -#define DECLARE(...) decltype(__VA_ARGS__) __VA_ARGS__ - -#define STR_CASE(...) 
\ - case __VA_ARGS__: return #__VA_ARGS__ - -#if defined(_DEBUG) || defined(_AUDIT) -#define AUDIT(...) (static_cast(ensure(__VA_ARGS__))) -#else -#define AUDIT(...) (static_cast>(0)) -#endif - -namespace utils -{ - template - struct fn_helper - { - F f; - - fn_helper(F&& f) - : f(std::forward(f)) - { - } - - template - auto operator()(Args&&... args) const - { - if constexpr (sizeof...(Args) == 0) - return f(0, 0, 0, 0); - else if constexpr (sizeof...(Args) == 1) - return f(std::forward(args)..., 0, 0, 0); - else if constexpr (sizeof...(Args) == 2) - return f(std::forward(args)..., 0, 0); - else if constexpr (sizeof...(Args) == 3) - return f(std::forward(args)..., 0); - else if constexpr (sizeof...(Args) == 4) - return f(std::forward(args)...); - else - static_assert(sizeof...(Args) <= 4); - } - }; - - template - fn_helper(F&& f) -> fn_helper; -} // namespace utils - -// Shorter lambda. -#define FN(...) \ - ::utils::fn_helper([&]( \ - [[maybe_unused]] auto&& x, \ - [[maybe_unused]] auto&& y, \ - [[maybe_unused]] auto&& z, \ - [[maybe_unused]] auto&& w) { \ - return (__VA_ARGS__); \ - }) - -#if __cpp_lib_bit_cast < 201806L -namespace std -{ - template - [[nodiscard]] constexpr To bit_cast(const From& from) noexcept - { - return __builtin_bit_cast(To, from); - } -} // namespace std -#endif - -#if defined(__INTELLISENSE__) || (defined(__clang__) && (__clang_major__ <= 16)) -#define consteval constexpr -#define constinit -#endif - -using schar = signed char; -using uchar = unsigned char; -using ushort = unsigned short; -using uint = unsigned int; -using ulong = unsigned long; -using ullong = unsigned long long; -using llong = long long; - -using uptr = std::uintptr_t; - -using u8 = std::uint8_t; -using u16 = std::uint16_t; -using u32 = std::uint32_t; -using u64 = std::uint64_t; -using usz = std::size_t; - -using s8 = std::int8_t; -using s16 = std::int16_t; -using s32 = std::int32_t; -using s64 = std::int64_t; -using ssz = std::make_signed_t; - -// Get integral type from type size -template -struct get_int_impl -{ -}; - -template <> -struct get_int_impl -{ - using utype = u8; -}; - -template <> -struct get_int_impl -{ - using utype = u16; -}; - -template <> -struct get_int_impl -{ - using utype = u32; -}; - -template <> -struct get_int_impl -{ - using utype = u64; -}; - -template -using get_uint_t = typename get_int_impl::utype; - -template -std::remove_cvref_t as_rvalue(T&& obj) -{ - return std::forward(obj); -} - -template -class atomic_t; +#pragma once +#include namespace stx { @@ -203,1148 +13,6 @@ namespace stx struct generator; } // namespace stx -using stx::se_t; - -// se_t<> with native endianness -template -using nse_t = se_t; - -template -using be_t = se_t; -template -using le_t = se_t; - -template -using atomic_be_t = atomic_t, Align>; -template -using atomic_le_t = atomic_t, Align>; - -// Bool type equivalent -class b8 -{ - u8 m_value; - -public: - b8() = default; - - using enable_bitcopy = std::true_type; - - constexpr b8(bool value) noexcept - : m_value(value) - { - } - - constexpr operator bool() const noexcept - { - return m_value != 0; - } - - constexpr bool set(bool value) noexcept - { - m_value = value; - return value; - } -}; - -#if defined(ARCH_X64) && !defined(_MSC_VER) -using __m128i = long long __attribute__((vector_size(16))); -using __m128d = double __attribute__((vector_size(16))); -using __m128 = float __attribute__((vector_size(16))); -#endif - -#ifndef _MSC_VER -using u128 = __uint128_t; -using s128 = __int128_t; -#else - -extern "C" -{ - union __m128; - union 
__m128i; - struct __m128d; - - uchar _addcarry_u64(uchar, u64, u64, u64*); - uchar _subborrow_u64(uchar, u64, u64, u64*); - u64 __shiftleft128(u64, u64, uchar); - u64 __shiftright128(u64, u64, uchar); - u64 _umul128(u64, u64, u64*); -} - -// Unsigned 128-bit integer implementation (TODO) -struct alignas(16) u128 -{ - u64 lo, hi; - - u128() noexcept = default; - - template - requires std::is_unsigned_v - constexpr u128(T arg) noexcept - : lo(arg), hi(0) - { - } - - template - requires std::is_signed_v - constexpr u128(T arg) noexcept - : lo(s64{arg}), hi(s64{arg} >> 63) - { - } - - constexpr explicit operator bool() const noexcept - { - return !!(lo | hi); - } - - constexpr explicit operator u64() const noexcept - { - return lo; - } - - constexpr explicit operator s64() const noexcept - { - return lo; - } - - constexpr friend u128 operator+(const u128& l, const u128& r) - { - u128 value = l; - value += r; - return value; - } - - constexpr friend u128 operator-(const u128& l, const u128& r) - { - u128 value = l; - value -= r; - return value; - } - - constexpr friend u128 operator*(const u128& l, const u128& r) - { - u128 value = l; - value *= r; - return value; - } - - constexpr u128 operator+() const - { - return *this; - } - - constexpr u128 operator-() const - { - u128 value{}; - value -= *this; - return value; - } - - constexpr u128& operator++() - { - *this += 1; - return *this; - } - - constexpr u128 operator++(int) - { - u128 value = *this; - *this += 1; - return value; - } - - constexpr u128& operator--() - { - *this -= 1; - return *this; - } - - constexpr u128 operator--(int) - { - u128 value = *this; - *this -= 1; - return value; - } - - constexpr u128 operator<<(u128 shift_value) const - { - u128 value = *this; - value <<= shift_value; - return value; - } - - constexpr u128 operator>>(u128 shift_value) const - { - u128 value = *this; - value >>= shift_value; - return value; - } - - constexpr u128 operator~() const - { - u128 value{}; - value.lo = ~lo; - value.hi = ~hi; - return value; - } - - constexpr friend u128 operator&(const u128& l, const u128& r) - { - u128 value{}; - value.lo = l.lo & r.lo; - value.hi = l.hi & r.hi; - return value; - } - - constexpr friend u128 operator|(const u128& l, const u128& r) - { - u128 value{}; - value.lo = l.lo | r.lo; - value.hi = l.hi | r.hi; - return value; - } - - constexpr friend u128 operator^(const u128& l, const u128& r) - { - u128 value{}; - value.lo = l.lo ^ r.lo; - value.hi = l.hi ^ r.hi; - return value; - } - - constexpr u128& operator+=(const u128& r) - { - if (std::is_constant_evaluated()) - { - lo += r.lo; - hi += r.hi + (lo < r.lo); - } - else - { - _addcarry_u64(_addcarry_u64(0, r.lo, lo, &lo), r.hi, hi, &hi); - } - - return *this; - } - - constexpr u128& operator-=(const u128& r) - { - if (std::is_constant_evaluated()) - { - hi -= r.hi + (lo < r.lo); - lo -= r.lo; - } - else - { - _subborrow_u64(_subborrow_u64(0, lo, r.lo, &lo), hi, r.hi, &hi); - } - - return *this; - } - - constexpr u128& operator*=(const u128& r) - { - const u64 _hi = r.hi * lo + r.lo * hi; - - if (std::is_constant_evaluated()) - { - hi = (lo >> 32) * (r.lo >> 32) + (((lo >> 32) * (r.lo & 0xffffffff)) >> 32) + (((r.lo >> 32) * (lo & 0xffffffff)) >> 32); - lo = lo * r.lo; - } - else - { - lo = _umul128(lo, r.lo, &hi); - } - - hi += _hi; - return *this; - } - - constexpr u128& operator<<=(const u128& r) - { - if (std::is_constant_evaluated()) - { - if (r.hi == 0 && r.lo < 64) - { - hi = (hi << r.lo) | (lo >> (64 - r.lo)); - lo = (lo << r.lo); - return *this; - } 
- else if (r.hi == 0 && r.lo < 128) - { - hi = (lo << (r.lo - 64)); - lo = 0; - return *this; - } - } - - const u64 v0 = lo << (r.lo & 63); - const u64 v1 = __shiftleft128(lo, hi, static_cast(r.lo)); - lo = (r.lo & 64) ? 0 : v0; - hi = (r.lo & 64) ? v0 : v1; - return *this; - } - - constexpr u128& operator>>=(const u128& r) - { - if (std::is_constant_evaluated()) - { - if (r.hi == 0 && r.lo < 64) - { - lo = (lo >> r.lo) | (hi << (64 - r.lo)); - hi = (hi >> r.lo); - return *this; - } - else if (r.hi == 0 && r.lo < 128) - { - lo = (hi >> (r.lo - 64)); - hi = 0; - return *this; - } - } - - const u64 v0 = hi >> (r.lo & 63); - const u64 v1 = __shiftright128(lo, hi, static_cast(r.lo)); - lo = (r.lo & 64) ? v0 : v1; - hi = (r.lo & 64) ? 0 : v0; - return *this; - } - - constexpr u128& operator&=(const u128& r) - { - lo &= r.lo; - hi &= r.hi; - return *this; - } - - constexpr u128& operator|=(const u128& r) - { - lo |= r.lo; - hi |= r.hi; - return *this; - } - - constexpr u128& operator^=(const u128& r) - { - lo ^= r.lo; - hi ^= r.hi; - return *this; - } -}; - -// Signed 128-bit integer implementation -struct s128 : u128 -{ - using u128::u128; - - constexpr s128 operator>>(u128 shift_value) const - { - s128 value = *this; - value >>= shift_value; - return value; - } - - constexpr s128& operator>>=(const u128& r) - { - if (std::is_constant_evaluated()) - { - if (r.hi == 0 && r.lo < 64) - { - lo = (lo >> r.lo) | (hi << (64 - r.lo)); - hi = (static_cast(hi) >> r.lo); - return *this; - } - else if (r.hi == 0 && r.lo < 128) - { - s64 _lo = static_cast(hi) >> (r.lo - 64); - lo = _lo; - hi = _lo >> 63; - return *this; - } - } - - const u64 v0 = static_cast(hi) >> (r.lo & 63); - const u64 v1 = __shiftright128(lo, hi, static_cast(r.lo)); - lo = (r.lo & 64) ? v0 : v1; - hi = (r.lo & 64) ? 
static_cast(hi) >> 63 : v0; - return *this; - } -}; -#endif - -// Optimization for u64*u64=u128 -constexpr u128 u128_from_mul(u64 a, u64 b) -{ -#ifdef _MSC_VER - if (!std::is_constant_evaluated()) - { - u64 hi; - u128 result = _umul128(a, b, &hi); - result.hi = hi; - return result; - } -#endif - - return u128{a} * b; -} - -template <> -struct get_int_impl<16> -{ - using utype = u128; - using stype = s128; -}; - -enum class f16 : u16 -{ -}; - -using f32 = float; -using f64 = double; - -template -concept UnsignedInt = std::is_unsigned_v> || std::is_same_v, u128>; - -template -concept SignedInt = (std::is_signed_v> && std::is_integral_v>) || std::is_same_v, s128>; - -template -concept FPInt = std::is_floating_point_v> || std::is_same_v, f16>; - -template -concept Integral = std::is_integral_v> || std::is_same_v, u128> || std::is_same_v, s128>; - -template -constexpr T min_v; - -template -constexpr std::common_type_t min_v = 0; - -template -constexpr std::common_type_t min_v = static_cast>(-1) << (sizeof(std::common_type_t) * 8 - 1); - -template <> -constexpr inline f16 min_v{0xfbffu}; - -template <> -constexpr inline f32 min_v = std::bit_cast(0xff'7fffffu); - -template <> -constexpr inline f64 min_v = std::bit_cast(0xffe'7ffff'ffffffffu); - -template -constexpr std::common_type_t min_v = min_v>; - -template -constexpr T max_v; - -template -constexpr std::common_type_t max_v = -1; - -template -constexpr std::common_type_t max_v = static_cast>(~min_v); - -template <> -constexpr inline f16 max_v{0x7bffu}; - -template <> -constexpr inline f32 max_v = std::bit_cast(0x7f'7fffffu); - -template <> -constexpr inline f64 max_v = std::bit_cast(0x7fe'fffff'ffffffffu); - -template -constexpr std::common_type_t max_v = max_v>; - -// Return magic value for any unsigned type -constexpr struct umax_impl_t -{ - template - constexpr bool operator==(const T& rhs) const - { - return rhs == max_v; - } - - template - constexpr std::strong_ordering operator<=>(const T& rhs) const - { - return rhs == max_v ? std::strong_ordering::equal : std::strong_ordering::greater; - } - - template - constexpr operator T() const - { - return max_v; - } -} umax; - -constexpr struct smin_impl_t -{ - template - constexpr bool operator==(const T& rhs) const - { - return rhs == min_v; - } - - template - constexpr std::strong_ordering operator<=>(const T& rhs) const - { - return rhs == min_v ? std::strong_ordering::equal : std::strong_ordering::less; - } - - template - constexpr operator T() const - { - return min_v; - } -} smin; - -constexpr struct smax_impl_t -{ - template - constexpr bool operator==(const T& rhs) const - { - return rhs == max_v; - } - - template - constexpr std::strong_ordering operator<=>(const T& rhs) const - { - return rhs == max_v ? 
std::strong_ordering::equal : std::strong_ordering::greater; - } - - template - constexpr operator T() const - { - return max_v; - } -} smax; - -// Compare signed or unsigned type with its max value -constexpr struct amax_impl_t -{ - template - requires SignedInt || UnsignedInt - constexpr bool operator==(const T& rhs) const - { - return rhs == max_v; - } - - template - requires SignedInt || UnsignedInt - constexpr std::strong_ordering operator<=>(const T& rhs) const - { - return max_v <=> rhs; - } - - template - requires SignedInt || UnsignedInt - constexpr operator T() const - { - return max_v; - } -} amax; - -// Compare signed or unsigned type with its minimal value (like zero or INT_MIN) -constexpr struct amin_impl_t -{ - template - requires SignedInt || UnsignedInt - constexpr bool operator==(const T& rhs) const - { - return rhs == min_v; - } - - template - requires SignedInt || UnsignedInt - constexpr std::strong_ordering operator<=>(const T& rhs) const - { - return min_v <=> rhs; - } - - template - requires SignedInt || UnsignedInt - constexpr operator T() const - { - return min_v; - } -} amin; - -template -inline u32 offset32(T T2::* const mptr) -{ -#ifdef _MSC_VER - return std::bit_cast(mptr); -#elif __GNUG__ - return std::bit_cast(mptr); -#else - static_assert(sizeof(mptr) == 0, "Unsupported pointer-to-member size"); -#endif -} - -template -struct offset32_array -{ - static_assert(std::is_array_v, "Invalid pointer-to-member type (array expected)"); - - template - static inline u32 index32(const Arg& arg) - { - return u32{sizeof(std::remove_extent_t)} * static_cast(arg); - } -}; - -template -struct offset32_array> -{ - template - static inline u32 index32(const Arg& arg) - { - return u32{sizeof(T)} * static_cast(arg); - } -}; - -template -struct offset32_detail; - -template -inline u32 offset32(T T2::* const mptr, const Arg& arg, const Args&... args) -{ - return offset32_detail::offset32(mptr, arg, args...); -} - -template -struct offset32_detail -{ - template - static inline u32 offset32(T T2::* const mptr, const Arg& arg, const Args&... args) - { - return ::offset32(mptr, args...) + offset32_array::index32(arg); - } -}; - -template -struct offset32_detail -{ - template - static inline u32 offset32(T T2::* const mptr, T3 T4::* const mptr2, const Args&... args) - { - return ::offset32(mptr) + ::offset32(mptr2, args...); - } -}; - -// Convert 0-2-byte string to u16 value like reinterpret_cast does -constexpr u16 operator""_u16(const char* s, usz /*length*/) -{ - char buf[2]{s[0], s[1]}; - return std::bit_cast(buf); -} - -// Convert 3-4-byte string to u32 value like reinterpret_cast does -constexpr u32 operator""_u32(const char* s, usz /*length*/) -{ - char buf[4]{s[0], s[1], s[2], s[3]}; - return std::bit_cast(buf); -} - -// Convert 5-8-byte string to u64 value like reinterpret_cast does -constexpr u64 operator""_u64(const char* s, usz len) -{ - char buf[8]{s[0], s[1], s[2], s[3], s[4], (len < 6 ? '\0' : s[5]), (len < 7 ? '\0' : s[6]), (len < 8 ? 
'\0' : s[7])}; - return std::bit_cast(buf); -} - -#if !defined(__INTELLISENSE__) && !__has_builtin(__builtin_COLUMN) && !defined(_MSC_VER) -constexpr unsigned __builtin_COLUMN() -{ - return -1; -} -#endif - -template -struct const_str_t -{ - static constexpr usz size = Size; - - char8_t chars[Size + 1]{}; - - constexpr const_str_t(const char (&a)[Size + 1]) - { - for (usz i = 0; i <= Size; i++) - chars[i] = a[i]; - } - - constexpr const_str_t(const char8_t (&a)[Size + 1]) - { - for (usz i = 0; i <= Size; i++) - chars[i] = a[i]; - } - - operator const char*() const - { - return reinterpret_cast(chars); - } - - constexpr operator const char8_t*() const - { - return chars; - } -}; - -template <> -struct const_str_t -{ - const usz size; - - union - { - const char8_t* chars; - const char* chars2; - }; - - constexpr const_str_t() - : size(0), chars(nullptr) - { - } - - template - constexpr const_str_t(const char8_t (&a)[N]) - : size(N - 1), chars(+a) - { - } - - template - constexpr const_str_t(const char (&a)[N]) - : size(N - 1), chars2(+a) - { - } - - constexpr operator const char*() const - { - return std::launder(chars2); - } - - constexpr operator const char8_t*() const - { - return chars; - } -}; - -template -const_str_t(const char (&a)[Size]) -> const_str_t; - -template -const_str_t(const char8_t (&a)[Size]) -> const_str_t; - -using const_str = const_str_t<>; - -namespace fmt -{ - [[noreturn]] void raw_verify_error(std::source_location loc, const char8_t* msg, usz object); - [[noreturn]] void raw_range_error(std::source_location loc, std::string_view index, usz container_size); - [[noreturn]] void raw_range_error(std::source_location loc, usz index, usz container_size); -} // namespace fmt - -// No full implementation to ease on header weight -template -std::conditional_t>, usz, std::string_view> format_object_simplified(const T& obj) -{ - using type = std::remove_cvref_t; - - if constexpr (std::is_integral_v || std::is_same_v || std::is_same_v) - { - return obj; - } - else if constexpr (std::is_array_v && std::is_constructible_v) - { - return {obj, std::size(obj) - 1}; - } - else - { - return std::string_view{}; - } -} - -template -constexpr decltype(auto) ensure(T&& arg, const_str msg = const_str(), std::source_location src_loc = std::source_location::current()) noexcept -{ - if (std::forward(arg)) [[likely]] - { - return std::forward(arg); - } - - fmt::raw_verify_error(src_loc, msg, 0); -} - -template - requires(std::is_invocable_v) -constexpr decltype(auto) ensure(T&& arg, F&& pred, const_str msg = const_str(), std::source_location src_loc = std::source_location::current()) noexcept -{ - if (std::forward(pred)(std::forward(arg))) [[likely]] - { - return std::forward(arg); - } - - fmt::raw_verify_error(src_loc, msg, 0); -} - -template - requires(std::is_integral_v() + std::declval())>) -[[nodiscard]] constexpr To narrow(const From& value, std::source_location src_loc = std::source_location::current()) -{ - // Narrow check - using CommonFrom = std::common_type_t; - using CommonTo = std::common_type_t; - - using UnFrom = std::make_unsigned_t; - using UnTo = std::make_unsigned_t; - - constexpr bool is_from_signed = std::is_signed_v; - constexpr bool is_to_signed = std::is_signed_v; - - constexpr auto from_mask = (is_from_signed && !is_to_signed) ? UnFrom{umax} >> 1 : UnFrom{umax}; - constexpr auto to_mask = (is_to_signed && !is_from_signed) ? 
UnTo{umax} >> 1 : UnTo{umax}; - - constexpr auto mask = ~(from_mask & to_mask); - - // Signed to unsigned always require test - // Otherwise, this is bit-wise narrowing or conversion between types of different signedness of the same size - if constexpr ((is_from_signed && !is_to_signed) || to_mask < from_mask) - { - // Try to optimize test if both are of the same signedness - if (is_from_signed != is_to_signed ? !!(value & mask) : static_cast(value) != value) [[unlikely]] - { - fmt::raw_verify_error(src_loc, u8"Narrowing error", +value); - } - } - - return static_cast(value); -} - -// Returns u32 size() for container -template - requires requires(const CT& x) { std::size(x); } -[[nodiscard]] constexpr u32 size32(const CT& container, std::source_location src_loc = std::source_location::current()) -{ - // TODO: Support std::array - constexpr bool is_const = std::is_array_v>; - - if constexpr (is_const) - { - constexpr usz Size = sizeof(container) / sizeof(container[0]); - return std::conditional_t{Size}; - } - else - { - return narrow(container.size(), src_loc); - } -} - -template - requires requires(CT&& x) { std::size(x); std::data(x); } || requires(CT&& x) { std::size(x); x.front(); } -[[nodiscard]] constexpr auto& at32(CT&& container, T&& index, std::source_location src_loc = std::source_location::current()) -{ - // Make sure the index is within u32 range - const std::make_unsigned_t> idx = index; - const u32 csz = ::size32(container, src_loc); - if (csz <= idx) [[unlikely]] - fmt::raw_range_error(src_loc, format_object_simplified(index), csz); - auto it = std::begin(std::forward(container)); - std::advance(it, idx); - return *it; -} - -template - requires requires(CT&& x, T&& y) { x.count(y); x.find(y); } -[[nodiscard]] constexpr auto& at32(CT&& container, T&& index, std::source_location src_loc = std::source_location::current()) -{ - // Associative container - const auto found = container.find(std::forward(index)); - usz csv = umax; - if constexpr ((requires() { container.size(); })) - csv = container.size(); - if (found == container.end()) [[unlikely]] - fmt::raw_range_error(src_loc, format_object_simplified(index), csv); - return found->second; -} - -// Simplified hash algorithm. May be used in std::unordered_(map|set). -template -struct value_hash -{ - usz operator()(T value) const - { - return static_cast(value) >> Shift; - } -}; - -template -struct fill_array_t -{ - std::tuple args; - - template - constexpr std::unwrap_reference_t get() const - { - return std::get(args); - } - - template - constexpr std::array fill(std::index_sequence, std::index_sequence) const - { - return {(static_cast(Idx), U(get()...))...}; - } - - template - constexpr operator std::array() const - { - return fill(std::make_index_sequence(), std::make_index_sequence()); - } -}; - -template -constexpr auto fill_array(const T&... 
args) -{ - return fill_array_t{{args...}}; -} - -template -concept PtrCastable = requires(const volatile X* x, const volatile Y* y) { - static_cast(x); - static_cast(y); -}; - -template - requires PtrCastable -consteval bool is_same_ptr() -{ - if constexpr (std::is_void_v || std::is_void_v || std::is_same_v, std::remove_cv_t>) - { - return true; - } - else if constexpr (sizeof(X) == sizeof(Y)) - { - return true; - } - else - { - bool result = false; - - if constexpr (sizeof(X) < sizeof(Y)) - { - std::allocator a{}; - Y* ptr = a.allocate(1); - result = static_cast(ptr) == static_cast(ptr); - a.deallocate(ptr, 1); - } - else - { - std::allocator a{}; - X* ptr = a.allocate(1); - result = static_cast(ptr) == static_cast(ptr); - a.deallocate(ptr, 1); - } - - return result; - } -} - -template - requires PtrCastable -constexpr bool is_same_ptr(const volatile Y* ptr) -{ - return static_cast(ptr) == static_cast(ptr); -} - -template -concept PtrSame = (is_same_ptr()); - -namespace stx -{ - template - struct exact_t - { - static_assert(std::is_reference_v || std::is_convertible_v); - - T obj; - - explicit exact_t(T&& _obj) : obj(std::forward(_obj)) {} - exact_t& operator=(const exact_t&) = delete; - - template - requires(std::is_same_v) - operator U&() const noexcept - { - return obj; - }; - - template - requires(std::is_same_v) - operator const U&() const noexcept - { - return obj; - }; - - template - requires(std::is_same_v && std::is_copy_constructible_v) - operator U() const noexcept - { - return obj; - }; - }; - - template - stx::exact_t make_exact(T&& obj) noexcept - { - return stx::exact_t(static_cast(obj)); - } -} // namespace stx - -// Read object of type T from raw pointer, array, string, vector, or any contiguous container -template -constexpr T read_from_ptr(U&& array, usz pos = 0) -{ - // TODO: ensure array element types are trivial - static_assert(sizeof(T) % sizeof(array[0]) == 0); - std::decay_t buf[sizeof(T) / sizeof(array[0])]; - if (!std::is_constant_evaluated()) - std::memcpy(+buf, &array[pos], sizeof(buf)); - else - for (usz i = 0; i < pos; buf[i] = array[pos + i], i++) - ; - return std::bit_cast(buf); -} - -template -constexpr void write_to_ptr(U&& array, usz pos, const T& value) -{ - static_assert(sizeof(T) % sizeof(array[0]) == 0); - if (!std::is_constant_evaluated()) - std::memcpy(static_cast(&array[pos]), &value, sizeof(value)); - else - ensure(!"Unimplemented"); -} - -template -constexpr void write_to_ptr(U&& array, const T& value) -{ - static_assert(sizeof(T) % sizeof(array[0]) == 0); - if (!std::is_constant_evaluated()) - std::memcpy(&array[0], &value, sizeof(value)); - else - ensure(!"Unimplemented"); -} - -constexpr struct aref_tag_t -{ -} aref_tag{}; - -template -class aref final -{ - U* m_ptr; - - static_assert(sizeof(std::decay_t) % sizeof(U) == 0); - -public: - aref() = delete; - - constexpr aref(const aref&) = default; - - explicit constexpr aref(aref_tag_t, U* ptr) - : m_ptr(ptr) - { - } - - constexpr T value() const - { - return read_from_ptr(m_ptr); - } - - constexpr operator T() const - { - return read_from_ptr(m_ptr); - } - - aref& operator=(const aref&) = delete; - - constexpr aref& operator=(const T& value) const - { - write_to_ptr(m_ptr, value); - return *this; - } - - template - requires(std::is_convertible_v) && PtrSame - aref ref(MT T2::* const mptr) const - { - return aref(aref_tag, m_ptr + offset32(mptr) / sizeof(U)); - } - - template > - requires(std::is_convertible_v) && PtrSame - aref ref(MT T2::* const mptr, usz index) const - { - return 
aref(aref_tag, m_ptr + offset32(mptr) / sizeof(U) + sizeof(ET) / sizeof(U) * index); - } -}; - -template -class aref -{ - U* m_ptr; - - static_assert(sizeof(std::decay_t) % sizeof(U) == 0); - -public: - aref() = delete; - - constexpr aref(const aref&) = default; - - explicit constexpr aref(aref_tag_t, U* ptr) - : m_ptr(ptr) - { - } - - aref& operator=(const aref&) = delete; - - constexpr aref operator[](usz index) const - { - return aref(aref_tag, m_ptr + index * (sizeof(T) / sizeof(U))); - } -}; - -template -class aref -{ - U* m_ptr; - - static_assert(sizeof(std::decay_t) % sizeof(U) == 0); - -public: - aref() = delete; - - constexpr aref(const aref&) = default; - - explicit constexpr aref(aref_tag_t, U* ptr) - : m_ptr(ptr) - { - } - - aref& operator=(const aref&) = delete; - - constexpr aref operator[](usz index) const - { - return aref(aref_tag, m_ptr + index * (sizeof(T) / sizeof(U))); - } -}; - -// Reference object of type T, see read_from_ptr -template -constexpr auto ref_ptr(U&& array, usz pos = 0) -> aref> -{ - return aref>(aref_tag, &array[pos]); -} - namespace utils { struct serial; @@ -1374,9 +42,3 @@ extern bool serialize(utils::serial& ar, T& obj); #define ENABLE_BITWISE_SERIALIZATION using enable_bitcopy = std::true_type; #define SAVESTATE_INIT_POS(...) static constexpr double savestate_init_pos = (__VA_ARGS__) - -#define UNUSED(expr) \ - do \ - { \ - (void)(expr); \ - } while (0) diff --git a/rpcs3/util/v128.hpp b/rpcs3/util/v128.hpp index 0a5061dcd..80151d29e 100644 --- a/rpcs3/util/v128.hpp +++ b/rpcs3/util/v128.hpp @@ -1,223 +1,6 @@ #pragma once // No BOM and only basic ASCII in this header, or a neko will die #include "util/types.hpp" +#include -template -concept Vector128 = (sizeof(T) == 16) && (std::is_trivial_v); - -// 128-bit vector type -union alignas(16) v128 -{ - uchar _bytes[16]; - char _chars[16]; - - template - struct masked_array_t // array type accessed as (index ^ M) - { - T m_data[N]; - - public: - T& operator[](usz index) - { - return m_data[index ^ M]; - } - - const T& operator[](usz index) const - { - return m_data[index ^ M]; - } - }; - - template - using normal_array_t = masked_array_t; - template - using reversed_array_t = masked_array_t; - - normal_array_t _u64; - normal_array_t _s64; - reversed_array_t u64r; - reversed_array_t s64r; - - normal_array_t _u32; - normal_array_t _s32; - reversed_array_t u32r; - reversed_array_t s32r; - - normal_array_t _u16; - normal_array_t _s16; - reversed_array_t u16r; - reversed_array_t s16r; - - normal_array_t _u8; - normal_array_t _s8; - reversed_array_t u8r; - reversed_array_t s8r; - - normal_array_t _f; - normal_array_t _d; - reversed_array_t fr; - reversed_array_t dr; - - u128 _u; - s128 _s; - - v128() = default; - - constexpr v128(const v128&) noexcept = default; - - template - constexpr v128(const T& rhs) noexcept - : v128(std::bit_cast(rhs)) - { - } - - constexpr v128& operator=(const v128&) noexcept = default; - - template - constexpr operator T() const noexcept - { - return std::bit_cast(*this); - } - - ENABLE_BITWISE_SERIALIZATION; - - static v128 from64(u64 _0, u64 _1 = 0) - { - v128 ret; - ret._u64[0] = _0; - ret._u64[1] = _1; - return ret; - } - - static v128 from64r(u64 _1, u64 _0 = 0) - { - return from64(_0, _1); - } - - static v128 from64p(u64 value) - { - v128 ret; - ret._u64[0] = value; - ret._u64[1] = value; - return ret; - } - - static v128 from32(u32 _0, u32 _1 = 0, u32 _2 = 0, u32 _3 = 0) - { - v128 ret; - ret._u32[0] = _0; - ret._u32[1] = _1; - ret._u32[2] = _2; - ret._u32[3] = _3; - return 
ret; - } - - static v128 from32r(u32 _3, u32 _2 = 0, u32 _1 = 0, u32 _0 = 0) - { - return from32(_0, _1, _2, _3); - } - - static v128 from32p(u32 value) - { - v128 ret; - ret._u32[0] = value; - ret._u32[1] = value; - ret._u32[2] = value; - ret._u32[3] = value; - return ret; - } - - static v128 fromf32p(f32 value) - { - v128 ret; - ret._f[0] = value; - ret._f[1] = value; - ret._f[2] = value; - ret._f[3] = value; - return ret; - } - - static v128 from16p(u16 value) - { - v128 ret; - ret._u16[0] = value; - ret._u16[1] = value; - ret._u16[2] = value; - ret._u16[3] = value; - ret._u16[4] = value; - ret._u16[5] = value; - ret._u16[6] = value; - ret._u16[7] = value; - return ret; - } - - static v128 from8p(u8 value) - { - v128 ret; - std::memset(&ret, value, sizeof(ret)); - return ret; - } - - static v128 undef() - { -#if defined(__GNUC__) || defined(__clang__) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wuninitialized" -#elif _MSC_VER -#pragma warning(push) -#pragma warning(disable : 6001) -#endif - v128 ret; - return ret; -#if defined(__GNUC__) || defined(__clang__) -#pragma GCC diagnostic pop -#elif _MSC_VER -#pragma warning(pop) -#endif - } - - // Unaligned load with optional index offset - static v128 loadu(const void* ptr, usz index = 0) - { - v128 ret; - std::memcpy(&ret, static_cast(ptr) + index * sizeof(v128), sizeof(v128)); - return ret; - } - - // Unaligned store with optional index offset - static void storeu(v128 value, void* ptr, usz index = 0) - { - std::memcpy(static_cast(ptr) + index * sizeof(v128), &value, sizeof(v128)); - } - - v128 operator|(const v128&) const; - v128 operator&(const v128&) const; - v128 operator^(const v128&) const; - v128 operator~() const; - - bool operator==(const v128& right) const; - - void clear() - { - *this = {}; - } -}; - -template -struct offset32_array> -{ - template - static inline u32 index32(const Arg& arg) - { - return u32{sizeof(T)} * (static_cast(arg) ^ static_cast(M)); - } -}; - -template <> -struct std::hash -{ - usz operator()(const v128& key) const - { - return key._u64[0] + key._u64[1]; - } -}; +using rx::v128; diff --git a/rpcs3/util/vm_native.cpp b/rpcs3/util/vm_native.cpp index 008b13fde..188857a03 100644 --- a/rpcs3/util/vm_native.cpp +++ b/rpcs3/util/vm_native.cpp @@ -188,16 +188,19 @@ namespace utils { static const long r = []() -> long { + long result; #ifdef _WIN32 SYSTEM_INFO info; ::GetSystemInfo(&info); - return info.dwPageSize; + result = info.dwPageSize; #else - return ::sysconf(_SC_PAGESIZE); + result = ::sysconf(_SC_PAGESIZE); #endif + ensure(result, FN(((x & (x - 1)) == 0 && x > 0 && x <= 0x10000))); + return result; }(); - return ensure(r, FN(((x & (x - 1)) == 0 && x > 0 && x <= 0x10000))); + return r; } // Convert memory protection (internal) diff --git a/rpcs3qt-legacy/emu_settings.cpp b/rpcs3qt-legacy/emu_settings.cpp index 277f1d354..459262c4b 100644 --- a/rpcs3qt-legacy/emu_settings.cpp +++ b/rpcs3qt-legacy/emu_settings.cpp @@ -1244,8 +1244,9 @@ QString emu_settings::GetLocalizedSetting(const QString& original, emu_settings_ case emu_settings_type::PPUDecoder: switch (static_cast(index)) { - case ppu_decoder_type::_static: return tr("Interpreter (static)", "PPU decoder"); - case ppu_decoder_type::llvm: return tr("Recompiler (LLVM)", "PPU decoder"); + case ppu_decoder_type::_static: return tr("Interpreter (Legacy)", "PPU decoder"); + case ppu_decoder_type::llvm_legacy: return tr("LLVM Recompiler (Legacy)", "PPU decoder"); + case ppu_decoder_type::interpreter: return tr("Interpreter", "PPU 
decoder"); } break; case emu_settings_type::SPUDecoder: diff --git a/rpcs3qt-legacy/settings_dialog.cpp b/rpcs3qt-legacy/settings_dialog.cpp index 38d731bec..090af6a06 100644 --- a/rpcs3qt-legacy/settings_dialog.cpp +++ b/rpcs3qt-legacy/settings_dialog.cpp @@ -367,7 +367,8 @@ settings_dialog::settings_dialog(std::shared_ptr gui_settings, std QButtonGroup* ppu_bg = new QButtonGroup(this); ppu_bg->addButton(ui->ppu__static, static_cast(ppu_decoder_type::_static)); - ppu_bg->addButton(ui->ppu_llvm, static_cast(ppu_decoder_type::llvm)); + ppu_bg->addButton(ui->ppu_llvm, static_cast(ppu_decoder_type::llvm_legacy)); + ppu_bg->addButton(ui->ppu_interpreter, static_cast(ppu_decoder_type::interpreter)); connect(ppu_bg, &QButtonGroup::idToggled, [this](int id, bool checked) { @@ -376,12 +377,13 @@ settings_dialog::settings_dialog(std::shared_ptr gui_settings, std switch (id) { + case static_cast(ppu_decoder_type::interpreter): case static_cast(ppu_decoder_type::_static): ui->accuratePPUFPCC->setEnabled(true); ui->accuratePPUNJ->setEnabled(true); ui->accuratePPUVNAN->setEnabled(true); break; - case static_cast(ppu_decoder_type::llvm): + case static_cast(ppu_decoder_type::llvm_legacy): ui->accuratePPUFPCC->setEnabled(false); ui->accuratePPUNJ->setEnabled(false); ui->accuratePPUVNAN->setEnabled(false); diff --git a/rpcs3qt-legacy/settings_dialog.ui b/rpcs3qt-legacy/settings_dialog.ui index 08a453d1f..39ab1d1fa 100644 --- a/rpcs3qt-legacy/settings_dialog.ui +++ b/rpcs3qt-legacy/settings_dialog.ui @@ -74,14 +74,21 @@ - Interpreter (static) + Interpreter (Legacy) - LLVM Recompiler (fastest) + LLVM Recompiler (Legacy) + + + + + + + Interpreter diff --git a/rpcsx/CMakeLists.txt b/rpcsx/CMakeLists.txt index 869e24851..fd653db2a 100644 --- a/rpcsx/CMakeLists.txt +++ b/rpcsx/CMakeLists.txt @@ -1,103 +1,107 @@ -find_package(libunwind REQUIRED) -find_package(sox REQUIRED) -find_package(ALSA REQUIRED) - add_library(standalone-config INTERFACE) target_include_directories(standalone-config INTERFACE orbis-kernel-config) add_library(orbis::kernel::config ALIAS standalone-config) -add_executable(rpcsx - audio/AudioDevice.cpp - audio/AlsaDevice.cpp +add_subdirectory(cpu) - iodev/a53io.cpp - iodev/ajm.cpp - iodev/blockpool.cpp - iodev/bt.cpp - iodev/camera.cpp - iodev/cd.cpp - iodev/console.cpp - iodev/hdd.cpp - iodev/dce.cpp - iodev/dipsw.cpp - iodev/dmem.cpp - iodev/gc.cpp - iodev/hid.cpp - iodev/hmd_3da.cpp - iodev/hmd_cmd.cpp - iodev/hmd_mmap.cpp - iodev/hmd_snsr.cpp - iodev/hmd2_cmd.cpp - iodev/hmd2_imu.cpp - iodev/hmd2_gen_data.cpp - iodev/hmd2_gaze.cpp - iodev/icc_configuration.cpp - iodev/mbus.cpp - iodev/metadbg.cpp - iodev/notification.cpp - iodev/npdrm.cpp - iodev/nsid_ctl.cpp - iodev/null.cpp - iodev/rng.cpp - iodev/sbl_srv.cpp - iodev/shm.cpp - iodev/urandom.cpp - iodev/xpt.cpp - iodev/zero.cpp - iodev/aout.cpp - iodev/av_control.cpp - iodev/hdmi.cpp - iodev/mbus_av.cpp - iodev/scanin.cpp - iodev/s3da.cpp - iodev/gbase.cpp - iodev/devstat.cpp - iodev/devact.cpp - iodev/devctl.cpp - iodev/uvd.cpp - iodev/vce.cpp - iodev/evlg.cpp - iodev/srtc.cpp - iodev/sshot.cpp - iodev/lvdctl.cpp - iodev/icc_power.cpp - iodev/cayman_reg.cpp +if(LINUX AND WITH_RPCSX) + find_package(libunwind REQUIRED) + find_package(sox REQUIRED) + find_package(ALSA REQUIRED) - main.cpp - AudioOut.cpp - backtrace.cpp - vm.cpp - ops.cpp - linker.cpp - io-device.cpp - thread.cpp - vfs.cpp - ipmi.cpp -) + add_subdirectory(gpu) + add_subdirectory(core) -add_subdirectory(gpu) -add_subdirectory(core) + add_executable(rpcsx + 
audio/AudioDevice.cpp + audio/AlsaDevice.cpp -target_include_directories(rpcsx PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) -target_link_libraries(rpcsx -PUBLIC - ffmpeg::avcodec - ffmpeg::swresample - ffmpeg::avutil - Atrac9 - rpcsx-gpu - orbis::kernel - rx - libcrypto - libunwind::unwind-x86_64 - xbyak::xbyak - sox::sox - ALSA::ALSA - rpcsx-core -) + iodev/a53io.cpp + iodev/ajm.cpp + iodev/blockpool.cpp + iodev/bt.cpp + iodev/camera.cpp + iodev/cd.cpp + iodev/console.cpp + iodev/hdd.cpp + iodev/dce.cpp + iodev/dipsw.cpp + iodev/dmem.cpp + iodev/gc.cpp + iodev/hid.cpp + iodev/hmd_3da.cpp + iodev/hmd_cmd.cpp + iodev/hmd_mmap.cpp + iodev/hmd_snsr.cpp + iodev/hmd2_cmd.cpp + iodev/hmd2_imu.cpp + iodev/hmd2_gen_data.cpp + iodev/hmd2_gaze.cpp + iodev/icc_configuration.cpp + iodev/mbus.cpp + iodev/metadbg.cpp + iodev/notification.cpp + iodev/npdrm.cpp + iodev/nsid_ctl.cpp + iodev/null.cpp + iodev/rng.cpp + iodev/sbl_srv.cpp + iodev/shm.cpp + iodev/urandom.cpp + iodev/xpt.cpp + iodev/zero.cpp + iodev/aout.cpp + iodev/av_control.cpp + iodev/hdmi.cpp + iodev/mbus_av.cpp + iodev/scanin.cpp + iodev/s3da.cpp + iodev/gbase.cpp + iodev/devstat.cpp + iodev/devact.cpp + iodev/devctl.cpp + iodev/uvd.cpp + iodev/vce.cpp + iodev/evlg.cpp + iodev/srtc.cpp + iodev/sshot.cpp + iodev/lvdctl.cpp + iodev/icc_power.cpp + iodev/cayman_reg.cpp -target_base_address(rpcsx 0x0000070000000000) -target_compile_options(rpcsx PRIVATE "-mfsgsbase") + main.cpp + AudioOut.cpp + backtrace.cpp + vm.cpp + ops.cpp + linker.cpp + io-device.cpp + thread.cpp + vfs.cpp + ipmi.cpp + ) + + target_base_address(rpcsx 0x0000070000000000) + target_compile_options(rpcsx PRIVATE "-mfsgsbase") + set_target_properties(rpcsx PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) + install(TARGETS rpcsx RUNTIME DESTINATION bin) + + target_include_directories(rpcsx PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) + target_link_libraries(rpcsx + PUBLIC + ffmpeg::avcodec + ffmpeg::swresample + ffmpeg::avutil + Atrac9 + rpcsx-gpu + orbis::kernel + rx + libcrypto + libunwind::unwind-x86_64 + xbyak::xbyak + sox::sox + ALSA::ALSA + rpcsx-core + ) +endif() -set_target_properties(rpcsx PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) -install(TARGETS rpcsx RUNTIME DESTINATION bin) diff --git a/rpcsx/cpu/CMakeLists.txt b/rpcsx/cpu/CMakeLists.txt new file mode 100644 index 000000000..4ddccd7cb --- /dev/null +++ b/rpcsx/cpu/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(cell) diff --git a/rpcsx/cpu/cell/CMakeLists.txt b/rpcsx/cpu/cell/CMakeLists.txt new file mode 100644 index 000000000..c17385891 --- /dev/null +++ b/rpcsx/cpu/cell/CMakeLists.txt @@ -0,0 +1,3 @@ + +add_subdirectory(ppu) + diff --git a/rpcsx/cpu/cell/ppu/CMakeLists.txt b/rpcsx/cpu/cell/ppu/CMakeLists.txt new file mode 100644 index 000000000..0da7bccf1 --- /dev/null +++ b/rpcsx/cpu/cell/ppu/CMakeLists.txt @@ -0,0 +1,32 @@ +add_library( + rpcsx_cpu_cell_ppu STATIC + src/Decoder.cpp +) + +add_library(rpcsx_cpu_cell_ppu_semantic +STATIC + semantic/ppu.cpp +) +target_include_directories(rpcsx_cpu_cell_ppu_semantic PUBLIC include PRIVATE include/rx/cpu/cell/ppu) +target_link_libraries(rpcsx_cpu_cell_ppu_semantic PUBLIC rx) + +# add_custom_command( +# OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ppu.ll +# COMMAND ${CLANG_EXECUTABLE} -O3 -S -emit-llvm semantic/ppu.cpp -o ${CMAKE_CURRENT_BINARY_DIR}/ppu.ll -I include/rx/cpu/cell/ppu/ -I ../../../../rx/include/ -std=c++23 -fno-exceptions +# WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} +# ) + +# add_custom_target(ppu-semantic DEPENDS 
${CMAKE_CURRENT_BINARY_DIR}/ppu.ll) + +target_include_directories(rpcsx_cpu_cell_ppu + PUBLIC + include + PRIVATE + include/rx/cpu/cell/ppu +) + +target_link_libraries(rpcsx_cpu_cell_ppu PUBLIC rx) +# add_dependencies(rpcsx_cpu_cell_ppu ppu-semantic) +add_library(rpcsx::cpu::cell::ppu ALIAS rpcsx_cpu_cell_ppu) +add_library(rpcsx::cpu::cell::ppu::semantic ALIAS rpcsx_cpu_cell_ppu_semantic) + diff --git a/rpcsx/cpu/cell/ppu/include/rx/cpu/cell/ppu/Decoder.hpp b/rpcsx/cpu/cell/ppu/include/rx/cpu/cell/ppu/Decoder.hpp new file mode 100644 index 000000000..b945b1c7c --- /dev/null +++ b/rpcsx/cpu/cell/ppu/include/rx/cpu/cell/ppu/Decoder.hpp @@ -0,0 +1,23 @@ +#pragma once + +#include "Opcode.hpp" +#include +#include +#include + +namespace rx::cell::ppu { +template using DecoderTable = std::array; + +extern DecoderTable g_ppuOpcodeTable; +// extern std::array> g_opcodeForms; + +inline Opcode getOpcode(std::uint32_t instruction) { + auto decode = [](std::uint32_t inst) { + return ((inst >> 26) | (inst << 6)) & 0x1ffff; // Rotate + mask + }; + + return g_ppuOpcodeTable[decode(instruction)]; +} + +Opcode fixOpcode(Opcode opcode, std::uint32_t instruction); +} // namespace rx::cell::ppu diff --git a/rpcsx/cpu/cell/ppu/include/rx/cpu/cell/ppu/Emitter.hpp b/rpcsx/cpu/cell/ppu/include/rx/cpu/cell/ppu/Emitter.hpp new file mode 100644 index 000000000..93b1d358e --- /dev/null +++ b/rpcsx/cpu/cell/ppu/include/rx/cpu/cell/ppu/Emitter.hpp @@ -0,0 +1,410 @@ +#pragma once + +#include "Instruction.hpp" +#include + +namespace rx::cell::ppu { +inline namespace registers { +enum { + r0, + r1, + r2, + r3, + r4, + r5, + r6, + r7, + r8, + r9, + r10, + r11, + r12, + r13, + r14, + r15, + r16, + r17, + r18, + r19, + r20, + r21, + r22, + r23, + r24, + r25, + r26, + r27, + r28, + r29, + r30, + r31, +}; + +enum { + f0, + f1, + f2, + f3, + f4, + f5, + f6, + f7, + f8, + f9, + f10, + f11, + f12, + f13, + f14, + f15, + F16, + f17, + f18, + f19, + f20, + f21, + f22, + f23, + f24, + f25, + f26, + f27, + f28, + f29, + f30, + f31, +}; + +enum { + v0, + v1, + v2, + v3, + v4, + v5, + v6, + v7, + v8, + v9, + v10, + v11, + v12, + v13, + v14, + v15, + v16, + v17, + v18, + v19, + v20, + v21, + v22, + v23, + v24, + v25, + v26, + v27, + v28, + v29, + v30, + v31, +}; + +enum { + cr0, + cr1, + cr2, + cr3, + cr4, + cr5, + cr6, + cr7, +}; +} // namespace registers + +inline std::uint32_t ADDI(std::uint32_t rt, std::uint32_t ra, std::int32_t si) { + Instruction op{0x0eu << 26}; + op.rd = rt; + op.ra = ra; + op.simm16 = si; + return op.raw; +} +inline std::uint32_t ADDIS(std::uint32_t rt, std::uint32_t ra, + std::int32_t si) { + Instruction op{0x0fu << 26}; + op.rd = rt; + op.ra = ra; + op.simm16 = si; + return op.raw; +} +inline std::uint32_t XORIS(std::uint32_t rt, std::uint32_t ra, + std::int32_t si) { + Instruction op{0x1bu << 26}; + op.rd = rt; + op.ra = ra; + op.simm16 = si; + return op.raw; +} +inline std::uint32_t ORI(std::uint32_t rt, std::uint32_t ra, std::uint32_t ui) { + Instruction op{0x18u << 26}; + op.rd = rt; + op.ra = ra; + op.uimm16 = ui; + return op.raw; +} +inline std::uint32_t ORIS(std::uint32_t rt, std::uint32_t ra, + std::uint32_t ui) { + Instruction op{0x19u << 26}; + op.rd = rt; + op.ra = ra; + op.uimm16 = ui; + return op.raw; +} +inline std::uint32_t OR(std::uint32_t ra, std::uint32_t rs, std::uint32_t rb, + bool rc = false) { + Instruction op{0x1fu << 26 | 0x1bcu << 1}; + op.rs = rs; + op.ra = ra; + op.rb = rb; + op.rc = rc; + return op.raw; +} +inline std::uint32_t SC(std::uint32_t lev) { + Instruction op{0x11u << 26 | 
1 << 1}; + op.lev = lev; + return op.raw; +} +inline std::uint32_t B(std::int32_t li, bool aa = false, bool lk = false) { + Instruction op{0x12u << 26}; + op.ll = li; + op.aa = aa; + op.lk = lk; + return op.raw; +} +inline std::uint32_t BC(std::uint32_t bo, std::uint32_t bi, std::int32_t bd, + bool aa = false, bool lk = false) { + Instruction op{0x10u << 26}; + op.bo = bo; + op.bi = bi; + op.ds = bd / 4; + op.aa = aa; + op.lk = lk; + return op.raw; +} +inline std::uint32_t BCLR(std::uint32_t bo, std::uint32_t bi, std::uint32_t bh, + bool lk = false) { + Instruction op{0x13u << 26 | 0x10u << 1}; + op.bo = bo; + op.bi = bi; + op.bh = bh; + op.lk = lk; + return op.raw; +} +inline std::uint32_t BCCTR(std::uint32_t bo, std::uint32_t bi, std::uint32_t bh, + bool lk = false) { + Instruction op{0x13u << 26 | 0x210u << 1}; + op.bo = bo; + op.bi = bi; + op.bh = bh; + op.lk = lk; + return op.raw; +} +inline std::uint32_t MFSPR(std::uint32_t rt, std::uint32_t spr) { + Instruction op{0x1fu << 26 | 0x153u << 1}; + op.rd = rt; + op.spr = spr; + return op.raw; +} +inline std::uint32_t MTSPR(std::uint32_t spr, std::uint32_t rs) { + Instruction op{0x1fu << 26 | 0x1d3u << 1}; + op.rs = rs; + op.spr = spr; + return op.raw; +} +inline std::uint32_t LWZ(std::uint32_t rt, std::uint32_t ra, std::int32_t si) { + Instruction op{0x20u << 26}; + op.rd = rt; + op.ra = ra; + op.simm16 = si; + return op.raw; +} +inline std::uint32_t STW(std::uint32_t rt, std::uint32_t ra, std::int32_t si) { + Instruction op{0x24u << 26}; + op.rd = rt; + op.ra = ra; + op.simm16 = si; + return op.raw; +} +inline std::uint32_t STD(std::uint32_t rs, std::uint32_t ra, std::int32_t si) { + Instruction op{0x3eu << 26}; + op.rs = rs; + op.ra = ra; + op.ds = si / 4; + return op.raw; +} +inline std::uint32_t STDU(std::uint32_t rs, std::uint32_t ra, std::int32_t si) { + Instruction op{0x3eu << 26 | 1}; + op.rs = rs; + op.ra = ra; + op.ds = si / 4; + return op.raw; +} +inline std::uint32_t LD(std::uint32_t rt, std::uint32_t ra, std::int32_t si) { + Instruction op{0x3au << 26}; + op.rd = rt; + op.ra = ra; + op.ds = si / 4; + return op.raw; +} +inline std::uint32_t LDU(std::uint32_t rt, std::uint32_t ra, std::int32_t si) { + Instruction op{0x3au << 26 | 1}; + op.rd = rt; + op.ra = ra; + op.ds = si / 4; + return op.raw; +} +inline std::uint32_t CMPI(std::uint32_t bf, std::uint32_t l, std::uint32_t ra, + std::uint32_t ui) { + Instruction op{0xbu << 26}; + op.crfd = bf; + op.l10 = l; + op.ra = ra; + op.uimm16 = ui; + return op.raw; +} +inline std::uint32_t CMPLI(std::uint32_t bf, std::uint32_t l, std::uint32_t ra, + std::uint32_t ui) { + Instruction op{0xau << 26}; + op.crfd = bf; + op.l10 = l; + op.ra = ra; + op.uimm16 = ui; + return op.raw; +} +inline std::uint32_t RLDICL(std::uint32_t ra, std::uint32_t rs, + std::uint32_t sh, std::uint32_t mb, + bool rc = false) { + Instruction op{30 << 26}; + op.ra = ra; + op.rs = rs; + op.sh64 = sh; + op.mbe64 = mb; + op.rc = rc; + return op.raw; +} +inline std::uint32_t RLDICR(std::uint32_t ra, std::uint32_t rs, + std::uint32_t sh, std::uint32_t mb, + bool rc = false) { + return RLDICL(ra, rs, sh, mb, rc) | 1 << 2; +} +inline std::uint32_t STFD(std::uint32_t frs, std::uint32_t ra, + std::int32_t si) { + Instruction op{54u << 26}; + op.frs = frs; + op.ra = ra; + op.simm16 = si; + return op.raw; +} +inline std::uint32_t STVX(std::uint32_t vs, std::uint32_t ra, + std::uint32_t rb) { + Instruction op{31 << 26 | 231 << 1}; + op.vs = vs; + op.ra = ra; + op.rb = rb; + return op.raw; +} +inline std::uint32_t 
LFD(std::uint32_t frd, std::uint32_t ra, std::int32_t si) { + Instruction op{50u << 26}; + op.frd = frd; + op.ra = ra; + op.simm16 = si; + return op.raw; +} +inline std::uint32_t LVX(std::uint32_t vd, std::uint32_t ra, std::uint32_t rb) { + Instruction op{31 << 26 | 103 << 1}; + op.vd = vd; + op.ra = ra; + op.rb = rb; + return op.raw; +} +inline constexpr std::uint32_t EIEIO() { return 0x7c0006ac; } + +inline namespace implicts { +inline std::uint32_t NOP() { return ORI(r0, r0, 0); } +inline std::uint32_t MR(std::uint32_t rt, std::uint32_t ra) { + return OR(rt, ra, ra, false); +} +inline std::uint32_t LI(std::uint32_t rt, std::uint32_t imm) { + return ADDI(rt, r0, imm); +} +inline std::uint32_t LIS(std::uint32_t rt, std::uint32_t imm) { + return ADDIS(rt, r0, imm); +} + +inline std::uint32_t BLR() { return BCLR(0x10 | 0x04, 0, 0); } +inline std::uint32_t BCTR() { return BCCTR(0x10 | 0x04, 0, 0); } +inline std::uint32_t BCTRL() { return BCCTR(0x10 | 0x04, 0, 0, true); } +inline std::uint32_t MFCTR(std::uint32_t reg) { return MFSPR(reg, 9 << 5); } +inline std::uint32_t MTCTR(std::uint32_t reg) { return MTSPR(9 << 5, reg); } +inline std::uint32_t MFLR(std::uint32_t reg) { return MFSPR(reg, 8 << 5); } +inline std::uint32_t MTLR(std::uint32_t reg) { return MTSPR(8 << 5, reg); } + +inline std::uint32_t BNE(std::uint32_t cr, std::int32_t imm) { + return BC(4, 2 | cr << 2, imm); +} +inline std::uint32_t BEQ(std::uint32_t cr, std::int32_t imm) { + return BC(12, 2 | cr << 2, imm); +} +inline std::uint32_t BGT(std::uint32_t cr, std::int32_t imm) { + return BC(12, 1 | cr << 2, imm); +} +inline std::uint32_t BNE(std::int32_t imm) { return BNE(cr0, imm); } +inline std::uint32_t BEQ(std::int32_t imm) { return BEQ(cr0, imm); } +inline std::uint32_t BGT(std::int32_t imm) { return BGT(cr0, imm); } + +inline std::uint32_t CMPDI(std::uint32_t cr, std::uint32_t reg, + std::uint32_t imm) { + return CMPI(cr, 1, reg, imm); +} +inline std::uint32_t CMPDI(std::uint32_t reg, std::uint32_t imm) { + return CMPDI(cr0, reg, imm); +} +inline std::uint32_t CMPWI(std::uint32_t cr, std::uint32_t reg, + std::uint32_t imm) { + return CMPI(cr, 0, reg, imm); +} +inline std::uint32_t CMPWI(std::uint32_t reg, std::uint32_t imm) { + return CMPWI(cr0, reg, imm); +} +inline std::uint32_t CMPLDI(std::uint32_t cr, std::uint32_t reg, + std::uint32_t imm) { + return CMPLI(cr, 1, reg, imm); +} +inline std::uint32_t CMPLDI(std::uint32_t reg, std::uint32_t imm) { + return CMPLDI(cr0, reg, imm); +} +inline std::uint32_t CMPLWI(std::uint32_t cr, std::uint32_t reg, + std::uint32_t imm) { + return CMPLI(cr, 0, reg, imm); +} +inline std::uint32_t CMPLWI(std::uint32_t reg, std::uint32_t imm) { + return CMPLWI(cr0, reg, imm); +} + +inline std::uint32_t EXTRDI(std::uint32_t x, std::uint32_t y, std::uint32_t n, + std::uint32_t b) { + return RLDICL(x, y, b + n, 64 - b, false); +} +inline std::uint32_t SRDI(std::uint32_t x, std::uint32_t y, std::uint32_t n) { + return RLDICL(x, y, 64 - n, n, false); +} +inline std::uint32_t CLRLDI(std::uint32_t x, std::uint32_t y, std::uint32_t n) { + return RLDICL(x, y, 0, n, false); +} +inline std::uint32_t CLRRDI(std::uint32_t x, std::uint32_t y, std::uint32_t n) { + return RLDICR(x, y, 0, 63 - n, false); +} + +inline constexpr std::uint32_t TRAP() { return 0x7FE00008; } // tw 31,r0,r0 +} // namespace implicts +} // namespace rx::cell::ppu diff --git a/rpcsx/cpu/cell/ppu/include/rx/cpu/cell/ppu/Instruction.hpp b/rpcsx/cpu/cell/ppu/include/rx/cpu/cell/ppu/Instruction.hpp new file mode 100644 index 
000000000..918aac79d --- /dev/null +++ b/rpcsx/cpu/cell/ppu/include/rx/cpu/cell/ppu/Instruction.hpp @@ -0,0 +1,72 @@ +#pragma once +#include +#include + +namespace rx::cell::ppu { +union Instruction { + template + using bf = BitField; + + std::uint32_t raw; + + bf main; // 0..5 + BitFieldPack, bf> + sh64; // 30 + 16..20 + BitFieldPack, bf> + mbe64; // 26 + 21..25 + bf vuimm; // 11..15 + bf vs; // 6..10 + bf vsh; // 22..25 + bf oe; // 21 + bf spr; // 11..20 + bf vc; // 21..25 + bf vb; // 16..20 + bf va; // 11..15 + bf vd; // 6..10 + bf lk; // 31 + bf aa; // 30 + bf rb; // 16..20 + bf ra; // 11..15 + bf rd; // 6..10 + bf uimm16; // 16..31 + bf l11; // 11 + bf rs; // 6..10 + bf simm16; // 16..31, signed + bf ds; // 16..29, signed + bf vsimm; // 11..15, signed + bf ll; // 6..31, signed + bf li; // 6..29, signed + bf lev; // 20..26 + bf i; // 16..19 + bf crfs; // 11..13 + bf l10; // 10 + bf crfd; // 6..8 + bf crbb; // 16..20 + bf crba; // 11..15 + bf crbd; // 6..10 + bf rc; // 31 + bf me32; // 26..30 + bf mb32; // 21..25 + bf sh32; // 16..20 + bf bi; // 11..15 + bf bo; // 6..10 + bf bh; // 19..20 + bf frc; // 21..25 + bf frb; // 16..20 + bf fra; // 11..15 + bf frd; // 6..10 + bf crm; // 12..19 + bf frs; // 6..10 + bf flm; // 7..14 + bf l6; // 6 + bf l15; // 15 + + BitFieldPack, BitFieldFixed> + bt14; + + BitFieldPack, BitFieldFixed> + bt24; +}; + +static_assert(sizeof(Instruction) == sizeof(std::uint32_t)); +} // namespace rx::cell::ppu diff --git a/rpcsx/cpu/cell/ppu/include/rx/cpu/cell/ppu/Opcode.hpp b/rpcsx/cpu/cell/ppu/include/rx/cpu/cell/ppu/Opcode.hpp new file mode 100644 index 000000000..1d573fcb5 --- /dev/null +++ b/rpcsx/cpu/cell/ppu/include/rx/cpu/cell/ppu/Opcode.hpp @@ -0,0 +1,858 @@ +#pragma once + +namespace rx::cell::ppu { +enum class Opcode { + Invalid, + + MFVSCR, + MTVSCR, + VADDCUW, + VADDFP, + VADDSBS, + VADDSHS, + VADDSWS, + VADDUBM, + VADDUBS, + VADDUHM, + VADDUHS, + VADDUWM, + VADDUWS, + VAND, + VANDC, + VAVGSB, + VAVGSH, + VAVGSW, + VAVGUB, + VAVGUH, + VAVGUW, + VCFSX, + VCFUX, + VCMPBFP, + VCMPBFP_, + VCMPEQFP, + VCMPEQFP_, + VCMPEQUB, + VCMPEQUB_, + VCMPEQUH, + VCMPEQUH_, + VCMPEQUW, + VCMPEQUW_, + VCMPGEFP, + VCMPGEFP_, + VCMPGTFP, + VCMPGTFP_, + VCMPGTSB, + VCMPGTSB_, + VCMPGTSH, + VCMPGTSH_, + VCMPGTSW, + VCMPGTSW_, + VCMPGTUB, + VCMPGTUB_, + VCMPGTUH, + VCMPGTUH_, + VCMPGTUW, + VCMPGTUW_, + VCTSXS, + VCTUXS, + VEXPTEFP, + VLOGEFP, + VMADDFP, + VMAXFP, + VMAXSB, + VMAXSH, + VMAXSW, + VMAXUB, + VMAXUH, + VMAXUW, + VMHADDSHS, + VMHRADDSHS, + VMINFP, + VMINSB, + VMINSH, + VMINSW, + VMINUB, + VMINUH, + VMINUW, + VMLADDUHM, + VMRGHB, + VMRGHH, + VMRGHW, + VMRGLB, + VMRGLH, + VMRGLW, + VMSUMMBM, + VMSUMSHM, + VMSUMSHS, + VMSUMUBM, + VMSUMUHM, + VMSUMUHS, + VMULESB, + VMULESH, + VMULEUB, + VMULEUH, + VMULOSB, + VMULOSH, + VMULOUB, + VMULOUH, + VNMSUBFP, + VNOR, + VOR, + VPERM, + VPKPX, + VPKSHSS, + VPKSHUS, + VPKSWSS, + VPKSWUS, + VPKUHUM, + VPKUHUS, + VPKUWUM, + VPKUWUS, + VREFP, + VRFIM, + VRFIN, + VRFIP, + VRFIZ, + VRLB, + VRLH, + VRLW, + VRSQRTEFP, + VSEL, + VSL, + VSLB, + VSLDOI, + VSLH, + VSLO, + VSLW, + VSPLTB, + VSPLTH, + VSPLTISB, + VSPLTISH, + VSPLTISW, + VSPLTW, + VSR, + VSRAB, + VSRAH, + VSRAW, + VSRB, + VSRH, + VSRO, + VSRW, + VSUBCUW, + VSUBFP, + VSUBSBS, + VSUBSHS, + VSUBSWS, + VSUBUBM, + VSUBUBS, + VSUBUHM, + VSUBUHS, + VSUBUWM, + VSUBUWS, + VSUMSWS, + VSUM2SWS, + VSUM4SBS, + VSUM4SHS, + VSUM4UBS, + VUPKHPX, + VUPKHSB, + VUPKHSH, + VUPKLPX, + VUPKLSB, + VUPKLSH, + VXOR, + TDI, + TWI, + MULLI, + SUBFIC, + CMPLI, + CMPI, + ADDIC, + ADDI, + ADDIS, + BC, + SC, + 
B, + MCRF, + BCLR, + RFID, + CRNOR, + RFSCV, + CRANDC, + ISYNC, + CRXOR, + CRNAND, + CRAND, + HRFID, + CREQV, + URFID, + STOP, + CRORC, + CROR, + BCCTR, + RLWIMI, + RLWINM, + RLWNM, + ORI, + ORIS, + XORI, + XORIS, + ANDI, + ANDIS, + RLDICL, + RLDICR, + RLDIC, + RLDIMI, + RLDCL, + RLDCR, + CMP, + TW, + LVSL, + LVEBX, + SUBFC, + MULHDU, + ADDC, + MULHWU, + MFOCRF, + LWARX, + LDX, + LWZX, + SLW, + CNTLZW, + SLD, + AND, + CMPL, + LVSR, + LVEHX, + SUBF, + LDUX, + DCBST, + LWZUX, + CNTLZD, + ANDC, + TD, + LVEWX, + MULHD, + MULHW, + LDARX, + DCBF, + LBZX, + LVX, + NEG, + LBZUX, + NOR, + STVEBX, + SUBFE, + ADDE, + MTOCRF, + STDX, + STWCX, + STWX, + STVEHX, + STDUX, + STWUX, + STVEWX, + SUBFZE, + ADDZE, + STDCX, + STBX, + STVX, + MULLD, + SUBFME, + ADDME, + MULLW, + DCBTST, + STBUX, + ADD, + DCBT, + LHZX, + EQV, + ECIWX, + LHZUX, + XOR, + MFSPR, + LWAX, + DST, + LHAX, + LVXL, + MFTB, + LWAUX, + DSTST, + LHAUX, + STHX, + ORC, + ECOWX, + STHUX, + OR, + DIVDU, + DIVWU, + MTSPR, + DCBI, + NAND, + STVXL, + DIVD, + DIVW, + LVLX, + LDBRX, + LSWX, + LWBRX, + LFSX, + SRW, + SRD, + LVRX, + LSWI, + LFSUX, + SYNC, + LFDX, + LFDUX, + STVLX, + STDBRX, + STSWX, + STWBRX, + STFSX, + STVRX, + STFSUX, + STSWI, + STFDX, + STFDUX, + LVLXL, + LHBRX, + SRAW, + SRAD, + LVRXL, + DSS, + SRAWI, + SRADI, + EIEIO, + STVLXL, + STHBRX, + EXTSH, + STVRXL, + EXTSB, + STFIWX, + EXTSW, + ICBI, + DCBZ, + LWZ, + LWZU, + LBZ, + LBZU, + STW, + STWU, + STB, + STBU, + LHZ, + LHZU, + LHA, + LHAU, + STH, + STHU, + LMW, + STMW, + LFS, + LFSU, + LFD, + LFDU, + STFS, + STFSU, + STFD, + STFDU, + LD, + LDU, + LWA, + STD, + STDU, + FDIVS, + FSUBS, + FADDS, + FSQRTS, + FRES, + FMULS, + FMADDS, + FMSUBS, + FNMSUBS, + FNMADDS, + MTFSB1, + MCRFS, + MTFSB0, + MTFSFI, + MFFS, + MTFSF, + FCMPU, + FRSP, + FCTIW, + FCTIWZ, + FDIV, + FSUB, + FADD, + FSQRT, + FSEL, + FMUL, + FRSQRTE, + FMSUB, + FMADD, + FNMSUB, + FNMADD, + FCMPO, + FNEG, + FMR, + FNABS, + FABS, + FCTID, + FCTIDZ, + FCFID, + UNK, + SUBFCO, + ADDCO, + SUBFO, + NEGO, + SUBFEO, + ADDEO, + SUBFZEO, + ADDZEO, + SUBFMEO, + MULLDO, + ADDMEO, + MULLWO, + ADDO, + DIVDUO, + DIVWUO, + DIVDO, + DIVWO, + SUBFCO_, + ADDCO_, + SUBFO_, + NEGO_, + SUBFEO_, + ADDEO_, + SUBFZEO_, + ADDZEO_, + SUBFMEO_, + MULLDO_, + ADDMEO_, + MULLWO_, + ADDO_, + DIVDUO_, + DIVWUO_, + DIVDO_, + DIVWO_, + RLWIMI_, + RLWINM_, + RLWNM_, + RLDICL_, + RLDICR_, + RLDIC_, + RLDIMI_, + RLDCL_, + RLDCR_, + SUBFC_, + MULHDU_, + ADDC_, + MULHWU_, + SLW_, + CNTLZW_, + SLD_, + AND_, + SUBF_, + CNTLZD_, + ANDC_, + MULHD_, + MULHW_, + NEG_, + NOR_, + SUBFE_, + ADDE_, + SUBFZE_, + ADDZE_, + MULLD_, + SUBFME_, + ADDME_, + MULLW_, + ADD_, + EQV_, + XOR_, + ORC_, + OR_, + DIVDU_, + DIVWU_, + NAND_, + DIVD_, + DIVW_, + SRW_, + SRD_, + SRAW_, + SRAD_, + SRAWI_, + SRADI_, + EXTSH_, + EXTSB_, + EXTSW_, + FDIVS_, + FSUBS_, + FADDS_, + FSQRTS_, + FRES_, + FMULS_, + FMADDS_, + FMSUBS_, + FNMSUBS_, + FNMADDS_, + MTFSB1_, + MTFSB0_, + MTFSFI_, + MFFS_, + MTFSF_, + FRSP_, + FCTIW_, + FCTIWZ_, + FDIV_, + FSUB_, + FADD_, + FSQRT_, + FSEL_, + FMUL_, + FRSQRTE_, + FMSUB_, + FMADD_, + FNMSUB_, + FNMADD_, + FNEG_, + FMR_, + FNABS_, + FABS_, + FCTID_, + FCTIDZ_, + FCFID_, + + // extended mnemonic + LI, + LIS, + NOP, + MR, + + CLRLDI, + ROTLDI, + SRDI, + + CMPD, + CMPW, + + CMPLD, + CMPLW, + + NOT, + + MTCRF, + MFXER, + MFLR, + MFCTR, + + MFTBU, + + CCTPL, + CCTPM, + CCTPH, + DB8CYC, + DB10CYC, + DB12CYC, + DB16CYC, + + CRNOT, + + BDNZF, + BDZF, + BDNZT, + BDZT, + BDZ, + BDZ_P, + BDZ_M, + BDNZ, + BDNZ_P, + BDNZ_M, + BGE, + BGE_P, + BGE_M, + BLE, + BLE_P, + BLE_M, 
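+  // Naming convention for the extended branch mnemonics in this block: a trailing
+  // L adds the link bit (LK), A selects the absolute (AA) form, LR / CTR denote the
+  // BCLR / BCCTR variants, and the _P / _M suffixes appear to map to the PowerPC
+  // "+" / "-" static branch-prediction hints (predicted taken / not taken).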
+ BNE, + BNE_P, + BNE_M, + BNS, + BNS_P, + BNS_M, + BLT, + BLT_P, + BLT_M, + BGT, + BGT_P, + BGT_M, + BEQ, + BEQ_P, + BEQ_M, + BSO, + BSO_P, + BSO_M, + + BDNZFL, + BDZFL, + BDNZTL, + BDZTL, + BDZL, + BDZL_P, + BDZL_M, + BDNZL, + BDNZL_P, + BDNZL_M, + BGEL, + BGEL_P, + BGEL_M, + BLEL, + BLEL_P, + BLEL_M, + BNEL, + BNEL_P, + BNEL_M, + BNSL, + BNSL_P, + BNSL_M, + BLTL, + BLTL_P, + BLTL_M, + BGTL, + BGTL_P, + BGTL_M, + BEQL, + BEQL_P, + BEQL_M, + BSOL, + BSOL_P, + BSOL_M, + + BDNZFA, + BDZFA, + BDNZTA, + BDZTA, + BDZA, + BDZA_P, + BDZA_M, + BDNZA, + BDNZA_P, + BDNZA_M, + BGEA, + BGEA_P, + BGEA_M, + BLEA, + BLEA_P, + BLEA_M, + BNEA, + BNEA_P, + BNEA_M, + BNSA, + BNSA_P, + BNSA_M, + BLTA, + BLTA_P, + BLTA_M, + BGTA, + BGTA_P, + BGTA_M, + BEQA, + BEQA_P, + BEQA_M, + BSOA, + BSOA_P, + BSOA_M, + + BDNZFLA, + BDZFLA, + BDNZTLA, + BDZTLA, + BDZLA, + BDZLA_P, + BDZLA_M, + BDNZLA, + BDNZLA_P, + BDNZLA_M, + BGELA, + BGELA_P, + BGELA_M, + BLELA, + BLELA_P, + BLELA_M, + BNELA, + BNELA_P, + BNELA_M, + BNSLA, + BNSLA_P, + BNSLA_M, + BLTLA, + BLTLA_P, + BLTLA_M, + BGTLA, + BGTLA_P, + BGTLA_M, + BEQLA, + BEQLA_P, + BEQLA_M, + BSOLA, + BSOLA_P, + BSOLA_M, + + BDNZFLR, + BDZFLR, + BDNZTLR, + BDZTLR, + BDZLR, + BDZLR_P, + BDZLR_M, + BDNZLR, + BDNZLR_P, + BDNZLR_M, + BGELR, + BGELR_P, + BGELR_M, + BLELR, + BLELR_P, + BLELR_M, + BNELR, + BNELR_P, + BNELR_M, + BNSLR, + BNSLR_P, + BNSLR_M, + BLTLR, + BLTLR_P, + BLTLR_M, + BGTLR, + BGTLR_P, + BGTLR_M, + BEQLR, + BEQLR_P, + BEQLR_M, + BSOLR, + BSOLR_P, + BSOLR_M, + + BDNZFCTR, + BDZFCTR, + BDNZTCTR, + BDZTCTR, + BDZCTR, + BDZCTR_P, + BDZCTR_M, + BDNZCTR, + BDNZCTR_P, + BDNZCTR_M, + BGECTR, + BGECTR_P, + BGECTR_M, + BLECTR, + BLECTR_P, + BLECTR_M, + BNECTR, + BNECTR_P, + BNECTR_M, + BNSCTR, + BNSCTR_P, + BNSCTR_M, + BLTCTR, + BLTCTR_P, + BLTCTR_M, + BGTCTR, + BGTCTR_P, + BGTCTR_M, + BEQCTR, + BEQCTR_P, + BEQCTR_M, + BSOCTR, + BSOCTR_P, + BSOCTR_M, + + BDNZFCTRL, + BDZFCTRL, + BDNZTCTRL, + BDZTCTRL, + BDZCTRL, + BDZCTRL_P, + BDZCTRL_M, + BDNZCTRL, + BDNZCTRL_P, + BDNZCTRL_M, + BGECTRL, + BGECTRL_P, + BGECTRL_M, + BLECTRL, + BLECTRL_P, + BLECTRL_M, + BNECTRL, + BNECTRL_P, + BNECTRL_M, + BNSCTRL, + BNSCTRL_P, + BNSCTRL_M, + BLTCTRL, + BLTCTRL_P, + BLTCTRL_M, + BGTCTRL, + BGTCTRL_P, + BGTCTRL_M, + BEQCTRL, + BEQCTRL_P, + BEQCTRL_M, + BSOCTRL, + BSOCTRL_P, + BSOCTRL_M, + + BDNZFLRL, + BDZFLRL, + BDNZTLRL, + BDZTLRL, + BDZLRL, + BDZLRL_P, + BDZLRL_M, + BDNZLRL, + BDNZLRL_P, + BDNZLRL_M, + BGELRL, + BGELRL_P, + BGELRL_M, + BLELRL, + BLELRL_P, + BLELRL_M, + BNELRL, + BNELRL_P, + BNELRL_M, + BNSLRL, + BNSLRL_P, + BNSLRL_M, + BLTLRL, + BLTLRL_P, + BLTLRL_M, + BGTLRL, + BGTLRL_P, + BGTLRL_M, + BEQLRL, + BEQLRL_P, + BEQLRL_M, + BSOLRL, + BSOLRL_P, + BSOLRL_M, + + BL, + BA, + BLA, + BCL, + BCA, + BCLA, + BLR, + BTLR, + BFLR, + BCTRL, + BCCTRL, + BTCTRL, + BFCTRL, + + _count +}; +} diff --git a/rpcsx/cpu/cell/ppu/include/rx/cpu/cell/ppu/PPUContext.hpp b/rpcsx/cpu/cell/ppu/include/rx/cpu/cell/ppu/PPUContext.hpp new file mode 100644 index 000000000..da5102ec3 --- /dev/null +++ b/rpcsx/cpu/cell/ppu/include/rx/cpu/cell/ppu/PPUContext.hpp @@ -0,0 +1,129 @@ +#pragma once + +#include "rx/v128.hpp" +#include + +struct alignas(4) CrField { + std::uint8_t bits[4]; + + constexpr void set(bool lt, bool gt, bool eq, bool so) { + bits[0] = lt; + bits[1] = gt; + bits[2] = eq; + bits[3] = so; + } + + template + constexpr void update(const T &lhs, const T &rhs, bool so) { + bits[0] = lhs < rhs; + bits[1] = lhs > rhs; + bits[2] = lhs == rhs; + bits[3] = so; + } + + static constexpr CrField From(bool 
lt, bool gt, bool eq, bool so) { + CrField result; + result.set(lt, gt, eq, so); + return result; + } + + [[nodiscard]] constexpr bool isLt() const { return bits[0] != 0; } + [[nodiscard]] constexpr bool isGt() const { return bits[1] != 0; } + [[nodiscard]] constexpr bool isEq() const { return bits[2] != 0; } + [[nodiscard]] constexpr bool isSo() const { return bits[3] != 0; } +}; + +struct PPUContext { + std::uint64_t gpr[32] = {}; // General-Purpose Registers + double fpr[32] = {}; // Floating Point Registers + rx::v128 vr[32] = {}; // Vector Registers + + union alignas(16) cr_bits { + std::uint8_t bits[32]; + CrField fields[8]; + + std::uint8_t &operator[](std::size_t i) { return bits[i]; } + + // Pack CR bits + [[nodiscard]] std::uint32_t pack() const { + std::uint32_t result{}; + + for (u32 bit : bits) { + result <<= 1; + result |= bit; + } + + return result; + } + + // Unpack CR bits + void unpack(std::uint32_t value) { + for (u8 &b : bits) { + b = !!(value & (1u << 31)); + value <<= 1; + } + } + }; + + cr_bits cr{}; // Condition Registers (unpacked) + + // Floating-Point Status and Control Register (unpacked) + union alignas(16) { + struct { + // TODO + bool _start[16]; + bool fl; // FPCC.FL + bool fg; // FPCC.FG + bool fe; // FPCC.FE + bool fu; // FPCC.FU + bool _end[12]; + }; + + CrField fields[8]; + cr_bits bits; + } fpscr{}; + + std::uint64_t lr{}; // Link Register + std::uint64_t ctr{}; // Counter Register + std::uint32_t vrsave{0xffffffff}; // vr Save Register + std::uint32_t cia{}; // Current Instruction Address + + // Fixed-Point Exception Register (abstract representation) + bool xer_so{}; // Summary Overflow + bool xer_ov{}; // Overflow + bool xer_ca{}; // Carry + std::uint8_t xer_cnt{}; // 0..6 + + /* + Non-Java. A mode control bit that determines whether vector floating-point + operations will be performed in a Java-IEEE-C9X-compliant mode or a + possibly faster non-Java/non-IEEE mode. 0 The Java-IEEE-C9X-compliant mode + is selected. Denormalized values are handled as specified by Java, IEEE, + and C9X standard. 1 The non-Java/non-IEEE-compliant mode is + selected. If an element in a source vector register contains a denormalized + value, the value '0' is used instead. If an instruction causes an underflow + exception, the corresponding element in the target vr is cleared to + '0'. In both cases, the '0' has the same sign as the denormalized or + underflowing value. 
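+     Note on the related jm_mask field below: it is the operand mask consumed by
+     ppu_flush_denormal in semantic/ppu.cpp. With NJ = 1 it is 0x7f80'0000, so any
+     operand whose exponent bits are all zero (a zero or denormal) is flushed to a
+     signed zero; with NJ = 0 it is 0x7fff'ffff, which only matches true zeros and
+     therefore leaves denormal inputs unchanged.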
+ */ + bool nj = true; + + // Sticky saturation bit + rx::v128 sat{}; + + // Optimization: precomputed java-mode mask for handling denormals + std::uint32_t jm_mask = 0x7f80'0000; + + std::uint32_t raddr{0}; // Reservation addr + std::uint64_t rtime{0}; + alignas(64) std::byte rdata[128]{}; // Reservation data + bool use_full_rdata{}; + std::uint32_t res_cached{0}; // Reservation "cached" addresss + std::uint32_t res_notify{0}; + std::uint64_t res_notify_time{0}; + + inline void setOV(bool bit) { + xer_ov = bit; + xer_so |= bit; + } +}; diff --git a/rpcsx/cpu/cell/ppu/semantic/ppu.cpp b/rpcsx/cpu/cell/ppu/semantic/ppu.cpp new file mode 100644 index 000000000..81e6e6436 --- /dev/null +++ b/rpcsx/cpu/cell/ppu/semantic/ppu.cpp @@ -0,0 +1,4230 @@ +#include "Instruction.hpp" +#include "PPUContext.hpp" +#include +#include +#include +#include +#include +#include +#include + +using namespace rx; +using namespace rx::cell::ppu; + +#define EXPORT_SEMANTIC(x) \ + extern "C" { \ + auto ISEL_PPU_##x = x; \ + auto ISEL_PPU_##x##_DEC = x##_DEC; \ + } + +#define SEMANTIC(x) inline x [[gnu::always_inline]] +#define DECODER(x) \ + inline x##_DEC \ + [[gnu::always_inline]] ([[maybe_unused]] PPUContext & context, \ + [[maybe_unused]] Instruction inst) + +template struct add_flags_result_t { + T result; + bool carry; + + add_flags_result_t() = default; + + // Straighforward ADD with flags + add_flags_result_t(T a, T b) : result(a + b), carry(result < a) {} + + // Straighforward ADC with flags + add_flags_result_t(T a, T b, bool c) : add_flags_result_t(a, b) { + add_flags_result_t r(result, c); + result = r.result; + carry |= r.carry; + } +}; + +static add_flags_result_t add64_flags(u64 a, u64 b) { return {a, b}; } + +static add_flags_result_t add64_flags(u64 a, u64 b, bool c) { + return {a, b, c}; +} + +extern "C" { +[[noreturn]] void rpcsx_trap(); +[[noreturn]] void rpcsx_invalid_instruction(); +[[noreturn]] void rpcsx_unimplemented_instruction(); + +void rpcsx_vm_read(std::uint64_t vaddr, void *dest, std::size_t size); +void rpcsx_vm_write(std::uint64_t vaddr, const void *src, std::size_t size); + +std::uint64_t rpcsx_get_tb(); +} + +namespace { +u32 ppu_fres_mantissas[128] = { + 0x007f0000, 0x007d0800, 0x007b1800, 0x00793000, 0x00775000, 0x00757000, + 0x0073a000, 0x0071e000, 0x00700000, 0x006e4000, 0x006ca000, 0x006ae000, + 0x00694000, 0x00678000, 0x00660000, 0x00646000, 0x0062c000, 0x00614000, + 0x005fc000, 0x005e4000, 0x005cc000, 0x005b4000, 0x0059c000, 0x00584000, + 0x00570000, 0x00558000, 0x00540000, 0x0052c000, 0x00518000, 0x00500000, + 0x004ec000, 0x004d8000, 0x004c0000, 0x004b0000, 0x00498000, 0x00488000, + 0x00474000, 0x00460000, 0x0044c000, 0x00438000, 0x00428000, 0x00418000, + 0x00400000, 0x003f0000, 0x003e0000, 0x003d0000, 0x003bc000, 0x003ac000, + 0x00398000, 0x00388000, 0x00378000, 0x00368000, 0x00358000, 0x00348000, + 0x00338000, 0x00328000, 0x00318000, 0x00308000, 0x002f8000, 0x002ec000, + 0x002e0000, 0x002d0000, 0x002c0000, 0x002b0000, 0x002a0000, 0x00298000, + 0x00288000, 0x00278000, 0x0026c000, 0x00260000, 0x00250000, 0x00244000, + 0x00238000, 0x00228000, 0x00220000, 0x00210000, 0x00200000, 0x001f8000, + 0x001e8000, 0x001e0000, 0x001d0000, 0x001c8000, 0x001b8000, 0x001b0000, + 0x001a0000, 0x00198000, 0x00190000, 0x00180000, 0x00178000, 0x00168000, + 0x00160000, 0x00158000, 0x00148000, 0x00140000, 0x00138000, 0x00128000, + 0x00120000, 0x00118000, 0x00108000, 0x00100000, 0x000f8000, 0x000f0000, + 0x000e0000, 0x000d8000, 0x000d0000, 0x000c8000, 0x000b8000, 0x000b0000, + 0x000a8000, 
0x000a0000, 0x00098000, 0x00090000, 0x00080000, 0x00078000, + 0x00070000, 0x00068000, 0x00060000, 0x00058000, 0x00050000, 0x00048000, + 0x00040000, 0x00038000, 0x00030000, 0x00028000, 0x00020000, 0x00018000, + 0x00010000, 0x00000000, +}; + +u32 ppu_frsqrte_mantissas[16] = { + 0x000f1000u, 0x000d8000u, 0x000c0000u, 0x000a8000u, + 0x00098000u, 0x00088000u, 0x00080000u, 0x00070000u, + 0x00060000u, 0x0004c000u, 0x0003c000u, 0x00030000u, + 0x00020000u, 0x00018000u, 0x00010000u, 0x00008000u, +}; + +// Large lookup table for FRSQRTE instruction +struct ppu_frsqrte_lut_t { + // Store only high 32 bits of doubles + u32 data[0x8000]{}; + + constexpr ppu_frsqrte_lut_t() noexcept { + for (u64 i = 0; i < 0x8000; i++) { + // Decomposed LUT index + const u64 sign = i >> 14; + const u64 expv = (i >> 3) & 0x7ff; + + // (0x3FF - (((EXP_BITS(b) - 0x3FF) >> 1) + 1)) << 52 + const u64 exp = 0x3fe0'0000 - (((expv + 0x1c01) >> 1) << (52 - 32)); + + if (expv == 0) // ±INF on zero/denormal, not accurate + { + data[i] = static_cast(0x7ff0'0000 | (sign << 31)); + } else if (expv == 0x7ff) { + if (i == (0x7ff << 3)) + data[i] = 0; // Zero on +INF, inaccurate + else + data[i] = 0x7ff8'0000; // QNaN + } else if (sign) { + data[i] = 0x7ff8'0000; // QNaN + } else { + // ((MAN_BITS(b) >> 49) & 7ull) + (!(EXP_BITS(b) & 1) << 3) + const u64 idx = 8 ^ (i & 0xf); + + data[i] = static_cast(ppu_frsqrte_mantissas[idx] | exp); + } + } + } +} inline ppu_frqrte_lut; +} // namespace + +namespace vm { +namespace detail { +template struct vm_type_selector { + using type = be_t; +}; +template struct vm_type_selector> { + using type = le_t; +}; +template struct vm_type_selector> { + using type = be_t; +}; + +template + requires(sizeof(T) == 1) +struct vm_type_selector { + using type = T; +}; +} // namespace detail + +template T read(std::uint64_t vaddr) { + typename detail::vm_type_selector::type result; + rpcsx_vm_read(vaddr, &result, sizeof(result)); + return T(result); +} + +template void write(std::uint64_t vaddr, const T &data) { + typename detail::vm_type_selector::type value = data; + rpcsx_vm_write(vaddr, &value, sizeof(value)); +} + +std::uint64_t cast(std::uint64_t address) { return address; } +} // namespace vm + +extern void ppu_execute_syscall(PPUContext &context, u64 code); +extern u32 ppu_lwarx(PPUContext &context, u32 addr); +extern u64 ppu_ldarx(PPUContext &context, u32 addr); +extern bool ppu_stwcx(PPUContext &context, u32 addr, u32 reg_value); +extern bool ppu_stdcx(PPUContext &context, u32 addr, u64 reg_value); +extern void ppu_trap(PPUContext &context, u64 addr); + +void do_cell_atomic_128_store(u32 addr, const void *to_write); + +// NaNs production precedence: NaN from Va, Vb, Vc +// and lastly the result of the operation in case none of the operands is a NaN +// Signaling NaNs are 'quieted' (MSB of fraction is set) with other bits of data +// remain the same +inline v128 ppu_select_vnan(v128 a) { return a; } + +inline v128 ppu_select_vnan(v128 a, v128 b) { + return gv_selectfs(gv_eqfs(a, a), b, a | gv_bcst32(0x7fc00000u)); +} + +inline v128 ppu_select_vnan(v128 a, v128 b, Vector128 auto... args) { + return ppu_select_vnan(a, ppu_select_vnan(b, args...)); +} + +// Flush denormals to zero if NJ is 1 +inline v128 ppu_flush_denormal(const v128 &mask, const v128 &a) { + return gv_andn(gv_shr32(gv_eq32(mask & a, gv_bcst32(0)), 1), a); +} + +inline v128 ppu_fix_vnan(v128 r) { + return gv_selectfs(gv_eqfs(r, r), r, gv_bcst32(0x7fc00000u)); +} + +inline v128 ppu_set_vnan(v128 r, Vector128 auto... 
args) { + return ppu_select_vnan(args..., ppu_fix_vnan(r)); +} + +template auto ppu_feed_data(PPUContext &, u64 addr) { + static_assert(sizeof(T) <= 128, + "Incompatible type-size, break down into smaller loads"); + + return vm::read(addr); +} + +constexpr u64 ppu_rotate_mask(u32 mb, u32 me) { + const u64 mask = ~0ull << (~(me - mb) & 63); + return (mask >> (mb & 63)) | (mask << ((64 - mb) & 63)); +} +inline u64 dup32(u32 x) { return x | static_cast(x) << 32; } + +void SEMANTIC(MFVSCR)(v128 &d, v128 sat, bool nj) { + u32 sat_bit = !gv_testz(sat); + d._u64[0] = 0; + d._u64[1] = u64(sat_bit | (u32{nj} << 16)) << 32; +} +void DECODER(MFVSCR) { MFVSCR(context.vr[inst.vd], context.sat, context.nj); } +EXPORT_SEMANTIC(MFVSCR); + +void SEMANTIC(MTVSCR)(v128 &sat, bool &nj, u32 &jm_mask, v128 b) { + const u32 vscr = b._u32[3]; + sat._u = vscr & 1; + jm_mask = (vscr & 0x10000) ? 0x7f80'0000 : 0x7fff'ffff; + nj = (vscr & 0x10000) != 0; +} +void DECODER(MTVSCR) { + MTVSCR(context.sat, context.nj, context.jm_mask, context.vr[inst.vb]); +} +EXPORT_SEMANTIC(MTVSCR); + +void SEMANTIC(VADDCUW)(v128 &d, v128 a, v128 b) { + d = gv_sub32(gv_geu32(gv_not32(a), b), gv_bcst32(-1)); +} +void DECODER(VADDCUW) { + VADDCUW(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VADDCUW); + +void SEMANTIC(VADDFP)(v128 &d, v128 a, v128 b, u32 jm_mask) { + auto m = gv_bcst32(jm_mask); + a = ppu_flush_denormal(m, a); + b = ppu_flush_denormal(m, b); + d = ppu_flush_denormal(m, ppu_set_vnan(gv_addfs(a, b), a, b)); +} +void DECODER(VADDFP) { + VADDFP(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.jm_mask); +} +EXPORT_SEMANTIC(VADDFP); + +void SEMANTIC(VADDSBS)(v128 &d, v128 a, v128 b, v128 &sat) { + auto r = gv_adds_s8(a, b); + sat = gv_or32(gv_xor32(gv_add8(a, b), r), sat); + d = r; +} +void DECODER(VADDSBS) { + VADDSBS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VADDSBS); + +void SEMANTIC(VADDSHS)(v128 &d, v128 a, v128 b, v128 &sat) { + auto r = gv_adds_s16(a, b); + sat = gv_or32(gv_xor32(gv_add16(a, b), r), sat); + d = r; +} +void DECODER(VADDSHS) { + VADDSHS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VADDSHS); + +void SEMANTIC(VADDSWS)(v128 &d, v128 a, v128 b, v128 &sat) { + auto r = gv_adds_s32(a, b); + sat = gv_or32(gv_xor32(gv_add32(a, b), r), sat); + d = r; +} +void DECODER(VADDSWS) { + VADDSWS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VADDSWS); + +void SEMANTIC(VADDUBM)(v128 &d, v128 a, v128 b) { d = gv_add8(a, b); } +void DECODER(VADDUBM) { + VADDUBM(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VADDUBM); + +void SEMANTIC(VADDUBS)(v128 &d, v128 a, v128 b, v128 &sat) { + auto r = gv_addus_u8(a, b); + sat = gv_or32(gv_xor32(gv_add8(a, b), r), sat); + d = r; +} +void DECODER(VADDUBS) { + VADDUBS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VADDUBS); + +void SEMANTIC(VADDUHM)(v128 &d, v128 a, v128 b) { d = gv_add16(a, b); } +void DECODER(VADDUHM) { + VADDUHM(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VADDUHM); + +void SEMANTIC(VADDUHS)(v128 &d, v128 a, v128 b, v128 &sat) { + auto r = gv_addus_u16(a, b); + sat = gv_or32(gv_xor32(gv_add16(a, b), r), sat); + d = r; +} +void DECODER(VADDUHS) { + VADDUHS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + 
context.sat); +} +EXPORT_SEMANTIC(VADDUHS); + +void SEMANTIC(VADDUWM)(v128 &d, v128 a, v128 b) { d = gv_add32(a, b); } +void DECODER(VADDUWM) { + VADDUWM(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VADDUWM); + +void SEMANTIC(VADDUWS)(v128 &d, v128 a, v128 b, v128 &sat) { + auto r = gv_addus_u32(a, b); + sat = gv_or32(gv_xor32(gv_add32(a, b), r), sat); + d = r; +} +void DECODER(VADDUWS) { + VADDUWS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VADDUWS); + +void SEMANTIC(VAND)(v128 &d, v128 a, v128 b) { d = gv_andfs(a, b); } +void DECODER(VAND) { + VAND(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VAND); + +void SEMANTIC(VANDC)(v128 &d, v128 a, v128 b) { d = gv_andnfs(b, a); } +void DECODER(VANDC) { + VANDC(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VANDC); + +void SEMANTIC(VAVGSB)(v128 &d, v128 a, v128 b) { d = gv_avgs8(a, b); } +void DECODER(VAVGSB) { + VAVGSB(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VAVGSB); + +void SEMANTIC(VAVGSH)(v128 &d, v128 a, v128 b) { d = gv_avgs16(a, b); } +void DECODER(VAVGSH) { + VAVGSH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VAVGSH); + +void SEMANTIC(VAVGSW)(v128 &d, v128 a, v128 b) { d = gv_avgs32(a, b); } +void DECODER(VAVGSW) { + VAVGSW(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VAVGSW); + +void SEMANTIC(VAVGUB)(v128 &d, v128 a, v128 b) { d = gv_avgu8(a, b); } +void DECODER(VAVGUB) { + VAVGUB(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VAVGUB); + +void SEMANTIC(VAVGUH)(v128 &d, v128 a, v128 b) { d = gv_avgu16(a, b); } +void DECODER(VAVGUH) { + VAVGUH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VAVGUH); + +void SEMANTIC(VAVGUW)(v128 &d, v128 &a, v128 &b) { d = gv_avgu32(a, b); } +void DECODER(VAVGUW) { + VAVGUW(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VAVGUW); + +void SEMANTIC(VCFSX)(v128 &d, v128 b, u32 i) { + d = gv_subus_u16(gv_cvts32_tofs(b), gv_bcst32(i)); +} +void DECODER(VCFSX) { + VCFSX(context.vr[inst.vd], context.vr[inst.vb], inst.vuimm << 23); +} +EXPORT_SEMANTIC(VCFSX); + +void SEMANTIC(VCFUX)(v128 &d, v128 b, u32 i) { + d = gv_subus_u16(gv_cvtu32_tofs(b), gv_bcst32(i)); +} +void DECODER(VCFUX) { + VCFUX(context.vr[inst.vd], context.vr[inst.vb], inst.vuimm << 23); +} +EXPORT_SEMANTIC(VCFUX); + +void SEMANTIC(VCMPBFP)(CrField *cr6, v128 &d, v128 a, v128 b) { + auto sign = gv_bcstfs(-0.); + auto cmp1 = gv_nlefs(a, b); + auto cmp2 = gv_ngefs(a, b ^ sign); + auto r = (cmp1 & sign) | gv_shr32(cmp2 & sign, 1); + if (cr6 != nullptr) { + cr6->set(false, false, gv_testz(r), false); + } + d = r; +} +void DECODER(VCMPBFP) { + VCMPBFP(inst.oe ? context.cr.fields + 6 : nullptr, context.vr[inst.vd], + context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VCMPBFP); + +void SEMANTIC(VCMPEQFP)(CrField *cr6, v128 &d, v128 a, v128 b) { + auto r = gv_eqfs(a, b); + if (cr6 != nullptr) { + cr6->set(gv_testall1(r), false, gv_testall0(r), false); + } + d = r; +} +void DECODER(VCMPEQFP) { + VCMPEQFP(inst.oe ? 
context.cr.fields + 6 : nullptr, context.vr[inst.vd], + context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VCMPEQFP); + +void SEMANTIC(VCMPEQUB)(CrField *cr6, v128 &d, v128 a, v128 b) { + auto r = gv_eq8(a, b); + if (cr6 != nullptr) { + cr6->set(gv_testall1(r), false, gv_testall0(r), false); + } + d = r; +} +void DECODER(VCMPEQUB) { + VCMPEQUB(inst.oe ? context.cr.fields + 6 : nullptr, context.vr[inst.vd], + context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VCMPEQUB); + +void SEMANTIC(VCMPEQUH)(CrField *cr6, v128 &d, v128 a, v128 b) { + auto r = gv_eq16(a, b); + if (cr6 != nullptr) { + cr6->set(gv_testall1(r), false, gv_testall0(r), false); + } + d = r; +} +void DECODER(VCMPEQUH) { + VCMPEQUH(inst.oe ? context.cr.fields + 6 : nullptr, context.vr[inst.vd], + context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VCMPEQUH); + +void SEMANTIC(VCMPEQUW)(CrField *cr6, v128 &d, v128 a, v128 b) { + auto r = gv_eq32(a, b); + if (cr6 != nullptr) { + cr6->set(gv_testall1(r), false, gv_testall0(r), false); + } + d = r; +} +void DECODER(VCMPEQUW) { + VCMPEQUW(inst.oe ? context.cr.fields + 6 : nullptr, context.vr[inst.vd], + context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VCMPEQUW); + +void SEMANTIC(VCMPGEFP)(CrField *cr6, v128 &d, v128 a, v128 b) { + auto r = gv_gefs(a, b); + if (cr6 != nullptr) { + cr6->set(gv_testall1(r), false, gv_testall0(r), false); + } + d = r; +} +void DECODER(VCMPGEFP) { + VCMPGEFP(inst.oe ? context.cr.fields + 6 : nullptr, context.vr[inst.vd], + context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VCMPGEFP); + +void SEMANTIC(VCMPGTFP)(CrField *cr6, v128 &d, v128 a, v128 b) { + auto r = gv_gtfs(a, b); + if (cr6 != nullptr) { + cr6->set(gv_testall1(r), false, gv_testall0(r), false); + } + d = r; +} +void DECODER(VCMPGTFP) { + VCMPGTFP(inst.oe ? context.cr.fields + 6 : nullptr, context.vr[inst.vd], + context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VCMPGTFP); + +void SEMANTIC(VCMPGTSB)(CrField *cr6, v128 &d, v128 a, v128 b) { + auto r = gv_gts8(a, b); + if (cr6 != nullptr) { + cr6->set(gv_testall1(r), false, gv_testall0(r), false); + } + d = r; +} +void DECODER(VCMPGTSB) { + VCMPGTSB(inst.oe ? context.cr.fields + 6 : nullptr, context.vr[inst.vd], + context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VCMPGTSB); + +void SEMANTIC(VCMPGTSH)(CrField *cr6, v128 &d, v128 a, v128 b) { + auto r = gv_gts16(a, b); + if (cr6 != nullptr) { + cr6->set(gv_testall1(r), false, gv_testall0(r), false); + } + d = r; +} +void DECODER(VCMPGTSH) { + VCMPGTSH(inst.oe ? context.cr.fields + 6 : nullptr, context.vr[inst.vd], + context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VCMPGTSH); + +void SEMANTIC(VCMPGTSW)(CrField *cr6, v128 &d, v128 a, v128 b) { + auto r = gv_gts32(a, b); + if (cr6 != nullptr) { + cr6->set(gv_testall1(r), false, gv_testall0(r), false); + } + d = r; +} +void DECODER(VCMPGTSW) { + VCMPGTSW(inst.oe ? context.cr.fields + 6 : nullptr, context.vr[inst.vd], + context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VCMPGTSW); + +void SEMANTIC(VCMPGTUB)(CrField *cr6, v128 &d, v128 a, v128 b) { + auto r = gv_gtu8(a, b); + if (cr6 != nullptr) { + cr6->set(gv_testall1(r), false, gv_testall0(r), false); + } + d = r; +} +void DECODER(VCMPGTUB) { + VCMPGTUB(inst.oe ? 
context.cr.fields + 6 : nullptr, context.vr[inst.vd], + context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VCMPGTUB); + +void SEMANTIC(VCMPGTUH)(CrField *cr6, v128 &d, v128 a, v128 b) { + auto r = gv_gtu16(a, b); + if (cr6 != nullptr) { + cr6->set(gv_testall1(r), false, gv_testall0(r), false); + } + d = r; +} +void DECODER(VCMPGTUH) { + VCMPGTUH(inst.oe ? context.cr.fields + 6 : nullptr, context.vr[inst.vd], + context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VCMPGTUH); + +void SEMANTIC(VCMPGTUW)(CrField *cr6, v128 &d, v128 a, v128 b) { + auto r = gv_gtu32(a, b); + if (cr6 != nullptr) { + cr6->set(gv_testall1(r), false, gv_testall0(r), false); + } + d = r; +} +void DECODER(VCMPGTUW) { + VCMPGTUW(inst.oe ? context.cr.fields + 6 : nullptr, context.vr[inst.vd], + context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VCMPGTUW); + +void SEMANTIC(VCTSXS)(v128 &d, v128 b, v128 &sat, u32 i) { + auto r = gv_mulfs(b, gv_bcst32(i)); + auto l = gv_ltfs(r, gv_bcstfs(-2147483648.)); + auto h = gv_gefs(r, gv_bcstfs(2147483648.)); +#if !defined(ARCH_X64) && !defined(ARCH_ARM64) + r = gv_selectfs(l, gv_bcstfs(-2147483648.), r); +#endif + r = gv_cvtfs_tos32(r); +#if !defined(ARCH_ARM64) + r = gv_select32(h, gv_bcst32(0x7fffffff), r); +#endif + r = gv_and32(r, gv_eqfs(b, b)); + sat = gv_or32(gv_or32(l, h), sat); + d = r; +} +void DECODER(VCTSXS) { + VCTSXS(context.vr[inst.vd], context.vr[inst.vb], context.sat, + (inst.vuimm + 127) << 23); +} +EXPORT_SEMANTIC(VCTSXS); + +void SEMANTIC(VCTUXS)(v128 &d, v128 b, v128 &sat, u32 i) { + auto r = gv_mulfs(b, gv_bcst32(i)); + auto l = gv_ltfs(r, gv_bcstfs(0.)); + auto h = gv_gefs(r, gv_bcstfs(4294967296.)); + r = gv_cvtfs_tou32(r); +#if !defined(ARCH_ARM64) + r = gv_andn32(l, r); // saturate to zero +#endif +#if !defined(__AVX512VL__) && !defined(ARCH_ARM64) + r = gv_or32(r, h); // saturate to 0xffffffff +#endif + r = gv_and32(r, gv_eqfs(b, b)); + + sat = gv_or32(gv_or32(l, h), sat); + d = r; +} +void DECODER(VCTUXS) { + VCTUXS(context.vr[inst.vd], context.vr[inst.vb], context.sat, + (inst.vuimm + 127) << 23); +} +EXPORT_SEMANTIC(VCTUXS); + +void SEMANTIC(VEXPTEFP)(v128 &d, v128 b) { + // for (u32 i = 0; i < 4; i++) d._f[i] = std::exp2f(b._f[i]); + d = ppu_set_vnan(gv_exp2_approxfs(b)); +} +void DECODER(VEXPTEFP) { VEXPTEFP(context.vr[inst.vd], context.vr[inst.vb]); } +EXPORT_SEMANTIC(VEXPTEFP); + +void SEMANTIC(VLOGEFP)(v128 &d, v128 b) { + // for (u32 i = 0; i < 4; i++) d._f[i] = std::log2f(b._f[i]); + d = ppu_set_vnan(gv_log2_approxfs(b)); +} +void DECODER(VLOGEFP) { VLOGEFP(context.vr[inst.vd], context.vr[inst.vb]); } +EXPORT_SEMANTIC(VLOGEFP); + +void SEMANTIC(VMADDFP)(v128 &d, v128 a_, v128 b_, v128 c_, u32 jm_mask) { + auto m = gv_bcst32(jm_mask); + auto a = ppu_flush_denormal(m, a_); + auto b = ppu_flush_denormal(m, b_); + auto c = ppu_flush_denormal(m, c_); + d = ppu_flush_denormal(m, ppu_set_vnan(gv_fmafs(a, c, b))); +} +void DECODER(VMADDFP) { + VMADDFP(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.vr[inst.vc], context.jm_mask); +} +EXPORT_SEMANTIC(VMADDFP); + +void SEMANTIC(VMAXFP)(v128 &d, v128 a, v128 b, u32 jm_mask) { + d = ppu_flush_denormal(gv_bcst32(jm_mask), + ppu_set_vnan(gv_maxfs(a, b), a, b)); +} +void DECODER(VMAXFP) { + VMAXFP(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.jm_mask); +} +EXPORT_SEMANTIC(VMAXFP); + +void SEMANTIC(VMAXSB)(v128 &d, v128 a, v128 b) { d = gv_maxs8(a, b); } +void DECODER(VMAXSB) { + VMAXSB(context.vr[inst.vd], context.vr[inst.va], 
context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMAXSB); + +void SEMANTIC(VMAXSH)(v128 &d, v128 a, v128 b) { d = gv_maxs16(a, b); } +void DECODER(VMAXSH) { + VMAXSH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMAXSH); + +void SEMANTIC(VMAXSW)(v128 &d, v128 a, v128 b) { d = gv_maxs32(a, b); } +void DECODER(VMAXSW) { + VMAXSW(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMAXSW); + +void SEMANTIC(VMAXUB)(v128 &d, v128 a, v128 b) { d = gv_maxu8(a, b); } +void DECODER(VMAXUB) { + VMAXUB(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMAXUB); + +void SEMANTIC(VMAXUH)(v128 &d, v128 a, v128 b) { d = gv_maxu16(a, b); } +void DECODER(VMAXUH) { + VMAXUH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMAXUH); + +void SEMANTIC(VMAXUW)(v128 &d, v128 a, v128 b) { d = gv_maxu32(a, b); } +void DECODER(VMAXUW) { + VMAXUW(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMAXUW); + +void SEMANTIC(VMHADDSHS)(v128 &d, v128 a, v128 b, v128 c, v128 &sat) { + auto m = gv_muls_hds16(a, b); + auto f = gv_gts16(gv_bcst16(0), c); + auto x = gv_eq16(gv_maxs16(a, b), gv_bcst16(0x8000)); + auto r = gv_sub16(gv_adds_s16(m, c), gv_and32(x, f)); + auto s = gv_add16(m, c); + + sat = gv_or32(gv_or32(gv_andn32(f, x), gv_andn32(x, gv_xor32(s, r))), sat); + d = r; +} +void DECODER(VMHADDSHS) { + VMHADDSHS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.vr[inst.vc], context.sat); +} +EXPORT_SEMANTIC(VMHADDSHS); + +void SEMANTIC(VMHRADDSHS)(v128 &d, v128 a, v128 b, v128 c, v128 &sat) { + auto m = gv_rmuls_hds16(a, b); + auto f = gv_gts16(gv_bcst16(0), c); + auto x = gv_eq16(gv_maxs16(a, b), gv_bcst16(0x8000)); + auto r = gv_sub16(gv_adds_s16(m, c), gv_and32(x, f)); + auto s = gv_add16(m, c); + sat = gv_or32(gv_or32(gv_andn32(f, x), gv_andn32(x, gv_xor32(s, r))), sat); + d = r; +} +void DECODER(VMHRADDSHS) { + VMHRADDSHS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.vr[inst.vc], context.sat); +} +EXPORT_SEMANTIC(VMHRADDSHS); + +void SEMANTIC(VMINFP)(v128 &d, v128 a, v128 b, u32 jm_mask) { + d = ppu_flush_denormal(gv_bcst32(jm_mask), + ppu_set_vnan(gv_minfs(a, b), a, b)); +} +void DECODER(VMINFP) { + VMINFP(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.jm_mask); +} +EXPORT_SEMANTIC(VMINFP); + +void SEMANTIC(VMINSB)(v128 &d, v128 a, v128 b) { d = gv_mins8(a, b); } +void DECODER(VMINSB) { + VMINSB(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMINSB); + +void SEMANTIC(VMINSH)(v128 &d, v128 a, v128 b) { d = gv_mins16(a, b); } +void DECODER(VMINSH) { + VMINSH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMINSH); + +void SEMANTIC(VMINSW)(v128 &d, v128 a, v128 b) { d = gv_mins32(a, b); } +void DECODER(VMINSW) { + VMINSW(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMINSW); + +void SEMANTIC(VMINUB)(v128 &d, v128 a, v128 b) { d = gv_minu8(a, b); } +void DECODER(VMINUB) { + VMINUB(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMINUB); + +void SEMANTIC(VMINUH)(v128 &d, v128 a, v128 b) { d = gv_minu16(a, b); } +void DECODER(VMINUH) { + VMINUH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMINUH); + +void SEMANTIC(VMINUW)(v128 &d, v128 a, v128 b) { d = gv_minu32(a, b); } +void DECODER(VMINUW) { + 
  VMINUW(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]);
+}
+EXPORT_SEMANTIC(VMINUW);
+
+void SEMANTIC(VMLADDUHM)(v128 &d, v128 a, v128 b, v128 c) {
+  d = gv_muladd16(a, b, c);
+}
+void DECODER(VMLADDUHM) {
+  VMLADDUHM(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb],
+            context.vr[inst.vc]);
+}
+EXPORT_SEMANTIC(VMLADDUHM);
+
+void SEMANTIC(VMRGHB)(v128 &d, v128 a, v128 b) { d = gv_unpackhi8(b, a); }
+void DECODER(VMRGHB) {
+  VMRGHB(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]);
+}
+EXPORT_SEMANTIC(VMRGHB);
+
+void SEMANTIC(VMRGHH)(v128 &d, v128 a, v128 &b) { d = gv_unpackhi16(b, a); }
+void DECODER(VMRGHH) {
+  VMRGHH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]);
+}
+EXPORT_SEMANTIC(VMRGHH);
+
+void SEMANTIC(VMRGHW)(v128 &d, v128 a, v128 b) { d = gv_unpackhi32(b, a); }
+void DECODER(VMRGHW) {
+  VMRGHW(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]);
+}
+EXPORT_SEMANTIC(VMRGHW);
+
+void SEMANTIC(VMRGLB)(v128 &d, v128 a, v128 b) { d = gv_unpacklo8(b, a); }
+void DECODER(VMRGLB) {
+  VMRGLB(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]);
+}
+EXPORT_SEMANTIC(VMRGLB);
+
+void SEMANTIC(VMRGLH)(v128 &d, v128 a, v128 b) { d = gv_unpacklo16(b, a); }
+void DECODER(VMRGLH) {
+  VMRGLH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]);
+}
+EXPORT_SEMANTIC(VMRGLH);
+
+void SEMANTIC(VMRGLW)(v128 &d, v128 a, v128 b) { d = gv_unpacklo32(b, a); }
+void DECODER(VMRGLW) {
+  VMRGLW(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]);
+}
+EXPORT_SEMANTIC(VMRGLW);
+
+void SEMANTIC(VMSUMMBM)(v128 &d, v128 a, v128 b, v128 c) {
+  d = gv_dotu8s8x4(b, a, c);
+}
+void DECODER(VMSUMMBM) {
+  VMSUMMBM(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb],
+           context.vr[inst.vc]);
+}
+EXPORT_SEMANTIC(VMSUMMBM);
+
+void SEMANTIC(VMSUMSHM)(v128 &d, v128 a, v128 b, v128 c) {
+  d = gv_dots16x2(a, b, c);
+}
+void DECODER(VMSUMSHM) {
+  VMSUMSHM(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb],
+           context.vr[inst.vc]);
+}
+EXPORT_SEMANTIC(VMSUMSHM);
+
+void SEMANTIC(VMSUMSHS)(v128 &d, v128 a, v128 b, v128 c, v128 &sat) {
+  auto r = gv_dots_s16x2(a, b, c);
+  auto s = gv_dots16x2(a, b, c);
+  sat = gv_or32(gv_xor32(s, r), sat);
+  d = r;
+}
+void DECODER(VMSUMSHS) {
+  VMSUMSHS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb],
+           context.vr[inst.vc], context.sat);
+}
+EXPORT_SEMANTIC(VMSUMSHS);
+
+void SEMANTIC(VMSUMUBM)(v128 &d, v128 a, v128 b, v128 c) {
+  d = gv_dotu8x4(a, b, c);
+}
+void DECODER(VMSUMUBM) {
+  VMSUMUBM(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb],
+           context.vr[inst.vc]);
+}
+EXPORT_SEMANTIC(VMSUMUBM);
+
+void SEMANTIC(VMSUMUHM)(v128 &d, v128 a, v128 b, v128 c) {
+  d = gv_add32(c, gv_dotu16x2(a, b));
+}
+void DECODER(VMSUMUHM) {
+  VMSUMUHM(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb],
+           context.vr[inst.vc]);
+}
+EXPORT_SEMANTIC(VMSUMUHM);
+
+void SEMANTIC(VMSUMUHS)(v128 &d, v128 a, v128 b, v128 c, v128 &sat) {
+  auto m1 = gv_mul_even_u16(a, b);
+  auto m2 = gv_mul_odds_u16(a, b);
+  auto s1 = gv_add32(m1, m2);
+  auto x1 = gv_gtu32(m1, s1);
+  auto s2 = gv_or32(gv_add32(s1, c), x1);
+  auto x2 = gv_gtu32(s1, s2);
+  sat = gv_or32(gv_or32(x1, x2), sat);
+  d = gv_or32(s2, x2);
+}
+void DECODER(VMSUMUHS) {
+  VMSUMUHS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb],
+           context.vr[inst.vc], context.sat);
+}
+EXPORT_SEMANTIC(VMSUMUHS);
+
+void SEMANTIC(VMULESB)(v128 &d, v128 a, v128 b) {
+  d = gv_mul16(gv_sar16(a, 8), gv_sar16(b, 8));
+}
+void DECODER(VMULESB) { + VMULESB(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMULESB); + +void SEMANTIC(VMULESH)(v128 &d, v128 a, v128 b) { d = gv_mul_odds_s16(a, b); } +void DECODER(VMULESH) { + VMULESH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMULESH); + +void SEMANTIC(VMULEUB)(v128 &d, v128 a, v128 b) { + d = gv_mul16(gv_shr16(a, 8), gv_shr16(b, 8)); +} +void DECODER(VMULEUB) { + VMULEUB(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMULEUB); + +void SEMANTIC(VMULEUH)(v128 &d, v128 a, v128 b) { d = gv_mul_odds_u16(a, b); } +void DECODER(VMULEUH) { + VMULEUH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMULEUH); + +void SEMANTIC(VMULOSB)(v128 &d, v128 a, v128 b) { + d = gv_mul16(gv_sar16(gv_shl16(a, 8), 8), gv_sar16(gv_shl16(b, 8), 8)); +} +void DECODER(VMULOSB) { + VMULOSB(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMULOSB); + +void SEMANTIC(VMULOSH)(v128 &d, v128 a, v128 b) { d = gv_mul_even_s16(a, b); } +void DECODER(VMULOSH) { + VMULOSH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMULOSH); + +void SEMANTIC(VMULOUB)(v128 &d, v128 a, v128 b) { + auto mask = gv_bcst16(0x00ff); + d = gv_mul16(gv_and32(a, mask), gv_and32(b, mask)); +} +void DECODER(VMULOUB) { + VMULOUB(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMULOUB); + +void SEMANTIC(VMULOUH)(v128 &d, v128 a, v128 b) { d = gv_mul_even_u16(a, b); } +void DECODER(VMULOUH) { + VMULOUH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMULOUH); + +void SEMANTIC(VNMSUBFP)(v128 &d, v128 a_, v128 b_, v128 c_, u32 jm_mask) { + // An odd case with (FLT_MIN, FLT_MIN, FLT_MIN) produces FLT_MIN instead of + // 0 + auto s = gv_bcstfs(-0.0f); + auto m = gv_bcst32(jm_mask); + auto a = ppu_flush_denormal(m, a_); + auto b = ppu_flush_denormal(m, b_); + auto c = ppu_flush_denormal(m, c_); + auto r = gv_xorfs(s, gv_fmafs(a, c, gv_xorfs(b, s))); + d = ppu_flush_denormal(m, ppu_set_vnan(r)); +} +void DECODER(VNMSUBFP) { + VNMSUBFP(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.vr[inst.vc], context.jm_mask); +} +EXPORT_SEMANTIC(VNMSUBFP); + +void SEMANTIC(VNOR)(v128 &d, v128 a, v128 b) { d = gv_notfs(gv_orfs(a, b)); } +void DECODER(VNOR) { + VNOR(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VNOR); + +void SEMANTIC(VOR)(v128 &d, v128 a, v128 b) { d = gv_orfs(a, b); } +void DECODER(VOR) { + VOR(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VOR); + +void SEMANTIC(VPERM)(v128 &d, v128 a, v128 b, v128 c) { +#if defined(ARCH_ARM64) + uint8x16x2_t ab; + ab.val[0] = b; + ab.val[1] = a; + d = vqtbl2q_u8(ab, vbicq_u8(vdupq_n_u8(0x1f), c)); +#else + u8 ab[32]; + std::memcpy(ab + 0, &b, 16); + std::memcpy(ab + 16, &a, 16); + + for (u32 i = 0; i < 16; i++) { + d._u8[i] = ab[~c._u8[i] & 0x1f]; + } +#endif +} +void DECODER(VPERM) { + VPERM(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.vr[inst.vc]); +} +EXPORT_SEMANTIC(VPERM); + +void SEMANTIC(VPKPX)(v128 &d, v128 a, v128 b) { + auto a1 = gv_sar32(gv_shl32(a, 7), 7 + 9); + auto b1 = gv_sar32(gv_shl32(b, 7), 7 + 9); + auto a2 = gv_sar32(gv_shl32(a, 16), 16 + 3); + auto b2 = gv_sar32(gv_shl32(b, 16), 16 + 3); + auto p1 = gv_packss_s32(b1, a1); + auto p2 = gv_packss_s32(b2, a2); + d = 
gv_or32(gv_or32(gv_and32(p1, gv_bcst16(0xfc00)),
+                      gv_shl16(gv_and32(p1, gv_bcst16(0x7c)), 3)),
+              gv_and32(p2, gv_bcst16(0x1f)));
+}
+void DECODER(VPKPX) {
+  VPKPX(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]);
+}
+EXPORT_SEMANTIC(VPKPX);
+
+void SEMANTIC(VPKSHSS)(v128 &d, v128 a, v128 b, v128 &sat) {
+  sat = gv_or32(
+      gv_shr16(gv_add16(a, gv_bcst16(0x80)) | gv_add16(b, gv_bcst16(0x80)), 8),
+      sat);
+  d = gv_packss_s16(b, a);
+}
+void DECODER(VPKSHSS) {
+  VPKSHSS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb],
+          context.sat);
+}
+EXPORT_SEMANTIC(VPKSHSS);
+
+void SEMANTIC(VPKSHUS)(v128 &d, v128 a, v128 b, v128 &sat) {
+  sat = gv_or32(gv_shr16(a | b, 8), sat);
+  d = gv_packus_s16(b, a);
+}
+void DECODER(VPKSHUS) {
+  VPKSHUS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb],
+          context.sat);
+}
+EXPORT_SEMANTIC(VPKSHUS);
+
+void SEMANTIC(VPKSWSS)(v128 &d, v128 a, v128 b, v128 &sat) {
+  sat = gv_or32(
+      gv_shr32(gv_add32(a, gv_bcst32(0x8000)) | gv_add32(b, gv_bcst32(0x8000)),
+               16),
+      sat);
+  d = gv_packss_s32(b, a);
+}
+void DECODER(VPKSWSS) {
+  VPKSWSS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb],
+          context.sat);
+}
+EXPORT_SEMANTIC(VPKSWSS);
+
+void SEMANTIC(VPKSWUS)(v128 &d, v128 a, v128 b, v128 &sat) {
+  sat = gv_or32(gv_shr32(a | b, 16), sat);
+  d = gv_packus_s32(b, a);
+}
+void DECODER(VPKSWUS) {
+  VPKSWUS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb],
+          context.sat);
+}
+EXPORT_SEMANTIC(VPKSWUS);
+
+void SEMANTIC(VPKUHUM)(v128 &d, v128 a, v128 b) { d = gv_packtu16(b, a); }
+void DECODER(VPKUHUM) {
+  VPKUHUM(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]);
+}
+EXPORT_SEMANTIC(VPKUHUM);
+
+void SEMANTIC(VPKUHUS)(v128 &d, v128 a, v128 b, v128 &sat) {
+  sat = gv_or32(gv_shr16(a | b, 8), sat);
+  d = gv_packus_u16(b, a);
+}
+void DECODER(VPKUHUS) {
+  VPKUHUS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb],
+          context.sat);
+}
+EXPORT_SEMANTIC(VPKUHUS);
+
+void SEMANTIC(VPKUWUM)(v128 &d, v128 a, v128 b) { d = gv_packtu32(b, a); }
+void DECODER(VPKUWUM) {
+  VPKUWUM(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]);
+}
+EXPORT_SEMANTIC(VPKUWUM);
+
+void SEMANTIC(VPKUWUS)(v128 &d, v128 a, v128 b, v128 &sat) {
+  sat = gv_or32(gv_shr32(a | b, 16), sat);
+  d = gv_packus_u32(b, a);
+}
+void DECODER(VPKUWUS) {
+  VPKUWUS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb],
+          context.sat);
+}
+EXPORT_SEMANTIC(VPKUWUS);
+
+void SEMANTIC(VREFP)(v128 &d, v128 b_, u32 jm_mask) {
+  auto m = gv_bcst32(jm_mask);
+  auto b = ppu_flush_denormal(m, b_);
+  d = ppu_flush_denormal(m, ppu_set_vnan(gv_divfs(gv_bcstfs(1.0f), b), b));
+}
+void DECODER(VREFP) {
+  VREFP(context.vr[inst.vd], context.vr[inst.vb], context.jm_mask);
+}
+EXPORT_SEMANTIC(VREFP);
+
+void SEMANTIC(VRFIM)(v128 &d, v128 b_, u32 jm_mask) {
+  auto m = gv_bcst32(jm_mask);
+  auto b = ppu_flush_denormal(m, b_);
+  d = ppu_flush_denormal(m, ppu_set_vnan(gv_roundfs_floor(b), b));
+}
+void DECODER(VRFIM) {
+  VRFIM(context.vr[inst.vd], context.vr[inst.vb], context.jm_mask);
+}
+EXPORT_SEMANTIC(VRFIM);
+
+void SEMANTIC(VRFIN)(v128 &d, v128 b, u32 jm_mask) {
+  auto m = gv_bcst32(jm_mask);
+  d = ppu_flush_denormal(m, ppu_set_vnan(gv_roundfs_even(b), b));
+}
+void DECODER(VRFIN) {
+  VRFIN(context.vr[inst.vd], context.vr[inst.vb], context.jm_mask);
+}
+EXPORT_SEMANTIC(VRFIN);
+
+void SEMANTIC(VRFIP)(v128 &d, v128 b_, u32 jm_mask) {
+  auto m = gv_bcst32(jm_mask);
+  auto b = ppu_flush_denormal(m, b_);
+  d = ppu_flush_denormal(m,
ppu_set_vnan(gv_roundfs_ceil(b), b)); +} +void DECODER(VRFIP) { + VRFIP(context.vr[inst.vd], context.vr[inst.vb], context.jm_mask); +} +EXPORT_SEMANTIC(VRFIP); + +void SEMANTIC(VRFIZ)(v128 &d, v128 b, u32 jm_mask) { + auto m = gv_bcst32(jm_mask); + d = ppu_flush_denormal(m, ppu_set_vnan(gv_roundfs_trunc(b), b)); +} +void DECODER(VRFIZ) { + VRFIZ(context.vr[inst.vd], context.vr[inst.vb], context.jm_mask); +} +EXPORT_SEMANTIC(VRFIZ); + +void SEMANTIC(VRLB)(v128 &d, v128 a, v128 b) { d = gv_rol8(a, b); } +void DECODER(VRLB) { + VRLB(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VRLB); + +void SEMANTIC(VRLH)(v128 &d, v128 a, v128 b) { d = gv_rol16(a, b); } +void DECODER(VRLH) { + VRLH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VRLH); + +void SEMANTIC(VRLW)(v128 &d, v128 a, v128 b) { d = gv_rol32(a, b); } +void DECODER(VRLW) { + VRLW(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VRLW); + +void SEMANTIC(VRSQRTEFP)(v128 &d, v128 b_, u32 jm_mask) { + auto m = gv_bcst32(jm_mask); + auto b = ppu_flush_denormal(m, b_); + d = ppu_flush_denormal( + m, ppu_set_vnan(gv_divfs(gv_bcstfs(1.0f), gv_sqrtfs(b)), b)); +} +void DECODER(VRSQRTEFP) { + VRSQRTEFP(context.vr[inst.vd], context.vr[inst.vb], context.jm_mask); +} +EXPORT_SEMANTIC(VRSQRTEFP); + +void SEMANTIC(VSEL)(v128 &d, v128 a, v128 b, v128 c) { + auto x = gv_andfs(b, c); + d = gv_orfs(x, gv_andnfs(c, a)); +} +void DECODER(VSEL) { + VSEL(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.vr[inst.vc]); +} +EXPORT_SEMANTIC(VSEL); + +void SEMANTIC(VSL)(v128 &d, v128 a, v128 b) { + d = gv_fshl8(a, gv_shuffle_left<1>(a), b); +} +void DECODER(VSL) { + VSL(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSL); + +void SEMANTIC(VSLB)(v128 &d, v128 a, v128 b) { d = gv_shl8(a, b); } +void DECODER(VSLB) { + VSLB(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSLB); + +template static void VSLDOI_IMPL(v128 &d, v128 a, v128 b) { + d = gv_or32(gv_shuffle_left(a), gv_shuffle_right<16 - Count>(b)); +} +void SEMANTIC(VSLDOI)(v128 &d, v128 a, v128 b, u32 vsh) { + switch (vsh) { + case 0: + VSLDOI_IMPL<0>(d, a, b); + break; + case 1: + VSLDOI_IMPL<1>(d, a, b); + break; + case 2: + VSLDOI_IMPL<2>(d, a, b); + break; + case 3: + VSLDOI_IMPL<3>(d, a, b); + break; + case 4: + VSLDOI_IMPL<4>(d, a, b); + break; + case 5: + VSLDOI_IMPL<5>(d, a, b); + break; + case 6: + VSLDOI_IMPL<6>(d, a, b); + break; + case 7: + VSLDOI_IMPL<7>(d, a, b); + break; + case 8: + VSLDOI_IMPL<8>(d, a, b); + break; + case 9: + VSLDOI_IMPL<9>(d, a, b); + break; + case 10: + VSLDOI_IMPL<10>(d, a, b); + break; + case 11: + VSLDOI_IMPL<11>(d, a, b); + break; + case 12: + VSLDOI_IMPL<12>(d, a, b); + break; + case 13: + VSLDOI_IMPL<13>(d, a, b); + break; + case 14: + VSLDOI_IMPL<14>(d, a, b); + break; + case 15: + VSLDOI_IMPL<15>(d, a, b); + break; + } +} +void DECODER(VSLDOI) { + VSLDOI(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + inst.vsh); +} +EXPORT_SEMANTIC(VSLDOI); + +void SEMANTIC(VSLH)(v128 &d, v128 a, v128 b) { d = gv_shl16(a, b); } +void DECODER(VSLH) { + VSLH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSLH); + +void SEMANTIC(VSLO)(v128 &d, v128 a, v128 b) { + d._u = a._u << (b._u8[0] & 0x78); +} +void DECODER(VSLO) { + VSLO(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSLO); + +void 
SEMANTIC(VSLW)(v128 &d, v128 a, v128 b) { d = gv_shl32(a, b); } +void DECODER(VSLW) { + VSLW(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSLW); + +void SEMANTIC(VSPLTB)(v128 &d, v128 b, std::uint32_t imm) { + d = gv_bcst8(b.u8r[imm & 15]); +} +void DECODER(VSPLTB) { + VSPLTB(context.vr[inst.vd], context.vr[inst.vb], inst.vuimm); +} +EXPORT_SEMANTIC(VSPLTB); + +void SEMANTIC(VSPLTH)(v128 &d, v128 b, std::uint32_t imm) { + d = gv_bcst16(b.u16r[imm & 7]); +} +void DECODER(VSPLTH) { + VSPLTH(context.vr[inst.vd], context.vr[inst.vb], inst.vuimm); +} +EXPORT_SEMANTIC(VSPLTH); + +void SEMANTIC(VSPLTISB)(v128 &d, std::int32_t imm) { d = gv_bcst8(imm); } +void DECODER(VSPLTISB) { VSPLTISB(context.vr[inst.vd], inst.vsimm); } +EXPORT_SEMANTIC(VSPLTISB); + +void SEMANTIC(VSPLTISH)(v128 &d, std::int32_t imm) { d = gv_bcst16(imm); } +void DECODER(VSPLTISH) { VSPLTISH(context.vr[inst.vd], inst.vsimm); } +EXPORT_SEMANTIC(VSPLTISH); + +void SEMANTIC(VSPLTISW)(v128 &d, std::int32_t imm) { d = gv_bcst32(imm); } +void DECODER(VSPLTISW) { VSPLTISW(context.vr[inst.vd], inst.vsimm); } +EXPORT_SEMANTIC(VSPLTISW); + +void SEMANTIC(VSPLTW)(v128 &d, v128 b, u32 imm) { + d = gv_bcst32(b.u32r[imm & 3]); +} +void DECODER(VSPLTW) { + VSPLTW(context.vr[inst.vd], context.vr[inst.vb], inst.vuimm); +} +EXPORT_SEMANTIC(VSPLTW); + +void SEMANTIC(VSR)(v128 &d, v128 a, v128 b) { + d = gv_fshr8(gv_shuffle_right<1>(a), a, b); +} +void DECODER(VSR) { + VSR(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSR); + +void SEMANTIC(VSRAB)(v128 &d, v128 a, v128 b) { d = gv_sar8(a, b); } +void DECODER(VSRAB) { + VSRAB(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSRAB); + +void SEMANTIC(VSRAH)(v128 &d, v128 a, v128 b) { d = gv_sar16(a, b); } +void DECODER(VSRAH) { + VSRAH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSRAH); + +void SEMANTIC(VSRAW)(v128 &d, v128 a, v128 b) { d = gv_sar32(a, b); } +void DECODER(VSRAW) { + VSRAW(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSRAW); + +void SEMANTIC(VSRB)(v128 &d, v128 a, v128 b) { d = gv_shr8(a, b); } +void DECODER(VSRB) { + VSRB(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSRB); + +void SEMANTIC(VSRH)(v128 &d, v128 a, v128 b) { d = gv_shr16(a, b); } +void DECODER(VSRH) { + VSRH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSRH); + +void SEMANTIC(VSRO)(v128 &d, v128 a, v128 b) { + d._u = a._u >> (b._u8[0] & 0x78); +} +void DECODER(VSRO) { + VSRO(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSRO); + +void SEMANTIC(VSRW)(v128 &d, v128 a, v128 b) { d = gv_shr32(a, b); } +void DECODER(VSRW) { + VSRW(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSRW); + +void SEMANTIC(VSUBCUW)(v128 &d, v128 a, v128 b) { + d = gv_shr32(gv_geu32(a, b), 31); +} +void DECODER(VSUBCUW) { + VSUBCUW(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSUBCUW); + +void SEMANTIC(VSUBFP)(v128 &d, v128 a_, v128 b_, u32 jm_mask) { + auto m = gv_bcst32(jm_mask); + auto a = ppu_flush_denormal(m, a_); + auto b = ppu_flush_denormal(m, b_); + d = ppu_flush_denormal(m, ppu_set_vnan(gv_subfs(a, b), a, b)); +} +void DECODER(VSUBFP) { + VSUBFP(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.jm_mask); +} +EXPORT_SEMANTIC(VSUBFP); + 
+void SEMANTIC(VSUBSBS)(v128 &d, v128 a, v128 b, v128 &sat) { + auto r = gv_subs_s8(a, b); + sat = gv_or32(gv_xor32(gv_sub8(a, b), r), sat); + d = r; +} +void DECODER(VSUBSBS) { + VSUBSBS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VSUBSBS); + +void SEMANTIC(VSUBSHS)(v128 &d, v128 a, v128 b, v128 &sat) { + auto r = gv_subs_s16(a, b); + sat = gv_or32(gv_xor32(gv_sub16(a, b), r), sat); + d = r; +} +void DECODER(VSUBSHS) { + VSUBSHS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VSUBSHS); + +void SEMANTIC(VSUBSWS)(v128 &d, v128 a, v128 b, v128 &sat) { + auto r = gv_subs_s32(a, b); + sat = gv_or32(gv_xor32(gv_sub32(a, b), r), sat); + d = r; +} +void DECODER(VSUBSWS) { + VSUBSWS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VSUBSWS); + +void SEMANTIC(VSUBUBM)(v128 &d, v128 a, v128 b) { d = gv_sub8(a, b); } +void DECODER(VSUBUBM) { + VSUBUBM(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSUBUBM); + +void SEMANTIC(VSUBUBS)(v128 &d, v128 a, v128 b, v128 &sat) { + auto r = gv_subus_u8(a, b); + sat = gv_or32(gv_xor32(gv_sub8(a, b), r), sat); + d = r; +} +void DECODER(VSUBUBS) { + VSUBUBS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VSUBUBS); + +void SEMANTIC(VSUBUHM)(v128 &d, v128 a, v128 b) { d = gv_sub16(a, b); } +void DECODER(VSUBUHM) { + VSUBUHM(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSUBUHM); + +void SEMANTIC(VSUBUHS)(v128 &d, v128 a, v128 b, v128 &sat) { + auto r = gv_subus_u16(a, b); + sat = gv_or32(gv_xor32(gv_sub16(a, b), r), sat); + d = r; +} +void DECODER(VSUBUHS) { + VSUBUHS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VSUBUHS); + +void SEMANTIC(VSUBUWM)(v128 &d, v128 a, v128 b) { d = gv_sub32(a, b); } +void DECODER(VSUBUWM) { + VSUBUWM(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSUBUWM); + +void SEMANTIC(VSUBUWS)(v128 &d, v128 a, v128 b, v128 &sat) { + auto r = gv_subus_u32(a, b); + sat = gv_or32(gv_xor32(gv_sub32(a, b), r), sat); + d = r; +} +void DECODER(VSUBUWS) { + VSUBUWS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VSUBUWS); + +void SEMANTIC(VSUMSWS)(v128 &d, v128 a, v128 b, v128 &sat) { + s64 sum = s64{b._s32[0]} + a._s32[0] + a._s32[1] + a._s32[2] + a._s32[3]; + if (sum > INT32_MAX) { + sum = u32(INT32_MAX); + sat._bytes[0] = 1; + } else if (sum < INT32_MIN) { + sum = u32(INT32_MIN); + sat._bytes[0] = 1; + } else { + sum = static_cast(sum); + } + + d._u = sum; +} +void DECODER(VSUMSWS) { + VSUMSWS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VSUMSWS); + +void SEMANTIC(VSUM2SWS)(v128 &d, v128 a, v128 b, v128 &sat) { +#if defined(__AVX512VL__) + const auto x = gv_add64(gv_sar64(gv_shl64(a, 32), 32), gv_sar64(a, 32)); + const auto y = gv_add64(x, gv_sar64(gv_shl64(b, 32), 32)); + const auto r = + _mm_unpacklo_epi32(_mm_cvtsepi64_epi32(y), _mm_setzero_si128()); +#elif defined(ARCH_ARM64) + const auto x = + vaddl_s32(vget_low_s32(vuzp1q_s32(a, a)), vget_low_s32(vuzp2q_s32(a, a))); + const auto y = vaddw_s32(x, vget_low_s32(vuzp1q_s32(b, b))); + const auto r = vmovl_u32(uint32x2_t(vqmovn_s64(y))); +#else + v128 y{}; + y._s64[0] = s64{a._s32[0]} + a._s32[1] + b._s32[0]; + y._s64[1] = 
s64{a._s32[2]} + a._s32[3] + b._s32[2]; + v128 r{}; + r._u64[0] = y._s64[0] > INT32_MAX ? INT32_MAX + : y._s64[0] < INT32_MIN ? u32(INT32_MIN) + : static_cast(y._s64[0]); + r._u64[1] = y._s64[1] > INT32_MAX ? INT32_MAX + : y._s64[1] < INT32_MIN ? u32(INT32_MIN) + : static_cast(y._s64[1]); +#endif + sat = gv_or32(gv_shr64(gv_add64(y, gv_bcst64(0x80000000u)), 32), sat); + d = r; +} +void DECODER(VSUM2SWS) { + VSUM2SWS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VSUM2SWS); + +void SEMANTIC(VSUM4SBS)(v128 &d, v128 a, v128 b, v128 &sat) { + auto r = gv_dots_u8s8x4(gv_bcst8(1), a, b); + sat = gv_or32(gv_xor32(gv_hadds8x4(a, b), r), sat); + d = r; +} +void DECODER(VSUM4SBS) { + VSUM4SBS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VSUM4SBS); + +void SEMANTIC(VSUM4SHS)(v128 &d, v128 a, v128 b, v128 &sat) { + auto r = gv_dots_s16x2(a, gv_bcst16(1), b); + sat = gv_or32(gv_xor32(gv_hadds16x2(a, b), r), sat); + d = r; +} +void DECODER(VSUM4SHS) { + VSUM4SHS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VSUM4SHS); + +void SEMANTIC(VSUM4UBS)(v128 &d, v128 a, v128 b, v128 &sat) { + auto x = gv_haddu8x4(a); + auto r = gv_addus_u32(x, b); + sat = gv_or32(gv_xor32(gv_add32(x, b), r), sat); + d = r; +} +void DECODER(VSUM4UBS) { + VSUM4UBS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VSUM4UBS); + +void SEMANTIC(VUPKHPX)(v128 &d, v128 b) { + auto x = gv_extend_hi_s16(b); + auto y = gv_or32(gv_and32(gv_shl32(x, 6), gv_bcst32(0x1f0000)), + gv_and32(gv_shl32(x, 3), gv_bcst32(0x1f00))); + d = gv_or32(y, gv_and32(x, gv_bcst32(0xff00001f))); +} +void DECODER(VUPKHPX) { VUPKHPX(context.vr[inst.vd], context.vr[inst.vb]); } +EXPORT_SEMANTIC(VUPKHPX); + +void SEMANTIC(VUPKHSB)(v128 &d, v128 b) { d = gv_extend_hi_s8(b); } +void DECODER(VUPKHSB) { VUPKHSB(context.vr[inst.vd], context.vr[inst.vb]); } +EXPORT_SEMANTIC(VUPKHSB); + +void SEMANTIC(VUPKHSH)(v128 &d, v128 b) { d = gv_extend_hi_s16(b); } +void DECODER(VUPKHSH) { VUPKHSH(context.vr[inst.vd], context.vr[inst.vb]); } +EXPORT_SEMANTIC(VUPKHSH); + +void SEMANTIC(VUPKLPX)(v128 &d, v128 b) { + auto x = gv_extend_lo_s16(b); + auto y = gv_or32(gv_and32(gv_shl32(x, 6), gv_bcst32(0x1f0000)), + gv_and32(gv_shl32(x, 3), gv_bcst32(0x1f00))); + d = gv_or32(y, gv_and32(x, gv_bcst32(0xff00001f))); +} +void DECODER(VUPKLPX) { VUPKLPX(context.vr[inst.vd], context.vr[inst.vb]); } +EXPORT_SEMANTIC(VUPKLPX); + +void SEMANTIC(VUPKLSB)(v128 &d, v128 b) { d = gv_extend_lo_s8(b); } +void DECODER(VUPKLSB) { VUPKLSB(context.vr[inst.vd], context.vr[inst.vb]); } +EXPORT_SEMANTIC(VUPKLSB); + +void SEMANTIC(VUPKLSH)(v128 &d, v128 b) { d = gv_extend_lo_s16(b); } +void DECODER(VUPKLSH) { VUPKLSH(context.vr[inst.vd], context.vr[inst.vb]); } +EXPORT_SEMANTIC(VUPKLSH); + +void SEMANTIC(VXOR)(v128 &d, v128 a, v128 b) { d = gv_xorfs(a, b); } +void DECODER(VXOR) { + VXOR(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VXOR); + +void SEMANTIC(TDI)(s64 ra, u8 bo, s16 simm16) { + if ((bo & 0x10) && ra < (s64)simm16) { + rpcsx_trap(); + } + + if ((bo & 0x8) && ra > (s64)simm16) { + rpcsx_trap(); + } + + if ((bo & 0x4) && ra == (s64)simm16) { + rpcsx_trap(); + } + + if ((bo & 0x2) && (u64)ra < (u64)simm16) { + rpcsx_trap(); + } + + if ((bo & 0x1) && (u64)ra > (u64)simm16) { + rpcsx_trap(); + } +} +void DECODER(TDI) { TDI(context.gpr[inst.ra], inst.bo, 
inst.simm16); } +EXPORT_SEMANTIC(TDI); + +void SEMANTIC(TWI)(s32 ra, u8 bo, s16 simm16) { + if ((bo & 0x10) && ra < (s32)simm16) { + rpcsx_trap(); + } + + if ((bo & 0x8) && ra > (s32)simm16) { + rpcsx_trap(); + } + + if ((bo & 0x4) && ra == (s32)simm16) { + rpcsx_trap(); + } + + if ((bo & 0x2) && (u32)ra < (u32)simm16) { + rpcsx_trap(); + } + + if ((bo & 0x1) && (u32)ra > (u32)simm16) { + rpcsx_trap(); + } +} + +void DECODER(TWI) { TWI(context.gpr[inst.ra], inst.bo, inst.simm16); } + +EXPORT_SEMANTIC(TWI); + +void SEMANTIC(MULLI)(PPUContext &context, Instruction inst) { + context.gpr[inst.rd] = static_cast(context.gpr[inst.ra]) * inst.simm16; +} +void DECODER(MULLI) { MULLI(context, inst); } +EXPORT_SEMANTIC(MULLI); + +void SEMANTIC(SUBFIC)(PPUContext &context, Instruction inst) { + const u64 a = context.gpr[inst.ra]; + const s64 i = inst.simm16; + const auto r = add64_flags(~a, i, 1); + context.gpr[inst.rd] = r.result; + context.xer_ca = r.carry; +} +void DECODER(SUBFIC) { SUBFIC(context, inst); } +EXPORT_SEMANTIC(SUBFIC); + +void SEMANTIC(CMPLI)(PPUContext &context, Instruction inst) { + if (inst.l10) { + context.cr.fields[inst.crfd].update(context.gpr[inst.ra], inst.uimm16, + context.xer_so); + } else { + context.cr.fields[inst.crfd].update( + static_cast(context.gpr[inst.ra]), inst.uimm16, context.xer_so); + } +} +void DECODER(CMPLI) { CMPLI(context, inst); } +EXPORT_SEMANTIC(CMPLI); + +void SEMANTIC(CMPI)(PPUContext &context, Instruction inst) { + if (inst.l10) { + context.cr.fields[inst.crfd].update(context.gpr[inst.ra], inst.simm16, + context.xer_so); + } else { + context.cr.fields[inst.crfd].update( + static_cast(context.gpr[inst.ra]), inst.simm16, context.xer_so); + } +} +void DECODER(CMPI) { CMPI(context, inst); } +EXPORT_SEMANTIC(CMPI); + +void SEMANTIC(ADDIC)(PPUContext &context, Instruction inst) { + const s64 a = context.gpr[inst.ra]; + const s64 i = inst.simm16; + const auto r = add64_flags(a, i); + context.gpr[inst.rd] = r.result; + context.xer_ca = r.carry; + if (inst.main & 1) [[unlikely]] + context.cr.fields[0].update(r.result, 0, context.xer_so); +} +void DECODER(ADDIC) { ADDIC(context, inst); } +EXPORT_SEMANTIC(ADDIC); + +void SEMANTIC(ADDI)(PPUContext &context, Instruction inst) { + context.gpr[inst.rd] = + inst.ra ? context.gpr[inst.ra] + inst.simm16 : inst.simm16; +} +void DECODER(ADDI) { ADDI(context, inst); } +EXPORT_SEMANTIC(ADDI); + +void SEMANTIC(ADDIS)(PPUContext &context, Instruction inst) { + context.gpr[inst.rd] = inst.ra ? context.gpr[inst.ra] + (inst.simm16 * 65536) + : (inst.simm16 * 65536); +} +void DECODER(ADDIS) { ADDIS(context, inst); } +EXPORT_SEMANTIC(ADDIS); + +void SEMANTIC(BC)(std::uint32_t &cia, std::uint64_t &lr, std::uint64_t &ctr, + u8 bo, u8 crBit, bool lk, std::uint32_t target) { + bool bo0 = (bo & 0x10) != 0; + bool bo1 = (bo & 0x08) != 0; + bool bo2 = (bo & 0x04) != 0; + bool bo3 = (bo & 0x02) != 0; + + ctr -= (bo2 ^ true); + + bool ctr_ok = bo2 | ((ctr != 0) ^ bo3); + bool cond_ok = bo0 | (!!crBit ^ (bo1 ^ true)); + + u32 nextInst = cia + 4; + if (lk) { + lr = nextInst; + } + + if (ctr_ok && cond_ok) { + cia = target; + } else { + cia = nextInst; + } +} +void DECODER(BC) { + BC(context.cia, context.lr, context.ctr, inst.bo, context.cr[inst.bi], + inst.lk, (inst.aa ? 
0 : context.cia) + inst.bt14); +} +EXPORT_SEMANTIC(BC); + +void SEMANTIC(SC)(PPUContext &context, std::uint64_t sysId) { + ppu_execute_syscall(context, sysId); +} +void DECODER(SC) { SC(context, context.gpr[11]); } +EXPORT_SEMANTIC(SC); + +void SEMANTIC(B)(std::uint32_t &cia, std::uint64_t &lr, bool lk, + std::uint32_t target) { + u32 nextInst = cia + 4; + if (lk) { + lr = nextInst; + } + + cia = target; +} +void DECODER(B) { + B(context.cia, context.lr, inst.lk, (inst.aa ? 0 : context.cia) + inst.bt24); +} +EXPORT_SEMANTIC(B); + +void SEMANTIC(MCRF)(PPUContext &context, Instruction inst) { + context.cr.fields[inst.crfd] = context.cr.fields[inst.crfs]; +} +void DECODER(MCRF) { MCRF(context, inst); } +EXPORT_SEMANTIC(MCRF); + +void SEMANTIC(BCLR)(std::uint32_t &cia, std::uint64_t &lr, u64 &ctr, u8 bo, + u8 crBit, bool lk) { + bool bo0 = (bo & 0x10) != 0; + bool bo1 = (bo & 0x08) != 0; + bool bo2 = (bo & 0x04) != 0; + bool bo3 = (bo & 0x02) != 0; + + ctr -= (bo2 ^ true); + + bool ctr_ok = bo2 | ((ctr != 0) ^ bo3); + bool cond_ok = bo0 | (!!crBit ^ (bo1 ^ true)); + + u32 target = static_cast(lr) & ~3; + u32 nextInst = cia + 4; + if (lk) { + lr = nextInst; + } + + if (ctr_ok && cond_ok) { + cia = target; + } else { + cia = nextInst; + } +} +void DECODER(BCLR) { + BCLR(context.cia, context.lr, context.ctr, inst.bo, context.cr[inst.bi], + inst.lk); +} +EXPORT_SEMANTIC(BCLR); + +void SEMANTIC(CRNOR)(PPUContext &context, Instruction inst) { + context.cr[inst.crbd] = + (context.cr[inst.crba] | context.cr[inst.crbb]) ^ true; +} +void DECODER(CRNOR) { CRNOR(context, inst); } +EXPORT_SEMANTIC(CRNOR); + +void SEMANTIC(CRANDC)(PPUContext &context, Instruction inst) { + context.cr[inst.crbd] = + context.cr[inst.crba] & (context.cr[inst.crbb] ^ true); +} +void DECODER(CRANDC) { CRANDC(context, inst); } +EXPORT_SEMANTIC(CRANDC); + +void SEMANTIC(ISYNC)() { std::atomic_thread_fence(std::memory_order::acquire); } +void DECODER(ISYNC) { ISYNC(); } +EXPORT_SEMANTIC(ISYNC); + +void SEMANTIC(CRXOR)(PPUContext &context, Instruction inst) { + context.cr[inst.crbd] = context.cr[inst.crba] ^ context.cr[inst.crbb]; +} +void DECODER(CRXOR) { CRXOR(context, inst); } +EXPORT_SEMANTIC(CRXOR); + +void SEMANTIC(CRNAND)(PPUContext &context, Instruction inst) { + context.cr[inst.crbd] = + (context.cr[inst.crba] & context.cr[inst.crbb]) ^ true; +} +void DECODER(CRNAND) { CRNAND(context, inst); } +EXPORT_SEMANTIC(CRNAND); + +void SEMANTIC(CRAND)(PPUContext &context, Instruction inst) { + context.cr[inst.crbd] = context.cr[inst.crba] & context.cr[inst.crbb]; +} +void DECODER(CRAND) { CRAND(context, inst); } +EXPORT_SEMANTIC(CRAND); + +void SEMANTIC(CREQV)(PPUContext &context, Instruction inst) { + context.cr[inst.crbd] = + (context.cr[inst.crba] ^ context.cr[inst.crbb]) ^ true; +} +void DECODER(CREQV) { CREQV(context, inst); } +EXPORT_SEMANTIC(CREQV); + +void SEMANTIC(CRORC)(PPUContext &context, Instruction inst) { + context.cr[inst.crbd] = + context.cr[inst.crba] | (context.cr[inst.crbb] ^ true); +} +void DECODER(CRORC) { CRORC(context, inst); } +EXPORT_SEMANTIC(CRORC); + +void SEMANTIC(CROR)(PPUContext &context, Instruction inst) { + context.cr[inst.crbd] = context.cr[inst.crba] | context.cr[inst.crbb]; +} +void DECODER(CROR) { CROR(context, inst); } +EXPORT_SEMANTIC(CROR); + +void SEMANTIC(BCCTR)(std::uint32_t &cia, std::uint64_t &lr, std::uint64_t ctr, + u8 bo, u8 crBit, bool lk) { + u32 target = static_cast(ctr) & ~3; + u32 nextInst = cia + 4; + + if (lk) { + lr = nextInst; + } + + if (bo & 0x10 || crBit == ((bo & 0x8) != 
0)) { + cia = target; + } else { + cia = nextInst; + } +} +void DECODER(BCCTR) { + BCCTR(context.cia, context.lr, context.ctr, inst.bo, context.cr[inst.bi], + inst.lk); +} +EXPORT_SEMANTIC(BCCTR); + +void SEMANTIC(RLWIMI)(PPUContext &context, Instruction inst) { + const u64 mask = ppu_rotate_mask(32 + inst.mb32, 32 + inst.me32); + context.gpr[inst.ra] = + (context.gpr[inst.ra] & ~mask) | + (dup32(rol32(static_cast(context.gpr[inst.rs]), inst.sh32)) & mask); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(RLWIMI) { RLWIMI(context, inst); } +EXPORT_SEMANTIC(RLWIMI); + +void SEMANTIC(RLWINM)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = + dup32(rol32(static_cast(context.gpr[inst.rs]), inst.sh32)) & + ppu_rotate_mask(32 + inst.mb32, 32 + inst.me32); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(RLWINM) { RLWINM(context, inst); } +EXPORT_SEMANTIC(RLWINM); + +void SEMANTIC(RLWNM)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = dup32(rol32(static_cast(context.gpr[inst.rs]), + context.gpr[inst.rb] & 0x1f)) & + ppu_rotate_mask(32 + inst.mb32, 32 + inst.me32); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(RLWNM) { RLWNM(context, inst); } +EXPORT_SEMANTIC(RLWNM); + +void SEMANTIC(ORI)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = context.gpr[inst.rs] | inst.uimm16; +} +void DECODER(ORI) { ORI(context, inst); } +EXPORT_SEMANTIC(ORI); + +void SEMANTIC(ORIS)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = context.gpr[inst.rs] | (u64{inst.uimm16} << 16); +} +void DECODER(ORIS) { ORIS(context, inst); } +EXPORT_SEMANTIC(ORIS); + +void SEMANTIC(XORI)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = context.gpr[inst.rs] ^ inst.uimm16; +} +void DECODER(XORI) { XORI(context, inst); } +EXPORT_SEMANTIC(XORI); + +void SEMANTIC(XORIS)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = context.gpr[inst.rs] ^ (u64{inst.uimm16} << 16); +} +void DECODER(XORIS) { XORIS(context, inst); } +EXPORT_SEMANTIC(XORIS); + +void SEMANTIC(ANDI)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = context.gpr[inst.rs] & inst.uimm16; + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); +} +void DECODER(ANDI) { ANDI(context, inst); } +EXPORT_SEMANTIC(ANDI); + +void SEMANTIC(ANDIS)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = context.gpr[inst.rs] & (u64{inst.uimm16} << 16); + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); +} +void DECODER(ANDIS) { ANDIS(context, inst); } +EXPORT_SEMANTIC(ANDIS); + +void SEMANTIC(RLDICL)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = + rol64(context.gpr[inst.rs], inst.sh64) & (~0ull >> inst.mbe64); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(RLDICL) { RLDICL(context, inst); } +EXPORT_SEMANTIC(RLDICL); + +void SEMANTIC(RLDICR)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = + rol64(context.gpr[inst.rs], inst.sh64) & (~0ull << (inst.mbe64 ^ 63)); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(RLDICR) { RLDICR(context, inst); } +EXPORT_SEMANTIC(RLDICR); + +void SEMANTIC(RLDIC)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = rol64(context.gpr[inst.rs], 
inst.sh64) & + ppu_rotate_mask(inst.mbe64, inst.sh64 ^ 63); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(RLDIC) { RLDIC(context, inst); } +EXPORT_SEMANTIC(RLDIC); + +void SEMANTIC(RLDIMI)(PPUContext &context, Instruction inst) { + const u64 mask = ppu_rotate_mask(inst.mbe64, inst.sh64 ^ 63); + context.gpr[inst.ra] = (context.gpr[inst.ra] & ~mask) | + (rol64(context.gpr[inst.rs], inst.sh64) & mask); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(RLDIMI) { RLDIMI(context, inst); } +EXPORT_SEMANTIC(RLDIMI); + +void SEMANTIC(RLDCL)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = + rol64(context.gpr[inst.rs], context.gpr[inst.rb] & 0x3f) & + (~0ull >> inst.mbe64); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(RLDCL) { RLDCL(context, inst); } +EXPORT_SEMANTIC(RLDCL); + +void SEMANTIC(RLDCR)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = + rol64(context.gpr[inst.rs], context.gpr[inst.rb] & 0x3f) & + (~0ull << (inst.mbe64 ^ 63)); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(RLDCR) { RLDCR(context, inst); } +EXPORT_SEMANTIC(RLDCR); + +void SEMANTIC(CMP)(PPUContext &context, Instruction inst) { + if (inst.l10) { + context.cr.fields[inst.crfd].update( + context.gpr[inst.ra], context.gpr[inst.rb], context.xer_so); + } else { + context.cr.fields[inst.crfd].update( + context.gpr[inst.ra], static_cast(context.gpr[inst.rb]), + static_cast(context.xer_so)); + } +} +void DECODER(CMP) { CMP(context, inst); } +EXPORT_SEMANTIC(CMP); + +void SEMANTIC(TW)(s32 ra, u8 bo, s32 rb) { + if ((bo & 0x10) && ra < rb) { + rpcsx_trap(); + } + + if ((bo & 0x8) && ra > rb) { + rpcsx_trap(); + } + + if ((bo & 0x4) && ra == rb) { + rpcsx_trap(); + } + + if ((bo & 0x2) && (u32)ra < (u32)rb) { + rpcsx_trap(); + } + + if ((bo & 0x1) && (u32)ra > (u32)rb) { + rpcsx_trap(); + } +} + +void DECODER(TW) { TW(context.gpr[inst.ra], inst.bo, context.gpr[inst.rb]); } + +EXPORT_SEMANTIC(TW); + +static const v128 s_lvsl_base = + v128::from64r(0x0001020304050607, 0x08090a0b0c0d0e0f); + +static const v128 s_lvsl_consts[16] = { + gv_add8(s_lvsl_base, gv_bcst8(0)), gv_add8(s_lvsl_base, gv_bcst8(1)), + gv_add8(s_lvsl_base, gv_bcst8(2)), gv_add8(s_lvsl_base, gv_bcst8(3)), + gv_add8(s_lvsl_base, gv_bcst8(4)), gv_add8(s_lvsl_base, gv_bcst8(5)), + gv_add8(s_lvsl_base, gv_bcst8(6)), gv_add8(s_lvsl_base, gv_bcst8(7)), + gv_add8(s_lvsl_base, gv_bcst8(8)), gv_add8(s_lvsl_base, gv_bcst8(9)), + gv_add8(s_lvsl_base, gv_bcst8(10)), gv_add8(s_lvsl_base, gv_bcst8(11)), + gv_add8(s_lvsl_base, gv_bcst8(12)), gv_add8(s_lvsl_base, gv_bcst8(13)), + gv_add8(s_lvsl_base, gv_bcst8(14)), gv_add8(s_lvsl_base, gv_bcst8(15)), +}; + +void SEMANTIC(LVSL)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.vr[inst.vd] = s_lvsl_consts[addr % 16]; +} +void DECODER(LVSL) { LVSL(context, inst); } +EXPORT_SEMANTIC(LVSL); + +void SEMANTIC(LVEBX)(PPUContext &context, Instruction inst) { + const u64 addr = (inst.ra ? 
context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]) & + ~0xfull; + context.vr[inst.vd] = ppu_feed_data(context, addr); +} +void DECODER(LVEBX) { LVEBX(context, inst); } +EXPORT_SEMANTIC(LVEBX); + +void SEMANTIC(SUBFC)(PPUContext &context, Instruction inst) { + const u64 RA = context.gpr[inst.ra]; + const u64 RB = context.gpr[inst.rb]; + const auto r = add64_flags(~RA, RB, 1); + context.gpr[inst.rd] = r.result; + context.xer_ca = r.carry; + + if (inst.oe) { + context.setOV((~RA >> 63 == RB >> 63) && + (~RA >> 63 != context.gpr[inst.rd] >> 63)); + } + + if (inst.rc) { + context.cr.fields[0].update(r.result, 0, context.xer_so); + } +} +void DECODER(SUBFC) { SUBFC(context, inst); } +EXPORT_SEMANTIC(SUBFC); + +void SEMANTIC(MULHDU)(PPUContext &context, Instruction inst) { + context.gpr[inst.rd] = umulh64(context.gpr[inst.ra], context.gpr[inst.rb]); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.rd], 0, context.xer_so); + } +} +void DECODER(MULHDU) { MULHDU(context, inst); } +EXPORT_SEMANTIC(MULHDU); + +void SEMANTIC(ADDC)(PPUContext &context, Instruction inst) { + const u64 RA = context.gpr[inst.ra]; + const u64 RB = context.gpr[inst.rb]; + const auto r = add64_flags(RA, RB); + context.gpr[inst.rd] = r.result; + context.xer_ca = r.carry; + + if (inst.oe) { + context.setOV((RA >> 63 == RB >> 63) && + (RA >> 63 != context.gpr[inst.rd] >> 63)); + } + + if (inst.rc) { + context.cr.fields[0].update(r.result, 0, context.xer_so); + } +} +void DECODER(ADDC) { ADDC(context, inst); } +EXPORT_SEMANTIC(ADDC); + +void SEMANTIC(MULHWU)(PPUContext &context, Instruction inst) { + u32 a = static_cast(context.gpr[inst.ra]); + u32 b = static_cast(context.gpr[inst.rb]); + context.gpr[inst.rd] = (u64{a} * b) >> 32; + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.rd], 0, context.xer_so); + } +} +void DECODER(MULHWU) { MULHWU(context, inst); } +EXPORT_SEMANTIC(MULHWU); + +void SEMANTIC(MFCR)(PPUContext &context, std::uint64_t &d) { +#if defined(ARCH_X64) + be_t lane0, lane1; + std::memcpy(&lane0, context.cr.fields, sizeof(v128)); + std::memcpy(&lane1, context.cr.fields + 4, sizeof(v128)); + const u32 mh = _mm_movemask_epi8(_mm_slli_epi64(lane0.value(), 7)); + const u32 ml = _mm_movemask_epi8(_mm_slli_epi64(lane1.value(), 7)); + + d = (mh << 16) | ml; +#else + d = context.cr.pack(); +#endif +} +void DECODER(MFCR) { MFCR(context, context.gpr[inst.rd]); } +EXPORT_SEMANTIC(MFCR); + +void SEMANTIC(MFOCRF)(u64 &d, u32 crIndex, CrField &cr) { + const u32 v = + cr.bits[0] << 3 | cr.bits[1] << 2 | cr.bits[2] << 1 | cr.bits[3] << 0; + + d = v << ((crIndex * 4) ^ 0x1c); +} +void DECODER(MFOCRF) { + if (inst.l11) { + auto crIndex = std::countl_zero(inst.crm) & 7; + MFOCRF(context.gpr[inst.rd], crIndex, context.cr.fields[crIndex]); + } else { + MFCR(context, context.gpr[inst.rd]); + } +} +EXPORT_SEMANTIC(MFOCRF); + +void SEMANTIC(LWARX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_lwarx(context, vm::cast(addr)); +} +void DECODER(LWARX) { LWARX(context, inst); } +EXPORT_SEMANTIC(LWARX); + +void SEMANTIC(LDX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_feed_data(context, addr); +} +void DECODER(LDX) { LDX(context, inst); } +EXPORT_SEMANTIC(LDX); + +void SEMANTIC(LWZX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? 
context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_feed_data(context, addr); +} +void DECODER(LWZX) { LWZX(context, inst); } +EXPORT_SEMANTIC(LWZX); + +void SEMANTIC(SLW)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = + static_cast(context.gpr[inst.rs] << (context.gpr[inst.rb] & 0x3f)); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(SLW) { SLW(context, inst); } +EXPORT_SEMANTIC(SLW); + +void SEMANTIC(CNTLZW)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = + std::countl_zero(static_cast(context.gpr[inst.rs])); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(CNTLZW) { CNTLZW(context, inst); } +EXPORT_SEMANTIC(CNTLZW); + +void SEMANTIC(SLD)(PPUContext &context, Instruction inst) { + const u32 n = context.gpr[inst.rb] & 0x7f; + context.gpr[inst.ra] = n & 0x40 ? 0 : context.gpr[inst.rs] << n; + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(SLD) { SLD(context, inst); } +EXPORT_SEMANTIC(SLD); + +void SEMANTIC(AND)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = context.gpr[inst.rs] & context.gpr[inst.rb]; + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(AND) { AND(context, inst); } +EXPORT_SEMANTIC(AND); + +void SEMANTIC(CMPL)(PPUContext &context, Instruction inst) { + if (inst.l10) { + context.cr.fields[inst.crfd].update( + context.gpr[inst.ra], context.gpr[inst.rb], context.xer_so); + } else { + context.cr.fields[inst.crfd].update( + static_cast(context.gpr[inst.ra]), + static_cast(context.gpr[inst.rb]), context.xer_so); + } +} +void DECODER(CMPL) { CMPL(context, inst); } +EXPORT_SEMANTIC(CMPL); + +static const v128 s_lvsr_consts[16] = { + gv_add8(s_lvsl_base, gv_bcst8(16)), gv_add8(s_lvsl_base, gv_bcst8(15)), + gv_add8(s_lvsl_base, gv_bcst8(14)), gv_add8(s_lvsl_base, gv_bcst8(13)), + gv_add8(s_lvsl_base, gv_bcst8(12)), gv_add8(s_lvsl_base, gv_bcst8(11)), + gv_add8(s_lvsl_base, gv_bcst8(10)), gv_add8(s_lvsl_base, gv_bcst8(9)), + gv_add8(s_lvsl_base, gv_bcst8(8)), gv_add8(s_lvsl_base, gv_bcst8(7)), + gv_add8(s_lvsl_base, gv_bcst8(6)), gv_add8(s_lvsl_base, gv_bcst8(5)), + gv_add8(s_lvsl_base, gv_bcst8(4)), gv_add8(s_lvsl_base, gv_bcst8(3)), + gv_add8(s_lvsl_base, gv_bcst8(2)), gv_add8(s_lvsl_base, gv_bcst8(1)), +}; + +void SEMANTIC(LVSR)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.vr[inst.vd] = s_lvsr_consts[addr % 16]; +} +void DECODER(LVSR) { LVSR(context, inst); } +EXPORT_SEMANTIC(LVSR); + +void SEMANTIC(LVEHX)(PPUContext &context, Instruction inst) { + const u64 addr = (inst.ra ? 
context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]) & + ~0xfull; + context.vr[inst.vd] = ppu_feed_data(context, addr); +} +void DECODER(LVEHX) { LVEHX(context, inst); } +EXPORT_SEMANTIC(LVEHX); + +void SEMANTIC(SUBF)(PPUContext &context, Instruction inst) { + const u64 RA = context.gpr[inst.ra]; + const u64 RB = context.gpr[inst.rb]; + context.gpr[inst.rd] = RB - RA; + + if (inst.oe) { + context.setOV((~RA >> 63 == RB >> 63) && + (~RA >> 63 != context.gpr[inst.rd] >> 63)); + } + + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.rd], 0, context.xer_so); + } +} +void DECODER(SUBF) { SUBF(context, inst); } +EXPORT_SEMANTIC(SUBF); + +void SEMANTIC(LDUX)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_feed_data(context, addr); + context.gpr[inst.ra] = addr; +} +void DECODER(LDUX) { LDUX(context, inst); } +EXPORT_SEMANTIC(LDUX); + +void SEMANTIC(DCBST)() {} +void DECODER(DCBST) { DCBST(); } +EXPORT_SEMANTIC(DCBST); + +void SEMANTIC(LWZUX)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_feed_data(context, addr); + context.gpr[inst.ra] = addr; +} +void DECODER(LWZUX) { LWZUX(context, inst); } +EXPORT_SEMANTIC(LWZUX); + +void SEMANTIC(CNTLZD)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = std::countl_zero(context.gpr[inst.rs]); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(CNTLZD) { CNTLZD(context, inst); } +EXPORT_SEMANTIC(CNTLZD); + +void SEMANTIC(ANDC)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = context.gpr[inst.rs] & ~context.gpr[inst.rb]; + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(ANDC) { ANDC(context, inst); } +EXPORT_SEMANTIC(ANDC); + +void SEMANTIC(TD)(s64 ra, u8 bo, s64 rb) { + if ((bo & 0x10) && ra < rb) { + rpcsx_trap(); + } + + if ((bo & 0x8) && ra > rb) { + rpcsx_trap(); + } + + if ((bo & 0x4) && ra == rb) { + rpcsx_trap(); + } + + if ((bo & 0x2) && (u64)ra < (u64)rb) { + rpcsx_trap(); + } + + if ((bo & 0x1) && (u64)ra > (u64)rb) { + rpcsx_trap(); + } +} +void DECODER(TD) { TD(context.gpr[inst.ra], inst.bo, context.gpr[inst.rb]); } +EXPORT_SEMANTIC(TD); + +void SEMANTIC(LVEWX)(PPUContext &context, Instruction inst) { + const u64 addr = (inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]) & + ~0xfull; + context.vr[inst.vd] = ppu_feed_data(context, addr); +} +void DECODER(LVEWX) { LVEWX(context, inst); } +EXPORT_SEMANTIC(LVEWX); + +void SEMANTIC(MULHD)(PPUContext &context, Instruction inst) { + context.gpr[inst.rd] = mulh64(context.gpr[inst.ra], context.gpr[inst.rb]); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.rd], 0, context.xer_so); + } +} +void DECODER(MULHD) { MULHD(context, inst); } +EXPORT_SEMANTIC(MULHD); + +void SEMANTIC(MULHW)(PPUContext &context, Instruction inst) { + s32 a = static_cast(context.gpr[inst.ra]); + s32 b = static_cast(context.gpr[inst.rb]); + context.gpr[inst.rd] = (s64{a} * b) >> 32; + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.rd], 0, context.xer_so); + } +} +void DECODER(MULHW) { MULHW(context, inst); } +EXPORT_SEMANTIC(MULHW); + +void SEMANTIC(LDARX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? 
context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_ldarx(context, vm::cast(addr)); +} +void DECODER(LDARX) { LDARX(context, inst); } +EXPORT_SEMANTIC(LDARX); + +void SEMANTIC(DCBF)() {} +void DECODER(DCBF) { DCBF(); } +EXPORT_SEMANTIC(DCBF); + +void SEMANTIC(LBZX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_feed_data(context, addr); +} +void DECODER(LBZX) { LBZX(context, inst); } +EXPORT_SEMANTIC(LBZX); + +void SEMANTIC(LVX)(PPUContext &context, Instruction inst) { + const u64 addr = (inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]) & + ~0xfull; + context.vr[inst.vd] = ppu_feed_data(context, addr); +} +void DECODER(LVX) { LVX(context, inst); } +EXPORT_SEMANTIC(LVX); + +void SEMANTIC(NEG)(PPUContext &context, Instruction inst) { + const u64 RA = context.gpr[inst.ra]; + context.gpr[inst.rd] = 0 - RA; + + if (inst.oe) { + // FIXME: verify + context.setOV(RA == (1ull << 63)); + } + + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.rd], 0, context.xer_so); + } +} +void DECODER(NEG) { NEG(context, inst); } +EXPORT_SEMANTIC(NEG); + +void SEMANTIC(LBZUX)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_feed_data(context, addr); + context.gpr[inst.ra] = addr; +} +void DECODER(LBZUX) { LBZUX(context, inst); } +EXPORT_SEMANTIC(LBZUX); + +void SEMANTIC(NOR)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = ~(context.gpr[inst.rs] | context.gpr[inst.rb]); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(NOR) { NOR(context, inst); } +EXPORT_SEMANTIC(NOR); + +void SEMANTIC(STVEBX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? 
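+  // STVEBX (and STVEHX/STVEWX further down) store a single element of the
+  // source vector. Element numbering follows the PPU's big-endian layout, so
+  // the byte offset within the quadword (eb) indexes the vector from the
+  // opposite end: 15 - eb for bytes, 7 - eb for halfwords, 3 - eb for words.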
context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + const u8 eb = addr & 0xf; + vm::write(vm::cast(addr), context.vr[inst.vs]._u8[15 - eb]); +} +void DECODER(STVEBX) { STVEBX(context, inst); } +EXPORT_SEMANTIC(STVEBX); + +void SEMANTIC(SUBFE)(PPUContext &context, Instruction inst) { + const u64 RA = context.gpr[inst.ra]; + const u64 RB = context.gpr[inst.rb]; + const auto r = add64_flags(~RA, RB, context.xer_ca); + context.gpr[inst.rd] = r.result; + context.xer_ca = r.carry; + + if (inst.oe) { + context.setOV((~RA >> 63 == RB >> 63) && + (~RA >> 63 != context.gpr[inst.rd] >> 63)); + } + + if (inst.rc) { + context.cr.fields[0].update(r.result, 0, context.xer_so); + } +} +void DECODER(SUBFE) { SUBFE(context, inst); } +EXPORT_SEMANTIC(SUBFE); + +void SEMANTIC(ADDE)(PPUContext &context, Instruction inst) { + const u64 RA = context.gpr[inst.ra]; + const u64 RB = context.gpr[inst.rb]; + const auto r = add64_flags(RA, RB, context.xer_ca); + context.gpr[inst.rd] = r.result; + context.xer_ca = r.carry; + + if (inst.oe) { + context.setOV((RA >> 63 == RB >> 63) && + (RA >> 63 != context.gpr[inst.rd] >> 63)); + } + + if (inst.rc) { + context.cr.fields[0].update(r.result, 0, context.xer_so); + } +} +void DECODER(ADDE) { ADDE(context, inst); } +EXPORT_SEMANTIC(ADDE); + +void SEMANTIC(MTOCRF)(PPUContext &context, Instruction inst) { + static constexpr CrField s_table[16]{ + CrField::From(false, false, false, false), + CrField::From(false, false, false, true), + CrField::From(false, false, true, false), + CrField::From(false, false, true, true), + CrField::From(false, true, false, false), + CrField::From(false, true, false, true), + CrField::From(false, true, true, false), + CrField::From(false, true, true, true), + CrField::From(true, false, false, false), + CrField::From(true, false, false, true), + CrField::From(true, false, true, false), + CrField::From(true, false, true, true), + CrField::From(true, true, false, false), + CrField::From(true, true, false, true), + CrField::From(true, true, true, false), + CrField::From(true, true, true, true), + }; + + const u64 s = context.gpr[inst.rs]; + + if (inst.l11) { + // MTOCRF + + const u32 n = std::countl_zero(inst.crm) & 7; + const u64 v = (s >> ((n * 4) ^ 0x1c)) & 0xf; + context.cr.fields[n] = s_table[v]; + } else { + // MTCRF + + for (u32 i = 0; i < 8; i++) { + if (inst.crm & (128 >> i)) { + const u64 v = (s >> ((i * 4) ^ 0x1c)) & 0xf; + context.cr.fields[i] = s_table[v]; + } + } + } +} +void DECODER(MTOCRF) { MTOCRF(context, inst); } +EXPORT_SEMANTIC(MTOCRF); + +void SEMANTIC(STDX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + vm::write(vm::cast(addr), context.gpr[inst.rs]); +} +void DECODER(STDX) { STDX(context, inst); } +EXPORT_SEMANTIC(STDX); + +void SEMANTIC(STWCX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.cr.fields[0].set(false, false, + ppu_stwcx(context, vm::cast(addr), + static_cast(context.gpr[inst.rs])), + context.xer_so); +} +void DECODER(STWCX) { STWCX(context, inst); } +EXPORT_SEMANTIC(STWCX); + +void SEMANTIC(STWX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? 
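+  // In MTOCRF/MTCRF above, CR field 0 is the most significant nibble of the
+  // 32-bit CR image, so field n sits at bit offset 28 - 4*n. For n in 0..7
+  // that equals (n * 4) ^ 0x1c, which is why the shift amount is written
+  // with the XOR; e.g. n = 1 gives 4 ^ 0x1c = 24.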
context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + vm::write(vm::cast(addr), static_cast(context.gpr[inst.rs])); +} +void DECODER(STWX) { STWX(context, inst); } +EXPORT_SEMANTIC(STWX); + +void SEMANTIC(STVEHX)(PPUContext &context, Instruction inst) { + const u64 addr = (inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]) & + ~1ULL; + const u8 eb = (addr & 0xf) >> 1; + vm::write(vm::cast(addr), context.vr[inst.vs]._u16[7 - eb]); +} +void DECODER(STVEHX) { STVEHX(context, inst); } +EXPORT_SEMANTIC(STVEHX); + +void SEMANTIC(STDUX)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + context.gpr[inst.rb]; + vm::write(vm::cast(addr), context.gpr[inst.rs]); + context.gpr[inst.ra] = addr; +} +void DECODER(STDUX) { STDUX(context, inst); } +EXPORT_SEMANTIC(STDUX); + +void SEMANTIC(STWUX)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + context.gpr[inst.rb]; + vm::write(vm::cast(addr), static_cast(context.gpr[inst.rs])); + context.gpr[inst.ra] = addr; +} +void DECODER(STWUX) { STWUX(context, inst); } +EXPORT_SEMANTIC(STWUX); + +void SEMANTIC(STVEWX)(PPUContext &context, Instruction inst) { + const u64 addr = (inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]) & + ~3ULL; + const u8 eb = (addr & 0xf) >> 2; + vm::write(vm::cast(addr), context.vr[inst.vs]._u32[3 - eb]); +} +void DECODER(STVEWX) { STVEWX(context, inst); } +EXPORT_SEMANTIC(STVEWX); + +void SEMANTIC(SUBFZE)(PPUContext &context, Instruction inst) { + const u64 RA = context.gpr[inst.ra]; + const auto r = add64_flags(~RA, 0, context.xer_ca); + context.gpr[inst.rd] = r.result; + context.xer_ca = r.carry; + + if (inst.oe) { + context.setOV((~RA >> 63 == 0) && (~RA >> 63 != r.result >> 63)); + } + + if (inst.rc) { + context.cr.fields[0].update(r.result, 0, context.xer_so); + } +} +void DECODER(SUBFZE) { SUBFZE(context, inst); } +EXPORT_SEMANTIC(SUBFZE); + +void SEMANTIC(ADDZE)(PPUContext &context, Instruction inst) { + const u64 RA = context.gpr[inst.ra]; + const auto r = add64_flags(RA, 0, context.xer_ca); + context.gpr[inst.rd] = r.result; + context.xer_ca = r.carry; + + if (inst.oe) { + context.setOV((RA >> 63 == 0) && (RA >> 63 != r.result >> 63)); + } + + if (inst.rc) { + context.cr.fields[0].update(r.result, 0, context.xer_so); + } +} +void DECODER(ADDZE) { ADDZE(context, inst); } +EXPORT_SEMANTIC(ADDZE); + +void SEMANTIC(STDCX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.cr.fields[0].set( + false, false, ppu_stdcx(context, vm::cast(addr), context.gpr[inst.rs]), + context.xer_so); +} +void DECODER(STDCX) { STDCX(context, inst); } +EXPORT_SEMANTIC(STDCX); + +void SEMANTIC(STBX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + vm::write(vm::cast(addr), static_cast(context.gpr[inst.rs])); +} +void DECODER(STBX) { STBX(context, inst); } +EXPORT_SEMANTIC(STBX); + +void SEMANTIC(STVX)(PPUContext &context, Instruction inst) { + const u64 addr = (inst.ra ? 
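+  // STVX moves a full quadword, so its EA is masked to a 16-byte boundary.
+  // The extended-arithmetic forms above (ADDE/SUBFE, ADDZE/SUBFZE) all go
+  // through add64_flags so that XER.CA is produced once and then consumed as
+  // the carry-in of the next instruction in a multi-word arithmetic chain.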
context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]) & + ~0xfull; + vm::write(vm::cast(addr), context.vr[inst.vs]); +} +void DECODER(STVX) { STVX(context, inst); } +EXPORT_SEMANTIC(STVX); + +void SEMANTIC(MULLD)(PPUContext &context, Instruction inst) { + const s64 RA = context.gpr[inst.ra]; + const s64 RB = context.gpr[inst.rb]; + context.gpr[inst.rd] = RA * RB; + if (inst.oe) { + const s64 high = mulh64(RA, RB); + // FIXME: verify + context.setOV(high != s64(context.gpr[inst.rd]) >> 63); + } + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.rd], 0, context.xer_so); + } +} +void DECODER(MULLD) { MULLD(context, inst); } +EXPORT_SEMANTIC(MULLD); + +void SEMANTIC(SUBFME)(PPUContext &context, Instruction inst) { + const u64 RA = context.gpr[inst.ra]; + const auto r = add64_flags(~RA, ~0ull, context.xer_ca); + context.gpr[inst.rd] = r.result; + context.xer_ca = r.carry; + if (inst.oe) { + context.setOV((~RA >> 63 == 1) && (~RA >> 63 != r.result >> 63)); + } + if (inst.rc) { + context.cr.fields[0].update(r.result, 0, context.xer_so); + } +} +void DECODER(SUBFME) { SUBFME(context, inst); } +EXPORT_SEMANTIC(SUBFME); + +void SEMANTIC(ADDME)(PPUContext &context, Instruction inst) { + const s64 RA = context.gpr[inst.ra]; + const auto r = add64_flags(RA, ~0ull, context.xer_ca); + context.gpr[inst.rd] = r.result; + context.xer_ca = r.carry; + if (inst.oe) { + context.setOV((u64(RA) >> 63 == 1) && (u64(RA) >> 63 != r.result >> 63)); + } + if (inst.rc) { + context.cr.fields[0].update(r.result, 0, context.xer_so); + } +} +void DECODER(ADDME) { ADDME(context, inst); } +EXPORT_SEMANTIC(ADDME); + +void SEMANTIC(MULLW)(PPUContext &context, Instruction inst) { + context.gpr[inst.rd] = s64{static_cast(context.gpr[inst.ra])} * + static_cast(context.gpr[inst.rb]); + + if (inst.oe) { + context.setOV(s64(context.gpr[inst.rd]) < INT32_MIN || + s64(context.gpr[inst.rd]) > INT32_MAX); + } + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.rd], 0, context.xer_so); + } +} +void DECODER(MULLW) { MULLW(context, inst); } +EXPORT_SEMANTIC(MULLW); + +void SEMANTIC(DCBTST)() {} +void DECODER(DCBTST) { DCBTST(); } +EXPORT_SEMANTIC(DCBTST); + +void SEMANTIC(STBUX)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + context.gpr[inst.rb]; + vm::write(vm::cast(addr), static_cast(context.gpr[inst.rs])); + context.gpr[inst.ra] = addr; +} +void DECODER(STBUX) { STBUX(context, inst); } +EXPORT_SEMANTIC(STBUX); + +void SEMANTIC(ADD)(PPUContext &context, Instruction inst) { + const u64 RA = context.gpr[inst.ra]; + const u64 RB = context.gpr[inst.rb]; + context.gpr[inst.rd] = RA + RB; + + if (inst.oe) { + context.setOV((RA >> 63 == RB >> 63) && + (RA >> 63 != context.gpr[inst.rd] >> 63)); + } + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.rd], 0, context.xer_so); + } +} +void DECODER(ADD) { ADD(context, inst); } +EXPORT_SEMANTIC(ADD); + +void SEMANTIC(DCBT)() {} +void DECODER(DCBT) { DCBT(); } +EXPORT_SEMANTIC(DCBT); + +void SEMANTIC(LHZX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? 
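+  // Overflow detection in the multiplies above: MULLW forms the full signed
+  // 64-bit product and flags OV when it does not fit in 32 bits, while MULLD
+  // compares the high half of the 128-bit product (mulh64) against the sign
+  // extension of the low half.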
context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_feed_data(context, addr); +} +void DECODER(LHZX) { LHZX(context, inst); } +EXPORT_SEMANTIC(LHZX); + +void SEMANTIC(EQV)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = ~(context.gpr[inst.rs] ^ context.gpr[inst.rb]); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(EQV) { EQV(context, inst); } +EXPORT_SEMANTIC(EQV); + +void SEMANTIC(ECIWX)() { rpcsx_unimplemented_instruction(); } +void DECODER(ECIWX) { ECIWX(); } +EXPORT_SEMANTIC(ECIWX); + +void SEMANTIC(LHZUX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_feed_data(context, addr); + context.gpr[inst.ra] = addr; +} +void DECODER(LHZUX) { LHZUX(context, inst); } +EXPORT_SEMANTIC(LHZUX); + +void SEMANTIC(XOR)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = context.gpr[inst.rs] ^ context.gpr[inst.rb]; + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(XOR) { XOR(context, inst); } +EXPORT_SEMANTIC(XOR); + +void SEMANTIC(MFSPR)(PPUContext &context, Instruction inst) { + const u32 n = (inst.spr >> 5) | ((inst.spr & 0x1f) << 5); + + switch (n) { + case 0x001: + context.gpr[inst.rd] = u32{context.xer_so} << 31 | context.xer_ov << 30 | + context.xer_ca << 29 | context.xer_cnt; + break; + case 0x008: + context.gpr[inst.rd] = context.lr; + break; + case 0x009: + context.gpr[inst.rd] = context.ctr; + break; + case 0x100: + context.gpr[inst.rd] = context.vrsave; + break; + + case 0x10C: + context.gpr[inst.rd] = rpcsx_get_tb(); + break; + case 0x10D: + context.gpr[inst.rd] = rpcsx_get_tb() >> 32; + break; + default: + rpcsx_invalid_instruction(); + } +} +void DECODER(MFSPR) { MFSPR(context, inst); } +EXPORT_SEMANTIC(MFSPR); + +void SEMANTIC(LWAX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_feed_data(context, addr); +} +void DECODER(LWAX) { LWAX(context, inst); } +EXPORT_SEMANTIC(LWAX); + +void SEMANTIC(DST)() {} +void DECODER(DST) { DST(); } +EXPORT_SEMANTIC(DST); + +void SEMANTIC(LHAX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_feed_data(context, addr); +} +void DECODER(LHAX) { LHAX(context, inst); } +EXPORT_SEMANTIC(LHAX); + +void SEMANTIC(LVXL)(PPUContext &context, Instruction inst) { + const u64 addr = (inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]) & + ~0xfull; + context.vr[inst.vd] = ppu_feed_data(context, addr); +} +void DECODER(LVXL) { LVXL(context, inst); } +EXPORT_SEMANTIC(LVXL); + +void SEMANTIC(MFTB)(PPUContext &context, Instruction inst) { + const u32 n = (inst.spr >> 5) | ((inst.spr & 0x1f) << 5); + + switch (n) { + case 0x10C: + context.gpr[inst.rd] = rpcsx_get_tb(); + break; + case 0x10D: + context.gpr[inst.rd] = rpcsx_get_tb() >> 32; + break; + default: + rpcsx_invalid_instruction(); + } +} +void DECODER(MFTB) { MFTB(context, inst); } +EXPORT_SEMANTIC(MFTB); + +void SEMANTIC(LWAUX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? 
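+  // MFSPR/MFTB above un-swap the SPR field before the switch because the
+  // 10-bit SPR number is encoded with its two 5-bit halves exchanged:
+  // n = (spr >> 5) | ((spr & 0x1f) << 5). Only XER (0x001), LR (0x008),
+  // CTR (0x009), VRSAVE (0x100) and the time base halves (0x10C/0x10D) are
+  // handled; any other SPR is treated as an invalid instruction.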
context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_feed_data(context, addr); + context.gpr[inst.ra] = addr; +} +void DECODER(LWAUX) { LWAUX(context, inst); } +EXPORT_SEMANTIC(LWAUX); + +void SEMANTIC(DSTST)() {} +void DECODER(DSTST) { DSTST(); } +EXPORT_SEMANTIC(DSTST); + +void SEMANTIC(LHAUX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_feed_data(context, addr); + context.gpr[inst.ra] = addr; +} +void DECODER(LHAUX) { LHAUX(context, inst); } +EXPORT_SEMANTIC(LHAUX); + +void SEMANTIC(STHX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + vm::write(vm::cast(addr), static_cast(context.gpr[inst.rs])); +} +void DECODER(STHX) { STHX(context, inst); } +EXPORT_SEMANTIC(STHX); + +void SEMANTIC(ORC)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = context.gpr[inst.rs] | ~context.gpr[inst.rb]; + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(ORC) { ORC(context, inst); } +EXPORT_SEMANTIC(ORC); + +void SEMANTIC(ECOWX)() { rpcsx_unimplemented_instruction(); } +void DECODER(ECOWX) { ECOWX(); } +EXPORT_SEMANTIC(ECOWX); + +void SEMANTIC(STHUX)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + context.gpr[inst.rb]; + vm::write(vm::cast(addr), static_cast(context.gpr[inst.rs])); + context.gpr[inst.ra] = addr; +} +void DECODER(STHUX) { STHUX(context, inst); } +EXPORT_SEMANTIC(STHUX); + +void SEMANTIC(OR)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = context.gpr[inst.rs] | context.gpr[inst.rb]; + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(OR) { OR(context, inst); } +EXPORT_SEMANTIC(OR); + +void SEMANTIC(DIVDU)(PPUContext &context, Instruction inst) { + const u64 RA = context.gpr[inst.ra]; + const u64 RB = context.gpr[inst.rb]; + context.gpr[inst.rd] = RB == 0 ? 0 : RA / RB; + + if (inst.oe) { + context.setOV(RB == 0); + } + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.rd], 0, context.xer_so); + } +} +void DECODER(DIVDU) { DIVDU(context, inst); } +EXPORT_SEMANTIC(DIVDU); + +void SEMANTIC(DIVWU)(PPUContext &context, Instruction inst) { + const u32 RA = static_cast(context.gpr[inst.ra]); + const u32 RB = static_cast(context.gpr[inst.rb]); + context.gpr[inst.rd] = RB == 0 ? 
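+  // The PowerPC architecture leaves the quotient undefined for division by
+  // zero (and, in the signed forms, for INT_MIN / -1); these semantics pick
+  // 0 as the result and raise XER.OV only when the instruction's OE bit is
+  // set.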
0 : RA / RB; + if (inst.oe) { + context.setOV(RB == 0); + } + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.rd], 0, context.xer_so); + } +} +void DECODER(DIVWU) { DIVWU(context, inst); } +EXPORT_SEMANTIC(DIVWU); + +void SEMANTIC(MTSPR)(PPUContext &context, Instruction inst) { + const u32 n = (inst.spr >> 5) | ((inst.spr & 0x1f) << 5); + + switch (n) { + case 0x001: { + const u64 value = context.gpr[inst.rs]; + context.xer_so = (value & 0x80000000) != 0; + context.xer_ov = (value & 0x40000000) != 0; + context.xer_ca = (value & 0x20000000) != 0; + context.xer_cnt = value & 0x7f; + break; + } + case 0x008: + context.lr = context.gpr[inst.rs]; + break; + case 0x009: + context.ctr = context.gpr[inst.rs]; + break; + case 0x100: + context.vrsave = static_cast(context.gpr[inst.rs]); + break; + default: + rpcsx_invalid_instruction(); + } +} +void DECODER(MTSPR) { MTSPR(context, inst); } +EXPORT_SEMANTIC(MTSPR); + +void SEMANTIC(DCBI)() {} +void DECODER(DCBI) { DCBI(); } +EXPORT_SEMANTIC(DCBI); + +void SEMANTIC(NAND)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = ~(context.gpr[inst.rs] & context.gpr[inst.rb]); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(NAND) { NAND(context, inst); } +EXPORT_SEMANTIC(NAND); + +void SEMANTIC(STVXL)(PPUContext &context, Instruction inst) { + const u64 addr = (inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]) & + ~0xfull; + vm::write(vm::cast(addr), context.vr[inst.vs]); +} +void DECODER(STVXL) { STVXL(context, inst); } +EXPORT_SEMANTIC(STVXL); + +void SEMANTIC(DIVD)(PPUContext &context, Instruction inst) { + const s64 RA = context.gpr[inst.ra]; + const s64 RB = context.gpr[inst.rb]; + const bool o = RB == 0 || (RA == INT64_MIN && RB == -1); + context.gpr[inst.rd] = o ? 0 : RA / RB; + if (inst.oe) { + context.setOV(o); + } + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.rd], 0, context.xer_so); + } +} +void DECODER(DIVD) { DIVD(context, inst); } +EXPORT_SEMANTIC(DIVD); + +void SEMANTIC(DIVW)(PPUContext &context, Instruction inst) { + const s32 RA = static_cast(context.gpr[inst.ra]); + const s32 RB = static_cast(context.gpr[inst.rb]); + const bool o = RB == 0 || (RA == INT32_MIN && RB == -1); + context.gpr[inst.rd] = o ? 0 : static_cast(RA / RB); + if (inst.oe) { + context.setOV(o); + } + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.rd], 0, context.xer_so); + } +} +void DECODER(DIVW) { DIVW(context, inst); } +EXPORT_SEMANTIC(DIVW); + +void SEMANTIC(LVLX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + const u128 data = ppu_feed_data(context, addr & -16); + context.vr[inst.vd] = data << ((addr & 15) * 8); +} +void DECODER(LVLX) { LVLX(context, inst); } +EXPORT_SEMANTIC(LVLX); + +void SEMANTIC(LDBRX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_feed_data>(context, addr); +} +void DECODER(LDBRX) { LDBRX(context, inst); } +EXPORT_SEMANTIC(LDBRX); + +void SEMANTIC(LSWX)(PPUContext &context, Instruction inst) { + u64 addr = inst.ra ? 
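+  // LSWX transfers XER[CNT] bytes starting at the EA, packing them into
+  // successive GPRs four at a time (big-endian, wrapping from r31 to r0);
+  // a trailing partial word is left-justified and zero-filled, which is what
+  // the (3 ^ byte) * 8 shift in the tail loop produces.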
context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + u32 count = context.xer_cnt & 0x7f; + for (; count >= 4; count -= 4, addr += 4, inst.rd = (inst.rd + 1) & 31) { + context.gpr[inst.rd] = ppu_feed_data(context, addr); + } + if (count) { + u32 value = 0; + for (u32 byte = 0; byte < count; byte++) { + u32 byte_value = ppu_feed_data(context, addr + byte); + value |= byte_value << ((3 ^ byte) * 8); + } + context.gpr[inst.rd] = value; + } +} +void DECODER(LSWX) { LSWX(context, inst); } +EXPORT_SEMANTIC(LSWX); + +void SEMANTIC(LWBRX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_feed_data>(context, addr); +} +void DECODER(LWBRX) { LWBRX(context, inst); } +EXPORT_SEMANTIC(LWBRX); + +void SEMANTIC(LFSX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.fpr[inst.frd] = ppu_feed_data(context, addr); +} +void DECODER(LFSX) { LFSX(context, inst); } +EXPORT_SEMANTIC(LFSX); + +void SEMANTIC(SRW)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = + (context.gpr[inst.rs] & 0xffffffff) >> (context.gpr[inst.rb] & 0x3f); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(SRW) { SRW(context, inst); } +EXPORT_SEMANTIC(SRW); + +void SEMANTIC(SRD)(PPUContext &context, Instruction inst) { + const u32 n = context.gpr[inst.rb] & 0x7f; + context.gpr[inst.ra] = n & 0x40 ? 0 : context.gpr[inst.rs] >> n; + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(SRD) { SRD(context, inst); } +EXPORT_SEMANTIC(SRD); + +void SEMANTIC(LVRX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + + if ((addr & 15) == 0) { + context.vr[inst.vd] = u128(0); + } else { + const auto data = ppu_feed_data(context, addr & -16); + context.vr[inst.vd] = data >> ((~addr & 15) * 8) >> 8; + } +} +void DECODER(LVRX) { LVRX(context, inst); } +EXPORT_SEMANTIC(LVRX); + +void SEMANTIC(LSWI)(PPUContext &context, Instruction inst) { + u64 addr = inst.ra ? context.gpr[inst.ra] : 0; + u64 N = inst.rb ? inst.rb : 32; + u8 reg = inst.rd; + + while (N > 0) { + if (N > 3) { + context.gpr[reg] = ppu_feed_data(context, addr); + addr += 4; + N -= 4; + } else { + u32 buf = 0; + u32 i = 3; + while (N > 0) { + N = N - 1; + buf |= ppu_feed_data(context, addr) << (i * 8); + addr++; + i--; + } + context.gpr[reg] = buf; + } + reg = (reg + 1) % 32; + } +} +void DECODER(LSWI) { LSWI(context, inst); } +EXPORT_SEMANTIC(LSWI); + +void SEMANTIC(LFSUX)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + context.gpr[inst.rb]; + context.fpr[inst.frd] = ppu_feed_data(context, addr); + context.gpr[inst.ra] = addr; +} +void DECODER(LFSUX) { LFSUX(context, inst); } +EXPORT_SEMANTIC(LFSUX); + +void SEMANTIC(SYNC)() { std::atomic_thread_fence(std::memory_order::seq_cst); } +void DECODER(SYNC) { SYNC(); } +EXPORT_SEMANTIC(SYNC); + +void SEMANTIC(LFDX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? 
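+  // LVLX/LVRX above (and the LVLXL/LVRXL variants later) are the two halves
+  // of an unaligned quadword load: LVLX shifts the aligned quadword holding
+  // the EA left by the misalignment, LVRX supplies the remaining bytes
+  // shifted right, and guest code typically merges the two results with a
+  // vector OR or permute.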
context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.fpr[inst.frd] = ppu_feed_data(context, addr); +} +void DECODER(LFDX) { LFDX(context, inst); } +EXPORT_SEMANTIC(LFDX); + +void SEMANTIC(LFDUX)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + context.gpr[inst.rb]; + context.fpr[inst.frd] = ppu_feed_data(context, addr); + context.gpr[inst.ra] = addr; +} +void DECODER(LFDUX) { LFDUX(context, inst); } +EXPORT_SEMANTIC(LFDUX); + +void SEMANTIC(STVLX)(v128 s, std::uint64_t a, std::uint64_t b) { + const u64 addr = a + b; + const u32 tail = u32(addr & 15); + std::uint8_t data[16]; + for (u32 j = 0; j < 16 - tail; j++) + data[j] = s.u8r[j]; + + rpcsx_vm_write(addr, data, 16 - tail); +} +void DECODER(STVLX) { + STVLX(context.vr[inst.vs], inst.ra ? context.gpr[inst.ra] : 0, + context.gpr[inst.rb]); +} +EXPORT_SEMANTIC(STVLX); + +void SEMANTIC(STDBRX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + vm::write>(vm::cast(addr), context.gpr[inst.rs]); +} +void DECODER(STDBRX) { STDBRX(context, inst); } +EXPORT_SEMANTIC(STDBRX); + +void SEMANTIC(STSWX)(PPUContext &context, Instruction inst) { + u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + u32 count = context.xer_cnt & 0x7F; + for (; count >= 4; count -= 4, addr += 4, inst.rs = (inst.rs + 1) & 31) { + vm::write(vm::cast(addr), static_cast(context.gpr[inst.rs])); + } + if (count) { + u32 value = static_cast(context.gpr[inst.rs]); + for (u32 byte = 0; byte < count; byte++) { + u8 byte_value = static_cast(value >> ((3 ^ byte) * 8)); + vm::write(vm::cast(addr + byte), byte_value); + } + } +} +void DECODER(STSWX) { STSWX(context, inst); } +EXPORT_SEMANTIC(STSWX); + +void SEMANTIC(STWBRX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + vm::write>(vm::cast(addr), static_cast(context.gpr[inst.rs])); +} +void DECODER(STWBRX) { STWBRX(context, inst); } +EXPORT_SEMANTIC(STWBRX); + +void SEMANTIC(STFSX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + vm::write(vm::cast(addr), static_cast(context.fpr[inst.frs])); +} +void DECODER(STFSX) { STFSX(context, inst); } +EXPORT_SEMANTIC(STFSX); + +void SEMANTIC(STVRX)(v128 s, std::uint64_t a, std::uint64_t b) { + const u64 addr = a + b; + const u32 tail = u32(addr & 15); + std::uint8_t data[16]; + for (u32 i = 15; i > 15 - tail; i--) + data[i] = s.u8r[i]; + + // FIXME: verify + rpcsx_vm_write(addr - 16, data + 15 - tail, tail + 1); + // u8 *ptr = vm::_ptr(addr - 16); +} +void DECODER(STVRX) { + STVRX(context.vr[inst.vs], inst.ra ? context.gpr[inst.ra] : 0, + context.gpr[inst.rb]); +} +EXPORT_SEMANTIC(STVRX); + +void SEMANTIC(STFSUX)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + context.gpr[inst.rb]; + vm::write(vm::cast(addr), static_cast(context.fpr[inst.frs])); + context.gpr[inst.ra] = addr; +} +void DECODER(STFSUX) { STFSUX(context, inst); } +EXPORT_SEMANTIC(STFSUX); + +void SEMANTIC(STSWI)(PPUContext &context, Instruction inst) { + u64 addr = inst.ra ? context.gpr[inst.ra] : 0; + u64 N = inst.rb ? 
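+  // For the string stores (as for LSWI above) a count field of 0 means 32
+  // bytes, hence the "inst.rb ? inst.rb : 32" being computed here; bytes are
+  // taken from each register most-significant first, wrapping from r31 back
+  // to r0 exactly like the string loads.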
inst.rb : 32; + u8 reg = inst.rd; + + while (N > 0) { + if (N > 3) { + vm::write(vm::cast(addr), static_cast(context.gpr[reg])); + addr += 4; + N -= 4; + } else { + u32 buf = static_cast(context.gpr[reg]); + while (N > 0) { + N = N - 1; + vm::write(vm::cast(addr), (0xFF000000 & buf) >> 24); + buf <<= 8; + addr++; + } + } + reg = (reg + 1) % 32; + } +} +void DECODER(STSWI) { STSWI(context, inst); } +EXPORT_SEMANTIC(STSWI); + +void SEMANTIC(STFDX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + vm::write(vm::cast(addr), context.fpr[inst.frs]); +} +void DECODER(STFDX) { STFDX(context, inst); } +EXPORT_SEMANTIC(STFDX); + +void SEMANTIC(STFDUX)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + context.gpr[inst.rb]; + vm::write(vm::cast(addr), context.fpr[inst.frs]); + context.gpr[inst.ra] = addr; +} +void DECODER(STFDUX) { STFDUX(context, inst); } +EXPORT_SEMANTIC(STFDUX); + +void SEMANTIC(LVLXL)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + const u128 data = ppu_feed_data(context, addr & -16); + context.vr[inst.vd] = data << ((addr & 15) * 8); +} +void DECODER(LVLXL) { LVLXL(context, inst); } +EXPORT_SEMANTIC(LVLXL); + +void SEMANTIC(LHBRX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_feed_data>(context, addr); +} +void DECODER(LHBRX) { LHBRX(context, inst); } +EXPORT_SEMANTIC(LHBRX); + +void SEMANTIC(SRAW)(PPUContext &context, Instruction inst) { + s32 RS = static_cast(context.gpr[inst.rs]); + u8 shift = context.gpr[inst.rb] & 63; + if (shift > 31) { + context.gpr[inst.ra] = 0 - (RS < 0); + context.xer_ca = (RS < 0); + } else { + context.gpr[inst.ra] = RS >> shift; + context.xer_ca = + (RS < 0) && ((context.gpr[inst.ra] << shift) != static_cast(RS)); + } + + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(SRAW) { SRAW(context, inst); } +EXPORT_SEMANTIC(SRAW); + +void SEMANTIC(SRAD)(PPUContext &context, Instruction inst) { + s64 RS = context.gpr[inst.rs]; + u8 shift = context.gpr[inst.rb] & 127; + if (shift > 63) { + context.gpr[inst.ra] = 0 - (RS < 0); + context.xer_ca = (RS < 0); + } else { + context.gpr[inst.ra] = RS >> shift; + context.xer_ca = + (RS < 0) && ((context.gpr[inst.ra] << shift) != static_cast(RS)); + } + + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(SRAD) { SRAD(context, inst); } +EXPORT_SEMANTIC(SRAD); + +void SEMANTIC(LVRXL)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? 
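+  // SRAW/SRAD above set XER.CA only when the source is negative and one or
+  // more 1 bits were shifted out; the code checks that by shifting the
+  // result back and comparing it with the original value.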
context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + + if ((addr & 15) == 0) { + context.vr[inst.vd] = u128(0); + } else { + const u128 data = ppu_feed_data(context, addr & -16); + context.vr[inst.vd] = data >> ((~addr & 15) * 8) >> 8; + } +} +void DECODER(LVRXL) { LVRXL(context, inst); } +EXPORT_SEMANTIC(LVRXL); + +void SEMANTIC(DSS)() {} +void DECODER(DSS) { DSS(); } +EXPORT_SEMANTIC(DSS); + +void SEMANTIC(SRAWI)(PPUContext &context, Instruction inst) { + s32 RS = static_cast(context.gpr[inst.rs]); + context.gpr[inst.ra] = RS >> inst.sh32; + context.xer_ca = + (RS < 0) && (static_cast(context.gpr[inst.ra] << inst.sh32) != + static_cast(RS)); + + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(SRAWI) { SRAWI(context, inst); } +EXPORT_SEMANTIC(SRAWI); + +void SEMANTIC(SRADI)(PPUContext &context, Instruction inst) { + auto sh = inst.sh64; + s64 RS = context.gpr[inst.rs]; + context.gpr[inst.ra] = RS >> sh; + context.xer_ca = + (RS < 0) && ((context.gpr[inst.ra] << sh) != static_cast(RS)); + + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(SRADI) { SRADI(context, inst); } +EXPORT_SEMANTIC(SRADI); + +void SEMANTIC(EIEIO)() { std::atomic_thread_fence(std::memory_order::seq_cst); } +void DECODER(EIEIO) { EIEIO(); } +EXPORT_SEMANTIC(EIEIO); + +void SEMANTIC(STVLXL)(v128 s, u64 a, u64 b) { + const u64 addr = a + b; + const u32 tail = u32(addr & 15); + // FIXME + for (u32 j = 0; j < 16 - tail; j++) + vm::write(addr + j, s.u8r[j]); +} +void DECODER(STVLXL) { + STVLXL(context.vr[inst.vs], inst.ra ? context.gpr[inst.ra] : 0, + context.gpr[inst.rb]); +} +EXPORT_SEMANTIC(STVLXL); + +void SEMANTIC(STHBRX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + vm::write>(vm::cast(addr), static_cast(context.gpr[inst.rs])); +} +void DECODER(STHBRX) { STHBRX(context, inst); } +EXPORT_SEMANTIC(STHBRX); + +void SEMANTIC(EXTSH)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = static_cast(context.gpr[inst.rs]); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(EXTSH) { EXTSH(context, inst); } +EXPORT_SEMANTIC(EXTSH); + +void SEMANTIC(STVRXL)(v128 s, u64 a, u64 b) { + const u64 addr = a + b; + const u32 tail = u32(addr & 15); + + // FIXME + for (u32 i = 15; i > 15 - tail; i--) + vm::write(addr - 16 + i, s.u8r[i]); +} +void DECODER(STVRXL) { + STVRXL(context.vr[inst.vs], inst.ra ? context.gpr[inst.ra] : 0, + context.gpr[inst.rb]); +} +EXPORT_SEMANTIC(STVRXL); + +void SEMANTIC(EXTSB)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = static_cast(context.gpr[inst.rs]); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(EXTSB) { EXTSB(context, inst); } +EXPORT_SEMANTIC(EXTSB); + +void SEMANTIC(STFIWX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? 
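+  // STFIWX stores the low-order 32 bits of the FPR's raw bit pattern with no
+  // float-to-integer conversion, so the value is bit_cast to an integer
+  // first and then truncated.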
context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + vm::write(vm::cast(addr), + static_cast(std::bit_cast(context.fpr[inst.frs]))); +} +void DECODER(STFIWX) { STFIWX(context, inst); } +EXPORT_SEMANTIC(STFIWX); + +void SEMANTIC(EXTSW)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = static_cast(context.gpr[inst.rs]); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(EXTSW) { EXTSW(context, inst); } +EXPORT_SEMANTIC(EXTSW); + +void SEMANTIC(ICBI)() {} +void DECODER(ICBI) { ICBI(); } +EXPORT_SEMANTIC(ICBI); + +void SEMANTIC(DCBZ)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + const u32 addr0 = vm::cast(addr) & ~127; + + alignas(64) static constexpr u8 zero_buf[128]{}; + do_cell_atomic_128_store(addr0, zero_buf); +} +void DECODER(DCBZ) { DCBZ(context, inst); } +EXPORT_SEMANTIC(DCBZ); + +void SEMANTIC(LWZ)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + inst.simm16 : inst.simm16; + context.gpr[inst.rd] = ppu_feed_data(context, addr); +} +void DECODER(LWZ) { LWZ(context, inst); } +EXPORT_SEMANTIC(LWZ); + +void SEMANTIC(LWZU)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + inst.simm16; + context.gpr[inst.rd] = ppu_feed_data(context, addr); + context.gpr[inst.ra] = addr; +} +void DECODER(LWZU) { LWZU(context, inst); } +EXPORT_SEMANTIC(LWZU); + +void SEMANTIC(LBZ)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + inst.simm16 : inst.simm16; + context.gpr[inst.rd] = ppu_feed_data(context, addr); +} +void DECODER(LBZ) { LBZ(context, inst); } +EXPORT_SEMANTIC(LBZ); + +void SEMANTIC(LBZU)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + inst.simm16; + context.gpr[inst.rd] = ppu_feed_data(context, addr); + context.gpr[inst.ra] = addr; +} +void DECODER(LBZU) { LBZU(context, inst); } +EXPORT_SEMANTIC(LBZU); + +void SEMANTIC(STW)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + inst.simm16 : inst.simm16; + const u32 value = static_cast(context.gpr[inst.rs]); + vm::write(vm::cast(addr), value); + + // Insomniac engine v3 & v4 (newer R&C, Fuse, Resitance 3) + // if (value == 0xAAAAAAAA) [[unlikely]] { + // vm::reservation_update(vm::cast(addr)); + // } +} +void DECODER(STW) { STW(context, inst); } +EXPORT_SEMANTIC(STW); + +void SEMANTIC(STWU)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + inst.simm16; + vm::write(vm::cast(addr), static_cast(context.gpr[inst.rs])); + context.gpr[inst.ra] = addr; +} +void DECODER(STWU) { STWU(context, inst); } +EXPORT_SEMANTIC(STWU); + +void SEMANTIC(STB)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + inst.simm16 : inst.simm16; + vm::write(vm::cast(addr), static_cast(context.gpr[inst.rs])); +} +void DECODER(STB) { STB(context, inst); } +EXPORT_SEMANTIC(STB); + +void SEMANTIC(STBU)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + inst.simm16; + vm::write(vm::cast(addr), static_cast(context.gpr[inst.rs])); + context.gpr[inst.ra] = addr; +} +void DECODER(STBU) { STBU(context, inst); } +EXPORT_SEMANTIC(STBU); + +void SEMANTIC(LHZ)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? 
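+  // The D-form loads/stores use the same RA == 0 convention as the indexed
+  // forms, with the sign-extended 16-bit displacement as the offset. DCBZ
+  // above clears the whole 128-byte cache line containing the EA, which is
+  // why its address is masked with ~127 and routed through the atomic
+  // 128-byte store helper.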
context.gpr[inst.ra] + inst.simm16 : inst.simm16; + context.gpr[inst.rd] = ppu_feed_data(context, addr); +} +void DECODER(LHZ) { LHZ(context, inst); } +EXPORT_SEMANTIC(LHZ); + +void SEMANTIC(LHZU)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + inst.simm16; + context.gpr[inst.rd] = ppu_feed_data(context, addr); + context.gpr[inst.ra] = addr; +} +void DECODER(LHZU) { LHZU(context, inst); } +EXPORT_SEMANTIC(LHZU); + +void SEMANTIC(LHA)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + inst.simm16 : inst.simm16; + context.gpr[inst.rd] = ppu_feed_data(context, addr); +} +void DECODER(LHA) { LHA(context, inst); } +EXPORT_SEMANTIC(LHA); + +void SEMANTIC(LHAU)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + inst.simm16; + context.gpr[inst.rd] = ppu_feed_data(context, addr); + context.gpr[inst.ra] = addr; +} +void DECODER(LHAU) { LHAU(context, inst); } +EXPORT_SEMANTIC(LHAU); + +void SEMANTIC(STH)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + inst.simm16 : inst.simm16; + vm::write(vm::cast(addr), static_cast(context.gpr[inst.rs])); +} +void DECODER(STH) { STH(context, inst); } +EXPORT_SEMANTIC(STH); + +void SEMANTIC(STHU)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + inst.simm16; + vm::write(vm::cast(addr), static_cast(context.gpr[inst.rs])); + context.gpr[inst.ra] = addr; +} +void DECODER(STHU) { STHU(context, inst); } +EXPORT_SEMANTIC(STHU); + +void SEMANTIC(LMW)(PPUContext &context, Instruction inst) { + u64 addr = inst.ra ? context.gpr[inst.ra] + inst.simm16 : inst.simm16; + for (u32 i = inst.rd; i < 32; ++i, addr += 4) { + context.gpr[i] = ppu_feed_data(context, addr); + } +} +void DECODER(LMW) { LMW(context, inst); } +EXPORT_SEMANTIC(LMW); + +void SEMANTIC(STMW)(PPUContext &context, Instruction inst) { + u64 addr = inst.ra ? context.gpr[inst.ra] + inst.simm16 : inst.simm16; + for (u32 i = inst.rs; i < 32; ++i, addr += 4) { + vm::write(vm::cast(addr), static_cast(context.gpr[i])); + } +} +void DECODER(STMW) { STMW(context, inst); } +EXPORT_SEMANTIC(STMW); + +void SEMANTIC(LFS)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + inst.simm16 : inst.simm16; + context.fpr[inst.frd] = ppu_feed_data(context, addr); +} +void DECODER(LFS) { LFS(context, inst); } +EXPORT_SEMANTIC(LFS); + +void SEMANTIC(LFSU)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + inst.simm16; + context.fpr[inst.frd] = ppu_feed_data(context, addr); + context.gpr[inst.ra] = addr; +} +void DECODER(LFSU) { LFSU(context, inst); } +EXPORT_SEMANTIC(LFSU); + +void SEMANTIC(LFD)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + inst.simm16 : inst.simm16; + context.fpr[inst.frd] = ppu_feed_data(context, addr); +} +void DECODER(LFD) { LFD(context, inst); } +EXPORT_SEMANTIC(LFD); + +void SEMANTIC(LFDU)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + inst.simm16; + context.fpr[inst.frd] = ppu_feed_data(context, addr); + context.gpr[inst.ra] = addr; +} +void DECODER(LFDU) { LFDU(context, inst); } +EXPORT_SEMANTIC(LFDU); + +void SEMANTIC(STFS)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? 
context.gpr[inst.ra] + inst.simm16 : inst.simm16; + vm::write(vm::cast(addr), static_cast(context.fpr[inst.frs])); +} +void DECODER(STFS) { STFS(context, inst); } +EXPORT_SEMANTIC(STFS); + +void SEMANTIC(STFSU)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + inst.simm16; + vm::write(vm::cast(addr), static_cast(context.fpr[inst.frs])); + context.gpr[inst.ra] = addr; +} +void DECODER(STFSU) { STFSU(context, inst); } +EXPORT_SEMANTIC(STFSU); + +void SEMANTIC(STFD)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + inst.simm16 : inst.simm16; + vm::write(vm::cast(addr), context.fpr[inst.frs]); +} +void DECODER(STFD) { STFD(context, inst); } +EXPORT_SEMANTIC(STFD); + +void SEMANTIC(STFDU)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + inst.simm16; + vm::write(vm::cast(addr), context.fpr[inst.frs]); + context.gpr[inst.ra] = addr; +} +void DECODER(STFDU) { STFDU(context, inst); } +EXPORT_SEMANTIC(STFDU); + +void SEMANTIC(LD)(PPUContext &context, Instruction inst) { + const u64 addr = (inst.simm16 & ~3) + (inst.ra ? context.gpr[inst.ra] : 0); + context.gpr[inst.rd] = ppu_feed_data(context, addr); +} +void DECODER(LD) { LD(context, inst); } +EXPORT_SEMANTIC(LD); + +void SEMANTIC(LDU)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + (inst.simm16 & ~3); + context.gpr[inst.rd] = ppu_feed_data(context, addr); + context.gpr[inst.ra] = addr; +} +void DECODER(LDU) { LDU(context, inst); } +EXPORT_SEMANTIC(LDU); + +void SEMANTIC(LWA)(PPUContext &context, Instruction inst) { + const u64 addr = (inst.simm16 & ~3) + (inst.ra ? context.gpr[inst.ra] : 0); + context.gpr[inst.rd] = ppu_feed_data(context, addr); +} +void DECODER(LWA) { LWA(context, inst); } +EXPORT_SEMANTIC(LWA); + +void SEMANTIC(STD)(PPUContext &context, Instruction inst) { + const u64 addr = (inst.simm16 & ~3) + (inst.ra ? 
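+  // LD/LDU/LWA/STD/STDU are DS-form: the low two bits of the displacement
+  // field belong to the extended opcode, so the displacement is always a
+  // multiple of 4 and is masked with ~3 before the base is added.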
context.gpr[inst.ra] : 0); + vm::write(vm::cast(addr), context.gpr[inst.rs]); +} +void DECODER(STD) { STD(context, inst); } +EXPORT_SEMANTIC(STD); + +void SEMANTIC(STDU)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + (inst.simm16 & ~3); + vm::write(vm::cast(addr), context.gpr[inst.rs]); + context.gpr[inst.ra] = addr; +} +void DECODER(STDU) { STDU(context, inst); } +EXPORT_SEMANTIC(STDU); + +static void ppu_set_fpcc(PPUContext &context, bool updateCr, f64 a, f64 b, + u64 cr_field = 1) { + static_assert(std::endian::native == std::endian::little, "Not implemented"); + + bool fpcc[4]; +#if defined(ARCH_X64) && !defined(_M_X64) + __asm__("comisd %[b], %[a]\n" + : "=@ccb"(fpcc[0]), "=@cca"(fpcc[1]), "=@ccz"(fpcc[2]), + "=@ccp"(fpcc[3]) + : [a] "x"(a), [b] "x"(b) + : "cc"); + if (fpcc[3]) [[unlikely]] { + fpcc[0] = fpcc[1] = fpcc[2] = false; + } +#else + const auto cmp = a <=> b; + fpcc[0] = cmp == std::partial_ordering::less; + fpcc[1] = cmp == std::partial_ordering::greater; + fpcc[2] = cmp == std::partial_ordering::equivalent; + fpcc[3] = cmp == std::partial_ordering::unordered; +#endif + + auto data = std::bit_cast(fpcc); + + // Write FPCC + context.fpscr.fields[4] = data; + + if (updateCr) { + // Previous behaviour was throwing an exception; TODO + context.cr.fields[cr_field] = data; + } +} + +void SEMANTIC(FDIVS)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = f32(context.fpr[inst.fra] / context.fpr[inst.frb]); + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FDIVS) { FDIVS(context, inst); } +EXPORT_SEMANTIC(FDIVS); + +void SEMANTIC(FSUBS)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = f32(context.fpr[inst.fra] - context.fpr[inst.frb]); + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FSUBS) { FSUBS(context, inst); } +EXPORT_SEMANTIC(FSUBS); + +void SEMANTIC(FADDS)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = f32(context.fpr[inst.fra] + context.fpr[inst.frb]); + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FADDS) { FADDS(context, inst); } +EXPORT_SEMANTIC(FADDS); + +void SEMANTIC(FSQRTS)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = f32(std::sqrt(context.fpr[inst.frb])); + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FSQRTS) { FSQRTS(context, inst); } +EXPORT_SEMANTIC(FSQRTS); + +void SEMANTIC(FRES)(PPUContext &context, Instruction inst) { + const f64 a = context.fpr[inst.frb]; + const u64 b = std::bit_cast(a); + const u64 e = (b >> 52) & 0x7ff; // double exp + const u64 i = (b >> 45) & 0x7f; // mantissa LUT index + const u64 r = e >= (0x3ff + 0x80) + ? 0 + : (0x7ff - 2 - e) << 52 | u64{ppu_fres_mantissas[i]} + << (32 - 3); + + context.fpr[inst.frd] = f32(std::bit_cast( + a == a ? 
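+  // ppu_set_fpcc above derives the FPCC bits (FL, FG, FE, FU: less, greater,
+  // equal, unordered) from one comparison, writes them to the FPSCR and,
+  // when requested, mirrors them into a CR field. Here in FRES, "a == a" is
+  // the usual NaN test: a NaN input skips the reciprocal-estimate table and
+  // is propagated as a quiet NaN instead.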
(b & 0x8000'0000'0000'0000) | r : (0x8'0000'0000'0000 | b))); + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FRES) { FRES(context, inst); } +EXPORT_SEMANTIC(FRES); + +void SEMANTIC(FMULS)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = f32(context.fpr[inst.fra] * context.fpr[inst.frc]); + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FMULS) { FMULS(context, inst); } +EXPORT_SEMANTIC(FMULS); + +void SEMANTIC(FMADDS)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = f32(std::fma( + context.fpr[inst.fra], context.fpr[inst.frc], context.fpr[inst.frb])); + + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FMADDS) { FMADDS(context, inst); } +EXPORT_SEMANTIC(FMADDS); + +void SEMANTIC(FMSUBS)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = f32(std::fma( + context.fpr[inst.fra], context.fpr[inst.frc], -context.fpr[inst.frb])); + + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FMSUBS) { FMSUBS(context, inst); } +EXPORT_SEMANTIC(FMSUBS); + +void SEMANTIC(FNMSUBS)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = f32(-std::fma( + context.fpr[inst.fra], context.fpr[inst.frc], -context.fpr[inst.frb])); + + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FNMSUBS) { FNMSUBS(context, inst); } +EXPORT_SEMANTIC(FNMSUBS); + +void SEMANTIC(FNMADDS)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = f32(-std::fma( + context.fpr[inst.fra], context.fpr[inst.frc], context.fpr[inst.frb])); + + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FNMADDS) { FNMADDS(context, inst); } +EXPORT_SEMANTIC(FNMADDS); + +void SEMANTIC(MTFSB1)(PPUContext &context, Instruction inst) { + const u32 bit = inst.crbd; + context.fpscr.bits[bit] = 1; + context.cr.fields[1].set(context.fpscr.fg, context.fpscr.fl, context.fpscr.fe, + context.fpscr.fu); +} +void DECODER(MTFSB1) { MTFSB1(context, inst); } +EXPORT_SEMANTIC(MTFSB1); + +void SEMANTIC(MCRFS)(PPUContext &context, Instruction inst) { + std::memcpy(context.cr.fields + inst.crfd, context.fpscr.fields + inst.crfs, + sizeof(u32)); +} +void DECODER(MCRFS) { MCRFS(context, inst); } +EXPORT_SEMANTIC(MCRFS); + +void SEMANTIC(MTFSB0)(PPUContext &context, Instruction inst) { + const u32 bit = inst.crbd; + // if (bit < 16 || bit > 19) + // ppu_log.warning("MTFSB0(%d)", bit); + context.fpscr.bits[bit] = 0; + context.cr.fields[1].set(context.fpscr.fg, context.fpscr.fl, context.fpscr.fe, + context.fpscr.fu); +} +void DECODER(MTFSB0) { MTFSB0(context, inst); } +EXPORT_SEMANTIC(MTFSB0); + +void SEMANTIC(MTFSFI)(PPUContext &context, Instruction inst) { + const u32 bf = inst.crfd; + + if (bf != 4) { + // Do nothing on non-FPCC field (TODO) + // ppu_log.warning("MTFSFI(%d)", inst.crfd); + } else { + static constexpr auto all_values = [] { + std::array values{}; + + for (u32 i = 0; i < values.size(); i++) { + u32 value = 0, im = i; + value |= (im & 1) << (8 * 3); + im >>= 1; + value |= (im & 1) << (8 * 2); + im >>= 1; + value |= (im & 1) << (8 * 1); + im >>= 1; + value |= (im & 1) << (8 * 0); + values[i] = std::bit_cast(value); + } + + return values; + }(); + + context.fpscr.fields[bf] = all_values[inst.i]; + } + + if (inst.rc) { + context.cr.fields[1].set(context.fpscr.fg, context.fpscr.fl, + context.fpscr.fe, context.fpscr.fu); + } +} +void DECODER(MTFSFI) { MTFSFI(context, inst); } +EXPORT_SEMANTIC(MTFSFI); + +void SEMANTIC(MFFS)(PPUContext 
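+// FPSCR modelling here is deliberately partial: MTFSB0/MTFSB1/MTFSFI above
+// only track the bits the emulated code actually consumes (chiefly FPCC),
+// and MFFS reconstructs its result from those same four condition bits
+// rather than from a full FPSCR image.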
&context, Instruction inst) { + // ppu_log.warning("MFFS"); + context.fpr[inst.frd] = std::bit_cast( + u64{context.fpscr.fl} << 15 | u64{context.fpscr.fg} << 14 | + u64{context.fpscr.fe} << 13 | u64{context.fpscr.fu} << 12); + if (inst.rc) { + context.cr.fields[1].set(context.fpscr.fg, context.fpscr.fl, + context.fpscr.fe, context.fpscr.fu); + } +} +void DECODER(MFFS) { MFFS(context, inst); } +EXPORT_SEMANTIC(MFFS); + +void SEMANTIC(MTFSF)(PPUContext &context, Instruction inst) { + if (inst.rc) { + context.cr.fields[1].set(context.fpscr.fg, context.fpscr.fl, + context.fpscr.fe, context.fpscr.fu); + } +} +void DECODER(MTFSF) { MTFSF(context, inst); } +EXPORT_SEMANTIC(MTFSF); + +void SEMANTIC(FCMPU)(PPUContext &context, Instruction inst) { + const f64 a = context.fpr[inst.fra]; + const f64 b = context.fpr[inst.frb]; + ppu_set_fpcc(context, true, a, b, inst.crfd); +} +void DECODER(FCMPU) { FCMPU(context, inst); } +EXPORT_SEMANTIC(FCMPU); + +void SEMANTIC(FCTIW)(PPUContext &context, Instruction inst, f64 &d, f64 b) { +#if defined(ARCH_X64) + const auto val = _mm_set_sd(b); + const auto res = _mm_xor_si128( + _mm_cvtpd_epi32(val), + _mm_castpd_si128(_mm_cmpge_pd(val, _mm_set1_pd(0x80000000)))); + d = std::bit_cast(_mm_cvtsi128_si32(res)); +#elif defined(ARCH_ARM64) + d = std::bit_cast(!(b == b) + ? INT32_MIN + : vqmovnd_s64(std::bit_cast(vrndi_f64( + std::bit_cast(b))))); +#endif + ppu_set_fpcc(context, inst.rc, 0., 0.); // undefined (TODO) +} +void DECODER(FCTIW) { + FCTIW(context, inst, context.fpr[inst.frd], context.fpr[inst.frb]); +} +EXPORT_SEMANTIC(FCTIW); + +void SEMANTIC(FCTIWZ)(PPUContext &context, Instruction inst, f64 &d, f64 b) { +#if defined(ARCH_X64) + const auto val = _mm_set_sd(b); + const auto res = _mm_xor_si128( + _mm_cvttpd_epi32(val), + _mm_castpd_si128(_mm_cmpge_pd(val, _mm_set1_pd(0x80000000)))); + d = std::bit_cast(_mm_cvtsi128_si32(res)); +#elif defined(ARCH_ARM64) + d = std::bit_cast(!(b == b) + ? 
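+  // In the x86 path of FCTIW/FCTIWZ, cvt(t)pd_epi32 already yields
+  // 0x80000000 for NaN and out-of-range inputs; XOR-ing with the mask from
+  // cmpge_pd(val, 2^31) flips that to 0x7fffffff for large positive values,
+  // matching the PPU's saturation to INT32_MAX. The ARM64 path relies on a
+  // saturating narrow to 32 bits for the same effect.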
INT32_MIN + : vqmovnd_s64(std::bit_cast(vcvt_s64_f64( + std::bit_cast(b))))); +#endif + ppu_set_fpcc(context, inst.rc, 0., 0.); // undefined (TODO) +} +void DECODER(FCTIWZ) { + FCTIWZ(context, inst, context.fpr[inst.frd], context.fpr[inst.frb]); +} +EXPORT_SEMANTIC(FCTIWZ); + +void SEMANTIC(FRSP)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = f32(context.fpr[inst.frb]); + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FRSP) { FRSP(context, inst); } +EXPORT_SEMANTIC(FRSP); + +void SEMANTIC(FDIV)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = context.fpr[inst.fra] / context.fpr[inst.frb]; + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FDIV) { FDIV(context, inst); } +EXPORT_SEMANTIC(FDIV); + +void SEMANTIC(FSUB)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = context.fpr[inst.fra] - context.fpr[inst.frb]; + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FSUB) { FSUB(context, inst); } +EXPORT_SEMANTIC(FSUB); + +void SEMANTIC(FADD)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = context.fpr[inst.fra] + context.fpr[inst.frb]; + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FADD) { FADD(context, inst); } +EXPORT_SEMANTIC(FADD); + +void SEMANTIC(FSQRT)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = std::sqrt(context.fpr[inst.frb]); + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FSQRT) { FSQRT(context, inst); } +EXPORT_SEMANTIC(FSQRT); + +void SEMANTIC(FSEL)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = context.fpr[inst.fra] >= 0.0 ? context.fpr[inst.frc] + : context.fpr[inst.frb]; + if (inst.rc) { + context.cr.fields[1].set(context.fpscr.fg, context.fpscr.fl, + context.fpscr.fe, context.fpscr.fu); + } +} +void DECODER(FSEL) { FSEL(context, inst); } +EXPORT_SEMANTIC(FSEL); + +void SEMANTIC(FMUL)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = context.fpr[inst.fra] * context.fpr[inst.frc]; + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FMUL) { FMUL(context, inst); } +EXPORT_SEMANTIC(FMUL); + +void SEMANTIC(FRSQRTE)(PPUContext &context, Instruction inst) { + const u64 b = std::bit_cast(context.fpr[inst.frb]); + context.fpr[inst.frd] = + std::bit_cast(u64{ppu_frqrte_lut.data[b >> 49]} << 32); + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FRSQRTE) { FRSQRTE(context, inst); } +EXPORT_SEMANTIC(FRSQRTE); + +void SEMANTIC(FMSUB)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = std::fma(context.fpr[inst.fra], context.fpr[inst.frc], + -context.fpr[inst.frb]); + + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FMSUB) { FMSUB(context, inst); } +EXPORT_SEMANTIC(FMSUB); + +void SEMANTIC(FMADD)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = std::fma(context.fpr[inst.fra], context.fpr[inst.frc], + context.fpr[inst.frb]); + + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FMADD) { FMADD(context, inst); } +EXPORT_SEMANTIC(FMADD); + +void SEMANTIC(FNMSUB)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = -std::fma( + context.fpr[inst.fra], context.fpr[inst.frc], -context.fpr[inst.frb]); + + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FNMSUB) { FNMSUB(context, inst); } +EXPORT_SEMANTIC(FNMSUB); + +void SEMANTIC(FNMADD)(PPUContext 
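+// The double-precision arithmetic above maps directly onto host operations.
+// The *MADD/*MSUB family goes through std::fma so the intermediate product
+// is not rounded, matching the PPU's fused multiply-add, while FRES and
+// FRSQRTE are table-driven estimates rather than exact results, much like
+// the real hardware.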
&context, Instruction inst) { + context.fpr[inst.frd] = -std::fma( + context.fpr[inst.fra], context.fpr[inst.frc], context.fpr[inst.frb]); + + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FNMADD) { FNMADD(context, inst); } +EXPORT_SEMANTIC(FNMADD); + +void SEMANTIC(FCMPO)(PPUContext &context, Instruction inst) { + const f64 a = context.fpr[inst.fra]; + const f64 b = context.fpr[inst.frb]; + ppu_set_fpcc(context, true, a, b, inst.crfd); +} +void DECODER(FCMPO) { FCMPO(context, inst); } +EXPORT_SEMANTIC(FCMPO); + +void SEMANTIC(FNEG)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = -context.fpr[inst.frb]; + if (inst.rc) { + context.cr.fields[1].set(context.fpscr.fg, context.fpscr.fl, + context.fpscr.fe, context.fpscr.fu); + } +} +void DECODER(FNEG) { FNEG(context, inst); } +EXPORT_SEMANTIC(FNEG); + +void SEMANTIC(FMR)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = context.fpr[inst.frb]; + if (inst.rc) { + context.cr.fields[1].set(context.fpscr.fg, context.fpscr.fl, + context.fpscr.fe, context.fpscr.fu); + } +} +void DECODER(FMR) { FMR(context, inst); } +EXPORT_SEMANTIC(FMR); + +void SEMANTIC(FNABS)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = -std::fabs(context.fpr[inst.frb]); + if (inst.rc) { + context.cr.fields[1].set(context.fpscr.fg, context.fpscr.fl, + context.fpscr.fe, context.fpscr.fu); + } +} +void DECODER(FNABS) { FNABS(context, inst); } +EXPORT_SEMANTIC(FNABS); + +void SEMANTIC(FABS)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = std::fabs(context.fpr[inst.frb]); + if (inst.rc) { + context.cr.fields[1].set(context.fpscr.fg, context.fpscr.fl, + context.fpscr.fe, context.fpscr.fu); + } +} +void DECODER(FABS) { FABS(context, inst); } +EXPORT_SEMANTIC(FABS); + +void SEMANTIC(FCTID)(PPUContext &context, Instruction inst, f64 &d, f64 b) { +#if defined(ARCH_X64) + const auto val = _mm_set_sd(b); + const auto res = _mm_xor_si128( + _mm_set1_epi64x(_mm_cvtsd_si64(val)), + _mm_castpd_si128(_mm_cmpge_pd(val, _mm_set1_pd(f64(1ull << 63))))); + d = std::bit_cast(_mm_cvtsi128_si64(res)); +#elif defined(ARCH_ARM64) + d = std::bit_cast( + !(b == b) ? f64{INT64_MIN} + : std::bit_cast(vrndi_f64(std::bit_cast(b)))); +#endif + ppu_set_fpcc(context, inst.rc, 0., 0.); // undefined (TODO) +} +void DECODER(FCTID) { + FCTID(context, inst, context.fpr[inst.frd], context.fpr[inst.frb]); +} +EXPORT_SEMANTIC(FCTID); + +void SEMANTIC(FCTIDZ)(PPUContext &context, Instruction inst, f64 &d, f64 b) { +#if defined(ARCH_X64) + const auto val = _mm_set_sd(b); + const auto res = _mm_xor_si128( + _mm_set1_epi64x(_mm_cvttsd_si64(val)), + _mm_castpd_si128(_mm_cmpge_pd(val, _mm_set1_pd(f64(1ull << 63))))); + d = std::bit_cast(_mm_cvtsi128_si64(res)); +#elif defined(ARCH_ARM64) + d = std::bit_cast(!(b == b) + ? 
int64x1_t{INT64_MIN} + : vcvt_s64_f64(std::bit_cast(b))); +#endif + ppu_set_fpcc(context, inst.rc, 0., 0.); // undefined (TODO) +} +void DECODER(FCTIDZ) { + FCTIDZ(context, inst, context.fpr[inst.frd], context.fpr[inst.frb]); +} +EXPORT_SEMANTIC(FCTIDZ); + +void SEMANTIC(FCFID)(PPUContext &context, Instruction inst, f64 &d, f64 b) { + f64 r = static_cast(std::bit_cast(b)); + d = r; + ppu_set_fpcc(context, inst.rc, r, 0.); +} +void DECODER(FCFID) { + FCFID(context, inst, context.fpr[inst.frd], context.fpr[inst.frb]); +} +EXPORT_SEMANTIC(FCFID); + +void SEMANTIC(RFID)() { rpcsx_unimplemented_instruction(); } +void DECODER(RFID) { RFID(); } +EXPORT_SEMANTIC(RFID); + +void SEMANTIC(RFSCV)() { rpcsx_unimplemented_instruction(); } +void DECODER(RFSCV) { RFSCV(); } +EXPORT_SEMANTIC(RFSCV); + +void SEMANTIC(HRFID)() { rpcsx_unimplemented_instruction(); } +void DECODER(HRFID) { HRFID(); } +EXPORT_SEMANTIC(HRFID); + +void SEMANTIC(STOP)() { rpcsx_unimplemented_instruction(); } +void DECODER(STOP) { STOP(); } +EXPORT_SEMANTIC(STOP); + +void SEMANTIC(URFID)() { rpcsx_unimplemented_instruction(); } +void DECODER(URFID) { URFID(); } +EXPORT_SEMANTIC(URFID); + +void SEMANTIC(SUBFCO)() { rpcsx_unimplemented_instruction(); } +void DECODER(SUBFCO) { SUBFCO(); } +EXPORT_SEMANTIC(SUBFCO); + +void SEMANTIC(ADDCO)() { rpcsx_unimplemented_instruction(); } +void DECODER(ADDCO) { ADDCO(); } +EXPORT_SEMANTIC(ADDCO); + +void SEMANTIC(UNK)() { rpcsx_unimplemented_instruction(); } +void DECODER(UNK) { UNK(); } +EXPORT_SEMANTIC(UNK); + +void SEMANTIC(SUBFEO)() { rpcsx_unimplemented_instruction(); } +void DECODER(SUBFEO) { SUBFEO(); } +EXPORT_SEMANTIC(SUBFEO); + +void SEMANTIC(ADDEO)() { rpcsx_unimplemented_instruction(); } +void DECODER(ADDEO) { ADDEO(); } +EXPORT_SEMANTIC(ADDEO); + +void SEMANTIC(SUBFO)() { rpcsx_unimplemented_instruction(); } +void DECODER(SUBFO) { SUBFO(); } +EXPORT_SEMANTIC(SUBFO); + +void SEMANTIC(NEGO)() { rpcsx_unimplemented_instruction(); } +void DECODER(NEGO) { NEGO(); } +EXPORT_SEMANTIC(NEGO); + +void SEMANTIC(SUBFMEO)() { rpcsx_unimplemented_instruction(); } +void DECODER(SUBFMEO) { SUBFMEO(); } +EXPORT_SEMANTIC(SUBFMEO); + +void SEMANTIC(MULLDO)() { rpcsx_unimplemented_instruction(); } +void DECODER(MULLDO) { MULLDO(); } +EXPORT_SEMANTIC(MULLDO); + +void SEMANTIC(SUBFZEO)() { rpcsx_unimplemented_instruction(); } +void DECODER(SUBFZEO) { SUBFZEO(); } +EXPORT_SEMANTIC(SUBFZEO); + +void SEMANTIC(ADDZEO)() { rpcsx_unimplemented_instruction(); } +void DECODER(ADDZEO) { ADDZEO(); } +EXPORT_SEMANTIC(ADDZEO); + +void SEMANTIC(ADDO)() { rpcsx_unimplemented_instruction(); } +void DECODER(ADDO) { ADDO(); } +EXPORT_SEMANTIC(ADDO); + +void SEMANTIC(DIVDUO)() { rpcsx_unimplemented_instruction(); } +void DECODER(DIVDUO) { DIVDUO(); } +EXPORT_SEMANTIC(DIVDUO); + +void SEMANTIC(ADDMEO)() { rpcsx_unimplemented_instruction(); } +void DECODER(ADDMEO) { ADDMEO(); } +EXPORT_SEMANTIC(ADDMEO); + +void SEMANTIC(MULLWO)() { rpcsx_unimplemented_instruction(); } +void DECODER(MULLWO) { MULLWO(); } +EXPORT_SEMANTIC(MULLWO); + +void SEMANTIC(DIVWO)() { rpcsx_unimplemented_instruction(); } +void DECODER(DIVWO) { DIVWO(); } +EXPORT_SEMANTIC(DIVWO); + +void SEMANTIC(DIVWUO)() { rpcsx_unimplemented_instruction(); } +void DECODER(DIVWUO) { DIVWUO(); } +EXPORT_SEMANTIC(DIVWUO); + +void SEMANTIC(DIVDO)() { rpcsx_unimplemented_instruction(); } +void DECODER(DIVDO) { DIVDO(); } +EXPORT_SEMANTIC(DIVDO); diff --git a/rpcsx/cpu/cell/ppu/src/Decoder.cpp b/rpcsx/cpu/cell/ppu/src/Decoder.cpp new file mode 100644 index 
000000000..11f49f6ed --- /dev/null +++ b/rpcsx/cpu/cell/ppu/src/Decoder.cpp @@ -0,0 +1,501 @@ +#include "Decoder.hpp" +#include "Instruction.hpp" +#include "Opcode.hpp" +#include +#include + +struct InstructionEncodingInfo { + std::uint32_t value; + rx::cell::ppu::Opcode opcode; + rx::cell::ppu::Opcode rcOpcode; + std::uint32_t magn = 0; + + constexpr InstructionEncodingInfo(std::uint32_t value, + rx::cell::ppu::Opcode opcode, + rx::cell::ppu::Opcode rcOpcode) + : value(value), opcode(opcode), rcOpcode(rcOpcode) {} + + constexpr InstructionEncodingInfo(std::uint32_t value, + rx::cell::ppu::Opcode opcode, + rx::cell::ppu::Opcode rcOpcode, + std::uint32_t magn) + : value(value), opcode(opcode), rcOpcode(rcOpcode), magn(magn) {} +}; + +static constexpr rx::cell::ppu::DecoderTable +buildOpcodeTable() { + // Main opcodes (field 0..5) + rx::cell::ppu::DecoderTable result; + result.fill(rx::cell::ppu::Opcode::Invalid); + + auto fill_table = + [&](std::uint32_t main_op, std::uint32_t count, std::uint32_t sh, + std::initializer_list entries) noexcept { + if (sh < 11) { + for (const auto &v : entries) { + for (std::uint32_t i = 0; i < 1u << (v.magn + (11 - sh - count)); + i++) { + for (std::uint32_t j = 0; j < 1u << sh; j++) { + const std::uint32_t k = + (((i << (count - v.magn)) | v.value) << sh) | j; + result[(k << 6) | main_op] = i & 1 ? v.rcOpcode : v.opcode; + } + } + } + } else { + // Main table (special case) + for (const auto &v : entries) { + for (std::uint32_t i = 0; i < 1u << 11; i++) { + result[i << 6 | v.value] = i & 1 ? v.rcOpcode : v.opcode; + } + } + } + }; + +#define GET(name) rx::cell::ppu::Opcode::name, rx::cell::ppu::Opcode::name +#define GETRC(name) rx::cell::ppu::Opcode::name, rx::cell::ppu::Opcode::name##_ + + fill_table( + 0x00, 6, -1, + { + {0x02, GET(TDI)}, {0x03, GET(TWI)}, {0x07, GET(MULLI)}, + {0x08, GET(SUBFIC)}, {0x0a, GET(CMPLI)}, {0x0b, GET(CMPI)}, + {0x0c, GET(ADDIC)}, {0x0d, GET(ADDIC)}, {0x0e, GET(ADDI)}, + {0x0f, GET(ADDIS)}, {0x10, GET(BC)}, {0x11, GET(SC)}, + {0x12, GET(B)}, {0x14, GETRC(RLWIMI)}, {0x15, GETRC(RLWINM)}, + {0x17, GETRC(RLWNM)}, {0x18, GET(ORI)}, {0x19, GET(ORIS)}, + {0x1a, GET(XORI)}, {0x1b, GET(XORIS)}, {0x1c, GET(ANDI)}, + {0x1d, GET(ANDIS)}, {0x20, GET(LWZ)}, {0x21, GET(LWZU)}, + {0x22, GET(LBZ)}, {0x23, GET(LBZU)}, {0x24, GET(STW)}, + {0x25, GET(STWU)}, {0x26, GET(STB)}, {0x27, GET(STBU)}, + {0x28, GET(LHZ)}, {0x29, GET(LHZU)}, {0x2a, GET(LHA)}, + {0x2b, GET(LHAU)}, {0x2c, GET(STH)}, {0x2d, GET(STHU)}, + {0x2e, GET(LMW)}, {0x2f, GET(STMW)}, {0x30, GET(LFS)}, + {0x31, GET(LFSU)}, {0x32, GET(LFD)}, {0x33, GET(LFDU)}, + {0x34, GET(STFS)}, {0x35, GET(STFSU)}, {0x36, GET(STFD)}, + {0x37, GET(STFDU)}, + }); + + // Group 0x04 opcodes (field 21..31) + fill_table(0x04, 11, 0, + { + {0x0, GET(VADDUBM)}, {0x2, GET(VMAXUB)}, + {0x4, GET(VRLB)}, {0x006, GET(VCMPEQUB)}, + {0x406, GET(VCMPEQUB_)}, {0x8, GET(VMULOUB)}, + {0xa, GET(VADDFP)}, {0xc, GET(VMRGHB)}, + {0xe, GET(VPKUHUM)}, + + {0x20, GET(VMHADDSHS), 5}, {0x21, GET(VMHRADDSHS), 5}, + {0x22, GET(VMLADDUHM), 5}, {0x24, GET(VMSUMUBM), 5}, + {0x25, GET(VMSUMMBM), 5}, {0x26, GET(VMSUMUHM), 5}, + {0x27, GET(VMSUMUHS), 5}, {0x28, GET(VMSUMSHM), 5}, + {0x29, GET(VMSUMSHS), 5}, {0x2a, GET(VSEL), 5}, + {0x2b, GET(VPERM), 5}, {0x2c, GET(VSLDOI), 5}, + {0x2e, GET(VMADDFP), 5}, {0x2f, GET(VNMSUBFP), 5}, + + {0x40, GET(VADDUHM)}, {0x42, GET(VMAXUH)}, + {0x44, GET(VRLH)}, {0x046, GET(VCMPEQUH)}, + {0x446, GET(VCMPEQUH_)}, {0x48, GET(VMULOUH)}, + {0x4a, GET(VSUBFP)}, {0x4c, GET(VMRGHH)}, + {0x4e, GET(VPKUWUM)}, 
{0x80, GET(VADDUWM)}, + {0x82, GET(VMAXUW)}, {0x84, GET(VRLW)}, + {0x086, GET(VCMPEQUW)}, {0x486, GET(VCMPEQUW_)}, + {0x8c, GET(VMRGHW)}, {0x8e, GET(VPKUHUS)}, + {0x0c6, GET(VCMPEQFP)}, {0x4c6, GET(VCMPEQFP_)}, + {0xce, GET(VPKUWUS)}, + + {0x102, GET(VMAXSB)}, {0x104, GET(VSLB)}, + {0x108, GET(VMULOSB)}, {0x10a, GET(VREFP)}, + {0x10c, GET(VMRGLB)}, {0x10e, GET(VPKSHUS)}, + {0x142, GET(VMAXSH)}, {0x144, GET(VSLH)}, + {0x148, GET(VMULOSH)}, {0x14a, GET(VRSQRTEFP)}, + {0x14c, GET(VMRGLH)}, {0x14e, GET(VPKSWUS)}, + {0x180, GET(VADDCUW)}, {0x182, GET(VMAXSW)}, + {0x184, GET(VSLW)}, {0x18a, GET(VEXPTEFP)}, + {0x18c, GET(VMRGLW)}, {0x18e, GET(VPKSHSS)}, + {0x1c4, GET(VSL)}, {0x1c6, GET(VCMPGEFP)}, + {0x5c6, GET(VCMPGEFP_)}, {0x1ca, GET(VLOGEFP)}, + {0x1ce, GET(VPKSWSS)}, {0x200, GET(VADDUBS)}, + {0x202, GET(VMINUB)}, {0x204, GET(VSRB)}, + {0x206, GET(VCMPGTUB)}, {0x606, GET(VCMPGTUB_)}, + {0x208, GET(VMULEUB)}, {0x20a, GET(VRFIN)}, + {0x20c, GET(VSPLTB)}, {0x20e, GET(VUPKHSB)}, + {0x240, GET(VADDUHS)}, {0x242, GET(VMINUH)}, + {0x244, GET(VSRH)}, {0x246, GET(VCMPGTUH)}, + {0x646, GET(VCMPGTUH_)}, {0x248, GET(VMULEUH)}, + {0x24a, GET(VRFIZ)}, {0x24c, GET(VSPLTH)}, + {0x24e, GET(VUPKHSH)}, {0x280, GET(VADDUWS)}, + {0x282, GET(VMINUW)}, {0x284, GET(VSRW)}, + {0x286, GET(VCMPGTUW)}, {0x686, GET(VCMPGTUW_)}, + {0x28a, GET(VRFIP)}, {0x28c, GET(VSPLTW)}, + {0x28e, GET(VUPKLSB)}, {0x2c4, GET(VSR)}, + {0x2c6, GET(VCMPGTFP)}, {0x6c6, GET(VCMPGTFP_)}, + {0x2ca, GET(VRFIM)}, {0x2ce, GET(VUPKLSH)}, + {0x300, GET(VADDSBS)}, {0x302, GET(VMINSB)}, + {0x304, GET(VSRAB)}, {0x306, GET(VCMPGTSB)}, + {0x706, GET(VCMPGTSB_)}, {0x308, GET(VMULESB)}, + {0x30a, GET(VCFUX)}, {0x30c, GET(VSPLTISB)}, + {0x30e, GET(VPKPX)}, {0x340, GET(VADDSHS)}, + {0x342, GET(VMINSH)}, {0x344, GET(VSRAH)}, + {0x346, GET(VCMPGTSH)}, {0x746, GET(VCMPGTSH_)}, + {0x348, GET(VMULESH)}, {0x34a, GET(VCFSX)}, + {0x34c, GET(VSPLTISH)}, {0x34e, GET(VUPKHPX)}, + {0x380, GET(VADDSWS)}, {0x382, GET(VMINSW)}, + {0x384, GET(VSRAW)}, {0x386, GET(VCMPGTSW)}, + {0x786, GET(VCMPGTSW_)}, {0x38a, GET(VCTUXS)}, + {0x38c, GET(VSPLTISW)}, {0x3c6, GET(VCMPBFP)}, + {0x7c6, GET(VCMPBFP_)}, {0x3ca, GET(VCTSXS)}, + {0x3ce, GET(VUPKLPX)}, {0x400, GET(VSUBUBM)}, + {0x402, GET(VAVGUB)}, {0x404, GET(VAND)}, + {0x40a, GET(VMAXFP)}, {0x40c, GET(VSLO)}, + {0x440, GET(VSUBUHM)}, {0x442, GET(VAVGUH)}, + {0x444, GET(VANDC)}, {0x44a, GET(VMINFP)}, + {0x44c, GET(VSRO)}, {0x480, GET(VSUBUWM)}, + {0x482, GET(VAVGUW)}, {0x484, GET(VOR)}, + {0x4c4, GET(VXOR)}, {0x502, GET(VAVGSB)}, + {0x504, GET(VNOR)}, {0x542, GET(VAVGSH)}, + {0x580, GET(VSUBCUW)}, {0x582, GET(VAVGSW)}, + {0x600, GET(VSUBUBS)}, {0x604, GET(MFVSCR)}, + {0x608, GET(VSUM4UBS)}, {0x640, GET(VSUBUHS)}, + {0x644, GET(MTVSCR)}, {0x648, GET(VSUM4SHS)}, + {0x680, GET(VSUBUWS)}, {0x688, GET(VSUM2SWS)}, + {0x700, GET(VSUBSBS)}, {0x708, GET(VSUM4SBS)}, + {0x740, GET(VSUBSHS)}, {0x780, GET(VSUBSWS)}, + {0x788, GET(VSUMSWS)}, + }); + + // Group 0x13 opcodes (field 21..30) + fill_table(0x13, 10, 1, + { + {0x000, GET(MCRF)}, + {0x010, GET(BCLR)}, + {0x012, GET(RFID)}, + {0x021, GET(CRNOR)}, + {0x052, GET(RFSCV)}, + {0x081, GET(CRANDC)}, + {0x096, GET(ISYNC)}, + {0x0c1, GET(CRXOR)}, + {0x0e1, GET(CRNAND)}, + {0x101, GET(CRAND)}, + {0x112, GET(HRFID)}, + {0x121, GET(CREQV)}, + {0x132, GET(URFID)}, + {0x172, GET(STOP)}, + {0x1a1, GET(CRORC)}, + {0x1c1, GET(CROR)}, + {0x210, GET(BCCTR)}, + }); + + // Group 0x1e opcodes (field 27..30) + fill_table(0x1e, 4, 1, + { + {0x0, GETRC(RLDICL)}, + {0x1, GETRC(RLDICL)}, + {0x2, GETRC(RLDICR)}, + 
{0x3, GETRC(RLDICR)}, + {0x4, GETRC(RLDIC)}, + {0x5, GETRC(RLDIC)}, + {0x6, GETRC(RLDIMI)}, + {0x7, GETRC(RLDIMI)}, + {0x8, GETRC(RLDCL)}, + {0x9, GETRC(RLDCR)}, + }); + + // Group 0x1f opcodes (field 21..30) + fill_table(0x1f, 10, 1, + { + {0x000, GET(CMP)}, {0x004, GET(TW)}, + {0x006, GET(LVSL)}, {0x007, GET(LVEBX)}, + {0x008, GETRC(SUBFC)}, {0x208, GETRC(SUBFCO)}, + {0x009, GETRC(MULHDU)}, {0x00a, GETRC(ADDC)}, + {0x20a, GETRC(ADDCO)}, {0x00b, GETRC(MULHWU)}, + {0x013, GET(MFOCRF)}, {0x014, GET(LWARX)}, + {0x015, GET(LDX)}, {0x017, GET(LWZX)}, + {0x018, GETRC(SLW)}, {0x01a, GETRC(CNTLZW)}, + {0x01b, GETRC(SLD)}, {0x01c, GETRC(AND)}, + {0x020, GET(CMPL)}, {0x026, GET(LVSR)}, + {0x027, GET(LVEHX)}, {0x028, GETRC(SUBF)}, + {0x228, GETRC(SUBFO)}, {0x035, GET(LDUX)}, + {0x036, GET(DCBST)}, {0x037, GET(LWZUX)}, + {0x03a, GETRC(CNTLZD)}, {0x03c, GETRC(ANDC)}, + {0x044, GET(TD)}, {0x047, GET(LVEWX)}, + {0x049, GETRC(MULHD)}, {0x04b, GETRC(MULHW)}, + {0x054, GET(LDARX)}, {0x056, GET(DCBF)}, + {0x057, GET(LBZX)}, {0x067, GET(LVX)}, + {0x068, GETRC(NEG)}, {0x268, GETRC(NEGO)}, + {0x077, GET(LBZUX)}, {0x07c, GETRC(NOR)}, + {0x087, GET(STVEBX)}, {0x088, GETRC(SUBFE)}, + {0x288, GETRC(SUBFEO)}, {0x08a, GETRC(ADDE)}, + {0x28a, GETRC(ADDEO)}, {0x090, GET(MTOCRF)}, + {0x095, GET(STDX)}, {0x096, GET(STWCX)}, + {0x097, GET(STWX)}, {0x0a7, GET(STVEHX)}, + {0x0b5, GET(STDUX)}, {0x0b7, GET(STWUX)}, + {0x0c7, GET(STVEWX)}, {0x0c8, GETRC(SUBFZE)}, + {0x2c8, GETRC(SUBFZEO)}, {0x0ca, GETRC(ADDZE)}, + {0x2ca, GETRC(ADDZEO)}, {0x0d6, GET(STDCX)}, + {0x0d7, GET(STBX)}, {0x0e7, GET(STVX)}, + {0x0e8, GETRC(SUBFME)}, {0x2e8, GETRC(SUBFMEO)}, + {0x0e9, GETRC(MULLD)}, {0x2e9, GETRC(MULLDO)}, + {0x0ea, GETRC(ADDME)}, {0x2ea, GETRC(ADDMEO)}, + {0x0eb, GETRC(MULLW)}, {0x2eb, GETRC(MULLWO)}, + {0x0f6, GET(DCBTST)}, {0x0f7, GET(STBUX)}, + {0x10a, GETRC(ADD)}, {0x30a, GETRC(ADDO)}, + {0x116, GET(DCBT)}, {0x117, GET(LHZX)}, + {0x11c, GETRC(EQV)}, {0x136, GET(ECIWX)}, + {0x137, GET(LHZUX)}, {0x13c, GETRC(XOR)}, + {0x153, GET(MFSPR)}, {0x155, GET(LWAX)}, + {0x156, GET(DST)}, {0x157, GET(LHAX)}, + {0x167, GET(LVXL)}, {0x173, GET(MFTB)}, + {0x175, GET(LWAUX)}, {0x176, GET(DSTST)}, + {0x177, GET(LHAUX)}, {0x197, GET(STHX)}, + {0x19c, GETRC(ORC)}, {0x1b6, GET(ECOWX)}, + {0x1b7, GET(STHUX)}, {0x1bc, GETRC(OR)}, + {0x1c9, GETRC(DIVDU)}, {0x3c9, GETRC(DIVDUO)}, + {0x1cb, GETRC(DIVWU)}, {0x3cb, GETRC(DIVWUO)}, + {0x1d3, GET(MTSPR)}, {0x1d6, GET(DCBI)}, + {0x1dc, GETRC(NAND)}, {0x1e7, GET(STVXL)}, + {0x1e9, GETRC(DIVD)}, {0x3e9, GETRC(DIVDO)}, + {0x1eb, GETRC(DIVW)}, {0x3eb, GETRC(DIVWO)}, + {0x207, GET(LVLX)}, {0x214, GET(LDBRX)}, + {0x215, GET(LSWX)}, {0x216, GET(LWBRX)}, + {0x217, GET(LFSX)}, {0x218, GETRC(SRW)}, + {0x21b, GETRC(SRD)}, {0x227, GET(LVRX)}, + {0x237, GET(LFSUX)}, {0x255, GET(LSWI)}, + {0x256, GET(SYNC)}, {0x257, GET(LFDX)}, + {0x277, GET(LFDUX)}, {0x287, GET(STVLX)}, + {0x294, GET(STDBRX)}, {0x295, GET(STSWX)}, + {0x296, GET(STWBRX)}, {0x297, GET(STFSX)}, + {0x2a7, GET(STVRX)}, {0x2b7, GET(STFSUX)}, + {0x2d5, GET(STSWI)}, {0x2d7, GET(STFDX)}, + {0x2f7, GET(STFDUX)}, {0x307, GET(LVLXL)}, + {0x316, GET(LHBRX)}, {0x318, GETRC(SRAW)}, + {0x31a, GETRC(SRAD)}, {0x327, GET(LVRXL)}, + {0x336, GET(DSS)}, {0x338, GETRC(SRAWI)}, + {0x33a, GETRC(SRADI)}, {0x33b, GETRC(SRADI)}, + {0x356, GET(EIEIO)}, {0x387, GET(STVLXL)}, + {0x396, GET(STHBRX)}, {0x39a, GETRC(EXTSH)}, + {0x3a7, GET(STVRXL)}, {0x3ba, GETRC(EXTSB)}, + {0x3d7, GET(STFIWX)}, {0x3da, GETRC(EXTSW)}, + {0x3d6, GET(ICBI)}, {0x3f6, GET(DCBZ)}, + }); + + // Group 0x3a opcodes 
(field 30..31) + fill_table(0x3a, 2, 0, + { + {0x0, GET(LD)}, + {0x1, GET(LDU)}, + {0x2, GET(LWA)}, + }); + + // Group 0x3b opcodes (field 21..30) + fill_table(0x3b, 10, 1, + { + {0x12, GETRC(FDIVS), 5}, + {0x14, GETRC(FSUBS), 5}, + {0x15, GETRC(FADDS), 5}, + {0x16, GETRC(FSQRTS), 5}, + {0x18, GETRC(FRES), 5}, + {0x19, GETRC(FMULS), 5}, + {0x1c, GETRC(FMSUBS), 5}, + {0x1d, GETRC(FMADDS), 5}, + {0x1e, GETRC(FNMSUBS), 5}, + {0x1f, GETRC(FNMADDS), 5}, + }); + + // Group 0x3e opcodes (field 30..31) + fill_table(0x3e, 2, 0, + { + {0x0, GET(STD)}, + {0x1, GET(STDU)}, + }); + + // Group 0x3f opcodes (field 21..30) + fill_table(0x3f, 10, 1, + { + {0x026, GETRC(MTFSB1)}, {0x040, GET(MCRFS)}, + {0x046, GETRC(MTFSB0)}, {0x086, GETRC(MTFSFI)}, + {0x247, GETRC(MFFS)}, {0x2c7, GETRC(MTFSF)}, + + {0x000, GET(FCMPU)}, {0x00c, GETRC(FRSP)}, + {0x00e, GETRC(FCTIW)}, {0x00f, GETRC(FCTIWZ)}, + + {0x012, GETRC(FDIV), 5}, {0x014, GETRC(FSUB), 5}, + {0x015, GETRC(FADD), 5}, {0x016, GETRC(FSQRT), 5}, + {0x017, GETRC(FSEL), 5}, {0x019, GETRC(FMUL), 5}, + {0x01a, GETRC(FRSQRTE), 5}, {0x01c, GETRC(FMSUB), 5}, + {0x01d, GETRC(FMADD), 5}, {0x01e, GETRC(FNMSUB), 5}, + {0x01f, GETRC(FNMADD), 5}, + + {0x020, GET(FCMPO)}, {0x028, GETRC(FNEG)}, + {0x048, GETRC(FMR)}, {0x088, GETRC(FNABS)}, + {0x108, GETRC(FABS)}, {0x32e, GETRC(FCTID)}, + {0x32f, GETRC(FCTIDZ)}, {0x34e, GETRC(FCFID)}, + }); + + return result; +} + +rx::cell::ppu::DecoderTable + rx::cell::ppu::g_ppuOpcodeTable = buildOpcodeTable(); + +rx::cell::ppu::Opcode rx::cell::ppu::fixOpcode(Opcode opcode, + std::uint32_t instruction) { + auto inst = std::bit_cast(instruction); + + if (opcode == Opcode::ADDI) { + if (inst.ra == 0) { + return Opcode::LI; + } + + return opcode; + } + + if (opcode == Opcode::ADDIS) { + if (inst.ra == 0) { + return Opcode::LIS; + } + + return opcode; + } + + if (opcode == Opcode::CRNOR) { + if (inst.crba == inst.crbb) { + return Opcode::CRNOT; + } + + return opcode; + } + + if (opcode == Opcode::B) { + if (inst.aa && inst.lk) { + return Opcode::BLA; + } else if (inst.lk) { + return Opcode::BL; + } else if (inst.aa) { + return Opcode::BA; + } + + return opcode; + } + + if (opcode == Opcode::ORI) { + if (inst.rs == 0 && inst.ra == 0 && inst.uimm16 == 0) { + return Opcode::NOP; + } + + if (inst.uimm16 == 0) { + return Opcode::MR; + } + + return opcode; + } + + if (opcode == Opcode::ORIS) { + if (inst.rs == 0 && inst.ra == 0 && inst.uimm16 == 0) { + return Opcode::NOP; + } + + return opcode; + } + + if (opcode == Opcode::RLDICL) { + if (inst.sh64 == 0) { + return Opcode::CLRLDI; + } + + if (inst.mbe64 == 0) { + return Opcode::ROTLDI; + } + + if (inst.mbe64 == 64 - inst.sh64) { + return Opcode::SRDI; + } + + return opcode; + } + + if (opcode == Opcode::CMP) { + if (inst.l10) { + return Opcode::CMPD; + } + return Opcode::CMPW; + } + + if (opcode == Opcode::CMPL) { + if (inst.l10) { + return Opcode::CMPLD; + } + return Opcode::CMPLW; + } + + if (opcode == Opcode::NOR) { + if (inst.rs == inst.rb) { + return Opcode::NOT; + } + + return opcode; + } + + if (opcode == Opcode::MTOCRF) { + if (!inst.l10) { + return Opcode::MTCRF; + } + + return opcode; + } + + if (opcode == Opcode::MFSPR) { + auto n = (inst.spr >> 5) | ((inst.spr & 0x1f) << 5); + + switch (n) { + case 1: + return Opcode::MFXER; + case 8: + return Opcode::MFLR; + case 9: + return Opcode::MFCTR; + } + + return opcode; + } + + if (opcode == Opcode::MFTB) { + auto n = (inst.spr >> 5) | ((inst.spr & 0x1f) << 5); + + switch (n) { + case 268: + return Opcode::MFTB; + case 269: + return 
Opcode::MFTBU; + } + + return opcode; + } + + if (opcode == Opcode::OR) { + if (inst.rs == inst.rb) { + switch (inst.raw) { + case 0x7c210b78: + return Opcode::CCTPL; + case 0x7c421378: + return Opcode::CCTPM; + case 0x7c631b78: + return Opcode::CCTPH; + case 0x7f9ce378: + return Opcode::DB8CYC; + case 0x7fbdeb78: + return Opcode::DB10CYC; + case 0x7fdef378: + return Opcode::DB12CYC; + case 0x7ffffb78: + return Opcode::DB16CYC; + } + + return Opcode::MR; + } + + return opcode; + } + + return opcode; +} diff --git a/rx/include/rx/BitField.h b/rx/include/rx/BitField.h new file mode 100644 index 000000000..a644bfde9 --- /dev/null +++ b/rx/include/rx/BitField.h @@ -0,0 +1,244 @@ +#pragma once + +#include +#include + +#ifndef _MSC_VER +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Weffc++" +#endif + +namespace rx { +template struct BitFieldBase { + using type = T; + using vtype = std::common_type_t; + using utype = std::make_unsigned_t; + + static constexpr bool can_be_packed = + N < (sizeof(int) * 8 + (std::is_unsigned_v ? 1 : 0)) && + sizeof(vtype) > sizeof(int); + using compact_type = std::conditional_t< + can_be_packed, + std::conditional_t, std::size_t, int>, vtype>; + + // Datatype bitsize + static constexpr std::size_t bitmax = sizeof(T) * 8; + static_assert(N - 1 < bitmax, "BitFieldBase<> error: N out of bounds"); + + // Field bitsize + static constexpr std::size_t bitsize = N; + + // All ones mask + static constexpr utype mask1 = static_cast(~static_cast(0)); + + // Value mask + static constexpr utype vmask = mask1 >> (bitmax - bitsize); + +protected: + type m_data; +}; + +// Bitfield accessor (N bits from I position, 0 is LSB) +template +struct BitField : BitFieldBase { + using type = typename BitField::type; + using vtype = typename BitField::vtype; + using utype = typename BitField::utype; + using compact_type = typename BitField::compact_type; + + // Field offset + static constexpr std::size_t bitpos = I; + static_assert(bitpos + N <= BitField::bitmax, + "BitField<> error: I out of bounds"); + + // Get bitmask of size N, at I pos + static constexpr utype data_mask() { + return static_cast( + static_cast(BitField::mask1 >> + (BitField::bitmax - BitField::bitsize)) + << bitpos); + } + + // Bitfield extraction + static constexpr compact_type extract(const T &data) noexcept { + if constexpr (std::is_signed_v) { + // Load signed value (sign-extended) + return static_cast( + static_cast(static_cast(data) + << (BitField::bitmax - bitpos - N)) >> + (BitField::bitmax - N)); + } else { + // Load unsigned value + return static_cast((static_cast(data) >> bitpos) & + BitField::vmask); + } + } + + // Bitfield insertion + static constexpr vtype insert(compact_type value) { + return static_cast((value & BitField::vmask) << bitpos); + } + + // Load bitfield value + constexpr operator compact_type() const noexcept { + return extract(this->m_data); + } + + // Load raw data with mask applied + constexpr T unshifted() const { + return static_cast(this->m_data & data_mask()); + } + + // Optimized bool conversion (must be removed if inappropriate) + explicit constexpr operator bool() const noexcept { + return unshifted() != 0u; + } + + // Store bitfield value + BitField &operator=(compact_type value) noexcept { + this->m_data = + static_cast((this->m_data & ~data_mask()) | insert(value)); + return *this; + } + + compact_type operator++(int) { + compact_type result = *this; + *this = static_cast(result + 1u); + return result; + } + + BitField &operator++() { + return *this = 
static_cast(*this + 1u); + } + + compact_type operator--(int) { + compact_type result = *this; + *this = static_cast(result - 1u); + return result; + } + + BitField &operator--() { + return *this = static_cast(*this - 1u); + } + + BitField &operator+=(compact_type right) { + return *this = static_cast(*this + right); + } + + BitField &operator-=(compact_type right) { + return *this = static_cast(*this - right); + } + + BitField &operator*=(compact_type right) { + return *this = static_cast(*this * right); + } + + BitField &operator&=(compact_type right) { + this->m_data &= static_cast( + ((static_cast(right + 0u) & BitField::vmask) << bitpos) | + ~(BitField::vmask << bitpos)); + return *this; + } + + BitField &operator|=(compact_type right) { + this->m_data |= static_cast( + (static_cast(right + 0u) & BitField::vmask) << bitpos); + return *this; + } + + BitField &operator^=(compact_type right) { + this->m_data ^= static_cast( + (static_cast(right + 0u) & BitField::vmask) << bitpos); + return *this; + } +}; + +// Field pack (concatenated from left to right) +template +struct BitFieldPack + : BitFieldBase::bitsize> { + using type = typename BitFieldPack::type; + using vtype = typename BitFieldPack::vtype; + using utype = typename BitFieldPack::utype; + using compact_type = typename BitFieldPack::compact_type; + + // Get disjunction of all "data" masks of concatenated values + static constexpr vtype data_mask() { + return static_cast(F::data_mask() | + BitFieldPack::data_mask()); + } + + // Extract all bitfields and concatenate + static constexpr compact_type extract(const type &data) { + return static_cast(static_cast(F::extract(data)) + << BitFieldPack::bitsize | + BitFieldPack::extract(data)); + } + + // Split bitfields and insert them + static constexpr vtype insert(compact_type value) { + return static_cast( + F::insert(value >> BitFieldPack::bitsize) | + BitFieldPack::insert(value)); + } + + // Load value + constexpr operator compact_type() const noexcept { + return extract(this->m_data); + } + + // Store value + BitFieldPack &operator=(compact_type value) noexcept { + this->m_data = (this->m_data & ~data_mask()) | insert(value); + return *this; + } +}; + +// Empty field pack (recursion terminator) +template <> struct BitFieldPack { + static constexpr std::size_t bitsize = 0; + + static constexpr std::size_t data_mask() { return 0; } + + template + static constexpr auto extract(const T &) -> decltype(+T()) { + return 0; + } + + template static constexpr T insert(T /*value*/) { return 0; } +}; + +// Fixed field (provides constant values in field pack) +template +struct BitFieldFixed : BitFieldBase { + using type = typename BitFieldFixed::type; + using vtype = typename BitFieldFixed::vtype; + + // Return constant value + static constexpr vtype extract(const type &) { + static_assert((V & BitFieldFixed::vmask) == V, + "BitFieldFixed<> error: V out of bounds"); + return V; + } + + // Get value + constexpr operator vtype() const noexcept { return V; } +}; +} // namespace rx + +template +struct std::common_type, rx::BitField> + : std::common_type {}; + +template +struct std::common_type, T2> + : std::common_type> {}; + +template +struct std::common_type> + : std::common_type, T2> {}; + +#ifndef _MSC_VER +#pragma GCC diagnostic pop +#endif diff --git a/rx/include/rx/BitSet.h b/rx/include/rx/BitSet.h new file mode 100644 index 000000000..55fe47ab1 --- /dev/null +++ b/rx/include/rx/BitSet.h @@ -0,0 +1,268 @@ +#pragma once + +/* +This header implements bs_t<> class for scoped enum types (enum 
class). +To enable bs_t<>, enum scope must contain `__bitset_enum_max` entry. + +enum class flagzz : u32 +{ + flag1, // Bit indices start from zero + flag2, +}; + +This also enables helper operators for this enum type. + +Examples: +`flagzz::flag1 | flagzz::flag2` - bitset union +`flagzz::flag1 & ~flagzz::flag2` - bitset difference +Intersection (&) and symmetric difference (^) is also available. +*/ + +#include "refl.hpp" +#include "types.hpp" + +namespace rx { +template +concept BitSetEnum = + std::is_enum_v && requires(T x) { rx::fieldCount > 0; }; + +template class BitSet; + +namespace detail { +template class InvertedBitSet final { + using underlying_type = std::underlying_type_t; + underlying_type m_data; + constexpr InvertedBitSet(underlying_type data) : m_data(data) {} + friend BitSet; +}; +} // namespace detail + +// Bitset type for enum class with available bits [0, fieldCount) +template class BitSet final { +public: + // Underlying type + using underlying_type = std::underlying_type_t; + +private: + // Underlying value + underlying_type m_data; + + // Value constructor + constexpr explicit BitSet(int, underlying_type data) noexcept + : m_data(data) {} + +public: + static constexpr usz bitmax = sizeof(T) * 8; + static constexpr usz bitsize = + static_cast(rx::fieldCount); + + static_assert(std::is_enum_v, + "BitSet<> error: invalid type (must be enum)"); + static_assert(bitsize <= bitmax, + "BitSet<> error: failed to determine enum field count"); + static_assert(bitsize != bitmax || std::is_unsigned_v, + "BitSet<> error: invalid field count (sign bit)"); + + // Helper function + static constexpr underlying_type shift(T value) { + return static_cast(1) + << static_cast(value); + } + + BitSet() = default; + + // Construct from a single bit + constexpr BitSet(T bit) noexcept : m_data(shift(bit)) {} + + // Test for empty bitset + constexpr explicit operator bool() const noexcept { return m_data != 0; } + + // Extract underlying data + constexpr explicit operator underlying_type() const noexcept { + return m_data; + } + + constexpr detail::InvertedBitSet operator~() const { return {m_data}; } + + constexpr BitSet &operator+=(BitSet rhs) { + m_data |= static_cast(rhs); + return *this; + } + + constexpr BitSet &operator-=(BitSet rhs) { + m_data &= ~static_cast(rhs); + return *this; + } + + constexpr BitSet without(BitSet rhs) const { + BitSet result = *this; + result.m_data &= ~static_cast(rhs); + return result; + } + + constexpr BitSet with(BitSet rhs) const { + BitSet result = *this; + result.m_data |= static_cast(rhs); + return result; + } + + constexpr BitSet &operator&=(BitSet rhs) { + m_data &= static_cast(rhs); + return *this; + } + + constexpr BitSet &operator^=(BitSet rhs) { + m_data ^= static_cast(rhs); + return *this; + } + + [[deprecated("Use operator|")]] friend constexpr BitSet + operator+(BitSet lhs, BitSet rhs) { + return BitSet(0, lhs.m_data | rhs.m_data); + } + + friend constexpr BitSet operator-(BitSet lhs, BitSet rhs) { + return BitSet(0, lhs.m_data & ~rhs.m_data); + } + + friend constexpr BitSet operator|(BitSet lhs, BitSet rhs) { + return BitSet(0, lhs.m_data | rhs.m_data); + } + + friend constexpr BitSet operator&(BitSet lhs, BitSet rhs) { + return BitSet(0, lhs.m_data & rhs.m_data); + } + + friend constexpr BitSet operator&(BitSet lhs, detail::InvertedBitSet rhs) { + return BitSet(0, lhs.m_data & rhs.m_data); + } + + friend constexpr BitSet operator^(BitSet lhs, BitSet rhs) { + return BitSet(0, lhs.m_data ^ rhs.m_data); + } + + constexpr bool operator==(BitSet 
rhs) const noexcept { + return m_data == rhs.m_data; + } + + constexpr bool test_and_set(T bit) { + bool r = (m_data & shift(bit)) != 0; + m_data |= shift(bit); + return r; + } + + constexpr bool test_and_reset(T bit) { + bool r = (m_data & shift(bit)) != 0; + m_data &= ~shift(bit); + return r; + } + + constexpr bool test_and_complement(T bit) { + bool r = (m_data & shift(bit)) != 0; + m_data ^= shift(bit); + return r; + } + + constexpr bool any_of(BitSet arg) const { return (m_data & arg.m_data) != 0; } + + constexpr bool all_of(BitSet arg) const { + return (m_data & arg.m_data) == arg.m_data; + } + + constexpr bool none_of(BitSet arg) const { + return (m_data & arg.m_data) == 0; + } +}; + +namespace bitset { +// Unary '+' operator: promote plain enum value to bitset value +template +[[deprecated("Use toBitSet(bit)")]] constexpr BitSet operator+(T bit) { + return BitSet(bit); +} + +template constexpr BitSet toBitSet(T bit) { + return BitSet(bit); +} + +// Binary '+' operator: bitset union +template + requires(std::is_constructible_v, U>) +[[deprecated("Use operator|")]] constexpr BitSet operator+(T lhs, + const U &rhs) { + return BitSet(lhs) | BitSet(rhs); +} + +// Binary '+' operator: bitset union +template + requires(std::is_constructible_v, U> && !std::is_enum_v) +[[deprecated("Use operator|")]] constexpr BitSet operator+(const U &lhs, + T rhs) { + return BitSet(lhs) | BitSet(rhs); +} + +// Binary '|' operator: bitset union +template + requires(std::is_constructible_v, U>) +constexpr BitSet operator|(T lhs, const U &rhs) { + return BitSet(lhs) | BitSet(rhs); +} + +// Binary '|' operator: bitset union +template + requires(std::is_constructible_v, U> && !std::is_enum_v) +constexpr BitSet operator|(const U &lhs, T rhs) { + return BitSet(lhs) | BitSet(rhs); +} + +// Binary '-' operator: bitset difference +template + requires(std::is_constructible_v, U>) +constexpr BitSet operator-(T lhs, const U &rhs) { + return BitSet(lhs) - BitSet(rhs); +} + +// Binary '-' operator: bitset difference +template + requires(std::is_constructible_v, U> && !std::is_enum_v) +constexpr BitSet operator-(const U &lhs, T rhs) { + return BitSet(lhs) - BitSet(rhs); +} + +// Binary '&' operator: bitset intersection +template + requires(std::is_constructible_v, U>) +constexpr BitSet operator&(T lhs, const U &rhs) { + return BitSet(lhs) & BitSet(rhs); +} + +// Binary '&' operator: bitset intersection +template + requires(std::is_constructible_v, U> && !std::is_enum_v) +constexpr BitSet operator&(const U &lhs, T rhs) { + return BitSet(lhs) & BitSet(rhs); +} + +// Binary '&' operator: bitset intersection +template +constexpr BitSet operator&(T lhs, detail::InvertedBitSet rhs) { + return BitSet(lhs) & rhs; +} + +// Binary '^' operator: bitset symmetric difference +template + requires(std::is_constructible_v, U>) +constexpr BitSet operator^(T lhs, const U &rhs) { + return BitSet(lhs) ^ BitSet(rhs); +} + +// Binary '^' operator: bitset symmetric difference +template + requires(std::is_constructible_v, U> && !std::is_enum_v) +constexpr BitSet operator^(const U &lhs, T rhs) { + return BitSet(lhs) ^ BitSet(rhs); +} +} // namespace bitset +} // namespace rx + +using namespace rx::bitset; diff --git a/rx/include/rx/asm.hpp b/rx/include/rx/asm.hpp new file mode 100644 index 000000000..2faed84ea --- /dev/null +++ b/rx/include/rx/asm.hpp @@ -0,0 +1,358 @@ +#pragma once + +#include "types.hpp" +#include + +extern bool g_use_rtm; +extern u64 g_rtm_tx_limit1; + +#ifdef _M_X64 +#ifdef _MSC_VER +extern "C" { +u32 _xbegin(); +void 
_xend();
+void _mm_pause();
+void _mm_prefetch(const char *, int);
+void _m_prefetchw(const volatile void *);
+
+uchar _rotl8(uchar, uchar);
+ushort _rotl16(ushort, uchar);
+u64 __popcnt64(u64);
+
+s64 __mulh(s64, s64);
+u64 __umulh(u64, u64);
+
+s64 _div128(s64, s64, s64, s64 *);
+u64 _udiv128(u64, u64, u64, u64 *);
+void __debugbreak();
+}
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+#endif
+
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
+namespace rx {
+// Try to prefetch to Level 2 cache since it's not split to data/code on most
+// processors
+template <typename T> constexpr void prefetch_exec(T func) {
+  if (std::is_constant_evaluated()) {
+    return;
+  }
+
+  const u64 value = reinterpret_cast<u64>(func);
+  const void *ptr = reinterpret_cast<const void *>(value);
+
+#ifdef _M_X64
+  return _mm_prefetch(static_cast<const char *>(ptr), _MM_HINT_T1);
+#else
+  return __builtin_prefetch(ptr, 0, 2);
+#endif
+}
+
+// Try to prefetch to Level 1 cache
+constexpr void prefetch_read(const void *ptr) {
+  if (std::is_constant_evaluated()) {
+    return;
+  }
+
+#ifdef _M_X64
+  return _mm_prefetch(static_cast<const char *>(ptr), _MM_HINT_T0);
+#else
+  return __builtin_prefetch(ptr, 0, 3);
+#endif
+}
+
+constexpr void prefetch_write(void *ptr) {
+  if (std::is_constant_evaluated()) {
+    return;
+  }
+
+#if defined(_M_X64) && !defined(__clang__)
+  return _m_prefetchw(ptr);
+#else
+  return __builtin_prefetch(ptr, 1, 0);
+#endif
+}
+
+constexpr u8 rol8(u8 x, u8 n) {
+  if (std::is_constant_evaluated()) {
+    return (x << (n & 7)) | (x >> ((-n & 7)));
+  }
+
+#ifdef _MSC_VER
+  return _rotl8(x, n);
+#elif defined(__clang__)
+  return __builtin_rotateleft8(x, n);
+#elif defined(ARCH_X64)
+  return __builtin_ia32_rolqi(x, n);
+#else
+  return (x << (n & 7)) | (x >> ((-n & 7)));
+#endif
+}
+
+constexpr u16 rol16(u16 x, u16 n) {
+  if (std::is_constant_evaluated()) {
+    return (x << (n & 15)) | (x >> ((-n & 15)));
+  }
+
+#ifdef _MSC_VER
+  return _rotl16(x, static_cast<uchar>(n));
+#elif defined(__clang__)
+  return __builtin_rotateleft16(x, n);
+#elif defined(ARCH_X64)
+  return __builtin_ia32_rolhi(x, n);
+#else
+  return (x << (n & 15)) | (x >> ((-n & 15)));
+#endif
+}
+
+constexpr u32 rol32(u32 x, u32 n) {
+  if (std::is_constant_evaluated()) {
+    return (x << (n & 31)) | (x >> (((0 - n) & 31)));
+  }
+
+#ifdef _MSC_VER
+  return _rotl(x, n);
+#elif defined(__clang__)
+  return __builtin_rotateleft32(x, n);
+#else
+  return (x << (n & 31)) | (x >> (((0 - n) & 31)));
+#endif
+}
+
+constexpr u64 rol64(u64 x, u64 n) {
+  if (std::is_constant_evaluated()) {
+    return (x << (n & 63)) | (x >> (((0 - n) & 63)));
+  }
+
+#ifdef _MSC_VER
+  return _rotl64(x, static_cast<int>(n));
+#elif defined(__clang__)
+  return __builtin_rotateleft64(x, n);
+#else
+  return (x << (n & 63)) | (x >> (((0 - n) & 63)));
+#endif
+}
+
+constexpr u32 popcnt64(u64 v) {
+#if !defined(_MSC_VER) || defined(__SSE4_2__)
+  if (std::is_constant_evaluated())
+#endif
+  {
+    v = (v & 0xaaaaaaaaaaaaaaaa) / 2 + (v & 0x5555555555555555);
+    v = (v & 0xcccccccccccccccc) / 4 + (v & 0x3333333333333333);
+    v = (v & 0xf0f0f0f0f0f0f0f0) / 16 + (v & 0x0f0f0f0f0f0f0f0f);
+    v = (v & 0xff00ff00ff00ff00) / 256 + (v & 0x00ff00ff00ff00ff);
+    v = ((v & 0xffff0000ffff0000) >> 16) + (v & 0x0000ffff0000ffff);
+    return static_cast<u32>((v >> 32) + v);
+  }
+
+#if !defined(_MSC_VER) || defined(__SSE4_2__)
+#ifdef _MSC_VER
+  return static_cast<u32>(__popcnt64(v));
+#else
+  return __builtin_popcountll(v);
+#endif
+#endif
+}
+
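+// The constant-evaluated branch of popcnt64 above is a SWAR reduction: each
+// step adds neighbouring bit fields (1, 2, 4, 8, then 16 bits wide), so the
+// final sum of the two 32-bit halves yields the total number of set bits.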
+constexpr u32 popcnt128(const u128 &v) {
+#ifdef _MSC_VER
+  return popcnt64(v.lo) + popcnt64(v.hi);
+#else
+  return popcnt64(v) + popcnt64(v >> 64);
+#endif
+}
+
+constexpr u64 umulh64(u64 x, u64 y) {
+#ifdef _MSC_VER
+  if (std::is_constant_evaluated())
+#endif
+  {
+    return static_cast<u64>((u128{x} * u128{y}) >> 64);
+  }
+
+#ifdef _MSC_VER
+  return __umulh(x, y);
+#endif
+}
+
+inline s64 mulh64(s64 x, s64 y) {
+#ifdef _MSC_VER
+  return __mulh(x, y);
+#else
+  return (s128{x} * s128{y}) >> 64;
+#endif
+}
+
+inline s64 div128(s64 high, s64 low, s64 divisor, s64 *remainder = nullptr) {
+#ifdef _MSC_VER
+  s64 rem = 0;
+  s64 r = _div128(high, low, divisor, &rem);
+
+  if (remainder) {
+    *remainder = rem;
+  }
+#else
+  const s128 x = (u128{static_cast<u64>(high)} << 64) | u64(low);
+  const s128 r = x / divisor;
+
+  if (remainder) {
+    *remainder = x % divisor;
+  }
+#endif
+  return r;
+}
+
+inline u64 udiv128(u64 high, u64 low, u64 divisor, u64 *remainder = nullptr) {
+#ifdef _MSC_VER
+  u64 rem = 0;
+  u64 r = _udiv128(high, low, divisor, &rem);
+
+  if (remainder) {
+    *remainder = rem;
+  }
+#else
+  const u128 x = (u128{high} << 64) | low;
+  const u128 r = x / divisor;
+
+  if (remainder) {
+    *remainder = x % divisor;
+  }
+#endif
+  return r;
+}
+
+#ifdef _MSC_VER
+inline u128 operator/(u128 lhs, u64 rhs) {
+  u64 rem = 0;
+  return _udiv128(lhs.hi, lhs.lo, rhs, &rem);
+}
+#endif
+
+constexpr u32 ctz128(u128 arg) {
+#ifdef _MSC_VER
+  if (!arg.lo)
+    return std::countr_zero(arg.hi) + 64u;
+  else
+    return std::countr_zero(arg.lo);
+#else
+  if (u64 lo = static_cast<u64>(arg))
+    return std::countr_zero(lo);
+  else
+    return std::countr_zero(arg >> 64) + 64;
+#endif
+}
+
+constexpr u32 clz128(u128 arg) {
+#ifdef _MSC_VER
+  if (arg.hi)
+    return std::countl_zero(arg.hi);
+  else
+    return std::countl_zero(arg.lo) + 64;
+#else
+  if (u64 hi = static_cast<u64>(arg >> 64))
+    return std::countl_zero(hi);
+  else
+    return std::countl_zero(arg) + 64;
+#endif
+}
+
+inline void pause() {
+#if defined(ARCH_ARM64)
+  __asm__ volatile("yield");
+#elif defined(_M_X64)
+  _mm_pause();
+#elif defined(ARCH_X64)
+  __builtin_ia32_pause();
+#else
+#error "Missing pause() implementation"
+#endif
+}
+
+// Align to power of 2
+template <typename T, typename U>
+  requires std::is_unsigned_v<T>
+constexpr std::make_unsigned_t<std::common_type_t<T, U>> align(T value,
+                                                               U align) {
+  return static_cast<std::make_unsigned_t<std::common_type_t<T, U>>>(
+      (value + (align - 1)) & (T{0} - align));
+}
+
+// General purpose aligned division, the result is rounded up not truncated
+template <typename T>
+  requires std::is_unsigned_v<T>
+constexpr T aligned_div(T value, std::type_identity_t<T> align) {
+  return static_cast<T>(value / align + T{!!(value % align)});
+}
+
+// General purpose aligned division, the result is rounded to nearest
+template <typename T>
+  requires std::is_integral_v<T>
+constexpr T rounded_div(T value, std::type_identity_t<T> align) {
+  if constexpr (std::is_unsigned_v<T>) {
+    return static_cast<T>(value / align + T{(value % align) > (align / 2)});
+  }
+
+  return static_cast<T>(value / align +
+                        (value > 0 ? T{(value % align) > (align / 2)}
+                                   : 0 - T{(value % align) < (align / 2)}));
+}
+
+// Multiplying by ratio, semi-resistant to overflows
+template <typename T>
+constexpr T rational_mul(T value, std::type_identity_t<T> numerator,
+                         std::type_identity_t<T> denominator) {
+  if constexpr (sizeof(T) <= sizeof(u64) / 2) {
+    return static_cast<T>(value * u64{numerator} / u64{denominator});
+  }
+
+#if is_u128_emulated
+  if constexpr (sizeof(T) <= sizeof(u128) / 2) {
+    return static_cast<T>(u128_from_mul(value, numerator) / u64{denominator});
+  }
+#endif
+
+  return static_cast<T>(value / denominator * numerator +
+                        (value % denominator) * numerator / denominator);
+}
+
+template <typename T> constexpr T add_saturate(T addend1, T addend2) {
+  return static_cast<T>(~addend1) < addend2 ?
T{umax} + : static_cast(addend1 + addend2); +} + +template constexpr T sub_saturate(T minuend, T subtrahend) { + return minuend < subtrahend ? T{0} : static_cast(minuend - subtrahend); +} + +template constexpr T mul_saturate(T factor1, T factor2) { + return factor1 > 0 && T{umax} / factor1 < factor2 + ? T{umax} + : static_cast(factor1 * factor2); +} + +inline void trigger_write_page_fault(void *ptr) { +#if defined(ARCH_X64) && !defined(_MSC_VER) + __asm__ volatile("lock orl $0, 0(%0)" ::"r"(ptr)); +#elif defined(ARCH_ARM64) && !defined(ANDROID) + u32 value = 0; + u32 *u32_ptr = static_cast(ptr); + __asm__ volatile("ldset %w0, %w0, %1" + : "+r"(value), "=Q"(*u32_ptr) + : "r"(value)); +#else + static_cast *>(ptr)->fetch_or( + 0, std::memory_order::relaxed); +#endif +} +} // namespace rx + +#ifdef _MSC_VER +using rx::operator/; +#endif diff --git a/rx/include/rx/format.hpp b/rx/include/rx/format.hpp index 682999ce3..bed16fbef 100644 --- a/rx/include/rx/format.hpp +++ b/rx/include/rx/format.hpp @@ -218,13 +218,15 @@ struct std::formatter { std::string fieldName; - auto underlying = std::to_underlying(value); + // FIXME: requires C++23 + // auto underlying = std::to_underlying(value); + auto underlying = static_cast(value); if (underlying < 0) { fieldName = queryUnknownField( underlying, std::integral_constant{}, std::make_integer_sequence{}); - } else if (underlying >= rx::fieldCount) { + } else if (static_cast(underlying) >= rx::fieldCount) { fieldName = queryUnknownField( underlying, std::integral_constant>{}, std::make_integer_sequence{}); diff --git a/rx/include/rx/simd.hpp b/rx/include/rx/simd.hpp new file mode 100644 index 000000000..c9ed0feea --- /dev/null +++ b/rx/include/rx/simd.hpp @@ -0,0 +1,2236 @@ +#pragma once + +#include "asm.hpp" +#include "types.hpp" +#include "v128.hpp" + +#if defined(ARCH_X64) +#ifdef _MSC_VER +#include +#else +#include +#endif + +#include +#include +#endif + +#if defined(ARCH_ARM64) +#include +#endif + +#include +#include + +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-parameter" +#elif defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#endif + +namespace rx { +inline v128 gv_select8(const v128 &_cmp, const v128 &_true, const v128 &_false); +inline v128 gv_signselect8(const v128 &bits, const v128 &_true, + const v128 &_false); +inline v128 gv_select16(const v128 &_cmp, const v128 &_true, + const v128 &_false); +inline v128 gv_select32(const v128 &_cmp, const v128 &_true, + const v128 &_false); +inline v128 gv_selectfs(const v128 &_cmp, const v128 &_true, + const v128 &_false); + +inline void gv_set_zeroing_denormals() { +#if defined(ARCH_X64) + u32 cr = _mm_getcsr(); + cr = (cr & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_ZERO_ON; + cr = (cr & ~_MM_DENORMALS_ZERO_MASK) | _MM_DENORMALS_ZERO_ON; + cr = (cr | _MM_MASK_INVALID); + _mm_setcsr(cr); +#elif defined(ARCH_ARM64) + u64 cr; + __asm__ volatile("mrs %0, FPCR" : "=r"(cr)); + cr |= 0x1000000ull; + __asm__ volatile("msr FPCR, %0" ::"r"(cr)); +#else +#error "Not implemented" +#endif +} + +inline void gv_unset_zeroing_denormals() { +#if defined(ARCH_X64) + u32 cr = _mm_getcsr(); + cr = (cr & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_ZERO_OFF; + cr = (cr & ~_MM_DENORMALS_ZERO_MASK) | _MM_DENORMALS_ZERO_OFF; + cr = (cr | _MM_MASK_INVALID); + _mm_setcsr(cr); +#elif defined(ARCH_ARM64) + u64 cr; + __asm__ volatile("mrs %0, FPCR" : "=r"(cr)); + cr &= ~0x1000000ull; + __asm__ volatile("msr FPCR, %0" ::"r"(cr)); +#else +#error 
"Not implemented" +#endif +} + +inline v128 gv_bcst8(u8 value) { +#if defined(ARCH_X64) + return _mm_set1_epi8(value); +#elif defined(ARCH_ARM64) + return vdupq_n_s8(value); +#endif +} + +inline v128 gv_bcst16(u16 value) { +#if defined(ARCH_X64) + return _mm_set1_epi16(value); +#elif defined(ARCH_ARM64) + return vdupq_n_s16(value); +#endif +} + +inline v128 gv_bcst32(u32 value) { +#if defined(ARCH_X64) + return _mm_set1_epi32(value); +#elif defined(ARCH_ARM64) + return vdupq_n_s32(value); +#endif +} + +inline v128 gv_bcst64(u64 value) { +#if defined(ARCH_X64) + return _mm_set1_epi64x(value); +#elif defined(ARCH_ARM64) + return vdupq_n_s64(value); +#endif +} + +inline v128 gv_bcstfs(f32 value) { +#if defined(ARCH_X64) + return _mm_set1_ps(value); +#elif defined(ARCH_ARM64) + return vdupq_n_f32(value); +#endif +} + +inline v128 gv_and32(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_and_si128(a, b); +#elif defined(ARCH_ARM64) + return vandq_s32(a, b); +#endif +} + +inline v128 gv_andfs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_and_ps(a, b); +#elif defined(ARCH_ARM64) + return vandq_s32(a, b); +#endif +} + +inline v128 gv_andn32(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_andnot_si128(a, b); +#elif defined(ARCH_ARM64) + return vbicq_s32(b, a); +#endif +} + +inline v128 gv_andnfs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_andnot_ps(a, b); +#elif defined(ARCH_ARM64) + return vbicq_s32(b, a); +#endif +} + +inline v128 gv_or32(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_or_si128(a, b); +#elif defined(ARCH_ARM64) + return vorrq_s32(a, b); +#endif +} + +inline v128 gv_orfs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_or_ps(a, b); +#elif defined(ARCH_ARM64) + return vorrq_s32(a, b); +#endif +} + +inline v128 gv_xor32(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_xor_si128(a, b); +#elif defined(ARCH_ARM64) + return veorq_s32(a, b); +#endif +} + +inline v128 gv_xorfs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_xor_ps(a, b); +#elif defined(ARCH_ARM64) + return veorq_s32(a, b); +#endif +} + +inline v128 gv_not32(const v128 &a) { +#if defined(ARCH_X64) + return _mm_xor_si128(a, _mm_set1_epi32(-1)); +#elif defined(ARCH_ARM64) + return vmvnq_u32(a); +#endif +} + +inline v128 gv_notfs(const v128 &a) { +#if defined(ARCH_X64) + return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(-1))); +#elif defined(ARCH_ARM64) + return vmvnq_u32(a); +#endif +} + +inline v128 gv_shl16(const v128 &a, u32 count) { + if (count >= 16) + return v128{}; +#if defined(ARCH_X64) + return _mm_slli_epi16(a, count); +#elif defined(ARCH_ARM64) + return vshlq_s16(a, vdupq_n_s16(count)); +#endif +} + +inline v128 gv_shl32(const v128 &a, u32 count) { + if (count >= 32) + return v128{}; +#if defined(ARCH_X64) + return _mm_slli_epi32(a, count); +#elif defined(ARCH_ARM64) + return vshlq_s32(a, vdupq_n_s32(count)); +#endif +} + +inline v128 gv_shl64(const v128 &a, u32 count) { + if (count >= 64) + return v128{}; +#if defined(ARCH_X64) + return _mm_slli_epi64(a, count); +#elif defined(ARCH_ARM64) + return vshlq_s64(a, vdupq_n_s64(count)); +#endif +} + +inline v128 gv_shr16(const v128 &a, u32 count) { + if (count >= 16) + return v128{}; +#if defined(ARCH_X64) + return _mm_srli_epi16(a, count); +#elif defined(ARCH_ARM64) + return vshlq_u16(a, vdupq_n_s16(0 - count)); +#endif +} + +inline v128 gv_shr32(const v128 &a, u32 count) { + if (count >= 32) + return 
v128{}; +#if defined(ARCH_X64) + return _mm_srli_epi32(a, count); +#elif defined(ARCH_ARM64) + return vshlq_u32(a, vdupq_n_s32(0 - count)); +#endif +} + +inline v128 gv_shr64(const v128 &a, u32 count) { + if (count >= 64) + return v128{}; +#if defined(ARCH_X64) + return _mm_srli_epi64(a, count); +#elif defined(ARCH_ARM64) + return vshlq_u64(a, vdupq_n_s64(0 - count)); +#endif +} + +inline v128 gv_sar16(const v128 &a, u32 count) { + if (count >= 16) + count = 15; +#if defined(ARCH_X64) + return _mm_srai_epi16(a, count); +#elif defined(ARCH_ARM64) + return vshlq_s16(a, vdupq_n_s16(0 - count)); +#endif +} + +inline v128 gv_sar32(const v128 &a, u32 count) { + if (count >= 32) + count = 31; +#if defined(ARCH_X64) + return _mm_srai_epi32(a, count); +#elif defined(ARCH_ARM64) + return vshlq_s32(a, vdupq_n_s32(0 - count)); +#endif +} + +inline v128 gv_sar64(const v128 &a, u32 count) { + if (count >= 64) + count = 63; +#if defined(__AVX512VL__) + return _mm_srai_epi64(a, count); +#elif defined(__SSE2__) && !defined(_M_X64) + return static_cast<__v2di>(a) >> count; +#elif defined(ARCH_ARM64) + return vshlq_s64(a, vdupq_n_s64(0 - count)); +#else + v128 r; + r._s64[0] = a._s64[0] >> count; + r._s64[1] = a._s64[1] >> count; + return r; +#endif +} +inline v128 gv_add8(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_add_epi8(a, b); +#elif defined(ARCH_ARM64) + return vaddq_s8(a, b); +#endif +} + +inline v128 gv_add16(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_add_epi16(a, b); +#elif defined(ARCH_ARM64) + return vaddq_s16(a, b); +#endif +} + +inline v128 gv_add32(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_add_epi32(a, b); +#elif defined(ARCH_ARM64) + return vaddq_s32(a, b); +#endif +} + +inline v128 gv_add64(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_add_epi64(a, b); +#elif defined(ARCH_ARM64) + return vaddq_s64(a, b); +#endif +} + +inline v128 gv_adds_s8(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_adds_epi8(a, b); +#elif defined(ARCH_ARM64) + return vqaddq_s8(a, b); +#endif +} + +inline v128 gv_adds_s16(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_adds_epi16(a, b); +#elif defined(ARCH_ARM64) + return vqaddq_s16(a, b); +#endif +} + +inline v128 gv_adds_s32(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + const v128 s = _mm_add_epi32(a, b); + const v128 m = (a ^ s) & (b ^ s); // overflow bit + const v128 x = _mm_srai_epi32(m, 31); // saturation mask + const v128 y = + _mm_srai_epi32(_mm_and_si128(s, m), 31); // positive saturation mask + return _mm_xor_si128(_mm_xor_si128(_mm_srli_epi32(x, 1), y), + _mm_or_si128(s, x)); +#elif defined(ARCH_ARM64) + return vqaddq_s32(a, b); +#endif +} + +inline v128 gv_addus_u8(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_adds_epu8(a, b); +#elif defined(ARCH_ARM64) + return vqaddq_u8(a, b); +#endif +} + +inline v128 gv_addus_u16(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_adds_epu16(a, b); +#elif defined(ARCH_ARM64) + return vqaddq_u16(a, b); +#endif +} + +inline v128 gv_addus_u32(const v128 &a, const v128 &b) { +#if defined(__SSE4_1__) + return _mm_add_epi32(a, _mm_min_epu32(~a, b)); +#elif defined(ARCH_X64) + const v128 s = _mm_add_epi32(a, b); + return _mm_or_si128(s, + _mm_cmpgt_epi32(_mm_xor_si128(b, _mm_set1_epi32(smin)), + _mm_xor_si128(a, _mm_set1_epi32(smax)))); +#elif defined(ARCH_ARM64) + return vqaddq_u32(a, b); +#endif +} + +inline v128 gv_addfs(const v128 &a, const 
v128 &b) { +#if defined(ARCH_X64) + return _mm_add_ps(a, b); +#elif defined(ARCH_ARM64) + return vaddq_f32(a, b); +#endif +} + +inline v128 gv_addfd(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_add_pd(a, b); +#elif defined(ARCH_ARM64) + return vaddq_f64(a, b); +#endif +} + +inline v128 gv_sub8(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_sub_epi8(a, b); +#elif defined(ARCH_ARM64) + return vsubq_s8(a, b); +#endif +} + +inline v128 gv_sub16(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_sub_epi16(a, b); +#elif defined(ARCH_ARM64) + return vsubq_s16(a, b); +#endif +} + +inline v128 gv_sub32(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_sub_epi32(a, b); +#elif defined(ARCH_ARM64) + return vsubq_s32(a, b); +#endif +} + +inline v128 gv_sub64(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_sub_epi64(a, b); +#elif defined(ARCH_ARM64) + return vsubq_s64(a, b); +#endif +} + +inline v128 gv_subs_s8(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_subs_epi8(a, b); +#elif defined(ARCH_ARM64) + return vqsubq_s8(a, b); +#endif +} + +inline v128 gv_subs_s16(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_subs_epi16(a, b); +#elif defined(ARCH_ARM64) + return vqsubq_s16(a, b); +#endif +} + +inline v128 gv_subs_s32(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + const v128 d = _mm_sub_epi32(a, b); + const v128 m = (a ^ b) & (a ^ d); // overflow bit + const v128 x = _mm_srai_epi32(m, 31); + return _mm_or_si128(_mm_andnot_si128(x, d), + _mm_and_si128(x, _mm_xor_si128(_mm_srli_epi32(x, 1), + _mm_srai_epi32(a, 31)))); +#elif defined(ARCH_ARM64) + return vqsubq_s32(a, b); +#endif +} + +inline v128 gv_subus_u8(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_subs_epu8(a, b); +#elif defined(ARCH_ARM64) + return vqsubq_u8(a, b); +#endif +} + +inline v128 gv_subus_u16(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_subs_epu16(a, b); +#elif defined(ARCH_ARM64) + return vqsubq_u16(a, b); +#endif +} + +inline v128 gv_subus_u32(const v128 &a, const v128 &b) { +#if defined(__SSE4_1__) + return _mm_sub_epi32(a, _mm_min_epu32(a, b)); +#elif defined(ARCH_X64) + const auto sign = _mm_set1_epi32(smin); + return _mm_andnot_si128( + _mm_cmpgt_epi32(_mm_xor_si128(b, sign), _mm_xor_si128(a, sign)), + _mm_sub_epi32(a, b)); +#elif defined(ARCH_ARM64) + return vqsubq_u32(a, b); +#endif +} + +inline v128 gv_subfs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_sub_ps(a, b); +#elif defined(ARCH_ARM64) + return vsubq_f32(a, b); +#endif +} + +inline v128 gv_subfd(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_sub_pd(a, b); +#elif defined(ARCH_ARM64) + return vsubq_f64(a, b); +#endif +} + +inline v128 gv_maxu8(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_max_epu8(a, b); +#elif defined(ARCH_ARM64) + return vmaxq_u8(a, b); +#endif +} + +inline v128 gv_maxu16(const v128 &a, const v128 &b) { +#if defined(__SSE4_1__) + return _mm_max_epu16(a, b); +#elif defined(ARCH_X64) + return _mm_add_epi16(_mm_subs_epu16(a, b), b); +#elif defined(ARCH_ARM64) + return vmaxq_u16(a, b); +#endif +} + +inline v128 gv_maxu32(const v128 &a, const v128 &b) { +#if defined(__SSE4_1__) + return _mm_max_epu32(a, b); +#elif defined(ARCH_X64) + const __m128i s = _mm_set1_epi32(smin); + const __m128i m = _mm_cmpgt_epi32(_mm_xor_si128(a, s), _mm_xor_si128(b, s)); + return _mm_or_si128(_mm_and_si128(m, a), 
_mm_andnot_si128(m, b)); +#elif defined(ARCH_ARM64) + return vmaxq_u32(a, b); +#endif +} + +inline v128 gv_maxs8(const v128 &a, const v128 &b) { +#if defined(__SSE4_1__) + return _mm_max_epi8(a, b); +#elif defined(ARCH_X64) + const __m128i m = _mm_cmpgt_epi8(a, b); + return _mm_or_si128(_mm_and_si128(m, a), _mm_andnot_si128(m, b)); +#elif defined(ARCH_ARM64) + return vmaxq_s8(a, b); +#endif +} + +inline v128 gv_maxs16(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_max_epi16(a, b); +#elif defined(ARCH_ARM64) + return vmaxq_s16(a, b); +#endif +} + +inline v128 gv_maxs32(const v128 &a, const v128 &b) { +#if defined(__SSE4_1__) + return _mm_max_epi32(a, b); +#elif defined(ARCH_X64) + const __m128i m = _mm_cmpgt_epi32(a, b); + return _mm_or_si128(_mm_and_si128(m, a), _mm_andnot_si128(m, b)); +#elif defined(ARCH_ARM64) + return vmaxq_s32(a, b); +#endif +} + +inline v128 gv_maxfs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_and_ps(_mm_max_ps(a, b), _mm_max_ps(b, a)); +#elif defined(ARCH_ARM64) + return vmaxq_f32(a, b); +#endif +} + +inline v128 gv_minu8(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_min_epu8(a, b); +#elif defined(ARCH_ARM64) + return vminq_u8(a, b); +#endif +} + +inline v128 gv_minu16(const v128 &a, const v128 &b) { +#if defined(__SSE4_1__) + return _mm_min_epu16(a, b); +#elif defined(ARCH_X64) + return _mm_sub_epi16(a, _mm_subs_epu16(a, b)); +#elif defined(ARCH_ARM64) + return vminq_u16(a, b); +#endif +} + +inline v128 gv_minu32(const v128 &a, const v128 &b) { +#if defined(__SSE4_1__) + return _mm_min_epu32(a, b); +#elif defined(ARCH_X64) + const __m128i s = _mm_set1_epi32(smin); + const __m128i m = _mm_cmpgt_epi32(_mm_xor_si128(a, s), _mm_xor_si128(b, s)); + return _mm_or_si128(_mm_andnot_si128(m, a), _mm_and_si128(m, b)); +#elif defined(ARCH_ARM64) + return vminq_u32(a, b); +#endif +} + +inline v128 gv_mins8(const v128 &a, const v128 &b) { +#if defined(__SSE4_1__) + return _mm_min_epi8(a, b); +#elif defined(ARCH_X64) + const __m128i m = _mm_cmpgt_epi8(a, b); + return _mm_or_si128(_mm_andnot_si128(m, a), _mm_and_si128(m, b)); +#elif defined(ARCH_ARM64) + return vminq_s8(a, b); +#endif +} + +inline v128 gv_mins16(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_min_epi16(a, b); +#elif defined(ARCH_ARM64) + return vminq_s16(a, b); +#endif +} + +inline v128 gv_mins32(const v128 &a, const v128 &b) { +#if defined(__SSE4_1__) + return _mm_min_epi32(a, b); +#elif defined(ARCH_X64) + const __m128i m = _mm_cmpgt_epi32(a, b); + return _mm_or_si128(_mm_andnot_si128(m, a), _mm_and_si128(m, b)); +#elif defined(ARCH_ARM64) + return vminq_s32(a, b); +#endif +} + +inline v128 gv_minfs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_or_ps(_mm_min_ps(a, b), _mm_min_ps(b, a)); +#elif defined(ARCH_ARM64) + return vminq_f32(a, b); +#endif +} + +inline v128 gv_eq8(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_cmpeq_epi8(a, b); +#elif defined(ARCH_ARM64) + return vceqq_s8(a, b); +#endif +} + +inline v128 gv_eq16(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_cmpeq_epi16(a, b); +#elif defined(ARCH_ARM64) + return vceqq_s16(a, b); +#endif +} + +inline v128 gv_eq32(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_cmpeq_epi32(a, b); +#elif defined(ARCH_ARM64) + return vceqq_s32(a, b); +#endif +} + +// Ordered and equal +inline v128 gv_eqfs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_cmpeq_ps(a, b); +#elif 
defined(ARCH_ARM64) + return vceqq_f32(a, b); +#endif +} + +// Unordered or not equal +inline v128 gv_neqfs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_cmpneq_ps(a, b); +#elif defined(ARCH_ARM64) + return ~vceqq_f32(a, b); +#endif +} + +inline v128 gv_gtu8(const v128 &a, const v128 &b) { +#if defined(__AVX512VL__) && defined(__AVX512BW__) + return _mm_movm_epi8(_mm_cmpgt_epu8_mask(a, b)); +#elif defined(ARCH_X64) + return _mm_cmpeq_epi8(_mm_cmpeq_epi8(a, _mm_min_epu8(a, b)), + _mm_setzero_si128()); +#elif defined(ARCH_ARM64) + return vcgtq_u8(a, b); +#endif +} + +inline v128 gv_gtu16(const v128 &a, const v128 &b) { +#if defined(__AVX512VL__) && defined(__AVX512BW__) + return _mm_movm_epi16(_mm_cmpgt_epu16_mask(a, b)); +#elif defined(__SSE4_1__) + return _mm_cmpeq_epi16(_mm_cmpeq_epi16(a, _mm_min_epu16(a, b)), + _mm_setzero_si128()); +#elif defined(ARCH_X64) + return _mm_cmpeq_epi16( + _mm_cmpeq_epi16(_mm_subs_epu16(a, b), _mm_setzero_si128()), + _mm_setzero_si128()); +#elif defined(ARCH_ARM64) + return vcgtq_u16(a, b); +#endif +} + +inline v128 gv_gtu32(const v128 &a, const v128 &b) { +#if defined(__AVX512VL__) && defined(__AVX512DQ__) + return _mm_movm_epi32(_mm_cmpgt_epu32_mask(a, b)); +#elif defined(__SSE4_1__) + return _mm_cmpeq_epi32(_mm_cmpeq_epi32(a, _mm_min_epu32(a, b)), + _mm_setzero_si128()); +#elif defined(ARCH_X64) + const auto sign = _mm_set1_epi32(smin); + return _mm_cmpgt_epi32(_mm_xor_si128(a, sign), _mm_xor_si128(b, sign)); +#elif defined(ARCH_ARM64) + return vcgtq_u32(a, b); +#endif +} + +// Ordered and greater than +inline v128 gv_gtfs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_cmpgt_ps(a, b); +#elif defined(ARCH_ARM64) + return vcgtq_f32(a, b); +#endif +} + +// Ordered and less than +inline v128 gv_ltfs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_cmplt_ps(a, b); +#elif defined(ARCH_ARM64) + return vcltq_f32(a, b); +#endif +} + +// Unordered or less or equal +inline v128 gv_ngtfs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_cmpngt_ps(a, b); +#elif defined(ARCH_ARM64) + return ~vcgtq_f32(a, b); +#endif +} + +// Unordered or greater or equal +inline v128 gv_nlefs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_cmpnle_ps(a, b); +#elif defined(ARCH_ARM64) + return ~vcleq_f32(a, b); +#endif +} + +inline v128 gv_geu8(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_cmpeq_epi8(b, _mm_min_epu8(a, b)); +#elif defined(ARCH_ARM64) + return vcgeq_u8(a, b); +#endif +} + +inline v128 gv_geu16(const v128 &a, const v128 &b) { +#if defined(__SSE4_1__) + return _mm_cmpeq_epi16(b, _mm_min_epu16(a, b)); +#elif defined(ARCH_X64) + return _mm_cmpeq_epi16(_mm_subs_epu16(b, a), _mm_setzero_si128()); +#elif defined(ARCH_ARM64) + return vcgeq_u16(a, b); +#endif +} + +inline v128 gv_geu32(const v128 &a, const v128 &b) { +#if defined(__SSE4_1__) + return _mm_cmpeq_epi32(b, _mm_min_epu32(a, b)); +#elif defined(ARCH_X64) + const auto sign = _mm_set1_epi32(smin); + return _mm_cmpeq_epi32( + _mm_cmpgt_epi32(_mm_xor_si128(b, sign), _mm_xor_si128(a, sign)), + _mm_setzero_si128()); +#elif defined(ARCH_ARM64) + return vcgeq_u32(a, b); +#endif +} + +// Ordered and not less than +inline v128 gv_gefs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_cmpge_ps(a, b); +#elif defined(ARCH_ARM64) + return vcgeq_f32(a, b); +#endif +} + +// Unordered or less than +inline v128 gv_ngefs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_cmpnge_ps(a, 
b); +#elif defined(ARCH_ARM64) + return ~vcgeq_f32(a, b); +#endif +} + +inline v128 gv_gts8(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_cmpgt_epi8(a, b); +#elif defined(ARCH_ARM64) + return vcgtq_s8(a, b); +#endif +} + +inline v128 gv_gts16(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_cmpgt_epi16(a, b); +#elif defined(ARCH_ARM64) + return vcgtq_s16(a, b); +#endif +} + +inline v128 gv_gts32(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_cmpgt_epi32(a, b); +#elif defined(ARCH_ARM64) + return vcgtq_s32(a, b); +#endif +} + +inline v128 gv_avgu8(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_avg_epu8(a, b); +#elif defined(ARCH_ARM64) + return vrhaddq_u8(a, b); +#endif +} + +inline v128 gv_avgu16(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_avg_epu16(a, b); +#elif defined(ARCH_ARM64) + return vrhaddq_u16(a, b); +#endif +} + +inline v128 gv_avgu32(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + const auto ones = _mm_set1_epi32(-1); + const auto summ = gv_sub32(gv_add32(a, b), ones); + const auto carry = _mm_slli_epi32(gv_geu32(a, summ), 31); + return _mm_or_si128(carry, _mm_srli_epi32(summ, 1)); +#elif defined(ARCH_ARM64) + return vrhaddq_u32(a, b); +#endif +} + +inline v128 gv_avgs8(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + const v128 sign = _mm_set1_epi8(smin); + return gv_avgu8(a ^ sign, b ^ sign) ^ sign; +#elif defined(ARCH_ARM64) + return vrhaddq_s8(a, b); +#endif +} + +inline v128 gv_avgs16(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + const v128 sign = _mm_set1_epi16(smin); + return gv_avgu16(a ^ sign, b ^ sign) ^ sign; +#elif defined(ARCH_ARM64) + return vrhaddq_s16(a, b); +#endif +} + +inline v128 gv_avgs32(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + const v128 sign = _mm_set1_epi32(smin); + return gv_avgu32(a ^ sign, b ^ sign) ^ sign; +#elif defined(ARCH_ARM64) + return vrhaddq_s32(a, b); +#endif +} + +inline v128 gv_divfs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_div_ps(a, b); +#elif defined(ARCH_ARM64) + return vdivq_f32(a, b); +#endif +} + +inline v128 gv_sqrtfs(const v128 &a) { +#if defined(ARCH_X64) + return _mm_sqrt_ps(a); +#elif defined(ARCH_ARM64) + return vsqrtq_f32(a); +#endif +} + +inline v128 gv_fmafs(const v128 &a, const v128 &b, const v128 &c) { +#if defined(ARCH_X64) && defined(__FMA__) + return _mm_fmadd_ps(a, b, c); +#elif defined(__FMA4__) + return _mm_macc_ps(a, b, c); +#elif defined(ARCH_X64) + // This is inaccurate implementation +#ifdef __AVX__ + const __m128 r = _mm256_cvtpd_ps( + _mm256_add_pd(_mm256_mul_pd(_mm256_cvtps_pd(a), _mm256_cvtps_pd(b)), + _mm256_cvtps_pd(c))); +#else + const __m128d a0 = _mm_cvtps_pd(a); + const __m128d a1 = _mm_cvtps_pd(_mm_movehl_ps(a, a)); + const __m128d b0 = _mm_cvtps_pd(b); + const __m128d b1 = _mm_cvtps_pd(_mm_movehl_ps(b, b)); + const __m128d c0 = _mm_cvtps_pd(c); + const __m128d c1 = _mm_cvtps_pd(_mm_movehl_ps(c, c)); + const __m128d m0 = _mm_mul_pd(a0, b0); + const __m128d m1 = _mm_mul_pd(a1, b1); + const __m128d r0 = _mm_add_pd(m0, c0); + const __m128d r1 = _mm_add_pd(m1, c1); + const __m128 r = _mm_movelh_ps(_mm_cvtpd_ps(r0), _mm_cvtpd_ps(r1)); +#endif + return r; +#elif defined(ARCH_ARM64) + return vfmaq_f32(c, a, b); +#else + v128 r; + for (int i = 0; i < 4; i++) { + r._f[i] = std::fmaf(a._f[i], b._f[i], c._f[i]); + } + return r; +#endif +} + +// -> ssat((a * b * 2 + (c << 16) + 0x8000) >> 16) +inline v128 gv_rmuladds_hds16(const v128 &a, 
const v128 &b, const v128 &c) { +#if defined(ARCH_ARM64) +#if defined(__ARM_FEATURE_QRDMX) + return vqrdmlahq_s16(c, a, b); +#else + v128 d; + + for (uint h = 0; h < 8; h++) { + s32 result = ((s32)a._s16[h] * (s32)b._s16[h]) + 0x4000; + result = (result >> 15) + (s32)c._s16[h]; + + if (result > INT16_MAX) { + d._s16[h] = (s16)INT16_MAX; + } else if (result < INT16_MIN) { + d._s16[h] = (s16)INT16_MIN; + } else { + d._s16[h] = (s16)result; + } + } + + return d; +#endif +#elif defined(ARCH_X64) + const auto x80 = + _mm_set1_epi16(0x80); // 0x80 * 0x80 = 0x4000, add this to the product + const auto al = _mm_unpacklo_epi16(a, x80); + const auto ah = _mm_unpackhi_epi16(a, x80); + const auto bl = _mm_unpacklo_epi16(b, x80); + const auto bh = _mm_unpackhi_epi16(b, x80); + const auto ml = _mm_srai_epi32(_mm_madd_epi16(al, bl), 15); + const auto mh = _mm_srai_epi32(_mm_madd_epi16(ah, bh), 15); + const auto cl = + _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), c), 16); + const auto ch = + _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), c), 16); + const auto sl = _mm_add_epi32(ml, cl); + const auto sh = _mm_add_epi32(mh, ch); + return _mm_packs_epi32(sl, sh); +#endif +} + +// -> ssat((a * b * 2 + 0x8000) >> 16) +inline v128 gv_rmuls_hds16(const v128 &a, const v128 &b) { +#if defined(ARCH_ARM64) + return vqrdmulhq_s16(a, b); +#elif defined(ARCH_X64) + const auto x80 = + _mm_set1_epi16(0x80); // 0x80 * 0x80 = 0x4000, add this to the product + const auto al = _mm_unpacklo_epi16(a, x80); + const auto ah = _mm_unpackhi_epi16(a, x80); + const auto bl = _mm_unpacklo_epi16(b, x80); + const auto bh = _mm_unpackhi_epi16(b, x80); + const auto ml = _mm_srai_epi32(_mm_madd_epi16(al, bl), 15); + const auto mh = _mm_srai_epi32(_mm_madd_epi16(ah, bh), 15); + return _mm_packs_epi32(ml, mh); +#endif +} + +// -> ssat((a * b * 2) >> 16) +inline v128 gv_muls_hds16(const v128 &a, const v128 &b) { +#if defined(ARCH_ARM64) + return vqdmulhq_s16(a, b); +#elif defined(ARCH_X64) + const auto m = _mm_or_si128(_mm_srli_epi16(_mm_mullo_epi16(a, b), 15), + _mm_slli_epi16(_mm_mulhi_epi16(a, b), 1)); + const auto s = _mm_cmpeq_epi16( + m, _mm_set1_epi16(-0x8000)); // detect special case (positive 0x8000) + return _mm_xor_si128(m, s); +#endif +} + +inline v128 gv_muladd16(const v128 &a, const v128 &b, const v128 &c) { +#if defined(ARCH_X64) + return _mm_add_epi16(_mm_mullo_epi16(a, b), c); +#elif defined(ARCH_ARM64) + return vmlaq_s16(c, a, b); +#endif +} + +inline v128 gv_mul16(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_mullo_epi16(a, b); +#elif defined(ARCH_ARM64) + return vmulq_s16(a, b); +#endif +} + +inline v128 gv_mul32(const v128 &a, const v128 &b) { +#if defined(__SSE4_1__) + return _mm_mullo_epi32(a, b); +#elif defined(ARCH_X64) + const __m128i lows = _mm_shuffle_epi32(_mm_mul_epu32(a, b), 8); + const __m128i highs = _mm_shuffle_epi32( + _mm_mul_epu32(_mm_srli_epi64(a, 32), _mm_srli_epi64(b, 32)), 8); + return _mm_unpacklo_epi32(lows, highs); +#elif defined(ARCH_ARM64) + return vmulq_s32(a, b); +#endif +} + +inline v128 gv_mulfs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_mul_ps(a, b); +#elif defined(ARCH_ARM64) + return vmulq_f32(a, b); +#endif +} + +inline v128 gv_mulfs(const v128 &a, f32 b) { +#if defined(ARCH_X64) + return _mm_mul_ps(a, _mm_set_ps1(b)); +#elif defined(ARCH_ARM64) + return vmulq_n_f32(a, b); +#endif +} + +inline v128 gv_hadds8x2(const v128 &a) { +#if defined(__SSSE3__) + return _mm_maddubs_epi16(_mm_set1_epi8(1), a); +#elif defined(ARCH_X64) + 
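// SSE2 fallback note: _mm_srai_epi16(a, 8) yields the sign-extended odd (high) bytes of each 16-bit lane, the shift-left/shift-right pair sign-extends the even (low) bytes, and the add gives the pairwise s8 sums widened to s16, i.e. r._s16[i] = a._s8[2*i] + a._s8[2*i+1]. + 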
return _mm_add_epi16(_mm_srai_epi16(a, 8), + _mm_srai_epi16(_mm_slli_epi16(a, 8), 8)); +#elif defined(ARCH_ARM64) + return vpaddlq_s8(a); +#endif +} + +inline v128 gv_hadds8x4(const v128 &a, const v128 &c) { +#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) + return _mm_dpbusd_epi32(c, _mm_set1_epi8(1), a); +#elif defined(__SSSE3__) + return _mm_add_epi32( + _mm_madd_epi16(_mm_maddubs_epi16(_mm_set1_epi8(1), a), _mm_set1_epi16(1)), + c); +#elif defined(ARCH_X64) + return _mm_add_epi32( + _mm_madd_epi16(_mm_add_epi16(_mm_srai_epi16(a, 8), + _mm_srai_epi16(_mm_slli_epi16(a, 8), 8)), + _mm_set1_epi16(1)), + c); +#elif defined(ARCH_ARM64) + return vaddq_s32(vpaddlq_s16(vpaddlq_s8(a)), c); +#endif +} + +inline v128 gv_haddu8x2(const v128 &a) { +#if defined(__SSSE3__) + return _mm_maddubs_epi16(a, _mm_set1_epi8(1)); +#elif defined(ARCH_X64) + return _mm_add_epi16(_mm_srli_epi16(a, 8), + _mm_and_si128(a, _mm_set1_epi16(0x00ff))); +#elif defined(ARCH_ARM64) + return vpaddlq_u8(a); +#endif +} + +inline v128 gv_haddu8x4(const v128 &a) { +#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) + return _mm_dpbusd_epi32(_mm_setzero_si128(), a, _mm_set1_epi8(1)); +#elif defined(__SSSE3__) + return _mm_madd_epi16(_mm_maddubs_epi16(a, _mm_set1_epi8(1)), + _mm_set1_epi16(1)); +#elif defined(ARCH_X64) + return _mm_madd_epi16(_mm_add_epi16(_mm_srli_epi16(a, 8), + _mm_and_si128(a, _mm_set1_epi16(0x00ff))), + _mm_set1_epi16(1)); +#elif defined(ARCH_ARM64) + return vpaddlq_u16(vpaddlq_u8(a)); +#endif +} + +inline v128 gv_hadds16x2(const v128 &a, const v128 &c) { +#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) + return _mm_dpwssd_epi32(c, a, _mm_set1_epi8(1)); +#elif defined(ARCH_X64) + return _mm_add_epi32(_mm_madd_epi16(a, _mm_set1_epi16(1)), c); +#elif defined(ARCH_ARM64) + return vaddq_s32(vpaddlq_s16(a), c); +#endif +} + +// Unsigned bytes from a, signed bytes from b, 32-bit accumulator c +inline v128 gv_dotu8s8x4(const v128 &a, const v128 &b, const v128 &c) { +#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) + return _mm_dpbusd_epi32(c, a, b); +#elif defined(ARCH_X64) + const __m128i ah = _mm_srli_epi16(a, 8); + const __m128i al = _mm_and_si128(a, _mm_set1_epi16(0x00ff)); + const __m128i bh = _mm_srai_epi16(b, 8); + const __m128i bl = _mm_srai_epi16(_mm_slli_epi16(b, 8), 8); + const __m128i mh = _mm_madd_epi16(ah, bh); + const __m128i ml = _mm_madd_epi16(al, bl); + const __m128i x = _mm_add_epi32(mh, ml); + return _mm_add_epi32(c, x); +#elif defined(__ARM_FEATURE_MATMUL_INT8) + return vusdotq_s32(c, a, b); +#elif defined(ARCH_ARM64) + const auto l = + vpaddlq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), + vmovl_s8(vget_low_s8(b)))); + const auto h = + vpaddlq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))), + vmovl_s8(vget_high_s8(b)))); + return vaddq_s32(c, vaddq_s32(vuzp1q_s32(l, h), vuzp2q_s32(l, h))); +#endif +} + +inline v128 gv_dotu8x4(const v128 &a, const v128 &b, const v128 &c) { +#if defined(ARCH_X64) + const __m128i ah = _mm_srli_epi16(a, 8); + const __m128i al = _mm_and_si128(a, _mm_set1_epi16(0x00ff)); + const __m128i bh = _mm_srli_epi16(b, 8); + const __m128i bl = _mm_and_si128(b, _mm_set1_epi16(0x00ff)); + const __m128i mh = _mm_madd_epi16(ah, bh); + const __m128i ml = _mm_madd_epi16(al, bl); + const __m128i x = _mm_add_epi32(mh, ml); + return _mm_add_epi32(c, x); +#elif defined(__ARM_FEATURE_DOTPROD) + return vdotq_u32(c, a, b); +#elif 
defined(ARCH_ARM64) + const auto l = vpaddlq_u16( + vmulq_u16(vmovl_u8(vget_low_u8(a)), vmovl_u8(vget_low_u8(b)))); + const auto h = vpaddlq_u16( + vmulq_u16(vmovl_u8(vget_high_u8(a)), vmovl_u8(vget_high_u8(b)))); + return vaddq_u32(c, vaddq_u32(vuzp1q_u32(l, h), vuzp2q_u32(l, h))); +#endif +} + +inline v128 gv_dots16x2(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_madd_epi16(a, b); +#elif defined(ARCH_ARM64) + const auto ml = vmull_s16(vget_low_s16(a), vget_low_s16(b)); + const auto mh = vmull_s16(vget_high_s16(a), vget_high_s16(b)); + const auto sl = vpadd_s32(vget_low_s32(ml), vget_high_s32(ml)); + const auto sh = vpadd_s32(vget_low_s32(mh), vget_high_s32(mh)); + return vcombine_s32(sl, sh); +#endif +} + +// Signed s16 from a and b, 32-bit accumulator c +inline v128 gv_dots16x2(const v128 &a, const v128 &b, const v128 &c) { +#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) + return _mm_dpwssd_epi32(c, a, b); +#else + return gv_add32(c, gv_dots16x2(a, b)); +#endif +} + +inline v128 gv_dotu16x2(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + const auto ml = _mm_mullo_epi16(a, b); // low results + const auto mh = _mm_mulhi_epu16(a, b); // high results + const auto ls = _mm_add_epi32(_mm_srli_epi32(ml, 16), + _mm_and_si128(ml, _mm_set1_epi32(0x0000ffff))); + const auto hs = _mm_add_epi32(_mm_slli_epi32(mh, 16), + _mm_and_si128(mh, _mm_set1_epi32(0xffff0000))); + return _mm_add_epi32(ls, hs); +#elif defined(ARCH_ARM64) + const auto ml = vmull_u16(vget_low_u16(a), vget_low_u16(b)); + const auto mh = vmull_u16(vget_high_u16(a), vget_high_u16(b)); + const auto sl = vpadd_u32(vget_low_u32(ml), vget_high_u32(ml)); + const auto sh = vpadd_u32(vget_low_u32(mh), vget_high_u32(mh)); + return vcombine_u32(sl, sh); +#endif +} + +// Unsigned bytes from a, signed bytes from b, 32-bit accumulator c +inline v128 gv_dots_u8s8x4(const v128 &a, const v128 &b, const v128 &c) { +#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) + return _mm_dpbusds_epi32(c, a, b); +#elif defined(ARCH_X64) + const __m128i ah = _mm_srli_epi16(a, 8); + const __m128i al = _mm_and_si128(a, _mm_set1_epi16(0x00ff)); + const __m128i bh = _mm_srai_epi16(b, 8); + const __m128i bl = _mm_srai_epi16(_mm_slli_epi16(b, 8), 8); + const __m128i mh = _mm_madd_epi16(ah, bh); + const __m128i ml = _mm_madd_epi16(al, bl); + return gv_adds_s32(c, _mm_add_epi32(mh, ml)); +#elif defined(ARCH_ARM64) + const auto l = + vpaddlq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), + vmovl_s8(vget_low_s8(b)))); + const auto h = + vpaddlq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))), + vmovl_s8(vget_high_s8(b)))); + return vqaddq_s32(c, vaddq_s32(vuzp1q_s32(l, h), vuzp2q_s32(l, h))); +#endif +} + +// Signed s16 from a and b, 32-bit accumulator c; signed saturation +inline v128 gv_dots_s16x2(const v128 &a, const v128 &b, const v128 &c) { +#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) + return _mm_dpwssds_epi32(c, a, b); +#else + const auto ab = gv_dots16x2(a, b); + const auto s0 = gv_adds_s32(ab, c); + const auto s1 = + gv_eq32(ab, gv_bcst32(0x80000000)); // +0x80000000, negative c -> + // c^0x80000000; otherwise 0x7fffffff + const auto s2 = + gv_select32(gv_gts32(gv_bcst32(0), c), gv_xor32(c, gv_bcst32(0x80000000)), + gv_bcst32(0x7fffffff)); + return gv_select32(s1, s2, s0); +#endif +} + +// Multiply s16 elements 0, 2, 4, 6 to produce s32 results in corresponding +// lanes +inline v128 gv_mul_even_s16(const 
v128 &a, const v128 &b) { +#if defined(ARCH_X64) + const auto c = _mm_set1_epi32(0x0000ffff); + return _mm_madd_epi16(_mm_and_si128(a, c), _mm_and_si128(b, c)); +#else + // TODO + return gv_mul32(gv_sar32(gv_shl32(a, 16), 16), gv_sar32(gv_shl32(b, 16), 16)); +#endif +} + +// Multiply u16 elements 0, 2, 4, 6 to produce u32 results in corresponding +// lanes +inline v128 gv_mul_even_u16(const v128 &a, const v128 &b) { +#if defined(__SSE4_1__) || defined(ARCH_ARM64) + const auto c = gv_bcst32(0x0000ffff); + return gv_mul32(a & c, b & c); +#elif defined(ARCH_X64) + const auto ml = _mm_mullo_epi16(a, b); + const auto mh = _mm_mulhi_epu16(a, b); + return _mm_or_si128(_mm_and_si128(ml, _mm_set1_epi32(0x0000ffff)), + _mm_slli_epi32(mh, 16)); +#endif +} + +// Multiply s16 elements 1, 3, 5, 7 to produce s32 results in corresponding +// lanes +inline v128 gv_mul_odds_s16(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_madd_epi16(_mm_srli_epi32(a, 16), _mm_srli_epi32(b, 16)); +#else + return gv_mul32(gv_sar32(a, 16), gv_sar32(b, 16)); +#endif +} + +// Multiply u16 elements 1, 3, 5, 7 to produce u32 results in corresponding +// lanes +inline v128 gv_mul_odds_u16(const v128 &a, const v128 &b) { +#if defined(__SSE4_1__) || defined(ARCH_ARM64) + return gv_mul32(gv_shr32(a, 16), gv_shr32(b, 16)); +#elif defined(ARCH_X64) + const auto ml = _mm_mullo_epi16(a, b); + const auto mh = _mm_mulhi_epu16(a, b); + return _mm_or_si128(_mm_and_si128(mh, _mm_set1_epi32(0xffff0000)), + _mm_srli_epi32(ml, 16)); +#endif +} + +inline v128 gv_cvts32_tofs(const v128 &src) { +#if defined(ARCH_X64) + return _mm_cvtepi32_ps(src); +#elif defined(ARCH_ARM64) + return vcvtq_f32_s32(src); +#endif +} + +inline v128 gv_cvtu32_tofs(const v128 &src) { +#if defined(__AVX512VL__) + return _mm_cvtepu32_ps(src); +#elif defined(ARCH_X64) + const auto fix = _mm_and_ps(_mm_castsi128_ps(_mm_srai_epi32(src, 31)), + _mm_set1_ps(0x80000000)); + return _mm_add_ps( + _mm_cvtepi32_ps(_mm_and_si128(src, _mm_set1_epi32(0x7fffffff))), fix); +#elif defined(ARCH_ARM64) + return vcvtq_f32_u32(src); +#endif +} + +inline v128 gv_cvtfs_tos32(const v128 &src) { +#if defined(ARCH_X64) + return _mm_cvttps_epi32(src); +#elif defined(ARCH_ARM64) + return vcvtq_s32_f32(src); +#endif +} + +inline v128 gv_cvtfs_tou32(const v128 &src) { +#if defined(__AVX512VL__) + return _mm_cvttps_epu32(src); +#elif defined(ARCH_X64) + const auto c1 = _mm_cvttps_epi32(src); + const auto s1 = _mm_srai_epi32(c1, 31); + const auto c2 = _mm_cvttps_epi32(_mm_sub_ps(src, _mm_set1_ps(2147483648.))); + return _mm_or_si128(c1, _mm_and_si128(c2, s1)); +#elif defined(ARCH_ARM64) + return vcvtq_u32_f32(src); +#endif +} + +inline f32 roundevenf32(f32 arg) { + u32 val = std::bit_cast(arg); + u32 exp = (val >> 23) & 0xff; + u32 abs = val & 0x7fffffff; + + if (exp >= 127 + 23) { + // Big enough, NaN or INF + return arg; + } + + if (exp >= 127) { + u32 int_pos = (127 + 23) - exp; + u32 half_pos = int_pos - 1; + u32 half_bit = 1u << half_pos; + u32 int_bit = 1u << int_pos; + if (val & (int_bit | (half_bit - 1))) + val += half_bit; + val &= ~(int_bit - 1); + } else if (exp == 126 && abs > 0x3f000000) { + val &= 0x80000000; + val |= 0x3f800000; + } else { + val &= 0x80000000; + } + + return std::bit_cast(val); +} + +#if defined(ARCH_X64) +enum class RoundMode { Even, Floor, Ceil, Trunc }; + +template __m128 sse41_roundf(__m128 a) { + v128 r = a; + for (u32 i = 0; i < 4; i++) + if constexpr (Mode == RoundMode::Even) + r._f[i] = roundevenf32(r._f[i]); + else if constexpr (Mode == 
RoundMode::Floor) + r._f[i] = ::floorf(r._f[i]); + else if constexpr (Mode == RoundMode::Ceil) + r._f[i] = ::ceilf(r._f[i]); + else if constexpr (Mode == RoundMode::Trunc) + r._f[i] = ::truncf(r._f[i]); + return r; +} +#endif + +inline v128 gv_roundfs_even(const v128 &a) { +#if defined(__SSE4_1__) + return _mm_round_ps(a, 8 + 0); +#elif defined(ARCH_ARM64) + return vrndnq_f32(a); +#elif defined(ARCH_X64) + return sse41_roundf(a); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._f[i] = roundevenf32(a._f[i]); + return r; +#endif +} + +inline v128 gv_roundfs_ceil(const v128 &a) { +#if defined(__SSE4_1__) + return _mm_round_ps(a, 8 + 2); +#elif defined(ARCH_ARM64) + return vrndpq_f32(a); +#elif defined(ARCH_X64) + return sse41_roundf(a); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._f[i] = ::ceilf(a._f[i]); + return r; +#endif +} + +inline v128 gv_roundfs_floor(const v128 &a) { +#if defined(__SSE4_1__) + return _mm_round_ps(a, 8 + 1); +#elif defined(ARCH_ARM64) + return vrndmq_f32(a); +#elif defined(ARCH_X64) + return sse41_roundf(a); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._f[i] = ::floorf(a._f[i]); + return r; +#endif +} + +inline v128 gv_roundfs_trunc(const v128 &a) { +#if defined(__SSE4_1__) + return _mm_round_ps(a, 8 + 3); +#elif defined(ARCH_ARM64) + return vrndq_f32(a); +#elif defined(ARCH_X64) + return sse41_roundf(a); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._f[i] = ::truncf(a._f[i]); + return r; +#endif +} + +inline bool gv_testz(const v128 &a) { +#if defined(__SSE4_1__) + return !!_mm_testz_si128(a, a); +#elif defined(ARCH_X64) + return _mm_cvtsi128_si64(_mm_packs_epi32(a, a)) == 0; +#elif defined(ARCH_ARM64) + return std::bit_cast(vqmovn_s32(a)) == 0; +#else + return !(a._u64[0] | a._u64[1]); +#endif +} + +// Same as gv_testz but tuned for pairing with gv_testall1 +inline bool gv_testall0(const v128 &a) { +#if defined(__SSE4_1__) + return !!_mm_testz_si128(a, _mm_set1_epi32(-1)); +#elif defined(ARCH_X64) + return _mm_cvtsi128_si64(_mm_packs_epi32(a, a)) == 0; +#elif defined(ARCH_ARM64) + return std::bit_cast(vqmovn_s32(a)) == 0; +#else + return !(a._u64[0] | a._u64[1]); +#endif +} + +inline bool gv_testall1(const v128 &a) { +#if defined(__SSE4_1__) + return !!_mm_test_all_ones(a); +#elif defined(ARCH_X64) + return _mm_cvtsi128_si64(_mm_packs_epi32(a, a)) == -1; +#elif defined(ARCH_ARM64) + return std::bit_cast(vqmovn_s32(a)) == -1; +#else + return (a._u64[0] & a._u64[1]) == UINT64_MAX; +#endif +} + +// result = (~a) & (b) +inline v128 gv_andn(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_andnot_si128(a, b); +#elif defined(ARCH_ARM64) + return vbicq_s32(b, a); +#endif +} + +// Select elements; _cmp must be result of SIMD comparison; undefined otherwise +FORCE_INLINE v128 gv_select8(const v128 &_cmp, const v128 &_true, + const v128 &_false) { +#if defined(__SSE4_1__) + return _mm_blendv_epi8(_false, _true, _cmp); +#elif defined(ARCH_ARM64) + return vbslq_u8(_cmp, _true, _false); +#else + return (_cmp & _true) | gv_andn(_cmp, _false); +#endif +} + +// Select elements using sign bit only +FORCE_INLINE v128 gv_signselect8(const v128 &bits, const v128 &_true, + const v128 &_false) { +#if defined(__SSE4_1__) + return _mm_blendv_epi8(_false, _true, bits); +#else + return gv_select8(gv_gts8(gv_bcst8(0), bits), _true, _false); +#endif +} + +// Select elements; _cmp must be result of SIMD comparison; undefined otherwise +inline v128 gv_select16(const v128 &_cmp, const v128 &_true, + const v128 &_false) { +#if defined(__SSE4_1__) + return 
_mm_blendv_epi8(_false, _true, _cmp); +#elif defined(ARCH_ARM64) + return vbslq_u16(_cmp, _true, _false); +#else + return (_cmp & _true) | gv_andn(_cmp, _false); +#endif +} + +// Select elements; _cmp must be result of SIMD comparison; undefined otherwise +inline v128 gv_select32(const v128 &_cmp, const v128 &_true, + const v128 &_false) { +#if defined(__SSE4_1__) + return _mm_blendv_epi8(_false, _true, _cmp); +#elif defined(ARCH_ARM64) + return vbslq_u32(_cmp, _true, _false); +#else + return (_cmp & _true) | gv_andn(_cmp, _false); +#endif +} + +// Select elements; _cmp must be result of SIMD comparison; undefined otherwise +inline v128 gv_selectfs(const v128 &_cmp, const v128 &_true, + const v128 &_false) { +#if defined(__SSE4_1__) + return _mm_blendv_ps(_false, _true, _cmp); +#elif defined(ARCH_ARM64) + return vbslq_f32(_cmp, _true, _false); +#else + return _mm_or_ps(_mm_and_ps(_cmp, _true), _mm_andnot_ps(_cmp, _false)); +#endif +} + +inline v128 gv_packss_s16(const v128 &low, const v128 &high) { +#if defined(ARCH_X64) + return _mm_packs_epi16(low, high); +#elif defined(ARCH_ARM64) + return vcombine_s8(vqmovn_s16(low), vqmovn_s16(high)); +#endif +} + +inline v128 gv_packus_s16(const v128 &low, const v128 &high) { +#if defined(ARCH_X64) + return _mm_packus_epi16(low, high); +#elif defined(ARCH_ARM64) + return vcombine_u8(vqmovun_s16(low), vqmovun_s16(high)); +#endif +} + +inline v128 gv_packus_u16(const v128 &low, const v128 &high) { +#if defined(__SSE4_1__) + return _mm_packus_epi16(_mm_min_epu16(low, _mm_set1_epi16(0xff)), + _mm_min_epu16(high, _mm_set1_epi16(0xff))); +#elif defined(ARCH_X64) + return _mm_packus_epi16( + _mm_sub_epi16(low, _mm_subs_epu16(low, _mm_set1_epi16(0xff))), + _mm_sub_epi16(high, _mm_subs_epu16(high, _mm_set1_epi16(0xff)))); +#elif defined(ARCH_ARM64) + return vcombine_u8(vqmovn_u16(low), vqmovn_u16(high)); +#endif +} + +inline v128 gv_packtu16(const v128 &low, const v128 &high) { +#if defined(ARCH_X64) + return _mm_packus_epi16(low & _mm_set1_epi16(0xff), + high & _mm_set1_epi16(0xff)); +#elif defined(ARCH_ARM64) + return vuzp1q_s8(low, high); +#endif +} + +inline v128 gv_packss_s32(const v128 &low, const v128 &high) { +#if defined(ARCH_X64) + return _mm_packs_epi32(low, high); +#elif defined(ARCH_ARM64) + return vcombine_s16(vqmovn_s32(low), vqmovn_s32(high)); +#endif +} + +inline v128 gv_packus_s32(const v128 &low, const v128 &high) { +#if defined(__SSE4_1__) + return _mm_packus_epi32(low, high); +#elif defined(ARCH_X64) + const auto s = _mm_srai_epi16(_mm_packs_epi32(low, high), 15); + const auto r = gv_add16(_mm_packs_epi32(gv_sub32(low, gv_bcst32(0x8000)), + gv_sub32(high, gv_bcst32(0x8000))), + gv_bcst16(0x8000)); + return gv_andn(s, r); +#elif defined(ARCH_ARM64) + return vcombine_u16(vqmovun_s32(low), vqmovun_s32(high)); +#endif +} + +inline v128 gv_packus_u32(const v128 &low, const v128 &high) { +#if defined(__SSE4_1__) + return _mm_packus_epi32(_mm_min_epu32(low, _mm_set1_epi32(0xffff)), + _mm_min_epu32(high, _mm_set1_epi32(0xffff))); +#elif defined(ARCH_X64) + const v128 s = _mm_cmpgt_epi16( + _mm_packs_epi32(_mm_srli_epi32(low, 16), _mm_srli_epi32(high, 16)), + _mm_setzero_si128()); + const v128 r = _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(low, 16), 16), + _mm_srai_epi32(_mm_slli_epi32(high, 16), 16)); + return _mm_or_si128(r, s); +#elif defined(ARCH_ARM64) + return vcombine_u16(vqmovn_u32(low), vqmovn_u32(high)); +#endif +} + +inline v128 gv_packtu32(const v128 &low, const v128 &high) { +#if defined(__SSE4_1__) + return _mm_packus_epi32(low & 
_mm_set1_epi32(0xffff), + high & _mm_set1_epi32(0xffff)); +#elif defined(ARCH_X64) + return _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(low, 16), 16), + _mm_srai_epi32(_mm_slli_epi32(high, 16), 16)); +#elif defined(ARCH_ARM64) + return vuzp1q_s16(low, high); +#endif +} + +inline v128 gv_unpacklo8(const v128 &lows, const v128 &highs) { +#if defined(ARCH_X64) + return _mm_unpacklo_epi8(lows, highs); +#elif defined(ARCH_ARM64) + return vzip1q_s8(lows, highs); +#endif +} + +inline v128 gv_extend_lo_s8(const v128 &vec) { +#if defined(__SSE4_1__) + return _mm_cvtepi8_epi16(vec); +#elif defined(ARCH_X64) + return _mm_srai_epi16(_mm_unpacklo_epi8(vec, vec), 8); +#elif defined(ARCH_ARM64) + return int16x8_t(vmovl_s8(vget_low_s8(vec))); +#endif +} + +inline v128 gv_extend_hi_s8(const v128 &vec) { +#if defined(__SSE4_1__) + return _mm_cvtepi8_epi16(_mm_loadu_si64(vec._bytes + 8)); +#elif defined(ARCH_X64) + return _mm_srai_epi16(_mm_unpackhi_epi8(vec, vec), 8); +#elif defined(ARCH_ARM64) + return int16x8_t(vmovl_s8(vget_high_s8(vec))); +#endif +} + +inline v128 gv_unpacklo16(const v128 &lows, const v128 &highs) { +#if defined(ARCH_X64) + return _mm_unpacklo_epi16(lows, highs); +#elif defined(ARCH_ARM64) + return vzip1q_s16(lows, highs); +#endif +} + +inline v128 gv_extend_lo_s16(const v128 &vec) { +#if defined(__SSE4_1__) + return _mm_cvtepi16_epi32(vec); +#elif defined(ARCH_X64) + return _mm_srai_epi32(_mm_unpacklo_epi16(vec, vec), 16); +#elif defined(ARCH_ARM64) + return int32x4_t(vmovl_s16(vget_low_s16(vec))); +#endif +} + +inline v128 gv_extend_hi_s16(const v128 &vec) { +#if defined(__SSE4_1__) + return _mm_cvtepi16_epi32(_mm_loadu_si64(vec._bytes + 8)); +#elif defined(ARCH_X64) + return _mm_srai_epi32(_mm_unpackhi_epi16(vec, vec), 16); +#elif defined(ARCH_ARM64) + return int32x4_t(vmovl_s16(vget_high_s16(vec))); +#endif +} + +inline v128 gv_unpacklo32(const v128 &lows, const v128 &highs) { +#if defined(ARCH_X64) + return _mm_unpacklo_epi32(lows, highs); +#elif defined(ARCH_ARM64) + return vzip1q_s32(lows, highs); +#endif +} + +inline v128 gv_unpackhi8(const v128 &lows, const v128 &highs) { +#if defined(ARCH_X64) + return _mm_unpackhi_epi8(lows, highs); +#elif defined(ARCH_ARM64) + return vzip2q_s8(lows, highs); +#endif +} + +inline v128 gv_unpackhi16(const v128 &lows, const v128 &highs) { +#if defined(ARCH_X64) + return _mm_unpackhi_epi16(lows, highs); +#elif defined(ARCH_ARM64) + return vzip2q_s16(lows, highs); +#endif +} + +inline v128 gv_unpackhi32(const v128 &lows, const v128 &highs) { +#if defined(ARCH_X64) + return _mm_unpackhi_epi32(lows, highs); +#elif defined(ARCH_ARM64) + return vzip2q_s32(lows, highs); +#endif +} + +inline bool v128::operator==(const v128 &b) const { +#if defined(ARCH_X64) + return gv_testz(_mm_xor_si128(*this, b)); +#else + return gv_testz(*this ^ b); +#endif +} + +inline v128 v128::operator|(const v128 &rhs) const { +#if defined(ARCH_X64) + return _mm_or_si128(*this, rhs); +#elif defined(ARCH_ARM64) + return vorrq_s32(*this, rhs); +#endif +} + +inline v128 v128::operator&(const v128 &rhs) const { +#if defined(ARCH_X64) + return _mm_and_si128(*this, rhs); +#elif defined(ARCH_ARM64) + return vandq_s32(*this, rhs); +#endif +} + +inline v128 v128::operator^(const v128 &rhs) const { +#if defined(ARCH_X64) + return _mm_xor_si128(*this, rhs); +#elif defined(ARCH_ARM64) + return veorq_s32(*this, rhs); +#endif +} + +inline v128 v128::operator~() const { +#if defined(ARCH_X64) + return _mm_xor_si128(*this, _mm_set1_epi32(-1)); +#elif defined(ARCH_ARM64) + return 
vmvnq_u32(*this); +#endif +} + +inline v128 gv_exp2_approxfs(const v128 &a) { + // TODO +#if 0 + const auto x0 = _mm_max_ps(_mm_min_ps(a, _mm_set1_ps(127.4999961f)), _mm_set1_ps(-127.4999961f)); + const auto x1 = _mm_add_ps(x0, _mm_set1_ps(0.5f)); + const auto x2 = _mm_sub_epi32(_mm_cvtps_epi32(x1), _mm_and_si128(_mm_castps_si128(_mm_cmpnlt_ps(_mm_setzero_ps(), x1)), _mm_set1_epi32(1))); + const auto x3 = _mm_sub_ps(x0, _mm_cvtepi32_ps(x2)); + const auto x4 = _mm_mul_ps(x3, x3); + const auto x5 = _mm_mul_ps(x3, _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(x4, _mm_set1_ps(0.023093347705f)), _mm_set1_ps(20.20206567f)), x4), _mm_set1_ps(1513.906801f))); + const auto x6 = _mm_mul_ps(x5, _mm_rcp_ps(_mm_sub_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(233.1842117f), x4), _mm_set1_ps(4368.211667f)), x5))); + return _mm_mul_ps(_mm_add_ps(_mm_add_ps(x6, x6), _mm_set1_ps(1.0f)), _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(x2, _mm_set1_epi32(127)), 23))); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._f[i] = std::exp2f(a._f[i]); + return r; +#endif +} + +inline v128 gv_log2_approxfs(const v128 &a) { + // TODO +#if 0 + const auto _1 = _mm_set1_ps(1.0f); + const auto _c = _mm_set1_ps(1.442695040f); + const auto x0 = _mm_max_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x00800000))); + const auto x1 = _mm_or_ps(_mm_and_ps(x0, _mm_castsi128_ps(_mm_set1_epi32(0x807fffff))), _1); + const auto x2 = _mm_rcp_ps(_mm_add_ps(x1, _1)); + const auto x3 = _mm_mul_ps(_mm_sub_ps(x1, _1), x2); + const auto x4 = _mm_add_ps(x3, x3); + const auto x5 = _mm_mul_ps(x4, x4); + const auto x6 = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(-0.7895802789f), x5), _mm_set1_ps(16.38666457f)), x5), _mm_set1_ps(-64.1409953f)); + const auto x7 = _mm_rcp_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(-35.67227983f), x5), _mm_set1_ps(312.0937664f)), x5), _mm_set1_ps(-769.6919436f))); + const auto x8 = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_castps_si128(x0), 23), _mm_set1_epi32(127))); + return _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_mul_ps(_mm_mul_ps(x5, x6), x7), x4), _c), _mm_add_ps(_mm_mul_ps(x4, _c), x8)); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._f[i] = std::log2f(a._f[i]); + return r; +#endif +} + +// For each 8-bit element, r = a << (b & 7) +inline v128 gv_shl8(const v128 &a, const v128 &b) { +#if defined(ARCH_ARM64) + return vshlq_u8(a, vandq_s8(b, gv_bcst8(7))); +#else + const v128 x1 = gv_add8(a, a); // shift left by 1 + const v128 r1 = gv_signselect8(gv_shl64(b, 7), x1, a); + const v128 x2 = gv_and32(gv_shl64(r1, 2), gv_bcst8(0xfc)); // shift by 2 + const v128 r2 = gv_signselect8(gv_shl64(b, 6), x2, r1); + const v128 x3 = gv_and32(gv_shl64(r2, 4), gv_bcst8(0xf0)); // shift by 4 + return gv_signselect8(gv_shl64(b, 5), x3, r2); +#endif +} + +// For each 16-bit element, r = a << (b & 15) +inline v128 gv_shl16(const v128 &a, const v128 &b) { +#if defined(__AVX512VL__) && defined(__AVX512BW__) + return _mm_sllv_epi16(a, _mm_and_si128(b, _mm_set1_epi16(15))); +#elif defined(ARCH_ARM64) + return vshlq_u16(a, vandq_s16(b, gv_bcst8(15))); +#else + v128 r; + for (u32 i = 0; i < 8; i++) + r._u16[i] = a._u16[i] << (b._u16[i] & 15); + return r; +#endif +} + +// For each 32-bit element, r = a << (b & 31) +inline v128 gv_shl32(const v128 &a, const v128 &b) { +#if defined(__AVX2__) + return _mm_sllv_epi32(a, _mm_and_si128(b, _mm_set1_epi32(31))); +#elif defined(ARCH_ARM64) + return vshlq_u32(a, vandq_s32(b, gv_bcst8(31))); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._u32[i] = a._u32[i] << (b._u32[i] & 
31); + return r; +#endif +} + +// For each unsigned 8-bit element, r = a >> (b & 7) +inline v128 gv_shr8(const v128 &a, const v128 &b) { +#if defined(ARCH_ARM64) + return vshlq_u8(a, vnegq_s8(vandq_s8(b, gv_bcst8(7)))); +#else + const v128 x1 = gv_and32(gv_shr64(a, 1), gv_bcst8(0x7f)); // shift right by 1 + const v128 r1 = gv_signselect8(gv_shl64(b, 7), x1, a); + const v128 x2 = gv_and32(gv_shr64(r1, 2), gv_bcst8(0x3f)); // shift by 2 + const v128 r2 = gv_signselect8(gv_shl64(b, 6), x2, r1); + const v128 x3 = gv_and32(gv_shr64(r2, 4), gv_bcst8(0x0f)); // shift by 4 + return gv_signselect8(gv_shl64(b, 5), x3, r2); +#endif +} + +// For each unsigned 16-bit element, r = a >> (b & 15) +inline v128 gv_shr16(const v128 &a, const v128 &b) { +#if defined(__AVX512VL__) && defined(__AVX512BW__) + return _mm_srlv_epi16(a, _mm_and_si128(b, _mm_set1_epi16(15))); +#elif defined(ARCH_ARM64) + return vshlq_u16(a, vnegq_s16(vandq_s16(b, gv_bcst8(15)))); +#else + v128 r; + for (u32 i = 0; i < 8; i++) + r._u16[i] = a._u16[i] >> (b._u16[i] & 15); + return r; +#endif +} + +// For each unsigned 32-bit element, r = a >> (b & 31) +inline v128 gv_shr32(const v128 &a, const v128 &b) { +#if defined(__AVX2__) + return _mm_srlv_epi32(a, _mm_and_si128(b, _mm_set1_epi32(31))); +#elif defined(ARCH_ARM64) + return vshlq_u32(a, vnegq_s32(vandq_s32(b, gv_bcst8(31)))); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._u32[i] = a._u32[i] >> (b._u32[i] & 31); + return r; +#endif +} + +// For each signed 8-bit element, r = a >> (b & 7) +inline v128 gv_sar8(const v128 &a, const v128 &b) { +#if defined(ARCH_ARM64) + return vshlq_s8(a, vnegq_s8(vandq_s8(b, gv_bcst8(7)))); +#else + v128 r; + for (u32 i = 0; i < 16; i++) + r._s8[i] = a._s8[i] >> (b._s8[i] & 7); + return r; +#endif +} + +// For each signed 16-bit element, r = a >> (b & 15) +inline v128 gv_sar16(const v128 &a, const v128 &b) { +#if defined(__AVX512VL__) && defined(__AVX512BW__) + return _mm_srav_epi16(a, _mm_and_si128(b, _mm_set1_epi16(15))); +#elif defined(ARCH_ARM64) + return vshlq_s16(a, vnegq_s16(vandq_s16(b, gv_bcst8(15)))); +#else + v128 r; + for (u32 i = 0; i < 8; i++) + r._s16[i] = a._s16[i] >> (b._s16[i] & 15); + return r; +#endif +} + +// For each signed 32-bit element, r = a >> (b & 31) +inline v128 gv_sar32(const v128 &a, const v128 &b) { +#if defined(__AVX2__) + return _mm_srav_epi32(a, _mm_and_si128(b, _mm_set1_epi32(31))); +#elif defined(ARCH_ARM64) + return vshlq_s32(a, vnegq_s32(vandq_s32(b, gv_bcst8(31)))); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._s32[i] = a._s32[i] >> (b._s32[i] & 31); + return r; +#endif +} + +// For each 8-bit element, r = rotate a by b +inline v128 gv_rol8(const v128 &a, const v128 &b) { +#if defined(ARCH_ARM64) + const auto amt1 = vandq_s8(b, gv_bcst8(7)); + const auto amt2 = vsubq_s8(amt1, gv_bcst8(8)); + return vorrq_u8(vshlq_u8(a, amt1), vshlq_u8(a, amt2)); +#else + const v128 x1 = + gv_sub8(gv_add8(a, a), gv_gts8(gv_bcst8(0), a)); // rotate left by 1 + const v128 r1 = gv_signselect8(gv_shl64(b, 7), x1, a); + const v128 c2 = gv_bcst8(0x3); + const v128 x2 = gv_or32(gv_and32(gv_shr64(r1, 6), c2), + gv_andn32(c2, gv_shl64(r1, 2))); // rotate by 2 + const v128 r2 = gv_signselect8(gv_shl64(b, 6), x2, r1); + const v128 c3 = gv_bcst8(0xf); + const v128 x3 = gv_or32(gv_and32(gv_shr64(r2, 4), c3), + gv_andn32(c3, gv_shl64(r2, 4))); // rotate by 4 + return gv_signselect8(gv_shl64(b, 5), x3, r2); +#endif +} + +// For each 16-bit element, r = rotate a by b +inline v128 gv_rol16(const v128 &a, const v128 &b) { +#if 
defined(ARCH_ARM64) + const auto amt1 = vandq_s16(b, gv_bcst16(15)); + const auto amt2 = vsubq_s16(amt1, gv_bcst16(16)); + return vorrq_u16(vshlq_u16(a, amt1), vshlq_u16(a, amt2)); +#else + v128 r; + for (u32 i = 0; i < 8; i++) + r._u16[i] = rol16(a._u16[i], b._u16[i]); + return r; +#endif +} + +// For each 16-bit element, r = rotate a by count +template inline v128 gv_rol16(const v128 &a) { + constexpr u8 count = Count & 0xf; +#if defined(ARCH_X64) + return _mm_or_si128(_mm_srli_epi16(a, 16 - count), _mm_slli_epi16(a, count)); +#elif defined(ARCH_ARM64) + return vorrq_u16(vshrq_n_u16(a, 16 - count), vshlq_n_u16(a, count)); +#else + v128 r; + for (u32 i = 0; i < 8; i++) + r._u16[i] = std::rotl(a._u16[i], count); + return r; +#endif +} + +// For each 32-bit element, r = rotate a by b +inline v128 gv_rol32(const v128 &a, const v128 &b) { +#if defined(__AVX512VL__) + return _mm_rolv_epi32(a, b); +#elif defined(ARCH_ARM64) + const auto amt1 = vandq_s32(b, gv_bcst32(31)); + const auto amt2 = vsubq_s32(amt1, gv_bcst32(32)); + return vorrq_u32(vshlq_u32(a, amt1), vshlq_u32(a, amt2)); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._u32[i] = rol32(a._u32[i], b._u32[i]); + return r; +#endif +} + +// For each 32-bit element, r = rotate a by count +template inline v128 gv_rol32(const v128 &a) { + constexpr u8 count = Count & 0x1f; +#if defined(__AVX512VL__) + return _mm_rol_epi32(a, count); +#elif defined(ARCH_X64) + return _mm_or_si128(_mm_srli_epi32(a, 32 - count), _mm_slli_epi32(a, count)); +#elif defined(ARCH_ARM64) + return vorrq_u32(vshrq_n_u32(a, 32 - count), vshlq_n_u32(a, count)); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._u32[i] = rol32(a._u32[i], count); + return r; +#endif +} + +// For each 8-bit element, r = (a << (c & 7)) | (b >> (~c & 7) >> 1) +inline auto gv_fshl8(const v128 &a, const v128 &b, const v128 &c) { +#if defined(ARCH_ARM64) + const auto amt1 = vandq_s8(c, gv_bcst8(7)); + const auto amt2 = vsubq_s8(amt1, gv_bcst8(8)); + return v128(vorrq_u8(vshlq_u8(a, amt1), vshlq_u8(b, amt2))); +#else + auto x1 = gv_sub8(gv_add8(a, a), gv_gts8(gv_bcst8(0), b)); + auto s1 = gv_shl64(c, 7); + auto r1 = gv_signselect8(s1, x1, a); + auto b1 = gv_signselect8(s1, gv_shl64(b, 1), b); + auto c2 = gv_bcst8(0x3); + auto x2 = gv_and32(gv_shr64(b1, 6), c2); + x2 = gv_or32(x2, gv_andn32(c2, gv_shl64(r1, 2))); + auto s2 = gv_shl64(c, 6); + auto r2 = gv_signselect8(s2, x2, r1); + auto b2 = gv_signselect8(s2, gv_shl64(b1, 2), b1); + auto c3 = gv_bcst8(0xf); + auto x3 = gv_and32(gv_shr64(b2, 4), c3); + x3 = gv_or32(x3, gv_andn32(c3, gv_shl64(r2, 4))); + return gv_signselect8(gv_shl64(c, 5), x3, r2); +#endif +} + +// For each 8-bit element, r = (b >> (c & 7)) | (a << (~c & 7) << 1) +inline auto gv_fshr8(const v128 &a, const v128 &b, const v128 &c) { +#if defined(ARCH_ARM64) + const auto amt1 = vandq_s8(c, gv_bcst8(7)); + const auto amt2 = vsubq_s8(gv_bcst8(8), amt1); + return vorrq_u8(vshlq_u8(b, vnegq_s8(amt1)), vshlq_u8(a, amt2)); +#else + auto c1 = gv_bcst8(0x7f); + auto x1 = gv_and32(gv_shr64(b, 1), c1); + x1 = gv_or32(x1, gv_andn32(c1, gv_shl64(a, 7))); + auto s1 = gv_shl64(c, 7); + auto r1 = gv_signselect8(s1, x1, b); + auto a1 = gv_signselect8(s1, gv_shr64(a, 1), a); + auto c2 = gv_bcst8(0x3f); + auto x2 = gv_and32(gv_shr64(r1, 2), c2); + x2 = gv_or32(x2, gv_andn32(c2, gv_shl64(a1, 6))); + auto s2 = gv_shl64(c, 6); + auto r2 = gv_signselect8(s2, x2, r1); + auto a2 = gv_signselect8(s2, gv_shr64(a1, 2), a1); + auto c3 = gv_bcst8(0x0f); + auto x3 = gv_and32(gv_shr64(r2, 4), c3); + x3 = 
gv_or32(x3, gv_andn32(c3, gv_shl64(a2, 4))); + return gv_signselect8(gv_shl64(c, 5), x3, r2); +#endif +} + +// Shift left by byte amount +template inline v128 gv_shuffle_left(const v128 &a) { + if (Count > 15) + return {}; +#if defined(ARCH_X64) + return _mm_slli_si128(a, Count); +#elif defined(ARCH_ARM64) + v128 idx; + for (u32 i = 0; i < 16; i++) + idx._u8[i] = u8(i - Count); + return vqtbl1q_u8(a, idx); +#endif +} + +// Shift right by byte amount +template inline v128 gv_shuffle_right(const v128 &a) { + if (Count > 15) + return {}; +#if defined(ARCH_X64) + return _mm_srli_si128(a, Count); +#elif defined(ARCH_ARM64) + v128 idx; + for (u32 i = 0; i < 16; i++) + idx._u8[i] = u8(i + Count); + return vqtbl1q_u8(a, idx); +#endif +} + +// Load 32-bit integer into the first element of a new vector, set other +// elements to zero +inline v128 gv_loadu32(const void *ptr) { +#if defined(ARCH_X64) + return _mm_loadu_si32(ptr); +#elif defined(ARCH_ARM64) + return vld1q_lane_u32(static_cast(ptr), vdupq_n_u32(0), 0); +#endif +} + +// Load 16-bit integer into an existing vector at the position specified by +// Index +template inline v128 gv_insert16(const v128 &vec, u16 value) { +#if defined(ARCH_X64) + return _mm_insert_epi16(vec, value, Index); +#elif defined(ARCH_ARM64) + return vsetq_lane_u16(value, vec, Index & 0x7); +#endif +} + +// For each 8-bit element, +// if ctrl >= 0 && ctrl < 16 then r = vec[ctrl], +// else if ctrl < 0 then r = 0 +inline v128 gv_shuffle8(const v128 &vec, const v128 &ctrl) { + AUDIT( + std::ranges::none_of( + ctrl._chars, [](s8 i) { return i >= static_cast(sizeof(v128)); }), + "All indices must be in the range [0, 15] or negative, since PSHUFB and " + "TBL behave differently otherwise"); +#if defined(__SSSE3__) + return _mm_shuffle_epi8(vec, ctrl); +#elif defined(ARCH_ARM64) + return vqtbl1q_s8(vec, ctrl); +#else + v128 r; + for (s32 i = 0; i < 16; i++) + r._s8[i] = ctrl._s8[i] < 0 ? 
0 : vec._s8[ctrl._s8[i] & 0xf]; + return r; +#endif +} + +// For each 2-bit index in Control, r = vec[index] +template inline v128 gv_shuffle32(const v128 &vec) { +#if defined(ARCH_X64) + return _mm_shuffle_epi32(vec, Control); +#elif defined(ARCH_ARM64) + constexpr u8 idx0 = (Control & 3) * sizeof(s32); + constexpr u8 idx1 = (Control >> 2 & 3) * sizeof(s32); + constexpr u8 idx2 = (Control >> 4 & 3) * sizeof(s32); + constexpr u8 idx3 = (Control >> 6 & 3) * sizeof(s32); + + constexpr uint8x16_t idx_vec = { + idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, + idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3}; + + return vqtbl1q_s8(vec, idx_vec); +#endif +} + +// For each index, r = vec[index & 3] +template +inline v128 gv_shuffle32(const v128 &vec) { +#if defined(ARCH_X64) + return _mm_shuffle_epi32(vec, (Index0 & 3) | (Index1 & 3) << 2 | + (Index2 & 3) << 4 | (Index3 & 3) << 6); +#elif defined(ARCH_ARM64) + constexpr u8 idx0 = (Index0 & 3) * sizeof(s32); + constexpr u8 idx1 = (Index1 & 3) * sizeof(s32); + constexpr u8 idx2 = (Index2 & 3) * sizeof(s32); + constexpr u8 idx3 = (Index3 & 3) * sizeof(s32); + + constexpr uint8x16_t idx_vec = { + idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, + idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3}; + + return vqtbl1q_s8(vec, idx_vec); +#endif +} + +// For the first two 2-bit indices in Control, r = a[index], +// for the last two indices, r = b[index] +template inline v128 gv_shufflefs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_shuffle_ps(a, b, Control); +#elif defined(ARCH_ARM64) + constexpr u8 idx0 = (Control & 3) * sizeof(s32); + constexpr u8 idx1 = (Control >> 2 & 3) * sizeof(s32); + constexpr u8 idx2 = (Control >> 4 & 3) * sizeof(s32) + sizeof(v128); + constexpr u8 idx3 = (Control >> 6 & 3) * sizeof(s32) + sizeof(v128); + + constexpr uint8x16_t idx_vec = { + idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, + idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3}; + + return vqtbl2q_s8({a, b}, idx_vec); +#endif +} + +// For the first two indices, r = a[index & 3], +// for the last two indices, r = b[index & 3] +template +inline v128 gv_shufflefs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_shuffle_ps(a, b, + (Index0 & 3) | (Index1 & 3) << 2 | (Index2 & 3) << 4 | + (Index3 & 3) << 6); +#elif defined(ARCH_ARM64) + constexpr u8 idx0 = (Index0 & 3) * sizeof(s32); + constexpr u8 idx1 = (Index1 & 3) * sizeof(s32); + constexpr u8 idx2 = (Index2 & 3) * sizeof(s32) + sizeof(v128); + constexpr u8 idx3 = (Index3 & 3) * sizeof(s32) + sizeof(v128); + + constexpr uint8x16_t idx_vec = { + idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, + idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3}; + + return vqtbl2q_s8({a, b}, idx_vec); +#endif +} + +// For each 32-bit element, reverse byte order +inline v128 gv_rev32(const v128 &vec) { +#if defined(__SSSE3__) + return _mm_shuffle_epi8( + vec, _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12)); +#elif defined(ARCH_ARM64) + return vrev32q_u8(vec); +#else + return gv_rol32<16>(gv_rol16<8>(vec)); +#endif +} + +// For each 32-bit element, convert between big-endian and native-endian +inline v128 gv_to_be32(const v128 &vec) { + if constexpr (std::endian::native == std::endian::little) + return gv_rev32(vec); + return vec; +} + +#if defined(__clang__) +#pragma clang diagnostic pop +#elif 
defined(__GNUC__) +#pragma GCC diagnostic pop +#endif +} // namespace rx diff --git a/rx/include/rx/types.hpp b/rx/include/rx/types.hpp new file mode 100644 index 000000000..c545a3c57 --- /dev/null +++ b/rx/include/rx/types.hpp @@ -0,0 +1,1522 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__SSE2__) || defined(_M_X64) || defined(_M_AMD64) || \ + defined(__x86_64__) || defined(__amd64__) +#define ARCH_X64 1 +#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) +#define ARCH_ARM64 1 +// v8.4a+ gives us atomic 16 byte ld/st +// See Arm C Language Extensions Documentation +// Currently there is no feature macro for LSE2 specifically so we define it +// ourself Unfortunately the __ARM_ARCH integer macro isn't universally defined +// so we use this hack instead +#if defined(__ARM_ARCH_8_4__) || defined(__ARM_ARCH_8_5__) || \ + defined(__ARM_ARCH_8_6__) || defined(__ARM_ARCH_9__) +#define ARM_FEATURE_LSE2 1 +#endif +#endif + +using std::chrono::steady_clock; + +using namespace std::literals; + +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + +#ifdef _MSC_VER +#define SAFE_BUFFERS(...) __declspec(safebuffers) __VA_ARGS__ +#define NEVER_INLINE __declspec(noinline) +#define FORCE_INLINE __forceinline +#else // not _MSC_VER +#ifdef __clang__ +#define SAFE_BUFFERS(...) __attribute__((no_stack_protector)) __VA_ARGS__ +#else +#define SAFE_BUFFERS(...) \ + __VA_ARGS__ __attribute__((__optimize__("no-stack-protector"))) +#endif +#define NEVER_INLINE __attribute__((noinline)) inline +#define FORCE_INLINE __attribute__((always_inline)) inline +#endif // _MSC_VER + +#define CHECK_SIZE(type, size) \ + static_assert(sizeof(type) == size, "Invalid " #type " type size") +#define CHECK_ALIGN(type, align) \ + static_assert(alignof(type) == align, "Invalid " #type " type alignment") +#define CHECK_MAX_SIZE(type, size) \ + static_assert(sizeof(type) <= size, #type " type size is too big") +#define CHECK_SIZE_ALIGN(type, size, align) \ + CHECK_SIZE(type, size); \ + CHECK_ALIGN(type, align) + +#define DECLARE(...) decltype(__VA_ARGS__) __VA_ARGS__ + +#define STR_CASE(...) \ + case __VA_ARGS__: \ + return #__VA_ARGS__ + +#if defined(_DEBUG) || defined(_AUDIT) +#define AUDIT(...) (static_cast(ensure(__VA_ARGS__))) +#else +#define AUDIT(...) (static_cast>(0)) +#endif + +namespace rx { +template struct fn_helper { + F f; + + fn_helper(F &&f) : f(std::forward(f)) {} + + template auto operator()(Args &&...args) const { + if constexpr (sizeof...(Args) == 0) + return f(0, 0, 0, 0); + else if constexpr (sizeof...(Args) == 1) + return f(std::forward(args)..., 0, 0, 0); + else if constexpr (sizeof...(Args) == 2) + return f(std::forward(args)..., 0, 0); + else if constexpr (sizeof...(Args) == 3) + return f(std::forward(args)..., 0); + else if constexpr (sizeof...(Args) == 4) + return f(std::forward(args)...); + else + static_assert(sizeof...(Args) <= 4); + } +}; + +template fn_helper(F &&f) -> fn_helper; +} // namespace rx + +// Shorter lambda. +#define FN(...) 
\ + ::rx::fn_helper([&]([[maybe_unused]] auto &&x, [[maybe_unused]] auto &&y, \ + [[maybe_unused]] auto &&z, \ + [[maybe_unused]] auto &&w) { return (__VA_ARGS__); }) + +#if __cpp_lib_bit_cast < 201806L +namespace std { +template +[[nodiscard]] constexpr To bit_cast(const From &from) noexcept { + return __builtin_bit_cast(To, from); +} +} // namespace std +#endif + +#if defined(__INTELLISENSE__) || (defined(__clang__) && (__clang_major__ <= 16)) +#define consteval constexpr +#define constinit +#endif + +// FIXME: move to ps3 kernel implementation +using schar = signed char; +using uchar = unsigned char; +using ushort = unsigned short; +using uint = unsigned int; +using ulong = unsigned long; +using ullong = unsigned long long; +using llong = long long; + +using uptr = std::uintptr_t; + +using u8 = std::uint8_t; +using u16 = std::uint16_t; +using u32 = std::uint32_t; +using u64 = std::uint64_t; +using usz = std::size_t; + +using s8 = std::int8_t; +using s16 = std::int16_t; +using s32 = std::int32_t; +using s64 = std::int64_t; +using ssz = std::make_signed_t; + +// Get integral type from type size +template struct get_int_impl {}; + +template <> struct get_int_impl { + using utype = u8; +}; + +template <> struct get_int_impl { + using utype = u16; +}; + +template <> struct get_int_impl { + using utype = u32; +}; + +template <> struct get_int_impl { + using utype = u64; +}; + +template using get_uint_t = typename get_int_impl::utype; + +template std::remove_cvref_t as_rvalue(T &&obj) { + return std::forward(obj); +} + +template class se_t; + +// se_t<> with native endianness +template +using nse_t = se_t; + +template +using be_t = se_t; +template +using le_t = se_t; + +// FIXME: remove +template class atomic_t; +template +using atomic_be_t = atomic_t, Align>; +template +using atomic_le_t = atomic_t, Align>; + +// Bool type equivalent +class b8 { + u8 m_value; + +public: + b8() = default; + + using enable_bitcopy = std::true_type; + + constexpr b8(bool value) noexcept : m_value(value) {} + + constexpr operator bool() const noexcept { return m_value != 0; } + + constexpr bool set(bool value) noexcept { + m_value = value; + return value; + } +}; + +#if defined(ARCH_X64) && !defined(_MSC_VER) +using __m128i = long long __attribute__((vector_size(16))); +using __m128d = double __attribute__((vector_size(16))); +using __m128 = float __attribute__((vector_size(16))); +#endif + +#ifndef _MSC_VER +using u128 = __uint128_t; +using s128 = __int128_t; +#else + +extern "C" { +union __m128; +union __m128i; +struct __m128d; + +uchar _addcarry_u64(uchar, u64, u64, u64 *); +uchar _subborrow_u64(uchar, u64, u64, u64 *); +u64 __shiftleft128(u64, u64, uchar); +u64 __shiftright128(u64, u64, uchar); +u64 _umul128(u64, u64, u64 *); +} + +// Unsigned 128-bit integer implementation (TODO) +struct alignas(16) u128 { + u64 lo, hi; + + u128() noexcept = default; + + template + requires std::is_unsigned_v + constexpr u128(T arg) noexcept : lo(arg), hi(0) {} + + template + requires std::is_signed_v + constexpr u128(T arg) noexcept : lo(s64{arg}), hi(s64{arg} >> 63) {} + + constexpr explicit operator bool() const noexcept { return !!(lo | hi); } + + constexpr explicit operator u64() const noexcept { return lo; } + + constexpr explicit operator s64() const noexcept { return lo; } + + constexpr friend u128 operator+(const u128 &l, const u128 &r) { + u128 value = l; + value += r; + return value; + } + + constexpr friend u128 operator-(const u128 &l, const u128 &r) { + u128 value = l; + value -= r; + return value; + } + + 
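// operator* follows the same pattern as operator+ and operator- above: copy the left operand and delegate to operator*=, which computes the low 64x64 product (via _umul128 outside constant evaluation) and adds the cross terms r.hi*lo + r.lo*hi into the high half. + 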
constexpr friend u128 operator*(const u128 &l, const u128 &r) { + u128 value = l; + value *= r; + return value; + } + + constexpr u128 operator+() const { return *this; } + + constexpr u128 operator-() const { + u128 value{}; + value -= *this; + return value; + } + + constexpr u128 &operator++() { + *this += 1; + return *this; + } + + constexpr u128 operator++(int) { + u128 value = *this; + *this += 1; + return value; + } + + constexpr u128 &operator--() { + *this -= 1; + return *this; + } + + constexpr u128 operator--(int) { + u128 value = *this; + *this -= 1; + return value; + } + + constexpr u128 operator<<(u128 shift_value) const { + u128 value = *this; + value <<= shift_value; + return value; + } + + constexpr u128 operator>>(u128 shift_value) const { + u128 value = *this; + value >>= shift_value; + return value; + } + + constexpr u128 operator~() const { + u128 value{}; + value.lo = ~lo; + value.hi = ~hi; + return value; + } + + constexpr friend u128 operator&(const u128 &l, const u128 &r) { + u128 value{}; + value.lo = l.lo & r.lo; + value.hi = l.hi & r.hi; + return value; + } + + constexpr friend u128 operator|(const u128 &l, const u128 &r) { + u128 value{}; + value.lo = l.lo | r.lo; + value.hi = l.hi | r.hi; + return value; + } + + constexpr friend u128 operator^(const u128 &l, const u128 &r) { + u128 value{}; + value.lo = l.lo ^ r.lo; + value.hi = l.hi ^ r.hi; + return value; + } + + constexpr u128 &operator+=(const u128 &r) { + if (std::is_constant_evaluated()) { + lo += r.lo; + hi += r.hi + (lo < r.lo); + } else { + _addcarry_u64(_addcarry_u64(0, r.lo, lo, &lo), r.hi, hi, &hi); + } + + return *this; + } + + constexpr u128 &operator-=(const u128 &r) { + if (std::is_constant_evaluated()) { + hi -= r.hi + (lo < r.lo); + lo -= r.lo; + } else { + _subborrow_u64(_subborrow_u64(0, lo, r.lo, &lo), hi, r.hi, &hi); + } + + return *this; + } + + constexpr u128 &operator*=(const u128 &r) { + const u64 _hi = r.hi * lo + r.lo * hi; + + if (std::is_constant_evaluated()) { + hi = (lo >> 32) * (r.lo >> 32) + + (((lo >> 32) * (r.lo & 0xffffffff)) >> 32) + + (((r.lo >> 32) * (lo & 0xffffffff)) >> 32); + lo = lo * r.lo; + } else { + lo = _umul128(lo, r.lo, &hi); + } + + hi += _hi; + return *this; + } + + constexpr u128 &operator<<=(const u128 &r) { + if (std::is_constant_evaluated()) { + if (r.hi == 0 && r.lo < 64) { + hi = (hi << r.lo) | (lo >> (64 - r.lo)); + lo = (lo << r.lo); + return *this; + } else if (r.hi == 0 && r.lo < 128) { + hi = (lo << (r.lo - 64)); + lo = 0; + return *this; + } + } + + const u64 v0 = lo << (r.lo & 63); + const u64 v1 = __shiftleft128(lo, hi, static_cast(r.lo)); + lo = (r.lo & 64) ? 0 : v0; + hi = (r.lo & 64) ? v0 : v1; + return *this; + } + + constexpr u128 &operator>>=(const u128 &r) { + if (std::is_constant_evaluated()) { + if (r.hi == 0 && r.lo < 64) { + lo = (lo >> r.lo) | (hi << (64 - r.lo)); + hi = (hi >> r.lo); + return *this; + } else if (r.hi == 0 && r.lo < 128) { + lo = (hi >> (r.lo - 64)); + hi = 0; + return *this; + } + } + + const u64 v0 = hi >> (r.lo & 63); + const u64 v1 = __shiftright128(lo, hi, static_cast(r.lo)); + lo = (r.lo & 64) ? v0 : v1; + hi = (r.lo & 64) ? 
0 : v0; + return *this; + } + + constexpr u128 &operator&=(const u128 &r) { + lo &= r.lo; + hi &= r.hi; + return *this; + } + + constexpr u128 &operator|=(const u128 &r) { + lo |= r.lo; + hi |= r.hi; + return *this; + } + + constexpr u128 &operator^=(const u128 &r) { + lo ^= r.lo; + hi ^= r.hi; + return *this; + } +}; + +// Signed 128-bit integer implementation +struct s128 : u128 { + using u128::u128; + + constexpr s128 operator>>(u128 shift_value) const { + s128 value = *this; + value >>= shift_value; + return value; + } + + constexpr s128 &operator>>=(const u128 &r) { + if (std::is_constant_evaluated()) { + if (r.hi == 0 && r.lo < 64) { + lo = (lo >> r.lo) | (hi << (64 - r.lo)); + hi = (static_cast(hi) >> r.lo); + return *this; + } else if (r.hi == 0 && r.lo < 128) { + s64 _lo = static_cast(hi) >> (r.lo - 64); + lo = _lo; + hi = _lo >> 63; + return *this; + } + } + + const u64 v0 = static_cast(hi) >> (r.lo & 63); + const u64 v1 = __shiftright128(lo, hi, static_cast(r.lo)); + lo = (r.lo & 64) ? v0 : v1; + hi = (r.lo & 64) ? static_cast(hi) >> 63 : v0; + return *this; + } +}; +#endif + +// Optimization for u64*u64=u128 +constexpr u128 u128_from_mul(u64 a, u64 b) { +#ifdef _MSC_VER + if (!std::is_constant_evaluated()) { + u64 hi; + u128 result = _umul128(a, b, &hi); + result.hi = hi; + return result; + } +#endif + + return u128{a} * b; +} + +template <> struct get_int_impl<16> { + using utype = u128; + using stype = s128; +}; + +enum class f16 : u16 {}; + +using f32 = float; +using f64 = double; + +template +concept UnsignedInt = std::is_unsigned_v> || + std::is_same_v, u128>; + +template +concept SignedInt = (std::is_signed_v> && + std::is_integral_v>) || + std::is_same_v, s128>; + +template +concept FPInt = std::is_floating_point_v> || + std::is_same_v, f16>; + +template +concept Integral = std::is_integral_v> || + std::is_same_v, u128> || + std::is_same_v, s128>; + +template constexpr T min_v; + +template constexpr std::common_type_t min_v = 0; + +template +constexpr std::common_type_t min_v = + static_cast>(-1) + << (sizeof(std::common_type_t) * 8 - 1); + +template <> constexpr inline f16 min_v{0xfbffu}; + +template <> +constexpr inline f32 min_v = std::bit_cast(0xff'7fffffu); + +template <> +constexpr inline f64 min_v = + std::bit_cast(0xffe'7ffff'ffffffffu); + +template +constexpr std::common_type_t min_v = min_v>; + +template constexpr T max_v; + +template constexpr std::common_type_t max_v = -1; + +template +constexpr std::common_type_t max_v = + static_cast>(~min_v); + +template <> constexpr inline f16 max_v{0x7bffu}; + +template <> +constexpr inline f32 max_v = std::bit_cast(0x7f'7fffffu); + +template <> +constexpr inline f64 max_v = + std::bit_cast(0x7fe'fffff'ffffffffu); + +template +constexpr std::common_type_t max_v = max_v>; + +// Return magic value for any unsigned type +constexpr struct umax_impl_t { + template constexpr bool operator==(const T &rhs) const { + return rhs == max_v; + } + + template + constexpr std::strong_ordering operator<=>(const T &rhs) const { + return rhs == max_v ? std::strong_ordering::equal + : std::strong_ordering::greater; + } + + template constexpr operator T() const { return max_v; } +} umax; + +constexpr struct smin_impl_t { + template constexpr bool operator==(const T &rhs) const { + return rhs == min_v; + } + + template + constexpr std::strong_ordering operator<=>(const T &rhs) const { + return rhs == min_v ? 
std::strong_ordering::equal + : std::strong_ordering::less; + } + + template constexpr operator T() const { return min_v; } +} smin; + +constexpr struct smax_impl_t { + template constexpr bool operator==(const T &rhs) const { + return rhs == max_v; + } + + template + constexpr std::strong_ordering operator<=>(const T &rhs) const { + return rhs == max_v ? std::strong_ordering::equal + : std::strong_ordering::greater; + } + + template constexpr operator T() const { return max_v; } +} smax; + +// Compare signed or unsigned type with its max value +constexpr struct amax_impl_t { + template + requires SignedInt || UnsignedInt + constexpr bool operator==(const T &rhs) const { + return rhs == max_v; + } + + template + requires SignedInt || UnsignedInt + constexpr std::strong_ordering operator<=>(const T &rhs) const { + return max_v <=> rhs; + } + + template + requires SignedInt || UnsignedInt + constexpr operator T() const { + return max_v; + } +} amax; + +// Compare signed or unsigned type with its minimal value (like zero or INT_MIN) +constexpr struct amin_impl_t { + template + requires SignedInt || UnsignedInt + constexpr bool operator==(const T &rhs) const { + return rhs == min_v; + } + + template + requires SignedInt || UnsignedInt + constexpr std::strong_ordering operator<=>(const T &rhs) const { + return min_v <=> rhs; + } + + template + requires SignedInt || UnsignedInt + constexpr operator T() const { + return min_v; + } +} amin; + +namespace rx::detail { +template union UndefinedObject { + T s; + + UndefinedObject() {} + ~UndefinedObject() {} +}; +} // namespace rx::detail + +#define OFFSET_OF(STRUCT, FIELD) \ + ([&] { \ + ::rx::detail::UndefinedObject undefinedObject; \ + return static_cast( \ + std::bit_cast(&(undefinedObject.s.FIELD)) - \ + std::bit_cast(&undefinedObject.s)); \ + }()) + +template inline u32 offset32(T T2::*const mptr) { +#ifdef _MSC_VER + return std::bit_cast(mptr); +#elif __GNUG__ + return std::bit_cast(mptr); +#else + static_assert(sizeof(mptr) == 0, "Unsupported pointer-to-member size"); +#endif +} + +template struct offset32_array { + static_assert(std::is_array_v, + "Invalid pointer-to-member type (array expected)"); + + template static inline u32 index32(const Arg &arg) { + return u32{sizeof(std::remove_extent_t)} * static_cast(arg); + } +}; + +template struct offset32_array> { + template static inline u32 index32(const Arg &arg) { + return u32{sizeof(T)} * static_cast(arg); + } +}; + +template struct offset32_detail; + +template +inline u32 offset32(T T2::*const mptr, const Arg &arg, const Args &...args) { + return offset32_detail::offset32(mptr, arg, args...); +} + +template struct offset32_detail { + template + static inline u32 offset32(T T2::*const mptr, const Arg &arg, + const Args &...args) { + return ::offset32(mptr, args...) 
+ offset32_array::index32(arg); + } +}; + +template struct offset32_detail { + template + static inline u32 offset32(T T2::*const mptr, T3 T4::*const mptr2, + const Args &...args) { + return ::offset32(mptr) + ::offset32(mptr2, args...); + } +}; + +// Convert 0-2-byte string to u16 value like reinterpret_cast does +constexpr u16 operator""_u16(const char *s, usz /*length*/) { + char buf[2]{s[0], s[1]}; + return std::bit_cast(buf); +} + +// Convert 3-4-byte string to u32 value like reinterpret_cast does +constexpr u32 operator""_u32(const char *s, usz /*length*/) { + char buf[4]{s[0], s[1], s[2], s[3]}; + return std::bit_cast(buf); +} + +// Convert 5-8-byte string to u64 value like reinterpret_cast does +constexpr u64 operator""_u64(const char *s, usz len) { + char buf[8]{s[0], + s[1], + s[2], + s[3], + s[4], + (len < 6 ? '\0' : s[5]), + (len < 7 ? '\0' : s[6]), + (len < 8 ? '\0' : s[7])}; + return std::bit_cast(buf); +} + +#if !defined(__INTELLISENSE__) && !__has_builtin(__builtin_COLUMN) && \ + !defined(_MSC_VER) +constexpr unsigned __builtin_COLUMN() { return -1; } +#endif + +template struct const_str_t { + static constexpr usz size = Size; + + char8_t chars[Size + 1]{}; + + constexpr const_str_t(const char (&a)[Size + 1]) { + for (usz i = 0; i <= Size; i++) + chars[i] = a[i]; + } + + constexpr const_str_t(const char8_t (&a)[Size + 1]) { + for (usz i = 0; i <= Size; i++) + chars[i] = a[i]; + } + + operator const char *() const { + return reinterpret_cast(chars); + } + + constexpr operator const char8_t *() const { return chars; } +}; + +template <> struct const_str_t { + const usz size; + + union { + const char8_t *chars; + const char *chars2; + }; + + constexpr const_str_t() : size(0), chars(nullptr) {} + + template + constexpr const_str_t(const char8_t (&a)[N]) : size(N - 1), chars(+a) {} + + template + constexpr const_str_t(const char (&a)[N]) : size(N - 1), chars2(+a) {} + + constexpr operator const char *() const { return std::launder(chars2); } + + constexpr operator const char8_t *() const { return chars; } +}; + +template const_str_t(const char (&a)[Size]) -> const_str_t; + +template +const_str_t(const char8_t (&a)[Size]) -> const_str_t; + +using const_str = const_str_t<>; + +namespace fmt { +[[noreturn]] void raw_verify_error(std::source_location loc, const char8_t *msg, + usz object); +[[noreturn]] void raw_range_error(std::source_location loc, + std::string_view index, usz container_size); +[[noreturn]] void raw_range_error(std::source_location loc, usz index, + usz container_size); +} // namespace fmt + +// No full implementation to ease on header weight +template +std::conditional_t>, usz, + std::string_view> +format_object_simplified(const T &obj) { + using type = std::remove_cvref_t; + + if constexpr (std::is_integral_v || std::is_same_v || + std::is_same_v) { + return obj; + } else if constexpr (std::is_array_v && + std::is_constructible_v) { + return {obj, std::size(obj) - 1}; + } else { + return std::string_view{}; + } +} + +template +constexpr decltype(auto) ensure( + T &&arg, const_str msg = const_str(), + std::source_location src_loc = std::source_location::current()) noexcept { + if (std::forward(arg)) [[likely]] { + return std::forward(arg); + } + + fmt::raw_verify_error(src_loc, msg, 0); +} + +template + requires(std::is_invocable_v) +constexpr decltype(auto) ensure( + T &&arg, F &&pred, const_str msg = const_str(), + std::source_location src_loc = std::source_location::current()) noexcept { + if (std::forward(pred)(std::forward(arg))) [[likely]] { + return 
std::forward(arg); + } + + fmt::raw_verify_error(src_loc, msg, 0); +} + +template + requires( + std::is_integral_v() + std::declval())>) +[[nodiscard]] constexpr To +narrow(const From &value, + std::source_location src_loc = std::source_location::current()) { + // Narrow check + using CommonFrom = std::common_type_t; + using CommonTo = std::common_type_t; + + using UnFrom = std::make_unsigned_t; + using UnTo = std::make_unsigned_t; + + constexpr bool is_from_signed = std::is_signed_v; + constexpr bool is_to_signed = std::is_signed_v; + + constexpr auto from_mask = + (is_from_signed && !is_to_signed) ? UnFrom{umax} >> 1 : UnFrom{umax}; + constexpr auto to_mask = + (is_to_signed && !is_from_signed) ? UnTo{umax} >> 1 : UnTo{umax}; + + constexpr auto mask = ~(from_mask & to_mask); + + // Signed to unsigned always require test + // Otherwise, this is bit-wise narrowing or conversion between types of + // different signedness of the same size + if constexpr ((is_from_signed && !is_to_signed) || to_mask < from_mask) { + // Try to optimize test if both are of the same signedness + if (is_from_signed != is_to_signed ? !!(value & mask) + : static_cast(value) != value) + [[unlikely]] { + fmt::raw_verify_error(src_loc, u8"Narrowing error", +value); + } + } + + return static_cast(value); +} + +// Returns u32 size() for container +template + requires requires(const CT &x) { std::size(x); } +[[nodiscard]] constexpr u32 +size32(const CT &container, + std::source_location src_loc = std::source_location::current()) { + // TODO: Support std::array + constexpr bool is_const = std::is_array_v>; + + if constexpr (is_const) { + constexpr usz Size = sizeof(container) / sizeof(container[0]); + return std::conditional_t{Size}; + } else { + return narrow(container.size(), src_loc); + } +} + +template + requires requires(CT &&x) { + std::size(x); + std::data(x); + } || requires(CT &&x) { + std::size(x); + x.front(); + } +[[nodiscard]] constexpr auto & +at32(CT &&container, T &&index, + std::source_location src_loc = std::source_location::current()) { + // Make sure the index is within u32 range + const std::make_unsigned_t> idx = index; + const u32 csz = ::size32(container, src_loc); + if (csz <= idx) [[unlikely]] + fmt::raw_range_error(src_loc, format_object_simplified(index), csz); + auto it = std::begin(std::forward(container)); + std::advance(it, idx); + return *it; +} + +template + requires requires(CT &&x, T &&y) { + x.count(y); + x.find(y); + } +[[nodiscard]] constexpr auto & +at32(CT &&container, T &&index, + std::source_location src_loc = std::source_location::current()) { + // Associative container + const auto found = container.find(std::forward(index)); + usz csv = umax; + if constexpr ((requires() { container.size(); })) + csv = container.size(); + if (found == container.end()) [[unlikely]] + fmt::raw_range_error(src_loc, format_object_simplified(index), csv); + return found->second; +} + +// Simplified hash algorithm. May be used in std::unordered_(map|set). 
+template struct value_hash { + usz operator()(T value) const { return static_cast(value) >> Shift; } +}; + +template struct fill_array_t { + std::tuple args; + + template + constexpr std::unwrap_reference_t get() const { + return std::get(args); + } + + template + constexpr std::array fill(std::index_sequence, + std::index_sequence) const { + return {(static_cast(Idx), U(get()...))...}; + } + + template constexpr operator std::array() const { + return fill(std::make_index_sequence(), + std::make_index_sequence()); + } +}; + +template constexpr auto fill_array(const T &...args) { + return fill_array_t{{args...}}; +} + +template +concept PtrCastable = requires(const volatile X *x, const volatile Y *y) { + static_cast(x); + static_cast(y); +}; + +template + requires PtrCastable +consteval bool is_same_ptr() { + if constexpr (std::is_void_v || std::is_void_v || + std::is_same_v, std::remove_cv_t>) { + return true; + } else if constexpr (sizeof(X) == sizeof(Y)) { + return true; + } else { + bool result = false; + + if constexpr (sizeof(X) < sizeof(Y)) { + std::allocator a{}; + Y *ptr = a.allocate(1); + result = static_cast(ptr) == static_cast(ptr); + a.deallocate(ptr, 1); + } else { + std::allocator a{}; + X *ptr = a.allocate(1); + result = static_cast(ptr) == static_cast(ptr); + a.deallocate(ptr, 1); + } + + return result; + } +} + +template + requires PtrCastable +constexpr bool is_same_ptr(const volatile Y *ptr) { + return static_cast(ptr) == + static_cast(ptr); +} + +template +concept PtrSame = (is_same_ptr()); + +template struct exact_t { + static_assert(std::is_reference_v || std::is_convertible_v); + + T obj; + + explicit exact_t(T &&_obj) : obj(std::forward(_obj)) {} + exact_t &operator=(const exact_t &) = delete; + + template + requires(std::is_same_v) + operator U &() const noexcept { + return obj; + }; + + template + requires(std::is_same_v) + operator const U &() const noexcept { + return obj; + }; + + template + requires(std::is_same_v && std::is_copy_constructible_v) + operator U() const noexcept { + return obj; + }; +}; + +template exact_t make_exact(T &&obj) noexcept { + return exact_t(static_cast(obj)); +} + +// Read object of type T from raw pointer, array, string, vector, or any +// contiguous container +template +constexpr T read_from_ptr(U &&array, usz pos = 0) { + // TODO: ensure array element types are trivial + static_assert(sizeof(T) % sizeof(array[0]) == 0); + std::decay_t buf[sizeof(T) / sizeof(array[0])]; + if (!std::is_constant_evaluated()) + std::memcpy(+buf, &array[pos], sizeof(buf)); + else + for (usz i = 0; i < pos; buf[i] = array[pos + i], i++) + ; + return std::bit_cast(buf); +} + +template +constexpr void write_to_ptr(U &&array, usz pos, const T &value) { + static_assert(sizeof(T) % sizeof(array[0]) == 0); + if (!std::is_constant_evaluated()) + std::memcpy(static_cast(&array[pos]), &value, sizeof(value)); + else + ensure(!"Unimplemented"); +} + +template +constexpr void write_to_ptr(U &&array, const T &value) { + static_assert(sizeof(T) % sizeof(array[0]) == 0); + if (!std::is_constant_evaluated()) + std::memcpy(&array[0], &value, sizeof(value)); + else + ensure(!"Unimplemented"); +} + +constexpr struct aref_tag_t { +} aref_tag{}; + +template class aref final { + U *m_ptr; + + static_assert(sizeof(std::decay_t) % sizeof(U) == 0); + +public: + aref() = delete; + + constexpr aref(const aref &) = default; + + explicit constexpr aref(aref_tag_t, U *ptr) : m_ptr(ptr) {} + + constexpr T value() const { return read_from_ptr(m_ptr); } + + constexpr operator T() 
const { return read_from_ptr(m_ptr); } + + aref &operator=(const aref &) = delete; + + constexpr aref &operator=(const T &value) const { + write_to_ptr(m_ptr, value); + return *this; + } +}; + +template class aref { + U *m_ptr; + + static_assert(sizeof(std::decay_t) % sizeof(U) == 0); + +public: + aref() = delete; + + constexpr aref(const aref &) = default; + + explicit constexpr aref(aref_tag_t, U *ptr) : m_ptr(ptr) {} + + aref &operator=(const aref &) = delete; + + constexpr aref operator[](usz index) const { + return aref(aref_tag, m_ptr + index * (sizeof(T) / sizeof(U))); + } +}; + +template class aref { + U *m_ptr; + + static_assert(sizeof(std::decay_t) % sizeof(U) == 0); + +public: + aref() = delete; + + constexpr aref(const aref &) = default; + + explicit constexpr aref(aref_tag_t, U *ptr) : m_ptr(ptr) {} + + aref &operator=(const aref &) = delete; + + constexpr aref operator[](usz index) const { + return aref(aref_tag, m_ptr + index * (sizeof(T) / sizeof(U))); + } +}; + +// Reference object of type T, see read_from_ptr +template +constexpr auto ref_ptr(U &&array, usz pos = 0) + -> aref> { + return aref>(aref_tag, &array[pos]); +} + +template +struct se_storage { + struct type8 { + alignas(Align > alignof(T) ? alignof(T) : Align) uchar data[sizeof(T)]; + }; + + struct type64 { + alignas(8) u64 data[sizeof(T) < 8 ? 1 : sizeof(T) / 8]; + }; + + using type = + std::conditional_t<(Align >= 8 && sizeof(T) % 8 == 0), type64, type8>; + + // Possibly unoptimized generic byteswap for unaligned data + static constexpr type swap(const type &src) noexcept; +}; + +template struct se_storage { + using type = u16; + + static constexpr u16 swap(u16 src) noexcept { +#if __cpp_lib_byteswap >= 202110L + return std::byteswap(src); +#elif defined(__GNUG__) + return __builtin_bswap16(src); +#else + if (std::is_constant_evaluated()) { + return (src >> 8) | (src << 8); + } + + return _byteswap_ushort(src); +#endif + } +}; + +template struct se_storage { + using type = u32; + + static constexpr u32 swap(u32 src) noexcept { +#if __cpp_lib_byteswap >= 202110L + return std::byteswap(src); +#elif defined(__GNUG__) + return __builtin_bswap32(src); +#else + if (std::is_constant_evaluated()) { + const u32 v0 = ((src << 8) & 0xff00ff00) | ((src >> 8) & 0x00ff00ff); + return (v0 << 16) | (v0 >> 16); + } + + return _byteswap_ulong(src); +#endif + } +}; + +template struct se_storage { + using type = u64; + + static constexpr u64 swap(u64 src) noexcept { +#if __cpp_lib_byteswap >= 202110L + return std::byteswap(src); +#elif defined(__GNUG__) + return __builtin_bswap64(src); +#else + if (std::is_constant_evaluated()) { + const u64 v0 = + ((src << 8) & 0xff00ff00ff00ff00) | ((src >> 8) & 0x00ff00ff00ff00ff); + const u64 v1 = + ((v0 << 16) & 0xffff0000ffff0000) | ((v0 >> 16) & 0x0000ffff0000ffff); + return (v1 << 32) | (v1 >> 32); + } + + return _byteswap_uint64(src); +#endif + } +}; + +template +constexpr typename se_storage::type +se_storage::swap(const type &src) noexcept { + // Try to keep u16/u32/u64 optimizations at the cost of more bitcasts + if constexpr (sizeof(T) == 1) { + return src; + } else if constexpr (sizeof(T) == 2) { + return std::bit_cast(se_storage::swap(std::bit_cast(src))); + } else if constexpr (sizeof(T) == 4) { + return std::bit_cast(se_storage::swap(std::bit_cast(src))); + } else if constexpr (sizeof(T) == 8) { + return std::bit_cast(se_storage::swap(std::bit_cast(src))); + } else if constexpr (sizeof(T) % 8 == 0) { + type64 tmp = std::bit_cast(src); + type64 dst{}; + + // Swap u64 blocks + for 
(usz i = 0; i < sizeof(T) / 8; i++) { + dst.data[i] = se_storage::swap(tmp.data[sizeof(T) / 8 - 1 - i]); + } + + return std::bit_cast(dst); + } else { + type dst{}; + + // Swap by moving every byte + for (usz i = 0; i < sizeof(T); i++) { + dst.data[i] = src.data[sizeof(T) - 1 - i]; + } + + return dst; + } +} + +// Endianness support template +template +class alignas(Align) se_t { + using type = std::remove_cv_t; + using stype = typename se_storage::type; + using storage = se_storage; + + stype m_data; + + static_assert(!std::is_pointer_v, + "se_t<> error: invalid type (pointer)"); + static_assert(!std::is_reference_v, + "se_t<> error: invalid type (reference)"); + static_assert(!std::is_array_v, "se_t<> error: invalid type (array)"); + static_assert(sizeof(type) == alignof(type), + "se_t<> error: unexpected alignment"); + + static constexpr stype to_data(type value) noexcept { + if constexpr (Swap) { + return storage::swap(std::bit_cast(value)); + } else { + return std::bit_cast(value); + } + } + + static constexpr auto int_or_enum() { + if constexpr (std::is_enum_v) { + return std::underlying_type_t{}; + } else { + return type{}; + } + } + + using under = decltype(int_or_enum()); + +public: + se_t() noexcept = default; + + constexpr se_t(type value) noexcept : m_data(to_data(value)) {} + + constexpr type value() const noexcept { + if constexpr (Swap) { + return std::bit_cast(storage::swap(m_data)); + } else { + return std::bit_cast(m_data); + } + } + + constexpr type get() const noexcept { return value(); } + + constexpr se_t &operator=(type value) noexcept { + m_data = to_data(value); + return *this; + } + + constexpr operator type() const noexcept { return value(); } + +#ifdef _MSC_VER + explicit constexpr operator bool() const noexcept { + static_assert(!type{}); + static_assert(!std::is_floating_point_v); + return !!std::bit_cast(m_data); + } +#endif + + constexpr auto operator~() const noexcept { + if constexpr ((std::is_integral_v || std::is_enum_v) && + std::is_convertible_v) { + // Return se_t of integral type if possible. Promotion to int is omitted + // on purpose (a compromise). 
+ return std::bit_cast>( + static_cast(~std::bit_cast(m_data))); + } else { + return ~value(); + } + } + +private: + // Compatible bit pattern cast + template + static constexpr To right_arg_cast(const T2 &rhs) noexcept { + return std::bit_cast(static_cast>(rhs)); + } + + template + static constexpr To + right_arg_cast(const se_t &rhs) noexcept { + if constexpr ((std::is_integral_v || std::is_enum_v) && + std::is_convertible_v && sizeof(R) == sizeof(T)) { + // Optimization: allow to reuse bit pattern of any se_t with + // bit-compatible type + return std::bit_cast(rhs); + } else { + return std::bit_cast(static_cast>(rhs.value())); + } + } + +public: + template + requires requires(const T2 &t2) { +t2; } + constexpr bool operator==(const T2 &rhs) const noexcept { + using R = std::common_type_t; + + if constexpr ((std::is_integral_v || std::is_enum_v) && + (std::is_integral_v || std::is_enum_v)) { + if constexpr (sizeof(T) >= sizeof(R)) { + if constexpr (std::is_convertible_v && + std::is_convertible_v) { + return std::bit_cast(m_data) == right_arg_cast(rhs); + } else { + // Compare with strict type on the right side (possibly scoped enum) + return std::bit_cast(m_data) == right_arg_cast(rhs); + } + } + } + + // Keep outside of if constexpr to make sure it fails on invalid comparison + return value() == rhs; + } + +private: + template static constexpr bool check_args_for_bitwise_op() { + using R = std::common_type_t; + + if constexpr ((std::is_integral_v || std::is_enum_v) && + (std::is_integral_v || std::is_enum_v)) { + if constexpr (std::is_convertible_v && + std::is_convertible_v && sizeof(T) >= sizeof(R)) { + return true; + } + } + + return false; + } + +public: + template + constexpr auto operator&(const T2 &rhs) const noexcept { + if constexpr (check_args_for_bitwise_op()) { + return std::bit_cast>(static_cast( + std::bit_cast(m_data) & right_arg_cast(rhs))); + } else { + return value() & rhs; + } + } + + template + constexpr auto operator|(const T2 &rhs) const noexcept { + if constexpr (check_args_for_bitwise_op()) { + return std::bit_cast>(static_cast( + std::bit_cast(m_data) | right_arg_cast(rhs))); + } else { + return value() | rhs; + } + } + + template + constexpr auto operator^(const T2 &rhs) const noexcept { + if constexpr (check_args_for_bitwise_op()) { + return std::bit_cast>(static_cast( + std::bit_cast(m_data) ^ right_arg_cast(rhs))); + } else { + return value() ^ rhs; + } + } + + template constexpr se_t &operator+=(const T1 &rhs) { + *this = value() + rhs; + return *this; + } + + template constexpr se_t &operator-=(const T1 &rhs) { + *this = value() - rhs; + return *this; + } + + template constexpr se_t &operator*=(const T1 &rhs) { + *this = value() * rhs; + return *this; + } + + template constexpr se_t &operator/=(const T1 &rhs) { + *this = value() / rhs; + return *this; + } + + template constexpr se_t &operator%=(const T1 &rhs) { + *this = value() % rhs; + return *this; + } + + template constexpr se_t &operator&=(const T1 &rhs) { + if constexpr (std::is_integral_v) { + m_data = std::bit_cast(static_cast( + std::bit_cast(m_data) & right_arg_cast(rhs))); + return *this; + } + + *this = value() & rhs; + return *this; + } + + template constexpr se_t &operator|=(const T1 &rhs) { + if constexpr (std::is_integral_v) { + m_data = std::bit_cast(static_cast( + std::bit_cast(m_data) | right_arg_cast(rhs))); + return *this; + } + + *this = value() | rhs; + return *this; + } + + template constexpr se_t &operator^=(const T1 &rhs) { + if constexpr (std::is_integral_v) { + m_data = 
std::bit_cast(static_cast( + std::bit_cast(m_data) ^ right_arg_cast(rhs))); + return *this; + } + + *this = value() ^ rhs; + return *this; + } + + template constexpr se_t &operator<<=(const T1 &rhs) { + *this = value() << rhs; + return *this; + } + + template constexpr se_t &operator>>=(const T1 &rhs) { + *this = value() >> rhs; + return *this; + } + + constexpr se_t &operator++() { + T value = *this; + *this = ++value; + return *this; + } + + constexpr se_t &operator--() { + T value = *this; + *this = --value; + return *this; + } + + constexpr T operator++(int) { + T value = *this; + T result = value++; + *this = value; + return result; + } + + constexpr T operator--(int) { + T value = *this; + T result = value--; + *this = value; + return result; + } +}; + +// Specializations + +template +struct std::common_type, se_t> + : std::common_type {}; + +template +struct std::common_type, T2> + : std::common_type> {}; + +template +struct std::common_type> + : std::common_type, T2> {}; + +#define UNUSED(expr) \ + do { \ + (void)(expr); \ + } while (0) diff --git a/rx/include/rx/v128.hpp b/rx/include/rx/v128.hpp new file mode 100644 index 000000000..3e477a1cc --- /dev/null +++ b/rx/include/rx/v128.hpp @@ -0,0 +1,187 @@ +#pragma once // No BOM and only basic ASCII in this header, or a neko will die + +#include "types.hpp" + +namespace rx { +template +concept Vector128 = (sizeof(T) == 16) && (std::is_trivial_v); + +// 128-bit vector type +union alignas(16) v128 { + using enable_bitcopy = std::true_type; // FIXME: remove + + uchar _bytes[16]; + char _chars[16]; + + template + struct masked_array_t // array type accessed as (index ^ M) + { + T data[N]; + + T &operator[](usz index) { return data[index ^ M]; } + const T &operator[](usz index) const { return data[index ^ M]; } + }; + + template + using normal_array_t = + masked_array_t; + template + using reversed_array_t = + masked_array_t; + + normal_array_t _u64; + normal_array_t _s64; + reversed_array_t u64r; + reversed_array_t s64r; + + normal_array_t _u32; + normal_array_t _s32; + reversed_array_t u32r; + reversed_array_t s32r; + + normal_array_t _u16; + normal_array_t _s16; + reversed_array_t u16r; + reversed_array_t s16r; + + normal_array_t _u8; + normal_array_t _s8; + reversed_array_t u8r; + reversed_array_t s8r; + + normal_array_t _f; + normal_array_t _d; + reversed_array_t fr; + reversed_array_t dr; + + u128 _u; + s128 _s; + + v128() = default; + + constexpr v128(const v128 &) noexcept = default; + + template + constexpr v128(const T &rhs) noexcept : v128(std::bit_cast(rhs)) {} + + constexpr v128 &operator=(const v128 &) noexcept = default; + + template constexpr operator T() const noexcept { + return std::bit_cast(*this); + } + + static v128 from64(u64 _0, u64 _1 = 0) { + v128 ret; + ret._u64[0] = _0; + ret._u64[1] = _1; + return ret; + } + + static v128 from64r(u64 _1, u64 _0 = 0) { return from64(_0, _1); } + + static v128 from64p(u64 value) { + v128 ret; + ret._u64[0] = value; + ret._u64[1] = value; + return ret; + } + + static v128 from32(u32 _0, u32 _1 = 0, u32 _2 = 0, u32 _3 = 0) { + v128 ret; + ret._u32[0] = _0; + ret._u32[1] = _1; + ret._u32[2] = _2; + ret._u32[3] = _3; + return ret; + } + + static v128 from32r(u32 _3, u32 _2 = 0, u32 _1 = 0, u32 _0 = 0) { + return from32(_0, _1, _2, _3); + } + + static v128 from32p(u32 value) { + v128 ret; + ret._u32[0] = value; + ret._u32[1] = value; + ret._u32[2] = value; + ret._u32[3] = value; + return ret; + } + + static v128 fromf32p(f32 value) { + v128 ret; + ret._f[0] = value; + ret._f[1] = 
value; + ret._f[2] = value; + ret._f[3] = value; + return ret; + } + + static v128 from16p(u16 value) { + v128 ret; + ret._u16[0] = value; + ret._u16[1] = value; + ret._u16[2] = value; + ret._u16[3] = value; + ret._u16[4] = value; + ret._u16[5] = value; + ret._u16[6] = value; + ret._u16[7] = value; + return ret; + } + + static v128 from8p(u8 value) { + v128 ret; + std::memset(&ret, value, sizeof(ret)); + return ret; + } + + static v128 undef() { +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#elif _MSC_VER +#pragma warning(push) +#pragma warning(disable : 6001) +#endif + v128 ret; + return ret; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#elif _MSC_VER +#pragma warning(pop) +#endif + } + + // Unaligned load with optional index offset + static v128 loadu(const void *ptr, usz index = 0) { + v128 ret; + std::memcpy(&ret, static_cast(ptr) + index * sizeof(v128), + sizeof(v128)); + return ret; + } + + // Unaligned store with optional index offset + static void storeu(v128 value, void *ptr, usz index = 0) { + std::memcpy(static_cast(ptr) + index * sizeof(v128), &value, + sizeof(v128)); + } + + v128 operator|(const v128 &) const; + v128 operator&(const v128 &) const; + v128 operator^(const v128 &) const; + v128 operator~() const; + + bool operator==(const v128 &right) const; + + void clear() { *this = {}; } +}; +} // namespace rx + +template <> struct std::hash { + usz operator()(const rx::v128 &key) const { + return key._u64[0] ^ (key._u64[1] << 1); + } +}; diff --git a/rx/src/debug.cpp b/rx/src/debug.cpp index c8ed6855c..d8a2d9d32 100644 --- a/rx/src/debug.cpp +++ b/rx/src/debug.cpp @@ -5,12 +5,47 @@ #include #include -#ifdef __GNUC__ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#ifdef _WIN32 +#include +#else + +#ifdef __linux__ #include #include +#endif #include +#endif + bool rx::isDebuggerPresent() { +#ifdef _WIN32 + return ::IsDebuggerPresent(); +#elif defined(__APPLE__) || defined(__DragonFly__) || defined(__FreeBSD__) || \ + defined(__NetBSD__) || defined(__OpenBSD__) + int mib[] = { + CTL_KERN, + KERN_PROC, + KERN_PROC_PID, + getpid(), +#if defined(__NetBSD__) || defined(__OpenBSD__) + sizeof(struct kinfo_proc), + 1, +#endif + }; + u_int miblen = std::size(mib); + struct kinfo_proc info; + usz size = sizeof(info); + + if (sysctl(mib, miblen, &info, &size, NULL, 0)) { + return false; + } + + return info.KP_FLAGS & P_TRACED; +#elif defined(__linux__) std::ifstream in("/proc/self/status"); std::string line; while (std::getline(in, line)) { @@ -30,6 +65,7 @@ bool rx::isDebuggerPresent() { } return false; +#endif } void rx::waitForDebugger() { @@ -49,6 +85,7 @@ void rx::waitForDebugger() { } void rx::runDebugger() { +#ifdef __linux__ int pid = ::getpid(); char path[PATH_MAX]; ::readlink("/proc/self/exe", path, sizeof(path)); @@ -78,19 +115,20 @@ void rx::runDebugger() { argv.push_back(nullptr); execv(gdbPath, (char **)argv.data()); -} - -#else -bool rx::isDebuggerPresent() { return false; } -void rx::waitForDebugger() {} -void rx::runDebugger() {} #endif +} void rx::breakpoint() { #if __has_builtin(__builtin_debugtrap) __builtin_debugtrap(); -#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) +#elif defined(__GNUC__) +#if defined(__i386__) || defined(__x86_64__) __asm__ volatile("int3"); +#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) + __asm__ volatile("brk 0x42"); +#endif +#elif defined(_M_X64) + __debugbreak(); #endif } diff --git 
a/rx/src/mem.cpp b/rx/src/mem.cpp
index 69afd161b..c3e225fa7 100644
--- a/rx/src/mem.cpp
+++ b/rx/src/mem.cpp
@@ -1,4 +1,7 @@
 #include "mem.hpp"
+
+#ifdef __linux__
+
 #include
 #include
 #include
@@ -44,3 +47,4 @@ void rx::mem::printStats() {
   free(line);
   fclose(maps);
 }
+#endif
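// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the patch): how the checked helpers
// from rx/include/rx/types.hpp above are typically combined. narrow<> traps
// lossy integer conversions, size32() returns a u32 element count, and at32()
// performs a bounds-checked element access. Function and variable names here
// are hypothetical.
#include <algorithm>
#include <vector>

static u32 sum_first_elements(const std::vector<u64> &values, usz count) {
  // narrow<u32> calls fmt::raw_verify_error if the value does not fit.
  const u32 wanted = narrow<u32>(count);
  const u32 limit = std::min(wanted, ::size32(values));

  u32 sum = 0;
  for (u32 i = 0; i < limit; i++) {
    // at32 calls fmt::raw_range_error on an out-of-range index.
    sum += narrow<u32>(::at32(values, i));
  }

  return sum;
}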
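// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the patch): the _u16/_u32/_u64 string
// literals above turn short ASCII tags into integers with the same bit pattern
// a reinterpret_cast of the bytes would give, which pairs naturally with
// read_from_ptr<> for FourCC-style magic checks. The tag and function names
// are hypothetical.
constexpr u32 k_riff_tag = "RIFF"_u32;

static bool is_riff_header(const void *data) {
  // read_from_ptr<u32> copies 4 bytes out of the buffer (see types.hpp above).
  return read_from_ptr<u32>(static_cast<const uchar *>(data)) == k_riff_tag;
}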
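// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the patch): reading and writing a
// big-endian guest value through se_t<>. This assumes the template parameter
// order se_t<T, Swap>; the be_sketch alias is local to this example, and the
// real project may expose a different spelling (e.g. a be_t alias).
#include <bit>
#include <cstring>

template <typename T>
using be_sketch = se_t<T, std::endian::native == std::endian::little>;

static u32 read_guest_counter(const void *guest_mem) {
  be_sketch<u32> counter;
  std::memcpy(&counter, guest_mem, sizeof(counter)); // raw big-endian bytes
  return counter.value() + 1; // value() swaps to host order only when needed
}

static void write_guest_counter(void *guest_mem, u32 host_value) {
  const be_sketch<u32> out = host_value; // stored byte-swapped on LE hosts
  std::memcpy(guest_mem, &out, sizeof(out));
}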
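// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the patch): rx::v128 from
// rx/include/rx/v128.hpp models a 128-bit value with unaligned load/store
// helpers indexed in units of sizeof(v128). The buffer names are hypothetical;
// the lane-wise XOR is written via the _u64 arrays so the sketch does not
// depend on the out-of-line operator^ definition.
static void xor_blocks(unsigned char *dst, const unsigned char *key,
                       usz block_count) {
  for (usz i = 0; i < block_count; i++) {
    const rx::v128 a = rx::v128::loadu(dst, i);
    const rx::v128 k = rx::v128::loadu(key, i);

    rx::v128 r = rx::v128::undef();
    r._u64[0] = a._u64[0] ^ k._u64[0];
    r._u64[1] = a._u64[1] ^ k._u64[1];

    rx::v128::storeu(r, dst, i);
  }
}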