diff --git a/.ci/build-linux-aarch64.sh b/.ci/build-linux-aarch64.sh
index 54b2e49d3..0ee670ea3 100755
--- a/.ci/build-linux-aarch64.sh
+++ b/.ci/build-linux-aarch64.sh
@@ -1,7 +1,5 @@
 #!/bin/sh -ex
 
-cd rpcs3/ || exit 1
-
 git config --global --add safe.directory '*'
 
 # Pull all the submodules except llvm and opencv
@@ -9,33 +7,30 @@ git config --global --add safe.directory '*'
 git submodule -q update --init $(awk '/path/ && !/llvm/ && !/opencv/ { print $3 }' .gitmodules)
 
 if [ "$COMPILER" = "gcc" ]; then
-    # These are set in the dockerfile
-    export CC="${GCC_BINARY}"
-    export CXX="${GXX_BINARY}"
-    export LINKER=gold
+    export CC=gcc-14
+    export CXX=g++-14
 else
-    export CC="${CLANG_BINARY}"
-    export CXX="${CLANGXX_BINARY}"
-    export LINKER="${LLD_BINARY}"
+    export CC=clang
+    export CXX=clang++
+    export CFLAGS="$CFLAGS -fuse-ld=lld"
 fi
 
-export CFLAGS="$CFLAGS -fuse-ld=${LINKER}"
-export CXXFLAGS="$CXXFLAGS -fuse-ld=${LINKER}"
 cmake -B build \
     -DCMAKE_INSTALL_PREFIX=/usr \
-    -DUSE_NATIVE_INSTRUCTIONS=OFF \
-    -DUSE_PRECOMPILED_HEADERS=OFF \
     -DCMAKE_C_FLAGS="$CFLAGS" \
     -DCMAKE_CXX_FLAGS="$CFLAGS" \
+    -DUSE_NATIVE_INSTRUCTIONS=OFF \
+    -DUSE_PRECOMPILED_HEADERS=OFF \
     -DUSE_SYSTEM_CURL=ON \
-    -DUSE_SDL=ON \
-    -DUSE_SYSTEM_SDL=ON \
+    -DUSE_SDL=OFF \
+    -DUSE_SYSTEM_FFMPEG=OFF \
+    -DUSE_SYSTEM_CURL=OFF \
+    -DUSE_SYSTEM_OPENAL=OFF \
     -DUSE_SYSTEM_FFMPEG=OFF \
-    -DUSE_SYSTEM_OPENCV=ON \
     -DUSE_DISCORD_RPC=ON \
     -DOpenGL_GL_PREFERENCE=LEGACY \
-    -DLLVM_DIR=/opt/llvm/lib/cmake/llvm \
     -DSTATIC_LINK_LLVM=ON \
+    -DBUILD_LLVM=on \
     -DWITH_RPCSX=off \
     -DWITH_RPCS3=on \
     -DWITH_RPCS3_QT_UI=on \
diff --git a/.ci/build-linux.sh b/.ci/build-linux.sh
index 15c76cac7..54b5adc83 100755
--- a/.ci/build-linux.sh
+++ b/.ci/build-linux.sh
@@ -1,7 +1,5 @@
 #!/bin/sh -ex
 
-cd rpcs3/ || exit 1
-
 git config --global --add safe.directory '*'
 
 # Pull all the submodules except llvm and opencv
@@ -11,40 +9,29 @@ git submodule -q update --init $(awk '/path/ && !/llvm/ && !/opencv/ { print $3
 
 if [ "$COMPILER" = "gcc" ]; then
     # These are set in the dockerfile
-    export CC="${GCC_BINARY}"
-    export CXX="${GXX_BINARY}"
-    export LINKER=gold
-    # We need to set the following variables for LTO to link properly
-    export AR=/usr/bin/gcc-ar-"$GCCVER"
-    export RANLIB=/usr/bin/gcc-ranlib-"$GCCVER"
-    export CFLAGS="-fuse-linker-plugin"
+    export CC=gcc-14
+    export CXX=g++-14
 else
-    export CC="${CLANG_BINARY}"
-    export CXX="${CLANGXX_BINARY}"
-    export LINKER=lld
-    export AR=/usr/bin/llvm-ar-"$LLVMVER"
-    export RANLIB=/usr/bin/llvm-ranlib-"$LLVMVER"
+    export CC=clang
+    export CXX=clang++
+    export LD=clang
+    export CFLAGS="$CFLAGS -fuse-ld=lld"
 fi
 
-export CFLAGS="$CFLAGS -fuse-ld=${LINKER}"
-
 cmake -B build \
     -DCMAKE_INSTALL_PREFIX=/usr \
-    -DUSE_NATIVE_INSTRUCTIONS=OFF \
-    -DUSE_PRECOMPILED_HEADERS=OFF \
     -DCMAKE_C_FLAGS="$CFLAGS" \
     -DCMAKE_CXX_FLAGS="$CFLAGS" \
-    -DCMAKE_AR="$AR" \
-    -DCMAKE_RANLIB="$RANLIB" \
-    -DUSE_SYSTEM_CURL=ON \
-    -DUSE_SDL=ON \
-    -DUSE_SYSTEM_SDL=ON \
+    -DUSE_NATIVE_INSTRUCTIONS=OFF \
+    -DUSE_PRECOMPILED_HEADERS=OFF \
+    -DUSE_SDL=OFF \
+    -DUSE_SYSTEM_CURL=OFF \
+    -DUSE_SYSTEM_OPENAL=OFF \
+    -DUSE_SYSTEM_FFMPEG=OFF \
-    -DUSE_SYSTEM_OPENCV=ON \
     -DUSE_DISCORD_RPC=ON \
     -DOpenGL_GL_PREFERENCE=LEGACY \
-    -DLLVM_DIR=/opt/llvm/lib/cmake/llvm \
     -DSTATIC_LINK_LLVM=ON \
+    -DBUILD_LLVM=on \
     -DWITH_RPCSX=off \
     -DWITH_RPCS3=on \
     -DWITH_RPCS3_QT_UI=on \
diff --git a/.ci/deploy-linux.sh b/.ci/deploy-linux.sh
index e95c25a97..48a94c0d4 100755
--- a/.ci/deploy-linux.sh
+++ b/.ci/deploy-linux.sh
@@ -7,10 +7,10 @@ CPU_ARCH="${1:-x86_64}"
 if [ "$DEPLOY_APPIMAGE" = "true" ]; then
     DESTDIR=AppDir ninja install
 
-    curl -fsSLo /usr/bin/linuxdeploy "https://github.com/linuxdeploy/linuxdeploy/releases/download/continuous/linuxdeploy-$CPU_ARCH.AppImage"
-    chmod +x /usr/bin/linuxdeploy
-    curl -fsSLo /usr/bin/linuxdeploy-plugin-qt "https://github.com/linuxdeploy/linuxdeploy-plugin-qt/releases/download/continuous/linuxdeploy-plugin-qt-$CPU_ARCH.AppImage"
-    chmod +x /usr/bin/linuxdeploy-plugin-qt
+    sudo curl -fsSLo /usr/bin/linuxdeploy "https://github.com/linuxdeploy/linuxdeploy/releases/download/continuous/linuxdeploy-$CPU_ARCH.AppImage"
+    sudo chmod a+x /usr/bin/linuxdeploy
+    sudo curl -fsSLo /usr/bin/linuxdeploy-plugin-qt "https://github.com/linuxdeploy/linuxdeploy-plugin-qt/releases/download/continuous/linuxdeploy-plugin-qt-$CPU_ARCH.AppImage"
+    sudo chmod a+x /usr/bin/linuxdeploy-plugin-qt
 
     curl -fsSLo linuxdeploy-plugin-checkrt.sh https://github.com/darealshinji/linuxdeploy-plugin-checkrt/releases/download/continuous/linuxdeploy-plugin-checkrt.sh
     chmod +x ./linuxdeploy-plugin-checkrt.sh
diff --git a/.clangd b/.clangd
index f7e545dc3..b9be9e0aa 100644
--- a/.clangd
+++ b/.clangd
@@ -1,2 +1,3 @@
 CompileFlags:
   Add: [-Wall, -Wextra, -Wno-missing-designated-field-initializers]
+  Remove: [ -fno-lifetime-dse ]
diff --git a/.github/workflows/rpcs3.yml b/.github/workflows/rpcs3.yml
index 6ca24bc05..d09e10357 100644
--- a/.github/workflows/rpcs3.yml
+++ b/.github/workflows/rpcs3.yml
@@ -29,16 +29,13 @@ jobs:
       matrix:
         include:
           - os: ubuntu-24.04
-            docker_img: "rpcs3/rpcs3-ci-jammy:1.4"
-            build_sh: "rpcs3/.ci/build-linux.sh"
+            build_sh: ".ci/build-linux.sh"
             compiler: clang
           - os: ubuntu-24.04
-            docker_img: "rpcs3/rpcs3-ci-jammy:1.4"
-            build_sh: "rpcs3/.ci/build-linux.sh"
+            build_sh: ".ci/build-linux.sh"
             compiler: gcc
           - os: ubuntu-24.04-arm
-            docker_img: "rpcs3/rpcs3-ci-jammy-aarch64:1.4"
-            build_sh: "rpcs3/.ci/build-linux-aarch64.sh"
+            build_sh: ".ci/build-linux-aarch64.sh"
             compiler: clang
     name: RPCS3 Qt UI (Legacy) for Linux ${{ matrix.os }} ${{ matrix.compiler }}
     runs-on: ${{ matrix.os }}
@@ -46,9 +43,8 @@ jobs:
       CCACHE_DIR: ${{ github.workspace }}/ccache
       CI_HAS_ARTIFACTS: true
      DEPLOY_APPIMAGE: true
-      APPDIR: "/rpcs3/build/appdir"
-      ARTDIR: "/root/artifacts"
-      RELEASE_MESSAGE: "/rpcs3/GitHubReleaseMessage.txt"
+      APPDIR: "./appdir"
+      ARTDIR: "./artifacts"
       COMPILER: ${{ matrix.compiler }}
       RX_VERSION: "Unknown"
       RX_SHA: "Unknown"
@@ -66,16 +62,26 @@ jobs:
           restore-keys: |
             ${{ runner.os }}-ccache-${{ matrix.compiler }}-${{ runner.arch }}-
 
-      - name: Docker setup and build
+      - name: Setup dependencies
         run: |
-          docker pull --quiet ${{ matrix.docker_img }}
-          docker run \
-            -v $PWD:/rpcs3 \
-            --env-file .ci/docker.env \
-            -v ${{ env.CCACHE_DIR }}:/root/.ccache \
-            -v ${{ github.workspace }}/artifacts:/root/artifacts \
-            ${{ matrix.docker_img }} \
-            ${{ matrix.build_sh }}
+          echo "Types: deb" | sudo tee -a /etc/apt/sources.list.d/ubuntu.sources
+          echo "URIs: ${{ matrix.os == 'ubuntu-24.04-arm' && 'http://ports.ubuntu.com/ubuntu-ports' || 'http://azure.archive.ubuntu.com/ubuntu/' }}" | sudo tee -a /etc/apt/sources.list.d/ubuntu.sources
+          echo "Suites: plucky plucky-updates plucky-security" | sudo tee -a /etc/apt/sources.list.d/ubuntu.sources
+          echo "Components: main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/ubuntu.sources
+          echo "Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg" | sudo tee -a /etc/apt/sources.list.d/ubuntu.sources
+
+          sudo apt update
+          sudo apt install -y cmake build-essential libunwind-dev \
+            libvulkan-dev
vulkan-validationlayers \ + libsox-dev ninja-build libasound2-dev libglfw3-dev nasm libudev-dev \ + libpulse-dev libopenal-dev libglew-dev zlib1g-dev libedit-dev \ + libevdev-dev libjack-dev libsndio-dev libglvnd-dev \ + qt6-base-dev qt6-svg-dev qt6-base-private-dev qt6-multimedia-dev \ + clang lld gcc-14 g++-14 \ + + - name: Build + run: | + ${{ matrix.build_sh }} RX_VERSION=`cat .rx.version | awk -F'-' '{print $1}'` RX_SHA=`cat .rx.version | awk -F'-' '{print $5}'` diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt index 01f2b8938..28c39d893 100644 --- a/3rdparty/CMakeLists.txt +++ b/3rdparty/CMakeLists.txt @@ -104,6 +104,9 @@ add_subdirectory(zlib EXCLUDE_FROM_ALL) # ZSTD add_subdirectory(zstd EXCLUDE_FROM_ALL) +# workaround for LLVM +add_library(zstd::libzstd_static ALIAS libzstd_static) + # 7zip sdk add_subdirectory(7zip EXCLUDE_FROM_ALL) @@ -342,10 +345,6 @@ if(NOT MSVC AND NOT ANDROID AND NOT WITHOUT_OPENGLEW) target_link_libraries(3rdparty_glew INTERFACE GLEW::GLEW) endif() - -# LLVM -add_subdirectory(llvm EXCLUDE_FROM_ALL) - # WOLFSSL add_subdirectory(wolfssl EXCLUDE_FROM_ALL) diff --git a/3rdparty/llvm/CMakeLists.txt b/3rdparty/llvm/CMakeLists.txt index b2880f7b9..0e11aac59 100644 --- a/3rdparty/llvm/CMakeLists.txt +++ b/3rdparty/llvm/CMakeLists.txt @@ -1,6 +1,6 @@ if(WITH_LLVM) - set(USE_LLVM_VERSION 19.1.7) + set(USE_LLVM_VERSION 20.1.3) if (NOT MSVC) check_cxx_compiler_flag("-msse -msse2 -mcx16" COMPILER_X86) check_cxx_compiler_flag("-march=armv8-a+lse" COMPILER_ARM) @@ -42,7 +42,18 @@ if(WITH_LLVM) set(LLVM_DOWNLOAD_BINARY "") - if ((WIN32 AND MSVC) OR (LINUX AND NOT ANDROID)) + if (ANDROID) + string(APPEND LLVM_DOWNLOAD_BINARY llvm-android-) + + if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + string(APPEND LLVM_DOWNLOAD_BINARY arm64-v8a) + else() + string(APPEND LLVM_DOWNLOAD_BINARY x64) + endif() + + string(APPEND LLVM_DOWNLOAD_BINARY .7z) + elseif ((WIN32 AND MSVC) OR LINUX) + string(APPEND LLVM_DOWNLOAD_BINARY llvm-) if (WIN32) string(APPEND LLVM_DOWNLOAD_BINARY windows-) else() @@ -62,6 +73,8 @@ if(WITH_LLVM) string(APPEND LLVM_DOWNLOAD_BINARY MD) endif() endif() + + string(APPEND LLVM_DOWNLOAD_BINARY .7z) endif() if(CMAKE_SYSTEM_NAME STREQUAL "Linux") @@ -79,55 +92,76 @@ if(WITH_LLVM) # LLVM needs to be built out-of-tree add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/llvm/llvm ${CMAKE_CURRENT_BINARY_DIR}/llvm_build EXCLUDE_FROM_ALL) set(LLVM_DIR "${CMAKE_CURRENT_BINARY_DIR}/llvm_build/lib/cmake/llvm/") + set(MLIR_DIR "${CMAKE_CURRENT_BINARY_DIR}/llvm_build/lib/cmake/mlir/") else() set(LLVM_DOWNLOAD_LINK https://github.com/RPCSX/llvm-build/releases/download/${USE_LLVM_VERSION}) - if (NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.7z" AND + if (NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}" AND NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.unpacked") message(STATUS "Downloading LLVM") file(DOWNLOAD ${LLVM_DOWNLOAD_LINK}/${LLVM_DOWNLOAD_BINARY} - "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.7z.tmp" SHOW_PROGRESS + "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.tmp" SHOW_PROGRESS STATUS FILE_STATUS) list(GET FILE_STATUS 0 STATUS_CODE) if (NOT STATUS_CODE EQUAL 0) - file(REMOVE "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.7z.tmp") + file(REMOVE "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.tmp") message(FATAL_ERROR "Failed to download LLVM") 
endif() file(RENAME - "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.7z.tmp" - "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.7z" + "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.tmp" + "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}" ) endif() if(NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.unpacked") - file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}") - execute_process(COMMAND ${CMAKE_COMMAND} -E tar xzf "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.7z" - WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}" RESULT_VARIABLE STATUS_CODE) + file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.dir") + execute_process(COMMAND ${CMAKE_COMMAND} -E tar xzf "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}" + WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.dir" RESULT_VARIABLE STATUS_CODE) if (NOT STATUS_CODE EQUAL 0) message(FATAL_ERROR "Failed to unpack LLVM") endif() file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.unpacked") - file(REMOVE "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.7z") + file(REMOVE "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}") endif() - file(GLOB LLVM_ROOT_DIR_LIST LIST_DIRECTORIES true "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}/*") + file(GLOB LLVM_ROOT_DIR_LIST LIST_DIRECTORIES true "${CMAKE_CURRENT_BINARY_DIR}/${USE_LLVM_VERSION}-${LLVM_DOWNLOAD_BINARY}.dir/*") list(GET LLVM_ROOT_DIR_LIST 0 LLVM_ROOT_DIR) - set(LLVM_DIR "${LLVM_ROOT_DIR}/lib/cmake/llvm") + set(LLVM_DIR "${LLVM_ROOT_DIR}/lib/cmake/llvm") + set(MLIR_DIR "${LLVM_ROOT_DIR}/lib/cmake/mlir") if (NOT EXISTS "${LLVM_DIR}") message(FATAL_ERROR "Failed to locate LLVM: ${LLVM_ROOT_DIR}") endif() + + if (NOT EXISTS "${MLIR_DIR}") + message(FATAL_ERROR "Failed to locate MLIR: ${LLVM_ROOT_DIR}") + endif() + + if (NOT ANDROID) + set(Clang_DIR "${LLVM_ROOT_DIR}/lib/cmake/clang") + if (NOT EXISTS "${Clang_DIR}") + message(FATAL_ERROR "Failed to locate Clang: ${LLVM_ROOT_DIR}") + endif() + endif() endif() set(STATIC_LINK_LLVM ON CACHE BOOL "Link against LLVM statically. This will get set to ON if you build LLVM from the submodule." FORCE) find_package(LLVM ${USE_LLVM_VERSION} CONFIG) + find_package(MLIR ${USE_LLVM_VERSION} CONFIG) - if(NOT LLVM_FOUND) + if(NOT LLVM_FOUND OR NOT MLIR_FOUND) message(FATAL_ERROR "Couldn't build LLVM from the submodule. You might need to run `git submodule update --init`") endif() + + if (NOT ANDROID) + find_package(Clang ${USE_LLVM_VERSION} CONFIG) + if(NOT Clang_FOUND) + message(FATAL_ERROR "Couldn't build Clang from the submodule. 
You might need to run `git submodule update --init`") + endif() + endif() else() message(STATUS "Using prebuilt or system LLVM") @@ -136,15 +170,36 @@ if(WITH_LLVM) set(LLVM_DIR ${CMAKE_SOURCE_DIR}/${LLVM_DIR}) endif() + if (MLIR_DIR AND NOT IS_ABSOLUTE "${MLIR_DIR}") + set(MLIR_DIR ${CMAKE_SOURCE_DIR}/${MLIR_DIR}) + endif() + + if (Clang_DIR AND NOT IS_ABSOLUTE "${Clang_DIR}") + set(Clang_DIR ${CMAKE_SOURCE_DIR}/${Clang_DIR}) + endif() + find_package(LLVM CONFIG) + find_package(MLIR CONFIG) if (NOT LLVM_FOUND) message(FATAL_ERROR "Can't find LLVM libraries from the CMAKE_PREFIX_PATH path or LLVM_DIR. \ Enable BUILD_LLVM option to build LLVM from included as a git submodule.") endif() if (LLVM_VERSION VERSION_LESS 18) - message(FATAL_ERROR "Found LLVM version ${LLVM_VERSION}. Required version 18 or above. \ - Enable BUILD_LLVM option to build LLVM from included as a git submodule.") + message(FATAL_ERROR "Found LLVM version ${LLVM_VERSION}. Required version 18 or above.") + endif() + + if (NOT MLIR_FOUND) + message(FATAL_ERROR "Can't find MLIR libraries from the CMAKE_PREFIX_PATH path or MLIR_DIR") + endif() + + + if (NOT ANDROID) + find_package(Clang CONFIG) + + if (NOT Clang_FOUND) + message(FATAL_ERROR "Can't find Clang from the CMAKE_PREFIX_PATH path or Clang_DIR.") + endif() endif() endif() @@ -164,9 +219,9 @@ if(WITH_LLVM) endif() # For Linux even if BUILD_LLVM is disabled (precompiled llvm used) - if(CMAKE_SYSTEM_NAME STREQUAL "Linux") - list (APPEND LLVM_ADDITIONAL_LIBS PerfJITEvents) - endif() + # if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + # list (APPEND LLVM_ADDITIONAL_LIBS PerfJITEvents) + # endif() llvm_map_components_to_libnames(LLVM_LIBS ${LLVM_TARGETS_TO_BUILD} @@ -176,17 +231,42 @@ if(WITH_LLVM) MCJIT Passes ) + + set(MLIR_LIBS MLIRIR MLIRInferTypeOpInterface MLIRFuncDialect MLIRSCFDialect MLIRSCFToControlFlow MLIRAffineAnalysis MLIRAsyncToLLVM) else() - set(LLVM_LIBS LLVM) + set(LLVM_LIBS LLVM MLIR) endif() + list(APPEND CMAKE_MODULE_PATH "${MLIR_CMAKE_DIR}") + list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}") + + include(TableGen) + include(AddLLVM) + include(AddMLIR) + + if (NOT ANDROID) + list(APPEND CMAKE_MODULE_PATH "${CLANG_CMAKE_DIR}") + include(AddClang) + + get_target_property(CLANG_EXECUTABLE clang LOCATION) + endif() + # include(HandleLLVMOptions) + add_library(3rdparty_llvm INTERFACE) target_link_libraries(3rdparty_llvm INTERFACE ${LLVM_LIBS}) target_include_directories(3rdparty_llvm INTERFACE ${LLVM_INCLUDE_DIRS}) separate_arguments(LLVM_DEFINITIONS_LIST NATIVE_COMMAND ${LLVM_DEFINITIONS}) target_compile_definitions(3rdparty_llvm INTERFACE ${LLVM_DEFINITIONS_LIST} LLVM_AVAILABLE) + add_library(3rdparty_mlir INTERFACE) + target_link_libraries(3rdparty_mlir INTERFACE 3rdparty_llvm ${MLIR_LIBS}) + target_include_directories(3rdparty_mlir INTERFACE ${MLIR_INCLUDE_DIRS}) + separate_arguments(MLIR_DEFINITIONS_LIST NATIVE_COMMAND ${MLIR_DEFINITIONS}) + target_compile_definitions(3rdparty_mlir INTERFACE ${MLIR_DEFINITIONS_LIST} MLIR_AVAILABLE) + add_library(3rdparty::llvm ALIAS 3rdparty_llvm) + add_library(3rdparty::mlir ALIAS 3rdparty_mlir) else() add_library(3rdparty::llvm ALIAS 3rdparty_dummy_lib) + add_library(3rdparty::mlir ALIAS 3rdparty_dummy_lib) endif() diff --git a/CMakeLists.txt b/CMakeLists.txt index 600486065..7d4fa1202 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -121,6 +121,8 @@ endif() add_subdirectory(3rdparty EXCLUDE_FROM_ALL) add_subdirectory(rx EXCLUDE_FROM_ALL) +include(3rdparty/llvm/CMakeLists.txt) + if (NOT RX_TAG) set(RX_TAG 0) endif() @@ 
-183,9 +185,10 @@ if (WITH_RPCSX) add_subdirectory(tools) add_subdirectory(orbis-kernel) - add_subdirectory(rpcsx) endif() +add_subdirectory(rpcsx) + if (WITH_RPCS3) include(ConfigureCompiler) include(CheckFunctionExists) diff --git a/android/CMakeLists.txt b/android/CMakeLists.txt index f062e405f..f8e93167c 100644 --- a/android/CMakeLists.txt +++ b/android/CMakeLists.txt @@ -5,7 +5,7 @@ set(CMAKE_CXX_STANDARD 20) set(CMAKE_POSITION_INDEPENDENT_CODE on) set(FFMPEG_VERSION 5.1) -set(LLVM_VERSION 19.1) +set(LLVM_VERSION 20.1.2) option(USE_ARCH "Specify arch to build" "") @@ -88,26 +88,6 @@ target_link_libraries(3rdparty_ffmpeg INTERFACE add_dependencies(3rdparty_ffmpeg ffmpeg-unpack) - -if(NOT EXISTS ${CMAKE_BINARY_DIR}/llvm-${LLVM_VERSION}.tar.gz) - message(STATUS "Downloading llvm-${LLVM_VERSION}") - file(DOWNLOAD - https://github.com/RPCS3-Android/llvm-android/releases/download/${LLVM_VERSION}/llvm-${RPCS3_DOWNLOAD_ARCH}-Android.tar.gz - ${CMAKE_BINARY_DIR}/llvm-${LLVM_VERSION}.tar.gz - SHOW_PROGRESS - ) -endif() - -set(LLVM_DIR ${CMAKE_BINARY_DIR}/llvm-${LLVM_VERSION}.7-Android/lib/cmake/llvm) - -if (NOT EXISTS ${LLVM_DIR}) - message(STATUS "Unpacking llvm-${LLVM_VERSION}") - execute_process( - COMMAND ${CMAKE_COMMAND} -E tar xzf ${CMAKE_BINARY_DIR}/llvm-${LLVM_VERSION}.tar.gz - WORKING_DIRECTORY ${CMAKE_BINARY_DIR} - ) -endif() - set(WITH_RPCSX off) set(WITH_RPCS3 on) set(USE_SYSTEM_LIBUSB off) @@ -117,10 +97,10 @@ set(USE_SYSTEM_OPENCV off) set(USE_SYSTEM_FFMPEG off) set(USE_FAUDIO off) set(USE_SDL2 off) -set(BUILD_LLVM off) +set(BUILD_LLVM on) set(STATIC_LINK_LLVM on) -set(DISABLE_LTO on) -set(USE_LTO off) +set(DISABLE_LTO off) +set(USE_LTO on) set(USE_OPENSL off) set(ASMJIT_NO_SHM_OPEN on) set(USE_SYSTEM_ZLIB on) diff --git a/ps3fw/cellGcmSys.cpp b/ps3fw/cellGcmSys.cpp index c8ecfa215..31b914a76 100644 --- a/ps3fw/cellGcmSys.cpp +++ b/ps3fw/cellGcmSys.cpp @@ -152,7 +152,7 @@ vm::ptr cellGcmGetReportDataAddressLocation(u32 index, u32 lo cellGcmSys.error("cellGcmGetReportDataAddressLocation: Wrong local index (%d)", index); } - return vm::cast(rsx::get_current_renderer()->label_addr + ::offset32(&RsxReports::report) + index * 0x10); + return vm::cast(rsx::get_current_renderer()->label_addr + OFFSET_OF(RsxReports, report) + index * 0x10); } u64 cellGcmGetTimeStamp(u32 index) @@ -164,7 +164,7 @@ u64 cellGcmGetTimeStamp(u32 index) cellGcmSys.error("cellGcmGetTimeStamp: Wrong local index (%d)", index); } - const u32 address = rsx::get_current_renderer()->label_addr + ::offset32(&RsxReports::report) + index * 0x10; + const u32 address = rsx::get_current_renderer()->label_addr + OFFSET_OF(RsxReports, report) + index * 0x10; return *vm::get_super_ptr(address); } @@ -193,7 +193,7 @@ u32 cellGcmGetNotifyDataAddress(u32 index) */ vm::ptr _cellGcmFunc12() { - return vm::ptr::make(rsx::get_current_renderer()->label_addr + ::offset32(&RsxReports::report)); // TODO + return vm::ptr::make(rsx::get_current_renderer()->label_addr + OFFSET_OF(RsxReports, report)); // TODO } u32 cellGcmGetReport(u32 type, u32 index) @@ -223,7 +223,7 @@ u32 cellGcmGetReportDataAddress(u32 index) cellGcmSys.error("cellGcmGetReportDataAddress: Wrong local index (%d)", index); } - return rsx::get_current_renderer()->label_addr + ::offset32(&RsxReports::report) + index * 0x10; + return rsx::get_current_renderer()->label_addr + OFFSET_OF(RsxReports, report) + index * 0x10; } u32 cellGcmGetReportDataLocation(u32 index, u32 location) diff --git a/ps3fw/cellGem.cpp b/ps3fw/cellGem.cpp index 2da34a643..15cfbd9cf 100644 --- 
a/ps3fw/cellGem.cpp +++ b/ps3fw/cellGem.cpp @@ -574,20 +574,7 @@ public: for (gem_controller& c : controllers) { ar(c.status, c.ext_status, c.ext_id, c.port, c.enabled_magnetometer, c.calibrated_magnetometer, c.enabled_filtering, c.enabled_tracking, c.enabled_LED, c.hue_set, c.rumble); - - // We need to add padding because we used bitwise serialization in version 1 - if (version < 2) - { - ar.add_padding(&gem_controller::rumble, &gem_controller::sphere_rgb); - } - ar(c.sphere_rgb, c.hue, c.distance_mm, c.radius, c.radius_valid, c.is_calibrating); - - if (version < 2) - { - ar.add_padding(&gem_controller::is_calibrating, &gem_controller::calibration_start_us); - } - ar(c.calibration_start_us); if (ar.is_writing() || version >= 2) diff --git a/ps3fw/cellSaveData.cpp b/ps3fw/cellSaveData.cpp index 1937f84bc..a2c5b5bda 100644 --- a/ps3fw/cellSaveData.cpp +++ b/ps3fw/cellSaveData.cpp @@ -1022,7 +1022,7 @@ savedata_op(ppu_thread& ppu, u32 operation, u32 version, vm::cptr dirName, listSet->focusPosition = CELL_SAVEDATA_FOCUSPOS_LISTHEAD; std::memset(result.get_ptr(), 0, - ::offset32(&CellSaveDataCBResult::userdata)); + OFFSET_OF(CellSaveDataCBResult, userdata)); // List Callback funcList(ppu, result, listGet, listSet); @@ -1313,7 +1313,7 @@ savedata_op(ppu_thread& ppu, u32 operation, u32 version, vm::cptr dirName, } std::memset(result.get_ptr(), 0, - ::offset32(&CellSaveDataCBResult::userdata)); + OFFSET_OF(CellSaveDataCBResult, userdata)); if (!funcDone) { @@ -1436,8 +1436,7 @@ savedata_op(ppu_thread& ppu, u32 operation, u32 version, vm::cptr dirName, { lv2_sleep(ppu, 250); - std::memset(result.get_ptr(), 0, - ::offset32(&CellSaveDataCBResult::userdata)); + std::memset(result.get_ptr(), 0, OFFSET_OF(CellSaveDataCBResult, userdata)); // Fixed Callback funcFixed(ppu, result, listGet, fixedSet); @@ -1780,7 +1779,7 @@ savedata_op(ppu_thread& ppu, u32 operation, u32 version, vm::cptr dirName, !save_entry.isNew ? 
::narrow((size_bytes / 1024) + statGet->sysSizeKB) : 0; std::memset(result.get_ptr(), 0, - ::offset32(&CellSaveDataCBResult::userdata)); + OFFSET_OF(CellSaveDataCBResult, userdata)); // Stat Callback funcStat(ppu, result, statGet, statSet); @@ -2036,7 +2035,7 @@ savedata_op(ppu_thread& ppu, u32 operation, u32 version, vm::cptr dirName, std::memset(fileSet.get_ptr(), 0, fileSet.size()); std::memset(fileGet->reserved, 0, sizeof(fileGet->reserved)); std::memset(result.get_ptr(), 0, - ::offset32(&CellSaveDataCBResult::userdata)); + OFFSET_OF(CellSaveDataCBResult, userdata)); funcFile(ppu, result, fileGet, fileSet); ppu.state += cpu_flag::wait; diff --git a/ps3fw/cellSpursSpu.cpp b/ps3fw/cellSpursSpu.cpp index 3bfa3d749..d68cf93f3 100644 --- a/ps3fw/cellSpursSpu.cpp +++ b/ps3fw/cellSpursSpu.cpp @@ -1215,7 +1215,7 @@ void spursSysServiceTraceUpdate(spu_thread& spu, SpursKernelContext* ctxt, u32 a if (((sysSrvMsgUpdateTrace & (1 << ctxt->spuNum)) != 0) || (arg3 != 0)) { // vm::reservation_acquire(ctxt->spurs.ptr(&CellSpurs::traceBuffer).addr()); - auto spurs = spu._ptr(0x80 - offset32(&CellSpurs::traceBuffer)); + auto spurs = spu._ptr(0x80 - OFFSET_OF(CellSpurs, traceBuffer)); if (ctxt->traceMsgCount != 0xffu || spurs->traceBuffer.addr() == 0u) { @@ -1238,7 +1238,7 @@ void spursSysServiceTraceUpdate(spu_thread& spu, SpursKernelContext* ctxt, u32 a if (notify) { - auto spurs = spu._ptr(0x2D80 - offset32(&CellSpurs::wklState1)); + auto spurs = spu._ptr(0x2D80 - OFFSET_OF(CellSpurs, wklState1)); sys_spu_thread_send_event(spu, spurs->spuPort, 2, 0); } } @@ -1427,12 +1427,12 @@ s32 spursTasksetProcessRequest(spu_thread& spu, s32 request, u32* taskId, u32* i // vm::reservation_op(vm::cast(ctxt->taskset.addr()), 128, [&]() { auto taskset = ctxt->taskset; - v128 waiting = vm::_ref(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::waiting)); - v128 running = vm::_ref(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::running)); - v128 ready = vm::_ref(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::ready)); - v128 pready = vm::_ref(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::pending_ready)); - v128 enabled = vm::_ref(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::enabled)); - v128 signalled = vm::_ref(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::signalled)); + v128 waiting = vm::_ref(ctxt->taskset.addr() + OFFSET_OF(CellSpursTaskset, waiting)); + v128 running = vm::_ref(ctxt->taskset.addr() + OFFSET_OF(CellSpursTaskset, running)); + v128 ready = vm::_ref(ctxt->taskset.addr() + OFFSET_OF(CellSpursTaskset, ready)); + v128 pready = vm::_ref(ctxt->taskset.addr() + OFFSET_OF(CellSpursTaskset, pending_ready)); + v128 enabled = vm::_ref(ctxt->taskset.addr() + OFFSET_OF(CellSpursTaskset, enabled)); + v128 signalled = vm::_ref(ctxt->taskset.addr() + OFFSET_OF(CellSpursTaskset, signalled)); // Verify taskset state is valid if ((waiting & running) != v128{} || (ready & pready) != v128{} || @@ -1599,12 +1599,12 @@ s32 spursTasksetProcessRequest(spu_thread& spu, s32 request, u32* taskId, u32* i spursHalt(spu); } - vm::_ref(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::waiting)) = waiting; - vm::_ref(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::running)) = running; - vm::_ref(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::ready)) = ready; - vm::_ref(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::pending_ready)) = v128{}; - vm::_ref(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::enabled)) = enabled; - vm::_ref(ctxt->taskset.addr() + 
::offset32(&CellSpursTaskset::signalled)) = signalled; + vm::_ref(ctxt->taskset.addr() + OFFSET_OF(CellSpursTaskset, waiting)) = waiting; + vm::_ref(ctxt->taskset.addr() + OFFSET_OF(CellSpursTaskset, running)) = running; + vm::_ref(ctxt->taskset.addr() + OFFSET_OF(CellSpursTaskset, ready)) = ready; + vm::_ref(ctxt->taskset.addr() + OFFSET_OF(CellSpursTaskset, pending_ready)) = v128{}; + vm::_ref(ctxt->taskset.addr() + OFFSET_OF(CellSpursTaskset, enabled)) = enabled; + vm::_ref(ctxt->taskset.addr() + OFFSET_OF(CellSpursTaskset, signalled)) = signalled; std::memcpy(spu._ptr(0x2700), spu._ptr(0x100), 128); // Copy data } //); diff --git a/ps3fw/cellVdec.cpp b/ps3fw/cellVdec.cpp index ce4e07b11..c4f679688 100644 --- a/ps3fw/cellVdec.cpp +++ b/ps3fw/cellVdec.cpp @@ -1675,7 +1675,7 @@ error_code cellVdecGetPicItem(ppu_thread& ppu, u32 handle, info->status = CELL_OK; info->attr = attr; - const vm::addr_t picinfo_addr{info.addr() + ::offset32(&all_info_t::picInfo)}; + const vm::addr_t picinfo_addr{info.addr() + OFFSET_OF(all_info_t, picInfo)}; info->picInfo_addr = picinfo_addr; if (vdec->type == CELL_VDEC_CODEC_TYPE_AVC) diff --git a/rpcs3/Emu/CMakeLists.txt b/rpcs3/Emu/CMakeLists.txt index bfcf682a2..04a9d41da 100644 --- a/rpcs3/Emu/CMakeLists.txt +++ b/rpcs3/Emu/CMakeLists.txt @@ -445,6 +445,8 @@ target_link_libraries(rpcs3_emu 3rdparty::libusb 3rdparty::wolfssl Vulkan::Headers rpcsx::fw::ps3::api + rpcsx::cpu::cell::ppu + rpcsx::cpu::cell::ppu::semantic PRIVATE 3rdparty::glslang diff --git a/rpcs3/Emu/Cell/PPUFunction.cpp b/rpcs3/Emu/Cell/PPUFunction.cpp index df7eaa76f..0c767090b 100644 --- a/rpcs3/Emu/Cell/PPUFunction.cpp +++ b/rpcs3/Emu/Cell/PPUFunction.cpp @@ -1908,7 +1908,7 @@ auto gen_ghc_cpp_trampoline(ppu_intrp_func_t fn_target) // Take second ghc arg c.mov(args[0], x86::rbp); - c.mov(args[2].r32(), x86::dword_ptr(args[0], ::offset32(&ppu_thread::cia))); + c.mov(args[2].r32(), x86::dword_ptr(args[0], OFFSET_OF(ppu_thread, cia))); c.add(args[2], x86::qword_ptr(reinterpret_cast(&vm::g_base_addr))); c.jmp(fn_target); }; @@ -1942,7 +1942,7 @@ auto gen_ghc_cpp_trampoline(ppu_intrp_func_t fn_target) c.bind(base_addr); c.embedUInt64(reinterpret_cast(&vm::g_base_addr)); c.bind(cia_offset); - c.embedUInt64(static_cast(::offset32(&ppu_thread::cia))); + c.embedUInt64(static_cast(OFFSET_OF(ppu_thread, cia))); c.bind(jmp_target); c.embedUInt64(reinterpret_cast(fn_target)); }; diff --git a/rpcs3/Emu/Cell/PPUInterpreter.cpp b/rpcs3/Emu/Cell/PPUInterpreter.cpp index 77b7fd785..a84bf21c4 100644 --- a/rpcs3/Emu/Cell/PPUInterpreter.cpp +++ b/rpcs3/Emu/Cell/PPUInterpreter.cpp @@ -1,4 +1,5 @@ #include "stdafx.h" +#include "rx/cpu/cell/ppu/Decoder.hpp" #include "PPUInterpreter.h" #include "Emu/Memory/vm_reservation.h" @@ -189,13 +190,11 @@ namespace asmjit } // Indexed offset to ppu.member - template ().*MPtr)[0]), uint I, uint N> - x86::Mem ppu_mem(const bf_t&, bool last = false) + template + x86::Mem ppu_mem(const bf_t&, std::size_t offset, std::size_t size, std::size_t elemSize, bool last = false) { // Required index shift for array indexing - constexpr u32 Shift = std::countr_zero(sizeof((std::declval().*MPtr)[0])); - - const u32 offset = ::offset32(MPtr); + u32 Shift = std::countr_zero(elemSize); auto tmp_r32 = x86::eax; auto reg_ppu = arg_ppu; @@ -222,13 +221,13 @@ namespace asmjit } // Use max possible index shift - constexpr u32 X86Shift = Shift > 3 ? 3 : Shift; - constexpr u32 AddShift = Shift - X86Shift; - constexpr u32 AndMask = (1u << N) - 1; + u32 X86Shift = Shift > 3 ? 
3 : Shift; + u32 AddShift = Shift - X86Shift; + u32 AndMask = (1u << N) - 1; - if constexpr (I >= AddShift) + if (I >= AddShift) { - if constexpr (I != AddShift) + if (I != AddShift) base::shr(tmp_r32, I - AddShift); base::and_(tmp_r32, AndMask << AddShift); } @@ -238,25 +237,24 @@ namespace asmjit base::shl(tmp_r32, I + AddShift); } - return x86::ptr(reg_ppu, tmp_r32.r64(), X86Shift, static_cast(offset - ppu_base), Size); + return x86::ptr(reg_ppu, tmp_r32.r64(), X86Shift, static_cast(offset - ppu_base), size); } // Generic offset to ppu.member - template ().*MPtr)> - x86::Mem ppu_mem() + x86::Mem ppu_mem(std::uint32_t offset, std::size_t size) { - return x86::ptr(arg_ppu, static_cast(::offset32(MPtr)), Size); + return x86::ptr(arg_ppu, offset, size); } template x86::Mem ppu_vr(const bf_t& bf, bool last = false) { - return ppu_mem<&ppu_thread::vr, Size>(bf, last); + return ppu_mem(bf, OFFSET_OF(ppu_thread, vr), Size, sizeof(ppu_thread::vr[0]), last); } x86::Mem ppu_sat() { - return ppu_mem<&ppu_thread::sat>(); + return ppu_mem(OFFSET_OF(ppu_thread, sat), sizeof(ppu_thread::sat)); } void ppu_ret(bool last = true) @@ -265,7 +263,7 @@ namespace asmjit base::mov(x86::rax, x86::qword_ptr(arg_next_fn)); base::add(arg_this_op, 4); if (is_debugger_present()) - base::mov(ppu_mem<&ppu_thread::cia>(), arg_this_op.r32()); + base::mov(ppu_mem(OFFSET_OF(ppu_thread, cia), sizeof(ppu_thread::cia)), arg_this_op.r32()); base::mov(arg_op, x86::dword_ptr(arg_this_op)); base::bswap(arg_op); base::add(arg_next_fn, 8); @@ -377,7 +375,7 @@ inline void ppu_cr_set(ppu_thread& ppu, u32 field, bool le, bool gt, bool eq, bo template inline void ppu_cr_set(ppu_thread& ppu, u32 field, const T& a, const T& b) { - ppu_cr_set(ppu, field, a b, a == b, ppu.xer.so); + ppu_cr_set(ppu, field, (a < b), (a > b), a == b, ppu.xer_so); } // TODO @@ -398,8 +396,8 @@ void ppu_set_cr(ppu_thread& ppu, u32 field, bool le, bool gt, bool eq, bool so) // Set XER.OV bit (overflow) inline void ppu_ov_set(ppu_thread& ppu, bool bit) { - ppu.xer.ov = bit; - ppu.xer.so |= bit; + ppu.xer_ov = bit; + ppu.xer_so |= bit; } // Write comparison results to FPCC field with optional CR field update @@ -428,7 +426,7 @@ void ppu_set_fpcc(ppu_thread& ppu, f64 a, f64 b, u64 cr_field = 1) fpcc[3] = cmp == std::partial_ordering::unordered; #endif - const u32 data = std::bit_cast(fpcc); + auto data = std::bit_cast(fpcc); // Write FPCC ppu.fpscr.fields[4] = data; @@ -440,7 +438,7 @@ void ppu_set_fpcc(ppu_thread& ppu, f64 a, f64 b, u64 cr_field = 1) if (g_cfg.core.ppu_debug) [[unlikely]] { - *reinterpret_cast(vm::g_stat_addr + ppu.cia) |= data; + *reinterpret_cast(vm::g_stat_addr + ppu.cia) |= std::bit_cast(data); } } } @@ -608,7 +606,7 @@ inline v128 ppu_select_vnan(v128 a, v128 b) return gv_selectfs(gv_eqfs(a, a), b, a | gv_bcst32(0x7fc00000u)); } -inline v128 ppu_select_vnan(v128 a, v128 b, Vector128 auto... args) +inline v128 ppu_select_vnan(v128 a, v128 b, rx::Vector128 auto... args) { return ppu_select_vnan(a, ppu_select_vnan(b, args...)); } @@ -633,7 +631,7 @@ inline v128 ppu_fix_vnan(v128 r) } template -inline v128 ppu_set_vnan(v128 r, Vector128 auto... args) +inline v128 ppu_set_vnan(v128 r, rx::Vector128 auto... args) { if constexpr (((Flags == set_vnan) || ...) 
&& sizeof...(args) > 0) { @@ -712,7 +710,7 @@ auto VADDFP() static const auto exec = [](auto&& d, auto&& a_, auto&& b_, auto&& jm_mask) { - auto m = gv_bcst32(jm_mask, &ppu_thread::jm_mask); + auto m = gv_bcst32(jm_mask); auto a = ppu_flush_denormal(m, std::move(a_)); auto b = ppu_flush_denormal(m, std::move(b_)); d = ppu_flush_denormal(std::move(m), ppu_set_vnan(gv_addfs(a, b), a, b)); @@ -1359,7 +1357,7 @@ auto VMADDFP() static const auto exec = [](auto&& d, auto&& a_, auto&& b_, auto&& c_, auto&& jm_mask) { - auto m = gv_bcst32(jm_mask, &ppu_thread::jm_mask); + auto m = gv_bcst32(jm_mask); auto a = ppu_flush_denormal(m, std::move(a_)); auto b = ppu_flush_denormal(m, std::move(b_)); auto c = ppu_flush_denormal(m, std::move(c_)); @@ -1377,7 +1375,7 @@ auto VMAXFP() static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& jm_mask) { - d = ppu_flush_denormal(gv_bcst32(jm_mask, &ppu_thread::jm_mask), ppu_set_vnan(gv_maxfs(a, b), a, b)); + d = ppu_flush_denormal(gv_bcst32(jm_mask), ppu_set_vnan(gv_maxfs(a, b), a, b)); }; RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.jm_mask); @@ -1524,7 +1522,7 @@ auto VMINFP() static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& jm_mask) { - d = ppu_flush_denormal(gv_bcst32(jm_mask, &ppu_thread::jm_mask), ppu_set_vnan(gv_minfs(a, b), a, b)); + d = ppu_flush_denormal(gv_bcst32(jm_mask), ppu_set_vnan(gv_minfs(a, b), a, b)); }; RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.jm_mask); @@ -1931,7 +1929,7 @@ auto VNMSUBFP() { // An odd case with (FLT_MIN, FLT_MIN, FLT_MIN) produces FLT_MIN instead of 0 auto s = gv_bcstfs(-0.0f); - auto m = gv_bcst32(jm_mask, &ppu_thread::jm_mask); + auto m = gv_bcst32(jm_mask); auto a = ppu_flush_denormal(m, std::move(a_)); auto b = ppu_flush_denormal(m, std::move(b_)); auto c = ppu_flush_denormal(m, std::move(c_)); @@ -2177,7 +2175,7 @@ auto VREFP() static const auto exec = [](auto&& d, auto&& b_, auto&& jm_mask) { - auto m = gv_bcst32(jm_mask, &ppu_thread::jm_mask); + auto m = gv_bcst32(jm_mask); auto b = ppu_flush_denormal(m, std::move(b_)); d = ppu_flush_denormal(std::move(m), ppu_set_vnan(gv_divfs(gv_bcstfs(1.0f), b), b)); }; @@ -2193,7 +2191,7 @@ auto VRFIM() static const auto exec = [](auto&& d, auto&& b_, auto&& jm_mask) { - auto m = gv_bcst32(jm_mask, &ppu_thread::jm_mask); + auto m = gv_bcst32(jm_mask); auto b = ppu_flush_denormal(m, std::move(b_)); d = ppu_flush_denormal(std::move(m), ppu_set_vnan(gv_roundfs_floor(b), b)); }; @@ -2209,7 +2207,7 @@ auto VRFIN() static const auto exec = [](auto&& d, auto&& b, auto&& jm_mask) { - auto m = gv_bcst32(jm_mask, &ppu_thread::jm_mask); + auto m = gv_bcst32(jm_mask); d = ppu_flush_denormal(std::move(m), ppu_set_vnan(gv_roundfs_even(b), b)); }; @@ -2224,7 +2222,7 @@ auto VRFIP() static const auto exec = [](auto&& d, auto&& b_, auto&& jm_mask) { - auto m = gv_bcst32(jm_mask, &ppu_thread::jm_mask); + auto m = gv_bcst32(jm_mask); auto b = ppu_flush_denormal(m, std::move(b_)); d = ppu_flush_denormal(std::move(m), ppu_set_vnan(gv_roundfs_ceil(b), b)); }; @@ -2240,7 +2238,7 @@ auto VRFIZ() static const auto exec = [](auto&& d, auto&& b, auto&& jm_mask) { - auto m = gv_bcst32(jm_mask, &ppu_thread::jm_mask); + auto m = gv_bcst32(jm_mask); d = ppu_flush_denormal(std::move(m), ppu_set_vnan(gv_roundfs_trunc(b), b)); }; @@ -2297,7 +2295,7 @@ auto VRSQRTEFP() static const auto exec = [](auto&& d, auto&& b_, auto&& jm_mask) { - auto m = gv_bcst32(jm_mask, &ppu_thread::jm_mask); + auto m = gv_bcst32(jm_mask); auto b = ppu_flush_denormal(m, std::move(b_)); 
d = ppu_flush_denormal(std::move(m), ppu_set_vnan(gv_divfs(gv_bcstfs(1.0f), gv_sqrtfs(b)), b)); }; @@ -2629,7 +2627,7 @@ auto VSUBFP() static const auto exec = [](auto&& d, auto&& a_, auto&& b_, auto&& jm_mask) { - auto m = gv_bcst32(jm_mask, &ppu_thread::jm_mask); + auto m = gv_bcst32(jm_mask); auto a = ppu_flush_denormal(m, std::move(a_)); auto b = ppu_flush_denormal(m, std::move(b_)); d = ppu_flush_denormal(std::move(m), ppu_set_vnan(gv_subfs(a, b), a, b)); @@ -3113,7 +3111,7 @@ auto SUBFIC() const s64 i = op.simm16; const auto r = add64_flags(~a, i, 1); ppu.gpr[op.rd] = r.result; - ppu.xer.ca = r.carry; + ppu.xer_ca = r.carry; }; RETURN_(ppu, op); } @@ -3170,7 +3168,7 @@ auto ADDIC() const s64 i = op.simm16; const auto r = add64_flags(a, i); ppu.gpr[op.rd] = r.result; - ppu.xer.ca = r.carry; + ppu.xer_ca = r.carry; if (op.main & 1) [[unlikely]] ppu_cr_set(ppu, 0, r.result, 0); }; @@ -3827,7 +3825,7 @@ auto SUBFC() const u64 RB = ppu.gpr[op.rb]; const auto r = add64_flags(~RA, RB, 1); ppu.gpr[op.rd] = r.result; - ppu.xer.ca = r.carry; + ppu.xer_ca = r.carry; if constexpr (((Flags == has_oe) || ...)) ppu_ov_set(ppu, (~RA >> 63 == RB >> 63) && (~RA >> 63 != ppu.gpr[op.rd] >> 63)); if constexpr (((Flags == has_rc) || ...)) @@ -3863,7 +3861,7 @@ auto ADDC() const u64 RB = ppu.gpr[op.rb]; const auto r = add64_flags(RA, RB); ppu.gpr[op.rd] = r.result; - ppu.xer.ca = r.carry; + ppu.xer_ca = r.carry; if constexpr (((Flags == has_oe) || ...)) ppu_ov_set(ppu, (RA >> 63 == RB >> 63) && (RA >> 63 != ppu.gpr[op.rd] >> 63)); if constexpr (((Flags == has_rc) || ...)) @@ -4394,9 +4392,9 @@ auto SUBFE() { const u64 RA = ppu.gpr[op.ra]; const u64 RB = ppu.gpr[op.rb]; - const auto r = add64_flags(~RA, RB, ppu.xer.ca); + const auto r = add64_flags(~RA, RB, ppu.xer_ca); ppu.gpr[op.rd] = r.result; - ppu.xer.ca = r.carry; + ppu.xer_ca = r.carry; if constexpr (((Flags == has_oe) || ...)) ppu_ov_set(ppu, (~RA >> 63 == RB >> 63) && (~RA >> 63 != ppu.gpr[op.rd] >> 63)); if constexpr (((Flags == has_rc) || ...)) @@ -4415,9 +4413,9 @@ auto ADDE() { const u64 RA = ppu.gpr[op.ra]; const u64 RB = ppu.gpr[op.rb]; - const auto r = add64_flags(RA, RB, ppu.xer.ca); + const auto r = add64_flags(RA, RB, ppu.xer_ca); ppu.gpr[op.rd] = r.result; - ppu.xer.ca = r.carry; + ppu.xer_ca = r.carry; if constexpr (((Flags == has_oe) || ...)) ppu_ov_set(ppu, (RA >> 63 == RB >> 63) && (RA >> 63 != ppu.gpr[op.rd] >> 63)); if constexpr (((Flags == has_rc) || ...)) @@ -4434,23 +4432,23 @@ auto MTOCRF() static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - alignas(4) static const u8 s_table[16][4]{ - {0, 0, 0, 0}, - {0, 0, 0, 1}, - {0, 0, 1, 0}, - {0, 0, 1, 1}, - {0, 1, 0, 0}, - {0, 1, 0, 1}, - {0, 1, 1, 0}, - {0, 1, 1, 1}, - {1, 0, 0, 0}, - {1, 0, 0, 1}, - {1, 0, 1, 0}, - {1, 0, 1, 1}, - {1, 1, 0, 0}, - {1, 1, 0, 1}, - {1, 1, 1, 0}, - {1, 1, 1, 1}, + alignas(4) static const CrField s_table[16]{ + CrField::From(false, false, false, false), + CrField::From(false, false, false, true), + CrField::From(false, false, true, false), + CrField::From(false, false, true, true), + CrField::From(false, true, false, false), + CrField::From(false, true, false, true), + CrField::From(false, true, true, false), + CrField::From(false, true, true, true), + CrField::From(true, false, false, false), + CrField::From(true, false, false, true), + CrField::From(true, false, true, false), + CrField::From(true, false, true, true), + CrField::From(true, true, false, false), + CrField::From(true, true, false, true), + CrField::From(true, true, true, 
false), + CrField::From(true, true, true, true), }; const u64 s = ppu.gpr[op.rs]; @@ -4461,7 +4459,7 @@ auto MTOCRF() const u32 n = std::countl_zero(op.crm) & 7; const u64 v = (s >> ((n * 4) ^ 0x1c)) & 0xf; - ppu.cr.fields[n] = *reinterpret_cast(s_table + v); + ppu.cr.fields[n] = s_table[v]; } else { @@ -4472,7 +4470,7 @@ auto MTOCRF() if (op.crm & (128 >> i)) { const u64 v = (s >> ((i * 4) ^ 0x1c)) & 0xf; - ppu.cr.fields[i] = *reinterpret_cast(s_table + v); + ppu.cr.fields[i] = s_table[v]; } } } @@ -4503,7 +4501,7 @@ auto STWCX() static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; - ppu_cr_set(ppu, 0, false, false, ppu_stwcx(ppu, vm::cast(addr), static_cast(ppu.gpr[op.rs])), ppu.xer.so); + ppu_cr_set(ppu, 0, false, false, ppu_stwcx(ppu, vm::cast(addr), static_cast(ppu.gpr[op.rs])), ppu.xer_so); }; RETURN_(ppu, op); } @@ -4591,9 +4589,9 @@ auto SUBFZE() static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 RA = ppu.gpr[op.ra]; - const auto r = add64_flags(~RA, 0, ppu.xer.ca); + const auto r = add64_flags(~RA, 0, ppu.xer_ca); ppu.gpr[op.rd] = r.result; - ppu.xer.ca = r.carry; + ppu.xer_ca = r.carry; if constexpr (((Flags == has_oe) || ...)) ppu_ov_set(ppu, (~RA >> 63 == 0) && (~RA >> 63 != ppu.gpr[op.rd] >> 63)); if constexpr (((Flags == has_rc) || ...)) @@ -4611,9 +4609,9 @@ auto ADDZE() static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 RA = ppu.gpr[op.ra]; - const auto r = add64_flags(RA, 0, ppu.xer.ca); + const auto r = add64_flags(RA, 0, ppu.xer_ca); ppu.gpr[op.rd] = r.result; - ppu.xer.ca = r.carry; + ppu.xer_ca = r.carry; if constexpr (((Flags == has_oe) || ...)) ppu_ov_set(ppu, (RA >> 63 == 0) && (RA >> 63 != ppu.gpr[op.rd] >> 63)); if constexpr (((Flags == has_rc) || ...)) @@ -4631,7 +4629,7 @@ auto STDCX() static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? 
ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; - ppu_cr_set(ppu, 0, false, false, ppu_stdcx(ppu, vm::cast(addr), ppu.gpr[op.rs]), ppu.xer.so); + ppu_cr_set(ppu, 0, false, false, ppu_stdcx(ppu, vm::cast(addr), ppu.gpr[op.rs]), ppu.xer_so); }; RETURN_(ppu, op); } @@ -4695,9 +4693,9 @@ auto SUBFME() static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 RA = ppu.gpr[op.ra]; - const auto r = add64_flags(~RA, ~0ull, ppu.xer.ca); + const auto r = add64_flags(~RA, ~0ull, ppu.xer_ca); ppu.gpr[op.rd] = r.result; - ppu.xer.ca = r.carry; + ppu.xer_ca = r.carry; if constexpr (((Flags == has_oe) || ...)) ppu_ov_set(ppu, (~RA >> 63 == 1) && (~RA >> 63 != ppu.gpr[op.rd] >> 63)); if constexpr (((Flags == has_rc) || ...)) @@ -4715,9 +4713,9 @@ auto ADDME() static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const s64 RA = ppu.gpr[op.ra]; - const auto r = add64_flags(RA, ~0ull, ppu.xer.ca); + const auto r = add64_flags(RA, ~0ull, ppu.xer_ca); ppu.gpr[op.rd] = r.result; - ppu.xer.ca = r.carry; + ppu.xer_ca = r.carry; if constexpr (((Flags == has_oe) || ...)) ppu_ov_set(ppu, (u64(RA) >> 63 == 1) && (u64(RA) >> 63 != ppu.gpr[op.rd] >> 63)); if constexpr (((Flags == has_rc) || ...)) @@ -4881,7 +4879,7 @@ auto MFSPR() switch (n) { - case 0x001: ppu.gpr[op.rd] = u32{ppu.xer.so} << 31 | ppu.xer.ov << 30 | ppu.xer.ca << 29 | ppu.xer.cnt; break; + case 0x001: ppu.gpr[op.rd] = u32{ppu.xer_so} << 31 | ppu.xer_ov << 30 | ppu.xer_ca << 29 | ppu.xer_cnt; break; case 0x008: ppu.gpr[op.rd] = ppu.lr; break; case 0x009: ppu.gpr[op.rd] = ppu.ctr; break; case 0x100: ppu.gpr[op.rd] = ppu.vrsave; break; @@ -5131,10 +5129,10 @@ auto MTSPR() case 0x001: { const u64 value = ppu.gpr[op.rs]; - ppu.xer.so = (value & 0x80000000) != 0; - ppu.xer.ov = (value & 0x40000000) != 0; - ppu.xer.ca = (value & 0x20000000) != 0; - ppu.xer.cnt = value & 0x7f; + ppu.xer_so = (value & 0x80000000) != 0; + ppu.xer_ov = (value & 0x40000000) != 0; + ppu.xer_ca = (value & 0x20000000) != 0; + ppu.xer_cnt = value & 0x7f; break; } case 0x008: ppu.lr = ppu.gpr[op.rs]; break; @@ -5264,7 +5262,7 @@ auto LSWX() static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; - u32 count = ppu.xer.cnt & 0x7f; + u32 count = ppu.xer_cnt & 0x7f; for (; count >= 4; count -= 4, addr += 4, op.rd = (op.rd + 1) & 31) { ppu.gpr[op.rd] = ppu_feed_data(ppu, addr); @@ -5497,7 +5495,7 @@ auto STSWX() static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { u64 addr = op.ra ? 
ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; - u32 count = ppu.xer.cnt & 0x7F; + u32 count = ppu.xer_cnt & 0x7F; for (; count >= 4; count -= 4, addr += 4, op.rs = (op.rs + 1) & 31) { PPU_WRITE_32(vm::cast(addr), static_cast(ppu.gpr[op.rs])); @@ -5675,12 +5673,12 @@ auto SRAW() if (shift > 31) { ppu.gpr[op.ra] = 0 - (RS < 0); - ppu.xer.ca = (RS < 0); + ppu.xer_ca = (RS < 0); } else { ppu.gpr[op.ra] = RS >> shift; - ppu.xer.ca = (RS < 0) && ((ppu.gpr[op.ra] << shift) != static_cast(RS)); + ppu.xer_ca = (RS < 0) && ((ppu.gpr[op.ra] << shift) != static_cast(RS)); } if constexpr (((Flags == has_rc) || ...)) @@ -5702,12 +5700,12 @@ auto SRAD() if (shift > 63) { ppu.gpr[op.ra] = 0 - (RS < 0); - ppu.xer.ca = (RS < 0); + ppu.xer_ca = (RS < 0); } else { ppu.gpr[op.ra] = RS >> shift; - ppu.xer.ca = (RS < 0) && ((ppu.gpr[op.ra] << shift) != static_cast(RS)); + ppu.xer_ca = (RS < 0) && ((ppu.gpr[op.ra] << shift) != static_cast(RS)); } if constexpr (((Flags == has_rc) || ...)) @@ -5742,7 +5740,7 @@ auto SRAWI() { s32 RS = static_cast(ppu.gpr[op.rs]); ppu.gpr[op.ra] = RS >> op.sh32; - ppu.xer.ca = (RS < 0) && (static_cast(ppu.gpr[op.ra] << op.sh32) != static_cast(RS)); + ppu.xer_ca = (RS < 0) && (static_cast(ppu.gpr[op.ra] << op.sh32) != static_cast(RS)); if constexpr (((Flags == has_rc) || ...)) ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); @@ -5761,7 +5759,7 @@ auto SRADI() auto sh = op.sh64; s64 RS = ppu.gpr[op.rs]; ppu.gpr[op.ra] = RS >> sh; - ppu.xer.ca = (RS < 0) && ((ppu.gpr[op.ra] << sh) != static_cast(RS)); + ppu.xer_ca = (RS < 0) && ((ppu.gpr[op.ra] << sh) != static_cast(RS)); if constexpr (((Flags == has_rc) || ...)) ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); @@ -6562,9 +6560,9 @@ auto MTFSFI() } else { - static constexpr std::array all_values = []() -> std::array + static constexpr auto all_values = [] { - std::array values{}; + std::array values{}; for (u32 i = 0; i < values.size(); i++) { @@ -6576,7 +6574,7 @@ auto MTFSFI() value |= (im & 1) << (8 * 1); im >>= 1; value |= (im & 1) << (8 * 0); - values[i] = value; + values[i] = std::bit_cast(value); } return values; @@ -8085,3 +8083,1179 @@ ppu_intrp_func_t ppu_interpreter_rt::decode(u32 opv) const noexcept return table.decode(opv); } + +using isel_type = void (*)(PPUContext&, rx::cell::ppu::Instruction); + +#define IMPORT_DECODER(x) extern "C" isel_type ISEL_PPU_##x##_DEC +#define IMPORT_DECODER_ALIAS(x, name) \ + extern "C" isel_type ISEL_PPU_##name##_DEC; \ + inline isel_type ISEL_PPU_##x##_DEC = ISEL_PPU_##name##_DEC; + +IMPORT_DECODER(MFVSCR); +IMPORT_DECODER(MTVSCR); +IMPORT_DECODER(VADDCUW); +IMPORT_DECODER(VADDFP); +IMPORT_DECODER(VADDSBS); +IMPORT_DECODER(VADDSHS); +IMPORT_DECODER(VADDSWS); +IMPORT_DECODER(VADDUBM); +IMPORT_DECODER(VADDUBS); +IMPORT_DECODER(VADDUHM); +IMPORT_DECODER(VADDUHS); +IMPORT_DECODER(VADDUWM); +IMPORT_DECODER(VADDUWS); +IMPORT_DECODER(VAND); +IMPORT_DECODER(VANDC); +IMPORT_DECODER(VAVGSB); +IMPORT_DECODER(VAVGSH); +IMPORT_DECODER(VAVGSW); +IMPORT_DECODER(VAVGUB); +IMPORT_DECODER(VAVGUH); +IMPORT_DECODER(VAVGUW); +IMPORT_DECODER(VCFSX); +IMPORT_DECODER(VCFUX); +IMPORT_DECODER(VCMPBFP); +IMPORT_DECODER_ALIAS(VCMPBFP_, VCMPBFP); +IMPORT_DECODER(VCMPEQFP); +IMPORT_DECODER_ALIAS(VCMPEQFP_, VCMPEQFP); +IMPORT_DECODER(VCMPEQUB); +IMPORT_DECODER_ALIAS(VCMPEQUB_, VCMPEQUB); +IMPORT_DECODER(VCMPEQUH); +IMPORT_DECODER_ALIAS(VCMPEQUH_, VCMPEQUH); +IMPORT_DECODER(VCMPEQUW); +IMPORT_DECODER_ALIAS(VCMPEQUW_, VCMPEQUW); +IMPORT_DECODER(VCMPGEFP); +IMPORT_DECODER_ALIAS(VCMPGEFP_, VCMPGEFP); +IMPORT_DECODER(VCMPGTFP); 
+IMPORT_DECODER_ALIAS(VCMPGTFP_, VCMPGTFP); +IMPORT_DECODER(VCMPGTSB); +IMPORT_DECODER_ALIAS(VCMPGTSB_, VCMPGTSB); +IMPORT_DECODER(VCMPGTSH); +IMPORT_DECODER_ALIAS(VCMPGTSH_, VCMPGTSH); +IMPORT_DECODER(VCMPGTSW); +IMPORT_DECODER_ALIAS(VCMPGTSW_, VCMPGTSW); +IMPORT_DECODER(VCMPGTUB); +IMPORT_DECODER_ALIAS(VCMPGTUB_, VCMPGTUB); +IMPORT_DECODER(VCMPGTUH); +IMPORT_DECODER_ALIAS(VCMPGTUH_, VCMPGTUH); +IMPORT_DECODER(VCMPGTUW); +IMPORT_DECODER_ALIAS(VCMPGTUW_, VCMPGTUW); +IMPORT_DECODER(VCTSXS); +IMPORT_DECODER(VCTUXS); +IMPORT_DECODER(VEXPTEFP); +IMPORT_DECODER(VLOGEFP); +IMPORT_DECODER(VMADDFP); +IMPORT_DECODER(VMAXFP); +IMPORT_DECODER(VMAXSB); +IMPORT_DECODER(VMAXSH); +IMPORT_DECODER(VMAXSW); +IMPORT_DECODER(VMAXUB); +IMPORT_DECODER(VMAXUH); +IMPORT_DECODER(VMAXUW); +IMPORT_DECODER(VMHADDSHS); +IMPORT_DECODER(VMHRADDSHS); +IMPORT_DECODER(VMINFP); +IMPORT_DECODER(VMINSB); +IMPORT_DECODER(VMINSH); +IMPORT_DECODER(VMINSW); +IMPORT_DECODER(VMINUB); +IMPORT_DECODER(VMINUH); +IMPORT_DECODER(VMINUW); +IMPORT_DECODER(VMLADDUHM); +IMPORT_DECODER(VMRGHB); +IMPORT_DECODER(VMRGHH); +IMPORT_DECODER(VMRGHW); +IMPORT_DECODER(VMRGLB); +IMPORT_DECODER(VMRGLH); +IMPORT_DECODER(VMRGLW); +IMPORT_DECODER(VMSUMMBM); +IMPORT_DECODER(VMSUMSHM); +IMPORT_DECODER(VMSUMSHS); +IMPORT_DECODER(VMSUMUBM); +IMPORT_DECODER(VMSUMUHM); +IMPORT_DECODER(VMSUMUHS); +IMPORT_DECODER(VMULESB); +IMPORT_DECODER(VMULESH); +IMPORT_DECODER(VMULEUB); +IMPORT_DECODER(VMULEUH); +IMPORT_DECODER(VMULOSB); +IMPORT_DECODER(VMULOSH); +IMPORT_DECODER(VMULOUB); +IMPORT_DECODER(VMULOUH); +IMPORT_DECODER(VNMSUBFP); +IMPORT_DECODER(VNOR); +IMPORT_DECODER(VOR); +IMPORT_DECODER(VPERM); +IMPORT_DECODER(VPKPX); +IMPORT_DECODER(VPKSHSS); +IMPORT_DECODER(VPKSHUS); +IMPORT_DECODER(VPKSWSS); +IMPORT_DECODER(VPKSWUS); +IMPORT_DECODER(VPKUHUM); +IMPORT_DECODER(VPKUHUS); +IMPORT_DECODER(VPKUWUM); +IMPORT_DECODER(VPKUWUS); +IMPORT_DECODER(VREFP); +IMPORT_DECODER(VRFIM); +IMPORT_DECODER(VRFIN); +IMPORT_DECODER(VRFIP); +IMPORT_DECODER(VRFIZ); +IMPORT_DECODER(VRLB); +IMPORT_DECODER(VRLH); +IMPORT_DECODER(VRLW); +IMPORT_DECODER(VRSQRTEFP); +IMPORT_DECODER(VSEL); +IMPORT_DECODER(VSL); +IMPORT_DECODER(VSLB); +IMPORT_DECODER(VSLDOI); +IMPORT_DECODER(VSLH); +IMPORT_DECODER(VSLO); +IMPORT_DECODER(VSLW); +IMPORT_DECODER(VSPLTB); +IMPORT_DECODER(VSPLTH); +IMPORT_DECODER(VSPLTISB); +IMPORT_DECODER(VSPLTISH); +IMPORT_DECODER(VSPLTISW); +IMPORT_DECODER(VSPLTW); +IMPORT_DECODER(VSR); +IMPORT_DECODER(VSRAB); +IMPORT_DECODER(VSRAH); +IMPORT_DECODER(VSRAW); +IMPORT_DECODER(VSRB); +IMPORT_DECODER(VSRH); +IMPORT_DECODER(VSRO); +IMPORT_DECODER(VSRW); +IMPORT_DECODER(VSUBCUW); +IMPORT_DECODER(VSUBFP); +IMPORT_DECODER(VSUBSBS); +IMPORT_DECODER(VSUBSHS); +IMPORT_DECODER(VSUBSWS); +IMPORT_DECODER(VSUBUBM); +IMPORT_DECODER(VSUBUBS); +IMPORT_DECODER(VSUBUHM); +IMPORT_DECODER(VSUBUHS); +IMPORT_DECODER(VSUBUWM); +IMPORT_DECODER(VSUBUWS); +IMPORT_DECODER(VSUMSWS); +IMPORT_DECODER(VSUM2SWS); +IMPORT_DECODER(VSUM4SBS); +IMPORT_DECODER(VSUM4SHS); +IMPORT_DECODER(VSUM4UBS); +IMPORT_DECODER(VUPKHPX); +IMPORT_DECODER(VUPKHSB); +IMPORT_DECODER(VUPKHSH); +IMPORT_DECODER(VUPKLPX); +IMPORT_DECODER(VUPKLSB); +IMPORT_DECODER(VUPKLSH); +IMPORT_DECODER(VXOR); +IMPORT_DECODER(TDI); +IMPORT_DECODER(TWI); +IMPORT_DECODER(MULLI); +IMPORT_DECODER(SUBFIC); +IMPORT_DECODER(CMPLI); +IMPORT_DECODER(CMPI); +IMPORT_DECODER(ADDIC); +IMPORT_DECODER(ADDI); +IMPORT_DECODER(ADDIS); +IMPORT_DECODER(BC); +IMPORT_DECODER(SC); +IMPORT_DECODER(B); +IMPORT_DECODER(MCRF); +IMPORT_DECODER(BCLR); +IMPORT_DECODER(RFID); 
+IMPORT_DECODER(CRNOR); +IMPORT_DECODER(RFSCV); +IMPORT_DECODER(CRANDC); +IMPORT_DECODER(ISYNC); +IMPORT_DECODER(CRXOR); +IMPORT_DECODER(CRNAND); +IMPORT_DECODER(CRAND); +IMPORT_DECODER(HRFID); +IMPORT_DECODER(CREQV); +IMPORT_DECODER(URFID); +IMPORT_DECODER(STOP); +IMPORT_DECODER(CRORC); +IMPORT_DECODER(CROR); +IMPORT_DECODER(BCCTR); +IMPORT_DECODER(RLWIMI); +IMPORT_DECODER(RLWINM); +IMPORT_DECODER(RLWNM); +IMPORT_DECODER(ORI); +IMPORT_DECODER(ORIS); +IMPORT_DECODER(XORI); +IMPORT_DECODER(XORIS); +IMPORT_DECODER(ANDI); +IMPORT_DECODER(ANDIS); +IMPORT_DECODER(RLDICL); +IMPORT_DECODER(RLDICR); +IMPORT_DECODER(RLDIC); +IMPORT_DECODER(RLDIMI); +IMPORT_DECODER(RLDCL); +IMPORT_DECODER(RLDCR); +IMPORT_DECODER(CMP); +IMPORT_DECODER(TW); +IMPORT_DECODER(LVSL); +IMPORT_DECODER(LVEBX); +IMPORT_DECODER(SUBFC); +IMPORT_DECODER(MULHDU); +IMPORT_DECODER(ADDC); +IMPORT_DECODER(MULHWU); +IMPORT_DECODER(MFOCRF); +IMPORT_DECODER(LWARX); +IMPORT_DECODER(LDX); +IMPORT_DECODER(LWZX); +IMPORT_DECODER(SLW); +IMPORT_DECODER(CNTLZW); +IMPORT_DECODER(SLD); +IMPORT_DECODER(AND); +IMPORT_DECODER(CMPL); +IMPORT_DECODER(LVSR); +IMPORT_DECODER(LVEHX); +IMPORT_DECODER(SUBF); +IMPORT_DECODER(LDUX); +IMPORT_DECODER(DCBST); +IMPORT_DECODER(LWZUX); +IMPORT_DECODER(CNTLZD); +IMPORT_DECODER(ANDC); +IMPORT_DECODER(TD); +IMPORT_DECODER(LVEWX); +IMPORT_DECODER(MULHD); +IMPORT_DECODER(MULHW); +IMPORT_DECODER(LDARX); +IMPORT_DECODER(DCBF); +IMPORT_DECODER(LBZX); +IMPORT_DECODER(LVX); +IMPORT_DECODER(NEG); +IMPORT_DECODER(LBZUX); +IMPORT_DECODER(NOR); +IMPORT_DECODER(STVEBX); +IMPORT_DECODER(SUBFE); +IMPORT_DECODER(ADDE); +IMPORT_DECODER(MTOCRF); +IMPORT_DECODER(STDX); +IMPORT_DECODER(STWCX); +IMPORT_DECODER(STWX); +IMPORT_DECODER(STVEHX); +IMPORT_DECODER(STDUX); +IMPORT_DECODER(STWUX); +IMPORT_DECODER(STVEWX); +IMPORT_DECODER(SUBFZE); +IMPORT_DECODER(ADDZE); +IMPORT_DECODER(STDCX); +IMPORT_DECODER(STBX); +IMPORT_DECODER(STVX); +IMPORT_DECODER(MULLD); +IMPORT_DECODER(SUBFME); +IMPORT_DECODER(ADDME); +IMPORT_DECODER(MULLW); +IMPORT_DECODER(DCBTST); +IMPORT_DECODER(STBUX); +IMPORT_DECODER(ADD); +IMPORT_DECODER(DCBT); +IMPORT_DECODER(LHZX); +IMPORT_DECODER(EQV); +IMPORT_DECODER(ECIWX); +IMPORT_DECODER(LHZUX); +IMPORT_DECODER(XOR); +IMPORT_DECODER(MFSPR); +IMPORT_DECODER(LWAX); +IMPORT_DECODER(DST); +IMPORT_DECODER(LHAX); +IMPORT_DECODER(LVXL); +IMPORT_DECODER(MFTB); +IMPORT_DECODER(LWAUX); +IMPORT_DECODER(DSTST); +IMPORT_DECODER(LHAUX); +IMPORT_DECODER(STHX); +IMPORT_DECODER(ORC); +IMPORT_DECODER(ECOWX); +IMPORT_DECODER(STHUX); +IMPORT_DECODER(OR); +IMPORT_DECODER(DIVDU); +IMPORT_DECODER(DIVWU); +IMPORT_DECODER(MTSPR); +IMPORT_DECODER(DCBI); +IMPORT_DECODER(NAND); +IMPORT_DECODER(STVXL); +IMPORT_DECODER(DIVD); +IMPORT_DECODER(DIVW); +IMPORT_DECODER(LVLX); +IMPORT_DECODER(LDBRX); +IMPORT_DECODER(LSWX); +IMPORT_DECODER(LWBRX); +IMPORT_DECODER(LFSX); +IMPORT_DECODER(SRW); +IMPORT_DECODER(SRD); +IMPORT_DECODER(LVRX); +IMPORT_DECODER(LSWI); +IMPORT_DECODER(LFSUX); +IMPORT_DECODER(SYNC); +IMPORT_DECODER(LFDX); +IMPORT_DECODER(LFDUX); +IMPORT_DECODER(STVLX); +IMPORT_DECODER(STDBRX); +IMPORT_DECODER(STSWX); +IMPORT_DECODER(STWBRX); +IMPORT_DECODER(STFSX); +IMPORT_DECODER(STVRX); +IMPORT_DECODER(STFSUX); +IMPORT_DECODER(STSWI); +IMPORT_DECODER(STFDX); +IMPORT_DECODER(STFDUX); +IMPORT_DECODER(LVLXL); +IMPORT_DECODER(LHBRX); +IMPORT_DECODER(SRAW); +IMPORT_DECODER(SRAD); +IMPORT_DECODER(LVRXL); +IMPORT_DECODER(DSS); +IMPORT_DECODER(SRAWI); +IMPORT_DECODER(SRADI); +IMPORT_DECODER(EIEIO); +IMPORT_DECODER(STVLXL); +IMPORT_DECODER(STHBRX); 
+IMPORT_DECODER(EXTSH); +IMPORT_DECODER(STVRXL); +IMPORT_DECODER(EXTSB); +IMPORT_DECODER(STFIWX); +IMPORT_DECODER(EXTSW); +IMPORT_DECODER(ICBI); +IMPORT_DECODER(DCBZ); +IMPORT_DECODER(LWZ); +IMPORT_DECODER(LWZU); +IMPORT_DECODER(LBZ); +IMPORT_DECODER(LBZU); +IMPORT_DECODER(STW); +IMPORT_DECODER(STWU); +IMPORT_DECODER(STB); +IMPORT_DECODER(STBU); +IMPORT_DECODER(LHZ); +IMPORT_DECODER(LHZU); +IMPORT_DECODER(LHA); +IMPORT_DECODER(LHAU); +IMPORT_DECODER(STH); +IMPORT_DECODER(STHU); +IMPORT_DECODER(LMW); +IMPORT_DECODER(STMW); +IMPORT_DECODER(LFS); +IMPORT_DECODER(LFSU); +IMPORT_DECODER(LFD); +IMPORT_DECODER(LFDU); +IMPORT_DECODER(STFS); +IMPORT_DECODER(STFSU); +IMPORT_DECODER(STFD); +IMPORT_DECODER(STFDU); +IMPORT_DECODER(LD); +IMPORT_DECODER(LDU); +IMPORT_DECODER(LWA); +IMPORT_DECODER(STD); +IMPORT_DECODER(STDU); +IMPORT_DECODER(FDIVS); +IMPORT_DECODER(FSUBS); +IMPORT_DECODER(FADDS); +IMPORT_DECODER(FSQRTS); +IMPORT_DECODER(FRES); +IMPORT_DECODER(FMULS); +IMPORT_DECODER(FMADDS); +IMPORT_DECODER(FMSUBS); +IMPORT_DECODER(FNMSUBS); +IMPORT_DECODER(FNMADDS); +IMPORT_DECODER(MTFSB1); +IMPORT_DECODER(MCRFS); +IMPORT_DECODER(MTFSB0); +IMPORT_DECODER(MTFSFI); +IMPORT_DECODER(MFFS); +IMPORT_DECODER(MTFSF); +IMPORT_DECODER(FCMPU); +IMPORT_DECODER(FRSP); +IMPORT_DECODER(FCTIW); +IMPORT_DECODER(FCTIWZ); +IMPORT_DECODER(FDIV); +IMPORT_DECODER(FSUB); +IMPORT_DECODER(FADD); +IMPORT_DECODER(FSQRT); +IMPORT_DECODER(FSEL); +IMPORT_DECODER(FMUL); +IMPORT_DECODER(FRSQRTE); +IMPORT_DECODER(FMSUB); +IMPORT_DECODER(FMADD); +IMPORT_DECODER(FNMSUB); +IMPORT_DECODER(FNMADD); +IMPORT_DECODER(FCMPO); +IMPORT_DECODER(FNEG); +IMPORT_DECODER(FMR); +IMPORT_DECODER(FNABS); +IMPORT_DECODER(FABS); +IMPORT_DECODER(FCTID); +IMPORT_DECODER(FCTIDZ); +IMPORT_DECODER(FCFID); +IMPORT_DECODER(UNK); +IMPORT_DECODER(SUBFCO); +IMPORT_DECODER(ADDCO); +IMPORT_DECODER(SUBFO); +IMPORT_DECODER(NEGO); +IMPORT_DECODER(SUBFEO); +IMPORT_DECODER(ADDEO); +IMPORT_DECODER(SUBFZEO); +IMPORT_DECODER(ADDZEO); +IMPORT_DECODER(SUBFMEO); +IMPORT_DECODER(MULLDO); +IMPORT_DECODER(ADDMEO); +IMPORT_DECODER(MULLWO); +IMPORT_DECODER(ADDO); +IMPORT_DECODER(DIVDUO); +IMPORT_DECODER(DIVWUO); +IMPORT_DECODER(DIVDO); +IMPORT_DECODER(DIVWO); +IMPORT_DECODER_ALIAS(SUBFCO_, SUBFCO); +IMPORT_DECODER_ALIAS(ADDCO_, ADDCO); +IMPORT_DECODER_ALIAS(SUBFO_, SUBFO); +IMPORT_DECODER_ALIAS(NEGO_, NEGO); +IMPORT_DECODER_ALIAS(SUBFEO_, SUBFEO); +IMPORT_DECODER_ALIAS(ADDEO_, ADDEO); +IMPORT_DECODER_ALIAS(SUBFZEO_, SUBFZEO); +IMPORT_DECODER_ALIAS(ADDZEO_, ADDZEO); +IMPORT_DECODER_ALIAS(SUBFMEO_, SUBFMEO); +IMPORT_DECODER_ALIAS(MULLDO_, MULLDO); +IMPORT_DECODER_ALIAS(ADDMEO_, ADDMEO); +IMPORT_DECODER_ALIAS(MULLWO_, MULLWO); +IMPORT_DECODER_ALIAS(ADDO_, ADDO); +IMPORT_DECODER_ALIAS(DIVDUO_, DIVDUO); +IMPORT_DECODER_ALIAS(DIVWUO_, DIVWUO); +IMPORT_DECODER_ALIAS(DIVDO_, DIVDO); +IMPORT_DECODER_ALIAS(DIVWO_, DIVWO); +IMPORT_DECODER_ALIAS(RLWIMI_, RLWIMI); +IMPORT_DECODER_ALIAS(RLWINM_, RLWINM); +IMPORT_DECODER_ALIAS(RLWNM_, RLWNM); +IMPORT_DECODER_ALIAS(RLDICL_, RLDICL); +IMPORT_DECODER_ALIAS(RLDICR_, RLDICR); +IMPORT_DECODER_ALIAS(RLDIC_, RLDIC); +IMPORT_DECODER_ALIAS(RLDIMI_, RLDIMI); +IMPORT_DECODER_ALIAS(RLDCL_, RLDCL); +IMPORT_DECODER_ALIAS(RLDCR_, RLDCR); +IMPORT_DECODER_ALIAS(SUBFC_, SUBFC); +IMPORT_DECODER_ALIAS(MULHDU_, MULHDU); +IMPORT_DECODER_ALIAS(ADDC_, ADDC); +IMPORT_DECODER_ALIAS(MULHWU_, MULHWU); +IMPORT_DECODER_ALIAS(SLW_, SLW); +IMPORT_DECODER_ALIAS(CNTLZW_, CNTLZW); +IMPORT_DECODER_ALIAS(SLD_, SLD); +IMPORT_DECODER_ALIAS(AND_, AND); +IMPORT_DECODER_ALIAS(SUBF_, SUBF); 
+IMPORT_DECODER_ALIAS(CNTLZD_, CNTLZD); +IMPORT_DECODER_ALIAS(ANDC_, ANDC); +IMPORT_DECODER_ALIAS(MULHD_, MULHD); +IMPORT_DECODER_ALIAS(MULHW_, MULHW); +IMPORT_DECODER_ALIAS(NEG_, NEG); +IMPORT_DECODER_ALIAS(NOR_, NOR); +IMPORT_DECODER_ALIAS(SUBFE_, SUBFE); +IMPORT_DECODER_ALIAS(ADDE_, ADDE); +IMPORT_DECODER_ALIAS(SUBFZE_, SUBFZE); +IMPORT_DECODER_ALIAS(ADDZE_, ADDZE); +IMPORT_DECODER_ALIAS(MULLD_, MULLD); +IMPORT_DECODER_ALIAS(SUBFME_, SUBFME); +IMPORT_DECODER_ALIAS(ADDME_, ADDME); +IMPORT_DECODER_ALIAS(MULLW_, MULLW); +IMPORT_DECODER_ALIAS(ADD_, ADD); +IMPORT_DECODER_ALIAS(EQV_, EQV); +IMPORT_DECODER_ALIAS(XOR_, XOR); +IMPORT_DECODER_ALIAS(ORC_, ORC); +IMPORT_DECODER_ALIAS(OR_, OR); +IMPORT_DECODER_ALIAS(DIVDU_, DIVDU); +IMPORT_DECODER_ALIAS(DIVWU_, DIVWU); +IMPORT_DECODER_ALIAS(NAND_, NAND); +IMPORT_DECODER_ALIAS(DIVD_, DIVD); +IMPORT_DECODER_ALIAS(DIVW_, DIVW); +IMPORT_DECODER_ALIAS(SRW_, SRW); +IMPORT_DECODER_ALIAS(SRD_, SRD); +IMPORT_DECODER_ALIAS(SRAW_, SRAW); +IMPORT_DECODER_ALIAS(SRAD_, SRAD); +IMPORT_DECODER_ALIAS(SRAWI_, SRAWI); +IMPORT_DECODER_ALIAS(SRADI_, SRADI); +IMPORT_DECODER_ALIAS(EXTSH_, EXTSH); +IMPORT_DECODER_ALIAS(EXTSB_, EXTSB); +IMPORT_DECODER_ALIAS(EXTSW_, EXTSW); +IMPORT_DECODER_ALIAS(FDIVS_, FDIVS); +IMPORT_DECODER_ALIAS(FSUBS_, FSUBS); +IMPORT_DECODER_ALIAS(FADDS_, FADDS); +IMPORT_DECODER_ALIAS(FSQRTS_, FSQRTS); +IMPORT_DECODER_ALIAS(FRES_, FRES); +IMPORT_DECODER_ALIAS(FMULS_, FMULS); +IMPORT_DECODER_ALIAS(FMADDS_, FMADDS); +IMPORT_DECODER_ALIAS(FMSUBS_, FMSUBS); +IMPORT_DECODER_ALIAS(FNMSUBS_, FNMSUBS); +IMPORT_DECODER_ALIAS(FNMADDS_, FNMADDS); +IMPORT_DECODER_ALIAS(MTFSB1_, MTFSB1); +IMPORT_DECODER_ALIAS(MTFSB0_, MTFSB0); +IMPORT_DECODER_ALIAS(MTFSFI_, MTFSFI); +IMPORT_DECODER_ALIAS(MFFS_, MFFS); +IMPORT_DECODER_ALIAS(MTFSF_, MTFSF); +IMPORT_DECODER_ALIAS(FRSP_, FRSP); +IMPORT_DECODER_ALIAS(FCTIW_, FCTIW); +IMPORT_DECODER_ALIAS(FCTIWZ_, FCTIWZ); +IMPORT_DECODER_ALIAS(FDIV_, FDIV); +IMPORT_DECODER_ALIAS(FSUB_, FSUB); +IMPORT_DECODER_ALIAS(FADD_, FADD); +IMPORT_DECODER_ALIAS(FSQRT_, FSQRT); +IMPORT_DECODER_ALIAS(FSEL_, FSEL); +IMPORT_DECODER_ALIAS(FMUL_, FMUL); +IMPORT_DECODER_ALIAS(FRSQRTE_, FRSQRTE); +IMPORT_DECODER_ALIAS(FMSUB_, FMSUB); +IMPORT_DECODER_ALIAS(FMADD_, FMADD); +IMPORT_DECODER_ALIAS(FNMSUB_, FNMSUB); +IMPORT_DECODER_ALIAS(FNMADD_, FNMADD); +IMPORT_DECODER_ALIAS(FNEG_, FNEG); +IMPORT_DECODER_ALIAS(FMR_, FMR); +IMPORT_DECODER_ALIAS(FNABS_, FNABS); +IMPORT_DECODER_ALIAS(FABS_, FABS); +IMPORT_DECODER_ALIAS(FCTID_, FCTID); +IMPORT_DECODER_ALIAS(FCTIDZ_, FCTIDZ); +IMPORT_DECODER_ALIAS(FCFID_, FCFID); +#undef IMPORT_DECODER +#undef IMPORT_DECODER_ALIAS + +PPUInterpreter::PPUInterpreter() +{ + for (auto& isel : impl) + { + isel = [](PPUContext&, rx::cell::ppu::Instruction) + { + fmt::throw_exception("PPU Invalid Instruction"); + }; + } +#define DEFINE_DECODER(x) \ + impl[static_cast(rx::cell::ppu::Opcode::x)] = ISEL_PPU_##x##_DEC + + DEFINE_DECODER(MFVSCR); + DEFINE_DECODER(MTVSCR); + DEFINE_DECODER(VADDCUW); + DEFINE_DECODER(VADDFP); + DEFINE_DECODER(VADDSBS); + DEFINE_DECODER(VADDSHS); + DEFINE_DECODER(VADDSWS); + DEFINE_DECODER(VADDUBM); + DEFINE_DECODER(VADDUBS); + DEFINE_DECODER(VADDUHM); + DEFINE_DECODER(VADDUHS); + DEFINE_DECODER(VADDUWM); + DEFINE_DECODER(VADDUWS); + DEFINE_DECODER(VAND); + DEFINE_DECODER(VANDC); + DEFINE_DECODER(VAVGSB); + DEFINE_DECODER(VAVGSH); + DEFINE_DECODER(VAVGSW); + DEFINE_DECODER(VAVGUB); + DEFINE_DECODER(VAVGUH); + DEFINE_DECODER(VAVGUW); + DEFINE_DECODER(VCFSX); + DEFINE_DECODER(VCFUX); + DEFINE_DECODER(VCMPBFP); + 
DEFINE_DECODER(VCMPBFP_); + DEFINE_DECODER(VCMPEQFP); + DEFINE_DECODER(VCMPEQFP_); + DEFINE_DECODER(VCMPEQUB); + DEFINE_DECODER(VCMPEQUB_); + DEFINE_DECODER(VCMPEQUH); + DEFINE_DECODER(VCMPEQUH_); + DEFINE_DECODER(VCMPEQUW); + DEFINE_DECODER(VCMPEQUW_); + DEFINE_DECODER(VCMPGEFP); + DEFINE_DECODER(VCMPGEFP_); + DEFINE_DECODER(VCMPGTFP); + DEFINE_DECODER(VCMPGTFP_); + DEFINE_DECODER(VCMPGTSB); + DEFINE_DECODER(VCMPGTSB_); + DEFINE_DECODER(VCMPGTSH); + DEFINE_DECODER(VCMPGTSH_); + DEFINE_DECODER(VCMPGTSW); + DEFINE_DECODER(VCMPGTSW_); + DEFINE_DECODER(VCMPGTUB); + DEFINE_DECODER(VCMPGTUB_); + DEFINE_DECODER(VCMPGTUH); + DEFINE_DECODER(VCMPGTUH_); + DEFINE_DECODER(VCMPGTUW); + DEFINE_DECODER(VCMPGTUW_); + DEFINE_DECODER(VCTSXS); + DEFINE_DECODER(VCTUXS); + DEFINE_DECODER(VEXPTEFP); + DEFINE_DECODER(VLOGEFP); + DEFINE_DECODER(VMADDFP); + DEFINE_DECODER(VMAXFP); + DEFINE_DECODER(VMAXSB); + DEFINE_DECODER(VMAXSH); + DEFINE_DECODER(VMAXSW); + DEFINE_DECODER(VMAXUB); + DEFINE_DECODER(VMAXUH); + DEFINE_DECODER(VMAXUW); + DEFINE_DECODER(VMHADDSHS); + DEFINE_DECODER(VMHRADDSHS); + DEFINE_DECODER(VMINFP); + DEFINE_DECODER(VMINSB); + DEFINE_DECODER(VMINSH); + DEFINE_DECODER(VMINSW); + DEFINE_DECODER(VMINUB); + DEFINE_DECODER(VMINUH); + DEFINE_DECODER(VMINUW); + DEFINE_DECODER(VMLADDUHM); + DEFINE_DECODER(VMRGHB); + DEFINE_DECODER(VMRGHH); + DEFINE_DECODER(VMRGHW); + DEFINE_DECODER(VMRGLB); + DEFINE_DECODER(VMRGLH); + DEFINE_DECODER(VMRGLW); + DEFINE_DECODER(VMSUMMBM); + DEFINE_DECODER(VMSUMSHM); + DEFINE_DECODER(VMSUMSHS); + DEFINE_DECODER(VMSUMUBM); + DEFINE_DECODER(VMSUMUHM); + DEFINE_DECODER(VMSUMUHS); + DEFINE_DECODER(VMULESB); + DEFINE_DECODER(VMULESH); + DEFINE_DECODER(VMULEUB); + DEFINE_DECODER(VMULEUH); + DEFINE_DECODER(VMULOSB); + DEFINE_DECODER(VMULOSH); + DEFINE_DECODER(VMULOUB); + DEFINE_DECODER(VMULOUH); + DEFINE_DECODER(VNMSUBFP); + DEFINE_DECODER(VNOR); + DEFINE_DECODER(VOR); + DEFINE_DECODER(VPERM); + DEFINE_DECODER(VPKPX); + DEFINE_DECODER(VPKSHSS); + DEFINE_DECODER(VPKSHUS); + DEFINE_DECODER(VPKSWSS); + DEFINE_DECODER(VPKSWUS); + DEFINE_DECODER(VPKUHUM); + DEFINE_DECODER(VPKUHUS); + DEFINE_DECODER(VPKUWUM); + DEFINE_DECODER(VPKUWUS); + DEFINE_DECODER(VREFP); + DEFINE_DECODER(VRFIM); + DEFINE_DECODER(VRFIN); + DEFINE_DECODER(VRFIP); + DEFINE_DECODER(VRFIZ); + DEFINE_DECODER(VRLB); + DEFINE_DECODER(VRLH); + DEFINE_DECODER(VRLW); + DEFINE_DECODER(VRSQRTEFP); + DEFINE_DECODER(VSEL); + DEFINE_DECODER(VSL); + DEFINE_DECODER(VSLB); + DEFINE_DECODER(VSLDOI); + DEFINE_DECODER(VSLH); + DEFINE_DECODER(VSLO); + DEFINE_DECODER(VSLW); + DEFINE_DECODER(VSPLTB); + DEFINE_DECODER(VSPLTH); + DEFINE_DECODER(VSPLTISB); + DEFINE_DECODER(VSPLTISH); + DEFINE_DECODER(VSPLTISW); + DEFINE_DECODER(VSPLTW); + DEFINE_DECODER(VSR); + DEFINE_DECODER(VSRAB); + DEFINE_DECODER(VSRAH); + DEFINE_DECODER(VSRAW); + DEFINE_DECODER(VSRB); + DEFINE_DECODER(VSRH); + DEFINE_DECODER(VSRO); + DEFINE_DECODER(VSRW); + DEFINE_DECODER(VSUBCUW); + DEFINE_DECODER(VSUBFP); + DEFINE_DECODER(VSUBSBS); + DEFINE_DECODER(VSUBSHS); + DEFINE_DECODER(VSUBSWS); + DEFINE_DECODER(VSUBUBM); + DEFINE_DECODER(VSUBUBS); + DEFINE_DECODER(VSUBUHM); + DEFINE_DECODER(VSUBUHS); + DEFINE_DECODER(VSUBUWM); + DEFINE_DECODER(VSUBUWS); + DEFINE_DECODER(VSUMSWS); + DEFINE_DECODER(VSUM2SWS); + DEFINE_DECODER(VSUM4SBS); + DEFINE_DECODER(VSUM4SHS); + DEFINE_DECODER(VSUM4UBS); + DEFINE_DECODER(VUPKHPX); + DEFINE_DECODER(VUPKHSB); + DEFINE_DECODER(VUPKHSH); + DEFINE_DECODER(VUPKLPX); + DEFINE_DECODER(VUPKLSB); + DEFINE_DECODER(VUPKLSH); + DEFINE_DECODER(VXOR); + 
DEFINE_DECODER(TDI); + DEFINE_DECODER(TWI); + DEFINE_DECODER(MULLI); + DEFINE_DECODER(SUBFIC); + DEFINE_DECODER(CMPLI); + DEFINE_DECODER(CMPI); + DEFINE_DECODER(ADDIC); + DEFINE_DECODER(ADDI); + DEFINE_DECODER(ADDIS); + DEFINE_DECODER(BC); + DEFINE_DECODER(SC); + DEFINE_DECODER(B); + DEFINE_DECODER(MCRF); + DEFINE_DECODER(BCLR); + DEFINE_DECODER(RFID); + DEFINE_DECODER(CRNOR); + DEFINE_DECODER(RFSCV); + DEFINE_DECODER(CRANDC); + DEFINE_DECODER(ISYNC); + DEFINE_DECODER(CRXOR); + DEFINE_DECODER(CRNAND); + DEFINE_DECODER(CRAND); + DEFINE_DECODER(HRFID); + DEFINE_DECODER(CREQV); + DEFINE_DECODER(URFID); + DEFINE_DECODER(STOP); + DEFINE_DECODER(CRORC); + DEFINE_DECODER(CROR); + DEFINE_DECODER(BCCTR); + DEFINE_DECODER(RLWIMI); + DEFINE_DECODER(RLWINM); + DEFINE_DECODER(RLWNM); + DEFINE_DECODER(ORI); + DEFINE_DECODER(ORIS); + DEFINE_DECODER(XORI); + DEFINE_DECODER(XORIS); + DEFINE_DECODER(ANDI); + DEFINE_DECODER(ANDIS); + DEFINE_DECODER(RLDICL); + DEFINE_DECODER(RLDICR); + DEFINE_DECODER(RLDIC); + DEFINE_DECODER(RLDIMI); + DEFINE_DECODER(RLDCL); + DEFINE_DECODER(RLDCR); + DEFINE_DECODER(CMP); + DEFINE_DECODER(TW); + DEFINE_DECODER(LVSL); + DEFINE_DECODER(LVEBX); + DEFINE_DECODER(SUBFC); + DEFINE_DECODER(MULHDU); + DEFINE_DECODER(ADDC); + DEFINE_DECODER(MULHWU); + DEFINE_DECODER(MFOCRF); + DEFINE_DECODER(LWARX); + DEFINE_DECODER(LDX); + DEFINE_DECODER(LWZX); + DEFINE_DECODER(SLW); + DEFINE_DECODER(CNTLZW); + DEFINE_DECODER(SLD); + DEFINE_DECODER(AND); + DEFINE_DECODER(CMPL); + DEFINE_DECODER(LVSR); + DEFINE_DECODER(LVEHX); + DEFINE_DECODER(SUBF); + DEFINE_DECODER(LDUX); + DEFINE_DECODER(DCBST); + DEFINE_DECODER(LWZUX); + DEFINE_DECODER(CNTLZD); + DEFINE_DECODER(ANDC); + DEFINE_DECODER(TD); + DEFINE_DECODER(LVEWX); + DEFINE_DECODER(MULHD); + DEFINE_DECODER(MULHW); + DEFINE_DECODER(LDARX); + DEFINE_DECODER(DCBF); + DEFINE_DECODER(LBZX); + DEFINE_DECODER(LVX); + DEFINE_DECODER(NEG); + DEFINE_DECODER(LBZUX); + DEFINE_DECODER(NOR); + DEFINE_DECODER(STVEBX); + DEFINE_DECODER(SUBFE); + DEFINE_DECODER(ADDE); + DEFINE_DECODER(MTOCRF); + DEFINE_DECODER(STDX); + DEFINE_DECODER(STWCX); + DEFINE_DECODER(STWX); + DEFINE_DECODER(STVEHX); + DEFINE_DECODER(STDUX); + DEFINE_DECODER(STWUX); + DEFINE_DECODER(STVEWX); + DEFINE_DECODER(SUBFZE); + DEFINE_DECODER(ADDZE); + DEFINE_DECODER(STDCX); + DEFINE_DECODER(STBX); + DEFINE_DECODER(STVX); + DEFINE_DECODER(MULLD); + DEFINE_DECODER(SUBFME); + DEFINE_DECODER(ADDME); + DEFINE_DECODER(MULLW); + DEFINE_DECODER(DCBTST); + DEFINE_DECODER(STBUX); + DEFINE_DECODER(ADD); + DEFINE_DECODER(DCBT); + DEFINE_DECODER(LHZX); + DEFINE_DECODER(EQV); + DEFINE_DECODER(ECIWX); + DEFINE_DECODER(LHZUX); + DEFINE_DECODER(XOR); + DEFINE_DECODER(MFSPR); + DEFINE_DECODER(LWAX); + DEFINE_DECODER(DST); + DEFINE_DECODER(LHAX); + DEFINE_DECODER(LVXL); + DEFINE_DECODER(MFTB); + DEFINE_DECODER(LWAUX); + DEFINE_DECODER(DSTST); + DEFINE_DECODER(LHAUX); + DEFINE_DECODER(STHX); + DEFINE_DECODER(ORC); + DEFINE_DECODER(ECOWX); + DEFINE_DECODER(STHUX); + DEFINE_DECODER(OR); + DEFINE_DECODER(DIVDU); + DEFINE_DECODER(DIVWU); + DEFINE_DECODER(MTSPR); + DEFINE_DECODER(DCBI); + DEFINE_DECODER(NAND); + DEFINE_DECODER(STVXL); + DEFINE_DECODER(DIVD); + DEFINE_DECODER(DIVW); + DEFINE_DECODER(LVLX); + DEFINE_DECODER(LDBRX); + DEFINE_DECODER(LSWX); + DEFINE_DECODER(LWBRX); + DEFINE_DECODER(LFSX); + DEFINE_DECODER(SRW); + DEFINE_DECODER(SRD); + DEFINE_DECODER(LVRX); + DEFINE_DECODER(LSWI); + DEFINE_DECODER(LFSUX); + DEFINE_DECODER(SYNC); + DEFINE_DECODER(LFDX); + DEFINE_DECODER(LFDUX); + DEFINE_DECODER(STVLX); + 
DEFINE_DECODER(STDBRX); + DEFINE_DECODER(STSWX); + DEFINE_DECODER(STWBRX); + DEFINE_DECODER(STFSX); + DEFINE_DECODER(STVRX); + DEFINE_DECODER(STFSUX); + DEFINE_DECODER(STSWI); + DEFINE_DECODER(STFDX); + DEFINE_DECODER(STFDUX); + DEFINE_DECODER(LVLXL); + DEFINE_DECODER(LHBRX); + DEFINE_DECODER(SRAW); + DEFINE_DECODER(SRAD); + DEFINE_DECODER(LVRXL); + DEFINE_DECODER(DSS); + DEFINE_DECODER(SRAWI); + DEFINE_DECODER(SRADI); + DEFINE_DECODER(EIEIO); + DEFINE_DECODER(STVLXL); + DEFINE_DECODER(STHBRX); + DEFINE_DECODER(EXTSH); + DEFINE_DECODER(STVRXL); + DEFINE_DECODER(EXTSB); + DEFINE_DECODER(STFIWX); + DEFINE_DECODER(EXTSW); + DEFINE_DECODER(ICBI); + DEFINE_DECODER(DCBZ); + DEFINE_DECODER(LWZ); + DEFINE_DECODER(LWZU); + DEFINE_DECODER(LBZ); + DEFINE_DECODER(LBZU); + DEFINE_DECODER(STW); + DEFINE_DECODER(STWU); + DEFINE_DECODER(STB); + DEFINE_DECODER(STBU); + DEFINE_DECODER(LHZ); + DEFINE_DECODER(LHZU); + DEFINE_DECODER(LHA); + DEFINE_DECODER(LHAU); + DEFINE_DECODER(STH); + DEFINE_DECODER(STHU); + DEFINE_DECODER(LMW); + DEFINE_DECODER(STMW); + DEFINE_DECODER(LFS); + DEFINE_DECODER(LFSU); + DEFINE_DECODER(LFD); + DEFINE_DECODER(LFDU); + DEFINE_DECODER(STFS); + DEFINE_DECODER(STFSU); + DEFINE_DECODER(STFD); + DEFINE_DECODER(STFDU); + DEFINE_DECODER(LD); + DEFINE_DECODER(LDU); + DEFINE_DECODER(LWA); + DEFINE_DECODER(STD); + DEFINE_DECODER(STDU); + DEFINE_DECODER(FDIVS); + DEFINE_DECODER(FSUBS); + DEFINE_DECODER(FADDS); + DEFINE_DECODER(FSQRTS); + DEFINE_DECODER(FRES); + DEFINE_DECODER(FMULS); + DEFINE_DECODER(FMADDS); + DEFINE_DECODER(FMSUBS); + DEFINE_DECODER(FNMSUBS); + DEFINE_DECODER(FNMADDS); + DEFINE_DECODER(MTFSB1); + DEFINE_DECODER(MCRFS); + DEFINE_DECODER(MTFSB0); + DEFINE_DECODER(MTFSFI); + DEFINE_DECODER(MFFS); + DEFINE_DECODER(MTFSF); + DEFINE_DECODER(FCMPU); + DEFINE_DECODER(FRSP); + DEFINE_DECODER(FCTIW); + DEFINE_DECODER(FCTIWZ); + DEFINE_DECODER(FDIV); + DEFINE_DECODER(FSUB); + DEFINE_DECODER(FADD); + DEFINE_DECODER(FSQRT); + DEFINE_DECODER(FSEL); + DEFINE_DECODER(FMUL); + DEFINE_DECODER(FRSQRTE); + DEFINE_DECODER(FMSUB); + DEFINE_DECODER(FMADD); + DEFINE_DECODER(FNMSUB); + DEFINE_DECODER(FNMADD); + DEFINE_DECODER(FCMPO); + DEFINE_DECODER(FNEG); + DEFINE_DECODER(FMR); + DEFINE_DECODER(FNABS); + DEFINE_DECODER(FABS); + DEFINE_DECODER(FCTID); + DEFINE_DECODER(FCTIDZ); + DEFINE_DECODER(FCFID); + DEFINE_DECODER(UNK); + DEFINE_DECODER(SUBFCO); + DEFINE_DECODER(ADDCO); + DEFINE_DECODER(SUBFO); + DEFINE_DECODER(NEGO); + DEFINE_DECODER(SUBFEO); + DEFINE_DECODER(ADDEO); + DEFINE_DECODER(SUBFZEO); + DEFINE_DECODER(ADDZEO); + DEFINE_DECODER(SUBFMEO); + DEFINE_DECODER(MULLDO); + DEFINE_DECODER(ADDMEO); + DEFINE_DECODER(MULLWO); + DEFINE_DECODER(ADDO); + DEFINE_DECODER(DIVDUO); + DEFINE_DECODER(DIVWUO); + DEFINE_DECODER(DIVDO); + DEFINE_DECODER(DIVWO); + DEFINE_DECODER(SUBFCO_); + DEFINE_DECODER(ADDCO_); + DEFINE_DECODER(SUBFO_); + DEFINE_DECODER(NEGO_); + DEFINE_DECODER(SUBFEO_); + DEFINE_DECODER(ADDEO_); + DEFINE_DECODER(SUBFZEO_); + DEFINE_DECODER(ADDZEO_); + DEFINE_DECODER(SUBFMEO_); + DEFINE_DECODER(MULLDO_); + DEFINE_DECODER(ADDMEO_); + DEFINE_DECODER(MULLWO_); + DEFINE_DECODER(ADDO_); + DEFINE_DECODER(DIVDUO_); + DEFINE_DECODER(DIVWUO_); + DEFINE_DECODER(DIVDO_); + DEFINE_DECODER(DIVWO_); + DEFINE_DECODER(RLWIMI_); + DEFINE_DECODER(RLWINM_); + DEFINE_DECODER(RLWNM_); + DEFINE_DECODER(RLDICL_); + DEFINE_DECODER(RLDICR_); + DEFINE_DECODER(RLDIC_); + DEFINE_DECODER(RLDIMI_); + DEFINE_DECODER(RLDCL_); + DEFINE_DECODER(RLDCR_); + DEFINE_DECODER(SUBFC_); + DEFINE_DECODER(MULHDU_); + 
DEFINE_DECODER(ADDC_); + DEFINE_DECODER(MULHWU_); + DEFINE_DECODER(SLW_); + DEFINE_DECODER(CNTLZW_); + DEFINE_DECODER(SLD_); + DEFINE_DECODER(AND_); + DEFINE_DECODER(SUBF_); + DEFINE_DECODER(CNTLZD_); + DEFINE_DECODER(ANDC_); + DEFINE_DECODER(MULHD_); + DEFINE_DECODER(MULHW_); + DEFINE_DECODER(NEG_); + DEFINE_DECODER(NOR_); + DEFINE_DECODER(SUBFE_); + DEFINE_DECODER(ADDE_); + DEFINE_DECODER(SUBFZE_); + DEFINE_DECODER(ADDZE_); + DEFINE_DECODER(MULLD_); + DEFINE_DECODER(SUBFME_); + DEFINE_DECODER(ADDME_); + DEFINE_DECODER(MULLW_); + DEFINE_DECODER(ADD_); + DEFINE_DECODER(EQV_); + DEFINE_DECODER(XOR_); + DEFINE_DECODER(ORC_); + DEFINE_DECODER(OR_); + DEFINE_DECODER(DIVDU_); + DEFINE_DECODER(DIVWU_); + DEFINE_DECODER(NAND_); + DEFINE_DECODER(DIVD_); + DEFINE_DECODER(DIVW_); + DEFINE_DECODER(SRW_); + DEFINE_DECODER(SRD_); + DEFINE_DECODER(SRAW_); + DEFINE_DECODER(SRAD_); + DEFINE_DECODER(SRAWI_); + DEFINE_DECODER(SRADI_); + DEFINE_DECODER(EXTSH_); + DEFINE_DECODER(EXTSB_); + DEFINE_DECODER(EXTSW_); + DEFINE_DECODER(FDIVS_); + DEFINE_DECODER(FSUBS_); + DEFINE_DECODER(FADDS_); + DEFINE_DECODER(FSQRTS_); + DEFINE_DECODER(FRES_); + DEFINE_DECODER(FMULS_); + DEFINE_DECODER(FMADDS_); + DEFINE_DECODER(FMSUBS_); + DEFINE_DECODER(FNMSUBS_); + DEFINE_DECODER(FNMADDS_); + DEFINE_DECODER(MTFSB1_); + DEFINE_DECODER(MTFSB0_); + DEFINE_DECODER(MTFSFI_); + DEFINE_DECODER(MFFS_); + DEFINE_DECODER(MTFSF_); + DEFINE_DECODER(FRSP_); + DEFINE_DECODER(FCTIW_); + DEFINE_DECODER(FCTIWZ_); + DEFINE_DECODER(FDIV_); + DEFINE_DECODER(FSUB_); + DEFINE_DECODER(FADD_); + DEFINE_DECODER(FSQRT_); + DEFINE_DECODER(FSEL_); + DEFINE_DECODER(FMUL_); + DEFINE_DECODER(FRSQRTE_); + DEFINE_DECODER(FMSUB_); + DEFINE_DECODER(FMADD_); + DEFINE_DECODER(FNMSUB_); + DEFINE_DECODER(FNMADD_); + DEFINE_DECODER(FNEG_); + DEFINE_DECODER(FMR_); + DEFINE_DECODER(FNABS_); + DEFINE_DECODER(FABS_); + DEFINE_DECODER(FCTID_); + DEFINE_DECODER(FCTIDZ_); + DEFINE_DECODER(FCFID_); +#undef DEFINE_DECODER +} + +static ppu_intrp_func ppu_ret = {[](ppu_thread& ppu, ppu_opcode_t, be_t<u32>* this_op, ppu_intrp_func*) + { + // Fix PC and return (step execution) + ppu.cia = vm::get_addr(this_op); + return; + }}; + +void PPUInterpreter::interpret(PPUContext& context, std::uint32_t inst) +{ + auto op = rx::cell::ppu::getOpcode(inst); + auto instructionAddress = context.cia; + + auto this_op = reinterpret_cast<be_t<u32>*>(vm::g_base_addr + instructionAddress); + + const auto fn = *reinterpret_cast<ppu_intrp_func_t*>(vm::g_exec_addr + u64{instructionAddress} * 2); + + if (fn) + { + fn(static_cast<ppu_thread&>(context), std::bit_cast<ppu_opcode_t>(inst), this_op, &ppu_ret); + return; + } + + // if (op == rx::cell::ppu::Opcode::Invalid) + { + if (g_fxo->get<ppu_function_manager>().is_func(context.cia)) + { + ppu_intrp_func_t hle_function = nullptr; + auto hle_addr = g_fxo->get<ppu_function_manager>().addr; + // HLE function index + const u32 index = (context.cia - hle_addr) / 8; + + if (context.cia % 8 == 4 && index < ppu_function_manager::get().size()) + { + // HLE function placement + hle_function = ppu_function_manager::get()[index]; + } + + if (hle_function) + { + hle_function(static_cast<ppu_thread&>(context), std::bit_cast<ppu_opcode_t>(inst), this_op, nullptr); + return; + } + } + } + + // std::fprintf(stderr, "%08x: %s\n", instructionAddress, std::format("{}", op).c_str()); + impl[static_cast<std::size_t>(op)](context, std::bit_cast<rx::cell::ppu::Instruction>(inst)); + + if (context.cia == instructionAddress && + op != rx::cell::ppu::Opcode::B && + op != rx::cell::ppu::Opcode::BC && + op != rx::cell::ppu::Opcode::BCLR && + op != rx::cell::ppu::Opcode::BCCTR) + { + context.cia += sizeof(std::uint32_t); + } +} + +extern "C" +{ + 
[[noreturn]] void rpcsx_trap() + { + fmt::throw_exception("PPU Trap"); + } + [[noreturn]] void rpcsx_invalid_instruction() + { + fmt::throw_exception("PPU Invalid Instruction"); + } + [[noreturn]] void rpcsx_unimplemented_instruction() + { + fmt::throw_exception("PPU Unimplemented Instruction"); + } + + void rpcsx_vm_read(std::uint64_t vaddr, void* dest, std::size_t size) + { + std::memcpy(dest, vm::g_base_addr + vaddr, size); + } + void rpcsx_vm_write(std::uint64_t vaddr, const void* src, std::size_t size) + { + std::memcpy(vm::g_base_addr + vaddr, src, size); + } + + std::uint64_t rpcsx_get_tb() + { + return get_timebased_time(); + } +} + +void ppu_execute_syscall(PPUContext& context, u64 code) +{ + return ppu_execute_syscall(static_cast<ppu_thread&>(context), code); +} +u32 ppu_lwarx(PPUContext& context, u32 addr) +{ + return ppu_lwarx(static_cast<ppu_thread&>(context), addr); +} +u64 ppu_ldarx(PPUContext& context, u32 addr) +{ + return ppu_ldarx(static_cast<ppu_thread&>(context), addr); +} +bool ppu_stwcx(PPUContext& context, u32 addr, u32 reg_value) +{ + return ppu_stwcx(static_cast<ppu_thread&>(context), addr, reg_value); +} +bool ppu_stdcx(PPUContext& context, u32 addr, u64 reg_value) +{ + return ppu_stdcx(static_cast<ppu_thread&>(context), addr, reg_value); +} +void ppu_trap(PPUContext& context, u64 addr) +{ + return ppu_trap(static_cast<ppu_thread&>(context), addr); +} diff --git a/rpcs3/Emu/Cell/PPUInterpreter.h b/rpcs3/Emu/Cell/PPUInterpreter.h index ca8d03db5..64703e5f8 100644 --- a/rpcs3/Emu/Cell/PPUInterpreter.h +++ b/rpcs3/Emu/Cell/PPUInterpreter.h @@ -1,6 +1,11 @@ #pragma once #include "PPUOpcodes.h" +#include "rx/cpu/cell/ppu/Instruction.hpp" +#include "rx/cpu/cell/ppu/Opcode.hpp" +#include "rx/cpu/cell/ppu/PPUContext.hpp" +#include "rx/refl.hpp" +#include class ppu_thread; @@ -42,3 +47,12 @@ struct ppu_interpreter_rt : ppu_interpreter_rt_base private: ppu_decoder, ppu_intrp_func_t> table; }; + +struct PPUContext; + +struct PPUInterpreter +{ + std::array> impl; + PPUInterpreter(); + void interpret(PPUContext& context, std::uint32_t inst); +}; diff --git a/rpcs3/Emu/Cell/PPUModule.cpp b/rpcs3/Emu/Cell/PPUModule.cpp index 05e5a1a5e..4fca8778f 100644 --- a/rpcs3/Emu/Cell/PPUModule.cpp +++ b/rpcs3/Emu/Cell/PPUModule.cpp @@ -333,7 +333,7 @@ static void ppu_initialize_modules(ppu_linkage_info* link, utils::serial* ar = n }; // Initialize double-purpose fake OPD array for HLE functions - const auto& hle_funcs = ppu_function_manager::get(g_cfg.core.ppu_decoder != ppu_decoder_type::_static); + const auto& hle_funcs = ppu_function_manager::get(g_cfg.core.ppu_decoder == ppu_decoder_type::llvm_legacy); u32& hle_funcs_addr = g_fxo->get<ppu_function_manager>().addr; diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp index bb3e7bb8d..34d6c13d4 100644 --- a/rpcs3/Emu/Cell/PPUThread.cpp +++ b/rpcs3/Emu/Cell/PPUThread.cpp @@ -1,4 +1,5 @@ #include "stdafx.h" +#include "rx/cpu/cell/ppu/Decoder.hpp" #include "util/JIT.h" #include "util/StrUtil.h" #include "util/serialization.hpp" @@ -27,6 +28,9 @@ #include "lv2/sys_overlay.h" #include "lv2/sys_process.h" #include "lv2/sys_spu.h" +#include +#include +#include #ifdef LLVM_AVAILABLE #ifdef _MSC_VER @@ -317,12 +321,12 @@ const auto ppu_gateway = build_function_asm("ppu_gateway" #endif // Save native stack pointer for longjmp emulation - c.mov(x86::qword_ptr(args[0], ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs)), x86::rsp); + c.mov(x86::qword_ptr(args[0], OFFSET_OF(ppu_thread, hv_ctx.regs)), x86::rsp); // Initialize args c.mov(x86::r13, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_exec_addr))); 
c.mov(x86::rbp, args[0]); - c.mov(x86::edx, x86::dword_ptr(x86::rbp, ::offset32(&ppu_thread::cia))); // Load PC + c.mov(x86::edx, x86::dword_ptr(x86::rbp, OFFSET_OF(ppu_thread, cia))); // Load PC c.mov(x86::rax, x86::qword_ptr(x86::r13, x86::edx, 1, 0)); // Load call target c.mov(x86::rdx, x86::rax); @@ -333,9 +337,9 @@ const auto ppu_gateway = build_function_asm("ppu_gateway" c.mov(x86::r12d, x86::edx); // Load relocation base c.mov(x86::rbx, x86::qword_ptr(reinterpret_cast(&vm::g_base_addr))); - c.mov(x86::r14, x86::qword_ptr(x86::rbp, ::offset32(&ppu_thread::gpr, 0))); // Load some registers - c.mov(x86::rsi, x86::qword_ptr(x86::rbp, ::offset32(&ppu_thread::gpr, 1))); - c.mov(x86::rdi, x86::qword_ptr(x86::rbp, ::offset32(&ppu_thread::gpr, 2))); + c.mov(x86::r14, x86::qword_ptr(x86::rbp, OFFSET_OF(ppu_thread, gpr[0]))); // Load some registers + c.mov(x86::rsi, x86::qword_ptr(x86::rbp, OFFSET_OF(ppu_thread, gpr[1]))); + c.mov(x86::rdi, x86::qword_ptr(x86::rbp, OFFSET_OF(ppu_thread, gpr[2]))); if (utils::has_avx()) { @@ -403,7 +407,7 @@ const auto ppu_gateway = build_function_asm("ppu_gateway" // pc, sp // x18, x19...x30 // NOTE: Do not touch x19..x30 before saving the registers! - const u64 hv_register_array_offset = ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs); + const u64 hv_register_array_offset = OFFSET_OF(ppu_thread, hv_ctx.regs); Label hv_ctx_pc = c.newLabel(); // Used to hold the far jump return address // Sanity @@ -434,7 +438,7 @@ const auto ppu_gateway = build_function_asm("ppu_gateway" const arm::GpX pc = a64::x15; const arm::GpX cia_addr_reg = a64::x11; // Load offset value - c.mov(cia_addr_reg, Imm(static_cast(::offset32(&ppu_thread::cia)))); + c.mov(cia_addr_reg, Imm(static_cast(OFFSET_OF(ppu_thread, cia)))); // Load cia c.ldr(pc.w(), arm::Mem(ppu_t_base, cia_addr_reg)); @@ -459,7 +463,7 @@ const auto ppu_gateway = build_function_asm("ppu_gateway" c.ldr(a64::x22, arm::Mem(a64::x22)); const arm::GpX gpr_addr_reg = a64::x9; - c.mov(gpr_addr_reg, Imm(static_cast(::offset32(&ppu_thread::gpr)))); + c.mov(gpr_addr_reg, Imm(static_cast(OFFSET_OF(ppu_thread, gpr)))); c.add(gpr_addr_reg, gpr_addr_reg, ppu_t_base); c.ldr(a64::x23, arm::Mem(gpr_addr_reg)); c.ldr(a64::x24, arm::Mem(gpr_addr_reg, 8)); @@ -514,7 +518,7 @@ const extern auto ppu_escape = build_function_asm("ppu_es #if defined(ARCH_X64) // Restore native stack pointer (longjmp emulation) - c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs))); + c.mov(x86::rsp, x86::qword_ptr(args[0], OFFSET_OF(ppu_thread, hv_ctx.regs))); // Return to the return location c.sub(x86::rsp, 8); @@ -523,7 +527,7 @@ const extern auto ppu_escape = build_function_asm("ppu_es // We really shouldn't be using this, but an implementation shoudln't hurt // Far jump return. Only clobbers x30. 
const arm::GpX ppu_t_base = a64::x20; - const u64 hv_register_array_offset = ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs); + const u64 hv_register_array_offset = OFFSET_OF(ppu_thread, hv_ctx.regs); c.mov(ppu_t_base, args[0]); c.mov(a64::x30, Imm(hv_register_array_offset)); c.ldr(a64::x30, arm::Mem(ppu_t_base, a64::x30)); @@ -581,7 +585,7 @@ static inline ppu_intrp_func_t ppu_read(u32 addr) // Get interpreter cache value static ppu_intrp_func_t ppu_cache(u32 addr) { - if (g_cfg.core.ppu_decoder != ppu_decoder_type::_static) + if (g_cfg.core.ppu_decoder == ppu_decoder_type::llvm_legacy) { fmt::throw_exception("Invalid PPU decoder"); } @@ -882,7 +886,7 @@ extern void ppu_register_range(u32 addr, u32 size) while (size) { - if (g_cfg.core.ppu_decoder == ppu_decoder_type::llvm) + if (g_cfg.core.ppu_decoder == ppu_decoder_type::llvm_legacy) { // Assume addr is the start of first segment of PRX const uptr entry_value = reinterpret_cast(ppu_recompiler_fallback_ghc) | (seg_base << (32 + 3)); @@ -919,7 +923,7 @@ extern void ppu_register_function_at(u32 addr, u32 size, ppu_intrp_func_t ptr = return; } - if (g_cfg.core.ppu_decoder == ppu_decoder_type::llvm) + if (g_cfg.core.ppu_decoder != ppu_decoder_type::_static) { return; } @@ -1097,14 +1101,14 @@ struct ppu_far_jumps_t #ifdef ARCH_X64 c.mov(args[0], x86::rbp); - c.mov(x86::dword_ptr(args[0], ::offset32(&ppu_thread::cia)), pc); + c.mov(x86::dword_ptr(args[0], OFFSET_OF(ppu_thread, cia)), pc); c.jmp(ppu_far_jump); #else Label jmp_address = c.newLabel(); Label imm_address = c.newLabel(); c.ldr(args[1].w(), arm::ptr(imm_address)); - c.str(args[1].w(), arm::Mem(args[0], ::offset32(&ppu_thread::cia))); + c.str(args[1].w(), arm::Mem(args[0], OFFSET_OF(ppu_thread, cia))); c.ldr(args[1], arm::ptr(jmp_address)); c.br(args[1]); @@ -1204,7 +1208,7 @@ bool ppu_form_branch_to_code(u32 entry, u32 target, bool link, bool with_toc, st std::lock_guard lock(jumps.mutex); jumps.vals.insert_or_assign(entry, ppu_far_jumps_t::all_info_t{target, link, with_toc, std::move(module_name)}); - ppu_register_function_at(entry, 4, g_cfg.core.ppu_decoder == ppu_decoder_type::_static ? &ppu_far_jump : ensure(g_fxo->get().gen_jump(entry))); + ppu_register_function_at(entry, 4, g_cfg.core.ppu_decoder != ppu_decoder_type::llvm_legacy ? 
&ppu_far_jump : ensure(g_fxo->get().gen_jump(entry))); return true; } @@ -1288,7 +1292,7 @@ static void ppu_break(ppu_thread& ppu, ppu_opcode_t, be_t* this_op, ppu_int // Set or remove breakpoint extern bool ppu_breakpoint(u32 addr, bool is_adding) { - if (addr % 4 || !vm::check_addr(addr, vm::page_executable) || g_cfg.core.ppu_decoder == ppu_decoder_type::llvm) + if (addr % 4 || !vm::check_addr(addr, vm::page_executable) || g_cfg.core.ppu_decoder == ppu_decoder_type::llvm_legacy) { return false; } @@ -1359,7 +1363,7 @@ extern bool ppu_patch(u32 addr, u32 value) const bool is_exec = vm::check_addr(addr, vm::page_executable); - if (is_exec && g_cfg.core.ppu_decoder == ppu_decoder_type::llvm && !Emu.IsReady()) + if (is_exec && g_cfg.core.ppu_decoder == ppu_decoder_type::llvm_legacy && !Emu.IsReady()) { // TODO: support recompilers ppu_log.fatal("Patch failed at 0x%x: LLVM recompiler is used.", addr); @@ -1648,7 +1652,7 @@ void ppu_thread::dump_regs(std::string& ret, std::any& custom_data) const fmt::append(ret, "LR: 0x%llx\n", lr); fmt::append(ret, "CTR: 0x%llx\n", ctr); fmt::append(ret, "VRSAVE: 0x%08x\n", vrsave); - fmt::append(ret, "XER: [CA=%u | OV=%u | SO=%u | CNT=%u]\n", xer.ca, xer.ov, xer.so, xer.cnt); + fmt::append(ret, "XER: [CA=%u | OV=%u | SO=%u | CNT=%u]\n", xer_ca, xer_ov, xer_so, xer_cnt); fmt::append(ret, "VSCR: [SAT=%u | NJ=%u]\n", sat, nj); fmt::append(ret, "FPSCR: [FL=%u | FG=%u | FE=%u | FU=%u]\n", fpscr.fl, fpscr.fg, fpscr.fe, fpscr.fu); @@ -2441,9 +2445,10 @@ void ppu_thread::cpu_wait(bs_t old) state.wait(old); } +// static_assert(offsetof(ppu_thread, gpr[0]) == 24); void ppu_thread::exec_task() { - if (g_cfg.core.ppu_decoder != ppu_decoder_type::_static) + if (g_cfg.core.ppu_decoder == ppu_decoder_type::llvm_legacy) { // HVContext push to allow recursion. This happens with guest callback invocations. 
const auto old_hv_ctx = hv_ctx; @@ -2464,9 +2469,28 @@ void ppu_thread::exec_task() return; } - const auto cache = vm::g_exec_addr; const auto mem_ = vm::g_base_addr; + if (g_cfg.core.ppu_decoder == ppu_decoder_type::interpreter) + { + static PPUInterpreter interpreter; + + while (true) + { + if (test_stopped()) [[unlikely]] + { + return; + } + + std::uint32_t inst = *reinterpret_cast*>(mem_ + std::uint64_t{cia}); + interpreter.interpret(*this, inst); + } + + return; + } + + const auto cache = vm::g_exec_addr; + while (true) { if (test_stopped()) [[unlikely]] @@ -2556,7 +2580,7 @@ void ppu_thread::serialize_common(utils::serial& ar) { [[maybe_unused]] const s32 version = GET_OR_USE_SERIALIZATION_VERSION(ar.is_writing(), ppu); - ar(gpr, fpr, cr, fpscr.bits, lr, ctr, vrsave, cia, xer, sat, nj, prio.raw().all); + // ar(gpr, fpr, cr, fpscr.bits, lr, ctr, vrsave, cia, xer, sat, nj, prio.raw().all); if (cia % 4 || (cia >> 28) >= 0xCu) { @@ -3309,7 +3333,7 @@ const auto ppu_stcx_accurate_tx = build_function_asm(cpu_flag::pause)); + c.bt(x86::dword_ptr(args[2], OFFSET_OF(ppu_thread, state) - OFFSET_OF(ppu_thread, rdata)), static_cast(cpu_flag::pause)); c.jc(fall); c.xbegin(tx1); @@ -3410,7 +3434,7 @@ const auto ppu_stcx_accurate_tx = build_function_asm& info, bool force_mem_release } } - if (g_cfg.core.ppu_decoder != ppu_decoder_type::llvm) + if (g_cfg.core.ppu_decoder != ppu_decoder_type::llvm_legacy) { return; } @@ -4034,7 +4058,7 @@ extern void ppu_finalize(const ppu_module& info, bool force_mem_release extern void ppu_precompile(std::vector& dir_queue, std::vector*>* loaded_modules) { - if (g_cfg.core.ppu_decoder != ppu_decoder_type::llvm) + if (g_cfg.core.ppu_decoder != ppu_decoder_type::llvm_legacy) { return; } @@ -4744,7 +4768,7 @@ extern void ppu_initialize() bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_size, concurent_memory_limit& memory_limit) { - if (g_cfg.core.ppu_decoder != ppu_decoder_type::llvm) + if (g_cfg.core.ppu_decoder != ppu_decoder_type::llvm_legacy) { if (check_only || vm::base(info.segs[0].addr) != info.segs[0].ptr) { @@ -5106,7 +5130,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_s c.add(x86::edx, seg0); c.mov(x86::rax, x86::qword_ptr(reinterpret_cast(&vm::g_exec_addr))); - c.mov(x86::dword_ptr(x86::rbp, ::offset32(&ppu_thread::cia)), x86::edx); + c.mov(x86::dword_ptr(x86::rbp, OFFSET_OF(ppu_thread, cia)), x86::edx); c.mov(x86::rax, x86::qword_ptr(x86::rax, x86::rdx, 1, 0)); // Load call target c.mov(x86::rdx, x86::rax); @@ -5137,7 +5161,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_s code_size_until_jump = buf_end - buf_start; // Load offset value - c.mov(cia_addr_reg, static_cast(::offset32(&ppu_thread::cia))); + c.mov(cia_addr_reg, static_cast(OFFSET_OF(ppu_thread, cia))); // Update CIA c.str(pc.w(), arm::Mem(ppu_t_base, cia_addr_reg)); diff --git a/rpcs3/Emu/Cell/PPUThread.h b/rpcs3/Emu/Cell/PPUThread.h index 83c30a7b1..341a89dd9 100644 --- a/rpcs3/Emu/Cell/PPUThread.h +++ b/rpcs3/Emu/Cell/PPUThread.h @@ -3,6 +3,7 @@ #include "../CPU/CPUThread.h" #include "../CPU/Hypervisor.h" #include "../Memory/vm_ptr.h" +#include "rx/cpu/cell/ppu/PPUContext.hpp" #include "util/lockless.h" #include "util/BitField.h" @@ -134,7 +135,7 @@ enum class ppu_debugger_mode : u32 max_mode, }; -class ppu_thread : public cpu_thread +class ppu_thread : public cpu_thread, public PPUContext { public: static const u32 id_base = 0x01000000; // TODO (used to determine thread type) @@ -165,107 +166,6 @@ public: using 
cpu_thread::operator=; - u64 gpr[32] = {}; // General-Purpose Registers - f64 fpr[32] = {}; // Floating Point Registers - v128 vr[32] = {}; // Vector Registers - - union alignas(16) cr_bits - { - u8 bits[32]; - u32 fields[8]; - - u8& operator[](usz i) - { - return bits[i]; - } - - // Pack CR bits - u32 pack() const - { - u32 result{}; - - for (u32 bit : bits) - { - result <<= 1; - result |= bit; - } - - return result; - } - - // Unpack CR bits - void unpack(u32 value) - { - for (u8& b : bits) - { - b = !!(value & (1u << 31)); - value <<= 1; - } - } - }; - - cr_bits cr{}; // Condition Registers (unpacked) - - // Floating-Point Status and Control Register (unpacked) - union - { - struct - { - // TODO - bool _start[16]; - bool fl; // FPCC.FL - bool fg; // FPCC.FG - bool fe; // FPCC.FE - bool fu; // FPCC.FU - bool _end[12]; - }; - - u32 fields[8]; - cr_bits bits; - } fpscr{}; - - u64 lr{}; // Link Register - u64 ctr{}; // Counter Register - u32 vrsave{0xffffffff}; // VR Save Register - u32 cia{}; // Current Instruction Address - - // Fixed-Point Exception Register (abstract representation) - struct - { - ENABLE_BITWISE_SERIALIZATION; - - bool so{}; // Summary Overflow - bool ov{}; // Overflow - bool ca{}; // Carry - u8 cnt{}; // 0..6 - } xer; - - /* - Non-Java. A mode control bit that determines whether vector floating-point operations will be performed - in a Java-IEEE-C9X-compliant mode or a possibly faster non-Java/non-IEEE mode. - 0 The Java-IEEE-C9X-compliant mode is selected. Denormalized values are handled as specified - by Java, IEEE, and C9X standard. - 1 The non-Java/non-IEEE-compliant mode is selected. If an element in a source vector register - contains a denormalized value, the value '0' is used instead. If an instruction causes an underflow - exception, the corresponding element in the target vr is cleared to '0'. In both cases, the '0' - has the same sign as the denormalized or underflowing value. 
- */ - bool nj = true; - - // Sticky saturation bit - v128 sat{}; - - // Optimization: precomputed java-mode mask for handling denormals - u32 jm_mask = 0x7f80'0000; - - u32 raddr{0}; // Reservation addr - u64 rtime{0}; - alignas(64) std::byte rdata[128]{}; // Reservation data - bool use_full_rdata{}; - u32 res_cached{0}; // Reservation "cached" addresss - u32 res_notify{0}; - u64 res_notify_time{0}; - union ppu_prio_t { u64 all; diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp index 46d28c14e..819498292 100644 --- a/rpcs3/Emu/Cell/PPUTranslator.cpp +++ b/rpcs3/Emu/Cell/PPUTranslator.cpp @@ -60,7 +60,7 @@ PPUTranslator::PPUTranslator(LLVMContext& context, Module* _module, const ppu_mo { .debug_info = false, // Set to "true" to insert debug frames on x27 .use_stack_frames = false, // We don't need this since the PPU GW allocates global scratch on the stack - .hypervisor_context_offset = ::offset32(&ppu_thread::hv_ctx), + .hypervisor_context_offset = OFFSET_OF(ppu_thread, hv_ctx), .exclusion_callback = {}, // Unused, we don't have special exclusion functions on PPU .base_register_lookup = base_reg_lookup, .faux_function_list = std::move(faux_functions_list)}; @@ -76,8 +76,8 @@ PPUTranslator::PPUTranslator(LLVMContext& context, Module* _module, const ppu_mo reset_transforms(); // Thread context struct (TODO: safer member access) - const u32 off0 = offset32(&ppu_thread::state); - const u32 off1 = offset32(&ppu_thread::gpr); + const u32 off0 = OFFSET_OF(ppu_thread, state); + const u32 off1 = OFFSET_OF(ppu_thread, gpr); std::vector thread_struct; thread_struct.emplace_back(ArrayType::get(GetType(), off0)); thread_struct.emplace_back(GetType()); // state diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp index 94397eb76..b57a07b53 100644 --- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp @@ -16,11 +16,11 @@ #include #include -#define SPU_OFF_128(x, ...) asmjit::x86::oword_ptr(*cpu, offset32(&spu_thread::x, ##__VA_ARGS__)) -#define SPU_OFF_64(x, ...) asmjit::x86::qword_ptr(*cpu, offset32(&spu_thread::x, ##__VA_ARGS__)) -#define SPU_OFF_32(x, ...) asmjit::x86::dword_ptr(*cpu, offset32(&spu_thread::x, ##__VA_ARGS__)) -#define SPU_OFF_16(x, ...) asmjit::x86::word_ptr(*cpu, offset32(&spu_thread::x, ##__VA_ARGS__)) -#define SPU_OFF_8(x, ...) 
asmjit::x86::byte_ptr(*cpu, offset32(&spu_thread::x, ##__VA_ARGS__)) +#define SPU_OFF_128(x) asmjit::x86::oword_ptr(*cpu, OFFSET_OF(spu_thread, x)) +#define SPU_OFF_64(x) asmjit::x86::qword_ptr(*cpu, OFFSET_OF(spu_thread, x)) +#define SPU_OFF_32(x) asmjit::x86::dword_ptr(*cpu, OFFSET_OF(spu_thread, x)) +#define SPU_OFF_16(x) asmjit::x86::word_ptr(*cpu, OFFSET_OF(spu_thread, x)) +#define SPU_OFF_8(x) asmjit::x86::byte_ptr(*cpu, OFFSET_OF(spu_thread, x)) const spu_decoder s_spu_decoder; @@ -945,9 +945,9 @@ spu_recompiler::XmmLink spu_recompiler::XmmGet(s8 reg, XmmType type) // get xmm switch (type) { - case XmmType::Int: c->movdqa(result, SPU_OFF_128(gpr, reg)); break; - case XmmType::Float: c->movaps(result, SPU_OFF_128(gpr, reg)); break; - case XmmType::Double: c->movapd(result, SPU_OFF_128(gpr, reg)); break; + case XmmType::Int: c->movdqa(result, SPU_OFF_128(gpr[reg])); break; + case XmmType::Float: c->movaps(result, SPU_OFF_128(gpr[reg])); break; + case XmmType::Double: c->movapd(result, SPU_OFF_128(gpr[reg])); break; default: fmt::throw_exception("Invalid XmmType"); } @@ -1117,9 +1117,9 @@ void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt, bool ret) { // Get stack pointer, try to use native return address (check SPU return address) Label fail = c->newLabel(); - c->mov(qw1->r32(), SPU_OFF_32(gpr, 1, &v128::_u32, 3)); + c->mov(qw1->r32(), SPU_OFF_32(gpr[1]._u32[3])); c->and_(qw1->r32(), 0x3fff0); - c->lea(*qw1, x86::qword_ptr(*cpu, *qw1, 0, ::offset32(&spu_thread::stack_mirror))); + c->lea(*qw1, x86::qword_ptr(*cpu, *qw1, 0, OFFSET_OF(spu_thread, stack_mirror))); c->cmp(x86::dword_ptr(*qw1, 8), *addr); c->jne(fail); c->mov(pc0->r32(), x86::dword_ptr(*qw1, 12)); @@ -1179,9 +1179,9 @@ void spu_recompiler::branch_set_link(u32 target) Label ret = c->newLabel(); // Get stack pointer, write native and SPU return addresses into the stack mirror - c->mov(qw1->r32(), SPU_OFF_32(gpr, 1, &v128::_u32, 3)); + c->mov(qw1->r32(), SPU_OFF_32(gpr[1]._u32[3])); c->and_(qw1->r32(), 0x3fff0); - c->lea(*qw1, x86::qword_ptr(*cpu, *qw1, 0, ::offset32(&spu_thread::stack_mirror))); + c->lea(*qw1, x86::qword_ptr(*cpu, *qw1, 0, OFFSET_OF(spu_thread, stack_mirror))); c->lea(x86::r10, x86::qword_ptr(ret)); c->mov(x86::qword_ptr(*qw1, 0), x86::r10); c->lea(x86::r10, get_pc(target)); @@ -1194,10 +1194,10 @@ void spu_recompiler::branch_set_link(u32 target) // Clear return info after use c->align(AlignMode::kCode, 16); c->bind(ret); - c->mov(qw1->r32(), SPU_OFF_32(gpr, 1, &v128::_u32, 3)); + c->mov(qw1->r32(), SPU_OFF_32(gpr[1]._u32[3])); c->and_(qw1->r32(), 0x3fff0); c->pcmpeqd(x86::xmm0, x86::xmm0); - c->movdqa(x86::dqword_ptr(*cpu, *qw1, 0, ::offset32(&spu_thread::stack_mirror)), x86::xmm0); + c->movdqa(x86::dqword_ptr(*cpu, *qw1, 0, OFFSET_OF(spu_thread, stack_mirror)), x86::xmm0); // Set block hash for profiling (if enabled) if (g_cfg.core.spu_prof) @@ -1319,7 +1319,7 @@ void spu_recompiler::MFSPR(spu_opcode_t op) // Check SPUInterpreter for notes. 
const XmmLink& vr = XmmAlloc(); c->pxor(vr, vr); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); } static u32 spu_rdch(spu_thread* _spu, u32 ch) @@ -1383,7 +1383,7 @@ void spu_recompiler::RDCH(spu_opcode_t op) c->bind(ret); c->movd(x86::xmm0, *addr); c->pslldq(x86::xmm0, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), x86::xmm0); + c->movdqa(SPU_OFF_128(gpr[op.rt]), x86::xmm0); }; switch (op.ra) @@ -1393,7 +1393,7 @@ void spu_recompiler::RDCH(spu_opcode_t op) const XmmLink& vr = XmmAlloc(); c->movd(vr, SPU_OFF_32(srr0)); c->pslldq(vr, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); return; } case SPU_RdInMbox: @@ -1411,7 +1411,7 @@ void spu_recompiler::RDCH(spu_opcode_t op) const XmmLink& vr = XmmAlloc(); c->movd(vr, SPU_OFF_32(ch_tag_mask)); c->pslldq(vr, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); return; } case SPU_RdSigNotify1: @@ -1467,7 +1467,7 @@ void spu_recompiler::RDCH(spu_opcode_t op) c->lea(addr->r64(), get_pc(m_pos)); c->and_(*addr, 0x3fffc); c->mov(SPU_OFF_32(pc), *addr); - c->lea(*arg1, SPU_OFF_128(gpr, op.rt)); + c->lea(*arg1, SPU_OFF_128(gpr[op.rt])); c->mov(*arg0, *cpu); c->call(g_cfg.core.spu_loop_detection ? +sub1 : +sub2); return; @@ -1478,7 +1478,7 @@ void spu_recompiler::RDCH(spu_opcode_t op) c->movq(vr, SPU_OFF_64(ch_events)); c->psrldq(vr, 4); c->pslldq(vr, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); return; } case SPU_RdEventStat: @@ -1495,7 +1495,7 @@ void spu_recompiler::RDCH(spu_opcode_t op) c->or_(addr->r32(), arg1->r32()); c->movd(vr, *addr); c->pslldq(vr, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); return; } default: break; @@ -1509,7 +1509,7 @@ void spu_recompiler::RDCH(spu_opcode_t op) c->call(spu_rdch); c->movd(x86::xmm0, *addr); c->pslldq(x86::xmm0, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), x86::xmm0); + c->movdqa(SPU_OFF_128(gpr[op.rt]), x86::xmm0); } static u32 spu_rchcnt(spu_thread* _spu, u32 ch) @@ -1530,7 +1530,7 @@ void spu_recompiler::RCHCNT(spu_opcode_t op) if (inv) c->pxor(vr, XmmConst(v128::from32p(1))); c->pslldq(vr, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); }; switch (op.ra) @@ -1549,7 +1549,7 @@ void spu_recompiler::RCHCNT(spu_opcode_t op) c->mov(addr->r32(), 1); c->movd(vr, addr->r32()); c->pslldq(vr, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); return; } @@ -1561,7 +1561,7 @@ void spu_recompiler::RCHCNT(spu_opcode_t op) c->movd(v1, SPU_OFF_32(mfc_size)); c->psubd(vr, v1); c->pslldq(vr, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); return; } @@ -1571,7 +1571,7 @@ void spu_recompiler::RCHCNT(spu_opcode_t op) c->movdqa(vr, SPU_OFF_128(ch_in_mbox)); c->pslldq(vr, 14); c->psrldq(vr, 3); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); return; } // Channels with a constant count of 1: @@ -1599,7 +1599,7 @@ void spu_recompiler::RCHCNT(spu_opcode_t op) c->mov(addr->r32(), 1); c->movd(vr, addr->r32()); c->pslldq(vr, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); return; } case SPU_RdEventStat: @@ -1622,22 +1622,22 @@ void spu_recompiler::RCHCNT(spu_opcode_t op) // Use result from the third argument c->movd(x86::xmm0, *addr); c->pslldq(x86::xmm0, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), x86::xmm0); + c->movdqa(SPU_OFF_128(gpr[op.rt]), 
x86::xmm0); } void spu_recompiler::SF(spu_opcode_t op) { // sub from const XmmLink& vb = XmmGet(op.rb, XmmType::Int); - c->psubd(vb, SPU_OFF_128(gpr, op.ra)); - c->movdqa(SPU_OFF_128(gpr, op.rt), vb); + c->psubd(vb, SPU_OFF_128(gpr[op.ra])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vb); } void spu_recompiler::OR(spu_opcode_t op) { const XmmLink& vb = XmmGet(op.rb, XmmType::Int); - c->por(vb, SPU_OFF_128(gpr, op.ra)); - c->movdqa(SPU_OFF_128(gpr, op.rt), vb); + c->por(vb, SPU_OFF_128(gpr[op.ra])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vb); } void spu_recompiler::BG(spu_opcode_t op) @@ -1652,24 +1652,24 @@ void spu_recompiler::BG(spu_opcode_t op) c->vpsubd(vi, vb, va); c->vpternlogd(va, vb, vi, 0x4d /* B?nandAC:norAC */); c->psrld(va, 31); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); return; } c->movdqa(vi, XmmConst(v128::from32p(0x80000000))); c->pxor(va, vi); - c->pxor(vi, SPU_OFF_128(gpr, op.rb)); + c->pxor(vi, SPU_OFF_128(gpr[op.rb])); c->pcmpgtd(va, vi); c->paddd(va, XmmConst(v128::from32p(1))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::SFH(spu_opcode_t op) { // sub from (halfword) const XmmLink& vb = XmmGet(op.rb, XmmType::Int); - c->psubw(vb, SPU_OFF_128(gpr, op.ra)); - c->movdqa(SPU_OFF_128(gpr, op.rt), vb); + c->psubw(vb, SPU_OFF_128(gpr[op.ra])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vb); } void spu_recompiler::NOR(spu_opcode_t op) @@ -1678,14 +1678,14 @@ void spu_recompiler::NOR(spu_opcode_t op) if (utils::has_avx512()) { - c->vpternlogd(va, va, SPU_OFF_128(gpr, op.rb), 0x11 /* norCB */); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->vpternlogd(va, va, SPU_OFF_128(gpr[op.rb]), 0x11 /* norCB */); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); return; } - c->por(va, SPU_OFF_128(gpr, op.rb)); + c->por(va, SPU_OFF_128(gpr[op.rb])); c->pxor(va, XmmConst(v128::from32p(0xffffffff))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ABSDB(spu_opcode_t op) @@ -1697,7 +1697,7 @@ void spu_recompiler::ABSDB(spu_opcode_t op) c->pmaxub(va, vb); c->pminub(vb, vm); c->psubb(va, vb); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ROT(spu_opcode_t op) @@ -1708,7 +1708,7 @@ void spu_recompiler::ROT(spu_opcode_t op) const XmmLink& vb = XmmGet(op.rb, XmmType::Int); const XmmLink& vt = XmmAlloc(); c->vprolvd(vt, va, vb); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } @@ -1725,7 +1725,7 @@ void spu_recompiler::ROT(spu_opcode_t op) c->pandn(vb, v4); c->vpsrlvd(va, va, vb); c->por(vt, va); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } @@ -1735,16 +1735,16 @@ void spu_recompiler::ROT(spu_opcode_t op) const XmmLink& vb = XmmGet(op.rb, XmmType::Int); const XmmLink& vt = XmmAlloc(); c->vprotd(vt, va, vb); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } for (u32 i = 0; i < 4; i++) // unrolled loop { - c->mov(qw0->r32(), SPU_OFF_32(gpr, op.ra, &v128::_u32, i)); - c->mov(asmjit::x86::ecx, SPU_OFF_32(gpr, op.rb, &v128::_u32, i)); + c->mov(qw0->r32(), SPU_OFF_32(gpr[op.ra]._u32[i])); + c->mov(asmjit::x86::ecx, SPU_OFF_32(gpr[op.rb]._u32[i])); c->rol(qw0->r32(), asmjit::x86::cl); - c->mov(SPU_OFF_32(gpr, op.rt, &v128::_u32, i), qw0->r32()); + c->mov(SPU_OFF_32(gpr[op.rt]._u32[i]), qw0->r32()); } } @@ -1758,7 +1758,7 @@ void spu_recompiler::ROTM(spu_opcode_t op) 
c->psubd(vb, XmmConst(v128::from32p(1))); c->pandn(vb, XmmConst(v128::from32p(0x3f))); c->vpsrlvd(vt, va, vb); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } @@ -1774,17 +1774,17 @@ void spu_recompiler::ROTM(spu_opcode_t op) c->pcmpgtd(vb, XmmConst(v128::from32p(31))); c->vpshld(vt, va, vt); c->vpandn(vt, vb, vt); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } for (u32 i = 0; i < 4; i++) // unrolled loop { - c->mov(qw0->r32(), SPU_OFF_32(gpr, op.ra, &v128::_u32, i)); - c->mov(asmjit::x86::ecx, SPU_OFF_32(gpr, op.rb, &v128::_u32, i)); + c->mov(qw0->r32(), SPU_OFF_32(gpr[op.ra]._u32[i])); + c->mov(asmjit::x86::ecx, SPU_OFF_32(gpr[op.rb]._u32[i])); c->neg(asmjit::x86::ecx); c->shr(*qw0, asmjit::x86::cl); - c->mov(SPU_OFF_32(gpr, op.rt, &v128::_u32, i), qw0->r32()); + c->mov(SPU_OFF_32(gpr[op.rt]._u32[i]), qw0->r32()); } } @@ -1798,7 +1798,7 @@ void spu_recompiler::ROTMA(spu_opcode_t op) c->psubd(vb, XmmConst(v128::from32p(1))); c->pandn(vb, XmmConst(v128::from32p(0x3f))); c->vpsravd(vt, va, vb); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } @@ -1813,17 +1813,17 @@ void spu_recompiler::ROTMA(spu_opcode_t op) c->pminud(vb, XmmConst(v128::from32p(31))); c->psubd(vt, vb); c->vpshad(vt, va, vt); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } for (u32 i = 0; i < 4; i++) // unrolled loop { - c->movsxd(*qw0, SPU_OFF_32(gpr, op.ra, &v128::_u32, i)); - c->mov(asmjit::x86::ecx, SPU_OFF_32(gpr, op.rb, &v128::_u32, i)); + c->movsxd(*qw0, SPU_OFF_32(gpr[op.ra]._u32[i])); + c->mov(asmjit::x86::ecx, SPU_OFF_32(gpr[op.rb]._u32[i])); c->neg(asmjit::x86::ecx); c->sar(*qw0, asmjit::x86::cl); - c->mov(SPU_OFF_32(gpr, op.rt, &v128::_u32, i), qw0->r32()); + c->mov(SPU_OFF_32(gpr[op.rt]._u32[i]), qw0->r32()); } } @@ -1836,7 +1836,7 @@ void spu_recompiler::SHL(spu_opcode_t op) const XmmLink& vt = XmmAlloc(); c->pand(vb, XmmConst(v128::from32p(0x3f))); c->vpsllvd(vt, va, vb); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } @@ -1849,16 +1849,16 @@ void spu_recompiler::SHL(spu_opcode_t op) c->vpcmpgtd(vt, vb, XmmConst(v128::from32p(31))); c->vpshld(vb, va, vb); c->pandn(vt, vb); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } for (u32 i = 0; i < 4; i++) // unrolled loop { - c->mov(qw0->r32(), SPU_OFF_32(gpr, op.ra, &v128::_u32, i)); - c->mov(asmjit::x86::ecx, SPU_OFF_32(gpr, op.rb, &v128::_u32, i)); + c->mov(qw0->r32(), SPU_OFF_32(gpr[op.ra]._u32[i])); + c->mov(asmjit::x86::ecx, SPU_OFF_32(gpr[op.rb]._u32[i])); c->shl(*qw0, asmjit::x86::cl); - c->mov(SPU_OFF_32(gpr, op.rt, &v128::_u32, i), qw0->r32()); + c->mov(SPU_OFF_32(gpr[op.rt]._u32[i]), qw0->r32()); } } @@ -1878,7 +1878,7 @@ void spu_recompiler::ROTH(spu_opcode_t op) // nf c->vprolvd(va, va, v4); c->vprolvd(vb, vt, vb); c->vpblendw(vt, vb, va, 0xaa); - c->vmovdqa(SPU_OFF_128(gpr, op.rt), vt); + c->vmovdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } @@ -1888,16 +1888,16 @@ void spu_recompiler::ROTH(spu_opcode_t op) // nf const XmmLink& vb = XmmGet(op.rb, XmmType::Int); const XmmLink& vt = XmmAlloc(); c->vprotw(vt, va, vb); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } for (u32 i = 0; i < 8; i++) // unrolled loop { - c->movzx(qw0->r32(), SPU_OFF_16(gpr, op.ra, &v128::_u16, i)); - c->movzx(asmjit::x86::ecx, SPU_OFF_16(gpr, op.rb, &v128::_u16, i)); 
+ c->movzx(qw0->r32(), SPU_OFF_16(gpr[op.ra]._u16[i])); + c->movzx(asmjit::x86::ecx, SPU_OFF_16(gpr[op.rb]._u16[i])); c->rol(qw0->r16(), asmjit::x86::cl); - c->mov(SPU_OFF_16(gpr, op.rt, &v128::_u16, i), qw0->r16()); + c->mov(SPU_OFF_16(gpr[op.rt]._u16[i]), qw0->r16()); } } @@ -1911,7 +1911,7 @@ void spu_recompiler::ROTHM(spu_opcode_t op) c->psubw(vb, XmmConst(v128::from16p(1))); c->pandn(vb, XmmConst(v128::from16p(0x1f))); c->vpsrlvw(vt, va, vb); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } @@ -1931,7 +1931,7 @@ void spu_recompiler::ROTHM(spu_opcode_t op) c->vpsrlvd(va, va, v4); c->vpsrlvd(vb, vb, v5); c->vpblendw(vt, vb, va, 0xaa); // can use vpblendvb with 0xffff0000 mask (vt) - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } @@ -1947,17 +1947,17 @@ void spu_recompiler::ROTHM(spu_opcode_t op) c->pcmpgtw(vb, XmmConst(v128::from16p(15))); c->vpshlw(vt, va, vt); c->vpandn(vt, vb, vt); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } for (u32 i = 0; i < 8; i++) // unrolled loop { - c->movzx(qw0->r32(), SPU_OFF_16(gpr, op.ra, &v128::_u16, i)); - c->movzx(asmjit::x86::ecx, SPU_OFF_16(gpr, op.rb, &v128::_u16, i)); + c->movzx(qw0->r32(), SPU_OFF_16(gpr[op.ra]._u16[i])); + c->movzx(asmjit::x86::ecx, SPU_OFF_16(gpr[op.rb]._u16[i])); c->neg(asmjit::x86::ecx); c->shr(qw0->r32(), asmjit::x86::cl); - c->mov(SPU_OFF_16(gpr, op.rt, &v128::_u16, i), qw0->r16()); + c->mov(SPU_OFF_16(gpr[op.rt]._u16[i]), qw0->r16()); } } @@ -1971,7 +1971,7 @@ void spu_recompiler::ROTMAH(spu_opcode_t op) c->psubw(vb, XmmConst(v128::from16p(1))); c->pandn(vb, XmmConst(v128::from16p(0x1f))); c->vpsravw(vt, va, vb); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } @@ -1993,7 +1993,7 @@ void spu_recompiler::ROTMAH(spu_opcode_t op) c->vpsravd(va, va, v4); c->vpsravd(vb, vb, v5); c->vpblendw(vt, vb, va, 0xaa); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } @@ -2008,17 +2008,17 @@ void spu_recompiler::ROTMAH(spu_opcode_t op) c->pminuw(vb, XmmConst(v128::from16p(15))); c->psubw(vt, vb); c->vpshaw(vt, va, vt); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } for (u32 i = 0; i < 8; i++) // unrolled loop { - c->movsx(qw0->r32(), SPU_OFF_16(gpr, op.ra, &v128::_u16, i)); - c->movzx(asmjit::x86::ecx, SPU_OFF_16(gpr, op.rb, &v128::_u16, i)); + c->movsx(qw0->r32(), SPU_OFF_16(gpr[op.ra]._u16[i])); + c->movzx(asmjit::x86::ecx, SPU_OFF_16(gpr[op.rb]._u16[i])); c->neg(asmjit::x86::ecx); c->sar(qw0->r32(), asmjit::x86::cl); - c->mov(SPU_OFF_16(gpr, op.rt, &v128::_u16, i), qw0->r16()); + c->mov(SPU_OFF_16(gpr[op.rt]._u16[i]), qw0->r16()); } } @@ -2031,7 +2031,7 @@ void spu_recompiler::SHLH(spu_opcode_t op) const XmmLink& vt = XmmAlloc(); c->pand(vb, XmmConst(v128::from16p(0x1f))); c->vpsllvw(vt, va, vb); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } @@ -2050,7 +2050,7 @@ void spu_recompiler::SHLH(spu_opcode_t op) c->vpsllvd(va, va, v5); c->vpsllvd(vb, vb, v4); c->vpblendw(vt, vb, va, 0x55); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } @@ -2063,16 +2063,16 @@ void spu_recompiler::SHLH(spu_opcode_t op) c->vpcmpgtw(vt, vb, XmmConst(v128::from16p(15))); c->vpshlw(vb, va, vb); c->pandn(vt, vb); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); 
return; } for (u32 i = 0; i < 8; i++) // unrolled loop { - c->movzx(qw0->r32(), SPU_OFF_16(gpr, op.ra, &v128::_u16, i)); - c->movzx(asmjit::x86::ecx, SPU_OFF_16(gpr, op.rb, &v128::_u16, i)); + c->movzx(qw0->r32(), SPU_OFF_16(gpr[op.ra]._u16[i])); + c->movzx(asmjit::x86::ecx, SPU_OFF_16(gpr[op.rb]._u16[i])); c->shl(qw0->r32(), asmjit::x86::cl); - c->mov(SPU_OFF_16(gpr, op.rt, &v128::_u16, i), qw0->r16()); + c->mov(SPU_OFF_16(gpr[op.rt]._u16[i]), qw0->r16()); } } @@ -2085,7 +2085,7 @@ void spu_recompiler::ROTI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->vprold(va, va, s); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); return; } @@ -2093,7 +2093,7 @@ void spu_recompiler::ROTI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->vprotd(va, va, s); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); return; } @@ -2103,7 +2103,7 @@ void spu_recompiler::ROTI(spu_opcode_t op) c->pslld(va, s); c->psrld(v1, 32 - s); c->por(va, v1); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ROTMI(spu_opcode_t op) @@ -2112,7 +2112,7 @@ void spu_recompiler::ROTMI(spu_opcode_t op) const int s = (0 - op.i7) & 0x3f; const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->psrld(va, s); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ROTMAI(spu_opcode_t op) @@ -2121,7 +2121,7 @@ void spu_recompiler::ROTMAI(spu_opcode_t op) const int s = (0 - op.i7) & 0x3f; const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->psrad(va, s); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::SHLI(spu_opcode_t op) @@ -2130,7 +2130,7 @@ void spu_recompiler::SHLI(spu_opcode_t op) const int s = op.i7 & 0x3f; const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pslld(va, s); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ROTHI(spu_opcode_t op) @@ -2143,7 +2143,7 @@ void spu_recompiler::ROTHI(spu_opcode_t op) c->psllw(va, s); c->psrlw(v1, 16 - s); c->por(va, v1); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ROTHMI(spu_opcode_t op) @@ -2152,7 +2152,7 @@ void spu_recompiler::ROTHMI(spu_opcode_t op) const int s = (0 - op.i7) & 0x1f; const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->psrlw(va, s); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ROTMAHI(spu_opcode_t op) @@ -2161,7 +2161,7 @@ void spu_recompiler::ROTMAHI(spu_opcode_t op) const int s = (0 - op.i7) & 0x1f; const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->psraw(va, s); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::SHLHI(spu_opcode_t op) @@ -2170,22 +2170,22 @@ void spu_recompiler::SHLHI(spu_opcode_t op) const int s = op.i7 & 0x1f; const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->psllw(va, s); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::A(spu_opcode_t op) { const XmmLink& vb = XmmGet(op.rb, XmmType::Int); - c->paddd(vb, SPU_OFF_128(gpr, op.ra)); - c->movdqa(SPU_OFF_128(gpr, op.rt), vb); + c->paddd(vb, SPU_OFF_128(gpr[op.ra])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vb); } void spu_recompiler::AND(spu_opcode_t op) { // and const XmmLink& vb = XmmGet(op.rb, XmmType::Int); - c->pand(vb, SPU_OFF_128(gpr, 
op.ra)); - c->movdqa(SPU_OFF_128(gpr, op.rt), vb); + c->pand(vb, SPU_OFF_128(gpr[op.ra])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vb); } void spu_recompiler::CG(spu_opcode_t op) @@ -2199,7 +2199,7 @@ void spu_recompiler::CG(spu_opcode_t op) c->vpaddd(vi, vb, va); c->vpternlogd(vi, va, vb, 0x8e /* A?andBC:orBC */); c->psrld(vi, 31); - c->movdqa(SPU_OFF_128(gpr, op.rt), vi); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vi); return; } @@ -2209,14 +2209,14 @@ void spu_recompiler::CG(spu_opcode_t op) c->pxor(vb, vi); c->pcmpgtd(va, vb); c->psrld(va, 31); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::AH(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->paddw(va, SPU_OFF_128(gpr, op.rb)); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->paddw(va, SPU_OFF_128(gpr[op.rb])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::NAND(spu_opcode_t op) @@ -2226,21 +2226,21 @@ void spu_recompiler::NAND(spu_opcode_t op) if (utils::has_avx512()) { - c->vpternlogd(va, va, SPU_OFF_128(gpr, op.rb), 0x77 /* nandCB */); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->vpternlogd(va, va, SPU_OFF_128(gpr[op.rb]), 0x77 /* nandCB */); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); return; } - c->pand(va, SPU_OFF_128(gpr, op.rb)); + c->pand(va, SPU_OFF_128(gpr[op.rb])); c->pxor(va, XmmConst(v128::from32p(0xffffffff))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::AVGB(spu_opcode_t op) { const XmmLink& vb = XmmGet(op.rb, XmmType::Int); - c->pavgb(vb, SPU_OFF_128(gpr, op.ra)); - c->movdqa(SPU_OFF_128(gpr, op.rt), vb); + c->pavgb(vb, SPU_OFF_128(gpr[op.ra])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vb); } void spu_recompiler::MTSPR(spu_opcode_t) @@ -2284,7 +2284,7 @@ void spu_recompiler::WRCH(spu_opcode_t op) { case SPU_WrSRR0: { - c->mov(*addr, SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.rt]._u32[3])); c->and_(*addr, 0x3fffc); c->mov(SPU_OFF_32(srr0), *addr); return; @@ -2299,7 +2299,7 @@ void spu_recompiler::WRCH(spu_opcode_t op) Label wait = c->newLabel(); Label again = c->newLabel(); Label ret = c->newLabel(); - c->mov(qw0->r32(), SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); + c->mov(qw0->r32(), SPU_OFF_32(gpr[op.rt]._u32[3])); c->mov(addr->r64(), SPU_OFF_64(ch_out_mbox)); c->align(AlignMode::kCode, 16); c->bind(again); @@ -2329,7 +2329,7 @@ void spu_recompiler::WRCH(spu_opcode_t op) { Label upd = c->newLabel(); Label ret = c->newLabel(); - c->mov(qw0->r32(), SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); + c->mov(qw0->r32(), SPU_OFF_32(gpr[op.rt]._u32[3])); c->mov(SPU_OFF_32(ch_tag_mask), qw0->r32()); c->cmp(SPU_OFF_32(ch_tag_upd), MFC_TAG_UPDATE_IMMEDIATE); c->jnz(upd); @@ -2354,7 +2354,7 @@ void spu_recompiler::WRCH(spu_opcode_t op) Label fail = c->newLabel(); Label zero = c->newLabel(); Label ret = c->newLabel(); - c->mov(qw0->r32(), SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); + c->mov(qw0->r32(), SPU_OFF_32(gpr[op.rt]._u32[3])); c->cmp(qw0->r32(), 2); c->ja(fail); @@ -2394,40 +2394,40 @@ void spu_recompiler::WRCH(spu_opcode_t op) } case MFC_LSA: { - c->mov(*addr, SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); - c->mov(SPU_OFF_32(ch_mfc_cmd, &spu_mfc_cmd::lsa), *addr); + c->mov(*addr, SPU_OFF_32(gpr[op.rt]._u32[3])); + c->mov(SPU_OFF_32(ch_mfc_cmd.lsa), *addr); return; } case MFC_EAH: { - c->mov(*addr, SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); - c->mov(SPU_OFF_32(ch_mfc_cmd, &spu_mfc_cmd::eah), *addr); + c->mov(*addr, SPU_OFF_32(gpr[op.rt]._u32[3])); + 
c->mov(SPU_OFF_32(ch_mfc_cmd.eah), *addr); return; } case MFC_EAL: { - c->mov(*addr, SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); - c->mov(SPU_OFF_32(ch_mfc_cmd, &spu_mfc_cmd::eal), *addr); + c->mov(*addr, SPU_OFF_32(gpr[op.rt]._u32[3])); + c->mov(SPU_OFF_32(ch_mfc_cmd.eal), *addr); return; } case MFC_Size: { - c->mov(*addr, SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.rt]._u32[3])); c->and_(*addr, 0x7fff); - c->mov(SPU_OFF_16(ch_mfc_cmd, &spu_mfc_cmd::size), addr->r16()); + c->mov(SPU_OFF_16(ch_mfc_cmd.size), addr->r16()); return; } case MFC_TagID: { - c->mov(*addr, SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.rt]._u32[3])); c->and_(*addr, 0x1f); - c->mov(SPU_OFF_8(ch_mfc_cmd, &spu_mfc_cmd::tag), addr->r8()); + c->mov(SPU_OFF_8(ch_mfc_cmd.tag), addr->r8()); return; } case MFC_Cmd: { - c->mov(*addr, SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); - c->mov(SPU_OFF_8(ch_mfc_cmd, &spu_mfc_cmd::cmd), addr->r8()); + c->mov(*addr, SPU_OFF_32(gpr[op.rt]._u32[3])); + c->mov(SPU_OFF_8(ch_mfc_cmd.cmd), addr->r8()); c->lea(addr->r64(), get_pc(m_pos)); c->and_(*addr, 0x3fffc); c->mov(SPU_OFF_32(pc), *addr); @@ -2452,7 +2452,7 @@ void spu_recompiler::WRCH(spu_opcode_t op) }; Label ret = c->newLabel(); - c->mov(arg1->r32(), SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); + c->mov(arg1->r32(), SPU_OFF_32(gpr[op.rt]._u32[3])); c->and_(arg1->r32(), 0x1f); c->btr(SPU_OFF_32(ch_stall_mask), arg1->r32()); c->jnc(ret); @@ -2471,7 +2471,7 @@ void spu_recompiler::WRCH(spu_opcode_t op) c->mov(*arg0, *cpu); c->call(+sub); - c->mov(qw0->r32(), SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); + c->mov(qw0->r32(), SPU_OFF_32(gpr[op.rt]._u32[3])); c->mov(SPU_OFF_32(ch_dec_value), qw0->r32()); c->mov(SPU_OFF_8(is_dec_frozen), 0); return; @@ -2499,7 +2499,7 @@ void spu_recompiler::WRCH(spu_opcode_t op) c->and_(*addr, 0x3fffc); c->mov(SPU_OFF_32(pc), *addr); c->mov(arg1->r32(), +op.ra); - c->mov(qw0->r32(), SPU_OFF_32(gpr, op.rt, &v128::_u32, 3)); + c->mov(qw0->r32(), SPU_OFF_32(gpr[op.rt]._u32[3])); c->mov(*arg0, *cpu); c->call(spu_wrch); } @@ -2507,14 +2507,14 @@ void spu_recompiler::WRCH(spu_opcode_t op) void spu_recompiler::BIZ(spu_opcode_t op) { asmjit::Label branch_label = c->newLabel(); - c->cmp(SPU_OFF_32(gpr, op.rt, &v128::_u32, 3), 0); + c->cmp(SPU_OFF_32(gpr[op.rt]._u32[3]), 0); c->je(branch_label); after.emplace_back([=, this, jt = m_targets[m_pos].size() > 1] { c->align(asmjit::AlignMode::kCode, 16); c->bind(branch_label); - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); c->and_(*addr, 0x3fffc); branch_indirect(op, jt); }); @@ -2523,14 +2523,14 @@ void spu_recompiler::BIZ(spu_opcode_t op) void spu_recompiler::BINZ(spu_opcode_t op) { asmjit::Label branch_label = c->newLabel(); - c->cmp(SPU_OFF_32(gpr, op.rt, &v128::_u32, 3), 0); + c->cmp(SPU_OFF_32(gpr[op.rt]._u32[3]), 0); c->jne(branch_label); after.emplace_back([=, this, jt = m_targets[m_pos].size() > 1] { c->align(asmjit::AlignMode::kCode, 16); c->bind(branch_label); - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); c->and_(*addr, 0x3fffc); branch_indirect(op, jt); }); @@ -2539,14 +2539,14 @@ void spu_recompiler::BINZ(spu_opcode_t op) void spu_recompiler::BIHZ(spu_opcode_t op) { asmjit::Label branch_label = c->newLabel(); - c->cmp(SPU_OFF_16(gpr, op.rt, &v128::_u16, 6), 0); + c->cmp(SPU_OFF_16(gpr[op.rt]._u16[6]), 0); c->je(branch_label); after.emplace_back([=, this, jt = m_targets[m_pos].size() > 1] { 
c->align(asmjit::AlignMode::kCode, 16); c->bind(branch_label); - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); c->and_(*addr, 0x3fffc); branch_indirect(op, jt); }); @@ -2555,14 +2555,14 @@ void spu_recompiler::BIHZ(spu_opcode_t op) void spu_recompiler::BIHNZ(spu_opcode_t op) { asmjit::Label branch_label = c->newLabel(); - c->cmp(SPU_OFF_16(gpr, op.rt, &v128::_u16, 6), 0); + c->cmp(SPU_OFF_16(gpr[op.rt]._u16[6]), 0); c->jne(branch_label); after.emplace_back([=, this, jt = m_targets[m_pos].size() > 1] { c->align(asmjit::AlignMode::kCode, 16); c->bind(branch_label); - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); c->and_(*addr, 0x3fffc); branch_indirect(op, jt); }); @@ -2575,8 +2575,8 @@ void spu_recompiler::STOPD(spu_opcode_t) void spu_recompiler::STQX(spu_opcode_t op) { - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); - c->add(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); + c->add(*addr, SPU_OFF_32(gpr[op.rb]._u32[3])); c->and_(*addr, 0x3fff0); if (utils::has_ssse3()) @@ -2587,8 +2587,8 @@ void spu_recompiler::STQX(spu_opcode_t op) } else { - c->mov(*qw0, SPU_OFF_64(gpr, op.rt, &v128::_u64, 0)); - c->mov(*qw1, SPU_OFF_64(gpr, op.rt, &v128::_u64, 1)); + c->mov(*qw0, SPU_OFF_64(gpr[op.rt]._u64[0])); + c->mov(*qw1, SPU_OFF_64(gpr[op.rt]._u64[1])); c->bswap(*qw0); c->bswap(*qw1); c->mov(asmjit::x86::qword_ptr(*ls, addr->r64(), 0, 0), *qw1); @@ -2612,7 +2612,7 @@ void spu_recompiler::BI(spu_opcode_t op) return; } - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); c->and_(*addr, 0x3fffc); branch_indirect(op, is_jt, !is_jt); m_pos = -1; @@ -2620,14 +2620,14 @@ void spu_recompiler::BI(spu_opcode_t op) void spu_recompiler::BISL(spu_opcode_t op) { - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); c->and_(*addr, 0x3fffc); const XmmLink& vr = XmmAlloc(); c->lea(*qw0, get_pc(m_pos + 4)); c->and_(qw0->r32(), 0x3fffc); c->movd(vr, qw0->r32()); c->pslldq(vr, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); branch_set_link(m_pos + 4); branch_indirect(op, true, false); m_pos = -1; @@ -2647,14 +2647,14 @@ void spu_recompiler::BISLED(spu_opcode_t op) return _spu->get_events(_spu->ch_events.load().mask).count; }; - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); const XmmLink& vr = XmmAlloc(); c->lea(*qw0, get_pc(m_pos + 4)); c->movd(vr, qw0->r32()); c->pand(vr, XmmConst(v128::from32p(0x3fffc))); c->pslldq(vr, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); asmjit::Label branch_label = c->newLabel(); c->mov(*arg0, *cpu); @@ -2682,7 +2682,7 @@ void spu_recompiler::GB(spu_opcode_t op) c->movmskps(*addr, va); c->pxor(va, va); c->pinsrw(va, *addr, 6); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::GBH(spu_opcode_t op) @@ -2693,7 +2693,7 @@ void spu_recompiler::GBH(spu_opcode_t op) c->pmovmskb(*addr, va); c->pxor(va, va); c->pinsrw(va, *addr, 6); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::GBB(spu_opcode_t op) @@ -2703,7 +2703,7 @@ void spu_recompiler::GBB(spu_opcode_t op) c->pmovmskb(*addr, va); c->pxor(va, va); c->pinsrw(va, *addr, 6); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + 
c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::FSM(spu_opcode_t op) @@ -2714,7 +2714,7 @@ void spu_recompiler::FSM(spu_opcode_t op) c->movdqa(vm, XmmConst(v128::from32r(8, 4, 2, 1))); c->pand(va, vm); c->pcmpeqd(va, vm); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::FSMH(spu_opcode_t op) @@ -2726,7 +2726,7 @@ void spu_recompiler::FSMH(spu_opcode_t op) c->movdqa(vm, XmmConst(v128::from64r(0x0080004000200010, 0x0008000400020001))); c->pand(va, vm); c->pcmpeqw(va, vm); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::FSMB(spu_opcode_t op) @@ -2748,7 +2748,7 @@ void spu_recompiler::FSMB(spu_opcode_t op) c->movdqa(vm, XmmConst(v128::from64p(0x8040201008040201))); c->pand(va, vm); c->pcmpeqb(va, vm); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::FREST(spu_opcode_t op) @@ -2791,7 +2791,7 @@ void spu_recompiler::FREST(spu_opcode_t op) c->orps(v_fraction, v_exponent); c->orps(v_sign, v_fraction); - c->movaps(SPU_OFF_128(gpr, op.rt), v_sign); + c->movaps(SPU_OFF_128(gpr[op.rt]), v_sign); } void spu_recompiler::FRSQEST(spu_opcode_t op) @@ -2824,13 +2824,13 @@ void spu_recompiler::FRSQEST(spu_opcode_t op) c->orps(v_fraction, v_exponent); - c->movaps(SPU_OFF_128(gpr, op.rt), v_fraction); + c->movaps(SPU_OFF_128(gpr[op.rt]), v_fraction); } void spu_recompiler::LQX(spu_opcode_t op) { - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); - c->add(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); + c->add(*addr, SPU_OFF_32(gpr[op.rb]._u32[3])); c->and_(*addr, 0x3fff0); if (utils::has_ssse3()) @@ -2838,7 +2838,7 @@ void spu_recompiler::LQX(spu_opcode_t op) const XmmLink& vt = XmmAlloc(); c->movdqa(vt, asmjit::x86::oword_ptr(*ls, addr->r64())); c->pshufb(vt, XmmConst(v128::from32r(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); } else { @@ -2846,8 +2846,8 @@ void spu_recompiler::LQX(spu_opcode_t op) c->mov(*qw1, asmjit::x86::qword_ptr(*ls, addr->r64(), 0, 8)); c->bswap(*qw0); c->bswap(*qw1); - c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw1); - c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw0); + c->mov(SPU_OFF_64(gpr[op.rt]._u64[0]), *qw1); + c->mov(SPU_OFF_64(gpr[op.rt]._u64[1]), *qw0); } } @@ -2860,10 +2860,10 @@ void spu_recompiler::ROTQBYBI(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->mov(*qw0, +g_spu_imm.rldq_pshufb); - c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.rb]._u32[3])); c->and_(*addr, 0xf << 3); c->pshufb(va, asmjit::x86::oword_ptr(*qw0, addr->r64(), 1)); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ROTQMBYBI(spu_opcode_t op) @@ -2875,10 +2875,10 @@ void spu_recompiler::ROTQMBYBI(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->mov(*qw0, +g_spu_imm.srdq_pshufb); - c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.rb]._u32[3])); c->and_(*addr, 0x1f << 3); c->pshufb(va, asmjit::x86::oword_ptr(*qw0, addr->r64(), 1)); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::SHLQBYBI(spu_opcode_t op) @@ -2890,63 +2890,63 @@ void spu_recompiler::SHLQBYBI(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->mov(*qw0, 
+g_spu_imm.sldq_pshufb); - c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.rb]._u32[3])); c->and_(*addr, 0x1f << 3); c->pshufb(va, asmjit::x86::oword_ptr(*qw0, addr->r64(), 1)); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CBX(spu_opcode_t op) { - c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); - c->add(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.rb]._u32[3])); + c->add(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); c->not_(*addr); c->and_(*addr, 0xf); const XmmLink& vr = XmmAlloc(); c->movdqa(vr, XmmConst(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); - c->mov(asmjit::x86::byte_ptr(*cpu, addr->r64(), 0, offset32(&spu_thread::gpr, op.rt)), 0x03); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); + c->mov(asmjit::x86::byte_ptr(*cpu, addr->r64(), 0, OFFSET_OF(spu_thread, gpr[op.rt])), 0x03); } void spu_recompiler::CHX(spu_opcode_t op) { - c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); - c->add(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.rb]._u32[3])); + c->add(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); c->not_(*addr); c->and_(*addr, 0xe); const XmmLink& vr = XmmAlloc(); c->movdqa(vr, XmmConst(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); - c->mov(asmjit::x86::word_ptr(*cpu, addr->r64(), 0, offset32(&spu_thread::gpr, op.rt)), 0x0203); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); + c->mov(asmjit::x86::word_ptr(*cpu, addr->r64(), 0, OFFSET_OF(spu_thread, gpr[op.rt])), 0x0203); } void spu_recompiler::CWX(spu_opcode_t op) { - c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); - c->add(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.rb]._u32[3])); + c->add(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); c->not_(*addr); c->and_(*addr, 0xc); const XmmLink& vr = XmmAlloc(); c->movdqa(vr, XmmConst(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); - c->mov(asmjit::x86::dword_ptr(*cpu, addr->r64(), 0, offset32(&spu_thread::gpr, op.rt)), 0x00010203); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); + c->mov(asmjit::x86::dword_ptr(*cpu, addr->r64(), 0, OFFSET_OF(spu_thread, gpr[op.rt])), 0x00010203); } void spu_recompiler::CDX(spu_opcode_t op) { - c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); - c->add(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.rb]._u32[3])); + c->add(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); c->not_(*addr); c->and_(*addr, 0x8); const XmmLink& vr = XmmAlloc(); c->movdqa(vr, XmmConst(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); c->mov(*qw0, asmjit::Imm(0x0001020304050607ull)); - c->mov(asmjit::x86::qword_ptr(*cpu, addr->r64(), 0, offset32(&spu_thread::gpr, op.rt)), *qw0); + c->mov(asmjit::x86::qword_ptr(*cpu, addr->r64(), 0, OFFSET_OF(spu_thread, gpr[op.rt])), *qw0); } void spu_recompiler::ROTQBI(spu_opcode_t op) @@ -2963,7 +2963,7 @@ void spu_recompiler::ROTQBI(spu_opcode_t op) c->psllq(va, vb); c->psrlq(vt, v4); c->por(vt, va); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); } void spu_recompiler::ROTQMBI(spu_opcode_t op) @@ -2983,7 +2983,7 @@ void spu_recompiler::ROTQMBI(spu_opcode_t op) c->psrlq(va, vb); c->psllq(vt, v4); c->por(vt, va); 
- c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); } void spu_recompiler::SHLQBI(spu_opcode_t op) @@ -3001,7 +3001,7 @@ void spu_recompiler::SHLQBI(spu_opcode_t op) c->psllq(va, vb); c->psrlq(vt, v4); c->por(vt, va); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); } void spu_recompiler::ROTQBY(spu_opcode_t op) @@ -3013,11 +3013,11 @@ void spu_recompiler::ROTQBY(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->mov(*qw0, +g_spu_imm.rldq_pshufb); - c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.rb]._u32[3])); c->and_(*addr, 0xf); c->shl(*addr, 4); c->pshufb(va, asmjit::x86::oword_ptr(*qw0, addr->r64())); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ROTQMBY(spu_opcode_t op) @@ -3029,11 +3029,11 @@ void spu_recompiler::ROTQMBY(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->mov(*qw0, +g_spu_imm.srdq_pshufb); - c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.rb]._u32[3])); c->and_(*addr, 0x1f); c->shl(*addr, 4); c->pshufb(va, asmjit::x86::oword_ptr(*qw0, addr->r64())); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::SHLQBY(spu_opcode_t op) @@ -3045,11 +3045,11 @@ void spu_recompiler::SHLQBY(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->mov(*qw0, +g_spu_imm.sldq_pshufb); - c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.rb]._u32[3])); c->and_(*addr, 0x1f); c->shl(*addr, 4); c->pshufb(va, asmjit::x86::oword_ptr(*qw0, addr->r64())); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ORX(spu_opcode_t op) @@ -3061,7 +3061,7 @@ void spu_recompiler::ORX(spu_opcode_t op) c->pshufd(v1, va, 0x4e); c->por(va, v1); c->pslldq(va, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CBD(spu_opcode_t op) @@ -3073,11 +3073,11 @@ void spu_recompiler::CBD(spu_opcode_t op) // v128 value = v128::fromV(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f)); // value.u8r[op.i7 & 0xf] = 0x03; // c->movdqa(vr, XmmConst(value)); - // c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + // c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); // return; // } - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); if (op.i7) c->add(*addr, +op.i7); c->not_(*addr); @@ -3085,8 +3085,8 @@ void spu_recompiler::CBD(spu_opcode_t op) const XmmLink& vr = XmmAlloc(); c->movdqa(vr, XmmConst(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); - c->mov(asmjit::x86::byte_ptr(*cpu, addr->r64(), 0, offset32(&spu_thread::gpr, op.rt)), 0x03); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); + c->mov(asmjit::x86::byte_ptr(*cpu, addr->r64(), 0, OFFSET_OF(spu_thread, gpr[op.rt])), 0x03); } void spu_recompiler::CHD(spu_opcode_t op) @@ -3098,11 +3098,11 @@ void spu_recompiler::CHD(spu_opcode_t op) // v128 value = v128::fromV(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f)); // value.u16r[(op.i7 >> 1) & 0x7] = 0x0203; // c->movdqa(vr, XmmConst(value)); - // c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + // c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); // return; // } - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); if 
(op.i7) c->add(*addr, +op.i7); c->not_(*addr); @@ -3110,8 +3110,8 @@ void spu_recompiler::CHD(spu_opcode_t op) const XmmLink& vr = XmmAlloc(); c->movdqa(vr, XmmConst(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); - c->mov(asmjit::x86::word_ptr(*cpu, addr->r64(), 0, offset32(&spu_thread::gpr, op.rt)), 0x0203); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); + c->mov(asmjit::x86::word_ptr(*cpu, addr->r64(), 0, OFFSET_OF(spu_thread, gpr[op.rt])), 0x0203); } void spu_recompiler::CWD(spu_opcode_t op) @@ -3123,11 +3123,11 @@ void spu_recompiler::CWD(spu_opcode_t op) // v128 value = v128::fromV(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f)); // value.u32r[(op.i7 >> 2) & 0x3] = 0x00010203; // c->movdqa(vr, XmmConst(value)); - // c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + // c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); // return; // } - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); if (op.i7) c->add(*addr, +op.i7); c->not_(*addr); @@ -3135,8 +3135,8 @@ void spu_recompiler::CWD(spu_opcode_t op) const XmmLink& vr = XmmAlloc(); c->movdqa(vr, XmmConst(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); - c->mov(asmjit::x86::dword_ptr(*cpu, addr->r64(), 0, offset32(&spu_thread::gpr, op.rt)), 0x00010203); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); + c->mov(asmjit::x86::dword_ptr(*cpu, addr->r64(), 0, OFFSET_OF(spu_thread, gpr[op.rt])), 0x00010203); } void spu_recompiler::CDD(spu_opcode_t op) @@ -3148,11 +3148,11 @@ void spu_recompiler::CDD(spu_opcode_t op) // v128 value = v128::fromV(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f)); // value.u64r[(op.i7 >> 3) & 0x1] = 0x0001020304050607ull; // c->movdqa(vr, XmmConst(value)); - // c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + // c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); // return; // } - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); if (op.i7) c->add(*addr, +op.i7); c->not_(*addr); @@ -3160,9 +3160,9 @@ void spu_recompiler::CDD(spu_opcode_t op) const XmmLink& vr = XmmAlloc(); c->movdqa(vr, XmmConst(v128::from32r(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); c->mov(*qw0, asmjit::Imm(0x0001020304050607ull)); - c->mov(asmjit::x86::qword_ptr(*cpu, addr->r64(), 0, offset32(&spu_thread::gpr, op.rt)), *qw0); + c->mov(asmjit::x86::qword_ptr(*cpu, addr->r64(), 0, OFFSET_OF(spu_thread, gpr[op.rt])), *qw0); } void spu_recompiler::ROTQBII(spu_opcode_t op) @@ -3173,7 +3173,7 @@ void spu_recompiler::ROTQBII(spu_opcode_t op) c->psllq(va, (op.i7 & 0x7)); c->psrlq(vt, 64 - (op.i7 & 0x7)); c->por(vt, va); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); } void spu_recompiler::ROTQMBII(spu_opcode_t op) @@ -3185,7 +3185,7 @@ void spu_recompiler::ROTQMBII(spu_opcode_t op) c->psrlq(va, ((0 - op.i7) & 0x7)); c->psllq(vt, 64 - ((0 - op.i7) & 0x7)); c->por(vt, va); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); } void spu_recompiler::SHLQBII(spu_opcode_t op) @@ -3197,7 +3197,7 @@ void spu_recompiler::SHLQBII(spu_opcode_t op) c->psllq(va, (op.i7 & 0x7)); c->psrlq(vt, 64 - (op.i7 & 0x7)); c->por(vt, va); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); } void spu_recompiler::ROTQBYI(spu_opcode_t op) @@ -3225,7 +3225,7 @@ void 
spu_recompiler::ROTQBYI(spu_opcode_t op) c->por(va, v2); } - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ROTQMBYI(spu_opcode_t op) @@ -3233,7 +3233,7 @@ void spu_recompiler::ROTQMBYI(spu_opcode_t op) const int s = (0 - op.i7) & 0x1f; const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->psrldq(va, s); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::SHLQBYI(spu_opcode_t op) @@ -3241,7 +3241,7 @@ void spu_recompiler::SHLQBYI(spu_opcode_t op) const int s = op.i7 & 0x1f; const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pslldq(va, s); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::NOP(spu_opcode_t) @@ -3251,23 +3251,23 @@ void spu_recompiler::NOP(spu_opcode_t) void spu_recompiler::CGT(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->pcmpgtd(va, SPU_OFF_128(gpr, op.rb)); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->pcmpgtd(va, SPU_OFF_128(gpr[op.rb])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::XOR(spu_opcode_t op) { // xor const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->pxor(va, SPU_OFF_128(gpr, op.rb)); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->pxor(va, SPU_OFF_128(gpr[op.rb])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CGTH(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->pcmpgtw(va, SPU_OFF_128(gpr, op.rb)); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->pcmpgtw(va, SPU_OFF_128(gpr[op.rb])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::EQV(spu_opcode_t op) @@ -3276,21 +3276,21 @@ void spu_recompiler::EQV(spu_opcode_t op) if (utils::has_avx512()) { - c->vpternlogd(vb, vb, SPU_OFF_128(gpr, op.ra), 0x99 /* xnorCB */); - c->movdqa(SPU_OFF_128(gpr, op.rt), vb); + c->vpternlogd(vb, vb, SPU_OFF_128(gpr[op.ra]), 0x99 /* xnorCB */); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vb); return; } c->pxor(vb, XmmConst(v128::from32p(0xffffffff))); - c->pxor(vb, SPU_OFF_128(gpr, op.ra)); - c->movdqa(SPU_OFF_128(gpr, op.rt), vb); + c->pxor(vb, SPU_OFF_128(gpr[op.ra])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vb); } void spu_recompiler::CGTB(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->pcmpgtb(va, SPU_OFF_128(gpr, op.rb)); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->pcmpgtb(va, SPU_OFF_128(gpr[op.rb])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::SUMB(spu_opcode_t op) @@ -3316,13 +3316,13 @@ void spu_recompiler::SUMB(spu_opcode_t op) c->paddw(va, v1); c->paddw(vb, v2); c->por(va, vb); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::HGT(spu_opcode_t op) { - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_s32, 3)); - c->cmp(*addr, SPU_OFF_32(gpr, op.rb, &v128::_s32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._s32[3])); + c->cmp(*addr, SPU_OFF_32(gpr[op.rb]._s32[3])); asmjit::Label label = c->newLabel(); asmjit::Label ret = c->newLabel(); @@ -3347,26 +3347,26 @@ void spu_recompiler::CLZ(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); const XmmLink& vt = XmmAlloc(); c->vplzcntd(vt, va); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); return; } c->mov(qw0->r32(), 32 + 31); for (u32 i = 0; i < 4; i++) // unrolled loop { - c->bsr(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, i)); + c->bsr(*addr, SPU_OFF_32(gpr[op.ra]._u32[i])); c->cmovz(*addr, qw0->r32()); c->xor_(*addr, 
31); - c->mov(SPU_OFF_32(gpr, op.rt, &v128::_u32, i), *addr); + c->mov(SPU_OFF_32(gpr[op.rt]._u32[i]), *addr); } } void spu_recompiler::XSWD(spu_opcode_t op) { - c->movsxd(*qw0, SPU_OFF_32(gpr, op.ra, &v128::_s32, 0)); - c->movsxd(*qw1, SPU_OFF_32(gpr, op.ra, &v128::_s32, 2)); - c->mov(SPU_OFF_64(gpr, op.rt, &v128::_s64, 0), *qw0); - c->mov(SPU_OFF_64(gpr, op.rt, &v128::_s64, 1), *qw1); + c->movsxd(*qw0, SPU_OFF_32(gpr[op.ra]._s32[0])); + c->movsxd(*qw1, SPU_OFF_32(gpr[op.ra]._s32[2])); + c->mov(SPU_OFF_64(gpr[op.rt]._s64[0]), *qw0); + c->mov(SPU_OFF_64(gpr[op.rt]._s64[1]), *qw1); } void spu_recompiler::XSHW(spu_opcode_t op) @@ -3374,7 +3374,7 @@ void spu_recompiler::XSHW(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pslld(va, 16); c->psrad(va, 16); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CNTB(spu_opcode_t op) @@ -3400,7 +3400,7 @@ void spu_recompiler::CNTB(spu_opcode_t op) c->psrlq(v1, 4); c->pand(v1, vm); c->paddb(va, v1); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::XSBH(spu_opcode_t op) @@ -3408,7 +3408,7 @@ void spu_recompiler::XSBH(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->psllw(va, 8); c->psraw(va, 8); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CLGT(spu_opcode_t op) @@ -3418,17 +3418,17 @@ void spu_recompiler::CLGT(spu_opcode_t op) const XmmLink& vi = XmmAlloc(); c->movdqa(vi, XmmConst(v128::from32p(0x80000000))); c->pxor(va, vi); - c->pxor(vi, SPU_OFF_128(gpr, op.rb)); + c->pxor(vi, SPU_OFF_128(gpr[op.rb])); c->pcmpgtd(va, vi); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ANDC(spu_opcode_t op) { // and not const XmmLink& vb = XmmGet(op.rb, XmmType::Int); - c->pandn(vb, SPU_OFF_128(gpr, op.ra)); - c->movdqa(SPU_OFF_128(gpr, op.rt), vb); + c->pandn(vb, SPU_OFF_128(gpr[op.ra])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vb); } void spu_recompiler::FCGT(spu_opcode_t op) @@ -3444,31 +3444,31 @@ void spu_recompiler::FCGT(spu_opcode_t op) c->pxor(tmp0, tmp0); c->pxor(tmp1, tmp1); - c->cmpps(tmp0, SPU_OFF_128(gpr, op.ra), 3); // tmp0 is true if a is extended (nan/inf) - c->cmpps(tmp1, SPU_OFF_128(gpr, op.rb), 3); // tmp1 is true if b is extended (nan/inf) + c->cmpps(tmp0, SPU_OFF_128(gpr[op.ra]), 3); // tmp0 is true if a is extended (nan/inf) + c->cmpps(tmp1, SPU_OFF_128(gpr[op.rb]), 3); // tmp1 is true if b is extended (nan/inf) // compute lower a and b c->movaps(tmp2, last_exp_bit); c->movaps(tmp3, last_exp_bit); - c->pandn(tmp2, SPU_OFF_128(gpr, op.ra)); // tmp2 = lowered_a - c->pandn(tmp3, SPU_OFF_128(gpr, op.rb)); // tmp3 = lowered_b + c->pandn(tmp2, SPU_OFF_128(gpr[op.ra])); // tmp2 = lowered_a + c->pandn(tmp3, SPU_OFF_128(gpr[op.rb])); // tmp3 = lowered_b // lower a if extended c->movaps(tmpv, tmp0); c->pand(tmpv, tmp2); - c->pandn(tmp0, SPU_OFF_128(gpr, op.ra)); + c->pandn(tmp0, SPU_OFF_128(gpr[op.ra])); c->orps(tmp0, tmpv); // lower b if extended c->movaps(tmpv, tmp1); c->pand(tmpv, tmp3); - c->pandn(tmp1, SPU_OFF_128(gpr, op.rb)); + c->pandn(tmp1, SPU_OFF_128(gpr[op.rb])); c->orps(tmp1, tmpv); // flush to 0 if denormalized c->pxor(tmpv, tmpv); - c->movaps(tmp2, SPU_OFF_128(gpr, op.ra)); - c->movaps(tmp3, SPU_OFF_128(gpr, op.rb)); + c->movaps(tmp2, SPU_OFF_128(gpr[op.ra])); + c->movaps(tmp3, SPU_OFF_128(gpr[op.rb])); c->andps(tmp2, all_exp_bits); c->andps(tmp3, all_exp_bits); 
c->cmpps(tmp2, tmpv, 0); @@ -3477,7 +3477,7 @@ void spu_recompiler::FCGT(spu_opcode_t op) c->pandn(tmp3, tmp1); c->cmpps(tmp3, tmp2, 1); - c->movaps(SPU_OFF_128(gpr, op.rt), tmp3); + c->movaps(SPU_OFF_128(gpr[op.rt]), tmp3); } void spu_recompiler::DFCGT(spu_opcode_t op) @@ -3488,15 +3488,15 @@ void spu_recompiler::DFCGT(spu_opcode_t op) void spu_recompiler::FA(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Float); - c->addps(va, SPU_OFF_128(gpr, op.rb)); - c->movaps(SPU_OFF_128(gpr, op.rt), va); + c->addps(va, SPU_OFF_128(gpr[op.rb])); + c->movaps(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::FS(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Float); - c->subps(va, SPU_OFF_128(gpr, op.rb)); - c->movaps(SPU_OFF_128(gpr, op.rt), va); + c->subps(va, SPU_OFF_128(gpr[op.rb])); + c->movaps(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::FM(spu_opcode_t op) @@ -3536,8 +3536,8 @@ void spu_recompiler::FM(spu_opcode_t op) c->movaps(tmp4, sign_bits); c->movaps(tmp5, sign_bits); c->movaps(tmp0, sign_bits); - c->andps(tmp4, SPU_OFF_128(gpr, op.ra)); - c->andps(tmp5, SPU_OFF_128(gpr, op.rb)); + c->andps(tmp4, SPU_OFF_128(gpr[op.ra])); + c->andps(tmp5, SPU_OFF_128(gpr[op.rb])); c->xorps(tmp4, tmp5); // sign mask c->pandn(tmp0, tmp2); c->orps(tmp4, tmp0); // add result sign back to original extended value @@ -3549,7 +3549,7 @@ void spu_recompiler::FM(spu_opcode_t op) c->andnps(tmp0, tmp3); c->andps(tmp2, tmp5); c->orps(tmp0, tmp2); - c->movaps(SPU_OFF_128(gpr, op.rt), tmp0); + c->movaps(SPU_OFF_128(gpr[op.rt]), tmp0); } void spu_recompiler::CLGTH(spu_opcode_t op) @@ -3559,9 +3559,9 @@ void spu_recompiler::CLGTH(spu_opcode_t op) const XmmLink& vi = XmmAlloc(); c->movdqa(vi, XmmConst(v128::from16p(0x8000))); c->pxor(va, vi); - c->pxor(vi, SPU_OFF_128(gpr, op.rb)); + c->pxor(vi, SPU_OFF_128(gpr[op.rb])); c->pcmpgtw(va, vi); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ORC(spu_opcode_t op) @@ -3570,14 +3570,14 @@ void spu_recompiler::ORC(spu_opcode_t op) if (utils::has_avx512()) { - c->vpternlogd(vb, vb, SPU_OFF_128(gpr, op.ra), 0xbb /* orC!B */); - c->movdqa(SPU_OFF_128(gpr, op.rt), vb); + c->vpternlogd(vb, vb, SPU_OFF_128(gpr[op.ra]), 0xbb /* orC!B */); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vb); return; } c->pxor(vb, XmmConst(v128::from32p(0xffffffff))); - c->por(vb, SPU_OFF_128(gpr, op.ra)); - c->movdqa(SPU_OFF_128(gpr, op.rt), vb); + c->por(vb, SPU_OFF_128(gpr[op.ra])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vb); } void spu_recompiler::FCMGT(spu_opcode_t op) @@ -3596,19 +3596,19 @@ void spu_recompiler::FCMGT(spu_opcode_t op) c->pxor(tmp0, tmp0); c->pxor(tmp1, tmp1); - c->cmpps(tmp0, SPU_OFF_128(gpr, op.ra), 3); // tmp0 is true if a is extended (nan/inf) - c->cmpps(tmp1, SPU_OFF_128(gpr, op.rb), 3); // tmp1 is true if b is extended (nan/inf) + c->cmpps(tmp0, SPU_OFF_128(gpr[op.ra]), 3); // tmp0 is true if a is extended (nan/inf) + c->cmpps(tmp1, SPU_OFF_128(gpr[op.rb]), 3); // tmp1 is true if b is extended (nan/inf) // flush to 0 if denormalized c->pxor(tmpv, tmpv); - c->movaps(tmp2, SPU_OFF_128(gpr, op.ra)); - c->movaps(tmp3, SPU_OFF_128(gpr, op.rb)); + c->movaps(tmp2, SPU_OFF_128(gpr[op.ra])); + c->movaps(tmp3, SPU_OFF_128(gpr[op.rb])); c->andps(tmp2, all_exp_bits); c->andps(tmp3, all_exp_bits); c->cmpps(tmp2, tmpv, 0); c->cmpps(tmp3, tmpv, 0); - c->pandn(tmp2, SPU_OFF_128(gpr, op.ra)); - c->pandn(tmp3, SPU_OFF_128(gpr, op.rb)); + c->pandn(tmp2, SPU_OFF_128(gpr[op.ra])); + c->pandn(tmp3, 
SPU_OFF_128(gpr[op.rb])); // Set tmp1 to true where a is extended but b is not extended // This is a simplification since absolute values remove necessity of lowering @@ -3619,7 +3619,7 @@ void spu_recompiler::FCMGT(spu_opcode_t op) c->andps(tmp3, remove_sign_bits); c->cmpps(tmp3, tmp2, 1); c->orps(tmp3, tmp1); // Force result to all true if a is extended but b is not - c->movaps(SPU_OFF_128(gpr, op.rt), tmp3); + c->movaps(SPU_OFF_128(gpr[op.rt]), tmp3); } void spu_recompiler::DFCMGT(spu_opcode_t op) @@ -3630,22 +3630,22 @@ void spu_recompiler::DFCMGT(spu_opcode_t op) void spu_recompiler::DFA(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Double); - c->addpd(va, SPU_OFF_128(gpr, op.rb)); - c->movapd(SPU_OFF_128(gpr, op.rt), va); + c->addpd(va, SPU_OFF_128(gpr[op.rb])); + c->movapd(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::DFS(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Double); - c->subpd(va, SPU_OFF_128(gpr, op.rb)); - c->movapd(SPU_OFF_128(gpr, op.rt), va); + c->subpd(va, SPU_OFF_128(gpr[op.rb])); + c->movapd(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::DFM(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Double); - c->mulpd(va, SPU_OFF_128(gpr, op.rb)); - c->movapd(SPU_OFF_128(gpr, op.rt), va); + c->mulpd(va, SPU_OFF_128(gpr[op.rb])); + c->movapd(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CLGTB(spu_opcode_t op) @@ -3655,15 +3655,15 @@ void spu_recompiler::CLGTB(spu_opcode_t op) const XmmLink& vi = XmmAlloc(); c->movdqa(vi, XmmConst(v128::from8p(0x80))); c->pxor(va, vi); - c->pxor(vi, SPU_OFF_128(gpr, op.rb)); + c->pxor(vi, SPU_OFF_128(gpr[op.rb])); c->pcmpgtb(va, vi); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::HLGT(spu_opcode_t op) { - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); - c->cmp(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); + c->cmp(*addr, SPU_OFF_32(gpr[op.rb]._u32[3])); asmjit::Label label = c->newLabel(); asmjit::Label ret = c->newLabel(); @@ -3685,44 +3685,44 @@ void spu_recompiler::DFMA(spu_opcode_t op) { const XmmLink& vr = XmmGet(op.rt, XmmType::Double); const XmmLink& va = XmmGet(op.ra, XmmType::Double); - c->mulpd(va, SPU_OFF_128(gpr, op.rb)); + c->mulpd(va, SPU_OFF_128(gpr[op.rb])); c->addpd(vr, va); - c->movapd(SPU_OFF_128(gpr, op.rt), vr); + c->movapd(SPU_OFF_128(gpr[op.rt]), vr); } void spu_recompiler::DFMS(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Double); const XmmLink& vt = XmmGet(op.rt, XmmType::Double); - c->mulpd(va, SPU_OFF_128(gpr, op.rb)); + c->mulpd(va, SPU_OFF_128(gpr[op.rb])); c->subpd(va, vt); - c->movapd(SPU_OFF_128(gpr, op.rt), va); + c->movapd(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::DFNMS(spu_opcode_t op) { const XmmLink& vr = XmmGet(op.rt, XmmType::Double); const XmmLink& va = XmmGet(op.ra, XmmType::Double); - c->mulpd(va, SPU_OFF_128(gpr, op.rb)); + c->mulpd(va, SPU_OFF_128(gpr[op.rb])); c->subpd(vr, va); - c->movapd(SPU_OFF_128(gpr, op.rt), vr); + c->movapd(SPU_OFF_128(gpr[op.rt]), vr); } void spu_recompiler::DFNMA(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Double); const XmmLink& vt = XmmGet(op.rt, XmmType::Double); - c->mulpd(va, SPU_OFF_128(gpr, op.rb)); + c->mulpd(va, SPU_OFF_128(gpr[op.rb])); c->addpd(va, vt); c->xorpd(va, XmmConst(v128::from64p(0x8000000000000000))); - c->movapd(SPU_OFF_128(gpr, op.rt), va); + c->movapd(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CEQ(spu_opcode_t op) 
{ const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->pcmpeqd(va, SPU_OFF_128(gpr, op.rb)); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->pcmpeqd(va, SPU_OFF_128(gpr[op.rb])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::MPYHHU(spu_opcode_t op) @@ -3736,16 +3736,16 @@ void spu_recompiler::MPYHHU(spu_opcode_t op) c->pand(va, XmmConst(v128::from32p(0xffff0000))); c->psrld(va2, 16); c->por(va, va2); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ADDX(spu_opcode_t op) { const XmmLink& vt = XmmGet(op.rt, XmmType::Int); c->pand(vt, XmmConst(v128::from32p(1))); - c->paddd(vt, SPU_OFF_128(gpr, op.ra)); - c->paddd(vt, SPU_OFF_128(gpr, op.rb)); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->paddd(vt, SPU_OFF_128(gpr[op.ra])); + c->paddd(vt, SPU_OFF_128(gpr[op.rb])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); } void spu_recompiler::SFX(spu_opcode_t op) @@ -3753,9 +3753,9 @@ void spu_recompiler::SFX(spu_opcode_t op) const XmmLink& vt = XmmGet(op.rt, XmmType::Int); const XmmLink& vb = XmmGet(op.rb, XmmType::Int); c->pandn(vt, XmmConst(v128::from32p(1))); - c->psubd(vb, SPU_OFF_128(gpr, op.ra)); + c->psubd(vb, SPU_OFF_128(gpr[op.ra])); c->psubd(vb, vt); - c->movdqa(SPU_OFF_128(gpr, op.rt), vb); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vb); } void spu_recompiler::CGX(spu_opcode_t op) // nf @@ -3788,7 +3788,7 @@ void spu_recompiler::CGX(spu_opcode_t op) // nf c->pand(res, vt); c->por(res, va); c->psrld(res, 31); - c->movdqa(SPU_OFF_128(gpr, op.rt), res); + c->movdqa(SPU_OFF_128(gpr[op.rt]), res); } void spu_recompiler::BGX(spu_opcode_t op) // nf @@ -3818,7 +3818,7 @@ void spu_recompiler::BGX(spu_opcode_t op) // nf c->pcmpgtd(vb, va); c->por(vt, vb); c->psrld(vt, 31); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); } void spu_recompiler::MPYHHA(spu_opcode_t op) @@ -3830,7 +3830,7 @@ void spu_recompiler::MPYHHA(spu_opcode_t op) c->psrld(vb, 16); c->pmaddwd(va, vb); c->paddd(vt, va); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); } void spu_recompiler::MPYHHAU(spu_opcode_t op) @@ -3846,7 +3846,7 @@ void spu_recompiler::MPYHHAU(spu_opcode_t op) c->psrld(va2, 16); c->paddd(vt, va); c->paddd(vt, va2); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); } void spu_recompiler::FSCRRD(spu_opcode_t op) @@ -3854,7 +3854,7 @@ void spu_recompiler::FSCRRD(spu_opcode_t op) // zero (hack) const XmmLink& v0 = XmmAlloc(); c->pxor(v0, v0); - c->movdqa(SPU_OFF_128(gpr, op.rt), v0); + c->movdqa(SPU_OFF_128(gpr[op.rt]), v0); } void spu_recompiler::FESD(spu_opcode_t op) @@ -3862,7 +3862,7 @@ void spu_recompiler::FESD(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Float); c->shufps(va, va, 0x8d); // _f[0] = _f[1]; _f[1] = _f[3]; c->cvtps2pd(va, va); - c->movapd(SPU_OFF_128(gpr, op.rt), va); + c->movapd(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::FRDS(spu_opcode_t op) @@ -3870,7 +3870,7 @@ void spu_recompiler::FRDS(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Double); c->cvtpd2ps(va, va); c->shufps(va, va, 0x72); // _f[1] = _f[0]; _f[3] = _f[1]; _f[0] = _f[2] = 0; - c->movaps(SPU_OFF_128(gpr, op.rt), va); + c->movaps(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::FSCRWR(spu_opcode_t /*op*/) @@ -3887,8 +3887,8 @@ void spu_recompiler::FCEQ(spu_opcode_t op) { // compare equal const XmmLink& vb = XmmGet(op.rb, XmmType::Float); - c->cmpps(vb, SPU_OFF_128(gpr, op.ra), 0); - c->movaps(SPU_OFF_128(gpr, op.rt), vb); 
+ c->cmpps(vb, SPU_OFF_128(gpr[op.ra]), 0); + c->movaps(SPU_OFF_128(gpr[op.rt]), vb); } void spu_recompiler::DFCEQ(spu_opcode_t op) @@ -3905,7 +3905,7 @@ void spu_recompiler::MPY(spu_opcode_t op) c->pand(va, vi); c->pand(vb, vi); c->pmaddwd(va, vb); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::MPYH(spu_opcode_t op) @@ -3915,7 +3915,7 @@ void spu_recompiler::MPYH(spu_opcode_t op) c->psrld(va, 16); c->pmullw(va, vb); c->pslld(va, 16); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::MPYHH(spu_opcode_t op) @@ -3925,7 +3925,7 @@ void spu_recompiler::MPYHH(spu_opcode_t op) c->psrld(va, 16); c->psrld(vb, 16); c->pmaddwd(va, vb); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::MPYS(spu_opcode_t op) @@ -3935,14 +3935,14 @@ void spu_recompiler::MPYS(spu_opcode_t op) c->pmulhw(va, vb); c->pslld(va, 16); c->psrad(va, 16); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CEQH(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->pcmpeqw(va, SPU_OFF_128(gpr, op.rb)); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->pcmpeqw(va, SPU_OFF_128(gpr[op.rb])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::FCMEQ(spu_opcode_t op) @@ -3951,9 +3951,9 @@ void spu_recompiler::FCMEQ(spu_opcode_t op) const XmmLink& vi = XmmAlloc(); c->movaps(vi, XmmConst(v128::from32p(0x7fffffff))); c->andps(vb, vi); // abs - c->andps(vi, SPU_OFF_128(gpr, op.ra)); + c->andps(vi, SPU_OFF_128(gpr[op.ra])); c->cmpps(vb, vi, 0); // == - c->movaps(SPU_OFF_128(gpr, op.rt), vb); + c->movaps(SPU_OFF_128(gpr[op.rt]), vb); } void spu_recompiler::DFCMEQ(spu_opcode_t op) @@ -3972,14 +3972,14 @@ void spu_recompiler::MPYU(spu_opcode_t op) c->pslld(va, 16); c->pand(va2, XmmConst(v128::from32p(0xffff))); c->por(va, va2); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CEQB(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->pcmpeqb(va, SPU_OFF_128(gpr, op.rb)); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->pcmpeqb(va, SPU_OFF_128(gpr[op.rb])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::FI(spu_opcode_t op) @@ -4028,13 +4028,13 @@ void spu_recompiler::FI(spu_opcode_t op) c->pand(ymul, XmmConst(v128::from32p(1 << 23))); c->psubd(vb, ymul); - c->movaps(SPU_OFF_128(gpr, op.rt), vb); + c->movaps(SPU_OFF_128(gpr[op.rt]), vb); } void spu_recompiler::HEQ(spu_opcode_t op) { - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_s32, 3)); - c->cmp(*addr, SPU_OFF_32(gpr, op.rb, &v128::_s32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._s32[3])); + c->cmp(*addr, SPU_OFF_32(gpr[op.rb]._s32[3])); asmjit::Label label = c->newLabel(); asmjit::Label ret = c->newLabel(); @@ -4062,7 +4062,7 @@ void spu_recompiler::CFLTS(spu_opcode_t op) c->cmpps(vi, va, 2); c->cvttps2dq(va, va); // convert to ints with truncation c->pxor(va, vi); // fix result saturation (0x80000000 -> 0x7fffffff) - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CFLTU(spu_opcode_t op) @@ -4079,7 +4079,7 @@ void spu_recompiler::CFLTU(spu_opcode_t op) c->vcvttps2udq(vs, va); c->psrad(va, 31); c->pandn(va, vs); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); return; } @@ -4097,7 +4097,7 @@ void spu_recompiler::CFLTU(spu_opcode_t op) c->cvttps2dq(vs2, 
vs2); c->por(va, vs); c->por(va, vs2); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CSFLT(spu_opcode_t op) @@ -4106,7 +4106,7 @@ void spu_recompiler::CSFLT(spu_opcode_t op) c->cvtdq2ps(va, va); // convert to floats if (op.i8 != 155) c->mulps(va, XmmConst(v128::fromf32p(std::exp2(static_cast(static_cast(op.i8 - 155)))))); // scale - c->movaps(SPU_OFF_128(gpr, op.rt), va); + c->movaps(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CUFLT(spu_opcode_t op) @@ -4130,7 +4130,7 @@ void spu_recompiler::CUFLT(spu_opcode_t op) if (op.i8 != 155) c->mulps(va, XmmConst(v128::fromf32p(std::exp2(static_cast(static_cast(op.i8 - 155)))))); // scale - c->movaps(SPU_OFF_128(gpr, op.rt), va); + c->movaps(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::BRZ(spu_opcode_t op) @@ -4143,7 +4143,7 @@ void spu_recompiler::BRZ(spu_opcode_t op) } asmjit::Label branch_label = c->newLabel(); - c->cmp(SPU_OFF_32(gpr, op.rt, &v128::_u32, 3), 0); + c->cmp(SPU_OFF_32(gpr[op.rt]._u32[3]), 0); c->je(branch_label); after.emplace_back([=, this]() @@ -4164,8 +4164,8 @@ void spu_recompiler::STQA(spu_opcode_t op) } else { - c->mov(*qw0, SPU_OFF_64(gpr, op.rt, &v128::_u64, 0)); - c->mov(*qw1, SPU_OFF_64(gpr, op.rt, &v128::_u64, 1)); + c->mov(*qw0, SPU_OFF_64(gpr[op.rt]._u64[0])); + c->mov(*qw1, SPU_OFF_64(gpr[op.rt]._u64[1])); c->bswap(*qw0); c->bswap(*qw1); c->mov(asmjit::x86::qword_ptr(*ls, spu_ls_target(0, op.i16) + 0), *qw1); @@ -4183,7 +4183,7 @@ void spu_recompiler::BRNZ(spu_opcode_t op) } asmjit::Label branch_label = c->newLabel(); - c->cmp(SPU_OFF_32(gpr, op.rt, &v128::_u32, 3), 0); + c->cmp(SPU_OFF_32(gpr[op.rt]._u32[3]), 0); c->jne(branch_label); after.emplace_back([=, this]() @@ -4204,7 +4204,7 @@ void spu_recompiler::BRHZ(spu_opcode_t op) } asmjit::Label branch_label = c->newLabel(); - c->cmp(SPU_OFF_16(gpr, op.rt, &v128::_u16, 6), 0); + c->cmp(SPU_OFF_16(gpr[op.rt]._u16[6]), 0); c->je(branch_label); after.emplace_back([=, this]() @@ -4225,7 +4225,7 @@ void spu_recompiler::BRHNZ(spu_opcode_t op) } asmjit::Label branch_label = c->newLabel(); - c->cmp(SPU_OFF_16(gpr, op.rt, &v128::_u16, 6), 0); + c->cmp(SPU_OFF_16(gpr[op.rt]._u16[6]), 0); c->jne(branch_label); after.emplace_back([=, this]() @@ -4249,8 +4249,8 @@ void spu_recompiler::STQR(spu_opcode_t op) } else { - c->mov(*qw0, SPU_OFF_64(gpr, op.rt, &v128::_u64, 0)); - c->mov(*qw1, SPU_OFF_64(gpr, op.rt, &v128::_u64, 1)); + c->mov(*qw0, SPU_OFF_64(gpr[op.rt]._u64[0])); + c->mov(*qw1, SPU_OFF_64(gpr[op.rt]._u64[1])); c->bswap(*qw0); c->bswap(*qw1); c->mov(asmjit::x86::qword_ptr(*ls, addr->r64(), 0, 0), *qw1); @@ -4273,7 +4273,7 @@ void spu_recompiler::LQA(spu_opcode_t op) const XmmLink& vt = XmmAlloc(); c->movdqa(vt, asmjit::x86::oword_ptr(*ls, spu_ls_target(0, op.i16))); c->pshufb(vt, XmmConst(v128::from32r(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); } else { @@ -4281,8 +4281,8 @@ void spu_recompiler::LQA(spu_opcode_t op) c->mov(*qw1, asmjit::x86::qword_ptr(*ls, spu_ls_target(0, op.i16) + 8)); c->bswap(*qw0); c->bswap(*qw1); - c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw1); - c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw0); + c->mov(SPU_OFF_64(gpr[op.rt]._u64[0]), *qw1); + c->mov(SPU_OFF_64(gpr[op.rt]._u64[1]), *qw0); } } @@ -4295,7 +4295,7 @@ void spu_recompiler::BRASL(spu_opcode_t op) c->and_(*addr, 0x3fffc); c->movd(vr, *addr); c->pslldq(vr, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + 
c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); branch_set_link(m_pos + 4); branch_fixed(target, true); @@ -4321,7 +4321,7 @@ void spu_recompiler::FSMBI(spu_opcode_t op) const XmmLink& vr = XmmAlloc(); c->movdqa(vr, XmmConst(data)); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); } void spu_recompiler::BRSL(spu_opcode_t op) @@ -4333,7 +4333,7 @@ void spu_recompiler::BRSL(spu_opcode_t op) c->and_(*addr, 0x3fffc); c->movd(vr, *addr); c->pslldq(vr, 12); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); if (target != m_pos + 4) { @@ -4353,7 +4353,7 @@ void spu_recompiler::LQR(spu_opcode_t op) const XmmLink& vt = XmmAlloc(); c->movdqa(vt, asmjit::x86::oword_ptr(*ls, addr->r64())); c->pshufb(vt, XmmConst(v128::from32r(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); } else { @@ -4361,8 +4361,8 @@ void spu_recompiler::LQR(spu_opcode_t op) c->mov(*qw1, asmjit::x86::qword_ptr(*ls, addr->r64(), 0, 8)); c->bswap(*qw0); c->bswap(*qw1); - c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw1); - c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw0); + c->mov(SPU_OFF_64(gpr[op.rt]._u64[0]), *qw1); + c->mov(SPU_OFF_64(gpr[op.rt]._u64[1]), *qw0); } } @@ -4370,28 +4370,28 @@ void spu_recompiler::IL(spu_opcode_t op) { const XmmLink& vr = XmmAlloc(); c->movdqa(vr, XmmConst(v128::from32p(op.si16))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); } void spu_recompiler::ILHU(spu_opcode_t op) { const XmmLink& vr = XmmAlloc(); c->movdqa(vr, XmmConst(v128::from32p(op.i16 << 16))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); } void spu_recompiler::ILH(spu_opcode_t op) { const XmmLink& vr = XmmAlloc(); c->movdqa(vr, XmmConst(v128::from16p(op.i16))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); } void spu_recompiler::IOHL(spu_opcode_t op) { const XmmLink& vt = XmmGet(op.rt, XmmType::Int); c->por(vt, XmmConst(v128::from32p(op.i16))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); } void spu_recompiler::ORI(spu_opcode_t op) @@ -4399,58 +4399,58 @@ void spu_recompiler::ORI(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); if (op.si10) c->por(va, XmmConst(v128::from32p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ORHI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->por(va, XmmConst(v128::from16p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ORBI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->por(va, XmmConst(v128::from8p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::SFI(spu_opcode_t op) { const XmmLink& vr = XmmAlloc(); c->movdqa(vr, XmmConst(v128::from32p(op.si10))); - c->psubd(vr, SPU_OFF_128(gpr, op.ra)); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->psubd(vr, SPU_OFF_128(gpr[op.ra])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); } void spu_recompiler::SFHI(spu_opcode_t op) { const XmmLink& vr = XmmAlloc(); c->movdqa(vr, XmmConst(v128::from16p(op.si10))); - c->psubw(vr, SPU_OFF_128(gpr, op.ra)); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->psubw(vr, SPU_OFF_128(gpr[op.ra])); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); } void 
spu_recompiler::ANDI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pand(va, XmmConst(v128::from32p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ANDHI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pand(va, XmmConst(v128::from16p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::ANDBI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pand(va, XmmConst(v128::from8p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::AI(spu_opcode_t op) @@ -4458,7 +4458,7 @@ void spu_recompiler::AI(spu_opcode_t op) // add const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->paddd(va, XmmConst(v128::from32p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::AHI(spu_opcode_t op) @@ -4466,12 +4466,12 @@ void spu_recompiler::AHI(spu_opcode_t op) // add const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->paddw(va, XmmConst(v128::from16p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::STQD(spu_opcode_t op) { - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); if (op.si10) c->add(*addr, op.si10 * 16); c->and_(*addr, 0x3fff0); @@ -4484,8 +4484,8 @@ void spu_recompiler::STQD(spu_opcode_t op) } else { - c->mov(*qw0, SPU_OFF_64(gpr, op.rt, &v128::_u64, 0)); - c->mov(*qw1, SPU_OFF_64(gpr, op.rt, &v128::_u64, 1)); + c->mov(*qw0, SPU_OFF_64(gpr[op.rt]._u64[0])); + c->mov(*qw1, SPU_OFF_64(gpr[op.rt]._u64[1])); c->bswap(*qw0); c->bswap(*qw1); c->mov(asmjit::x86::qword_ptr(*ls, addr->r64(), 0, 0), *qw1); @@ -4495,7 +4495,7 @@ void spu_recompiler::STQD(spu_opcode_t op) void spu_recompiler::LQD(spu_opcode_t op) { - c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); + c->mov(*addr, SPU_OFF_32(gpr[op.ra]._u32[3])); if (op.si10) c->add(*addr, op.si10 * 16); c->and_(*addr, 0x3fff0); @@ -4505,7 +4505,7 @@ void spu_recompiler::LQD(spu_opcode_t op) const XmmLink& vt = XmmAlloc(); c->movdqa(vt, asmjit::x86::oword_ptr(*ls, addr->r64())); c->pshufb(vt, XmmConst(v128::from32r(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vt); } else { @@ -4513,8 +4513,8 @@ void spu_recompiler::LQD(spu_opcode_t op) c->mov(*qw1, asmjit::x86::qword_ptr(*ls, addr->r64(), 0, 8)); c->bswap(*qw0); c->bswap(*qw1); - c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw1); - c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw0); + c->mov(SPU_OFF_64(gpr[op.rt]._u64[0]), *qw1); + c->mov(SPU_OFF_64(gpr[op.rt]._u64[1]), *qw0); } } @@ -4522,47 +4522,47 @@ void spu_recompiler::XORI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pxor(va, XmmConst(v128::from32p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::XORHI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pxor(va, XmmConst(v128::from16p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::XORBI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pxor(va, XmmConst(v128::from8p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } 
void spu_recompiler::CGTI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pcmpgtd(va, XmmConst(v128::from32p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CGTHI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pcmpgtw(va, XmmConst(v128::from16p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CGTBI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pcmpgtb(va, XmmConst(v128::from8p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::HGTI(spu_opcode_t op) { - c->cmp(SPU_OFF_32(gpr, op.ra, &v128::_s32, 3), +op.si10); + c->cmp(SPU_OFF_32(gpr[op.ra]._s32[3]), +op.si10); asmjit::Label label = c->newLabel(); asmjit::Label ret = c->newLabel(); @@ -4585,7 +4585,7 @@ void spu_recompiler::CLGTI(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pxor(va, XmmConst(v128::from32p(0x80000000))); c->pcmpgtd(va, XmmConst(v128::from32p(op.si10 - 0x80000000))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CLGTHI(spu_opcode_t op) @@ -4593,7 +4593,7 @@ void spu_recompiler::CLGTHI(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pxor(va, XmmConst(v128::from16p(0x8000))); c->pcmpgtw(va, XmmConst(v128::from16p(op.si10 - 0x8000))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CLGTBI(spu_opcode_t op) @@ -4601,12 +4601,12 @@ void spu_recompiler::CLGTBI(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->psubb(va, XmmConst(v128::from8p(0x80))); c->pcmpgtb(va, XmmConst(v128::from8p(op.si10 - 0x80))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::HLGTI(spu_opcode_t op) { - c->cmp(SPU_OFF_32(gpr, op.ra, &v128::_u32, 3), +op.si10); + c->cmp(SPU_OFF_32(gpr[op.ra]._u32[3]), +op.si10); asmjit::Label label = c->newLabel(); asmjit::Label ret = c->newLabel(); @@ -4628,7 +4628,7 @@ void spu_recompiler::MPYI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pmaddwd(va, XmmConst(v128::from32p(op.si10 & 0xffff))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::MPYUI(spu_opcode_t op) @@ -4642,33 +4642,33 @@ void spu_recompiler::MPYUI(spu_opcode_t op) c->pmullw(va2, vi); c->pslld(va, 16); c->por(va, va2); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CEQI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pcmpeqd(va, XmmConst(v128::from32p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CEQHI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pcmpeqw(va, XmmConst(v128::from16p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::CEQBI(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->pcmpeqb(va, XmmConst(v128::from8p(op.si10))); - c->movdqa(SPU_OFF_128(gpr, op.rt), va); + c->movdqa(SPU_OFF_128(gpr[op.rt]), va); } void spu_recompiler::HEQI(spu_opcode_t op) { - c->cmp(SPU_OFF_32(gpr, op.ra, &v128::_u32, 3), +op.si10); + c->cmp(SPU_OFF_32(gpr[op.ra]._u32[3]), +op.si10); asmjit::Label label = 
c->newLabel(); asmjit::Label ret = c->newLabel(); @@ -4698,7 +4698,7 @@ void spu_recompiler::ILA(spu_opcode_t op) { const XmmLink& vr = XmmAlloc(); c->movdqa(vr, XmmConst(v128::from32p(op.i18))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vr); + c->movdqa(SPU_OFF_128(gpr[op.rt]), vr); } void spu_recompiler::SELB(spu_opcode_t op) @@ -4708,22 +4708,22 @@ void spu_recompiler::SELB(spu_opcode_t op) if (utils::has_avx512()) { - c->vpternlogd(vc, vb, SPU_OFF_128(gpr, op.ra), 0xca /* A?B:C */); - c->movdqa(SPU_OFF_128(gpr, op.rt4), vc); + c->vpternlogd(vc, vb, SPU_OFF_128(gpr[op.ra]), 0xca /* A?B:C */); + c->movdqa(SPU_OFF_128(gpr[op.rt4]), vc); return; } if (utils::has_xop()) { - c->vpcmov(vc, vb, SPU_OFF_128(gpr, op.ra), vc); - c->movdqa(SPU_OFF_128(gpr, op.rt4), vc); + c->vpcmov(vc, vb, SPU_OFF_128(gpr[op.ra]), vc); + c->movdqa(SPU_OFF_128(gpr[op.rt4]), vc); return; } c->pand(vb, vc); - c->pandn(vc, SPU_OFF_128(gpr, op.ra)); + c->pandn(vc, SPU_OFF_128(gpr[op.ra])); c->por(vb, vc); - c->movdqa(SPU_OFF_128(gpr, op.rt4), vb); + c->movdqa(SPU_OFF_128(gpr[op.rt4]), vb); } void spu_recompiler::SHUFB(spu_opcode_t op) @@ -4748,7 +4748,7 @@ void spu_recompiler::SHUFB(spu_opcode_t op) c->setExtraReg(asmjit::x86::k1); c->vpshufb(vt, vb, vm); // {k1} c->vpternlogd(vt, va, vc, 0xf6 /* orAxorBC */); - c->movdqa(SPU_OFF_128(gpr, op.rt4), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt4]), vt); return; } @@ -4800,7 +4800,7 @@ void spu_recompiler::SHUFB(spu_opcode_t op) c->por(vt, vc); } - c->movdqa(SPU_OFF_128(gpr, op.rt4), vt); + c->movdqa(SPU_OFF_128(gpr[op.rt4]), vt); } void spu_recompiler::MPYA(spu_opcode_t op) @@ -4812,8 +4812,8 @@ void spu_recompiler::MPYA(spu_opcode_t op) c->pand(va, vi); c->pand(vb, vi); c->pmaddwd(va, vb); - c->paddd(va, SPU_OFF_128(gpr, op.rc)); - c->movdqa(SPU_OFF_128(gpr, op.rt4), va); + c->paddd(va, SPU_OFF_128(gpr[op.rc])); + c->movdqa(SPU_OFF_128(gpr[op.rt4]), va); } void spu_recompiler::FNMS(spu_opcode_t op) @@ -4834,9 +4834,9 @@ void spu_recompiler::FNMS(spu_opcode_t op) c->andps(vb, v2); // vb = rb & ~rb_extended c->mulps(va, vb); - c->movaps(vb, SPU_OFF_128(gpr, op.rc)); + c->movaps(vb, SPU_OFF_128(gpr[op.rc])); c->subps(vb, va); - c->movaps(SPU_OFF_128(gpr, op.rt4), vb); + c->movaps(SPU_OFF_128(gpr[op.rt4]), vb); } void spu_recompiler::FMA(spu_opcode_t op) @@ -4857,8 +4857,8 @@ void spu_recompiler::FMA(spu_opcode_t op) c->andps(vb, v2); // vb = rb & ~rb_extended c->mulps(va, vb); - c->addps(va, SPU_OFF_128(gpr, op.rc)); - c->movaps(SPU_OFF_128(gpr, op.rt4), va); + c->addps(va, SPU_OFF_128(gpr[op.rc])); + c->movaps(SPU_OFF_128(gpr[op.rt4]), va); } void spu_recompiler::FMS(spu_opcode_t op) @@ -4879,6 +4879,6 @@ void spu_recompiler::FMS(spu_opcode_t op) c->andps(vb, v2); // vb = rb & ~rb_extended c->mulps(va, vb); - c->subps(va, SPU_OFF_128(gpr, op.rc)); - c->movaps(SPU_OFF_128(gpr, op.rt4), va); + c->subps(va, SPU_OFF_128(gpr[op.rc])); + c->movaps(SPU_OFF_128(gpr[op.rt4]), va); } diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h index 622ce7f6d..410de7b37 100644 --- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h +++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h @@ -5,8 +5,6 @@ #include -union v128; - // SPU ASMJIT Recompiler class spu_recompiler : public spu_recompiler_base { diff --git a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp index 74ae4055a..0a26b572d 100644 --- a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp @@ -203,7 +203,7 @@ DECLARE(spu_runtime::tr_all) = [] *raw++ = 0x41; *raw++ = 
0x8b; *raw++ = 0x45; - *raw++ = ::narrow(::offset32(&spu_thread::pc)); + *raw++ = ::narrow(OFFSET_OF(spu_thread, pc)); // Get LS address starting from PC: lea rcx, [rbp + rax] *raw++ = 0x48; @@ -233,7 +233,7 @@ DECLARE(spu_runtime::tr_all) = [] *raw++ = 0x49; *raw++ = 0xc7; *raw++ = 0x45; - *raw++ = ::narrow(::offset32(&spu_thread::block_hash)); + *raw++ = ::narrow(OFFSET_OF(spu_thread, block_hash)); *raw++ = 0x00; *raw++ = 0x00; *raw++ = 0x00; @@ -259,11 +259,11 @@ DECLARE(spu_runtime::tr_all) = [] // x19 = m_thread a.k.a arg[0] // x20 = ls_base // x21 - x22 = args[2 - 3] - // ensure(::offset32(&spu_thread::pc) <= 32760); - // ensure(::offset32(&spu_thread::block_hash) <= 32760); + // ensure(OFFSET_OF(spu_thread, pc) <= 32760); + // ensure(OFFSET_OF(spu_thread, block_hash) <= 32760); // Load PC - c.ldr(a64::w1, arm::Mem(a64::x19, ::offset32(&spu_thread::pc))); // REG_Base + offset(spu_thread::pc) + c.ldr(a64::w1, arm::Mem(a64::x19, OFFSET_OF(spu_thread, pc))); // REG_Base + offset(spu_thread::pc) // Compute LS address = REG_Sp + PC, store into x7 (use later) c.add(a64::x7, a64::x20, a64::x1); // Load 32b from LS address @@ -274,7 +274,7 @@ DECLARE(spu_runtime::tr_all) = [] c.mov(a64::x4, Imm(reinterpret_cast(g_dispatcher))); // Update block hash c.mov(a64::x5, Imm(0)); - c.str(a64::x5, arm::Mem(a64::x19, ::offset32(&spu_thread::block_hash))); // REG_Base + offset(spu_thread::block_hash) + c.str(a64::x5, arm::Mem(a64::x19, OFFSET_OF(spu_thread, block_hash))); // REG_Base + offset(spu_thread::block_hash) // Jump to [g_dispatcher + idx * 8] c.mov(a64::x6, Imm(8)); c.mul(a64::x6, a64::x3, a64::x6); @@ -327,7 +327,7 @@ DECLARE(spu_runtime::g_gateway) = build_function_asm("spu_gatewa #endif // Save native stack pointer for longjmp emulation - c.mov(x86::qword_ptr(args[0], ::offset32(&spu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs)), x86::rsp); + c.mov(x86::qword_ptr(args[0], OFFSET_OF(spu_thread, hv_ctx.regs)), x86::rsp); // Move 4 args (despite spu_function_t def) c.mov(x86::r13, args[0]); @@ -381,7 +381,7 @@ DECLARE(spu_runtime::g_gateway) = build_function_asm("spu_gatewa #elif defined(ARCH_ARM64) // Save non-volatile regs. We do this within the thread context instead of normal stack - const u32 hv_regs_base = ::offset32(&spu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs); + const u32 hv_regs_base = OFFSET_OF(spu_thread, hv_ctx.regs); // NOTE: A64 gp-gp-imm add only takes immediates of upto 4095. 
Larger numbers can work, but need to be multiples of 2 for lowering to replace the instruction correctly // Unfortunately asmjit fails silently on these patterns which can generate incorrect code c.mov(a64::x15, args[0]); @@ -447,14 +447,14 @@ DECLARE(spu_runtime::g_escape) = build_function_asm("spu_ #if defined(ARCH_X64) // Restore native stack pointer (longjmp emulation) - c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&spu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs))); + c.mov(x86::rsp, x86::qword_ptr(args[0], OFFSET_OF(spu_thread, hv_ctx.regs))); // Return to the return location c.sub(x86::rsp, 8); c.ret(); #elif defined(ARCH_ARM64) // Far ret, jumps to gateway epilogue - const u32 reg_base = ::offset32(&spu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs); + const u32 reg_base = OFFSET_OF(spu_thread, hv_ctx.regs); c.mov(a64::x19, args[0]); c.mov(a64::x15, Imm(reg_base)); c.add(a64::x15, a64::x15, args[0]); @@ -471,28 +471,28 @@ DECLARE(spu_runtime::g_tail_escape) = build_function_asm(::offset32(&spu_thread::block_hash)); + *raw++ = ::narrow(OFFSET_OF(spu_thread, block_hash)); // Load PC: mov eax, [r13 + spu_thread::pc] *raw++ = 0x41; *raw++ = 0x8b; *raw++ = 0x45; - *raw++ = ::narrow(::offset32(&spu_thread::pc)); + *raw++ = ::narrow(OFFSET_OF(spu_thread, pc)); // Get LS address starting from PC: lea rcx, [rbp + rax] *raw++ = 0x48; @@ -7824,18 +7824,18 @@ struct spu_fast : public spu_recompiler_base *raw++ = 0x48; *raw++ = 0x8d; *raw++ = 0x7d; - *raw++ = ::narrow(::offset32(&spu_thread::gpr)); + *raw++ = ::narrow(OFFSET_OF(spu_thread, gpr)); // Save base pc: mov [rbp + spu_thread::base_pc], eax *raw++ = 0x89; *raw++ = 0x45; - *raw++ = ::narrow(::offset32(&spu_thread::base_pc)); + *raw++ = ::narrow(OFFSET_OF(spu_thread, base_pc)); // inc block_counter *raw++ = 0x48; *raw++ = 0xff; *raw++ = 0x85; - const u32 blc_off = ::offset32(&spu_thread::block_counter); + const u32 blc_off = OFFSET_OF(spu_thread, block_counter); std::memcpy(raw, &blc_off, 4); raw += 4; @@ -7858,7 +7858,7 @@ struct spu_fast : public spu_recompiler_base *raw++ = 0x44; *raw++ = 0x89; *raw++ = 0x65; - *raw++ = ::narrow(::offset32(&spu_thread::pc)); + *raw++ = ::narrow(OFFSET_OF(spu_thread, pc)); // Epilogue: add rsp,0x28 *raw++ = 0x48; @@ -7890,7 +7890,7 @@ struct spu_fast : public spu_recompiler_base *raw++ = type == spu_itype::BRHZ || type == spu_itype::BRHNZ ? 
0x66 : 0x90; *raw++ = 0x83; *raw++ = 0xbd; - const u32 off = ::offset32(&spu_thread::gpr, op.rt) + 12; + const u32 off = OFFSET_OF(spu_thread, gpr[op.rt]) + 12; std::memcpy(raw, &off, 4); raw += 4; *raw++ = 0x00; @@ -7957,7 +7957,7 @@ struct spu_fast : public spu_recompiler_base // sub eax, [rbp + spu_thread::base_pc] *raw++ = 0x2b; *raw++ = 0x45; - *raw++ = ::narrow(::offset32(&spu_thread::base_pc)); + *raw++ = ::narrow(OFFSET_OF(spu_thread, base_pc)); // cmp eax, (0 - size) *raw++ = 0x3d; @@ -7992,7 +7992,7 @@ struct spu_fast : public spu_recompiler_base *raw++ = 0x44; *raw++ = 0x89; *raw++ = 0x65; - *raw++ = ::narrow(::offset32(&spu_thread::pc)); + *raw++ = ::narrow(OFFSET_OF(spu_thread, pc)); // Epilogue: add rsp,0x28 ; ret *raw++ = 0x48; diff --git a/rpcs3/Emu/Cell/SPUInterpreter.cpp b/rpcs3/Emu/Cell/SPUInterpreter.cpp index 537196b88..b1e76b01b 100644 --- a/rpcs3/Emu/Cell/SPUInterpreter.cpp +++ b/rpcs3/Emu/Cell/SPUInterpreter.cpp @@ -99,7 +99,7 @@ namespace asmjit c.shl(x86::eax, I + 4); } - const auto ptr = x86::oword_ptr(spu, x86::rax, 0, ::offset32(&spu_thread::gpr)); + const auto ptr = x86::oword_ptr(spu, x86::rax, 0, OFFSET_OF(spu_thread, gpr)); if (utils::has_avx()) { diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index eb92feb68..dfd1243f3 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -329,9 +329,9 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (!m_finfo->fn && !m_block) { - lr = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::gpr, +s_reg_lr, &v128::_u32, 3)); - sp = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::gpr, +s_reg_sp)); - r3 = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::gpr, 3)); + lr = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, gpr[+s_reg_lr]._u32[3]))); + sp = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, gpr[+s_reg_sp]))); + r3 = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, gpr[3]))); } else { @@ -348,8 +348,8 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (!m_finfo->fn) { lr = m_ir->CreateAnd(lr, 0x3fffc); - m_ir->CreateStore(lr, spu_ptr(&spu_thread::pc)); - m_ir->CreateStore(_call, spu_ptr(&spu_thread::gpr, 3)); + m_ir->CreateStore(lr, spu_ptr(OFFSET_OF(spu_thread, pc))); + m_ir->CreateStore(_call, spu_ptr(OFFSET_OF(spu_thread, gpr[3]))); m_ir->CreateBr(add_block_indirect({}, value(lr))); } else if (tail) @@ -392,7 +392,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator m_blocks.clear(); m_block_queue.clear(); m_ir->SetInsertPoint(llvm::BasicBlock::Create(m_context, "", m_function)); - m_memptr = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::memory_base_addr)); + m_memptr = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, memory_base_addr))); } // Add block with current block as a predecessor @@ -415,7 +415,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator m_lsptr = fn->getArg(1); m_base_pc = fn->getArg(2); m_ir->SetInsertPoint(llvm::BasicBlock::Create(m_context, "", fn)); - m_memptr = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::memory_base_addr)); + m_memptr = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, memory_base_addr))); // Load registers at the entry chunk for (u32 i = 0; i < s_reg_max; i++) @@ -452,7 +452,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator const auto fail = llvm::BasicBlock::Create(m_context, 
"", m_function); m_ir->CreateCondBr(m_ir->CreateICmpEQ(m_base_pc, m_ir->getInt32(m_base)), next, fail); m_ir->SetInsertPoint(fail); - m_ir->CreateStore(m_ir->getInt32(target), spu_ptr(&spu_thread::pc)); + m_ir->CreateStore(m_ir->getInt32(target), spu_ptr(OFFSET_OF(spu_thread, pc))); tail_chunk(nullptr); m_ir->SetInsertPoint(next); } @@ -490,7 +490,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator { ensure(!m_finfo->fn); - m_ir->CreateStore(m_ir->getInt32(target), spu_ptr(&spu_thread::pc)); + m_ir->CreateStore(m_ir->getInt32(target), spu_ptr(OFFSET_OF(spu_thread, pc))); } else { @@ -539,16 +539,16 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator return m_ir->CreateGEP(get_type(), base, offset); } - template - llvm::Value* spu_ptr(Args... offset_args) + template + llvm::Value* spu_ptr(std::uint32_t offset) { - return _ptr(m_thread, ::offset32(offset_args...)); + return _ptr(m_thread, offset); } - template - llvm::Value* spu_ptr(value_t add, Args... offset_args) + template + llvm::Value* spu_ptr(value_t add, std::uint32_t offset) { - const auto off = m_ir->CreateGEP(get_type(), m_thread, m_ir->getInt64(::offset32(offset_args...))); + const auto off = m_ir->CreateGEP(get_type(), m_thread, m_ir->getInt64(offset)); return m_ir->CreateAdd(off, add.value); } @@ -578,15 +578,15 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator { if (index < 128) { - return ::offset32(&spu_thread::gpr, index); + return OFFSET_OF(spu_thread, gpr[index]); } switch (index) { - case s_reg_mfc_eal: return ::offset32(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::eal); - case s_reg_mfc_lsa: return ::offset32(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::lsa); - case s_reg_mfc_tag: return ::offset32(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::tag); - case s_reg_mfc_size: return ::offset32(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::size); + case s_reg_mfc_eal: return OFFSET_OF(spu_thread, ch_mfc_cmd.eal); + case s_reg_mfc_lsa: return OFFSET_OF(spu_thread, ch_mfc_cmd.lsa); + case s_reg_mfc_tag: return OFFSET_OF(spu_thread, ch_mfc_cmd.tag); + case s_reg_mfc_size: return OFFSET_OF(spu_thread, ch_mfc_cmd.size); default: fmt::throw_exception("get_reg_offset(%u): invalid register index", index); } @@ -1049,13 +1049,13 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator // Update PC for current or explicitly specified instruction address void update_pc(u32 target = -1) { - m_ir->CreateStore(m_ir->CreateAnd(get_pc(target + 1 ? target : m_pos), 0x3fffc), spu_ptr(&spu_thread::pc))->setVolatile(true); + m_ir->CreateStore(m_ir->CreateAnd(get_pc(target + 1 ? 
target : m_pos), 0x3fffc), spu_ptr(OFFSET_OF(spu_thread, pc)))->setVolatile(true); } // Call cpu_thread::check_state if necessary and return or continue (full check) void check_state(u32 addr, bool may_be_unsafe_for_savestate = true) { - const auto pstate = spu_ptr(&spu_thread::state); + const auto pstate = spu_ptr(OFFSET_OF(spu_thread, state)); const auto _body = llvm::BasicBlock::Create(m_context, "", m_function); const auto check = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->CreateCondBr(m_ir->CreateICmpEQ(m_ir->CreateLoad(get_type(), pstate, true), m_ir->getInt32(0)), _body, check, m_md_likely); @@ -1069,14 +1069,14 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (may_be_unsafe_for_savestate) { - m_ir->CreateStore(m_ir->getInt8(1), spu_ptr(&spu_thread::unsavable))->setVolatile(true); + m_ir->CreateStore(m_ir->getInt8(1), spu_ptr(OFFSET_OF(spu_thread, unsavable)))->setVolatile(true); } m_ir->CreateCall(m_test_state, {m_thread}); if (may_be_unsafe_for_savestate) { - m_ir->CreateStore(m_ir->getInt8(0), spu_ptr(&spu_thread::unsavable))->setVolatile(true); + m_ir->CreateStore(m_ir->getInt8(0), spu_ptr(OFFSET_OF(spu_thread, unsavable)))->setVolatile(true); } m_ir->CreateBr(_body); @@ -1145,7 +1145,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator const auto _final = llvm::BasicBlock::Create(m_context, "__putllc16_final", m_function); const auto _eal = (get_reg_fixed(s_reg_mfc_eal) & -128).eval(m_ir); - const auto _raddr = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::raddr)); + const auto _raddr = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, raddr))); m_ir->CreateCondBr(m_ir->CreateAnd(m_ir->CreateICmpEQ(_eal, _raddr), m_ir->CreateIsNotNull(_raddr)), _raddr_match, _fail, m_md_likely); m_ir->SetInsertPoint(_raddr_match); @@ -1259,7 +1259,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator m_ir->SetInsertPoint(_fail); call("PUTLLC16_fail", +on_fail, m_thread, _eal); - m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_FAILURE), spu_ptr(&spu_thread::ch_atomic_stat)); + m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_FAILURE), spu_ptr(OFFSET_OF(spu_thread, ch_atomic_stat))); m_ir->CreateBr(_final); m_ir->SetInsertPoint(_final); @@ -1269,7 +1269,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator const auto diff = m_ir->CreateZExt(m_ir->CreateSub(dest, _lsa), get_type()); const auto _new = m_ir->CreateAlignedLoad(get_type(), _ptr(m_lsptr, dest), llvm::MaybeAlign{16}); - const auto _rdata = m_ir->CreateAlignedLoad(get_type(), _ptr(spu_ptr(&spu_thread::rdata), m_ir->CreateAnd(diff, 0x70)), llvm::MaybeAlign{16}); + const auto _rdata = m_ir->CreateAlignedLoad(get_type(), _ptr(spu_ptr(OFFSET_OF(spu_thread, rdata)), m_ir->CreateAnd(diff, 0x70)), llvm::MaybeAlign{16}); const bool is_accurate_op = !!g_cfg.core.spu_accurate_reservations; @@ -1289,8 +1289,8 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator // Touch memory (on the opposite side of the page) m_ir->CreateAtomicRMW(llvm::AtomicRMWInst::Or, _ptr(m_memptr, m_ir->CreateXor(_eal, 4096 / 2)), m_ir->getInt8(0), llvm::MaybeAlign{16}, llvm::AtomicOrdering::SequentiallyConsistent); - const auto rptr = _ptr(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::reserv_base_addr)), ((eal_val & 0xff80) >> 1).eval(m_ir)); - const auto rtime = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::rtime)); + const auto rptr = 
_ptr(m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, reserv_base_addr))), ((eal_val & 0xff80) >> 1).eval(m_ir)); + const auto rtime = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, rtime))); m_ir->CreateBr(_repeat_lock); m_ir->SetInsertPoint(_repeat_lock); @@ -1313,7 +1313,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator m_ir->SetInsertPoint(_lock_success); // Commit 16 bytes compare-exchange - const auto sudo_ptr = _ptr(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::memory_sudo_addr)), _eal); + const auto sudo_ptr = _ptr(m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, memory_sudo_addr))), _eal); m_ir->CreateCondBr( m_ir->CreateExtractValue(m_ir->CreateAtomicCmpXchg(_ptr(sudo_ptr, diff), _rdata, _new, llvm::MaybeAlign{16}, llvm::AtomicOrdering::SequentiallyConsistent, llvm::AtomicOrdering::SequentiallyConsistent), 1), _success_and_unlock, _fail_and_unlock); @@ -1331,13 +1331,13 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator // Perform unlocked vm::reservation_update if no physical memory changes needed m_ir->SetInsertPoint(_inc_res); - const auto rptr2 = _ptr(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::reserv_base_addr)), ((eal_val & 0xff80) >> 1).eval(m_ir)); + const auto rptr2 = _ptr(m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, reserv_base_addr))), ((eal_val & 0xff80) >> 1).eval(m_ir)); llvm::Value* old_val{}; if (true || is_accurate_op) { - old_val = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::rtime)); + old_val = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, rtime))); } else { @@ -1358,8 +1358,8 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator } m_ir->SetInsertPoint(_success); - m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_SUCCESS), spu_ptr(&spu_thread::ch_atomic_stat)); - m_ir->CreateStore(m_ir->getInt32(0), spu_ptr(&spu_thread::raddr)); + m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_SUCCESS), spu_ptr(OFFSET_OF(spu_thread, ch_atomic_stat))); + m_ir->CreateStore(m_ir->getInt32(0), spu_ptr(OFFSET_OF(spu_thread, raddr))); m_ir->CreateBr(_final); m_ir->SetInsertPoint(_fail_and_unlock); @@ -1368,7 +1368,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator m_ir->SetInsertPoint(_fail); call("PUTLLC16_fail", +on_fail, m_thread, _eal); - m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_FAILURE), spu_ptr(&spu_thread::ch_atomic_stat)); + m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_FAILURE), spu_ptr(OFFSET_OF(spu_thread, ch_atomic_stat))); m_ir->CreateBr(_final); m_ir->SetInsertPoint(_final); @@ -1408,7 +1408,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator const auto _final = llvm::BasicBlock::Create(m_context, "", m_function); const auto _eal = (get_reg_fixed(s_reg_mfc_eal) & -128).eval(m_ir); - const auto _raddr = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::raddr)); + const auto _raddr = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, raddr))); m_ir->CreateCondBr(m_ir->CreateAnd(m_ir->CreateICmpEQ(_eal, _raddr), m_ir->CreateIsNotNull(_raddr)), _next, _fail, m_md_likely); m_ir->SetInsertPoint(_next); @@ -1416,23 +1416,23 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator value_t eal_val; eal_val.value = _eal; - const auto rptr = _ptr(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::reserv_base_addr)), ((eal_val & 0xff80) 
>> 1).eval(m_ir)); - const auto rval = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::rtime)); + const auto rptr = _ptr(m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, reserv_base_addr))), ((eal_val & 0xff80) >> 1).eval(m_ir)); + const auto rval = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, rtime))); m_ir->CreateCondBr( m_ir->CreateExtractValue(m_ir->CreateAtomicCmpXchg(rptr, rval, m_ir->CreateAdd(rval, m_ir->getInt64(128)), llvm::MaybeAlign{16}, llvm::AtomicOrdering::SequentiallyConsistent, llvm::AtomicOrdering::SequentiallyConsistent), 1), _next0, g_cfg.core.spu_accurate_reservations ? _fail : _next0); // Succeed unconditionally m_ir->SetInsertPoint(_next0); // call("atomic_wait_engine::notify_all", static_cast(atomic_wait_engine::notify_all), rptr); - m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_SUCCESS), spu_ptr(&spu_thread::ch_atomic_stat)); + m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_SUCCESS), spu_ptr(OFFSET_OF(spu_thread, ch_atomic_stat))); m_ir->CreateBr(_final); m_ir->SetInsertPoint(_fail); call("PUTLLC0_fail", +on_fail, m_thread, _eal); - m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_FAILURE), spu_ptr(&spu_thread::ch_atomic_stat)); + m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_FAILURE), spu_ptr(OFFSET_OF(spu_thread, ch_atomic_stat))); m_ir->CreateBr(_final); m_ir->SetInsertPoint(_final); - m_ir->CreateStore(m_ir->getInt32(0), spu_ptr(&spu_thread::raddr)); + m_ir->CreateStore(m_ir->getInt32(0), spu_ptr(OFFSET_OF(spu_thread, raddr))); } public: @@ -1470,7 +1470,7 @@ public: { .debug_info = false, // Set to "true" to insert debug frames on x27 .use_stack_frames = false, // We don't need this since the SPU GW allocates global scratch on the stack - .hypervisor_context_offset = ::offset32(&spu_thread::hv_ctx), + .hypervisor_context_offset = OFFSET_OF(spu_thread, hv_ctx), .exclusion_callback = should_exclude_function, .base_register_lookup = {} // Unused, always x19 on SPU }; @@ -1618,10 +1618,10 @@ public: const auto label_stop = BasicBlock::Create(m_context, "", m_function); // Load PC, which will be the actual value of 'm_base' - m_base_pc = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::pc)); + m_base_pc = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, pc))); // Emit state check - const auto pstate = spu_ptr(&spu_thread::state); + const auto pstate = spu_ptr(OFFSET_OF(spu_thread, state)); m_ir->CreateCondBr(m_ir->CreateICmpNE(m_ir->CreateLoad(get_type(), pstate), m_ir->getInt32(0)), label_stop, label_test, m_md_unlikely); // Emit code check @@ -1630,7 +1630,7 @@ public: // Set block hash for profiling (if enabled) if (g_cfg.core.spu_prof && g_cfg.core.spu_verification) - m_ir->CreateStore(m_ir->getInt64((m_hash_start & -65536)), spu_ptr(&spu_thread::block_hash)); + m_ir->CreateStore(m_ir->getInt64((m_hash_start & -65536)), spu_ptr(OFFSET_OF(spu_thread, block_hash))); if (!g_cfg.core.spu_verification) { @@ -1893,7 +1893,7 @@ public: // Increase block counter with statistics m_ir->SetInsertPoint(label_body); - const auto pbcount = spu_ptr(&spu_thread::block_counter); + const auto pbcount = spu_ptr(OFFSET_OF(spu_thread, block_counter)); m_ir->CreateStore(m_ir->CreateAdd(m_ir->CreateLoad(get_type(), pbcount), m_ir->getInt64(check_iterations)), pbcount); // Call the entry function chunk @@ -1927,7 +1927,7 @@ public: if (g_cfg.core.spu_verification) { - const auto pbfail = spu_ptr(&spu_thread::block_failure); + const auto pbfail = 
spu_ptr(OFFSET_OF(spu_thread, block_failure)); m_ir->CreateStore(m_ir->CreateAdd(m_ir->CreateLoad(get_type(), pbfail), m_ir->getInt64(1)), pbfail); const auto dispci = call("spu_dispatch", spu_runtime::tr_dispatch, m_thread, m_lsptr, main_arg2); dispci->setCallingConv(CallingConv::GHC); @@ -1987,7 +1987,7 @@ public: // Set block hash for profiling (if enabled) if (g_cfg.core.spu_prof) - m_ir->CreateStore(m_ir->getInt64((m_hash_start & -65536) | (m_entry >> 2)), spu_ptr(&spu_thread::block_hash)); + m_ir->CreateStore(m_ir->getInt64((m_hash_start & -65536) | (m_entry >> 2)), spu_ptr(OFFSET_OF(spu_thread, block_hash))); m_finfo = &m_functions[m_entry]; m_ir->CreateBr(add_block(m_entry)); @@ -2918,7 +2918,7 @@ public: set_function(main_func); // Load pc and opcode - m_interp_pc = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::pc)); + m_interp_pc = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, pc))); m_interp_op = m_ir->CreateLoad(get_type(), m_ir->CreateGEP(get_type(), m_lsptr, m_ir->CreateZExt(m_interp_pc, get_type()))); m_interp_op = m_ir->CreateCall(get_intrinsic(Intrinsic::bswap), {m_interp_op}); @@ -2932,7 +2932,7 @@ public: m_interp_regs = _ptr(m_thread, get_reg_offset(0)); // Save host thread's stack pointer - const auto native_sp = spu_ptr(&spu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs); + const auto native_sp = spu_ptr(OFFSET_OF(spu_thread, hv_ctx.regs)); #if defined(ARCH_X64) const auto rsp_name = MetadataAsValue::get(m_context, MDNode::get(m_context, {MDString::get(m_context, "rsp")})); #elif defined(ARCH_ARM64) @@ -3018,7 +3018,7 @@ public: m_interp_regs = f->getArg(6); m_ir->SetInsertPoint(BasicBlock::Create(m_context, "", f)); - m_memptr = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::memory_base_addr)); + m_memptr = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, memory_base_addr))); switch (itype) { @@ -3034,7 +3034,7 @@ public: case spu_itype::WRCH: { // Invalid or abortable instruction. Save current address. - m_ir->CreateStore(m_interp_pc, spu_ptr(&spu_thread::pc)); + m_ir->CreateStore(m_interp_pc, spu_ptr(OFFSET_OF(spu_thread, pc))); [[fallthrough]]; } default: @@ -3078,7 +3078,7 @@ public: { if (check) { - m_ir->CreateStore(m_interp_pc, spu_ptr(&spu_thread::pc)); + m_ir->CreateStore(m_interp_pc, spu_ptr(OFFSET_OF(spu_thread, pc))); } // Decode next instruction. @@ -3115,9 +3115,9 @@ public: { const auto _stop = BasicBlock::Create(m_context, "", f); const auto _next = BasicBlock::Create(m_context, "", f); - m_ir->CreateCondBr(m_ir->CreateIsNotNull(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::state))), _stop, _next, m_md_unlikely); + m_ir->CreateCondBr(m_ir->CreateIsNotNull(m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, state)))), _stop, _next, m_md_unlikely); m_ir->SetInsertPoint(_stop); - m_ir->CreateStore(m_interp_pc, spu_ptr(&spu_thread::pc)); + m_ir->CreateStore(m_interp_pc, spu_ptr(OFFSET_OF(spu_thread, pc))); const auto escape_yes = BasicBlock::Create(m_context, "", f); const auto escape_no = BasicBlock::Create(m_context, "", f); @@ -3171,7 +3171,7 @@ public: // Call next instruction. 
const auto _stop = BasicBlock::Create(m_context, "", f); const auto _next = BasicBlock::Create(m_context, "", f); - m_ir->CreateCondBr(m_ir->CreateIsNotNull(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::state))), _stop, _next, m_md_unlikely); + m_ir->CreateCondBr(m_ir->CreateIsNotNull(m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, state)))), _stop, _next, m_md_unlikely); m_ir->SetInsertPoint(_next); if (itype == spu_itype::WRCH || @@ -3189,7 +3189,7 @@ public: ncall->setTailCall(); m_ir->CreateRetVoid(); m_ir->SetInsertPoint(_stop); - m_ir->CreateStore(m_interp_pc, spu_ptr(&spu_thread::pc)); + m_ir->CreateStore(m_interp_pc, spu_ptr(OFFSET_OF(spu_thread, pc))); call("spu_escape", spu_runtime::g_escape, m_thread)->setTailCall(); m_ir->CreateRetVoid(); } @@ -3314,7 +3314,7 @@ public: { if (m_interp_magn) { - m_ir->CreateStore(m_interp_pc, spu_ptr(&spu_thread::pc)); + m_ir->CreateStore(m_interp_pc, spu_ptr(OFFSET_OF(spu_thread, pc))); call("spu_unknown", &exec_unk, m_thread, m_ir->getInt32(op_unk.opcode)); return; } @@ -3469,7 +3469,7 @@ public: { case SPU_RdSRR0: { - res.value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::srr0)); + res.value = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, srr0))); break; } case SPU_RdInMbox: @@ -3481,36 +3481,36 @@ public: } case MFC_RdTagStat: { - res.value = get_rdch(op, ::offset32(&spu_thread::ch_tag_stat), false); + res.value = get_rdch(op, OFFSET_OF(spu_thread, ch_tag_stat), false); break; } case MFC_RdTagMask: { - res.value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_tag_mask)); + res.value = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, ch_tag_mask))); break; } case SPU_RdSigNotify1: { update_pc(); ensure_gpr_stores(); - res.value = get_rdch(op, ::offset32(&spu_thread::ch_snr1), true); + res.value = get_rdch(op, OFFSET_OF(spu_thread, ch_snr1), true); break; } case SPU_RdSigNotify2: { update_pc(); ensure_gpr_stores(); - res.value = get_rdch(op, ::offset32(&spu_thread::ch_snr2), true); + res.value = get_rdch(op, OFFSET_OF(spu_thread, ch_snr2), true); break; } case MFC_RdAtomicStat: { - res.value = get_rdch(op, ::offset32(&spu_thread::ch_atomic_stat), false); + res.value = get_rdch(op, OFFSET_OF(spu_thread, ch_atomic_stat), false); break; } case MFC_RdListStallStat: { - res.value = get_rdch(op, ::offset32(&spu_thread::ch_stall_stat), false); + res.value = get_rdch(op, OFFSET_OF(spu_thread, ch_stall_stat), false); break; } case SPU_RdDec: @@ -3519,13 +3519,13 @@ public: if (utils::get_tsc_freq() && !(g_cfg.core.spu_loop_detection) && (g_cfg.core.clocks_scale == 100)) { const auto timebase_offs = m_ir->CreateLoad(get_type(), m_ir->CreateIntToPtr(m_ir->getInt64(reinterpret_cast(&g_timebase_offs)), get_type())); - const auto timestamp = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_dec_start_timestamp)); - const auto dec_value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_dec_value)); + const auto timestamp = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, ch_dec_start_timestamp))); + const auto dec_value = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, ch_dec_value))); const auto tsc = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::x86_rdtsc)); const auto tscx = m_ir->CreateMul(m_ir->CreateUDiv(tsc, m_ir->getInt64(utils::get_tsc_freq())), m_ir->getInt64(80000000)); const auto tscm = m_ir->CreateUDiv(m_ir->CreateMul(m_ir->CreateURem(tsc, m_ir->getInt64(utils::get_tsc_freq())), m_ir->getInt64(80000000)), m_ir->getInt64(utils::get_tsc_freq())); const auto tsctb = 
m_ir->CreateSub(m_ir->CreateAdd(tscx, tscm), timebase_offs); - const auto frz = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::is_dec_frozen)); + const auto frz = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, is_dec_frozen))); const auto frzev = m_ir->CreateICmpEQ(frz, m_ir->getInt8(0)); const auto delta = m_ir->CreateTrunc(m_ir->CreateSub(tsctb, timestamp), get_type()); @@ -3539,7 +3539,7 @@ public: } case SPU_RdEventMask: { - const auto value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_events)); + const auto value = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, ch_events))); value->setAtomic(llvm::AtomicOrdering::Acquire); res.value = m_ir->CreateTrunc(m_ir->CreateLShr(value, 32), get_type()); break; @@ -3554,22 +3554,22 @@ public: } else { - m_ir->CreateStore(m_ir->getInt8(1), spu_ptr(&spu_thread::unsavable)); + m_ir->CreateStore(m_ir->getInt8(1), spu_ptr(OFFSET_OF(spu_thread, unsavable))); } res.value = call("spu_read_events", &exec_read_events, m_thread); if (!g_cfg.savestate.compatible_mode) { - m_ir->CreateStore(m_ir->getInt8(0), spu_ptr(&spu_thread::unsavable)); + m_ir->CreateStore(m_ir->getInt8(0), spu_ptr(OFFSET_OF(spu_thread, unsavable))); } break; } case SPU_RdMachStat: { - res.value = m_ir->CreateZExt(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::interrupts_enabled)), get_type()); - res.value = m_ir->CreateOr(res.value, m_ir->CreateAnd(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::thread_type)), m_ir->getInt32(2))); + res.value = m_ir->CreateZExt(m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, interrupts_enabled))), get_type()); + res.value = m_ir->CreateOr(res.value, m_ir->CreateAnd(m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, thread_type))), m_ir->getInt32(2))); break; } @@ -3673,22 +3673,22 @@ public: { case SPU_WrOutMbox: { - res.value = wait_rchcnt(::offset32(&spu_thread::ch_out_mbox), true); + res.value = wait_rchcnt(OFFSET_OF(spu_thread, ch_out_mbox), true); break; } case SPU_WrOutIntrMbox: { - res.value = wait_rchcnt(::offset32(&spu_thread::ch_out_intr_mbox), true); + res.value = wait_rchcnt(OFFSET_OF(spu_thread, ch_out_intr_mbox), true); break; } case SPU_RdSigNotify1: { - res.value = wait_rchcnt(::offset32(&spu_thread::ch_snr1)); + res.value = wait_rchcnt(OFFSET_OF(spu_thread, ch_snr1)); break; } case SPU_RdSigNotify2: { - res.value = wait_rchcnt(::offset32(&spu_thread::ch_snr2)); + res.value = wait_rchcnt(OFFSET_OF(spu_thread, ch_snr2)); break; } case SPU_RdInMbox: @@ -3698,7 +3698,7 @@ public: return ch->pop_wait(*_spu, false), ch->get_count(); }; - res.value = call("wait_spu_inbox", +wait_inbox, m_thread, spu_ptr(&spu_thread::ch_in_mbox)); + res.value = call("wait_spu_inbox", +wait_inbox, m_thread, spu_ptr(OFFSET_OF(spu_thread, ch_in_mbox))); break; } default: break; @@ -3715,37 +3715,37 @@ public: { case SPU_WrOutMbox: { - res.value = get_rchcnt(::offset32(&spu_thread::ch_out_mbox), true); + res.value = get_rchcnt(OFFSET_OF(spu_thread, ch_out_mbox), true); break; } case SPU_WrOutIntrMbox: { - res.value = get_rchcnt(::offset32(&spu_thread::ch_out_intr_mbox), true); + res.value = get_rchcnt(OFFSET_OF(spu_thread, ch_out_intr_mbox), true); break; } case MFC_RdTagStat: { - res.value = get_rchcnt(::offset32(&spu_thread::ch_tag_stat)); + res.value = get_rchcnt(OFFSET_OF(spu_thread, ch_tag_stat)); break; } case MFC_RdListStallStat: { - res.value = get_rchcnt(::offset32(&spu_thread::ch_stall_stat)); + res.value = get_rchcnt(OFFSET_OF(spu_thread, ch_stall_stat)); break; } case SPU_RdSigNotify1: { - 
res.value = get_rchcnt(::offset32(&spu_thread::ch_snr1)); + res.value = get_rchcnt(OFFSET_OF(spu_thread, ch_snr1)); break; } case SPU_RdSigNotify2: { - res.value = get_rchcnt(::offset32(&spu_thread::ch_snr2)); + res.value = get_rchcnt(OFFSET_OF(spu_thread, ch_snr2)); break; } case MFC_RdAtomicStat: { - res.value = get_rchcnt(::offset32(&spu_thread::ch_atomic_stat)); + res.value = get_rchcnt(OFFSET_OF(spu_thread, ch_atomic_stat)); break; } case MFC_WrTagUpdate: @@ -3755,13 +3755,13 @@ public: } case MFC_Cmd: { - res.value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::mfc_size)); + res.value = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, mfc_size))); res.value = m_ir->CreateSub(m_ir->getInt32(16), res.value); break; } case SPU_RdInMbox: { - const auto value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_in_mbox)); + const auto value = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, ch_in_mbox))); value->setAtomic(llvm::AtomicOrdering::Acquire); res.value = value; res.value = m_ir->CreateLShr(res.value, 8); @@ -3770,7 +3770,7 @@ public: } case SPU_RdEventStat: { - const auto mask = m_ir->CreateTrunc(m_ir->CreateLShr(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_events)), 32), get_type()); + const auto mask = m_ir->CreateTrunc(m_ir->CreateLShr(m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, ch_events))), 32), get_type()); res.value = call("spu_get_events", &exec_get_events, m_thread, mask); break; } @@ -3868,7 +3868,7 @@ public: { case SPU_WrSRR0: { - m_ir->CreateStore(eval(val & 0x3fffc).value, spu_ptr(&spu_thread::srr0)); + m_ir->CreateStore(eval(val & 0x3fffc).value, spu_ptr(OFFSET_OF(spu_thread, srr0))); return; } case SPU_WrOutIntrMbox: @@ -3884,10 +3884,10 @@ public: case MFC_WrTagMask: { // TODO - m_ir->CreateStore(val.value, spu_ptr(&spu_thread::ch_tag_mask)); + m_ir->CreateStore(val.value, spu_ptr(OFFSET_OF(spu_thread, ch_tag_mask))); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); const auto _mfc = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(m_ir->CreateICmpNE(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_tag_upd)), m_ir->getInt32(MFC_TAG_UPDATE_IMMEDIATE)), _mfc, next); + m_ir->CreateCondBr(m_ir->CreateICmpNE(m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, ch_tag_upd))), m_ir->getInt32(MFC_TAG_UPDATE_IMMEDIATE)), _mfc, next); m_ir->SetInsertPoint(_mfc); update_pc(); call("spu_write_channel", &exec_wrch, m_thread, m_ir->getInt32(op.ra), val.value); @@ -3899,11 +3899,11 @@ public: { if (true) { - const auto tag_mask = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_tag_mask)); - const auto mfc_fence = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::mfc_fence)); + const auto tag_mask = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, ch_tag_mask))); + const auto mfc_fence = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, mfc_fence))); const auto completed = m_ir->CreateAnd(tag_mask, m_ir->CreateNot(mfc_fence)); - const auto upd_ptr = spu_ptr(&spu_thread::ch_tag_upd); - const auto stat_ptr = spu_ptr(&spu_thread::ch_tag_stat); + const auto upd_ptr = spu_ptr(OFFSET_OF(spu_thread, ch_tag_upd)); + const auto stat_ptr = spu_ptr(OFFSET_OF(spu_thread, ch_tag_stat)); const auto stat_val = m_ir->CreateOr(m_ir->CreateZExt(completed, get_type()), s64{smin}); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); @@ -3955,7 +3955,7 @@ public: } spu_log.warning("[0x%x] MFC_EAH: $%u is not a zero constant", m_pos, +op.rt); - // 
m_ir->CreateStore(val.value, spu_ptr(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::eah)); + // m_ir->CreateStore(val.value, spu_ptr(OFFSET_OF(spu_thread, ch_mfc_cmd.eah))); return; } case MFC_EAL: @@ -4009,8 +4009,8 @@ public: const auto fail = llvm::BasicBlock::Create(m_context, "", m_function); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto pf = spu_ptr(&spu_thread::mfc_fence); - const auto pb = spu_ptr(&spu_thread::mfc_barrier); + const auto pf = spu_ptr(OFFSET_OF(spu_thread, mfc_fence)); + const auto pb = spu_ptr(OFFSET_OF(spu_thread, mfc_barrier)); switch (u64 cmd = ci->getZExtValue()) { @@ -4035,7 +4035,7 @@ public: m_ir->SetInsertPoint(fail); m_ir->CreateUnreachable(); m_ir->SetInsertPoint(next); - m_ir->CreateStore(ci, spu_ptr(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::cmd)); + m_ir->CreateStore(ci, spu_ptr(OFFSET_OF(spu_thread, ch_mfc_cmd.cmd))); update_pc(); ensure_gpr_stores(); call("spu_exec_mfc_cmd_saveable", &exec_mfc_cmd, m_thread); @@ -4054,7 +4054,7 @@ public: m_ir->SetInsertPoint(fail); m_ir->CreateUnreachable(); m_ir->SetInsertPoint(next); - m_ir->CreateStore(ci, spu_ptr(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::cmd)); + m_ir->CreateStore(ci, spu_ptr(OFFSET_OF(spu_thread, ch_mfc_cmd.cmd))); update_pc(); call("spu_exec_mfc_cmd", &exec_mfc_cmd, m_thread); return; @@ -4114,7 +4114,7 @@ public: m_ir->SetInsertPoint(mmio); } - m_ir->CreateStore(ci, spu_ptr(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::cmd)); + m_ir->CreateStore(ci, spu_ptr(OFFSET_OF(spu_thread, ch_mfc_cmd.cmd))); call("spu_exec_mfc_cmd", &exec_mfc_cmd, m_thread); m_ir->CreateBr(next); m_ir->SetInsertPoint(copy); @@ -4206,7 +4206,7 @@ public: } // Disable certain thing - m_ir->CreateStore(m_ir->getInt32(0), spu_ptr(&spu_thread::last_faddr)); + m_ir->CreateStore(m_ir->getInt32(0), spu_ptr(OFFSET_OF(spu_thread, last_faddr))); m_ir->CreateBr(next); break; } @@ -4214,7 +4214,7 @@ public: case MFC_EIEIO_CMD: case MFC_SYNC_CMD: { - const auto cond = m_ir->CreateIsNull(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::mfc_size))); + const auto cond = m_ir->CreateIsNull(m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, mfc_size)))); m_ir->CreateCondBr(cond, exec, fail, m_md_likely); m_ir->SetInsertPoint(exec); m_ir->CreateFence(llvm::AtomicOrdering::SequentiallyConsistent); @@ -4236,12 +4236,12 @@ public: m_ir->SetInsertPoint(fail); // Get MFC slot, redirect to invalid memory address - const auto slot = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::mfc_size)); - const auto off0 = m_ir->CreateAdd(m_ir->CreateMul(slot, m_ir->getInt32(sizeof(spu_mfc_cmd))), m_ir->getInt32(::offset32(&spu_thread::mfc_queue))); + const auto slot = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, mfc_size))); + const auto off0 = m_ir->CreateAdd(m_ir->CreateMul(slot, m_ir->getInt32(sizeof(spu_mfc_cmd))), m_ir->getInt32(OFFSET_OF(spu_thread, mfc_queue))); const auto ptr0 = m_ir->CreateGEP(get_type(), m_thread, m_ir->CreateZExt(off0, get_type())); const auto ptr1 = m_ir->CreateGEP(get_type(), m_memptr, m_ir->getInt64(0xffdeadf0)); const auto pmfc = m_ir->CreateSelect(m_ir->CreateICmpULT(slot, m_ir->getInt32(16)), ptr0, ptr1); - m_ir->CreateStore(ci, _ptr(pmfc, ::offset32(&spu_mfc_cmd::cmd))); + m_ir->CreateStore(ci, _ptr(pmfc, OFFSET_OF(spu_mfc_cmd, cmd))); switch (u64 cmd = ci->getZExtValue()) { @@ -4281,10 +4281,10 @@ public: case MFC_GETB_CMD: case MFC_GETF_CMD: { - m_ir->CreateStore(tag.value, _ptr(pmfc, ::offset32(&spu_mfc_cmd::tag))); - m_ir->CreateStore(size.value, _ptr(pmfc, 
::offset32(&spu_mfc_cmd::size))); - m_ir->CreateStore(lsa.value, _ptr(pmfc, ::offset32(&spu_mfc_cmd::lsa))); - m_ir->CreateStore(eal.value, _ptr(pmfc, ::offset32(&spu_mfc_cmd::eal))); + m_ir->CreateStore(tag.value, _ptr(pmfc, OFFSET_OF(spu_mfc_cmd, tag))); + m_ir->CreateStore(size.value, _ptr(pmfc, OFFSET_OF(spu_mfc_cmd, size))); + m_ir->CreateStore(lsa.value, _ptr(pmfc, OFFSET_OF(spu_mfc_cmd, lsa))); + m_ir->CreateStore(eal.value, _ptr(pmfc, OFFSET_OF(spu_mfc_cmd, eal))); m_ir->CreateStore(m_ir->CreateOr(m_ir->CreateLoad(get_type(), pf), mask), pf); if (cmd & MFC_BARRIER_MASK) m_ir->CreateStore(m_ir->CreateOr(m_ir->CreateLoad(get_type(), pb), mask), pb); @@ -4305,7 +4305,7 @@ public: } } - m_ir->CreateStore(m_ir->CreateAdd(slot, m_ir->getInt32(1)), spu_ptr(&spu_thread::mfc_size)); + m_ir->CreateStore(m_ir->CreateAdd(slot, m_ir->getInt32(1)), spu_ptr(OFFSET_OF(spu_thread, mfc_size))); m_ir->CreateBr(next); m_ir->SetInsertPoint(next); return; @@ -4318,7 +4318,7 @@ public: case MFC_WrListStallAck: { const auto mask = eval(splat(1) << (val & 0x1f)); - const auto _ptr = spu_ptr(&spu_thread::ch_stall_mask); + const auto _ptr = spu_ptr(OFFSET_OF(spu_thread, ch_stall_mask)); const auto _old = m_ir->CreateLoad(get_type(), _ptr); const auto _new = m_ir->CreateAnd(_old, m_ir->CreateNot(mask.value)); m_ir->CreateStore(_new, _ptr); @@ -4345,16 +4345,16 @@ public: const auto tscx = m_ir->CreateMul(m_ir->CreateUDiv(tsc, m_ir->getInt64(utils::get_tsc_freq())), m_ir->getInt64(80000000)); const auto tscm = m_ir->CreateUDiv(m_ir->CreateMul(m_ir->CreateURem(tsc, m_ir->getInt64(utils::get_tsc_freq())), m_ir->getInt64(80000000)), m_ir->getInt64(utils::get_tsc_freq())); const auto tsctb = m_ir->CreateSub(m_ir->CreateAdd(tscx, tscm), timebase_offs); - m_ir->CreateStore(tsctb, spu_ptr(&spu_thread::ch_dec_start_timestamp)); + m_ir->CreateStore(tsctb, spu_ptr(OFFSET_OF(spu_thread, ch_dec_start_timestamp))); } else #endif { - m_ir->CreateStore(call("get_timebased_time", &get_timebased_time), spu_ptr(&spu_thread::ch_dec_start_timestamp)); + m_ir->CreateStore(call("get_timebased_time", &get_timebased_time), spu_ptr(OFFSET_OF(spu_thread, ch_dec_start_timestamp))); } - m_ir->CreateStore(val.value, spu_ptr(&spu_thread::ch_dec_value)); - m_ir->CreateStore(m_ir->getInt8(0), spu_ptr(&spu_thread::is_dec_frozen)); + m_ir->CreateStore(val.value, spu_ptr(OFFSET_OF(spu_thread, ch_dec_value))); + m_ir->CreateStore(m_ir->getInt8(0), spu_ptr(OFFSET_OF(spu_thread, is_dec_frozen))); return; } case SPU_Set_Bkmk_Tag: @@ -7641,7 +7641,7 @@ public: m_ir->CreateCondBr(cond.value, halt, next, m_md_unlikely); m_ir->SetInsertPoint(halt); if (m_interp_magn) - m_ir->CreateStore(m_function->getArg(2), spu_ptr(&spu_thread::pc)); + m_ir->CreateStore(m_function->getArg(2), spu_ptr(OFFSET_OF(spu_thread, pc))); else update_pc(); const auto ptr = _ptr(m_memptr, 0xffdead00); @@ -7748,7 +7748,7 @@ public: target->addIncoming(e_addr, e_exec); m_ir->CreateCondBr(get_imm(op.d).value, d_exec, d_done, m_md_unlikely); m_ir->SetInsertPoint(d_exec); - m_ir->CreateStore(m_ir->getFalse(), spu_ptr(&spu_thread::interrupts_enabled)); + m_ir->CreateStore(m_ir->getFalse(), spu_ptr(OFFSET_OF(spu_thread, interrupts_enabled))); m_ir->CreateBr(d_done); m_ir->SetInsertPoint(d_done); m_ir->CreateBr(m_interp_bblock); @@ -7784,7 +7784,7 @@ public: } else { - sp.value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::gpr, 1, &v128::_u32, 3)); + sp.value = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, gpr[1]._u32[3]))); } } @@ -7799,15 +7799,15 @@ public: if 
(op.d) { - m_ir->CreateStore(m_ir->getFalse(), spu_ptr(&spu_thread::interrupts_enabled)); + m_ir->CreateStore(m_ir->getFalse(), spu_ptr(OFFSET_OF(spu_thread, interrupts_enabled))); } - m_ir->CreateStore(addr.value, spu_ptr(&spu_thread::pc)); + m_ir->CreateStore(addr.value, spu_ptr(OFFSET_OF(spu_thread, pc))); if (ret && g_cfg.core.spu_block_size >= spu_block_size_type::mega) { // Compare address stored in stack mirror with addr - const auto stack0 = eval(zext(sp) + ::offset32(&spu_thread::stack_mirror)); + const auto stack0 = eval(zext(sp) + OFFSET_OF(spu_thread, stack_mirror)); const auto stack1 = eval(stack0 + 8); const auto _ret = m_ir->CreateLoad(get_type(), m_ir->CreateGEP(get_type(), m_thread, stack0.value)); const auto link = m_ir->CreateLoad(get_type(), m_ir->CreateGEP(get_type(), m_thread, stack1.value)); @@ -8070,7 +8070,7 @@ public: if (op.d && tfound != m_targets.end() && tfound->second.size() == 1 && tfound->second[0] == spu_branch_target(m_pos, 1)) { // Interrupts-disable pattern - m_ir->CreateStore(m_ir->getFalse(), spu_ptr(&spu_thread::interrupts_enabled)); + m_ir->CreateStore(m_ir->getFalse(), spu_ptr(OFFSET_OF(spu_thread, interrupts_enabled))); return; } @@ -8130,7 +8130,7 @@ public: // Exit function on unexpected target m_ir->SetInsertPoint(sw->getDefaultDest()); - m_ir->CreateStore(addr.value, spu_ptr(&spu_thread::pc)); + m_ir->CreateStore(addr.value, spu_ptr(OFFSET_OF(spu_thread, pc))); if (m_finfo && m_finfo->fn) { @@ -8165,7 +8165,7 @@ public: if (m_block) m_block->block_end = m_ir->GetInsertBlock(); value_t srr0; - srr0.value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::srr0)); + srr0.value = m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, srr0))); m_ir->CreateBr(add_block_indirect(op, srr0)); } @@ -8175,7 +8175,7 @@ public: m_block->block_end = m_ir->GetInsertBlock(); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); set_link(op); - const auto mask = m_ir->CreateTrunc(m_ir->CreateLShr(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_events), true), 32), get_type()); + const auto mask = m_ir->CreateTrunc(m_ir->CreateLShr(m_ir->CreateLoad(get_type(), spu_ptr(OFFSET_OF(spu_thread, ch_events)), true), 32), get_type()); const auto res = call("spu_get_events", &exec_get_events, m_thread, mask); const auto target = add_block_indirect(op, addr); m_ir->CreateCondBr(m_ir->CreateICmpNE(res, m_ir->getInt32(0)), target, add_block_next()); @@ -8507,7 +8507,7 @@ public: { // Store the return function chunk address at the stack mirror const auto pfunc = add_function(m_pos + 4); - const auto stack0 = eval(zext(extract(get_reg_fixed(1), 3) & 0x3fff0) + ::offset32(&spu_thread::stack_mirror)); + const auto stack0 = eval(zext(extract(get_reg_fixed(1), 3) & 0x3fff0) + OFFSET_OF(spu_thread, stack_mirror)); const auto stack1 = eval(stack0 + 8); const auto rel_ptr = m_ir->CreateSub(m_ir->CreatePtrToInt(pfunc->chunk, get_type()), get_segment_base()); const auto ptr_plus_op = m_ir->CreateOr(m_ir->CreateShl(rel_ptr, 32), m_ir->getInt64(m_next_op)); diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index 27f1434dc..e6d8c0a70 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -695,7 +695,7 @@ const auto spu_putllc_tx = build_function_asm(&g_rtm_tx_limit2))); @@ -703,7 +703,7 @@ const auto spu_putllc_tx = build_function_asm(cpu_flag::pause)); + c.bt(x86::dword_ptr(args[2], OFFSET_OF(spu_thread, state) - OFFSET_OF(spu_thread, rdata)), static_cast(cpu_flag::pause)); c.jc(fall); c.xbegin(tx1); @@ -761,7 +761,7 @@ 
const auto spu_putllc_tx = build_function_asm(&g_rtm_tx_limit1))); @@ -1039,7 +1039,7 @@ const auto spu_getllar_tx = build_function_asm(cpu_flag::pause)); + c.bt(x86::dword_ptr(args[2], OFFSET_OF(spu_thread, state)), static_cast(cpu_flag::pause)); c.jc(fall); c.mov(x86::rax, x86::qword_ptr(x86::r11)); c.and_(x86::rax, -128); @@ -1068,7 +1068,7 @@ const auto spu_getllar_tx = build_function_asm lv2_cond::load(utils::serial& ar) { - return load_func(make_shared(stx::exact_t(ar))); + return load_func(make_shared(exact_t(ar))); } void lv2_cond::save(utils::serial& ar) diff --git a/rpcs3/Emu/Cell/lv2/sys_event.cpp b/rpcs3/Emu/Cell/lv2/sys_event.cpp index 946bbd137..c65efd62c 100644 --- a/rpcs3/Emu/Cell/lv2/sys_event.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_event.cpp @@ -27,7 +27,7 @@ lv2_event_queue::lv2_event_queue(utils::serial& ar) noexcept std::function lv2_event_queue::load(utils::serial& ar) { - auto queue = make_shared(stx::exact_t(ar)); + auto queue = make_shared(exact_t(ar)); return [ptr = lv2_obj::load(queue->key, queue)](void* storage) { *static_cast*>(storage) = ptr; diff --git a/rpcs3/Emu/Cell/lv2/sys_event_flag.cpp b/rpcs3/Emu/Cell/lv2/sys_event_flag.cpp index 74cbb34d6..30c81ebf2 100644 --- a/rpcs3/Emu/Cell/lv2/sys_event_flag.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_event_flag.cpp @@ -18,7 +18,7 @@ lv2_event_flag::lv2_event_flag(utils::serial& ar) std::function lv2_event_flag::load(utils::serial& ar) { - return load_func(make_shared(stx::exact_t(ar))); + return load_func(make_shared(exact_t(ar))); } void lv2_event_flag::save(utils::serial& ar) diff --git a/rpcs3/Emu/Cell/lv2/sys_memory.cpp b/rpcs3/Emu/Cell/lv2/sys_memory.cpp index dbb513f9e..f5fc74321 100644 --- a/rpcs3/Emu/Cell/lv2/sys_memory.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_memory.cpp @@ -27,7 +27,7 @@ lv2_memory_container::lv2_memory_container(utils::serial& ar, bool from_idm) noe std::function lv2_memory_container::load(utils::serial& ar) { // Use idm::last_id() only for the instances at IDM - return [ptr = make_shared(stx::exact_t(ar), true)](void* storage) + return [ptr = make_shared(exact_t(ar), true)](void* storage) { *static_cast*>(storage) = ptr; }; diff --git a/rpcs3/Emu/Cell/lv2/sys_mmapper.cpp b/rpcs3/Emu/Cell/lv2/sys_mmapper.cpp index c7069a332..c71fac708 100644 --- a/rpcs3/Emu/Cell/lv2/sys_mmapper.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_mmapper.cpp @@ -72,7 +72,7 @@ CellError lv2_memory::on_id_create() std::function lv2_memory::load(utils::serial& ar) { - auto mem = make_shared(stx::exact_t(ar)); + auto mem = make_shared(exact_t(ar)); mem->exists++; // Disable on_id_create() auto func = load_func(mem, +mem->pshared); mem->exists--; diff --git a/rpcs3/Emu/Cell/lv2/sys_mutex.cpp b/rpcs3/Emu/Cell/lv2/sys_mutex.cpp index 8103d9595..28460f76c 100644 --- a/rpcs3/Emu/Cell/lv2/sys_mutex.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_mutex.cpp @@ -22,7 +22,7 @@ lv2_mutex::lv2_mutex(utils::serial& ar) std::function lv2_mutex::load(utils::serial& ar) { - return load_func(make_shared(stx::exact_t(ar))); + return load_func(make_shared(exact_t(ar))); } void lv2_mutex::save(utils::serial& ar) diff --git a/rpcs3/Emu/Cell/lv2/sys_net/lv2_socket_native.cpp b/rpcs3/Emu/Cell/lv2/sys_net/lv2_socket_native.cpp index 032ffeb0d..2f23375b9 100644 --- a/rpcs3/Emu/Cell/lv2/sys_net/lv2_socket_native.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_net/lv2_socket_native.cpp @@ -20,7 +20,7 @@ lv2_socket_native::lv2_socket_native(lv2_socket_family family, lv2_socket_type t } lv2_socket_native::lv2_socket_native(utils::serial& ar, lv2_socket_type type) - : lv2_socket(stx::make_exact(ar), type) + : 
lv2_socket(make_exact(ar), type) { [[maybe_unused]] const s32 version = GET_SERIALIZATION_VERSION(lv2_net); diff --git a/rpcs3/Emu/Cell/lv2/sys_net/lv2_socket_p2p.cpp b/rpcs3/Emu/Cell/lv2/sys_net/lv2_socket_p2p.cpp index 50b45c65c..157f5c4ec 100644 --- a/rpcs3/Emu/Cell/lv2/sys_net/lv2_socket_p2p.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_net/lv2_socket_p2p.cpp @@ -17,7 +17,7 @@ lv2_socket_p2p::lv2_socket_p2p(lv2_socket_family family, lv2_socket_type type, l } lv2_socket_p2p::lv2_socket_p2p(utils::serial& ar, lv2_socket_type type) - : lv2_socket(stx::make_exact(ar), type) + : lv2_socket(make_exact(ar), type) { ar(port, vport, bound_addr); diff --git a/rpcs3/Emu/Cell/lv2/sys_net/lv2_socket_raw.cpp b/rpcs3/Emu/Cell/lv2/sys_net/lv2_socket_raw.cpp index 6e74bd512..3dd109ca0 100644 --- a/rpcs3/Emu/Cell/lv2/sys_net/lv2_socket_raw.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_net/lv2_socket_raw.cpp @@ -27,7 +27,7 @@ lv2_socket_raw::lv2_socket_raw(lv2_socket_family family, lv2_socket_type type, l } lv2_socket_raw::lv2_socket_raw(utils::serial& ar, lv2_socket_type type) - : lv2_socket(stx::make_exact(ar), type) + : lv2_socket(make_exact(ar), type) { } diff --git a/rpcs3/Emu/Cell/lv2/sys_rwlock.cpp b/rpcs3/Emu/Cell/lv2/sys_rwlock.cpp index bdf9456a4..da47702cb 100644 --- a/rpcs3/Emu/Cell/lv2/sys_rwlock.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_rwlock.cpp @@ -18,7 +18,7 @@ lv2_rwlock::lv2_rwlock(utils::serial& ar) std::function lv2_rwlock::load(utils::serial& ar) { - return load_func(make_shared(stx::exact_t(ar))); + return load_func(make_shared(exact_t(ar))); } void lv2_rwlock::save(utils::serial& ar) diff --git a/rpcs3/Emu/Cell/lv2/sys_semaphore.cpp b/rpcs3/Emu/Cell/lv2/sys_semaphore.cpp index d34f056d0..dc685c02d 100644 --- a/rpcs3/Emu/Cell/lv2/sys_semaphore.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_semaphore.cpp @@ -18,7 +18,7 @@ lv2_sema::lv2_sema(utils::serial& ar) std::function lv2_sema::load(utils::serial& ar) { - return load_func(make_shared(stx::exact_t(ar))); + return load_func(make_shared(exact_t(ar))); } void lv2_sema::save(utils::serial& ar) diff --git a/rpcs3/Emu/GDB.cpp b/rpcs3/Emu/GDB.cpp index 702e4a28c..4cdff6437 100644 --- a/rpcs3/Emu/GDB.cpp +++ b/rpcs3/Emu/GDB.cpp @@ -1,6 +1,7 @@ #include "stdafx.h" #include "GDB.h" +#include "util/bit_set.h" #include "util/logs.hpp" #include "util/StrUtil.h" #include "Emu/Memory/vm.h" diff --git a/rpcs3/Emu/IdManager.h b/rpcs3/Emu/IdManager.h index bf0cb9ac5..9d981b02b 100644 --- a/rpcs3/Emu/IdManager.h +++ b/rpcs3/Emu/IdManager.h @@ -26,7 +26,7 @@ template concept IdmBaseCompatible = (std::is_final_v ? 
IdmCompatible : !!(requires() { u32{T::id_step}, u32{T::id_count}; })); template -concept IdmSavable = IdmBaseCompatible && T::savestate_init_pos != 0 && (requires(T& t, utils::serial& ar) { t.save(stx::exact_t(ar)); }); +concept IdmSavable = IdmBaseCompatible && T::savestate_init_pos != 0 && (requires(T& t, utils::serial& ar) { t.save(exact_t(ar)); }); // If id_base is declared in base type, than storage type must declare id_type template @@ -113,13 +113,13 @@ namespace id_manager static constexpr pointer_keeper (*load)(utils::serial&) = [](utils::serial& ar) -> pointer_keeper { stx::shared_ptr ptr; - if constexpr (std::is_constructible_v, stx::exact_t>) + if constexpr (std::is_constructible_v, exact_t>) { - ptr = stx::make_shared(stx::launch_retainer{}, stx::exact_t(ar)); + ptr = stx::make_shared(stx::launch_retainer{}, exact_t(ar)); } else { - ptr = stx::make_shared(stx::exact_t(ar)); + ptr = stx::make_shared(exact_t(ar)); } return [ptr](void* storage) @@ -134,7 +134,7 @@ namespace id_manager struct id_traits_load_func { static constexpr pointer_keeper (*load)(utils::serial&) = [](utils::serial& ar) -> pointer_keeper { - return T::load(stx::exact_t(ar)); + return T::load(exact_t(ar)); }; }; diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp index 1b7a0752a..c6b7ab50e 100644 --- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp +++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp @@ -1399,7 +1399,7 @@ bool GLGSRender::release_GCM_label(u32 address, u32 args) // Now write to DMA and then to host context m_enqueued_host_write_buffer->get().copy_to(mapping.second, host_read_offset, mapping.first, 4); - m_enqueued_host_write_buffer->get().copy_to(m_host_gpu_context_data.get(), host_read_offset + 8, ::offset32(&rsx::host_gpu_context_t::commands_complete_event), 8); + m_enqueued_host_write_buffer->get().copy_to(m_host_gpu_context_data.get(), host_read_offset + 8, OFFSET_OF(rsx::host_gpu_context_t, commands_complete_event), 8); m_enqueued_host_write_buffer->push_barrier(host_read_offset, 16); host_ctx->on_label_release(); @@ -1425,7 +1425,7 @@ void GLGSRender::on_guest_texture_read() // Tag the read as being in progress u64 event_id = m_host_dma_ctrl->host_ctx()->inc_counter(); m_host_dma_ctrl->host_ctx()->texture_load_request_event = event_id; - enqueue_host_context_write(::offset32(&rsx::host_gpu_context_t::texture_load_complete_event), 8, &event_id); + enqueue_host_context_write(OFFSET_OF(rsx::host_gpu_context_t, texture_load_complete_event), 8, &event_id); } void GLGSRender::begin_occlusion_query(rsx::reports::occlusion_query_info* query) diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp index 0b297f2a9..8c494fbd8 100644 --- a/rpcs3/Emu/RSX/RSXThread.cpp +++ b/rpcs3/Emu/RSX/RSXThread.cpp @@ -165,7 +165,7 @@ namespace rsx { if (offset < sizeof(RsxReports::report) /*&& (offset % 0x10) == 0*/) { - return render->label_addr + ::offset32(&RsxReports::report) + offset; + return render->label_addr + OFFSET_OF(RsxReports, report) + offset; } msg = "Local RSX REPORT offset out of range!"sv; @@ -733,8 +733,8 @@ namespace rsx if (!ar.is_writing() && version < 3) { // Be compatible with previous bitwise serialization - ar(std::span(reinterpret_cast(this), ::offset32(&avconf::scan_mode))); - ar.pos += utils::align(::offset32(&avconf::scan_mode), alignof(avconf)) - ::offset32(&avconf::scan_mode); + ar(std::span(reinterpret_cast(this), OFFSET_OF(avconf, scan_mode))); + ar.pos += utils::align(OFFSET_OF(avconf, scan_mode), alignof(avconf)) - OFFSET_OF(avconf, scan_mode); return; } @@ 
-1209,7 +1209,7 @@ namespace rsx if (const u64 get_put = new_get_put.exchange(u64{umax}); get_put != umax) { - vm::_ref>(dma_address + ::offset32(&RsxDmaControl::put)).release(get_put); + vm::_ref>(dma_address + OFFSET_OF(RsxDmaControl, put)).release(get_put); fifo_ctrl->set_get(static_cast(get_put)); fifo_ctrl->abort(); fifo_ret_addr = RSX_CALL_STACK_EMPTY; diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp index 90ddb5c9a..f54e503d3 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp +++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp @@ -1717,7 +1717,7 @@ bool VKGSRender::release_GCM_label(u32 address, u32 args) auto cmd = m_secondary_cb_list.next(); cmd->begin(); VK_GET_SYMBOL(vkCmdUpdateBuffer)(*cmd, mapping.second->value, mapping.first, 4, &write_data); - VK_GET_SYMBOL(vkCmdUpdateBuffer)(*cmd, m_host_object_data->value, ::offset32(&vk::host_data_t::commands_complete_event), 8, &release_event_id); + VK_GET_SYMBOL(vkCmdUpdateBuffer)(*cmd, m_host_object_data->value, OFFSET_OF(vk::host_data_t, commands_complete_event), 8, &release_event_id); cmd->end(); vk::queue_submit_t submit_info = {m_device->get_graphics_queue(), nullptr}; @@ -1739,7 +1739,7 @@ void VKGSRender::on_guest_texture_read(const vk::command_buffer& cmd) // Queue a sync update on the CB doing the load auto host_ctx = ensure(m_host_dma_ctrl->host_ctx()); const auto event_id = host_ctx->on_texture_load_acquire(); - VK_GET_SYMBOL(vkCmdUpdateBuffer)(cmd, m_host_object_data->value, ::offset32(&vk::host_data_t::texture_load_complete_event), sizeof(u64), &event_id); + VK_GET_SYMBOL(vkCmdUpdateBuffer)(cmd, m_host_object_data->value, OFFSET_OF(vk::host_data_t, texture_load_complete_event), sizeof(u64), &event_id); } void VKGSRender::sync_hint(rsx::FIFO::interrupt_hint hint, rsx::reports::sync_hint_payload_t payload) @@ -2520,7 +2520,7 @@ void VKGSRender::close_and_submit_command_buffer(vk::fence* pFence, VkSemaphore { VK_GET_SYMBOL(vkCmdUpdateBuffer)(*m_current_command_buffer, m_host_object_data->value, - ::offset32(&vk::host_data_t::commands_complete_event), + OFFSET_OF(vk::host_data_t, commands_complete_event), sizeof(u64), const_cast(&m_host_dma_ctrl->host_ctx()->last_label_acquire_event)); diff --git a/rpcs3/Emu/System.cpp b/rpcs3/Emu/System.cpp index 85f4c6523..19d1b8ca8 100644 --- a/rpcs3/Emu/System.cpp +++ b/rpcs3/Emu/System.cpp @@ -318,7 +318,7 @@ void init_fxo_for_exec(utils::serial* ar, bool full = false) // Some settings are not allowed in certain PPU decoders static void fixup_settings(const psf::registry* _psf) { - if (g_cfg.core.ppu_decoder != ppu_decoder_type::_static) + if (g_cfg.core.ppu_decoder == ppu_decoder_type::llvm_legacy) { if (g_cfg.core.ppu_use_nj_bit) { diff --git a/rpcs3/Emu/system_config.h b/rpcs3/Emu/system_config.h index 1245148f5..cd6299903 100644 --- a/rpcs3/Emu/system_config.h +++ b/rpcs3/Emu/system_config.h @@ -21,7 +21,7 @@ struct cfg_root : cfg::node public: node_core(cfg::node* _this) : cfg::node(_this, "Core") {} - cfg::_enum ppu_decoder{this, "PPU Decoder", ppu_decoder_type::llvm}; + cfg::_enum ppu_decoder{this, "PPU Decoder", ppu_decoder_type::llvm_legacy}; cfg::_int<1, 8> ppu_threads{this, "PPU Threads", 2}; // Amount of PPU threads running simultaneously (must be 2) cfg::_bool ppu_debug{this, "PPU Debug"}; cfg::_bool ppu_call_history{this, "PPU Calling History"}; // Enable PPU calling history recording diff --git a/rpcs3/Emu/system_config_types.cpp b/rpcs3/Emu/system_config_types.cpp index 734042e1c..edf5e9474 100644 --- a/rpcs3/Emu/system_config_types.cpp +++ 
b/rpcs3/Emu/system_config_types.cpp @@ -520,8 +520,9 @@ void fmt_class_string::format(std::string& out, u64 arg) { switch (type) { - case ppu_decoder_type::_static: return "Interpreter (static)"; - case ppu_decoder_type::llvm: return "Recompiler (LLVM)"; + case ppu_decoder_type::_static: return "Interpreter (Legacy)"; + case ppu_decoder_type::llvm_legacy: return "LLVM Recompiler (Legacy)"; + case ppu_decoder_type::interpreter: return "Interpreter"; } return unknown; diff --git a/rpcs3/Emu/system_config_types.h b/rpcs3/Emu/system_config_types.h index ee9da8b0f..c7610a442 100644 --- a/rpcs3/Emu/system_config_types.h +++ b/rpcs3/Emu/system_config_types.h @@ -3,7 +3,8 @@ enum class ppu_decoder_type : unsigned { _static, - llvm, + llvm_legacy, + interpreter, }; enum class spu_decoder_type : unsigned diff --git a/rpcs3/Loader/ELF.h b/rpcs3/Loader/ELF.h index 56da9f464..6ccd4bbd6 100644 --- a/rpcs3/Loader/ELF.h +++ b/rpcs3/Loader/ELF.h @@ -3,6 +3,7 @@ #include "util/types.hpp" #include "util/File.h" #include "util/bit_set.h" +#include "util/endian.hpp" #include diff --git a/rpcs3/util/emu_utils.cpp b/rpcs3/util/emu_utils.cpp index 667133dff..8b7145d0c 100644 --- a/rpcs3/util/emu_utils.cpp +++ b/rpcs3/util/emu_utils.cpp @@ -14,7 +14,7 @@ bool is_using_interpreter(thread_class t_class) { switch (t_class) { - case thread_class::ppu: return g_cfg.core.ppu_decoder != ppu_decoder_type::llvm; + case thread_class::ppu: return g_cfg.core.ppu_decoder != ppu_decoder_type::llvm_legacy; case thread_class::spu: return g_cfg.core.spu_decoder != spu_decoder_type::asmjit && g_cfg.core.spu_decoder != spu_decoder_type::llvm; default: return true; } diff --git a/rpcs3/util/fixed_typemap.hpp b/rpcs3/util/fixed_typemap.hpp index 80cad14a4..0857bb5e7 100644 --- a/rpcs3/util/fixed_typemap.hpp +++ b/rpcs3/util/fixed_typemap.hpp @@ -146,10 +146,10 @@ namespace stx } template - requires requires(T& a, utils::serial& ar) { a.save(stx::exact_t(ar)); } + requires requires(T& a, utils::serial& ar) { a.save(exact_t(ar)); } static void call_save(void* ptr, utils::serial& ar) noexcept { - std::launder(static_cast(ptr))->save(stx::exact_t(ar)); + std::launder(static_cast(ptr))->save(exact_t(ar)); } template @@ -173,7 +173,7 @@ namespace stx r.thread_op = &call_thread_op; } - if constexpr (!!(requires(T& a, utils::serial& ar) { a.save(stx::exact_t(ar)); })) + if constexpr (!!(requires(T& a, utils::serial& ar) { a.save(exact_t(ar)); })) { r.save = &call_save; } diff --git a/rpcs3/util/serialization.hpp b/rpcs3/util/serialization.hpp index fddfc646a..4b3261396 100644 --- a/rpcs3/util/serialization.hpp +++ b/rpcs3/util/serialization.hpp @@ -98,24 +98,6 @@ namespace utils pos += padding; } - // Add padding needed between two members - template - void add_padding(T T2::* const first, T3 T2::* const second) - { - if (m_is_writing) - return; - - const u32 offset1 = ::offset32(first) + sizeof(T); - const u32 offset2 = ::offset32(second); - - AUDIT(::offset32(first) <= ::offset32(second)); - - if (offset2 > offset1) - { - pos += offset2 - offset1; - } - } - void set_expect_little_data(bool value) { m_expect_little_data = value; @@ -437,7 +419,7 @@ namespace utils } template - requires requires(T& obj, utils::serial& ar) { (obj.*(&T::operator()))(stx::exact_t(ar)); } + requires requires(T& obj, utils::serial& ar) { (obj.*(&T::operator()))(exact_t(ar)); } bool serialize(T& obj) { obj(*this); @@ -565,7 +547,7 @@ namespace utils template requires(std::is_copy_constructible_v>) && (std::is_constructible_v> || Bitcopy> || - 
std::is_constructible_v, stx::exact_t> || TupleAlike>) + std::is_constructible_v, exact_t> || TupleAlike>) operator T() noexcept { AUDIT(!is_writing()); @@ -604,9 +586,9 @@ namespace utils return type{std::move(first), this->operator second_t()}; } } - else if constexpr (std::is_constructible_v>) + else if constexpr (std::is_constructible_v>) { - return not_tuple_t(stx::exact_t(*this)); + return not_tuple_t(exact_t(*this)); } else if constexpr (std::is_constructible_v) { diff --git a/rpcs3/util/simd.hpp b/rpcs3/util/simd.hpp index 4a1bdb1af..e0b082fff 100644 --- a/rpcs3/util/simd.hpp +++ b/rpcs3/util/simd.hpp @@ -6,6 +6,7 @@ #include "util/sysinfo.hpp" #include "util/asm.hpp" #include "util/JIT.h" +#include #if defined(ARCH_X64) #ifdef _MSC_VER @@ -34,6 +35,8 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #endif +using namespace rx; + namespace asmjit { struct vec_builder; @@ -565,2774 +568,445 @@ namespace asmjit #endif } // namespace asmjit -inline v128 gv_select8(const v128& _cmp, const v128& _true, const v128& _false); -inline v128 gv_signselect8(const v128& bits, const v128& _true, const v128& _false); -inline v128 gv_select16(const v128& _cmp, const v128& _true, const v128& _false); -inline v128 gv_select32(const v128& _cmp, const v128& _true, const v128& _false); -inline v128 gv_selectfs(const v128& _cmp, const v128& _true, const v128& _false); - -template - requires(asmjit::any_operand_v) -inline asmjit::vec_type gv_gts32(A&&, B&&); - -inline void gv_set_zeroing_denormals() +namespace rx { -#if defined(ARCH_X64) - u32 cr = _mm_getcsr(); - cr = (cr & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_ZERO_ON; - cr = (cr & ~_MM_DENORMALS_ZERO_MASK) | _MM_DENORMALS_ZERO_ON; - cr = (cr | _MM_MASK_INVALID); - _mm_setcsr(cr); -#elif defined(ARCH_ARM64) - u64 cr; - __asm__ volatile("mrs %0, FPCR" : "=r"(cr)); - cr |= 0x1000000ull; - __asm__ volatile("msr FPCR, %0" ::"r"(cr)); -#else -#error "Not implemented" -#endif -} + inline bool g_use_avx = utils::has_avx(); -inline void gv_unset_zeroing_denormals() -{ + inline void gv_zeroupper() + { #if defined(ARCH_X64) - u32 cr = _mm_getcsr(); - cr = (cr & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_ZERO_OFF; - cr = (cr & ~_MM_DENORMALS_ZERO_MASK) | _MM_DENORMALS_ZERO_OFF; - cr = (cr | _MM_MASK_INVALID); - _mm_setcsr(cr); -#elif defined(ARCH_ARM64) - u64 cr; - __asm__ volatile("mrs %0, FPCR" : "=r"(cr)); - cr &= ~0x1000000ull; - __asm__ volatile("msr FPCR, %0" ::"r"(cr)); -#else -#error "Not implemented" -#endif -} - -inline bool g_use_avx = utils::has_avx(); - -inline void gv_zeroupper() -{ -#if defined(ARCH_X64) - if (!g_use_avx) - return; + if (!g_use_avx) + return; #if defined(_M_X64) && defined(_MSC_VER) - _mm256_zeroupper(); + _mm256_zeroupper(); #else - __asm__ volatile("vzeroupper;"); + __asm__ volatile("vzeroupper;"); #endif #endif -} - -inline v128 gv_bcst8(u8 value) -{ -#if defined(ARCH_X64) - return _mm_set1_epi8(value); -#elif defined(ARCH_ARM64) - return vdupq_n_s8(value); -#endif -} - -inline v128 gv_bcst16(u16 value) -{ -#if defined(ARCH_X64) - return _mm_set1_epi16(value); -#elif defined(ARCH_ARM64) - return vdupq_n_s16(value); -#endif -} - -// Optimized broadcast using constant offset assumption -inline v128 gv_bcst16(const u16& value, auto mptr, auto... 
args) -{ -#if defined(ARCH_X64) - const u32 offset = ::offset32(mptr, args...); - [[maybe_unused]] const __m128i* ptr = reinterpret_cast<__m128i*>(uptr(&value) - offset % 16); -#if !defined(__AVX2__) - if (offset % 16 == 0) - return _mm_shuffle_epi32(_mm_shufflelo_epi16(*ptr, 0), 0); - if (offset % 16 == 2) - return _mm_shuffle_epi32(_mm_shufflelo_epi16(*ptr, 0b01010101), 0); - if (offset % 16 == 4) - return _mm_shuffle_epi32(_mm_shufflelo_epi16(*ptr, 0b10101010), 0); - if (offset % 16 == 6) - return _mm_shuffle_epi32(_mm_shufflelo_epi16(*ptr, 0xff), 0); - if (offset % 16 == 8) - return _mm_shuffle_epi32(_mm_shufflehi_epi16(*ptr, 0), 0xff); - if (offset % 16 == 10) - return _mm_shuffle_epi32(_mm_shufflehi_epi16(*ptr, 0b01010101), 0xff); - if (offset % 16 == 12) - return _mm_shuffle_epi32(_mm_shufflehi_epi16(*ptr, 0b10101010), 0xff); - if (offset % 16 == 14) - return _mm_shuffle_epi32(_mm_shufflehi_epi16(*ptr, 0xff), 0xff); -#endif - return _mm_set1_epi16(value); -#else - static_cast(mptr); - return gv_bcst16(value); -#endif -} - -inline v128 gv_bcst32(u32 value) -{ -#if defined(ARCH_X64) - return _mm_set1_epi32(value); -#elif defined(ARCH_ARM64) - return vdupq_n_s32(value); -#endif -} - -// Optimized broadcast using constant offset assumption -inline v128 gv_bcst32(const u32& value, auto mptr, auto... args) -{ -#if defined(ARCH_X64) - const u32 offset = ::offset32(mptr, args...); - [[maybe_unused]] const __m128i* ptr = reinterpret_cast<__m128i*>(uptr(&value) - offset % 16); -#if !defined(__AVX__) - if (offset % 16 == 0) - return _mm_shuffle_epi32(*ptr, 0); - if (offset % 16 == 4) - return _mm_shuffle_epi32(*ptr, 0b01010101); - if (offset % 16 == 8) - return _mm_shuffle_epi32(*ptr, 0b10101010); - if (offset % 16 == 12) - return _mm_shuffle_epi32(*ptr, 0xff); -#endif - return _mm_set1_epi32(value); -#else - static_cast(mptr); - return gv_bcst32(value); -#endif -} - -inline v128 gv_bcst64(u64 value) -{ -#if defined(ARCH_X64) - return _mm_set1_epi64x(value); -#elif defined(ARCH_ARM64) - return vdupq_n_s64(value); -#endif -} - -// Optimized broadcast using constant offset assumption -inline v128 gv_bcst64(const u64& value, auto mptr, auto... 
args) -{ -#if defined(ARCH_X64) - const u32 offset = ::offset32(mptr, args...); - [[maybe_unused]] const __m128i* ptr = reinterpret_cast<__m128i*>(uptr(&value) - offset % 16); -#if !defined(__AVX__) - if (offset % 16 == 0) - return _mm_shuffle_epi32(*ptr, 0b00010001); - if (offset % 16 == 8) - return _mm_shuffle_epi32(*ptr, 0b10111011); -#endif - return _mm_set1_epi64x(value); -#else - static_cast(mptr); - return gv_bcst64(value); -#endif -} - -inline v128 gv_bcstfs(f32 value) -{ -#if defined(ARCH_X64) - return _mm_set1_ps(value); -#elif defined(ARCH_ARM64) - return vdupq_n_f32(value); -#endif -} - -inline v128 gv_and32(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_and_si128(a, b); -#elif defined(ARCH_ARM64) - return vandq_s32(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_and32(A&& a, B&& b) -{ - FOR_X64(binary_op, 4, kIdMovdqa, kIdPand, kIdVpand, kIdVpandd, std::forward(a), std::forward(b)); -} - -inline v128 gv_andfs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_and_ps(a, b); -#elif defined(ARCH_ARM64) - return vandq_s32(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_andfs(A&& a, B&& b) -{ - FOR_X64(binary_op, 4, kIdMovaps, kIdAndps, kIdVandps, kIdVandps, std::forward(a), std::forward(b)); -} - -inline v128 gv_andn32(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_andnot_si128(a, b); -#elif defined(ARCH_ARM64) - return vbicq_s32(b, a); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_andn32(A&& a, B&& b) -{ - FOR_X64(binary_op, 4, kIdMovdqa, kIdPandn, kIdVpandn, kIdVpandnd, std::forward(a), std::forward(b)); -} - -inline v128 gv_andnfs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_andnot_ps(a, b); -#elif defined(ARCH_ARM64) - return vbicq_s32(b, a); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_andnfs(A&& a, B&& b) -{ - FOR_X64(binary_op, 4, kIdMovaps, kIdAndnps, kIdVandnps, kIdVandnps, std::forward(a), std::forward(b)); -} - -inline v128 gv_or32(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_or_si128(a, b); -#elif defined(ARCH_ARM64) - return vorrq_s32(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_or32(A&& a, B&& b) -{ - FOR_X64(binary_op, 4, kIdMovdqa, kIdPor, kIdVpor, kIdVpord, std::forward(a), std::forward(b)); -} - -inline v128 gv_orfs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_or_ps(a, b); -#elif defined(ARCH_ARM64) - return vorrq_s32(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_orfs(A&& a, B&& b) -{ - FOR_X64(binary_op, 4, kIdMovaps, kIdOrps, kIdVorps, kIdVorps, std::forward(a), std::forward(b)); -} - -inline v128 gv_xor32(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_xor_si128(a, b); -#elif defined(ARCH_ARM64) - return veorq_s32(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_xor32(A&& a, B&& b) -{ - FOR_X64(binary_op, 4, kIdMovdqa, kIdPxor, kIdVpxor, kIdVpxord, std::forward(a), std::forward(b)); -} - -inline v128 gv_xorfs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_xor_ps(a, b); -#elif defined(ARCH_ARM64) - return veorq_s32(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_xorfs(A&& a, B&& b) -{ - FOR_X64(binary_op, 4, kIdMovaps, kIdXorps, kIdVxorps, kIdVxorps, std::forward(a), std::forward(b)); -} - -inline v128 gv_not32(const v128& a) -{ -#if 
defined(ARCH_X64) - return _mm_xor_si128(a, _mm_set1_epi32(-1)); -#elif defined(ARCH_ARM64) - return vmvnq_u32(a); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_not32(A&& a) -{ -#if defined(ARCH_X64) - asmjit::vec_type ones = g_vc->vec_alloc(); - g_vc->pcmpeqd(ones, ones); - FOR_X64(binary_op, 4, kIdMovdqa, kIdPxor, kIdVpxor, kIdVpxord, std::move(ones), std::forward(a)); -#endif -} - -inline v128 gv_notfs(const v128& a) -{ -#if defined(ARCH_X64) - return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(-1))); -#elif defined(ARCH_ARM64) - return vmvnq_u32(a); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_notfs(A&& a) -{ -#if defined(ARCH_X64) - asmjit::vec_type ones = g_vc->vec_alloc(); - g_vc->pcmpeqd(ones, ones); - FOR_X64(binary_op, 4, kIdMovaps, kIdXorps, kIdVxorps, kIdVxorps, std::move(ones), std::forward(a)); -#endif -} - -inline v128 gv_shl16(const v128& a, u32 count) -{ - if (count >= 16) - return v128{}; -#if defined(ARCH_X64) - return _mm_slli_epi16(a, count); -#elif defined(ARCH_ARM64) - return vshlq_s16(a, vdupq_n_s16(count)); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_shl16(A&& a, u32 count) -{ - FOR_X64(unary_op, kIdPsllw, kIdVpsllw, std::forward(a), count); -} - -inline v128 gv_shl32(const v128& a, u32 count) -{ - if (count >= 32) - return v128{}; -#if defined(ARCH_X64) - return _mm_slli_epi32(a, count); -#elif defined(ARCH_ARM64) - return vshlq_s32(a, vdupq_n_s32(count)); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_shl32(A&& a, u32 count) -{ - FOR_X64(unary_op, kIdPslld, kIdVpslld, std::forward(a), count); -} - -inline v128 gv_shl64(const v128& a, u32 count) -{ - if (count >= 64) - return v128{}; -#if defined(ARCH_X64) - return _mm_slli_epi64(a, count); -#elif defined(ARCH_ARM64) - return vshlq_s64(a, vdupq_n_s64(count)); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_shl64(A&& a, u32 count) -{ - FOR_X64(unary_op, kIdPsllq, kIdVpsllq, std::forward(a), count); -} - -inline v128 gv_shr16(const v128& a, u32 count) -{ - if (count >= 16) - return v128{}; -#if defined(ARCH_X64) - return _mm_srli_epi16(a, count); -#elif defined(ARCH_ARM64) - return vshlq_u16(a, vdupq_n_s16(0 - count)); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_shr16(A&& a, u32 count) -{ - FOR_X64(unary_op, kIdPsrlw, kIdVpsrlw, std::forward(a), count); -} - -inline v128 gv_shr32(const v128& a, u32 count) -{ - if (count >= 32) - return v128{}; -#if defined(ARCH_X64) - return _mm_srli_epi32(a, count); -#elif defined(ARCH_ARM64) - return vshlq_u32(a, vdupq_n_s32(0 - count)); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_shr32(A&& a, u32 count) -{ - FOR_X64(unary_op, kIdPsrld, kIdVpsrld, std::forward(a), count); -} - -inline v128 gv_shr64(const v128& a, u32 count) -{ - if (count >= 64) - return v128{}; -#if defined(ARCH_X64) - return _mm_srli_epi64(a, count); -#elif defined(ARCH_ARM64) - return vshlq_u64(a, vdupq_n_s64(0 - count)); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_shr64(A&& a, u32 count) -{ - FOR_X64(unary_op, kIdPsrlq, kIdVpsrlq, std::forward(a), count); -} - -inline v128 gv_sar16(const v128& a, u32 count) -{ - if (count >= 16) - count = 15; -#if defined(ARCH_X64) - return _mm_srai_epi16(a, count); -#elif defined(ARCH_ARM64) - return vshlq_s16(a, vdupq_n_s16(0 - count)); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_sar16(A&& a, u32 count) -{ - FOR_X64(unary_op, 
kIdPsraw, kIdVpsraw, std::forward(a), count); -} - -inline v128 gv_sar32(const v128& a, u32 count) -{ - if (count >= 32) - count = 31; -#if defined(ARCH_X64) - return _mm_srai_epi32(a, count); -#elif defined(ARCH_ARM64) - return vshlq_s32(a, vdupq_n_s32(0 - count)); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_sar32(A&& a, u32 count) -{ - FOR_X64(unary_op, kIdPsrad, kIdVpsrad, std::forward(a), count); -} - -inline v128 gv_sar64(const v128& a, u32 count) -{ - if (count >= 64) - count = 63; -#if defined(__AVX512VL__) - return _mm_srai_epi64(a, count); -#elif defined(__SSE2__) && !defined(_M_X64) - return static_cast<__v2di>(a) >> count; -#elif defined(ARCH_ARM64) - return vshlq_s64(a, vdupq_n_s64(0 - count)); -#else - v128 r; - r._s64[0] = a._s64[0] >> count; - r._s64[1] = a._s64[1] >> count; - return r; -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_sar64(A&& a, u32 count) -{ - if (count >= 64) - count = 63; -#if defined(ARCH_X64) - using enum asmjit::x86::Inst::Id; - if (utils::has_avx512()) - return asmjit::unary_op(kIdNone, kIdVpsraq, std::forward(a), count); - g_vc->fail_flag = true; - return std::forward(a); -#endif -} - -inline v128 gv_add8(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_add_epi8(a, b); -#elif defined(ARCH_ARM64) - return vaddq_s8(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_add8(A&& a, B&& b) -{ - FOR_X64(binary_op, 1, kIdMovdqa, kIdPaddb, kIdVpaddb, kIdNone, std::forward(a), std::forward(b)); -} - -inline v128 gv_add16(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_add_epi16(a, b); -#elif defined(ARCH_ARM64) - return vaddq_s16(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_add16(A&& a, B&& b) -{ - FOR_X64(binary_op, 2, kIdMovdqa, kIdPaddw, kIdVpaddw, kIdNone, std::forward(a), std::forward(b)); -} - -inline v128 gv_add32(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_add_epi32(a, b); -#elif defined(ARCH_ARM64) - return vaddq_s32(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_add32(A&& a, B&& b) -{ - FOR_X64(binary_op, 4, kIdMovdqa, kIdPaddd, kIdVpaddd, kIdVpaddd, std::forward(a), std::forward(b)); -} - -inline v128 gv_add64(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_add_epi64(a, b); -#elif defined(ARCH_ARM64) - return vaddq_s64(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_add64(A&& a, B&& b) -{ - FOR_X64(binary_op, 8, kIdMovdqa, kIdPaddq, kIdVpaddq, kIdVpaddq, std::forward(a), std::forward(b)); -} - -inline v128 gv_adds_s8(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_adds_epi8(a, b); -#elif defined(ARCH_ARM64) - return vqaddq_s8(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_adds_s8(A&& a, B&& b) -{ - FOR_X64(binary_op, 1, kIdMovdqa, kIdPaddsb, kIdVpaddsb, kIdNone, std::forward(a), std::forward(b)); -} - -inline v128 gv_adds_s16(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_adds_epi16(a, b); -#elif defined(ARCH_ARM64) - return vqaddq_s16(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_adds_s16(A&& a, B&& b) -{ - FOR_X64(binary_op, 2, kIdMovdqa, kIdPaddsw, kIdVpaddsw, kIdNone, std::forward(a), std::forward(b)); -} - -inline v128 gv_adds_s32(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - const v128 s = _mm_add_epi32(a, b); - const v128 m = (a ^ s) & (b ^ s); // overflow 
bit - const v128 x = _mm_srai_epi32(m, 31); // saturation mask - const v128 y = _mm_srai_epi32(_mm_and_si128(s, m), 31); // positive saturation mask - return _mm_xor_si128(_mm_xor_si128(_mm_srli_epi32(x, 1), y), _mm_or_si128(s, x)); -#elif defined(ARCH_ARM64) - return vqaddq_s32(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_adds_s32(A&& a, B&& b) -{ -#if defined(ARCH_X64) - auto s = gv_add32(a, b); - auto m = gv_and32(gv_xor32(std::forward(a), s), gv_xor32(std::forward(b), s)); - auto x = gv_sar32(m, 31); - auto y = gv_sar32(gv_and32(s, std::move(m)), 31); - auto z = gv_xor32(gv_shr32(x, 1), std::move(y)); - return gv_xor32(std::move(z), gv_or32(std::move(s), std::move(x))); -#endif -} - -inline v128 gv_addus_u8(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_adds_epu8(a, b); -#elif defined(ARCH_ARM64) - return vqaddq_u8(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_addus_u8(A&& a, B&& b) -{ - FOR_X64(binary_op, 1, kIdMovdqa, kIdPaddusb, kIdVpaddusb, kIdNone, std::forward(a), std::forward(b)); -} - -inline v128 gv_addus_u16(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_adds_epu16(a, b); -#elif defined(ARCH_ARM64) - return vqaddq_u16(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_addus_u16(A&& a, B&& b) -{ - FOR_X64(binary_op, 2, kIdMovdqa, kIdPaddusw, kIdVpaddusw, kIdNone, std::forward(a), std::forward(b)); -} - -inline v128 gv_addus_u32(const v128& a, const v128& b) -{ -#if defined(__SSE4_1__) - return _mm_add_epi32(a, _mm_min_epu32(~a, b)); -#elif defined(ARCH_X64) - const v128 s = _mm_add_epi32(a, b); - return _mm_or_si128(s, _mm_cmpgt_epi32(_mm_xor_si128(b, _mm_set1_epi32(smin)), _mm_xor_si128(a, _mm_set1_epi32(smax)))); -#elif defined(ARCH_ARM64) - return vqaddq_u32(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline asmjit::vec_type gv_addus_u32(A&& a, B&& b) -{ -#if defined(ARCH_X64) - if (utils::has_sse41()) - return gv_add32(gv_minu32(std::forward(b), gv_not32(a)), std::forward(a)); - auto s = gv_add32(a, b); - auto x = gv_xor32(std::forward(b), gv_bcst32(0x80000000)); - auto y = gv_xor32(std::forward(a), gv_bcst32(0x7fffffff)); - return gv_or32(std::move(s), gv_gts32(std::move(x), std::move(y))); -#endif - return {}; -} - -inline v128 gv_addfs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_add_ps(a, b); -#elif defined(ARCH_ARM64) - return vaddq_f32(a, b); -#endif -} - -inline v128 gv_addfd(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_add_pd(a, b); -#elif defined(ARCH_ARM64) - return vaddq_f64(a, b); -#endif -} - -inline v128 gv_sub8(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_sub_epi8(a, b); -#elif defined(ARCH_ARM64) - return vsubq_s8(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_sub8(A&& a, B&& b) -{ - FOR_X64(binary_op, 1, kIdMovdqa, kIdPsubb, kIdVpsubb, kIdNone, std::forward(a), std::forward(b)); -} - -inline v128 gv_sub16(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_sub_epi16(a, b); -#elif defined(ARCH_ARM64) - return vsubq_s16(a, b); -#endif -} - -inline v128 gv_sub32(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_sub_epi32(a, b); -#elif defined(ARCH_ARM64) - return vsubq_s32(a, b); -#endif -} - -inline v128 gv_sub64(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_sub_epi64(a, b); -#elif defined(ARCH_ARM64) - return vsubq_s64(a, b); -#endif -} 
- -inline v128 gv_subs_s8(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_subs_epi8(a, b); -#elif defined(ARCH_ARM64) - return vqsubq_s8(a, b); -#endif -} - -inline v128 gv_subs_s16(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_subs_epi16(a, b); -#elif defined(ARCH_ARM64) - return vqsubq_s16(a, b); -#endif -} - -inline v128 gv_subs_s32(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - const v128 d = _mm_sub_epi32(a, b); - const v128 m = (a ^ b) & (a ^ d); // overflow bit - const v128 x = _mm_srai_epi32(m, 31); - return _mm_or_si128(_mm_andnot_si128(x, d), _mm_and_si128(x, _mm_xor_si128(_mm_srli_epi32(x, 1), _mm_srai_epi32(a, 31)))); -#elif defined(ARCH_ARM64) - return vqsubq_s32(a, b); -#endif -} - -inline v128 gv_subus_u8(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_subs_epu8(a, b); -#elif defined(ARCH_ARM64) - return vqsubq_u8(a, b); -#endif -} - -inline v128 gv_subus_u16(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_subs_epu16(a, b); -#elif defined(ARCH_ARM64) - return vqsubq_u16(a, b); -#endif -} - -inline v128 gv_subus_u32(const v128& a, const v128& b) -{ -#if defined(__SSE4_1__) - return _mm_sub_epi32(a, _mm_min_epu32(a, b)); -#elif defined(ARCH_X64) - const auto sign = _mm_set1_epi32(smin); - return _mm_andnot_si128(_mm_cmpgt_epi32(_mm_xor_si128(b, sign), _mm_xor_si128(a, sign)), _mm_sub_epi32(a, b)); -#elif defined(ARCH_ARM64) - return vqsubq_u32(a, b); -#endif -} - -inline v128 gv_subfs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_sub_ps(a, b); -#elif defined(ARCH_ARM64) - return vsubq_f32(a, b); -#endif -} - -inline v128 gv_subfd(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_sub_pd(a, b); -#elif defined(ARCH_ARM64) - return vsubq_f64(a, b); -#endif -} - -inline v128 gv_maxu8(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_max_epu8(a, b); -#elif defined(ARCH_ARM64) - return vmaxq_u8(a, b); -#endif -} - -inline v128 gv_maxu16(const v128& a, const v128& b) -{ -#if defined(__SSE4_1__) - return _mm_max_epu16(a, b); -#elif defined(ARCH_X64) - return _mm_add_epi16(_mm_subs_epu16(a, b), b); -#elif defined(ARCH_ARM64) - return vmaxq_u16(a, b); -#endif -} - -inline v128 gv_maxu32(const v128& a, const v128& b) -{ -#if defined(__SSE4_1__) - return _mm_max_epu32(a, b); -#elif defined(ARCH_X64) - const __m128i s = _mm_set1_epi32(smin); - const __m128i m = _mm_cmpgt_epi32(_mm_xor_si128(a, s), _mm_xor_si128(b, s)); - return _mm_or_si128(_mm_and_si128(m, a), _mm_andnot_si128(m, b)); -#elif defined(ARCH_ARM64) - return vmaxq_u32(a, b); -#endif -} - -inline v128 gv_maxs8(const v128& a, const v128& b) -{ -#if defined(__SSE4_1__) - return _mm_max_epi8(a, b); -#elif defined(ARCH_X64) - const __m128i m = _mm_cmpgt_epi8(a, b); - return _mm_or_si128(_mm_and_si128(m, a), _mm_andnot_si128(m, b)); -#elif defined(ARCH_ARM64) - return vmaxq_s8(a, b); -#endif -} - -inline v128 gv_maxs16(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_max_epi16(a, b); -#elif defined(ARCH_ARM64) - return vmaxq_s16(a, b); -#endif -} - -inline v128 gv_maxs32(const v128& a, const v128& b) -{ -#if defined(__SSE4_1__) - return _mm_max_epi32(a, b); -#elif defined(ARCH_X64) - const __m128i m = _mm_cmpgt_epi32(a, b); - return _mm_or_si128(_mm_and_si128(m, a), _mm_andnot_si128(m, b)); -#elif defined(ARCH_ARM64) - return vmaxq_s32(a, b); -#endif -} - -inline v128 gv_maxfs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return 
_mm_and_ps(_mm_max_ps(a, b), _mm_max_ps(b, a)); -#elif defined(ARCH_ARM64) - return vmaxq_f32(a, b); -#endif -} - -inline v128 gv_minu8(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_min_epu8(a, b); -#elif defined(ARCH_ARM64) - return vminq_u8(a, b); -#endif -} - -inline v128 gv_minu16(const v128& a, const v128& b) -{ -#if defined(__SSE4_1__) - return _mm_min_epu16(a, b); -#elif defined(ARCH_X64) - return _mm_sub_epi16(a, _mm_subs_epu16(a, b)); -#elif defined(ARCH_ARM64) - return vminq_u16(a, b); -#endif -} - -inline v128 gv_minu32(const v128& a, const v128& b) -{ -#if defined(__SSE4_1__) - return _mm_min_epu32(a, b); -#elif defined(ARCH_X64) - const __m128i s = _mm_set1_epi32(smin); - const __m128i m = _mm_cmpgt_epi32(_mm_xor_si128(a, s), _mm_xor_si128(b, s)); - return _mm_or_si128(_mm_andnot_si128(m, a), _mm_and_si128(m, b)); -#elif defined(ARCH_ARM64) - return vminq_u32(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline asmjit::vec_type gv_minu32(A&& a, B&& b) -{ -#if defined(ARCH_X64) - if (utils::has_sse41()) - FOR_X64(binary_op, 4, kIdMovdqa, kIdPminud, kIdVpminud, kIdVpminud, std::forward(a), std::forward(b)); - auto s = gv_bcst32(0x80000000); - auto x = gv_xor32(a, s); - auto m = gv_gts32(std::move(x), gv_xor32(std::move(s), b)); - auto z = gv_and32(m, std::move(b)); - return gv_or32(std::move(z), gv_andn32(std::move(m), std::move(a))); -#endif - return {}; -} - -inline v128 gv_mins8(const v128& a, const v128& b) -{ -#if defined(__SSE4_1__) - return _mm_min_epi8(a, b); -#elif defined(ARCH_X64) - const __m128i m = _mm_cmpgt_epi8(a, b); - return _mm_or_si128(_mm_andnot_si128(m, a), _mm_and_si128(m, b)); -#elif defined(ARCH_ARM64) - return vminq_s8(a, b); -#endif -} - -inline v128 gv_mins16(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_min_epi16(a, b); -#elif defined(ARCH_ARM64) - return vminq_s16(a, b); -#endif -} - -inline v128 gv_mins32(const v128& a, const v128& b) -{ -#if defined(__SSE4_1__) - return _mm_min_epi32(a, b); -#elif defined(ARCH_X64) - const __m128i m = _mm_cmpgt_epi32(a, b); - return _mm_or_si128(_mm_andnot_si128(m, a), _mm_and_si128(m, b)); -#elif defined(ARCH_ARM64) - return vminq_s32(a, b); -#endif -} - -inline v128 gv_minfs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_or_ps(_mm_min_ps(a, b), _mm_min_ps(b, a)); -#elif defined(ARCH_ARM64) - return vminq_f32(a, b); -#endif -} - -inline v128 gv_eq8(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_cmpeq_epi8(a, b); -#elif defined(ARCH_ARM64) - return vceqq_s8(a, b); -#endif -} - -inline v128 gv_eq16(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_cmpeq_epi16(a, b); -#elif defined(ARCH_ARM64) - return vceqq_s16(a, b); -#endif -} - -inline v128 gv_eq32(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_cmpeq_epi32(a, b); -#elif defined(ARCH_ARM64) - return vceqq_s32(a, b); -#endif -} - -// Ordered and equal -inline v128 gv_eqfs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_cmpeq_ps(a, b); -#elif defined(ARCH_ARM64) - return vceqq_f32(a, b); -#endif -} - -// Unordered or not equal -inline v128 gv_neqfs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_cmpneq_ps(a, b); -#elif defined(ARCH_ARM64) - return ~vceqq_f32(a, b); -#endif -} - -inline v128 gv_gtu8(const v128& a, const v128& b) -{ -#if defined(__AVX512VL__) && defined(__AVX512BW__) - return _mm_movm_epi8(_mm_cmpgt_epu8_mask(a, b)); -#elif defined(ARCH_X64) - return 
_mm_cmpeq_epi8(_mm_cmpeq_epi8(a, _mm_min_epu8(a, b)), _mm_setzero_si128()); -#elif defined(ARCH_ARM64) - return vcgtq_u8(a, b); -#endif -} - -inline v128 gv_gtu16(const v128& a, const v128& b) -{ -#if defined(__AVX512VL__) && defined(__AVX512BW__) - return _mm_movm_epi16(_mm_cmpgt_epu16_mask(a, b)); -#elif defined(__SSE4_1__) - return _mm_cmpeq_epi16(_mm_cmpeq_epi16(a, _mm_min_epu16(a, b)), _mm_setzero_si128()); -#elif defined(ARCH_X64) - return _mm_cmpeq_epi16(_mm_cmpeq_epi16(_mm_subs_epu16(a, b), _mm_setzero_si128()), _mm_setzero_si128()); -#elif defined(ARCH_ARM64) - return vcgtq_u16(a, b); -#endif -} - -inline v128 gv_gtu32(const v128& a, const v128& b) -{ -#if defined(__AVX512VL__) && defined(__AVX512DQ__) - return _mm_movm_epi32(_mm_cmpgt_epu32_mask(a, b)); -#elif defined(__SSE4_1__) - return _mm_cmpeq_epi32(_mm_cmpeq_epi32(a, _mm_min_epu32(a, b)), _mm_setzero_si128()); -#elif defined(ARCH_X64) - const auto sign = _mm_set1_epi32(smin); - return _mm_cmpgt_epi32(_mm_xor_si128(a, sign), _mm_xor_si128(b, sign)); -#elif defined(ARCH_ARM64) - return vcgtq_u32(a, b); -#endif -} - -// Ordered and greater than -inline v128 gv_gtfs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_cmpgt_ps(a, b); -#elif defined(ARCH_ARM64) - return vcgtq_f32(a, b); -#endif -} - -// Ordered and less than -inline v128 gv_ltfs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_cmplt_ps(a, b); -#elif defined(ARCH_ARM64) - return vcltq_f32(a, b); -#endif -} - -// Unordered or less or equal -inline v128 gv_ngtfs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_cmpngt_ps(a, b); -#elif defined(ARCH_ARM64) - return ~vcgtq_f32(a, b); -#endif -} - -// Unordered or greater or equal -inline v128 gv_nlefs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_cmpnle_ps(a, b); -#elif defined(ARCH_ARM64) - return ~vcleq_f32(a, b); -#endif -} - -inline v128 gv_geu8(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_cmpeq_epi8(b, _mm_min_epu8(a, b)); -#elif defined(ARCH_ARM64) - return vcgeq_u8(a, b); -#endif -} - -inline v128 gv_geu16(const v128& a, const v128& b) -{ -#if defined(__SSE4_1__) - return _mm_cmpeq_epi16(b, _mm_min_epu16(a, b)); -#elif defined(ARCH_X64) - return _mm_cmpeq_epi16(_mm_subs_epu16(b, a), _mm_setzero_si128()); -#elif defined(ARCH_ARM64) - return vcgeq_u16(a, b); -#endif -} - -inline v128 gv_geu32(const v128& a, const v128& b) -{ -#if defined(__SSE4_1__) - return _mm_cmpeq_epi32(b, _mm_min_epu32(a, b)); -#elif defined(ARCH_X64) - const auto sign = _mm_set1_epi32(smin); - return _mm_cmpeq_epi32(_mm_cmpgt_epi32(_mm_xor_si128(b, sign), _mm_xor_si128(a, sign)), _mm_setzero_si128()); -#elif defined(ARCH_ARM64) - return vcgeq_u32(a, b); -#endif -} - -// Ordered and not less than -inline v128 gv_gefs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_cmpge_ps(a, b); -#elif defined(ARCH_ARM64) - return vcgeq_f32(a, b); -#endif -} - -// Unordered or less than -inline v128 gv_ngefs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_cmpnge_ps(a, b); -#elif defined(ARCH_ARM64) - return ~vcgeq_f32(a, b); -#endif -} - -inline v128 gv_gts8(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_cmpgt_epi8(a, b); -#elif defined(ARCH_ARM64) - return vcgtq_s8(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline asmjit::vec_type gv_gts8(A&& a, B&& b) -{ - FOR_X64(binary_op, 1, kIdMovdqa, kIdPcmpgtb, kIdVpcmpgtb, kIdNone, std::forward(a), std::forward(b)); - return {}; 
-} - -inline v128 gv_gts16(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_cmpgt_epi16(a, b); -#elif defined(ARCH_ARM64) - return vcgtq_s16(a, b); -#endif -} - -inline v128 gv_gts32(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_cmpgt_epi32(a, b); -#elif defined(ARCH_ARM64) - return vcgtq_s32(a, b); -#endif -} - -template - requires(asmjit::any_operand_v) -inline asmjit::vec_type gv_gts32(A&& a, B&& b) -{ - FOR_X64(binary_op, 4, kIdMovdqa, kIdPcmpgtd, kIdVpcmpgtd, kIdNone, std::forward(a), std::forward(b)); - return {}; -} - -inline v128 gv_avgu8(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_avg_epu8(a, b); -#elif defined(ARCH_ARM64) - return vrhaddq_u8(a, b); -#endif -} - -inline v128 gv_avgu16(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_avg_epu16(a, b); -#elif defined(ARCH_ARM64) - return vrhaddq_u16(a, b); -#endif -} - -inline v128 gv_avgu32(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - const auto ones = _mm_set1_epi32(-1); - const auto summ = gv_sub32(gv_add32(a, b), ones); - const auto carry = _mm_slli_epi32(gv_geu32(a, summ), 31); - return _mm_or_si128(carry, _mm_srli_epi32(summ, 1)); -#elif defined(ARCH_ARM64) - return vrhaddq_u32(a, b); -#endif -} - -inline v128 gv_avgs8(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - const v128 sign = _mm_set1_epi8(smin); - return gv_avgu8(a ^ sign, b ^ sign) ^ sign; -#elif defined(ARCH_ARM64) - return vrhaddq_s8(a, b); -#endif -} - -inline v128 gv_avgs16(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - const v128 sign = _mm_set1_epi16(smin); - return gv_avgu16(a ^ sign, b ^ sign) ^ sign; -#elif defined(ARCH_ARM64) - return vrhaddq_s16(a, b); -#endif -} - -inline v128 gv_avgs32(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - const v128 sign = _mm_set1_epi32(smin); - return gv_avgu32(a ^ sign, b ^ sign) ^ sign; -#elif defined(ARCH_ARM64) - return vrhaddq_s32(a, b); -#endif -} - -inline v128 gv_divfs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_div_ps(a, b); -#elif defined(ARCH_ARM64) - return vdivq_f32(a, b); -#endif -} - -inline v128 gv_sqrtfs(const v128& a) -{ -#if defined(ARCH_X64) - return _mm_sqrt_ps(a); -#elif defined(ARCH_ARM64) - return vsqrtq_f32(a); -#endif -} - -inline v128 gv_fmafs(const v128& a, const v128& b, const v128& c) -{ -#if defined(ARCH_X64) && defined(__FMA__) - return _mm_fmadd_ps(a, b, c); -#elif defined(__FMA4__) - return _mm_macc_ps(a, b, c); -#elif defined(ARCH_X64) - // This is inaccurate implementation -#ifdef __AVX__ - const __m128 r = _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(_mm256_cvtps_pd(a), _mm256_cvtps_pd(b)), _mm256_cvtps_pd(c))); -#else - const __m128d a0 = _mm_cvtps_pd(a); - const __m128d a1 = _mm_cvtps_pd(_mm_movehl_ps(a, a)); - const __m128d b0 = _mm_cvtps_pd(b); - const __m128d b1 = _mm_cvtps_pd(_mm_movehl_ps(b, b)); - const __m128d c0 = _mm_cvtps_pd(c); - const __m128d c1 = _mm_cvtps_pd(_mm_movehl_ps(c, c)); - const __m128d m0 = _mm_mul_pd(a0, b0); - const __m128d m1 = _mm_mul_pd(a1, b1); - const __m128d r0 = _mm_add_pd(m0, c0); - const __m128d r1 = _mm_add_pd(m1, c1); - const __m128 r = _mm_movelh_ps(_mm_cvtpd_ps(r0), _mm_cvtpd_ps(r1)); -#endif - return r; -#elif defined(ARCH_ARM64) - return vfmaq_f32(c, a, b); -#else - v128 r; - for (int i = 0; i < 4; i++) - { - r._f[i] = std::fmaf(a._f[i], b._f[i], c._f[i]); } - return r; -#endif -} -inline v128 gv_muladdfs(const v128& a, const v128& b, const v128& c) -{ -#if defined(ARCH_X64) && defined(__FMA__) 
- return _mm_fmadd_ps(a, b, c); -#elif defined(__FMA4__) - return _mm_macc_ps(a, b, c); -#elif defined(ARCH_ARM64) - return vfmaq_f32(c, a, b); -#elif defined(ARCH_X64) - return _mm_add_ps(_mm_mul_ps(a, b), c); -#endif -} + template + requires(asmjit::any_operand_v) + inline asmjit::vec_type gv_gts32(A&&, B&&); -// -> ssat((a * b * 2 + (c << 16) + 0x8000) >> 16) -inline v128 gv_rmuladds_hds16(const v128& a, const v128& b, const v128& c) -{ -#if defined(ARCH_ARM64) -#if defined(ANDROID) - // This function used in optimized PPU interpreter only, we do not use interperters in android - return a; -#else - return vqrdmlahq_s16(c, a, b); -#endif -#elif defined(ARCH_X64) - const auto x80 = _mm_set1_epi16(0x80); // 0x80 * 0x80 = 0x4000, add this to the product - const auto al = _mm_unpacklo_epi16(a, x80); - const auto ah = _mm_unpackhi_epi16(a, x80); - const auto bl = _mm_unpacklo_epi16(b, x80); - const auto bh = _mm_unpackhi_epi16(b, x80); - const auto ml = _mm_srai_epi32(_mm_madd_epi16(al, bl), 15); - const auto mh = _mm_srai_epi32(_mm_madd_epi16(ah, bh), 15); - const auto cl = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), c), 16); - const auto ch = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), c), 16); - const auto sl = _mm_add_epi32(ml, cl); - const auto sh = _mm_add_epi32(mh, ch); - return _mm_packs_epi32(sl, sh); -#endif -} - -// -> ssat((a * b * 2 + 0x8000) >> 16) -inline v128 gv_rmuls_hds16(const v128& a, const v128& b) -{ -#if defined(ARCH_ARM64) - return vqrdmulhq_s16(a, b); -#elif defined(ARCH_X64) - const auto x80 = _mm_set1_epi16(0x80); // 0x80 * 0x80 = 0x4000, add this to the product - const auto al = _mm_unpacklo_epi16(a, x80); - const auto ah = _mm_unpackhi_epi16(a, x80); - const auto bl = _mm_unpacklo_epi16(b, x80); - const auto bh = _mm_unpackhi_epi16(b, x80); - const auto ml = _mm_srai_epi32(_mm_madd_epi16(al, bl), 15); - const auto mh = _mm_srai_epi32(_mm_madd_epi16(ah, bh), 15); - return _mm_packs_epi32(ml, mh); -#endif -} - -// -> ssat((a * b * 2) >> 16) -inline v128 gv_muls_hds16(const v128& a, const v128& b) -{ -#if defined(ARCH_ARM64) - return vqdmulhq_s16(a, b); -#elif defined(ARCH_X64) - const auto m = _mm_or_si128(_mm_srli_epi16(_mm_mullo_epi16(a, b), 15), _mm_slli_epi16(_mm_mulhi_epi16(a, b), 1)); - const auto s = _mm_cmpeq_epi16(m, _mm_set1_epi16(-0x8000)); // detect special case (positive 0x8000) - return _mm_xor_si128(m, s); -#endif -} - -inline v128 gv_muladd16(const v128& a, const v128& b, const v128& c) -{ -#if defined(ARCH_X64) - return _mm_add_epi16(_mm_mullo_epi16(a, b), c); -#elif defined(ARCH_ARM64) - return vmlaq_s16(c, a, b); -#endif -} - -inline v128 gv_mul16(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_mullo_epi16(a, b); -#elif defined(ARCH_ARM64) - return vmulq_s16(a, b); -#endif -} - -inline v128 gv_mul32(const v128& a, const v128& b) -{ -#if defined(__SSE4_1__) - return _mm_mullo_epi32(a, b); -#elif defined(ARCH_X64) - const __m128i lows = _mm_shuffle_epi32(_mm_mul_epu32(a, b), 8); - const __m128i highs = _mm_shuffle_epi32(_mm_mul_epu32(_mm_srli_epi64(a, 32), _mm_srli_epi64(b, 32)), 8); - return _mm_unpacklo_epi32(lows, highs); -#elif defined(ARCH_ARM64) - return vmulq_s32(a, b); -#endif -} - -inline v128 gv_mulfs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_mul_ps(a, b); -#elif defined(ARCH_ARM64) - return vmulq_f32(a, b); -#endif -} - -inline v128 gv_mulfs(const v128& a, f32 b) -{ -#if defined(ARCH_X64) - return _mm_mul_ps(a, _mm_set_ps1(b)); -#elif defined(ARCH_ARM64) - return 
vmulq_n_f32(a, b); -#endif -} - -inline v128 gv_hadds8x2(const v128& a) -{ -#if defined(__SSSE3__) - return _mm_maddubs_epi16(_mm_set1_epi8(1), a); -#elif defined(ARCH_X64) - return _mm_add_epi16(_mm_srai_epi16(a, 8), _mm_srai_epi16(_mm_slli_epi16(a, 8), 8)); -#elif defined(ARCH_ARM64) - return vpaddlq_s8(a); -#endif -} - -inline v128 gv_hadds8x4(const v128& a, const v128& c) -{ -#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) - return _mm_dpbusd_epi32(c, _mm_set1_epi8(1), a); -#elif defined(__SSSE3__) - return _mm_add_epi32(_mm_madd_epi16(_mm_maddubs_epi16(_mm_set1_epi8(1), a), _mm_set1_epi16(1)), c); -#elif defined(ARCH_X64) - return _mm_add_epi32(_mm_madd_epi16(_mm_add_epi16(_mm_srai_epi16(a, 8), _mm_srai_epi16(_mm_slli_epi16(a, 8), 8)), _mm_set1_epi16(1)), c); -#elif defined(ARCH_ARM64) - return vaddq_s32(vpaddlq_s16(vpaddlq_s8(a)), c); -#endif -} - -inline v128 gv_haddu8x2(const v128& a) -{ -#if defined(__SSSE3__) - return _mm_maddubs_epi16(a, _mm_set1_epi8(1)); -#elif defined(ARCH_X64) - return _mm_add_epi16(_mm_srli_epi16(a, 8), _mm_and_si128(a, _mm_set1_epi16(0x00ff))); -#elif defined(ARCH_ARM64) - return vpaddlq_u8(a); -#endif -} - -inline v128 gv_haddu8x4(const v128& a) -{ -#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) - return _mm_dpbusd_epi32(_mm_setzero_si128(), a, _mm_set1_epi8(1)); -#elif defined(__SSSE3__) - return _mm_madd_epi16(_mm_maddubs_epi16(a, _mm_set1_epi8(1)), _mm_set1_epi16(1)); -#elif defined(ARCH_X64) - return _mm_madd_epi16(_mm_add_epi16(_mm_srli_epi16(a, 8), _mm_and_si128(a, _mm_set1_epi16(0x00ff))), _mm_set1_epi16(1)); -#elif defined(ARCH_ARM64) - return vpaddlq_u16(vpaddlq_u8(a)); -#endif -} - -inline v128 gv_hadds16x2(const v128& a, const v128& c) -{ -#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) - return _mm_dpwssd_epi32(c, a, _mm_set1_epi8(1)); -#elif defined(ARCH_X64) - return _mm_add_epi32(_mm_madd_epi16(a, _mm_set1_epi16(1)), c); -#elif defined(ARCH_ARM64) - return vaddq_s32(vpaddlq_s16(a), c); -#endif -} - -// Unsigned bytes from a, signed bytes from b, 32-bit accumulator c -inline v128 gv_dotu8s8x4(const v128& a, const v128& b, const v128& c) -{ -#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) - return _mm_dpbusd_epi32(c, a, b); -#elif defined(ARCH_X64) - const __m128i ah = _mm_srli_epi16(a, 8); - const __m128i al = _mm_and_si128(a, _mm_set1_epi16(0x00ff)); - const __m128i bh = _mm_srai_epi16(b, 8); - const __m128i bl = _mm_srai_epi16(_mm_slli_epi16(b, 8), 8); - const __m128i mh = _mm_madd_epi16(ah, bh); - const __m128i ml = _mm_madd_epi16(al, bl); - const __m128i x = _mm_add_epi32(mh, ml); - return _mm_add_epi32(c, x); -#elif defined(__ARM_FEATURE_MATMUL_INT8) - return vusdotq_s32(c, a, b); -#elif defined(ARCH_ARM64) - const auto l = vpaddlq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), vmovl_s8(vget_low_s8(b)))); - const auto h = vpaddlq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))), vmovl_s8(vget_high_s8(b)))); - return vaddq_s32(c, vaddq_s32(vuzp1q_s32(l, h), vuzp2q_s32(l, h))); -#endif -} - -inline v128 gv_dotu8x4(const v128& a, const v128& b, const v128& c) -{ -#if defined(ARCH_X64) - const __m128i ah = _mm_srli_epi16(a, 8); - const __m128i al = _mm_and_si128(a, _mm_set1_epi16(0x00ff)); - const __m128i bh = _mm_srli_epi16(b, 8); - const __m128i bl = _mm_and_si128(b, _mm_set1_epi16(0x00ff)); - const __m128i mh = _mm_madd_epi16(ah, bh); - const __m128i ml = _mm_madd_epi16(al, bl); - 
const __m128i x = _mm_add_epi32(mh, ml); - return _mm_add_epi32(c, x); -#elif defined(__ARM_FEATURE_DOTPROD) - return vdotq_u32(c, a, b); -#elif defined(ARCH_ARM64) - const auto l = vpaddlq_u16(vmulq_u16(vmovl_u8(vget_low_u8(a)), vmovl_u8(vget_low_u8(b)))); - const auto h = vpaddlq_u16(vmulq_u16(vmovl_u8(vget_high_u8(a)), vmovl_u8(vget_high_u8(b)))); - return vaddq_u32(c, vaddq_u32(vuzp1q_u32(l, h), vuzp2q_u32(l, h))); -#endif -} - -inline v128 gv_dots16x2(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_madd_epi16(a, b); -#elif defined(ARCH_ARM64) - const auto ml = vmull_s16(vget_low_s16(a), vget_low_s16(b)); - const auto mh = vmull_s16(vget_high_s16(a), vget_high_s16(b)); - const auto sl = vpadd_s32(vget_low_s32(ml), vget_high_s32(ml)); - const auto sh = vpadd_s32(vget_low_s32(mh), vget_high_s32(mh)); - return vcombine_s32(sl, sh); -#endif -} - -// Signed s16 from a and b, 32-bit accumulator c -inline v128 gv_dots16x2(const v128& a, const v128& b, const v128& c) -{ -#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) - return _mm_dpwssd_epi32(c, a, b); -#else - return gv_add32(c, gv_dots16x2(a, b)); -#endif -} - -inline v128 gv_dotu16x2(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - const auto ml = _mm_mullo_epi16(a, b); // low results - const auto mh = _mm_mulhi_epu16(a, b); // high results - const auto ls = _mm_add_epi32(_mm_srli_epi32(ml, 16), _mm_and_si128(ml, _mm_set1_epi32(0x0000ffff))); - const auto hs = _mm_add_epi32(_mm_slli_epi32(mh, 16), _mm_and_si128(mh, _mm_set1_epi32(0xffff0000))); - return _mm_add_epi32(ls, hs); -#elif defined(ARCH_ARM64) - const auto ml = vmull_u16(vget_low_u16(a), vget_low_u16(b)); - const auto mh = vmull_u16(vget_high_u16(a), vget_high_u16(b)); - const auto sl = vpadd_u32(vget_low_u32(ml), vget_high_u32(ml)); - const auto sh = vpadd_u32(vget_low_u32(mh), vget_high_u32(mh)); - return vcombine_u32(sl, sh); -#endif -} - -// Unsigned bytes from a, signed bytes from b, 32-bit accumulator c -inline v128 gv_dots_u8s8x4(const v128& a, const v128& b, const v128& c) -{ -#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) - return _mm_dpbusds_epi32(c, a, b); -#elif defined(ARCH_X64) - const __m128i ah = _mm_srli_epi16(a, 8); - const __m128i al = _mm_and_si128(a, _mm_set1_epi16(0x00ff)); - const __m128i bh = _mm_srai_epi16(b, 8); - const __m128i bl = _mm_srai_epi16(_mm_slli_epi16(b, 8), 8); - const __m128i mh = _mm_madd_epi16(ah, bh); - const __m128i ml = _mm_madd_epi16(al, bl); - return gv_adds_s32(c, _mm_add_epi32(mh, ml)); -#elif defined(ARCH_ARM64) - const auto l = vpaddlq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), vmovl_s8(vget_low_s8(b)))); - const auto h = vpaddlq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))), vmovl_s8(vget_high_s8(b)))); - return vqaddq_s32(c, vaddq_s32(vuzp1q_s32(l, h), vuzp2q_s32(l, h))); -#endif -} - -// Signed s16 from a and b, 32-bit accumulator c; signed saturation -inline v128 gv_dots_s16x2(const v128& a, const v128& b, const v128& c) -{ -#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) - return _mm_dpwssds_epi32(c, a, b); -#else - const auto ab = gv_dots16x2(a, b); - const auto s0 = gv_adds_s32(ab, c); - const auto s1 = gv_eq32(ab, gv_bcst32(0x80000000)); // +0x80000000, negative c -> c^0x80000000; otherwise 0x7fffffff - const auto s2 = gv_select32(gv_gts32(gv_bcst32(0), c), gv_xor32(c, gv_bcst32(0x80000000)), gv_bcst32(0x7fffffff)); - return gv_select32(s1, s2, s0); -#endif -} - -// 
Multiply s16 elements 0, 2, 4, 6 to produce s32 results in corresponding lanes -inline v128 gv_mul_even_s16(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - const auto c = _mm_set1_epi32(0x0000ffff); - return _mm_madd_epi16(_mm_and_si128(a, c), _mm_and_si128(b, c)); -#else - // TODO - return gv_mul32(gv_sar32(gv_shl32(a, 16), 16), gv_sar32(gv_shl32(b, 16), 16)); -#endif -} - -// Multiply u16 elements 0, 2, 4, 6 to produce u32 results in corresponding lanes -inline v128 gv_mul_even_u16(const v128& a, const v128& b) -{ -#if defined(__SSE4_1__) || defined(ARCH_ARM64) - const auto c = gv_bcst32(0x0000ffff); - return gv_mul32(a & c, b & c); -#elif defined(ARCH_X64) - const auto ml = _mm_mullo_epi16(a, b); - const auto mh = _mm_mulhi_epu16(a, b); - return _mm_or_si128(_mm_and_si128(ml, _mm_set1_epi32(0x0000ffff)), _mm_slli_epi32(mh, 16)); -#endif -} - -// Multiply s16 elements 1, 3, 5, 7 to produce s32 results in corresponding lanes -inline v128 gv_mul_odds_s16(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_madd_epi16(_mm_srli_epi32(a, 16), _mm_srli_epi32(b, 16)); -#else - return gv_mul32(gv_sar32(a, 16), gv_sar32(b, 16)); -#endif -} - -// Multiply u16 elements 1, 3, 5, 7 to produce u32 results in corresponding lanes -inline v128 gv_mul_odds_u16(const v128& a, const v128& b) -{ -#if defined(__SSE4_1__) || defined(ARCH_ARM64) - return gv_mul32(gv_shr32(a, 16), gv_shr32(b, 16)); -#elif defined(ARCH_X64) - const auto ml = _mm_mullo_epi16(a, b); - const auto mh = _mm_mulhi_epu16(a, b); - return _mm_or_si128(_mm_and_si128(mh, _mm_set1_epi32(0xffff0000)), _mm_srli_epi32(ml, 16)); -#endif -} - -inline v128 gv_cvts32_tofs(const v128& src) -{ -#if defined(ARCH_X64) - return _mm_cvtepi32_ps(src); -#elif defined(ARCH_ARM64) - return vcvtq_f32_s32(src); -#endif -} - -inline v128 gv_cvtu32_tofs(const v128& src) -{ -#if defined(__AVX512VL__) - return _mm_cvtepu32_ps(src); -#elif defined(ARCH_X64) - const auto fix = _mm_and_ps(_mm_castsi128_ps(_mm_srai_epi32(src, 31)), _mm_set1_ps(0x80000000)); - return _mm_add_ps(_mm_cvtepi32_ps(_mm_and_si128(src, _mm_set1_epi32(0x7fffffff))), fix); -#elif defined(ARCH_ARM64) - return vcvtq_f32_u32(src); -#endif -} - -inline v128 gv_cvtfs_tos32(const v128& src) -{ -#if defined(ARCH_X64) - return _mm_cvttps_epi32(src); -#elif defined(ARCH_ARM64) - return vcvtq_s32_f32(src); -#endif -} - -inline v128 gv_cvtfs_tou32(const v128& src) -{ -#if defined(__AVX512VL__) - return _mm_cvttps_epu32(src); -#elif defined(ARCH_X64) - const auto c1 = _mm_cvttps_epi32(src); - const auto s1 = _mm_srai_epi32(c1, 31); - const auto c2 = _mm_cvttps_epi32(_mm_sub_ps(src, _mm_set1_ps(2147483648.))); - return _mm_or_si128(c1, _mm_and_si128(c2, s1)); -#elif defined(ARCH_ARM64) - return vcvtq_u32_f32(src); -#endif -} - -namespace utils -{ - inline f32 roundevenf32(f32 arg) + template + requires(asmjit::any_operand_v) + inline auto gv_and32(A&& a, B&& b) { - u32 val = std::bit_cast(arg); - u32 exp = (val >> 23) & 0xff; - u32 abs = val & 0x7fffffff; - - if (exp >= 127 + 23) - { - // Big enough, NaN or INF - return arg; - } - else if (exp >= 127) - { - u32 int_pos = (127 + 23) - exp; - u32 half_pos = int_pos - 1; - u32 half_bit = 1u << half_pos; - u32 int_bit = 1u << int_pos; - if (val & (int_bit | (half_bit - 1))) - val += half_bit; - val &= ~(int_bit - 1); - } - else if (exp == 126 && abs > 0x3f000000) - { - val &= 0x80000000; - val |= 0x3f800000; - } - else - { - val &= 0x80000000; - } - - return std::bit_cast(val); + FOR_X64(binary_op, 4, kIdMovdqa, kIdPand, kIdVpand, 
kIdVpandd, std::forward(a), std::forward(b)); } -} // namespace utils -#if defined(ARCH_X64) -template -const auto sse41_roundf = build_function_asm<__m128 (*)(__m128)>("sse41_roundf", [](native_asm& c, native_args&) + template + requires(asmjit::any_operand_v) + inline auto gv_andfs(A&& a, B&& b) + { + FOR_X64(binary_op, 4, kIdMovaps, kIdAndps, kIdVandps, kIdVandps, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_andn32(A&& a, B&& b) + { + FOR_X64(binary_op, 4, kIdMovdqa, kIdPandn, kIdVpandn, kIdVpandnd, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_andnfs(A&& a, B&& b) + { + FOR_X64(binary_op, 4, kIdMovaps, kIdAndnps, kIdVandnps, kIdVandnps, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_or32(A&& a, B&& b) + { + FOR_X64(binary_op, 4, kIdMovdqa, kIdPor, kIdVpor, kIdVpord, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_orfs(A&& a, B&& b) + { + FOR_X64(binary_op, 4, kIdMovaps, kIdOrps, kIdVorps, kIdVorps, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_xor32(A&& a, B&& b) + { + FOR_X64(binary_op, 4, kIdMovdqa, kIdPxor, kIdVpxor, kIdVpxord, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_xorfs(A&& a, B&& b) + { + FOR_X64(binary_op, 4, kIdMovaps, kIdXorps, kIdVxorps, kIdVxorps, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_not32(A&& a) + { +#if defined(ARCH_X64) + asmjit::vec_type ones = g_vc->vec_alloc(); + g_vc->pcmpeqd(ones, ones); + FOR_X64(binary_op, 4, kIdMovdqa, kIdPxor, kIdVpxor, kIdVpxord, std::move(ones), std::forward(a)); +#endif + } + + template + requires(asmjit::any_operand_v) + inline auto gv_notfs(A&& a) + { +#if defined(ARCH_X64) + asmjit::vec_type ones = g_vc->vec_alloc(); + g_vc->pcmpeqd(ones, ones); + FOR_X64(binary_op, 4, kIdMovaps, kIdXorps, kIdVxorps, kIdVxorps, std::move(ones), std::forward(a)); +#endif + } + + template + requires(asmjit::any_operand_v) + inline auto gv_shl16(A&& a, u32 count) + { + FOR_X64(unary_op, kIdPsllw, kIdVpsllw, std::forward(a), count); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_shl32(A&& a, u32 count) + { + FOR_X64(unary_op, kIdPslld, kIdVpslld, std::forward(a), count); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_shl64(A&& a, u32 count) + { + FOR_X64(unary_op, kIdPsllq, kIdVpsllq, std::forward(a), count); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_shr16(A&& a, u32 count) + { + FOR_X64(unary_op, kIdPsrlw, kIdVpsrlw, std::forward(a), count); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_shr32(A&& a, u32 count) + { + FOR_X64(unary_op, kIdPsrld, kIdVpsrld, std::forward(a), count); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_shr64(A&& a, u32 count) + { + FOR_X64(unary_op, kIdPsrlq, kIdVpsrlq, std::forward(a), count); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_sar16(A&& a, u32 count) + { + FOR_X64(unary_op, kIdPsraw, kIdVpsraw, std::forward(a), count); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_sar32(A&& a, u32 count) + { + FOR_X64(unary_op, kIdPsrad, kIdVpsrad, std::forward(a), count); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_sar64(A&& a, u32 count) + { + if (count >= 
64) + count = 63; +#if defined(ARCH_X64) + using enum asmjit::x86::Inst::Id; + if (utils::has_avx512()) + return asmjit::unary_op(kIdNone, kIdVpsraq, std::forward(a), count); + g_vc->fail_flag = true; + return std::forward(a); +#endif + } + + template + requires(asmjit::any_operand_v) + inline auto gv_add8(A&& a, B&& b) + { + FOR_X64(binary_op, 1, kIdMovdqa, kIdPaddb, kIdVpaddb, kIdNone, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_add16(A&& a, B&& b) + { + FOR_X64(binary_op, 2, kIdMovdqa, kIdPaddw, kIdVpaddw, kIdNone, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_add32(A&& a, B&& b) + { + FOR_X64(binary_op, 4, kIdMovdqa, kIdPaddd, kIdVpaddd, kIdVpaddd, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_add64(A&& a, B&& b) + { + FOR_X64(binary_op, 8, kIdMovdqa, kIdPaddq, kIdVpaddq, kIdVpaddq, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_adds_s8(A&& a, B&& b) + { + FOR_X64(binary_op, 1, kIdMovdqa, kIdPaddsb, kIdVpaddsb, kIdNone, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_adds_s16(A&& a, B&& b) + { + FOR_X64(binary_op, 2, kIdMovdqa, kIdPaddsw, kIdVpaddsw, kIdNone, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_adds_s32(A&& a, B&& b) + { +#if defined(ARCH_X64) + auto s = gv_add32(a, b); + auto m = gv_and32(gv_xor32(std::forward(a), s), gv_xor32(std::forward(b), s)); + auto x = gv_sar32(m, 31); + auto y = gv_sar32(gv_and32(s, std::move(m)), 31); + auto z = gv_xor32(gv_shr32(x, 1), std::move(y)); + return gv_xor32(std::move(z), gv_or32(std::move(s), std::move(x))); +#endif + } + + template + requires(asmjit::any_operand_v) + inline auto gv_addus_u8(A&& a, B&& b) + { + FOR_X64(binary_op, 1, kIdMovdqa, kIdPaddusb, kIdVpaddusb, kIdNone, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline auto gv_addus_u16(A&& a, B&& b) + { + FOR_X64(binary_op, 2, kIdMovdqa, kIdPaddusw, kIdVpaddusw, kIdNone, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline asmjit::vec_type gv_minu32(A&& a, B&& b) + { +#if defined(ARCH_X64) + if (utils::has_sse41()) + FOR_X64(binary_op, 4, kIdMovdqa, kIdPminud, kIdVpminud, kIdVpminud, std::forward(a), std::forward(b)); + auto s = gv_bcst32(0x80000000); + auto x = gv_xor32(a, s); + auto m = gv_gts32(std::move(x), gv_xor32(std::move(s), b)); + auto z = gv_and32(m, std::move(b)); + return gv_or32(std::move(z), gv_andn32(std::move(m), std::move(a))); +#endif + return {}; + } + + template + requires(asmjit::any_operand_v) + inline asmjit::vec_type gv_addus_u32(A&& a, B&& b) + { +#if defined(ARCH_X64) + if (utils::has_sse41()) + return gv_add32(gv_minu32(std::forward(b), gv_not32(a)), std::forward(a)); + auto s = gv_add32(a, b); + auto x = gv_xor32(std::forward(b), gv_bcst32(0x80000000)); + auto y = gv_xor32(std::forward(a), gv_bcst32(0x7fffffff)); + return gv_or32(std::move(s), gv_gts32(std::move(x), std::move(y))); +#endif + return {}; + } + + template + requires(asmjit::any_operand_v) + inline auto gv_sub8(A&& a, B&& b) + { + FOR_X64(binary_op, 1, kIdMovdqa, kIdPsubb, kIdVpsubb, kIdNone, std::forward(a), std::forward(b)); + } + + template + requires(asmjit::any_operand_v) + inline asmjit::vec_type gv_gts8(A&& a, B&& b) + { + FOR_X64(binary_op, 1, kIdMovdqa, 
kIdPcmpgtb, kIdVpcmpgtb, kIdNone, std::forward(a), std::forward(b)); + return {}; + } + + template + requires(asmjit::any_operand_v) + inline asmjit::vec_type gv_gts32(A&& a, B&& b) + { + FOR_X64(binary_op, 4, kIdMovdqa, kIdPcmpgtd, kIdVpcmpgtd, kIdNone, std::forward(a), std::forward(b)); + return {}; + } + + template + requires(asmjit::any_operand_v) + inline asmjit::vec_type gv_signselect8(A&& bits, B&& _true, C&& _false) { - static_assert(Mode < 4); using namespace asmjit; +#if defined(ARCH_X64) if (utils::has_avx()) - c.vroundps(x86::xmm0, x86::xmm0, 8 + Mode); - else if (utils::has_sse41()) - c.roundps(x86::xmm0, x86::xmm0, 8 + Mode); - else - c.jmp(+[](__m128 a) -> __m128 - { - v128 r = a; - for (u32 i = 0; i < 4; i++) - if constexpr (Mode == 0) - r._f[i] = utils::roundevenf32(r._f[i]); - else if constexpr (Mode == 1) - r._f[i] = ::floorf(r._f[i]); - else if constexpr (Mode == 2) - r._f[i] = ::ceilf(r._f[i]); - else if constexpr (Mode == 3) - r._f[i] = ::truncf(r._f[i]); - return r; - }); - c.ret(); - }); + { + Operand arg0{}; + Operand arg1 = arg_eval(std::forward(bits), 16); + Operand arg2 = arg_eval(std::forward(_true), 16); + Operand arg3 = arg_eval(std::forward(_false), 16); + if constexpr (!std::is_reference_v) + arg0.isReg() ? arg_free(bits) : arg0.copyFrom(arg1); + if constexpr (!std::is_reference_v) + arg0.isReg() ? arg_free(_true) : arg0.copyFrom(arg2); + if constexpr (!std::is_reference_v) + arg0.isReg() ? arg_free(_false) : arg0.copyFrom(arg3); + if (arg0.isNone()) + arg0 = g_vc->vec_alloc(); + g_vc->emit(x86::Inst::kIdVpblendvb, arg0, arg3, arg2, arg1); + vec_type r; + r.copyFrom(arg0); + return r; + } #endif - -inline v128 gv_roundfs_even(const v128& a) -{ -#if defined(__SSE4_1__) - return _mm_round_ps(a, 8 + 0); -#elif defined(ARCH_ARM64) - return vrndnq_f32(a); -#elif defined(ARCH_X64) - return sse41_roundf<0>(a); -#else - v128 r; - for (u32 i = 0; i < 4; i++) - r._f[i] = utils::roundevenf32(a._f[i]); - return r; -#endif -} - -inline v128 gv_roundfs_ceil(const v128& a) -{ -#if defined(__SSE4_1__) - return _mm_round_ps(a, 8 + 2); -#elif defined(ARCH_ARM64) - return vrndpq_f32(a); -#elif defined(ARCH_X64) - return sse41_roundf<2>(a); -#else - v128 r; - for (u32 i = 0; i < 4; i++) - r._f[i] = ::ceilf(a._f[i]); - return r; -#endif -} - -inline v128 gv_roundfs_floor(const v128& a) -{ -#if defined(__SSE4_1__) - return _mm_round_ps(a, 8 + 1); -#elif defined(ARCH_ARM64) - return vrndmq_f32(a); -#elif defined(ARCH_X64) - return sse41_roundf<1>(a); -#else - v128 r; - for (u32 i = 0; i < 4; i++) - r._f[i] = ::floorf(a._f[i]); - return r; -#endif -} - -inline v128 gv_roundfs_trunc(const v128& a) -{ -#if defined(__SSE4_1__) - return _mm_round_ps(a, 8 + 3); -#elif defined(ARCH_ARM64) - return vrndq_f32(a); -#elif defined(ARCH_X64) - return sse41_roundf<3>(a); -#else - v128 r; - for (u32 i = 0; i < 4; i++) - r._f[i] = ::truncf(a._f[i]); - return r; -#endif -} - -inline bool gv_testz(const v128& a) -{ -#if defined(__SSE4_1__) - return !!_mm_testz_si128(a, a); -#elif defined(ARCH_X64) - return _mm_cvtsi128_si64(_mm_packs_epi32(a, a)) == 0; -#elif defined(ARCH_ARM64) - return std::bit_cast(vqmovn_s32(a)) == 0; -#else - return !(a._u64[0] | a._u64[1]); -#endif -} - -// Same as gv_testz but tuned for pairing with gv_testall1 -inline bool gv_testall0(const v128& a) -{ -#if defined(__SSE4_1__) - return !!_mm_testz_si128(a, _mm_set1_epi32(-1)); -#elif defined(ARCH_X64) - return _mm_cvtsi128_si64(_mm_packs_epi32(a, a)) == 0; -#elif defined(ARCH_ARM64) - return std::bit_cast(vqmovn_s32(a)) == 
0; -#else - return !(a._u64[0] | a._u64[1]); -#endif -} - -inline bool gv_testall1(const v128& a) -{ -#if defined(__SSE4_1__) - return !!_mm_test_all_ones(a); -#elif defined(ARCH_X64) - return _mm_cvtsi128_si64(_mm_packs_epi32(a, a)) == -1; -#elif defined(ARCH_ARM64) - return std::bit_cast(vqmovn_s32(a)) == -1; -#else - return (a._u64[0] & a._u64[1]) == UINT64_MAX; -#endif -} - -// result = (~a) & (b) -inline v128 gv_andn(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_andnot_si128(a, b); -#elif defined(ARCH_ARM64) - return vbicq_s32(b, a); -#endif -} - -// Select elements; _cmp must be result of SIMD comparison; undefined otherwise -FORCE_INLINE v128 gv_select8(const v128& _cmp, const v128& _true, const v128& _false) -{ -#if defined(__SSE4_1__) - return _mm_blendv_epi8(_false, _true, _cmp); -#elif defined(ARCH_ARM64) - return vbslq_u8(_cmp, _true, _false); -#else - return (_cmp & _true) | gv_andn(_cmp, _false); -#endif -} - -// Select elements using sign bit only -FORCE_INLINE v128 gv_signselect8(const v128& bits, const v128& _true, const v128& _false) -{ -#if defined(__SSE4_1__) - return _mm_blendv_epi8(_false, _true, bits); -#else - return gv_select8(gv_gts8(gv_bcst8(0), bits), _true, _false); -#endif -} - -template - requires(asmjit::any_operand_v) -inline asmjit::vec_type gv_signselect8(A&& bits, B&& _true, C&& _false) -{ - using namespace asmjit; -#if defined(ARCH_X64) - if (utils::has_avx()) - { - Operand arg0{}; - Operand arg1 = arg_eval(std::forward(bits), 16); - Operand arg2 = arg_eval(std::forward(_true), 16); - Operand arg3 = arg_eval(std::forward(_false), 16); - if constexpr (!std::is_reference_v) - arg0.isReg() ? arg_free(bits) : arg0.copyFrom(arg1); - if constexpr (!std::is_reference_v) - arg0.isReg() ? arg_free(_true) : arg0.copyFrom(arg2); - if constexpr (!std::is_reference_v) - arg0.isReg() ? 
arg_free(_false) : arg0.copyFrom(arg3); - if (arg0.isNone()) - arg0 = g_vc->vec_alloc(); - g_vc->emit(x86::Inst::kIdVpblendvb, arg0, arg3, arg2, arg1); - vec_type r; - r.copyFrom(arg0); - return r; + g_vc->fail_flag = true; + return vec_type{0}; } -#endif - g_vc->fail_flag = true; - return vec_type{0}; -} -// Select elements; _cmp must be result of SIMD comparison; undefined otherwise -inline v128 gv_select16(const v128& _cmp, const v128& _true, const v128& _false) -{ -#if defined(__SSE4_1__) - return _mm_blendv_epi8(_false, _true, _cmp); -#elif defined(ARCH_ARM64) - return vbslq_u16(_cmp, _true, _false); -#else - return (_cmp & _true) | gv_andn(_cmp, _false); -#endif -} - -// Select elements; _cmp must be result of SIMD comparison; undefined otherwise -inline v128 gv_select32(const v128& _cmp, const v128& _true, const v128& _false) -{ -#if defined(__SSE4_1__) - return _mm_blendv_epi8(_false, _true, _cmp); -#elif defined(ARCH_ARM64) - return vbslq_u32(_cmp, _true, _false); -#else - return (_cmp & _true) | gv_andn(_cmp, _false); -#endif -} - -// Select elements; _cmp must be result of SIMD comparison; undefined otherwise -inline v128 gv_selectfs(const v128& _cmp, const v128& _true, const v128& _false) -{ -#if defined(__SSE4_1__) - return _mm_blendv_ps(_false, _true, _cmp); -#elif defined(ARCH_ARM64) - return vbslq_f32(_cmp, _true, _false); -#else - return _mm_or_ps(_mm_and_ps(_cmp, _true), _mm_andnot_ps(_cmp, _false)); -#endif -} - -inline v128 gv_packss_s16(const v128& low, const v128& high) -{ + template + requires(asmjit::any_operand_v) + inline auto gv_extend_lo_s8(A&& a) + { #if defined(ARCH_X64) - return _mm_packs_epi16(low, high); -#elif defined(ARCH_ARM64) - return vcombine_s8(vqmovn_s16(low), vqmovn_s16(high)); + using enum asmjit::x86::Inst::Id; + if (utils::has_sse41()) + return asmjit::unary_op(kIdNone, kIdPmovsxbw, std::forward(a)); + return asmjit::unary_op(kIdPsraw, kIdVpsraw, asmjit::unary_op(kIdNone, kIdPunpcklbw, std::forward(a)), 8); #endif -} + } -inline v128 gv_packus_s16(const v128& low, const v128& high) -{ + template + requires(asmjit::any_operand_v) + inline auto gv_extend_hi_s8(A&& a) + { #if defined(ARCH_X64) - return _mm_packus_epi16(low, high); -#elif defined(ARCH_ARM64) - return vcombine_u8(vqmovun_s16(low), vqmovun_s16(high)); + using enum asmjit::x86::Inst::Id; + return asmjit::unary_op(kIdPsraw, kIdVpsraw, asmjit::unary_op(kIdNone, kIdPunpckhbw, std::forward(a)), 8); #endif -} + } -inline v128 gv_packus_u16(const v128& low, const v128& high) -{ -#if defined(__SSE4_1__) - return _mm_packus_epi16(_mm_min_epu16(low, _mm_set1_epi16(0xff)), _mm_min_epu16(high, _mm_set1_epi16(0xff))); -#elif defined(ARCH_X64) - return _mm_packus_epi16(_mm_sub_epi16(low, _mm_subs_epu16(low, _mm_set1_epi16(0xff))), _mm_sub_epi16(high, _mm_subs_epu16(high, _mm_set1_epi16(0xff)))); -#elif defined(ARCH_ARM64) - return vcombine_u8(vqmovn_u16(low), vqmovn_u16(high)); -#endif -} - -inline v128 gv_packtu16(const v128& low, const v128& high) -{ + template + requires(asmjit::any_operand_v) + inline auto gv_extend_lo_s16(A&& a) + { #if defined(ARCH_X64) - return _mm_packus_epi16(low & _mm_set1_epi16(0xff), high & _mm_set1_epi16(0xff)); -#elif defined(ARCH_ARM64) - return vuzp1q_s8(low, high); + using enum asmjit::x86::Inst::Id; + if (utils::has_sse41()) + return asmjit::unary_op(kIdNone, kIdPmovsxwd, std::forward(a)); + return asmjit::unary_op(kIdPsrad, kIdVpsrad, asmjit::unary_op(kIdNone, kIdPunpcklwd, std::forward(a)), 16); #endif -} + } -inline v128 gv_packss_s32(const v128& low, const 
v128& high) -{ + template + requires(asmjit::any_operand_v) + inline auto gv_extend_hi_s16(A&& a) + { #if defined(ARCH_X64) - return _mm_packs_epi32(low, high); -#elif defined(ARCH_ARM64) - return vcombine_s16(vqmovn_s32(low), vqmovn_s32(high)); + using enum asmjit::x86::Inst::Id; + return asmjit::unary_op(kIdPsrad, kIdVpsrad, asmjit::unary_op(kIdNone, kIdPunpckhwd, std::forward(a)), 16); #endif -} + } -inline v128 gv_packus_s32(const v128& low, const v128& high) -{ -#if defined(__SSE4_1__) - return _mm_packus_epi32(low, high); -#elif defined(ARCH_X64) - const auto s = _mm_srai_epi16(_mm_packs_epi32(low, high), 15); - const auto r = gv_add16(_mm_packs_epi32(gv_sub32(low, gv_bcst32(0x8000)), gv_sub32(high, gv_bcst32(0x8000))), gv_bcst16(0x8000)); - return gv_andn(s, r); -#elif defined(ARCH_ARM64) - return vcombine_u16(vqmovun_s32(low), vqmovun_s32(high)); -#endif -} + template + requires(asmjit::any_operand_v) + inline auto gv_shuffle_left(A&& a) + { + FOR_X64(unary_op, kIdPslldq, kIdVpslldq, std::forward(a), Count); + } -inline v128 gv_packus_u32(const v128& low, const v128& high) -{ -#if defined(__SSE4_1__) - return _mm_packus_epi32(_mm_min_epu32(low, _mm_set1_epi32(0xffff)), _mm_min_epu32(high, _mm_set1_epi32(0xffff))); -#elif defined(ARCH_X64) - const v128 s = _mm_cmpgt_epi16(_mm_packs_epi32(_mm_srli_epi32(low, 16), _mm_srli_epi32(high, 16)), _mm_setzero_si128()); - const v128 r = _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(low, 16), 16), _mm_srai_epi32(_mm_slli_epi32(high, 16), 16)); - return _mm_or_si128(r, s); -#elif defined(ARCH_ARM64) - return vcombine_u16(vqmovn_u32(low), vqmovn_u32(high)); -#endif -} + template + requires(asmjit::any_operand_v) + inline auto gv_shuffle_right(A&& a) + { + FOR_X64(unary_op, kIdPsrldq, kIdVpsrldq, std::forward(a), Count); + } -inline v128 gv_packtu32(const v128& low, const v128& high) -{ -#if defined(__SSE4_1__) - return _mm_packus_epi32(low & _mm_set1_epi32(0xffff), high & _mm_set1_epi32(0xffff)); -#elif defined(ARCH_X64) - return _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(low, 16), 16), _mm_srai_epi32(_mm_slli_epi32(high, 16), 16)); -#elif defined(ARCH_ARM64) - return vuzp1q_s16(low, high); -#endif -} - -inline v128 gv_unpacklo8(const v128& lows, const v128& highs) -{ -#if defined(ARCH_X64) - return _mm_unpacklo_epi8(lows, highs); -#elif defined(ARCH_ARM64) - return vzip1q_s8(lows, highs); -#endif -} - -inline v128 gv_extend_lo_s8(const v128& vec) -{ -#if defined(__SSE4_1__) - return _mm_cvtepi8_epi16(vec); -#elif defined(ARCH_X64) - return _mm_srai_epi16(_mm_unpacklo_epi8(vec, vec), 8); -#elif defined(ARCH_ARM64) - return int16x8_t(vmovl_s8(vget_low_s8(vec))); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_extend_lo_s8(A&& a) -{ -#if defined(ARCH_X64) - using enum asmjit::x86::Inst::Id; - if (utils::has_sse41()) - return asmjit::unary_op(kIdNone, kIdPmovsxbw, std::forward(a)); - return asmjit::unary_op(kIdPsraw, kIdVpsraw, asmjit::unary_op(kIdNone, kIdPunpcklbw, std::forward(a)), 8); -#endif -} - -inline v128 gv_extend_hi_s8(const v128& vec) -{ -#if defined(__SSE4_1__) - return _mm_cvtepi8_epi16(_mm_loadu_si64(vec._bytes + 8)); -#elif defined(ARCH_X64) - return _mm_srai_epi16(_mm_unpackhi_epi8(vec, vec), 8); -#elif defined(ARCH_ARM64) - return int16x8_t(vmovl_s8(vget_high_s8(vec))); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_extend_hi_s8(A&& a) -{ -#if defined(ARCH_X64) - using enum asmjit::x86::Inst::Id; - return asmjit::unary_op(kIdPsraw, kIdVpsraw, asmjit::unary_op(kIdNone, 
kIdPunpckhbw, std::forward(a)), 8); -#endif -} - -inline v128 gv_unpacklo16(const v128& lows, const v128& highs) -{ -#if defined(ARCH_X64) - return _mm_unpacklo_epi16(lows, highs); -#elif defined(ARCH_ARM64) - return vzip1q_s16(lows, highs); -#endif -} - -inline v128 gv_extend_lo_s16(const v128& vec) -{ -#if defined(__SSE4_1__) - return _mm_cvtepi16_epi32(vec); -#elif defined(ARCH_X64) - return _mm_srai_epi32(_mm_unpacklo_epi16(vec, vec), 16); -#elif defined(ARCH_ARM64) - return int32x4_t(vmovl_s16(vget_low_s16(vec))); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_extend_lo_s16(A&& a) -{ -#if defined(ARCH_X64) - using enum asmjit::x86::Inst::Id; - if (utils::has_sse41()) - return asmjit::unary_op(kIdNone, kIdPmovsxwd, std::forward(a)); - return asmjit::unary_op(kIdPsrad, kIdVpsrad, asmjit::unary_op(kIdNone, kIdPunpcklwd, std::forward(a)), 16); -#endif -} - -inline v128 gv_extend_hi_s16(const v128& vec) -{ -#if defined(__SSE4_1__) - return _mm_cvtepi16_epi32(_mm_loadu_si64(vec._bytes + 8)); -#elif defined(ARCH_X64) - return _mm_srai_epi32(_mm_unpackhi_epi16(vec, vec), 16); -#elif defined(ARCH_ARM64) - return int32x4_t(vmovl_s16(vget_high_s16(vec))); -#endif -} - -template - requires(asmjit::any_operand_v) -inline auto gv_extend_hi_s16(A&& a) -{ -#if defined(ARCH_X64) - using enum asmjit::x86::Inst::Id; - return asmjit::unary_op(kIdPsrad, kIdVpsrad, asmjit::unary_op(kIdNone, kIdPunpckhwd, std::forward(a)), 16); -#endif -} - -inline v128 gv_unpacklo32(const v128& lows, const v128& highs) -{ -#if defined(ARCH_X64) - return _mm_unpacklo_epi32(lows, highs); -#elif defined(ARCH_ARM64) - return vzip1q_s32(lows, highs); -#endif -} - -inline v128 gv_unpackhi8(const v128& lows, const v128& highs) -{ -#if defined(ARCH_X64) - return _mm_unpackhi_epi8(lows, highs); -#elif defined(ARCH_ARM64) - return vzip2q_s8(lows, highs); -#endif -} - -inline v128 gv_unpackhi16(const v128& lows, const v128& highs) -{ -#if defined(ARCH_X64) - return _mm_unpackhi_epi16(lows, highs); -#elif defined(ARCH_ARM64) - return vzip2q_s16(lows, highs); -#endif -} - -inline v128 gv_unpackhi32(const v128& lows, const v128& highs) -{ -#if defined(ARCH_X64) - return _mm_unpackhi_epi32(lows, highs); -#elif defined(ARCH_ARM64) - return vzip2q_s32(lows, highs); -#endif -} - -inline bool v128::operator==(const v128& b) const -{ -#if defined(ARCH_X64) - return gv_testz(_mm_xor_si128(*this, b)); -#else - return gv_testz(*this ^ b); -#endif -} - -inline v128 v128::operator|(const v128& rhs) const -{ -#if defined(ARCH_X64) - return _mm_or_si128(*this, rhs); -#elif defined(ARCH_ARM64) - return vorrq_s32(*this, rhs); -#endif -} - -inline v128 v128::operator&(const v128& rhs) const -{ -#if defined(ARCH_X64) - return _mm_and_si128(*this, rhs); -#elif defined(ARCH_ARM64) - return vandq_s32(*this, rhs); -#endif -} - -inline v128 v128::operator^(const v128& rhs) const -{ -#if defined(ARCH_X64) - return _mm_xor_si128(*this, rhs); -#elif defined(ARCH_ARM64) - return veorq_s32(*this, rhs); -#endif -} - -inline v128 v128::operator~() const -{ -#if defined(ARCH_X64) - return _mm_xor_si128(*this, _mm_set1_epi32(-1)); -#elif defined(ARCH_ARM64) - return vmvnq_u32(*this); -#endif -} - -inline v128 gv_exp2_approxfs(const v128& a) -{ - // TODO -#if 0 - const auto x0 = _mm_max_ps(_mm_min_ps(a, _mm_set1_ps(127.4999961f)), _mm_set1_ps(-127.4999961f)); - const auto x1 = _mm_add_ps(x0, _mm_set1_ps(0.5f)); - const auto x2 = _mm_sub_epi32(_mm_cvtps_epi32(x1), _mm_and_si128(_mm_castps_si128(_mm_cmpnlt_ps(_mm_setzero_ps(), x1)), 
_mm_set1_epi32(1))); - const auto x3 = _mm_sub_ps(x0, _mm_cvtepi32_ps(x2)); - const auto x4 = _mm_mul_ps(x3, x3); - const auto x5 = _mm_mul_ps(x3, _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(x4, _mm_set1_ps(0.023093347705f)), _mm_set1_ps(20.20206567f)), x4), _mm_set1_ps(1513.906801f))); - const auto x6 = _mm_mul_ps(x5, _mm_rcp_ps(_mm_sub_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(233.1842117f), x4), _mm_set1_ps(4368.211667f)), x5))); - return _mm_mul_ps(_mm_add_ps(_mm_add_ps(x6, x6), _mm_set1_ps(1.0f)), _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(x2, _mm_set1_epi32(127)), 23))); -#else - v128 r; - for (u32 i = 0; i < 4; i++) - r._f[i] = std::exp2f(a._f[i]); - return r; -#endif -} - -inline v128 gv_log2_approxfs(const v128& a) -{ - // TODO -#if 0 - const auto _1 = _mm_set1_ps(1.0f); - const auto _c = _mm_set1_ps(1.442695040f); - const auto x0 = _mm_max_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x00800000))); - const auto x1 = _mm_or_ps(_mm_and_ps(x0, _mm_castsi128_ps(_mm_set1_epi32(0x807fffff))), _1); - const auto x2 = _mm_rcp_ps(_mm_add_ps(x1, _1)); - const auto x3 = _mm_mul_ps(_mm_sub_ps(x1, _1), x2); - const auto x4 = _mm_add_ps(x3, x3); - const auto x5 = _mm_mul_ps(x4, x4); - const auto x6 = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(-0.7895802789f), x5), _mm_set1_ps(16.38666457f)), x5), _mm_set1_ps(-64.1409953f)); - const auto x7 = _mm_rcp_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(-35.67227983f), x5), _mm_set1_ps(312.0937664f)), x5), _mm_set1_ps(-769.6919436f))); - const auto x8 = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_castps_si128(x0), 23), _mm_set1_epi32(127))); - return _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_mul_ps(_mm_mul_ps(x5, x6), x7), x4), _c), _mm_add_ps(_mm_mul_ps(x4, _c), x8)); -#else - v128 r; - for (u32 i = 0; i < 4; i++) - r._f[i] = std::log2f(a._f[i]); - return r; -#endif -} - -// For each 8-bit element, r = a << (b & 7) -inline v128 gv_shl8(const v128& a, const v128& b) -{ + // For each 8-bit element, r = (a << (c & 7)) | (b >> (~c & 7) >> 1) + template + inline auto gv_fshl8(A&& a, B&& b, C&& c) + { #if defined(ARCH_ARM64) - return vshlq_u8(a, vandq_s8(b, gv_bcst8(7))); + const auto amt1 = vandq_s8(c, gv_bcst8(7)); + const auto amt2 = vsubq_s8(amt1, gv_bcst8(8)); + return v128(vorrq_u8(vshlq_u8(a, amt1), vshlq_u8(b, amt2))); #else - const v128 x1 = gv_add8(a, a); // shift left by 1 - const v128 r1 = gv_signselect8(gv_shl64(b, 7), x1, a); - const v128 x2 = gv_and32(gv_shl64(r1, 2), gv_bcst8(0xfc)); // shift by 2 - const v128 r2 = gv_signselect8(gv_shl64(b, 6), x2, r1); - const v128 x3 = gv_and32(gv_shl64(r2, 4), gv_bcst8(0xf0)); // shift by 4 - return gv_signselect8(gv_shl64(b, 5), x3, r2); + auto x1 = gv_sub8(gv_add8(a, a), gv_gts8(gv_bcst8(0), b)); + auto s1 = gv_shl64(c, 7); + auto r1 = gv_signselect8(s1, std::move(x1), std::forward(a)); + auto b1 = gv_signselect8(std::move(s1), gv_shl64(b, 1), std::forward(b)); + auto c2 = gv_bcst8(0x3); + auto x2 = gv_and32(gv_shr64(b1, 6), c2); + x2 = gv_or32(std::move(x2), gv_andn32(std::move(c2), gv_shl64(r1, 2))); + auto s2 = gv_shl64(c, 6); + auto r2 = gv_signselect8(s2, std::move(x2), std::move(r1)); + auto b2 = gv_signselect8(std::move(s2), gv_shl64(b1, 2), std::move(b1)); + auto c3 = gv_bcst8(0xf); + auto x3 = gv_and32(gv_shr64(std::move(b2), 4), c3); + x3 = gv_or32(std::move(x3), gv_andn32(std::move(c3), gv_shl64(r2, 4))); + return gv_signselect8(gv_shl64(std::move(c), 5), std::move(x3), + std::move(r2)); #endif -} + } -// For each 16-bit element, r = a << (b & 15) -inline v128 gv_shl16(const 
v128& a, const v128& b) -{ -#if defined(__AVX512VL__) && defined(__AVX512BW__) - return _mm_sllv_epi16(a, _mm_and_si128(b, _mm_set1_epi16(15))); -#elif defined(ARCH_ARM64) - return vshlq_u16(a, vandq_s16(b, gv_bcst8(15))); -#else - v128 r; - for (u32 i = 0; i < 8; i++) - r._u16[i] = a._u16[i] << (b._u16[i] & 15); - return r; -#endif -} - -// For each 32-bit element, r = a << (b & 31) -inline v128 gv_shl32(const v128& a, const v128& b) -{ -#if defined(__AVX2__) - return _mm_sllv_epi32(a, _mm_and_si128(b, _mm_set1_epi32(31))); -#elif defined(ARCH_ARM64) - return vshlq_u32(a, vandq_s32(b, gv_bcst8(31))); -#else - v128 r; - for (u32 i = 0; i < 4; i++) - r._u32[i] = a._u32[i] << (b._u32[i] & 31); - return r; -#endif -} - -// For each unsigned 8-bit element, r = a >> (b & 7) -inline v128 gv_shr8(const v128& a, const v128& b) -{ + // For each 8-bit element, r = (b >> (c & 7)) | (a << (~c & 7) << 1) + template + inline auto gv_fshr8(A&& a, B&& b, C&& c) + { #if defined(ARCH_ARM64) - return vshlq_u8(a, vnegq_s8(vandq_s8(b, gv_bcst8(7)))); + const auto amt1 = vandq_s8(c, gv_bcst8(7)); + const auto amt2 = vsubq_s8(gv_bcst8(8), amt1); + return vorrq_u8(vshlq_u8(b, vnegq_s8(amt1)), vshlq_u8(a, amt2)); #else - const v128 x1 = gv_and32(gv_shr64(a, 1), gv_bcst8(0x7f)); // shift right by 1 - const v128 r1 = gv_signselect8(gv_shl64(b, 7), x1, a); - const v128 x2 = gv_and32(gv_shr64(r1, 2), gv_bcst8(0x3f)); // shift by 2 - const v128 r2 = gv_signselect8(gv_shl64(b, 6), x2, r1); - const v128 x3 = gv_and32(gv_shr64(r2, 4), gv_bcst8(0x0f)); // shift by 4 - return gv_signselect8(gv_shl64(b, 5), x3, r2); + auto c1 = gv_bcst8(0x7f); + auto x1 = gv_and32(gv_shr64(b, 1), c1); + x1 = gv_or32(std::move(x1), gv_andn32(std::move(c1), gv_shl64(a, 7))); + auto s1 = gv_shl64(c, 7); + auto r1 = gv_signselect8(s1, std::move(x1), std::move(b)); + auto a1 = gv_signselect8(std::move(s1), gv_shr64(a, 1), std::move(a)); + auto c2 = gv_bcst8(0x3f); + auto x2 = gv_and32(gv_shr64(r1, 2), c2); + x2 = gv_or32(std::move(x2), gv_andn32(std::move(c2), gv_shl64(a1, 6))); + auto s2 = gv_shl64(c, 6); + auto r2 = gv_signselect8(s2, std::move(x2), std::move(r1)); + auto a2 = gv_signselect8(std::move(s2), gv_shr64(a1, 2), std::move(a1)); + auto c3 = gv_bcst8(0x0f); + auto x3 = gv_and32(gv_shr64(r2, 4), c3); + x3 = gv_or32(std::move(x3), + gv_andn32(std::move(c3), gv_shl64(std::move(a2), 4))); + return gv_signselect8(gv_shl64(std::move(c), 5), std::move(x3), + std::move(r2)); #endif -} - -// For each unsigned 16-bit element, r = a >> (b & 15) -inline v128 gv_shr16(const v128& a, const v128& b) -{ -#if defined(__AVX512VL__) && defined(__AVX512BW__) - return _mm_srlv_epi16(a, _mm_and_si128(b, _mm_set1_epi16(15))); -#elif defined(ARCH_ARM64) - return vshlq_u16(a, vnegq_s16(vandq_s16(b, gv_bcst8(15)))); -#else - v128 r; - for (u32 i = 0; i < 8; i++) - r._u16[i] = a._u16[i] >> (b._u16[i] & 15); - return r; -#endif -} - -// For each unsigned 32-bit element, r = a >> (b & 31) -inline v128 gv_shr32(const v128& a, const v128& b) -{ -#if defined(__AVX2__) - return _mm_srlv_epi32(a, _mm_and_si128(b, _mm_set1_epi32(31))); -#elif defined(ARCH_ARM64) - return vshlq_u32(a, vnegq_s32(vandq_s32(b, gv_bcst8(31)))); -#else - v128 r; - for (u32 i = 0; i < 4; i++) - r._u32[i] = a._u32[i] >> (b._u32[i] & 31); - return r; -#endif -} - -// For each signed 8-bit element, r = a >> (b & 7) -inline v128 gv_sar8(const v128& a, const v128& b) -{ -#if defined(ARCH_ARM64) - return vshlq_s8(a, vnegq_s8(vandq_s8(b, gv_bcst8(7)))); -#else - v128 r; - for (u32 i = 0; i < 16; i++) 
-		r._s8[i] = a._s8[i] >> (b._s8[i] & 7);
-	return r;
-#endif
-}
-
-// For each signed 16-bit element, r = a >> (b & 15)
-inline v128 gv_sar16(const v128& a, const v128& b)
-{
-#if defined(__AVX512VL__) && defined(__AVX512BW__)
-	return _mm_srav_epi16(a, _mm_and_si128(b, _mm_set1_epi16(15)));
-#elif defined(ARCH_ARM64)
-	return vshlq_s16(a, vnegq_s16(vandq_s16(b, gv_bcst8(15))));
-#else
-	v128 r;
-	for (u32 i = 0; i < 8; i++)
-		r._s16[i] = a._s16[i] >> (b._s16[i] & 15);
-	return r;
-#endif
-}
-
-// For each signed 32-bit element, r = a >> (b & 31)
-inline v128 gv_sar32(const v128& a, const v128& b)
-{
-#if defined(__AVX2__)
-	return _mm_srav_epi32(a, _mm_and_si128(b, _mm_set1_epi32(31)));
-#elif defined(ARCH_ARM64)
-	return vshlq_s32(a, vnegq_s32(vandq_s32(b, gv_bcst8(31))));
-#else
-	v128 r;
-	for (u32 i = 0; i < 4; i++)
-		r._s32[i] = a._s32[i] >> (b._s32[i] & 31);
-	return r;
-#endif
-}
-
-// For each 8-bit element, r = rotate a by b
-inline v128 gv_rol8(const v128& a, const v128& b)
-{
-#if defined(ARCH_ARM64)
-	const auto amt1 = vandq_s8(b, gv_bcst8(7));
-	const auto amt2 = vsubq_s8(amt1, gv_bcst8(8));
-	return vorrq_u8(vshlq_u8(a, amt1), vshlq_u8(a, amt2));
-#else
-	const v128 x1 = gv_sub8(gv_add8(a, a), gv_gts8(gv_bcst8(0), a)); // rotate left by 1
-	const v128 r1 = gv_signselect8(gv_shl64(b, 7), x1, a);
-	const v128 c2 = gv_bcst8(0x3);
-	const v128 x2 = gv_or32(gv_and32(gv_shr64(r1, 6), c2), gv_andn32(c2, gv_shl64(r1, 2))); // rotate by 2
-	const v128 r2 = gv_signselect8(gv_shl64(b, 6), x2, r1);
-	const v128 c3 = gv_bcst8(0xf);
-	const v128 x3 = gv_or32(gv_and32(gv_shr64(r2, 4), c3), gv_andn32(c3, gv_shl64(r2, 4))); // rotate by 4
-	return gv_signselect8(gv_shl64(b, 5), x3, r2);
-#endif
-}
-
-// For each 16-bit element, r = rotate a by b
-inline v128 gv_rol16(const v128& a, const v128& b)
-{
-#if defined(ARCH_ARM64)
-	const auto amt1 = vandq_s16(b, gv_bcst16(15));
-	const auto amt2 = vsubq_s16(amt1, gv_bcst16(16));
-	return vorrq_u16(vshlq_u16(a, amt1), vshlq_u16(a, amt2));
-#else
-	v128 r;
-	for (u32 i = 0; i < 8; i++)
-		r._u16[i] = utils::rol16(a._u16[i], b._u16[i]);
-	return r;
-#endif
-}
-
-// For each 16-bit element, r = rotate a by count
-template <u32 Count>
-inline v128 gv_rol16(const v128& a)
-{
-	constexpr u8 count = Count & 0xf;
-#if defined(ARCH_X64)
-	return _mm_or_si128(_mm_srli_epi16(a, 16 - count), _mm_slli_epi16(a, count));
-#elif defined(ARCH_ARM64)
-	return vorrq_u16(vshrq_n_u16(a, 16 - count), vshlq_n_u16(a, count));
-#else
-	v128 r;
-	for (u32 i = 0; i < 8; i++)
-		r._u16[i] = std::rotl(a._u16[i], count);
-	return r;
-#endif
-}
-
-// For each 32-bit element, r = rotate a by b
-inline v128 gv_rol32(const v128& a, const v128& b)
-{
-#if defined(__AVX512VL__)
-	return _mm_rolv_epi32(a, b);
-#elif defined(ARCH_ARM64)
-	const auto amt1 = vandq_s32(b, gv_bcst32(31));
-	const auto amt2 = vsubq_s32(amt1, gv_bcst32(32));
-	return vorrq_u32(vshlq_u32(a, amt1), vshlq_u32(a, amt2));
-#else
-	v128 r;
-	for (u32 i = 0; i < 4; i++)
-		r._u32[i] = utils::rol32(a._u32[i], b._u32[i]);
-	return r;
-#endif
-}
-
-// For each 32-bit element, r = rotate a by count
-template <u32 Count>
-inline v128 gv_rol32(const v128& a)
-{
-	constexpr u8 count = Count & 0x1f;
-#if defined(__AVX512VL__)
-	return _mm_rol_epi32(a, count);
-#elif defined(ARCH_X64)
-	return _mm_or_si128(_mm_srli_epi32(a, 32 - count), _mm_slli_epi32(a, count));
-#elif defined(ARCH_ARM64)
-	return vorrq_u32(vshrq_n_u32(a, 32 - count), vshlq_n_u32(a, count));
-#else
-	v128 r;
-	for (u32 i = 0; i < 4; i++)
-		r._u32[i] = utils::rol32(a._u32[i], count);
-	return r;
-#endif
-}
-
-// For each 8-bit element, r = (a << (c & 7)) | (b >> (~c & 7) >> 1)
-template <typename A, typename B, typename C>
-inline auto gv_fshl8(A&& a, B&& b, C&& c)
-{
-#if defined(ARCH_ARM64)
-	const auto amt1 = vandq_s8(c, gv_bcst8(7));
-	const auto amt2 = vsubq_s8(amt1, gv_bcst8(8));
-	return v128(vorrq_u8(vshlq_u8(a, amt1), vshlq_u8(b, amt2)));
-#else
-	auto x1 = gv_sub8(gv_add8(a, a), gv_gts8(gv_bcst8(0), b));
-	auto s1 = gv_shl64(c, 7);
-	auto r1 = gv_signselect8(s1, std::move(x1), std::forward<A>(a));
-	auto b1 = gv_signselect8(std::move(s1), gv_shl64(b, 1), std::forward<B>(b));
-	auto c2 = gv_bcst8(0x3);
-	auto x2 = gv_and32(gv_shr64(b1, 6), c2);
-	x2 = gv_or32(std::move(x2), gv_andn32(std::move(c2), gv_shl64(r1, 2)));
-	auto s2 = gv_shl64(c, 6);
-	auto r2 = gv_signselect8(s2, std::move(x2), std::move(r1));
-	auto b2 = gv_signselect8(std::move(s2), gv_shl64(b1, 2), std::move(b1));
-	auto c3 = gv_bcst8(0xf);
-	auto x3 = gv_and32(gv_shr64(std::move(b2), 4), c3);
-	x3 = gv_or32(std::move(x3), gv_andn32(std::move(c3), gv_shl64(r2, 4)));
-	return gv_signselect8(gv_shl64(std::move(c), 5), std::move(x3), std::move(r2));
-#endif
-}
-
-// For each 8-bit element, r = (b >> (c & 7)) | (a << (~c & 7) << 1)
-template <typename A, typename B, typename C>
-inline auto gv_fshr8(A&& a, B&& b, C&& c)
-{
-#if defined(ARCH_ARM64)
-	const auto amt1 = vandq_s8(c, gv_bcst8(7));
-	const auto amt2 = vsubq_s8(gv_bcst8(8), amt1);
-	return vorrq_u8(vshlq_u8(b, vnegq_s8(amt1)), vshlq_u8(a, amt2));
-#else
-	auto c1 = gv_bcst8(0x7f);
-	auto x1 = gv_and32(gv_shr64(b, 1), c1);
-	x1 = gv_or32(std::move(x1), gv_andn32(std::move(c1), gv_shl64(a, 7)));
-	auto s1 = gv_shl64(c, 7);
-	auto r1 = gv_signselect8(s1, std::move(x1), std::move(b));
-	auto a1 = gv_signselect8(std::move(s1), gv_shr64(a, 1), std::move(a));
-	auto c2 = gv_bcst8(0x3f);
-	auto x2 = gv_and32(gv_shr64(r1, 2), c2);
-	x2 = gv_or32(std::move(x2), gv_andn32(std::move(c2), gv_shl64(a1, 6)));
-	auto s2 = gv_shl64(c, 6);
-	auto r2 = gv_signselect8(s2, std::move(x2), std::move(r1));
-	auto a2 = gv_signselect8(std::move(s2), gv_shr64(a1, 2), std::move(a1));
-	auto c3 = gv_bcst8(0x0f);
-	auto x3 = gv_and32(gv_shr64(r2, 4), c3);
-	x3 = gv_or32(std::move(x3), gv_andn32(std::move(c3), gv_shl64(std::move(a2), 4)));
-	return gv_signselect8(gv_shl64(std::move(c), 5), std::move(x3), std::move(r2));
-#endif
-}
-
-// Shift left by byte amount
-template <u32 Count>
-inline v128 gv_shuffle_left(const v128& a)
-{
-	if (Count > 15)
-		return {};
-#if defined(ARCH_X64)
-	return _mm_slli_si128(a, Count);
-#elif defined(ARCH_ARM64)
-	v128 idx;
-	for (u32 i = 0; i < 16; i++)
-		idx._u8[i] = u8(i - Count);
-	return vqtbl1q_u8(a, idx);
-#endif
-}
-
-template <u32 Count, typename A>
-	requires(asmjit::any_operand_v<A>)
-inline auto gv_shuffle_left(A&& a)
-{
-	FOR_X64(unary_op, kIdPslldq, kIdVpslldq, std::forward<A>(a), Count);
-}
-
-// Shift right by byte amount
-template <u32 Count>
-inline v128 gv_shuffle_right(const v128& a)
-{
-	if (Count > 15)
-		return {};
-#if defined(ARCH_X64)
-	return _mm_srli_si128(a, Count);
-#elif defined(ARCH_ARM64)
-	v128 idx;
-	for (u32 i = 0; i < 16; i++)
-		idx._u8[i] = u8(i + Count);
-	return vqtbl1q_u8(a, idx);
-#endif
-}
-
-template <u32 Count, typename A>
-	requires(asmjit::any_operand_v<A>)
-inline auto gv_shuffle_right(A&& a)
-{
-	FOR_X64(unary_op, kIdPsrldq, kIdVpsrldq, std::forward<A>(a), Count);
-}
-
-// Load 32-bit integer into the first element of a new vector, set other elements to zero
-inline v128 gv_loadu32(const void* ptr)
-{
-#if defined(ARCH_X64)
-	return _mm_loadu_si32(ptr);
-#elif defined(ARCH_ARM64)
-	return vld1q_lane_u32(static_cast<const u32*>(ptr), vdupq_n_u32(0), 0);
-#endif
-}
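// For reference, the per-lane semantics of the two funnel shifts above can be
// modelled with plain scalar code. This is a minimal illustrative sketch only:
// the function names and the driver below are not part of the emulator. The
// vector helpers apply the same formula to all 16 byte lanes of a v128 at
// once; in the generic (#else) path they compose the per-lane shift out of
// 1-, 2- and 4-bit steps selected by gv_signselect8 on bits of the shift count.

#include <cstdint>
#include <cstdio>

// One 8-bit lane of gv_fshl8: r = (a << (c & 7)) | (b >> (~c & 7) >> 1).
// The right shift is split as (~c & 7) then 1 so the amount never reaches 8.
static std::uint8_t fshl8_lane(std::uint8_t a, std::uint8_t b, std::uint8_t c)
{
	return static_cast<std::uint8_t>((a << (c & 7)) | (b >> (~c & 7) >> 1));
}

// One 8-bit lane of gv_fshr8: r = (b >> (c & 7)) | (a << (~c & 7) << 1).
static std::uint8_t fshr8_lane(std::uint8_t a, std::uint8_t b, std::uint8_t c)
{
	return static_cast<std::uint8_t>((b >> (c & 7)) | (a << (~c & 7) << 1));
}

int main()
{
	// Treat a:b as the 16-bit value 0xABCD. Funnel-left by 3 keeps the high
	// byte of (0xABCD << 3) & 0xffff = 0x5e68, and funnel-right by 3 keeps
	// the low byte of 0xABCD >> 3 = 0x1579, so this prints "5e 79".
	std::printf("%02x %02x\n", fshl8_lane(0xAB, 0xCD, 3), fshr8_lane(0xAB, 0xCD, 3));
}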
- -// Load 16-bit integer into an existing vector at the position specified by Index -template -inline v128 gv_insert16(const v128& vec, u16 value) -{ -#if defined(ARCH_X64) - return _mm_insert_epi16(vec, value, Index); -#elif defined(ARCH_ARM64) - return vsetq_lane_u16(value, vec, Index & 0x7); -#endif -} - -// For each 8-bit element, -// if ctrl >= 0 && ctrl < 16 then r = vec[ctrl], -// else if ctrl < 0 then r = 0 -inline v128 gv_shuffle8(const v128& vec, const v128& ctrl) -{ - AUDIT(std::ranges::none_of(ctrl._chars, [](s8 i) - { - return i >= static_cast(sizeof(v128)); - }), - "All indices must be in the range [0, 15] or negative, since PSHUFB and TBL behave differently otherwise"); -#if defined(__SSSE3__) - return _mm_shuffle_epi8(vec, ctrl); -#elif defined(ARCH_ARM64) - return vqtbl1q_s8(vec, ctrl); -#else - v128 r; - for (s32 i = 0; i < 16; i++) - r._s8[i] = ctrl._s8[i] < 0 ? 0 : vec._s8[ctrl._s8[i] & 0xf]; - return r; -#endif -} - -// For each 2-bit index in Control, r = vec[index] -template -inline v128 gv_shuffle32(const v128& vec) -{ -#if defined(ARCH_X64) - return _mm_shuffle_epi32(vec, Control); -#elif defined(ARCH_ARM64) - constexpr u8 idx0 = (Control & 3) * sizeof(s32); - constexpr u8 idx1 = (Control >> 2 & 3) * sizeof(s32); - constexpr u8 idx2 = (Control >> 4 & 3) * sizeof(s32); - constexpr u8 idx3 = (Control >> 6 & 3) * sizeof(s32); - - constexpr uint8x16_t idx_vec = {idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3}; - - return vqtbl1q_s8(vec, idx_vec); -#endif -} - -// For each index, r = vec[index & 3] -template -inline v128 gv_shuffle32(const v128& vec) -{ -#if defined(ARCH_X64) - return _mm_shuffle_epi32(vec, (Index0 & 3) | (Index1 & 3) << 2 | (Index2 & 3) << 4 | (Index3 & 3) << 6); -#elif defined(ARCH_ARM64) - constexpr u8 idx0 = (Index0 & 3) * sizeof(s32); - constexpr u8 idx1 = (Index1 & 3) * sizeof(s32); - constexpr u8 idx2 = (Index2 & 3) * sizeof(s32); - constexpr u8 idx3 = (Index3 & 3) * sizeof(s32); - - constexpr uint8x16_t idx_vec = {idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3}; - - return vqtbl1q_s8(vec, idx_vec); -#endif -} - -// For the first two 2-bit indices in Control, r = a[index], -// for the last two indices, r = b[index] -template -inline v128 gv_shufflefs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_shuffle_ps(a, b, Control); -#elif defined(ARCH_ARM64) - constexpr u8 idx0 = (Control & 3) * sizeof(s32); - constexpr u8 idx1 = (Control >> 2 & 3) * sizeof(s32); - constexpr u8 idx2 = (Control >> 4 & 3) * sizeof(s32) + sizeof(v128); - constexpr u8 idx3 = (Control >> 6 & 3) * sizeof(s32) + sizeof(v128); - - constexpr uint8x16_t idx_vec = {idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3}; - - return vqtbl2q_s8({a, b}, idx_vec); -#endif -} - -// For the first two indices, r = a[index & 3], -// for the last two indices, r = b[index & 3] -template -inline v128 gv_shufflefs(const v128& a, const v128& b) -{ -#if defined(ARCH_X64) - return _mm_shuffle_ps(a, b, (Index0 & 3) | (Index1 & 3) << 2 | (Index2 & 3) << 4 | (Index3 & 3) << 6); -#elif defined(ARCH_ARM64) - constexpr u8 idx0 = (Index0 & 3) * sizeof(s32); - constexpr u8 idx1 = (Index1 & 3) * sizeof(s32); - constexpr u8 idx2 = (Index2 & 3) * sizeof(s32) + sizeof(v128); - constexpr u8 idx3 = (Index3 & 3) * 
sizeof(s32) + sizeof(v128); - - constexpr uint8x16_t idx_vec = {idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3}; - - return vqtbl2q_s8({a, b}, idx_vec); -#endif -} - -// For each 32-bit element, reverse byte order -inline v128 gv_rev32(const v128& vec) -{ -#if defined(__SSSE3__) - return _mm_shuffle_epi8(vec, _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12)); -#elif defined(ARCH_ARM64) - return vrev32q_u8(vec); -#else - return gv_rol32<16>(gv_rol16<8>(vec)); -#endif -} - -// For each 32-bit element, convert between big-endian and native-endian -inline v128 gv_to_be32(const v128& vec) -{ - if constexpr (std::endian::native == std::endian::little) - return gv_rev32(vec); - return vec; -} + } +} // namespace rx #if defined(__clang__) #pragma clang diagnostic pop diff --git a/rpcs3/util/to_endian.hpp b/rpcs3/util/to_endian.hpp index f4ec045d9..15480792b 100644 --- a/rpcs3/util/to_endian.hpp +++ b/rpcs3/util/to_endian.hpp @@ -3,7 +3,11 @@ #include "util/types.hpp" #include "util/endian.hpp" -union v128; +namespace rx +{ + union v128; +} +using rx::v128; // Type converter: converts native endianness arithmetic/enum types to appropriate se_t<> type template diff --git a/rpcs3/util/types.hpp b/rpcs3/util/types.hpp index 7462d1a93..15b1a8422 100644 --- a/rpcs3/util/types.hpp +++ b/rpcs3/util/types.hpp @@ -1,195 +1,5 @@ -#pragma once // No BOM and only basic ASCII in this header, or a neko will die - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(__SSE2__) || defined(_M_X64) || defined(_M_AMD64) || defined(__x86_64__) || defined(__amd64__) -#define ARCH_X64 1 -#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) -#define ARCH_ARM64 1 -// v8.4a+ gives us atomic 16 byte ld/st -// See Arm C Language Extensions Documentation -// Currently there is no feature macro for LSE2 specifically so we define it ourself -// Unfortunately the __ARM_ARCH integer macro isn't universally defined so we use this hack instead -#if defined(__ARM_ARCH_8_4__) || defined(__ARM_ARCH_8_5__) || defined(__ARM_ARCH_8_6__) || defined(__ARM_ARCH_9__) -#define ARM_FEATURE_LSE2 1 -#endif -#endif - -using std::chrono::steady_clock; - -using namespace std::literals; - -#ifndef __has_builtin -#define __has_builtin(x) 0 -#endif - -#ifdef _MSC_VER -#define SAFE_BUFFERS(...) __declspec(safebuffers) __VA_ARGS__ -#define NEVER_INLINE __declspec(noinline) -#define FORCE_INLINE __forceinline -#else // not _MSC_VER -#ifdef __clang__ -#define SAFE_BUFFERS(...) __attribute__((no_stack_protector)) __VA_ARGS__ -#else -#define SAFE_BUFFERS(...) __VA_ARGS__ __attribute__((__optimize__("no-stack-protector"))) -#endif -#define NEVER_INLINE __attribute__((noinline)) inline -#define FORCE_INLINE __attribute__((always_inline)) inline -#endif // _MSC_VER - -#define CHECK_SIZE(type, size) static_assert(sizeof(type) == size, "Invalid " #type " type size") -#define CHECK_ALIGN(type, align) static_assert(alignof(type) == align, "Invalid " #type " type alignment") -#define CHECK_MAX_SIZE(type, size) static_assert(sizeof(type) <= size, #type " type size is too big") -#define CHECK_SIZE_ALIGN(type, size, align) \ - CHECK_SIZE(type, size); \ - CHECK_ALIGN(type, align) - -#define DECLARE(...) decltype(__VA_ARGS__) __VA_ARGS__ - -#define STR_CASE(...) 
\ - case __VA_ARGS__: return #__VA_ARGS__ - -#if defined(_DEBUG) || defined(_AUDIT) -#define AUDIT(...) (static_cast(ensure(__VA_ARGS__))) -#else -#define AUDIT(...) (static_cast>(0)) -#endif - -namespace utils -{ - template - struct fn_helper - { - F f; - - fn_helper(F&& f) - : f(std::forward(f)) - { - } - - template - auto operator()(Args&&... args) const - { - if constexpr (sizeof...(Args) == 0) - return f(0, 0, 0, 0); - else if constexpr (sizeof...(Args) == 1) - return f(std::forward(args)..., 0, 0, 0); - else if constexpr (sizeof...(Args) == 2) - return f(std::forward(args)..., 0, 0); - else if constexpr (sizeof...(Args) == 3) - return f(std::forward(args)..., 0); - else if constexpr (sizeof...(Args) == 4) - return f(std::forward(args)...); - else - static_assert(sizeof...(Args) <= 4); - } - }; - - template - fn_helper(F&& f) -> fn_helper; -} // namespace utils - -// Shorter lambda. -#define FN(...) \ - ::utils::fn_helper([&]( \ - [[maybe_unused]] auto&& x, \ - [[maybe_unused]] auto&& y, \ - [[maybe_unused]] auto&& z, \ - [[maybe_unused]] auto&& w) { \ - return (__VA_ARGS__); \ - }) - -#if __cpp_lib_bit_cast < 201806L -namespace std -{ - template - [[nodiscard]] constexpr To bit_cast(const From& from) noexcept - { - return __builtin_bit_cast(To, from); - } -} // namespace std -#endif - -#if defined(__INTELLISENSE__) || (defined(__clang__) && (__clang_major__ <= 16)) -#define consteval constexpr -#define constinit -#endif - -using schar = signed char; -using uchar = unsigned char; -using ushort = unsigned short; -using uint = unsigned int; -using ulong = unsigned long; -using ullong = unsigned long long; -using llong = long long; - -using uptr = std::uintptr_t; - -using u8 = std::uint8_t; -using u16 = std::uint16_t; -using u32 = std::uint32_t; -using u64 = std::uint64_t; -using usz = std::size_t; - -using s8 = std::int8_t; -using s16 = std::int16_t; -using s32 = std::int32_t; -using s64 = std::int64_t; -using ssz = std::make_signed_t; - -// Get integral type from type size -template -struct get_int_impl -{ -}; - -template <> -struct get_int_impl -{ - using utype = u8; -}; - -template <> -struct get_int_impl -{ - using utype = u16; -}; - -template <> -struct get_int_impl -{ - using utype = u32; -}; - -template <> -struct get_int_impl -{ - using utype = u64; -}; - -template -using get_uint_t = typename get_int_impl::utype; - -template -std::remove_cvref_t as_rvalue(T&& obj) -{ - return std::forward(obj); -} - -template -class atomic_t; +#pragma once +#include namespace stx { @@ -203,1148 +13,6 @@ namespace stx struct generator; } // namespace stx -using stx::se_t; - -// se_t<> with native endianness -template -using nse_t = se_t; - -template -using be_t = se_t; -template -using le_t = se_t; - -template -using atomic_be_t = atomic_t, Align>; -template -using atomic_le_t = atomic_t, Align>; - -// Bool type equivalent -class b8 -{ - u8 m_value; - -public: - b8() = default; - - using enable_bitcopy = std::true_type; - - constexpr b8(bool value) noexcept - : m_value(value) - { - } - - constexpr operator bool() const noexcept - { - return m_value != 0; - } - - constexpr bool set(bool value) noexcept - { - m_value = value; - return value; - } -}; - -#if defined(ARCH_X64) && !defined(_MSC_VER) -using __m128i = long long __attribute__((vector_size(16))); -using __m128d = double __attribute__((vector_size(16))); -using __m128 = float __attribute__((vector_size(16))); -#endif - -#ifndef _MSC_VER -using u128 = __uint128_t; -using s128 = __int128_t; -#else - -extern "C" -{ - union __m128; - union 
__m128i; - struct __m128d; - - uchar _addcarry_u64(uchar, u64, u64, u64*); - uchar _subborrow_u64(uchar, u64, u64, u64*); - u64 __shiftleft128(u64, u64, uchar); - u64 __shiftright128(u64, u64, uchar); - u64 _umul128(u64, u64, u64*); -} - -// Unsigned 128-bit integer implementation (TODO) -struct alignas(16) u128 -{ - u64 lo, hi; - - u128() noexcept = default; - - template - requires std::is_unsigned_v - constexpr u128(T arg) noexcept - : lo(arg), hi(0) - { - } - - template - requires std::is_signed_v - constexpr u128(T arg) noexcept - : lo(s64{arg}), hi(s64{arg} >> 63) - { - } - - constexpr explicit operator bool() const noexcept - { - return !!(lo | hi); - } - - constexpr explicit operator u64() const noexcept - { - return lo; - } - - constexpr explicit operator s64() const noexcept - { - return lo; - } - - constexpr friend u128 operator+(const u128& l, const u128& r) - { - u128 value = l; - value += r; - return value; - } - - constexpr friend u128 operator-(const u128& l, const u128& r) - { - u128 value = l; - value -= r; - return value; - } - - constexpr friend u128 operator*(const u128& l, const u128& r) - { - u128 value = l; - value *= r; - return value; - } - - constexpr u128 operator+() const - { - return *this; - } - - constexpr u128 operator-() const - { - u128 value{}; - value -= *this; - return value; - } - - constexpr u128& operator++() - { - *this += 1; - return *this; - } - - constexpr u128 operator++(int) - { - u128 value = *this; - *this += 1; - return value; - } - - constexpr u128& operator--() - { - *this -= 1; - return *this; - } - - constexpr u128 operator--(int) - { - u128 value = *this; - *this -= 1; - return value; - } - - constexpr u128 operator<<(u128 shift_value) const - { - u128 value = *this; - value <<= shift_value; - return value; - } - - constexpr u128 operator>>(u128 shift_value) const - { - u128 value = *this; - value >>= shift_value; - return value; - } - - constexpr u128 operator~() const - { - u128 value{}; - value.lo = ~lo; - value.hi = ~hi; - return value; - } - - constexpr friend u128 operator&(const u128& l, const u128& r) - { - u128 value{}; - value.lo = l.lo & r.lo; - value.hi = l.hi & r.hi; - return value; - } - - constexpr friend u128 operator|(const u128& l, const u128& r) - { - u128 value{}; - value.lo = l.lo | r.lo; - value.hi = l.hi | r.hi; - return value; - } - - constexpr friend u128 operator^(const u128& l, const u128& r) - { - u128 value{}; - value.lo = l.lo ^ r.lo; - value.hi = l.hi ^ r.hi; - return value; - } - - constexpr u128& operator+=(const u128& r) - { - if (std::is_constant_evaluated()) - { - lo += r.lo; - hi += r.hi + (lo < r.lo); - } - else - { - _addcarry_u64(_addcarry_u64(0, r.lo, lo, &lo), r.hi, hi, &hi); - } - - return *this; - } - - constexpr u128& operator-=(const u128& r) - { - if (std::is_constant_evaluated()) - { - hi -= r.hi + (lo < r.lo); - lo -= r.lo; - } - else - { - _subborrow_u64(_subborrow_u64(0, lo, r.lo, &lo), hi, r.hi, &hi); - } - - return *this; - } - - constexpr u128& operator*=(const u128& r) - { - const u64 _hi = r.hi * lo + r.lo * hi; - - if (std::is_constant_evaluated()) - { - hi = (lo >> 32) * (r.lo >> 32) + (((lo >> 32) * (r.lo & 0xffffffff)) >> 32) + (((r.lo >> 32) * (lo & 0xffffffff)) >> 32); - lo = lo * r.lo; - } - else - { - lo = _umul128(lo, r.lo, &hi); - } - - hi += _hi; - return *this; - } - - constexpr u128& operator<<=(const u128& r) - { - if (std::is_constant_evaluated()) - { - if (r.hi == 0 && r.lo < 64) - { - hi = (hi << r.lo) | (lo >> (64 - r.lo)); - lo = (lo << r.lo); - return *this; - } 
- else if (r.hi == 0 && r.lo < 128) - { - hi = (lo << (r.lo - 64)); - lo = 0; - return *this; - } - } - - const u64 v0 = lo << (r.lo & 63); - const u64 v1 = __shiftleft128(lo, hi, static_cast(r.lo)); - lo = (r.lo & 64) ? 0 : v0; - hi = (r.lo & 64) ? v0 : v1; - return *this; - } - - constexpr u128& operator>>=(const u128& r) - { - if (std::is_constant_evaluated()) - { - if (r.hi == 0 && r.lo < 64) - { - lo = (lo >> r.lo) | (hi << (64 - r.lo)); - hi = (hi >> r.lo); - return *this; - } - else if (r.hi == 0 && r.lo < 128) - { - lo = (hi >> (r.lo - 64)); - hi = 0; - return *this; - } - } - - const u64 v0 = hi >> (r.lo & 63); - const u64 v1 = __shiftright128(lo, hi, static_cast(r.lo)); - lo = (r.lo & 64) ? v0 : v1; - hi = (r.lo & 64) ? 0 : v0; - return *this; - } - - constexpr u128& operator&=(const u128& r) - { - lo &= r.lo; - hi &= r.hi; - return *this; - } - - constexpr u128& operator|=(const u128& r) - { - lo |= r.lo; - hi |= r.hi; - return *this; - } - - constexpr u128& operator^=(const u128& r) - { - lo ^= r.lo; - hi ^= r.hi; - return *this; - } -}; - -// Signed 128-bit integer implementation -struct s128 : u128 -{ - using u128::u128; - - constexpr s128 operator>>(u128 shift_value) const - { - s128 value = *this; - value >>= shift_value; - return value; - } - - constexpr s128& operator>>=(const u128& r) - { - if (std::is_constant_evaluated()) - { - if (r.hi == 0 && r.lo < 64) - { - lo = (lo >> r.lo) | (hi << (64 - r.lo)); - hi = (static_cast(hi) >> r.lo); - return *this; - } - else if (r.hi == 0 && r.lo < 128) - { - s64 _lo = static_cast(hi) >> (r.lo - 64); - lo = _lo; - hi = _lo >> 63; - return *this; - } - } - - const u64 v0 = static_cast(hi) >> (r.lo & 63); - const u64 v1 = __shiftright128(lo, hi, static_cast(r.lo)); - lo = (r.lo & 64) ? v0 : v1; - hi = (r.lo & 64) ? 
static_cast(hi) >> 63 : v0; - return *this; - } -}; -#endif - -// Optimization for u64*u64=u128 -constexpr u128 u128_from_mul(u64 a, u64 b) -{ -#ifdef _MSC_VER - if (!std::is_constant_evaluated()) - { - u64 hi; - u128 result = _umul128(a, b, &hi); - result.hi = hi; - return result; - } -#endif - - return u128{a} * b; -} - -template <> -struct get_int_impl<16> -{ - using utype = u128; - using stype = s128; -}; - -enum class f16 : u16 -{ -}; - -using f32 = float; -using f64 = double; - -template -concept UnsignedInt = std::is_unsigned_v> || std::is_same_v, u128>; - -template -concept SignedInt = (std::is_signed_v> && std::is_integral_v>) || std::is_same_v, s128>; - -template -concept FPInt = std::is_floating_point_v> || std::is_same_v, f16>; - -template -concept Integral = std::is_integral_v> || std::is_same_v, u128> || std::is_same_v, s128>; - -template -constexpr T min_v; - -template -constexpr std::common_type_t min_v = 0; - -template -constexpr std::common_type_t min_v = static_cast>(-1) << (sizeof(std::common_type_t) * 8 - 1); - -template <> -constexpr inline f16 min_v{0xfbffu}; - -template <> -constexpr inline f32 min_v = std::bit_cast(0xff'7fffffu); - -template <> -constexpr inline f64 min_v = std::bit_cast(0xffe'7ffff'ffffffffu); - -template -constexpr std::common_type_t min_v = min_v>; - -template -constexpr T max_v; - -template -constexpr std::common_type_t max_v = -1; - -template -constexpr std::common_type_t max_v = static_cast>(~min_v); - -template <> -constexpr inline f16 max_v{0x7bffu}; - -template <> -constexpr inline f32 max_v = std::bit_cast(0x7f'7fffffu); - -template <> -constexpr inline f64 max_v = std::bit_cast(0x7fe'fffff'ffffffffu); - -template -constexpr std::common_type_t max_v = max_v>; - -// Return magic value for any unsigned type -constexpr struct umax_impl_t -{ - template - constexpr bool operator==(const T& rhs) const - { - return rhs == max_v; - } - - template - constexpr std::strong_ordering operator<=>(const T& rhs) const - { - return rhs == max_v ? std::strong_ordering::equal : std::strong_ordering::greater; - } - - template - constexpr operator T() const - { - return max_v; - } -} umax; - -constexpr struct smin_impl_t -{ - template - constexpr bool operator==(const T& rhs) const - { - return rhs == min_v; - } - - template - constexpr std::strong_ordering operator<=>(const T& rhs) const - { - return rhs == min_v ? std::strong_ordering::equal : std::strong_ordering::less; - } - - template - constexpr operator T() const - { - return min_v; - } -} smin; - -constexpr struct smax_impl_t -{ - template - constexpr bool operator==(const T& rhs) const - { - return rhs == max_v; - } - - template - constexpr std::strong_ordering operator<=>(const T& rhs) const - { - return rhs == max_v ? 
std::strong_ordering::equal : std::strong_ordering::greater; - } - - template - constexpr operator T() const - { - return max_v; - } -} smax; - -// Compare signed or unsigned type with its max value -constexpr struct amax_impl_t -{ - template - requires SignedInt || UnsignedInt - constexpr bool operator==(const T& rhs) const - { - return rhs == max_v; - } - - template - requires SignedInt || UnsignedInt - constexpr std::strong_ordering operator<=>(const T& rhs) const - { - return max_v <=> rhs; - } - - template - requires SignedInt || UnsignedInt - constexpr operator T() const - { - return max_v; - } -} amax; - -// Compare signed or unsigned type with its minimal value (like zero or INT_MIN) -constexpr struct amin_impl_t -{ - template - requires SignedInt || UnsignedInt - constexpr bool operator==(const T& rhs) const - { - return rhs == min_v; - } - - template - requires SignedInt || UnsignedInt - constexpr std::strong_ordering operator<=>(const T& rhs) const - { - return min_v <=> rhs; - } - - template - requires SignedInt || UnsignedInt - constexpr operator T() const - { - return min_v; - } -} amin; - -template -inline u32 offset32(T T2::* const mptr) -{ -#ifdef _MSC_VER - return std::bit_cast(mptr); -#elif __GNUG__ - return std::bit_cast(mptr); -#else - static_assert(sizeof(mptr) == 0, "Unsupported pointer-to-member size"); -#endif -} - -template -struct offset32_array -{ - static_assert(std::is_array_v, "Invalid pointer-to-member type (array expected)"); - - template - static inline u32 index32(const Arg& arg) - { - return u32{sizeof(std::remove_extent_t)} * static_cast(arg); - } -}; - -template -struct offset32_array> -{ - template - static inline u32 index32(const Arg& arg) - { - return u32{sizeof(T)} * static_cast(arg); - } -}; - -template -struct offset32_detail; - -template -inline u32 offset32(T T2::* const mptr, const Arg& arg, const Args&... args) -{ - return offset32_detail::offset32(mptr, arg, args...); -} - -template -struct offset32_detail -{ - template - static inline u32 offset32(T T2::* const mptr, const Arg& arg, const Args&... args) - { - return ::offset32(mptr, args...) + offset32_array::index32(arg); - } -}; - -template -struct offset32_detail -{ - template - static inline u32 offset32(T T2::* const mptr, T3 T4::* const mptr2, const Args&... args) - { - return ::offset32(mptr) + ::offset32(mptr2, args...); - } -}; - -// Convert 0-2-byte string to u16 value like reinterpret_cast does -constexpr u16 operator""_u16(const char* s, usz /*length*/) -{ - char buf[2]{s[0], s[1]}; - return std::bit_cast(buf); -} - -// Convert 3-4-byte string to u32 value like reinterpret_cast does -constexpr u32 operator""_u32(const char* s, usz /*length*/) -{ - char buf[4]{s[0], s[1], s[2], s[3]}; - return std::bit_cast(buf); -} - -// Convert 5-8-byte string to u64 value like reinterpret_cast does -constexpr u64 operator""_u64(const char* s, usz len) -{ - char buf[8]{s[0], s[1], s[2], s[3], s[4], (len < 6 ? '\0' : s[5]), (len < 7 ? '\0' : s[6]), (len < 8 ? 
'\0' : s[7])}; - return std::bit_cast(buf); -} - -#if !defined(__INTELLISENSE__) && !__has_builtin(__builtin_COLUMN) && !defined(_MSC_VER) -constexpr unsigned __builtin_COLUMN() -{ - return -1; -} -#endif - -template -struct const_str_t -{ - static constexpr usz size = Size; - - char8_t chars[Size + 1]{}; - - constexpr const_str_t(const char (&a)[Size + 1]) - { - for (usz i = 0; i <= Size; i++) - chars[i] = a[i]; - } - - constexpr const_str_t(const char8_t (&a)[Size + 1]) - { - for (usz i = 0; i <= Size; i++) - chars[i] = a[i]; - } - - operator const char*() const - { - return reinterpret_cast(chars); - } - - constexpr operator const char8_t*() const - { - return chars; - } -}; - -template <> -struct const_str_t -{ - const usz size; - - union - { - const char8_t* chars; - const char* chars2; - }; - - constexpr const_str_t() - : size(0), chars(nullptr) - { - } - - template - constexpr const_str_t(const char8_t (&a)[N]) - : size(N - 1), chars(+a) - { - } - - template - constexpr const_str_t(const char (&a)[N]) - : size(N - 1), chars2(+a) - { - } - - constexpr operator const char*() const - { - return std::launder(chars2); - } - - constexpr operator const char8_t*() const - { - return chars; - } -}; - -template -const_str_t(const char (&a)[Size]) -> const_str_t; - -template -const_str_t(const char8_t (&a)[Size]) -> const_str_t; - -using const_str = const_str_t<>; - -namespace fmt -{ - [[noreturn]] void raw_verify_error(std::source_location loc, const char8_t* msg, usz object); - [[noreturn]] void raw_range_error(std::source_location loc, std::string_view index, usz container_size); - [[noreturn]] void raw_range_error(std::source_location loc, usz index, usz container_size); -} // namespace fmt - -// No full implementation to ease on header weight -template -std::conditional_t>, usz, std::string_view> format_object_simplified(const T& obj) -{ - using type = std::remove_cvref_t; - - if constexpr (std::is_integral_v || std::is_same_v || std::is_same_v) - { - return obj; - } - else if constexpr (std::is_array_v && std::is_constructible_v) - { - return {obj, std::size(obj) - 1}; - } - else - { - return std::string_view{}; - } -} - -template -constexpr decltype(auto) ensure(T&& arg, const_str msg = const_str(), std::source_location src_loc = std::source_location::current()) noexcept -{ - if (std::forward(arg)) [[likely]] - { - return std::forward(arg); - } - - fmt::raw_verify_error(src_loc, msg, 0); -} - -template - requires(std::is_invocable_v) -constexpr decltype(auto) ensure(T&& arg, F&& pred, const_str msg = const_str(), std::source_location src_loc = std::source_location::current()) noexcept -{ - if (std::forward(pred)(std::forward(arg))) [[likely]] - { - return std::forward(arg); - } - - fmt::raw_verify_error(src_loc, msg, 0); -} - -template - requires(std::is_integral_v() + std::declval())>) -[[nodiscard]] constexpr To narrow(const From& value, std::source_location src_loc = std::source_location::current()) -{ - // Narrow check - using CommonFrom = std::common_type_t; - using CommonTo = std::common_type_t; - - using UnFrom = std::make_unsigned_t; - using UnTo = std::make_unsigned_t; - - constexpr bool is_from_signed = std::is_signed_v; - constexpr bool is_to_signed = std::is_signed_v; - - constexpr auto from_mask = (is_from_signed && !is_to_signed) ? UnFrom{umax} >> 1 : UnFrom{umax}; - constexpr auto to_mask = (is_to_signed && !is_from_signed) ? 
UnTo{umax} >> 1 : UnTo{umax}; - - constexpr auto mask = ~(from_mask & to_mask); - - // Signed to unsigned always require test - // Otherwise, this is bit-wise narrowing or conversion between types of different signedness of the same size - if constexpr ((is_from_signed && !is_to_signed) || to_mask < from_mask) - { - // Try to optimize test if both are of the same signedness - if (is_from_signed != is_to_signed ? !!(value & mask) : static_cast(value) != value) [[unlikely]] - { - fmt::raw_verify_error(src_loc, u8"Narrowing error", +value); - } - } - - return static_cast(value); -} - -// Returns u32 size() for container -template - requires requires(const CT& x) { std::size(x); } -[[nodiscard]] constexpr u32 size32(const CT& container, std::source_location src_loc = std::source_location::current()) -{ - // TODO: Support std::array - constexpr bool is_const = std::is_array_v>; - - if constexpr (is_const) - { - constexpr usz Size = sizeof(container) / sizeof(container[0]); - return std::conditional_t{Size}; - } - else - { - return narrow(container.size(), src_loc); - } -} - -template - requires requires(CT&& x) { std::size(x); std::data(x); } || requires(CT&& x) { std::size(x); x.front(); } -[[nodiscard]] constexpr auto& at32(CT&& container, T&& index, std::source_location src_loc = std::source_location::current()) -{ - // Make sure the index is within u32 range - const std::make_unsigned_t> idx = index; - const u32 csz = ::size32(container, src_loc); - if (csz <= idx) [[unlikely]] - fmt::raw_range_error(src_loc, format_object_simplified(index), csz); - auto it = std::begin(std::forward(container)); - std::advance(it, idx); - return *it; -} - -template - requires requires(CT&& x, T&& y) { x.count(y); x.find(y); } -[[nodiscard]] constexpr auto& at32(CT&& container, T&& index, std::source_location src_loc = std::source_location::current()) -{ - // Associative container - const auto found = container.find(std::forward(index)); - usz csv = umax; - if constexpr ((requires() { container.size(); })) - csv = container.size(); - if (found == container.end()) [[unlikely]] - fmt::raw_range_error(src_loc, format_object_simplified(index), csv); - return found->second; -} - -// Simplified hash algorithm. May be used in std::unordered_(map|set). -template -struct value_hash -{ - usz operator()(T value) const - { - return static_cast(value) >> Shift; - } -}; - -template -struct fill_array_t -{ - std::tuple args; - - template - constexpr std::unwrap_reference_t get() const - { - return std::get(args); - } - - template - constexpr std::array fill(std::index_sequence, std::index_sequence) const - { - return {(static_cast(Idx), U(get()...))...}; - } - - template - constexpr operator std::array() const - { - return fill(std::make_index_sequence(), std::make_index_sequence()); - } -}; - -template -constexpr auto fill_array(const T&... 
args) -{ - return fill_array_t{{args...}}; -} - -template -concept PtrCastable = requires(const volatile X* x, const volatile Y* y) { - static_cast(x); - static_cast(y); -}; - -template - requires PtrCastable -consteval bool is_same_ptr() -{ - if constexpr (std::is_void_v || std::is_void_v || std::is_same_v, std::remove_cv_t>) - { - return true; - } - else if constexpr (sizeof(X) == sizeof(Y)) - { - return true; - } - else - { - bool result = false; - - if constexpr (sizeof(X) < sizeof(Y)) - { - std::allocator a{}; - Y* ptr = a.allocate(1); - result = static_cast(ptr) == static_cast(ptr); - a.deallocate(ptr, 1); - } - else - { - std::allocator a{}; - X* ptr = a.allocate(1); - result = static_cast(ptr) == static_cast(ptr); - a.deallocate(ptr, 1); - } - - return result; - } -} - -template - requires PtrCastable -constexpr bool is_same_ptr(const volatile Y* ptr) -{ - return static_cast(ptr) == static_cast(ptr); -} - -template -concept PtrSame = (is_same_ptr()); - -namespace stx -{ - template - struct exact_t - { - static_assert(std::is_reference_v || std::is_convertible_v); - - T obj; - - explicit exact_t(T&& _obj) : obj(std::forward(_obj)) {} - exact_t& operator=(const exact_t&) = delete; - - template - requires(std::is_same_v) - operator U&() const noexcept - { - return obj; - }; - - template - requires(std::is_same_v) - operator const U&() const noexcept - { - return obj; - }; - - template - requires(std::is_same_v && std::is_copy_constructible_v) - operator U() const noexcept - { - return obj; - }; - }; - - template - stx::exact_t make_exact(T&& obj) noexcept - { - return stx::exact_t(static_cast(obj)); - } -} // namespace stx - -// Read object of type T from raw pointer, array, string, vector, or any contiguous container -template -constexpr T read_from_ptr(U&& array, usz pos = 0) -{ - // TODO: ensure array element types are trivial - static_assert(sizeof(T) % sizeof(array[0]) == 0); - std::decay_t buf[sizeof(T) / sizeof(array[0])]; - if (!std::is_constant_evaluated()) - std::memcpy(+buf, &array[pos], sizeof(buf)); - else - for (usz i = 0; i < pos; buf[i] = array[pos + i], i++) - ; - return std::bit_cast(buf); -} - -template -constexpr void write_to_ptr(U&& array, usz pos, const T& value) -{ - static_assert(sizeof(T) % sizeof(array[0]) == 0); - if (!std::is_constant_evaluated()) - std::memcpy(static_cast(&array[pos]), &value, sizeof(value)); - else - ensure(!"Unimplemented"); -} - -template -constexpr void write_to_ptr(U&& array, const T& value) -{ - static_assert(sizeof(T) % sizeof(array[0]) == 0); - if (!std::is_constant_evaluated()) - std::memcpy(&array[0], &value, sizeof(value)); - else - ensure(!"Unimplemented"); -} - -constexpr struct aref_tag_t -{ -} aref_tag{}; - -template -class aref final -{ - U* m_ptr; - - static_assert(sizeof(std::decay_t) % sizeof(U) == 0); - -public: - aref() = delete; - - constexpr aref(const aref&) = default; - - explicit constexpr aref(aref_tag_t, U* ptr) - : m_ptr(ptr) - { - } - - constexpr T value() const - { - return read_from_ptr(m_ptr); - } - - constexpr operator T() const - { - return read_from_ptr(m_ptr); - } - - aref& operator=(const aref&) = delete; - - constexpr aref& operator=(const T& value) const - { - write_to_ptr(m_ptr, value); - return *this; - } - - template - requires(std::is_convertible_v) && PtrSame - aref ref(MT T2::* const mptr) const - { - return aref(aref_tag, m_ptr + offset32(mptr) / sizeof(U)); - } - - template > - requires(std::is_convertible_v) && PtrSame - aref ref(MT T2::* const mptr, usz index) const - { - return 
aref(aref_tag, m_ptr + offset32(mptr) / sizeof(U) + sizeof(ET) / sizeof(U) * index); - } -}; - -template -class aref -{ - U* m_ptr; - - static_assert(sizeof(std::decay_t) % sizeof(U) == 0); - -public: - aref() = delete; - - constexpr aref(const aref&) = default; - - explicit constexpr aref(aref_tag_t, U* ptr) - : m_ptr(ptr) - { - } - - aref& operator=(const aref&) = delete; - - constexpr aref operator[](usz index) const - { - return aref(aref_tag, m_ptr + index * (sizeof(T) / sizeof(U))); - } -}; - -template -class aref -{ - U* m_ptr; - - static_assert(sizeof(std::decay_t) % sizeof(U) == 0); - -public: - aref() = delete; - - constexpr aref(const aref&) = default; - - explicit constexpr aref(aref_tag_t, U* ptr) - : m_ptr(ptr) - { - } - - aref& operator=(const aref&) = delete; - - constexpr aref operator[](usz index) const - { - return aref(aref_tag, m_ptr + index * (sizeof(T) / sizeof(U))); - } -}; - -// Reference object of type T, see read_from_ptr -template -constexpr auto ref_ptr(U&& array, usz pos = 0) -> aref> -{ - return aref>(aref_tag, &array[pos]); -} - namespace utils { struct serial; @@ -1374,9 +42,3 @@ extern bool serialize(utils::serial& ar, T& obj); #define ENABLE_BITWISE_SERIALIZATION using enable_bitcopy = std::true_type; #define SAVESTATE_INIT_POS(...) static constexpr double savestate_init_pos = (__VA_ARGS__) - -#define UNUSED(expr) \ - do \ - { \ - (void)(expr); \ - } while (0) diff --git a/rpcs3/util/v128.hpp b/rpcs3/util/v128.hpp index 0a5061dcd..80151d29e 100644 --- a/rpcs3/util/v128.hpp +++ b/rpcs3/util/v128.hpp @@ -1,223 +1,6 @@ #pragma once // No BOM and only basic ASCII in this header, or a neko will die #include "util/types.hpp" +#include -template -concept Vector128 = (sizeof(T) == 16) && (std::is_trivial_v); - -// 128-bit vector type -union alignas(16) v128 -{ - uchar _bytes[16]; - char _chars[16]; - - template - struct masked_array_t // array type accessed as (index ^ M) - { - T m_data[N]; - - public: - T& operator[](usz index) - { - return m_data[index ^ M]; - } - - const T& operator[](usz index) const - { - return m_data[index ^ M]; - } - }; - - template - using normal_array_t = masked_array_t; - template - using reversed_array_t = masked_array_t; - - normal_array_t _u64; - normal_array_t _s64; - reversed_array_t u64r; - reversed_array_t s64r; - - normal_array_t _u32; - normal_array_t _s32; - reversed_array_t u32r; - reversed_array_t s32r; - - normal_array_t _u16; - normal_array_t _s16; - reversed_array_t u16r; - reversed_array_t s16r; - - normal_array_t _u8; - normal_array_t _s8; - reversed_array_t u8r; - reversed_array_t s8r; - - normal_array_t _f; - normal_array_t _d; - reversed_array_t fr; - reversed_array_t dr; - - u128 _u; - s128 _s; - - v128() = default; - - constexpr v128(const v128&) noexcept = default; - - template - constexpr v128(const T& rhs) noexcept - : v128(std::bit_cast(rhs)) - { - } - - constexpr v128& operator=(const v128&) noexcept = default; - - template - constexpr operator T() const noexcept - { - return std::bit_cast(*this); - } - - ENABLE_BITWISE_SERIALIZATION; - - static v128 from64(u64 _0, u64 _1 = 0) - { - v128 ret; - ret._u64[0] = _0; - ret._u64[1] = _1; - return ret; - } - - static v128 from64r(u64 _1, u64 _0 = 0) - { - return from64(_0, _1); - } - - static v128 from64p(u64 value) - { - v128 ret; - ret._u64[0] = value; - ret._u64[1] = value; - return ret; - } - - static v128 from32(u32 _0, u32 _1 = 0, u32 _2 = 0, u32 _3 = 0) - { - v128 ret; - ret._u32[0] = _0; - ret._u32[1] = _1; - ret._u32[2] = _2; - ret._u32[3] = _3; - return 
ret; - } - - static v128 from32r(u32 _3, u32 _2 = 0, u32 _1 = 0, u32 _0 = 0) - { - return from32(_0, _1, _2, _3); - } - - static v128 from32p(u32 value) - { - v128 ret; - ret._u32[0] = value; - ret._u32[1] = value; - ret._u32[2] = value; - ret._u32[3] = value; - return ret; - } - - static v128 fromf32p(f32 value) - { - v128 ret; - ret._f[0] = value; - ret._f[1] = value; - ret._f[2] = value; - ret._f[3] = value; - return ret; - } - - static v128 from16p(u16 value) - { - v128 ret; - ret._u16[0] = value; - ret._u16[1] = value; - ret._u16[2] = value; - ret._u16[3] = value; - ret._u16[4] = value; - ret._u16[5] = value; - ret._u16[6] = value; - ret._u16[7] = value; - return ret; - } - - static v128 from8p(u8 value) - { - v128 ret; - std::memset(&ret, value, sizeof(ret)); - return ret; - } - - static v128 undef() - { -#if defined(__GNUC__) || defined(__clang__) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wuninitialized" -#elif _MSC_VER -#pragma warning(push) -#pragma warning(disable : 6001) -#endif - v128 ret; - return ret; -#if defined(__GNUC__) || defined(__clang__) -#pragma GCC diagnostic pop -#elif _MSC_VER -#pragma warning(pop) -#endif - } - - // Unaligned load with optional index offset - static v128 loadu(const void* ptr, usz index = 0) - { - v128 ret; - std::memcpy(&ret, static_cast(ptr) + index * sizeof(v128), sizeof(v128)); - return ret; - } - - // Unaligned store with optional index offset - static void storeu(v128 value, void* ptr, usz index = 0) - { - std::memcpy(static_cast(ptr) + index * sizeof(v128), &value, sizeof(v128)); - } - - v128 operator|(const v128&) const; - v128 operator&(const v128&) const; - v128 operator^(const v128&) const; - v128 operator~() const; - - bool operator==(const v128& right) const; - - void clear() - { - *this = {}; - } -}; - -template -struct offset32_array> -{ - template - static inline u32 index32(const Arg& arg) - { - return u32{sizeof(T)} * (static_cast(arg) ^ static_cast(M)); - } -}; - -template <> -struct std::hash -{ - usz operator()(const v128& key) const - { - return key._u64[0] + key._u64[1]; - } -}; +using rx::v128; diff --git a/rpcs3/util/vm_native.cpp b/rpcs3/util/vm_native.cpp index 008b13fde..188857a03 100644 --- a/rpcs3/util/vm_native.cpp +++ b/rpcs3/util/vm_native.cpp @@ -188,16 +188,19 @@ namespace utils { static const long r = []() -> long { + long result; #ifdef _WIN32 SYSTEM_INFO info; ::GetSystemInfo(&info); - return info.dwPageSize; + result = info.dwPageSize; #else - return ::sysconf(_SC_PAGESIZE); + result = ::sysconf(_SC_PAGESIZE); #endif + ensure(result, FN(((x & (x - 1)) == 0 && x > 0 && x <= 0x10000))); + return result; }(); - return ensure(r, FN(((x & (x - 1)) == 0 && x > 0 && x <= 0x10000))); + return r; } // Convert memory protection (internal) diff --git a/rpcs3qt-legacy/emu_settings.cpp b/rpcs3qt-legacy/emu_settings.cpp index 277f1d354..459262c4b 100644 --- a/rpcs3qt-legacy/emu_settings.cpp +++ b/rpcs3qt-legacy/emu_settings.cpp @@ -1244,8 +1244,9 @@ QString emu_settings::GetLocalizedSetting(const QString& original, emu_settings_ case emu_settings_type::PPUDecoder: switch (static_cast(index)) { - case ppu_decoder_type::_static: return tr("Interpreter (static)", "PPU decoder"); - case ppu_decoder_type::llvm: return tr("Recompiler (LLVM)", "PPU decoder"); + case ppu_decoder_type::_static: return tr("Interpreter (Legacy)", "PPU decoder"); + case ppu_decoder_type::llvm_legacy: return tr("LLVM Recompiler (Legacy)", "PPU decoder"); + case ppu_decoder_type::interpreter: return tr("Interpreter", "PPU 
decoder"); } break; case emu_settings_type::SPUDecoder: diff --git a/rpcs3qt-legacy/settings_dialog.cpp b/rpcs3qt-legacy/settings_dialog.cpp index 38d731bec..090af6a06 100644 --- a/rpcs3qt-legacy/settings_dialog.cpp +++ b/rpcs3qt-legacy/settings_dialog.cpp @@ -367,7 +367,8 @@ settings_dialog::settings_dialog(std::shared_ptr gui_settings, std QButtonGroup* ppu_bg = new QButtonGroup(this); ppu_bg->addButton(ui->ppu__static, static_cast(ppu_decoder_type::_static)); - ppu_bg->addButton(ui->ppu_llvm, static_cast(ppu_decoder_type::llvm)); + ppu_bg->addButton(ui->ppu_llvm, static_cast(ppu_decoder_type::llvm_legacy)); + ppu_bg->addButton(ui->ppu_interpreter, static_cast(ppu_decoder_type::interpreter)); connect(ppu_bg, &QButtonGroup::idToggled, [this](int id, bool checked) { @@ -376,12 +377,13 @@ settings_dialog::settings_dialog(std::shared_ptr gui_settings, std switch (id) { + case static_cast(ppu_decoder_type::interpreter): case static_cast(ppu_decoder_type::_static): ui->accuratePPUFPCC->setEnabled(true); ui->accuratePPUNJ->setEnabled(true); ui->accuratePPUVNAN->setEnabled(true); break; - case static_cast(ppu_decoder_type::llvm): + case static_cast(ppu_decoder_type::llvm_legacy): ui->accuratePPUFPCC->setEnabled(false); ui->accuratePPUNJ->setEnabled(false); ui->accuratePPUVNAN->setEnabled(false); diff --git a/rpcs3qt-legacy/settings_dialog.ui b/rpcs3qt-legacy/settings_dialog.ui index 08a453d1f..39ab1d1fa 100644 --- a/rpcs3qt-legacy/settings_dialog.ui +++ b/rpcs3qt-legacy/settings_dialog.ui @@ -74,14 +74,21 @@ - Interpreter (static) + Interpreter (Legacy) - LLVM Recompiler (fastest) + LLVM Recompiler (Legacy) + + + + + + + Interpreter diff --git a/rpcsx/CMakeLists.txt b/rpcsx/CMakeLists.txt index 869e24851..fd653db2a 100644 --- a/rpcsx/CMakeLists.txt +++ b/rpcsx/CMakeLists.txt @@ -1,103 +1,107 @@ -find_package(libunwind REQUIRED) -find_package(sox REQUIRED) -find_package(ALSA REQUIRED) - add_library(standalone-config INTERFACE) target_include_directories(standalone-config INTERFACE orbis-kernel-config) add_library(orbis::kernel::config ALIAS standalone-config) -add_executable(rpcsx - audio/AudioDevice.cpp - audio/AlsaDevice.cpp +add_subdirectory(cpu) - iodev/a53io.cpp - iodev/ajm.cpp - iodev/blockpool.cpp - iodev/bt.cpp - iodev/camera.cpp - iodev/cd.cpp - iodev/console.cpp - iodev/hdd.cpp - iodev/dce.cpp - iodev/dipsw.cpp - iodev/dmem.cpp - iodev/gc.cpp - iodev/hid.cpp - iodev/hmd_3da.cpp - iodev/hmd_cmd.cpp - iodev/hmd_mmap.cpp - iodev/hmd_snsr.cpp - iodev/hmd2_cmd.cpp - iodev/hmd2_imu.cpp - iodev/hmd2_gen_data.cpp - iodev/hmd2_gaze.cpp - iodev/icc_configuration.cpp - iodev/mbus.cpp - iodev/metadbg.cpp - iodev/notification.cpp - iodev/npdrm.cpp - iodev/nsid_ctl.cpp - iodev/null.cpp - iodev/rng.cpp - iodev/sbl_srv.cpp - iodev/shm.cpp - iodev/urandom.cpp - iodev/xpt.cpp - iodev/zero.cpp - iodev/aout.cpp - iodev/av_control.cpp - iodev/hdmi.cpp - iodev/mbus_av.cpp - iodev/scanin.cpp - iodev/s3da.cpp - iodev/gbase.cpp - iodev/devstat.cpp - iodev/devact.cpp - iodev/devctl.cpp - iodev/uvd.cpp - iodev/vce.cpp - iodev/evlg.cpp - iodev/srtc.cpp - iodev/sshot.cpp - iodev/lvdctl.cpp - iodev/icc_power.cpp - iodev/cayman_reg.cpp +if(LINUX AND WITH_RPCSX) + find_package(libunwind REQUIRED) + find_package(sox REQUIRED) + find_package(ALSA REQUIRED) - main.cpp - AudioOut.cpp - backtrace.cpp - vm.cpp - ops.cpp - linker.cpp - io-device.cpp - thread.cpp - vfs.cpp - ipmi.cpp -) + add_subdirectory(gpu) + add_subdirectory(core) -add_subdirectory(gpu) -add_subdirectory(core) + add_executable(rpcsx + 
audio/AudioDevice.cpp + audio/AlsaDevice.cpp -target_include_directories(rpcsx PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) -target_link_libraries(rpcsx -PUBLIC - ffmpeg::avcodec - ffmpeg::swresample - ffmpeg::avutil - Atrac9 - rpcsx-gpu - orbis::kernel - rx - libcrypto - libunwind::unwind-x86_64 - xbyak::xbyak - sox::sox - ALSA::ALSA - rpcsx-core -) + iodev/a53io.cpp + iodev/ajm.cpp + iodev/blockpool.cpp + iodev/bt.cpp + iodev/camera.cpp + iodev/cd.cpp + iodev/console.cpp + iodev/hdd.cpp + iodev/dce.cpp + iodev/dipsw.cpp + iodev/dmem.cpp + iodev/gc.cpp + iodev/hid.cpp + iodev/hmd_3da.cpp + iodev/hmd_cmd.cpp + iodev/hmd_mmap.cpp + iodev/hmd_snsr.cpp + iodev/hmd2_cmd.cpp + iodev/hmd2_imu.cpp + iodev/hmd2_gen_data.cpp + iodev/hmd2_gaze.cpp + iodev/icc_configuration.cpp + iodev/mbus.cpp + iodev/metadbg.cpp + iodev/notification.cpp + iodev/npdrm.cpp + iodev/nsid_ctl.cpp + iodev/null.cpp + iodev/rng.cpp + iodev/sbl_srv.cpp + iodev/shm.cpp + iodev/urandom.cpp + iodev/xpt.cpp + iodev/zero.cpp + iodev/aout.cpp + iodev/av_control.cpp + iodev/hdmi.cpp + iodev/mbus_av.cpp + iodev/scanin.cpp + iodev/s3da.cpp + iodev/gbase.cpp + iodev/devstat.cpp + iodev/devact.cpp + iodev/devctl.cpp + iodev/uvd.cpp + iodev/vce.cpp + iodev/evlg.cpp + iodev/srtc.cpp + iodev/sshot.cpp + iodev/lvdctl.cpp + iodev/icc_power.cpp + iodev/cayman_reg.cpp -target_base_address(rpcsx 0x0000070000000000) -target_compile_options(rpcsx PRIVATE "-mfsgsbase") + main.cpp + AudioOut.cpp + backtrace.cpp + vm.cpp + ops.cpp + linker.cpp + io-device.cpp + thread.cpp + vfs.cpp + ipmi.cpp + ) + + target_base_address(rpcsx 0x0000070000000000) + target_compile_options(rpcsx PRIVATE "-mfsgsbase") + set_target_properties(rpcsx PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) + install(TARGETS rpcsx RUNTIME DESTINATION bin) + + target_include_directories(rpcsx PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) + target_link_libraries(rpcsx + PUBLIC + ffmpeg::avcodec + ffmpeg::swresample + ffmpeg::avutil + Atrac9 + rpcsx-gpu + orbis::kernel + rx + libcrypto + libunwind::unwind-x86_64 + xbyak::xbyak + sox::sox + ALSA::ALSA + rpcsx-core + ) +endif() -set_target_properties(rpcsx PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) -install(TARGETS rpcsx RUNTIME DESTINATION bin) diff --git a/rpcsx/cpu/CMakeLists.txt b/rpcsx/cpu/CMakeLists.txt new file mode 100644 index 000000000..4ddccd7cb --- /dev/null +++ b/rpcsx/cpu/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(cell) diff --git a/rpcsx/cpu/cell/CMakeLists.txt b/rpcsx/cpu/cell/CMakeLists.txt new file mode 100644 index 000000000..c17385891 --- /dev/null +++ b/rpcsx/cpu/cell/CMakeLists.txt @@ -0,0 +1,3 @@ + +add_subdirectory(ppu) + diff --git a/rpcsx/cpu/cell/ppu/CMakeLists.txt b/rpcsx/cpu/cell/ppu/CMakeLists.txt new file mode 100644 index 000000000..0da7bccf1 --- /dev/null +++ b/rpcsx/cpu/cell/ppu/CMakeLists.txt @@ -0,0 +1,32 @@ +add_library( + rpcsx_cpu_cell_ppu STATIC + src/Decoder.cpp +) + +add_library(rpcsx_cpu_cell_ppu_semantic +STATIC + semantic/ppu.cpp +) +target_include_directories(rpcsx_cpu_cell_ppu_semantic PUBLIC include PRIVATE include/rx/cpu/cell/ppu) +target_link_libraries(rpcsx_cpu_cell_ppu_semantic PUBLIC rx) + +# add_custom_command( +# OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ppu.ll +# COMMAND ${CLANG_EXECUTABLE} -O3 -S -emit-llvm semantic/ppu.cpp -o ${CMAKE_CURRENT_BINARY_DIR}/ppu.ll -I include/rx/cpu/cell/ppu/ -I ../../../../rx/include/ -std=c++23 -fno-exceptions +# WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} +# ) + +# add_custom_target(ppu-semantic DEPENDS 
${CMAKE_CURRENT_BINARY_DIR}/ppu.ll) + +target_include_directories(rpcsx_cpu_cell_ppu + PUBLIC + include + PRIVATE + include/rx/cpu/cell/ppu +) + +target_link_libraries(rpcsx_cpu_cell_ppu PUBLIC rx) +# add_dependencies(rpcsx_cpu_cell_ppu ppu-semantic) +add_library(rpcsx::cpu::cell::ppu ALIAS rpcsx_cpu_cell_ppu) +add_library(rpcsx::cpu::cell::ppu::semantic ALIAS rpcsx_cpu_cell_ppu_semantic) + diff --git a/rpcsx/cpu/cell/ppu/include/rx/cpu/cell/ppu/Decoder.hpp b/rpcsx/cpu/cell/ppu/include/rx/cpu/cell/ppu/Decoder.hpp new file mode 100644 index 000000000..b945b1c7c --- /dev/null +++ b/rpcsx/cpu/cell/ppu/include/rx/cpu/cell/ppu/Decoder.hpp @@ -0,0 +1,23 @@ +#pragma once + +#include "Opcode.hpp" +#include +#include +#include + +namespace rx::cell::ppu { +template using DecoderTable = std::array; + +extern DecoderTable g_ppuOpcodeTable; +// extern std::array> g_opcodeForms; + +inline Opcode getOpcode(std::uint32_t instruction) { + auto decode = [](std::uint32_t inst) { + return ((inst >> 26) | (inst << 6)) & 0x1ffff; // Rotate + mask + }; + + return g_ppuOpcodeTable[decode(instruction)]; +} + +Opcode fixOpcode(Opcode opcode, std::uint32_t instruction); +} // namespace rx::cell::ppu diff --git a/rpcsx/cpu/cell/ppu/include/rx/cpu/cell/ppu/Emitter.hpp b/rpcsx/cpu/cell/ppu/include/rx/cpu/cell/ppu/Emitter.hpp new file mode 100644 index 000000000..93b1d358e --- /dev/null +++ b/rpcsx/cpu/cell/ppu/include/rx/cpu/cell/ppu/Emitter.hpp @@ -0,0 +1,410 @@ +#pragma once + +#include "Instruction.hpp" +#include + +namespace rx::cell::ppu { +inline namespace registers { +enum { + r0, + r1, + r2, + r3, + r4, + r5, + r6, + r7, + r8, + r9, + r10, + r11, + r12, + r13, + r14, + r15, + r16, + r17, + r18, + r19, + r20, + r21, + r22, + r23, + r24, + r25, + r26, + r27, + r28, + r29, + r30, + r31, +}; + +enum { + f0, + f1, + f2, + f3, + f4, + f5, + f6, + f7, + f8, + f9, + f10, + f11, + f12, + f13, + f14, + f15, + F16, + f17, + f18, + f19, + f20, + f21, + f22, + f23, + f24, + f25, + f26, + f27, + f28, + f29, + f30, + f31, +}; + +enum { + v0, + v1, + v2, + v3, + v4, + v5, + v6, + v7, + v8, + v9, + v10, + v11, + v12, + v13, + v14, + v15, + v16, + v17, + v18, + v19, + v20, + v21, + v22, + v23, + v24, + v25, + v26, + v27, + v28, + v29, + v30, + v31, +}; + +enum { + cr0, + cr1, + cr2, + cr3, + cr4, + cr5, + cr6, + cr7, +}; +} // namespace registers + +inline std::uint32_t ADDI(std::uint32_t rt, std::uint32_t ra, std::int32_t si) { + Instruction op{0x0eu << 26}; + op.rd = rt; + op.ra = ra; + op.simm16 = si; + return op.raw; +} +inline std::uint32_t ADDIS(std::uint32_t rt, std::uint32_t ra, + std::int32_t si) { + Instruction op{0x0fu << 26}; + op.rd = rt; + op.ra = ra; + op.simm16 = si; + return op.raw; +} +inline std::uint32_t XORIS(std::uint32_t rt, std::uint32_t ra, + std::int32_t si) { + Instruction op{0x1bu << 26}; + op.rd = rt; + op.ra = ra; + op.simm16 = si; + return op.raw; +} +inline std::uint32_t ORI(std::uint32_t rt, std::uint32_t ra, std::uint32_t ui) { + Instruction op{0x18u << 26}; + op.rd = rt; + op.ra = ra; + op.uimm16 = ui; + return op.raw; +} +inline std::uint32_t ORIS(std::uint32_t rt, std::uint32_t ra, + std::uint32_t ui) { + Instruction op{0x19u << 26}; + op.rd = rt; + op.ra = ra; + op.uimm16 = ui; + return op.raw; +} +inline std::uint32_t OR(std::uint32_t ra, std::uint32_t rs, std::uint32_t rb, + bool rc = false) { + Instruction op{0x1fu << 26 | 0x1bcu << 1}; + op.rs = rs; + op.ra = ra; + op.rb = rb; + op.rc = rc; + return op.raw; +} +inline std::uint32_t SC(std::uint32_t lev) { + Instruction op{0x11u << 26 | 
1 << 1}; + op.lev = lev; + return op.raw; +} +inline std::uint32_t B(std::int32_t li, bool aa = false, bool lk = false) { + Instruction op{0x12u << 26}; + op.ll = li; + op.aa = aa; + op.lk = lk; + return op.raw; +} +inline std::uint32_t BC(std::uint32_t bo, std::uint32_t bi, std::int32_t bd, + bool aa = false, bool lk = false) { + Instruction op{0x10u << 26}; + op.bo = bo; + op.bi = bi; + op.ds = bd / 4; + op.aa = aa; + op.lk = lk; + return op.raw; +} +inline std::uint32_t BCLR(std::uint32_t bo, std::uint32_t bi, std::uint32_t bh, + bool lk = false) { + Instruction op{0x13u << 26 | 0x10u << 1}; + op.bo = bo; + op.bi = bi; + op.bh = bh; + op.lk = lk; + return op.raw; +} +inline std::uint32_t BCCTR(std::uint32_t bo, std::uint32_t bi, std::uint32_t bh, + bool lk = false) { + Instruction op{0x13u << 26 | 0x210u << 1}; + op.bo = bo; + op.bi = bi; + op.bh = bh; + op.lk = lk; + return op.raw; +} +inline std::uint32_t MFSPR(std::uint32_t rt, std::uint32_t spr) { + Instruction op{0x1fu << 26 | 0x153u << 1}; + op.rd = rt; + op.spr = spr; + return op.raw; +} +inline std::uint32_t MTSPR(std::uint32_t spr, std::uint32_t rs) { + Instruction op{0x1fu << 26 | 0x1d3u << 1}; + op.rs = rs; + op.spr = spr; + return op.raw; +} +inline std::uint32_t LWZ(std::uint32_t rt, std::uint32_t ra, std::int32_t si) { + Instruction op{0x20u << 26}; + op.rd = rt; + op.ra = ra; + op.simm16 = si; + return op.raw; +} +inline std::uint32_t STW(std::uint32_t rt, std::uint32_t ra, std::int32_t si) { + Instruction op{0x24u << 26}; + op.rd = rt; + op.ra = ra; + op.simm16 = si; + return op.raw; +} +inline std::uint32_t STD(std::uint32_t rs, std::uint32_t ra, std::int32_t si) { + Instruction op{0x3eu << 26}; + op.rs = rs; + op.ra = ra; + op.ds = si / 4; + return op.raw; +} +inline std::uint32_t STDU(std::uint32_t rs, std::uint32_t ra, std::int32_t si) { + Instruction op{0x3eu << 26 | 1}; + op.rs = rs; + op.ra = ra; + op.ds = si / 4; + return op.raw; +} +inline std::uint32_t LD(std::uint32_t rt, std::uint32_t ra, std::int32_t si) { + Instruction op{0x3au << 26}; + op.rd = rt; + op.ra = ra; + op.ds = si / 4; + return op.raw; +} +inline std::uint32_t LDU(std::uint32_t rt, std::uint32_t ra, std::int32_t si) { + Instruction op{0x3au << 26 | 1}; + op.rd = rt; + op.ra = ra; + op.ds = si / 4; + return op.raw; +} +inline std::uint32_t CMPI(std::uint32_t bf, std::uint32_t l, std::uint32_t ra, + std::uint32_t ui) { + Instruction op{0xbu << 26}; + op.crfd = bf; + op.l10 = l; + op.ra = ra; + op.uimm16 = ui; + return op.raw; +} +inline std::uint32_t CMPLI(std::uint32_t bf, std::uint32_t l, std::uint32_t ra, + std::uint32_t ui) { + Instruction op{0xau << 26}; + op.crfd = bf; + op.l10 = l; + op.ra = ra; + op.uimm16 = ui; + return op.raw; +} +inline std::uint32_t RLDICL(std::uint32_t ra, std::uint32_t rs, + std::uint32_t sh, std::uint32_t mb, + bool rc = false) { + Instruction op{30 << 26}; + op.ra = ra; + op.rs = rs; + op.sh64 = sh; + op.mbe64 = mb; + op.rc = rc; + return op.raw; +} +inline std::uint32_t RLDICR(std::uint32_t ra, std::uint32_t rs, + std::uint32_t sh, std::uint32_t mb, + bool rc = false) { + return RLDICL(ra, rs, sh, mb, rc) | 1 << 2; +} +inline std::uint32_t STFD(std::uint32_t frs, std::uint32_t ra, + std::int32_t si) { + Instruction op{54u << 26}; + op.frs = frs; + op.ra = ra; + op.simm16 = si; + return op.raw; +} +inline std::uint32_t STVX(std::uint32_t vs, std::uint32_t ra, + std::uint32_t rb) { + Instruction op{31 << 26 | 231 << 1}; + op.vs = vs; + op.ra = ra; + op.rb = rb; + return op.raw; +} +inline std::uint32_t 
LFD(std::uint32_t frd, std::uint32_t ra, std::int32_t si) { + Instruction op{50u << 26}; + op.frd = frd; + op.ra = ra; + op.simm16 = si; + return op.raw; +} +inline std::uint32_t LVX(std::uint32_t vd, std::uint32_t ra, std::uint32_t rb) { + Instruction op{31 << 26 | 103 << 1}; + op.vd = vd; + op.ra = ra; + op.rb = rb; + return op.raw; +} +inline constexpr std::uint32_t EIEIO() { return 0x7c0006ac; } + +inline namespace implicts { +inline std::uint32_t NOP() { return ORI(r0, r0, 0); } +inline std::uint32_t MR(std::uint32_t rt, std::uint32_t ra) { + return OR(rt, ra, ra, false); +} +inline std::uint32_t LI(std::uint32_t rt, std::uint32_t imm) { + return ADDI(rt, r0, imm); +} +inline std::uint32_t LIS(std::uint32_t rt, std::uint32_t imm) { + return ADDIS(rt, r0, imm); +} + +inline std::uint32_t BLR() { return BCLR(0x10 | 0x04, 0, 0); } +inline std::uint32_t BCTR() { return BCCTR(0x10 | 0x04, 0, 0); } +inline std::uint32_t BCTRL() { return BCCTR(0x10 | 0x04, 0, 0, true); } +inline std::uint32_t MFCTR(std::uint32_t reg) { return MFSPR(reg, 9 << 5); } +inline std::uint32_t MTCTR(std::uint32_t reg) { return MTSPR(9 << 5, reg); } +inline std::uint32_t MFLR(std::uint32_t reg) { return MFSPR(reg, 8 << 5); } +inline std::uint32_t MTLR(std::uint32_t reg) { return MTSPR(8 << 5, reg); } + +inline std::uint32_t BNE(std::uint32_t cr, std::int32_t imm) { + return BC(4, 2 | cr << 2, imm); +} +inline std::uint32_t BEQ(std::uint32_t cr, std::int32_t imm) { + return BC(12, 2 | cr << 2, imm); +} +inline std::uint32_t BGT(std::uint32_t cr, std::int32_t imm) { + return BC(12, 1 | cr << 2, imm); +} +inline std::uint32_t BNE(std::int32_t imm) { return BNE(cr0, imm); } +inline std::uint32_t BEQ(std::int32_t imm) { return BEQ(cr0, imm); } +inline std::uint32_t BGT(std::int32_t imm) { return BGT(cr0, imm); } + +inline std::uint32_t CMPDI(std::uint32_t cr, std::uint32_t reg, + std::uint32_t imm) { + return CMPI(cr, 1, reg, imm); +} +inline std::uint32_t CMPDI(std::uint32_t reg, std::uint32_t imm) { + return CMPDI(cr0, reg, imm); +} +inline std::uint32_t CMPWI(std::uint32_t cr, std::uint32_t reg, + std::uint32_t imm) { + return CMPI(cr, 0, reg, imm); +} +inline std::uint32_t CMPWI(std::uint32_t reg, std::uint32_t imm) { + return CMPWI(cr0, reg, imm); +} +inline std::uint32_t CMPLDI(std::uint32_t cr, std::uint32_t reg, + std::uint32_t imm) { + return CMPLI(cr, 1, reg, imm); +} +inline std::uint32_t CMPLDI(std::uint32_t reg, std::uint32_t imm) { + return CMPLDI(cr0, reg, imm); +} +inline std::uint32_t CMPLWI(std::uint32_t cr, std::uint32_t reg, + std::uint32_t imm) { + return CMPLI(cr, 0, reg, imm); +} +inline std::uint32_t CMPLWI(std::uint32_t reg, std::uint32_t imm) { + return CMPLWI(cr0, reg, imm); +} + +inline std::uint32_t EXTRDI(std::uint32_t x, std::uint32_t y, std::uint32_t n, + std::uint32_t b) { + return RLDICL(x, y, b + n, 64 - b, false); +} +inline std::uint32_t SRDI(std::uint32_t x, std::uint32_t y, std::uint32_t n) { + return RLDICL(x, y, 64 - n, n, false); +} +inline std::uint32_t CLRLDI(std::uint32_t x, std::uint32_t y, std::uint32_t n) { + return RLDICL(x, y, 0, n, false); +} +inline std::uint32_t CLRRDI(std::uint32_t x, std::uint32_t y, std::uint32_t n) { + return RLDICR(x, y, 0, 63 - n, false); +} + +inline constexpr std::uint32_t TRAP() { return 0x7FE00008; } // tw 31,r0,r0 +} // namespace implicts +} // namespace rx::cell::ppu diff --git a/rpcsx/cpu/cell/ppu/include/rx/cpu/cell/ppu/Instruction.hpp b/rpcsx/cpu/cell/ppu/include/rx/cpu/cell/ppu/Instruction.hpp new file mode 100644 index 
000000000..918aac79d --- /dev/null +++ b/rpcsx/cpu/cell/ppu/include/rx/cpu/cell/ppu/Instruction.hpp @@ -0,0 +1,72 @@ +#pragma once +#include +#include + +namespace rx::cell::ppu { +union Instruction { + template + using bf = BitField; + + std::uint32_t raw; + + bf main; // 0..5 + BitFieldPack, bf> + sh64; // 30 + 16..20 + BitFieldPack, bf> + mbe64; // 26 + 21..25 + bf vuimm; // 11..15 + bf vs; // 6..10 + bf vsh; // 22..25 + bf oe; // 21 + bf spr; // 11..20 + bf vc; // 21..25 + bf vb; // 16..20 + bf va; // 11..15 + bf vd; // 6..10 + bf lk; // 31 + bf aa; // 30 + bf rb; // 16..20 + bf ra; // 11..15 + bf rd; // 6..10 + bf uimm16; // 16..31 + bf l11; // 11 + bf rs; // 6..10 + bf simm16; // 16..31, signed + bf ds; // 16..29, signed + bf vsimm; // 11..15, signed + bf ll; // 6..31, signed + bf li; // 6..29, signed + bf lev; // 20..26 + bf i; // 16..19 + bf crfs; // 11..13 + bf l10; // 10 + bf crfd; // 6..8 + bf crbb; // 16..20 + bf crba; // 11..15 + bf crbd; // 6..10 + bf rc; // 31 + bf me32; // 26..30 + bf mb32; // 21..25 + bf sh32; // 16..20 + bf bi; // 11..15 + bf bo; // 6..10 + bf bh; // 19..20 + bf frc; // 21..25 + bf frb; // 16..20 + bf fra; // 11..15 + bf frd; // 6..10 + bf crm; // 12..19 + bf frs; // 6..10 + bf flm; // 7..14 + bf l6; // 6 + bf l15; // 15 + + BitFieldPack, BitFieldFixed> + bt14; + + BitFieldPack, BitFieldFixed> + bt24; +}; + +static_assert(sizeof(Instruction) == sizeof(std::uint32_t)); +} // namespace rx::cell::ppu diff --git a/rpcsx/cpu/cell/ppu/include/rx/cpu/cell/ppu/Opcode.hpp b/rpcsx/cpu/cell/ppu/include/rx/cpu/cell/ppu/Opcode.hpp new file mode 100644 index 000000000..1d573fcb5 --- /dev/null +++ b/rpcsx/cpu/cell/ppu/include/rx/cpu/cell/ppu/Opcode.hpp @@ -0,0 +1,858 @@ +#pragma once + +namespace rx::cell::ppu { +enum class Opcode { + Invalid, + + MFVSCR, + MTVSCR, + VADDCUW, + VADDFP, + VADDSBS, + VADDSHS, + VADDSWS, + VADDUBM, + VADDUBS, + VADDUHM, + VADDUHS, + VADDUWM, + VADDUWS, + VAND, + VANDC, + VAVGSB, + VAVGSH, + VAVGSW, + VAVGUB, + VAVGUH, + VAVGUW, + VCFSX, + VCFUX, + VCMPBFP, + VCMPBFP_, + VCMPEQFP, + VCMPEQFP_, + VCMPEQUB, + VCMPEQUB_, + VCMPEQUH, + VCMPEQUH_, + VCMPEQUW, + VCMPEQUW_, + VCMPGEFP, + VCMPGEFP_, + VCMPGTFP, + VCMPGTFP_, + VCMPGTSB, + VCMPGTSB_, + VCMPGTSH, + VCMPGTSH_, + VCMPGTSW, + VCMPGTSW_, + VCMPGTUB, + VCMPGTUB_, + VCMPGTUH, + VCMPGTUH_, + VCMPGTUW, + VCMPGTUW_, + VCTSXS, + VCTUXS, + VEXPTEFP, + VLOGEFP, + VMADDFP, + VMAXFP, + VMAXSB, + VMAXSH, + VMAXSW, + VMAXUB, + VMAXUH, + VMAXUW, + VMHADDSHS, + VMHRADDSHS, + VMINFP, + VMINSB, + VMINSH, + VMINSW, + VMINUB, + VMINUH, + VMINUW, + VMLADDUHM, + VMRGHB, + VMRGHH, + VMRGHW, + VMRGLB, + VMRGLH, + VMRGLW, + VMSUMMBM, + VMSUMSHM, + VMSUMSHS, + VMSUMUBM, + VMSUMUHM, + VMSUMUHS, + VMULESB, + VMULESH, + VMULEUB, + VMULEUH, + VMULOSB, + VMULOSH, + VMULOUB, + VMULOUH, + VNMSUBFP, + VNOR, + VOR, + VPERM, + VPKPX, + VPKSHSS, + VPKSHUS, + VPKSWSS, + VPKSWUS, + VPKUHUM, + VPKUHUS, + VPKUWUM, + VPKUWUS, + VREFP, + VRFIM, + VRFIN, + VRFIP, + VRFIZ, + VRLB, + VRLH, + VRLW, + VRSQRTEFP, + VSEL, + VSL, + VSLB, + VSLDOI, + VSLH, + VSLO, + VSLW, + VSPLTB, + VSPLTH, + VSPLTISB, + VSPLTISH, + VSPLTISW, + VSPLTW, + VSR, + VSRAB, + VSRAH, + VSRAW, + VSRB, + VSRH, + VSRO, + VSRW, + VSUBCUW, + VSUBFP, + VSUBSBS, + VSUBSHS, + VSUBSWS, + VSUBUBM, + VSUBUBS, + VSUBUHM, + VSUBUHS, + VSUBUWM, + VSUBUWS, + VSUMSWS, + VSUM2SWS, + VSUM4SBS, + VSUM4SHS, + VSUM4UBS, + VUPKHPX, + VUPKHSB, + VUPKHSH, + VUPKLPX, + VUPKLSB, + VUPKLSH, + VXOR, + TDI, + TWI, + MULLI, + SUBFIC, + CMPLI, + CMPI, + ADDIC, + ADDI, + ADDIS, + BC, + SC, + 
B, + MCRF, + BCLR, + RFID, + CRNOR, + RFSCV, + CRANDC, + ISYNC, + CRXOR, + CRNAND, + CRAND, + HRFID, + CREQV, + URFID, + STOP, + CRORC, + CROR, + BCCTR, + RLWIMI, + RLWINM, + RLWNM, + ORI, + ORIS, + XORI, + XORIS, + ANDI, + ANDIS, + RLDICL, + RLDICR, + RLDIC, + RLDIMI, + RLDCL, + RLDCR, + CMP, + TW, + LVSL, + LVEBX, + SUBFC, + MULHDU, + ADDC, + MULHWU, + MFOCRF, + LWARX, + LDX, + LWZX, + SLW, + CNTLZW, + SLD, + AND, + CMPL, + LVSR, + LVEHX, + SUBF, + LDUX, + DCBST, + LWZUX, + CNTLZD, + ANDC, + TD, + LVEWX, + MULHD, + MULHW, + LDARX, + DCBF, + LBZX, + LVX, + NEG, + LBZUX, + NOR, + STVEBX, + SUBFE, + ADDE, + MTOCRF, + STDX, + STWCX, + STWX, + STVEHX, + STDUX, + STWUX, + STVEWX, + SUBFZE, + ADDZE, + STDCX, + STBX, + STVX, + MULLD, + SUBFME, + ADDME, + MULLW, + DCBTST, + STBUX, + ADD, + DCBT, + LHZX, + EQV, + ECIWX, + LHZUX, + XOR, + MFSPR, + LWAX, + DST, + LHAX, + LVXL, + MFTB, + LWAUX, + DSTST, + LHAUX, + STHX, + ORC, + ECOWX, + STHUX, + OR, + DIVDU, + DIVWU, + MTSPR, + DCBI, + NAND, + STVXL, + DIVD, + DIVW, + LVLX, + LDBRX, + LSWX, + LWBRX, + LFSX, + SRW, + SRD, + LVRX, + LSWI, + LFSUX, + SYNC, + LFDX, + LFDUX, + STVLX, + STDBRX, + STSWX, + STWBRX, + STFSX, + STVRX, + STFSUX, + STSWI, + STFDX, + STFDUX, + LVLXL, + LHBRX, + SRAW, + SRAD, + LVRXL, + DSS, + SRAWI, + SRADI, + EIEIO, + STVLXL, + STHBRX, + EXTSH, + STVRXL, + EXTSB, + STFIWX, + EXTSW, + ICBI, + DCBZ, + LWZ, + LWZU, + LBZ, + LBZU, + STW, + STWU, + STB, + STBU, + LHZ, + LHZU, + LHA, + LHAU, + STH, + STHU, + LMW, + STMW, + LFS, + LFSU, + LFD, + LFDU, + STFS, + STFSU, + STFD, + STFDU, + LD, + LDU, + LWA, + STD, + STDU, + FDIVS, + FSUBS, + FADDS, + FSQRTS, + FRES, + FMULS, + FMADDS, + FMSUBS, + FNMSUBS, + FNMADDS, + MTFSB1, + MCRFS, + MTFSB0, + MTFSFI, + MFFS, + MTFSF, + FCMPU, + FRSP, + FCTIW, + FCTIWZ, + FDIV, + FSUB, + FADD, + FSQRT, + FSEL, + FMUL, + FRSQRTE, + FMSUB, + FMADD, + FNMSUB, + FNMADD, + FCMPO, + FNEG, + FMR, + FNABS, + FABS, + FCTID, + FCTIDZ, + FCFID, + UNK, + SUBFCO, + ADDCO, + SUBFO, + NEGO, + SUBFEO, + ADDEO, + SUBFZEO, + ADDZEO, + SUBFMEO, + MULLDO, + ADDMEO, + MULLWO, + ADDO, + DIVDUO, + DIVWUO, + DIVDO, + DIVWO, + SUBFCO_, + ADDCO_, + SUBFO_, + NEGO_, + SUBFEO_, + ADDEO_, + SUBFZEO_, + ADDZEO_, + SUBFMEO_, + MULLDO_, + ADDMEO_, + MULLWO_, + ADDO_, + DIVDUO_, + DIVWUO_, + DIVDO_, + DIVWO_, + RLWIMI_, + RLWINM_, + RLWNM_, + RLDICL_, + RLDICR_, + RLDIC_, + RLDIMI_, + RLDCL_, + RLDCR_, + SUBFC_, + MULHDU_, + ADDC_, + MULHWU_, + SLW_, + CNTLZW_, + SLD_, + AND_, + SUBF_, + CNTLZD_, + ANDC_, + MULHD_, + MULHW_, + NEG_, + NOR_, + SUBFE_, + ADDE_, + SUBFZE_, + ADDZE_, + MULLD_, + SUBFME_, + ADDME_, + MULLW_, + ADD_, + EQV_, + XOR_, + ORC_, + OR_, + DIVDU_, + DIVWU_, + NAND_, + DIVD_, + DIVW_, + SRW_, + SRD_, + SRAW_, + SRAD_, + SRAWI_, + SRADI_, + EXTSH_, + EXTSB_, + EXTSW_, + FDIVS_, + FSUBS_, + FADDS_, + FSQRTS_, + FRES_, + FMULS_, + FMADDS_, + FMSUBS_, + FNMSUBS_, + FNMADDS_, + MTFSB1_, + MTFSB0_, + MTFSFI_, + MFFS_, + MTFSF_, + FRSP_, + FCTIW_, + FCTIWZ_, + FDIV_, + FSUB_, + FADD_, + FSQRT_, + FSEL_, + FMUL_, + FRSQRTE_, + FMSUB_, + FMADD_, + FNMSUB_, + FNMADD_, + FNEG_, + FMR_, + FNABS_, + FABS_, + FCTID_, + FCTIDZ_, + FCFID_, + + // extended mnemonic + LI, + LIS, + NOP, + MR, + + CLRLDI, + ROTLDI, + SRDI, + + CMPD, + CMPW, + + CMPLD, + CMPLW, + + NOT, + + MTCRF, + MFXER, + MFLR, + MFCTR, + + MFTBU, + + CCTPL, + CCTPM, + CCTPH, + DB8CYC, + DB10CYC, + DB12CYC, + DB16CYC, + + CRNOT, + + BDNZF, + BDZF, + BDNZT, + BDZT, + BDZ, + BDZ_P, + BDZ_M, + BDNZ, + BDNZ_P, + BDNZ_M, + BGE, + BGE_P, + BGE_M, + BLE, + BLE_P, + BLE_M, 
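+  // Naming convention for the extended branch mnemonics in this block: a trailing
+  // L adds the link bit (LK), A selects the absolute (AA) form, LR / CTR denote the
+  // BCLR / BCCTR variants, and the _P / _M suffixes appear to map to the PowerPC
+  // "+" / "-" static branch-prediction hints (predicted taken / not taken).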
+ BNE, + BNE_P, + BNE_M, + BNS, + BNS_P, + BNS_M, + BLT, + BLT_P, + BLT_M, + BGT, + BGT_P, + BGT_M, + BEQ, + BEQ_P, + BEQ_M, + BSO, + BSO_P, + BSO_M, + + BDNZFL, + BDZFL, + BDNZTL, + BDZTL, + BDZL, + BDZL_P, + BDZL_M, + BDNZL, + BDNZL_P, + BDNZL_M, + BGEL, + BGEL_P, + BGEL_M, + BLEL, + BLEL_P, + BLEL_M, + BNEL, + BNEL_P, + BNEL_M, + BNSL, + BNSL_P, + BNSL_M, + BLTL, + BLTL_P, + BLTL_M, + BGTL, + BGTL_P, + BGTL_M, + BEQL, + BEQL_P, + BEQL_M, + BSOL, + BSOL_P, + BSOL_M, + + BDNZFA, + BDZFA, + BDNZTA, + BDZTA, + BDZA, + BDZA_P, + BDZA_M, + BDNZA, + BDNZA_P, + BDNZA_M, + BGEA, + BGEA_P, + BGEA_M, + BLEA, + BLEA_P, + BLEA_M, + BNEA, + BNEA_P, + BNEA_M, + BNSA, + BNSA_P, + BNSA_M, + BLTA, + BLTA_P, + BLTA_M, + BGTA, + BGTA_P, + BGTA_M, + BEQA, + BEQA_P, + BEQA_M, + BSOA, + BSOA_P, + BSOA_M, + + BDNZFLA, + BDZFLA, + BDNZTLA, + BDZTLA, + BDZLA, + BDZLA_P, + BDZLA_M, + BDNZLA, + BDNZLA_P, + BDNZLA_M, + BGELA, + BGELA_P, + BGELA_M, + BLELA, + BLELA_P, + BLELA_M, + BNELA, + BNELA_P, + BNELA_M, + BNSLA, + BNSLA_P, + BNSLA_M, + BLTLA, + BLTLA_P, + BLTLA_M, + BGTLA, + BGTLA_P, + BGTLA_M, + BEQLA, + BEQLA_P, + BEQLA_M, + BSOLA, + BSOLA_P, + BSOLA_M, + + BDNZFLR, + BDZFLR, + BDNZTLR, + BDZTLR, + BDZLR, + BDZLR_P, + BDZLR_M, + BDNZLR, + BDNZLR_P, + BDNZLR_M, + BGELR, + BGELR_P, + BGELR_M, + BLELR, + BLELR_P, + BLELR_M, + BNELR, + BNELR_P, + BNELR_M, + BNSLR, + BNSLR_P, + BNSLR_M, + BLTLR, + BLTLR_P, + BLTLR_M, + BGTLR, + BGTLR_P, + BGTLR_M, + BEQLR, + BEQLR_P, + BEQLR_M, + BSOLR, + BSOLR_P, + BSOLR_M, + + BDNZFCTR, + BDZFCTR, + BDNZTCTR, + BDZTCTR, + BDZCTR, + BDZCTR_P, + BDZCTR_M, + BDNZCTR, + BDNZCTR_P, + BDNZCTR_M, + BGECTR, + BGECTR_P, + BGECTR_M, + BLECTR, + BLECTR_P, + BLECTR_M, + BNECTR, + BNECTR_P, + BNECTR_M, + BNSCTR, + BNSCTR_P, + BNSCTR_M, + BLTCTR, + BLTCTR_P, + BLTCTR_M, + BGTCTR, + BGTCTR_P, + BGTCTR_M, + BEQCTR, + BEQCTR_P, + BEQCTR_M, + BSOCTR, + BSOCTR_P, + BSOCTR_M, + + BDNZFCTRL, + BDZFCTRL, + BDNZTCTRL, + BDZTCTRL, + BDZCTRL, + BDZCTRL_P, + BDZCTRL_M, + BDNZCTRL, + BDNZCTRL_P, + BDNZCTRL_M, + BGECTRL, + BGECTRL_P, + BGECTRL_M, + BLECTRL, + BLECTRL_P, + BLECTRL_M, + BNECTRL, + BNECTRL_P, + BNECTRL_M, + BNSCTRL, + BNSCTRL_P, + BNSCTRL_M, + BLTCTRL, + BLTCTRL_P, + BLTCTRL_M, + BGTCTRL, + BGTCTRL_P, + BGTCTRL_M, + BEQCTRL, + BEQCTRL_P, + BEQCTRL_M, + BSOCTRL, + BSOCTRL_P, + BSOCTRL_M, + + BDNZFLRL, + BDZFLRL, + BDNZTLRL, + BDZTLRL, + BDZLRL, + BDZLRL_P, + BDZLRL_M, + BDNZLRL, + BDNZLRL_P, + BDNZLRL_M, + BGELRL, + BGELRL_P, + BGELRL_M, + BLELRL, + BLELRL_P, + BLELRL_M, + BNELRL, + BNELRL_P, + BNELRL_M, + BNSLRL, + BNSLRL_P, + BNSLRL_M, + BLTLRL, + BLTLRL_P, + BLTLRL_M, + BGTLRL, + BGTLRL_P, + BGTLRL_M, + BEQLRL, + BEQLRL_P, + BEQLRL_M, + BSOLRL, + BSOLRL_P, + BSOLRL_M, + + BL, + BA, + BLA, + BCL, + BCA, + BCLA, + BLR, + BTLR, + BFLR, + BCTRL, + BCCTRL, + BTCTRL, + BFCTRL, + + _count +}; +} diff --git a/rpcsx/cpu/cell/ppu/include/rx/cpu/cell/ppu/PPUContext.hpp b/rpcsx/cpu/cell/ppu/include/rx/cpu/cell/ppu/PPUContext.hpp new file mode 100644 index 000000000..da5102ec3 --- /dev/null +++ b/rpcsx/cpu/cell/ppu/include/rx/cpu/cell/ppu/PPUContext.hpp @@ -0,0 +1,129 @@ +#pragma once + +#include "rx/v128.hpp" +#include + +struct alignas(4) CrField { + std::uint8_t bits[4]; + + constexpr void set(bool lt, bool gt, bool eq, bool so) { + bits[0] = lt; + bits[1] = gt; + bits[2] = eq; + bits[3] = so; + } + + template + constexpr void update(const T &lhs, const T &rhs, bool so) { + bits[0] = lhs < rhs; + bits[1] = lhs > rhs; + bits[2] = lhs == rhs; + bits[3] = so; + } + + static constexpr CrField From(bool 
lt, bool gt, bool eq, bool so) { + CrField result; + result.set(lt, gt, eq, so); + return result; + } + + [[nodiscard]] constexpr bool isLt() const { return bits[0] != 0; } + [[nodiscard]] constexpr bool isGt() const { return bits[1] != 0; } + [[nodiscard]] constexpr bool isEq() const { return bits[2] != 0; } + [[nodiscard]] constexpr bool isSo() const { return bits[3] != 0; } +}; + +struct PPUContext { + std::uint64_t gpr[32] = {}; // General-Purpose Registers + double fpr[32] = {}; // Floating Point Registers + rx::v128 vr[32] = {}; // Vector Registers + + union alignas(16) cr_bits { + std::uint8_t bits[32]; + CrField fields[8]; + + std::uint8_t &operator[](std::size_t i) { return bits[i]; } + + // Pack CR bits + [[nodiscard]] std::uint32_t pack() const { + std::uint32_t result{}; + + for (u32 bit : bits) { + result <<= 1; + result |= bit; + } + + return result; + } + + // Unpack CR bits + void unpack(std::uint32_t value) { + for (u8 &b : bits) { + b = !!(value & (1u << 31)); + value <<= 1; + } + } + }; + + cr_bits cr{}; // Condition Registers (unpacked) + + // Floating-Point Status and Control Register (unpacked) + union alignas(16) { + struct { + // TODO + bool _start[16]; + bool fl; // FPCC.FL + bool fg; // FPCC.FG + bool fe; // FPCC.FE + bool fu; // FPCC.FU + bool _end[12]; + }; + + CrField fields[8]; + cr_bits bits; + } fpscr{}; + + std::uint64_t lr{}; // Link Register + std::uint64_t ctr{}; // Counter Register + std::uint32_t vrsave{0xffffffff}; // vr Save Register + std::uint32_t cia{}; // Current Instruction Address + + // Fixed-Point Exception Register (abstract representation) + bool xer_so{}; // Summary Overflow + bool xer_ov{}; // Overflow + bool xer_ca{}; // Carry + std::uint8_t xer_cnt{}; // 0..6 + + /* + Non-Java. A mode control bit that determines whether vector floating-point + operations will be performed in a Java-IEEE-C9X-compliant mode or a + possibly faster non-Java/non-IEEE mode. 0 The Java-IEEE-C9X-compliant mode + is selected. Denormalized values are handled as specified by Java, IEEE, + and C9X standard. 1 The non-Java/non-IEEE-compliant mode is + selected. If an element in a source vector register contains a denormalized + value, the value '0' is used instead. If an instruction causes an underflow + exception, the corresponding element in the target vr is cleared to + '0'. In both cases, the '0' has the same sign as the denormalized or + underflowing value. 
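+     Note on the related jm_mask field below: it is the operand mask consumed by
+     ppu_flush_denormal in semantic/ppu.cpp. With NJ = 1 it is 0x7f80'0000, so any
+     operand whose exponent bits are all zero (a zero or denormal) is flushed to a
+     signed zero; with NJ = 0 it is 0x7fff'ffff, which only matches true zeros and
+     therefore leaves denormal inputs unchanged.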
+ */ + bool nj = true; + + // Sticky saturation bit + rx::v128 sat{}; + + // Optimization: precomputed java-mode mask for handling denormals + std::uint32_t jm_mask = 0x7f80'0000; + + std::uint32_t raddr{0}; // Reservation addr + std::uint64_t rtime{0}; + alignas(64) std::byte rdata[128]{}; // Reservation data + bool use_full_rdata{}; + std::uint32_t res_cached{0}; // Reservation "cached" addresss + std::uint32_t res_notify{0}; + std::uint64_t res_notify_time{0}; + + inline void setOV(bool bit) { + xer_ov = bit; + xer_so |= bit; + } +}; diff --git a/rpcsx/cpu/cell/ppu/semantic/ppu.cpp b/rpcsx/cpu/cell/ppu/semantic/ppu.cpp new file mode 100644 index 000000000..81e6e6436 --- /dev/null +++ b/rpcsx/cpu/cell/ppu/semantic/ppu.cpp @@ -0,0 +1,4230 @@ +#include "Instruction.hpp" +#include "PPUContext.hpp" +#include +#include +#include +#include +#include +#include +#include + +using namespace rx; +using namespace rx::cell::ppu; + +#define EXPORT_SEMANTIC(x) \ + extern "C" { \ + auto ISEL_PPU_##x = x; \ + auto ISEL_PPU_##x##_DEC = x##_DEC; \ + } + +#define SEMANTIC(x) inline x [[gnu::always_inline]] +#define DECODER(x) \ + inline x##_DEC \ + [[gnu::always_inline]] ([[maybe_unused]] PPUContext & context, \ + [[maybe_unused]] Instruction inst) + +template struct add_flags_result_t { + T result; + bool carry; + + add_flags_result_t() = default; + + // Straighforward ADD with flags + add_flags_result_t(T a, T b) : result(a + b), carry(result < a) {} + + // Straighforward ADC with flags + add_flags_result_t(T a, T b, bool c) : add_flags_result_t(a, b) { + add_flags_result_t r(result, c); + result = r.result; + carry |= r.carry; + } +}; + +static add_flags_result_t add64_flags(u64 a, u64 b) { return {a, b}; } + +static add_flags_result_t add64_flags(u64 a, u64 b, bool c) { + return {a, b, c}; +} + +extern "C" { +[[noreturn]] void rpcsx_trap(); +[[noreturn]] void rpcsx_invalid_instruction(); +[[noreturn]] void rpcsx_unimplemented_instruction(); + +void rpcsx_vm_read(std::uint64_t vaddr, void *dest, std::size_t size); +void rpcsx_vm_write(std::uint64_t vaddr, const void *src, std::size_t size); + +std::uint64_t rpcsx_get_tb(); +} + +namespace { +u32 ppu_fres_mantissas[128] = { + 0x007f0000, 0x007d0800, 0x007b1800, 0x00793000, 0x00775000, 0x00757000, + 0x0073a000, 0x0071e000, 0x00700000, 0x006e4000, 0x006ca000, 0x006ae000, + 0x00694000, 0x00678000, 0x00660000, 0x00646000, 0x0062c000, 0x00614000, + 0x005fc000, 0x005e4000, 0x005cc000, 0x005b4000, 0x0059c000, 0x00584000, + 0x00570000, 0x00558000, 0x00540000, 0x0052c000, 0x00518000, 0x00500000, + 0x004ec000, 0x004d8000, 0x004c0000, 0x004b0000, 0x00498000, 0x00488000, + 0x00474000, 0x00460000, 0x0044c000, 0x00438000, 0x00428000, 0x00418000, + 0x00400000, 0x003f0000, 0x003e0000, 0x003d0000, 0x003bc000, 0x003ac000, + 0x00398000, 0x00388000, 0x00378000, 0x00368000, 0x00358000, 0x00348000, + 0x00338000, 0x00328000, 0x00318000, 0x00308000, 0x002f8000, 0x002ec000, + 0x002e0000, 0x002d0000, 0x002c0000, 0x002b0000, 0x002a0000, 0x00298000, + 0x00288000, 0x00278000, 0x0026c000, 0x00260000, 0x00250000, 0x00244000, + 0x00238000, 0x00228000, 0x00220000, 0x00210000, 0x00200000, 0x001f8000, + 0x001e8000, 0x001e0000, 0x001d0000, 0x001c8000, 0x001b8000, 0x001b0000, + 0x001a0000, 0x00198000, 0x00190000, 0x00180000, 0x00178000, 0x00168000, + 0x00160000, 0x00158000, 0x00148000, 0x00140000, 0x00138000, 0x00128000, + 0x00120000, 0x00118000, 0x00108000, 0x00100000, 0x000f8000, 0x000f0000, + 0x000e0000, 0x000d8000, 0x000d0000, 0x000c8000, 0x000b8000, 0x000b0000, + 0x000a8000, 
0x000a0000, 0x00098000, 0x00090000, 0x00080000, 0x00078000, + 0x00070000, 0x00068000, 0x00060000, 0x00058000, 0x00050000, 0x00048000, + 0x00040000, 0x00038000, 0x00030000, 0x00028000, 0x00020000, 0x00018000, + 0x00010000, 0x00000000, +}; + +u32 ppu_frsqrte_mantissas[16] = { + 0x000f1000u, 0x000d8000u, 0x000c0000u, 0x000a8000u, + 0x00098000u, 0x00088000u, 0x00080000u, 0x00070000u, + 0x00060000u, 0x0004c000u, 0x0003c000u, 0x00030000u, + 0x00020000u, 0x00018000u, 0x00010000u, 0x00008000u, +}; + +// Large lookup table for FRSQRTE instruction +struct ppu_frsqrte_lut_t { + // Store only high 32 bits of doubles + u32 data[0x8000]{}; + + constexpr ppu_frsqrte_lut_t() noexcept { + for (u64 i = 0; i < 0x8000; i++) { + // Decomposed LUT index + const u64 sign = i >> 14; + const u64 expv = (i >> 3) & 0x7ff; + + // (0x3FF - (((EXP_BITS(b) - 0x3FF) >> 1) + 1)) << 52 + const u64 exp = 0x3fe0'0000 - (((expv + 0x1c01) >> 1) << (52 - 32)); + + if (expv == 0) // ±INF on zero/denormal, not accurate + { + data[i] = static_cast(0x7ff0'0000 | (sign << 31)); + } else if (expv == 0x7ff) { + if (i == (0x7ff << 3)) + data[i] = 0; // Zero on +INF, inaccurate + else + data[i] = 0x7ff8'0000; // QNaN + } else if (sign) { + data[i] = 0x7ff8'0000; // QNaN + } else { + // ((MAN_BITS(b) >> 49) & 7ull) + (!(EXP_BITS(b) & 1) << 3) + const u64 idx = 8 ^ (i & 0xf); + + data[i] = static_cast(ppu_frsqrte_mantissas[idx] | exp); + } + } + } +} inline ppu_frqrte_lut; +} // namespace + +namespace vm { +namespace detail { +template struct vm_type_selector { + using type = be_t; +}; +template struct vm_type_selector> { + using type = le_t; +}; +template struct vm_type_selector> { + using type = be_t; +}; + +template + requires(sizeof(T) == 1) +struct vm_type_selector { + using type = T; +}; +} // namespace detail + +template T read(std::uint64_t vaddr) { + typename detail::vm_type_selector::type result; + rpcsx_vm_read(vaddr, &result, sizeof(result)); + return T(result); +} + +template void write(std::uint64_t vaddr, const T &data) { + typename detail::vm_type_selector::type value = data; + rpcsx_vm_write(vaddr, &value, sizeof(value)); +} + +std::uint64_t cast(std::uint64_t address) { return address; } +} // namespace vm + +extern void ppu_execute_syscall(PPUContext &context, u64 code); +extern u32 ppu_lwarx(PPUContext &context, u32 addr); +extern u64 ppu_ldarx(PPUContext &context, u32 addr); +extern bool ppu_stwcx(PPUContext &context, u32 addr, u32 reg_value); +extern bool ppu_stdcx(PPUContext &context, u32 addr, u64 reg_value); +extern void ppu_trap(PPUContext &context, u64 addr); + +void do_cell_atomic_128_store(u32 addr, const void *to_write); + +// NaNs production precedence: NaN from Va, Vb, Vc +// and lastly the result of the operation in case none of the operands is a NaN +// Signaling NaNs are 'quieted' (MSB of fraction is set) with other bits of data +// remain the same +inline v128 ppu_select_vnan(v128 a) { return a; } + +inline v128 ppu_select_vnan(v128 a, v128 b) { + return gv_selectfs(gv_eqfs(a, a), b, a | gv_bcst32(0x7fc00000u)); +} + +inline v128 ppu_select_vnan(v128 a, v128 b, Vector128 auto... args) { + return ppu_select_vnan(a, ppu_select_vnan(b, args...)); +} + +// Flush denormals to zero if NJ is 1 +inline v128 ppu_flush_denormal(const v128 &mask, const v128 &a) { + return gv_andn(gv_shr32(gv_eq32(mask & a, gv_bcst32(0)), 1), a); +} + +inline v128 ppu_fix_vnan(v128 r) { + return gv_selectfs(gv_eqfs(r, r), r, gv_bcst32(0x7fc00000u)); +} + +inline v128 ppu_set_vnan(v128 r, Vector128 auto... 
args) { + return ppu_select_vnan(args..., ppu_fix_vnan(r)); +} + +template auto ppu_feed_data(PPUContext &, u64 addr) { + static_assert(sizeof(T) <= 128, + "Incompatible type-size, break down into smaller loads"); + + return vm::read(addr); +} + +constexpr u64 ppu_rotate_mask(u32 mb, u32 me) { + const u64 mask = ~0ull << (~(me - mb) & 63); + return (mask >> (mb & 63)) | (mask << ((64 - mb) & 63)); +} +inline u64 dup32(u32 x) { return x | static_cast(x) << 32; } + +void SEMANTIC(MFVSCR)(v128 &d, v128 sat, bool nj) { + u32 sat_bit = !gv_testz(sat); + d._u64[0] = 0; + d._u64[1] = u64(sat_bit | (u32{nj} << 16)) << 32; +} +void DECODER(MFVSCR) { MFVSCR(context.vr[inst.vd], context.sat, context.nj); } +EXPORT_SEMANTIC(MFVSCR); + +void SEMANTIC(MTVSCR)(v128 &sat, bool &nj, u32 &jm_mask, v128 b) { + const u32 vscr = b._u32[3]; + sat._u = vscr & 1; + jm_mask = (vscr & 0x10000) ? 0x7f80'0000 : 0x7fff'ffff; + nj = (vscr & 0x10000) != 0; +} +void DECODER(MTVSCR) { + MTVSCR(context.sat, context.nj, context.jm_mask, context.vr[inst.vb]); +} +EXPORT_SEMANTIC(MTVSCR); + +void SEMANTIC(VADDCUW)(v128 &d, v128 a, v128 b) { + d = gv_sub32(gv_geu32(gv_not32(a), b), gv_bcst32(-1)); +} +void DECODER(VADDCUW) { + VADDCUW(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VADDCUW); + +void SEMANTIC(VADDFP)(v128 &d, v128 a, v128 b, u32 jm_mask) { + auto m = gv_bcst32(jm_mask); + a = ppu_flush_denormal(m, a); + b = ppu_flush_denormal(m, b); + d = ppu_flush_denormal(m, ppu_set_vnan(gv_addfs(a, b), a, b)); +} +void DECODER(VADDFP) { + VADDFP(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.jm_mask); +} +EXPORT_SEMANTIC(VADDFP); + +void SEMANTIC(VADDSBS)(v128 &d, v128 a, v128 b, v128 &sat) { + auto r = gv_adds_s8(a, b); + sat = gv_or32(gv_xor32(gv_add8(a, b), r), sat); + d = r; +} +void DECODER(VADDSBS) { + VADDSBS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VADDSBS); + +void SEMANTIC(VADDSHS)(v128 &d, v128 a, v128 b, v128 &sat) { + auto r = gv_adds_s16(a, b); + sat = gv_or32(gv_xor32(gv_add16(a, b), r), sat); + d = r; +} +void DECODER(VADDSHS) { + VADDSHS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VADDSHS); + +void SEMANTIC(VADDSWS)(v128 &d, v128 a, v128 b, v128 &sat) { + auto r = gv_adds_s32(a, b); + sat = gv_or32(gv_xor32(gv_add32(a, b), r), sat); + d = r; +} +void DECODER(VADDSWS) { + VADDSWS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VADDSWS); + +void SEMANTIC(VADDUBM)(v128 &d, v128 a, v128 b) { d = gv_add8(a, b); } +void DECODER(VADDUBM) { + VADDUBM(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VADDUBM); + +void SEMANTIC(VADDUBS)(v128 &d, v128 a, v128 b, v128 &sat) { + auto r = gv_addus_u8(a, b); + sat = gv_or32(gv_xor32(gv_add8(a, b), r), sat); + d = r; +} +void DECODER(VADDUBS) { + VADDUBS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VADDUBS); + +void SEMANTIC(VADDUHM)(v128 &d, v128 a, v128 b) { d = gv_add16(a, b); } +void DECODER(VADDUHM) { + VADDUHM(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VADDUHM); + +void SEMANTIC(VADDUHS)(v128 &d, v128 a, v128 b, v128 &sat) { + auto r = gv_addus_u16(a, b); + sat = gv_or32(gv_xor32(gv_add16(a, b), r), sat); + d = r; +} +void DECODER(VADDUHS) { + VADDUHS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + 
context.sat); +} +EXPORT_SEMANTIC(VADDUHS); + +void SEMANTIC(VADDUWM)(v128 &d, v128 a, v128 b) { d = gv_add32(a, b); } +void DECODER(VADDUWM) { + VADDUWM(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VADDUWM); + +void SEMANTIC(VADDUWS)(v128 &d, v128 a, v128 b, v128 &sat) { + auto r = gv_addus_u32(a, b); + sat = gv_or32(gv_xor32(gv_add32(a, b), r), sat); + d = r; +} +void DECODER(VADDUWS) { + VADDUWS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VADDUWS); + +void SEMANTIC(VAND)(v128 &d, v128 a, v128 b) { d = gv_andfs(a, b); } +void DECODER(VAND) { + VAND(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VAND); + +void SEMANTIC(VANDC)(v128 &d, v128 a, v128 b) { d = gv_andnfs(b, a); } +void DECODER(VANDC) { + VANDC(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VANDC); + +void SEMANTIC(VAVGSB)(v128 &d, v128 a, v128 b) { d = gv_avgs8(a, b); } +void DECODER(VAVGSB) { + VAVGSB(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VAVGSB); + +void SEMANTIC(VAVGSH)(v128 &d, v128 a, v128 b) { d = gv_avgs16(a, b); } +void DECODER(VAVGSH) { + VAVGSH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VAVGSH); + +void SEMANTIC(VAVGSW)(v128 &d, v128 a, v128 b) { d = gv_avgs32(a, b); } +void DECODER(VAVGSW) { + VAVGSW(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VAVGSW); + +void SEMANTIC(VAVGUB)(v128 &d, v128 a, v128 b) { d = gv_avgu8(a, b); } +void DECODER(VAVGUB) { + VAVGUB(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VAVGUB); + +void SEMANTIC(VAVGUH)(v128 &d, v128 a, v128 b) { d = gv_avgu16(a, b); } +void DECODER(VAVGUH) { + VAVGUH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VAVGUH); + +void SEMANTIC(VAVGUW)(v128 &d, v128 &a, v128 &b) { d = gv_avgu32(a, b); } +void DECODER(VAVGUW) { + VAVGUW(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VAVGUW); + +void SEMANTIC(VCFSX)(v128 &d, v128 b, u32 i) { + d = gv_subus_u16(gv_cvts32_tofs(b), gv_bcst32(i)); +} +void DECODER(VCFSX) { + VCFSX(context.vr[inst.vd], context.vr[inst.vb], inst.vuimm << 23); +} +EXPORT_SEMANTIC(VCFSX); + +void SEMANTIC(VCFUX)(v128 &d, v128 b, u32 i) { + d = gv_subus_u16(gv_cvtu32_tofs(b), gv_bcst32(i)); +} +void DECODER(VCFUX) { + VCFUX(context.vr[inst.vd], context.vr[inst.vb], inst.vuimm << 23); +} +EXPORT_SEMANTIC(VCFUX); + +void SEMANTIC(VCMPBFP)(CrField *cr6, v128 &d, v128 a, v128 b) { + auto sign = gv_bcstfs(-0.); + auto cmp1 = gv_nlefs(a, b); + auto cmp2 = gv_ngefs(a, b ^ sign); + auto r = (cmp1 & sign) | gv_shr32(cmp2 & sign, 1); + if (cr6 != nullptr) { + cr6->set(false, false, gv_testz(r), false); + } + d = r; +} +void DECODER(VCMPBFP) { + VCMPBFP(inst.oe ? context.cr.fields + 6 : nullptr, context.vr[inst.vd], + context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VCMPBFP); + +void SEMANTIC(VCMPEQFP)(CrField *cr6, v128 &d, v128 a, v128 b) { + auto r = gv_eqfs(a, b); + if (cr6 != nullptr) { + cr6->set(gv_testall1(r), false, gv_testall0(r), false); + } + d = r; +} +void DECODER(VCMPEQFP) { + VCMPEQFP(inst.oe ? 
context.cr.fields + 6 : nullptr, context.vr[inst.vd], + context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VCMPEQFP); + +void SEMANTIC(VCMPEQUB)(CrField *cr6, v128 &d, v128 a, v128 b) { + auto r = gv_eq8(a, b); + if (cr6 != nullptr) { + cr6->set(gv_testall1(r), false, gv_testall0(r), false); + } + d = r; +} +void DECODER(VCMPEQUB) { + VCMPEQUB(inst.oe ? context.cr.fields + 6 : nullptr, context.vr[inst.vd], + context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VCMPEQUB); + +void SEMANTIC(VCMPEQUH)(CrField *cr6, v128 &d, v128 a, v128 b) { + auto r = gv_eq16(a, b); + if (cr6 != nullptr) { + cr6->set(gv_testall1(r), false, gv_testall0(r), false); + } + d = r; +} +void DECODER(VCMPEQUH) { + VCMPEQUH(inst.oe ? context.cr.fields + 6 : nullptr, context.vr[inst.vd], + context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VCMPEQUH); + +void SEMANTIC(VCMPEQUW)(CrField *cr6, v128 &d, v128 a, v128 b) { + auto r = gv_eq32(a, b); + if (cr6 != nullptr) { + cr6->set(gv_testall1(r), false, gv_testall0(r), false); + } + d = r; +} +void DECODER(VCMPEQUW) { + VCMPEQUW(inst.oe ? context.cr.fields + 6 : nullptr, context.vr[inst.vd], + context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VCMPEQUW); + +void SEMANTIC(VCMPGEFP)(CrField *cr6, v128 &d, v128 a, v128 b) { + auto r = gv_gefs(a, b); + if (cr6 != nullptr) { + cr6->set(gv_testall1(r), false, gv_testall0(r), false); + } + d = r; +} +void DECODER(VCMPGEFP) { + VCMPGEFP(inst.oe ? context.cr.fields + 6 : nullptr, context.vr[inst.vd], + context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VCMPGEFP); + +void SEMANTIC(VCMPGTFP)(CrField *cr6, v128 &d, v128 a, v128 b) { + auto r = gv_gtfs(a, b); + if (cr6 != nullptr) { + cr6->set(gv_testall1(r), false, gv_testall0(r), false); + } + d = r; +} +void DECODER(VCMPGTFP) { + VCMPGTFP(inst.oe ? context.cr.fields + 6 : nullptr, context.vr[inst.vd], + context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VCMPGTFP); + +void SEMANTIC(VCMPGTSB)(CrField *cr6, v128 &d, v128 a, v128 b) { + auto r = gv_gts8(a, b); + if (cr6 != nullptr) { + cr6->set(gv_testall1(r), false, gv_testall0(r), false); + } + d = r; +} +void DECODER(VCMPGTSB) { + VCMPGTSB(inst.oe ? context.cr.fields + 6 : nullptr, context.vr[inst.vd], + context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VCMPGTSB); + +void SEMANTIC(VCMPGTSH)(CrField *cr6, v128 &d, v128 a, v128 b) { + auto r = gv_gts16(a, b); + if (cr6 != nullptr) { + cr6->set(gv_testall1(r), false, gv_testall0(r), false); + } + d = r; +} +void DECODER(VCMPGTSH) { + VCMPGTSH(inst.oe ? context.cr.fields + 6 : nullptr, context.vr[inst.vd], + context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VCMPGTSH); + +void SEMANTIC(VCMPGTSW)(CrField *cr6, v128 &d, v128 a, v128 b) { + auto r = gv_gts32(a, b); + if (cr6 != nullptr) { + cr6->set(gv_testall1(r), false, gv_testall0(r), false); + } + d = r; +} +void DECODER(VCMPGTSW) { + VCMPGTSW(inst.oe ? context.cr.fields + 6 : nullptr, context.vr[inst.vd], + context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VCMPGTSW); + +void SEMANTIC(VCMPGTUB)(CrField *cr6, v128 &d, v128 a, v128 b) { + auto r = gv_gtu8(a, b); + if (cr6 != nullptr) { + cr6->set(gv_testall1(r), false, gv_testall0(r), false); + } + d = r; +} +void DECODER(VCMPGTUB) { + VCMPGTUB(inst.oe ? 
context.cr.fields + 6 : nullptr, context.vr[inst.vd], + context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VCMPGTUB); + +void SEMANTIC(VCMPGTUH)(CrField *cr6, v128 &d, v128 a, v128 b) { + auto r = gv_gtu16(a, b); + if (cr6 != nullptr) { + cr6->set(gv_testall1(r), false, gv_testall0(r), false); + } + d = r; +} +void DECODER(VCMPGTUH) { + VCMPGTUH(inst.oe ? context.cr.fields + 6 : nullptr, context.vr[inst.vd], + context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VCMPGTUH); + +void SEMANTIC(VCMPGTUW)(CrField *cr6, v128 &d, v128 a, v128 b) { + auto r = gv_gtu32(a, b); + if (cr6 != nullptr) { + cr6->set(gv_testall1(r), false, gv_testall0(r), false); + } + d = r; +} +void DECODER(VCMPGTUW) { + VCMPGTUW(inst.oe ? context.cr.fields + 6 : nullptr, context.vr[inst.vd], + context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VCMPGTUW); + +void SEMANTIC(VCTSXS)(v128 &d, v128 b, v128 &sat, u32 i) { + auto r = gv_mulfs(b, gv_bcst32(i)); + auto l = gv_ltfs(r, gv_bcstfs(-2147483648.)); + auto h = gv_gefs(r, gv_bcstfs(2147483648.)); +#if !defined(ARCH_X64) && !defined(ARCH_ARM64) + r = gv_selectfs(l, gv_bcstfs(-2147483648.), r); +#endif + r = gv_cvtfs_tos32(r); +#if !defined(ARCH_ARM64) + r = gv_select32(h, gv_bcst32(0x7fffffff), r); +#endif + r = gv_and32(r, gv_eqfs(b, b)); + sat = gv_or32(gv_or32(l, h), sat); + d = r; +} +void DECODER(VCTSXS) { + VCTSXS(context.vr[inst.vd], context.vr[inst.vb], context.sat, + (inst.vuimm + 127) << 23); +} +EXPORT_SEMANTIC(VCTSXS); + +void SEMANTIC(VCTUXS)(v128 &d, v128 b, v128 &sat, u32 i) { + auto r = gv_mulfs(b, gv_bcst32(i)); + auto l = gv_ltfs(r, gv_bcstfs(0.)); + auto h = gv_gefs(r, gv_bcstfs(4294967296.)); + r = gv_cvtfs_tou32(r); +#if !defined(ARCH_ARM64) + r = gv_andn32(l, r); // saturate to zero +#endif +#if !defined(__AVX512VL__) && !defined(ARCH_ARM64) + r = gv_or32(r, h); // saturate to 0xffffffff +#endif + r = gv_and32(r, gv_eqfs(b, b)); + + sat = gv_or32(gv_or32(l, h), sat); + d = r; +} +void DECODER(VCTUXS) { + VCTUXS(context.vr[inst.vd], context.vr[inst.vb], context.sat, + (inst.vuimm + 127) << 23); +} +EXPORT_SEMANTIC(VCTUXS); + +void SEMANTIC(VEXPTEFP)(v128 &d, v128 b) { + // for (u32 i = 0; i < 4; i++) d._f[i] = std::exp2f(b._f[i]); + d = ppu_set_vnan(gv_exp2_approxfs(b)); +} +void DECODER(VEXPTEFP) { VEXPTEFP(context.vr[inst.vd], context.vr[inst.vb]); } +EXPORT_SEMANTIC(VEXPTEFP); + +void SEMANTIC(VLOGEFP)(v128 &d, v128 b) { + // for (u32 i = 0; i < 4; i++) d._f[i] = std::log2f(b._f[i]); + d = ppu_set_vnan(gv_log2_approxfs(b)); +} +void DECODER(VLOGEFP) { VLOGEFP(context.vr[inst.vd], context.vr[inst.vb]); } +EXPORT_SEMANTIC(VLOGEFP); + +void SEMANTIC(VMADDFP)(v128 &d, v128 a_, v128 b_, v128 c_, u32 jm_mask) { + auto m = gv_bcst32(jm_mask); + auto a = ppu_flush_denormal(m, a_); + auto b = ppu_flush_denormal(m, b_); + auto c = ppu_flush_denormal(m, c_); + d = ppu_flush_denormal(m, ppu_set_vnan(gv_fmafs(a, c, b))); +} +void DECODER(VMADDFP) { + VMADDFP(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.vr[inst.vc], context.jm_mask); +} +EXPORT_SEMANTIC(VMADDFP); + +void SEMANTIC(VMAXFP)(v128 &d, v128 a, v128 b, u32 jm_mask) { + d = ppu_flush_denormal(gv_bcst32(jm_mask), + ppu_set_vnan(gv_maxfs(a, b), a, b)); +} +void DECODER(VMAXFP) { + VMAXFP(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.jm_mask); +} +EXPORT_SEMANTIC(VMAXFP); + +void SEMANTIC(VMAXSB)(v128 &d, v128 a, v128 b) { d = gv_maxs8(a, b); } +void DECODER(VMAXSB) { + VMAXSB(context.vr[inst.vd], context.vr[inst.va], 
context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMAXSB); + +void SEMANTIC(VMAXSH)(v128 &d, v128 a, v128 b) { d = gv_maxs16(a, b); } +void DECODER(VMAXSH) { + VMAXSH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMAXSH); + +void SEMANTIC(VMAXSW)(v128 &d, v128 a, v128 b) { d = gv_maxs32(a, b); } +void DECODER(VMAXSW) { + VMAXSW(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMAXSW); + +void SEMANTIC(VMAXUB)(v128 &d, v128 a, v128 b) { d = gv_maxu8(a, b); } +void DECODER(VMAXUB) { + VMAXUB(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMAXUB); + +void SEMANTIC(VMAXUH)(v128 &d, v128 a, v128 b) { d = gv_maxu16(a, b); } +void DECODER(VMAXUH) { + VMAXUH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMAXUH); + +void SEMANTIC(VMAXUW)(v128 &d, v128 a, v128 b) { d = gv_maxu32(a, b); } +void DECODER(VMAXUW) { + VMAXUW(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMAXUW); + +void SEMANTIC(VMHADDSHS)(v128 &d, v128 a, v128 b, v128 c, v128 &sat) { + auto m = gv_muls_hds16(a, b); + auto f = gv_gts16(gv_bcst16(0), c); + auto x = gv_eq16(gv_maxs16(a, b), gv_bcst16(0x8000)); + auto r = gv_sub16(gv_adds_s16(m, c), gv_and32(x, f)); + auto s = gv_add16(m, c); + + sat = gv_or32(gv_or32(gv_andn32(f, x), gv_andn32(x, gv_xor32(s, r))), sat); + d = r; +} +void DECODER(VMHADDSHS) { + VMHADDSHS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.vr[inst.vc], context.sat); +} +EXPORT_SEMANTIC(VMHADDSHS); + +void SEMANTIC(VMHRADDSHS)(v128 &d, v128 a, v128 b, v128 c, v128 &sat) { + auto m = gv_rmuls_hds16(a, b); + auto f = gv_gts16(gv_bcst16(0), c); + auto x = gv_eq16(gv_maxs16(a, b), gv_bcst16(0x8000)); + auto r = gv_sub16(gv_adds_s16(m, c), gv_and32(x, f)); + auto s = gv_add16(m, c); + sat = gv_or32(gv_or32(gv_andn32(f, x), gv_andn32(x, gv_xor32(s, r))), sat); + d = r; +} +void DECODER(VMHRADDSHS) { + VMHRADDSHS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.vr[inst.vc], context.sat); +} +EXPORT_SEMANTIC(VMHRADDSHS); + +void SEMANTIC(VMINFP)(v128 &d, v128 a, v128 b, u32 jm_mask) { + d = ppu_flush_denormal(gv_bcst32(jm_mask), + ppu_set_vnan(gv_minfs(a, b), a, b)); +} +void DECODER(VMINFP) { + VMINFP(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.jm_mask); +} +EXPORT_SEMANTIC(VMINFP); + +void SEMANTIC(VMINSB)(v128 &d, v128 a, v128 b) { d = gv_mins8(a, b); } +void DECODER(VMINSB) { + VMINSB(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMINSB); + +void SEMANTIC(VMINSH)(v128 &d, v128 a, v128 b) { d = gv_mins16(a, b); } +void DECODER(VMINSH) { + VMINSH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMINSH); + +void SEMANTIC(VMINSW)(v128 &d, v128 a, v128 b) { d = gv_mins32(a, b); } +void DECODER(VMINSW) { + VMINSW(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMINSW); + +void SEMANTIC(VMINUB)(v128 &d, v128 a, v128 b) { d = gv_minu8(a, b); } +void DECODER(VMINUB) { + VMINUB(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMINUB); + +void SEMANTIC(VMINUH)(v128 &d, v128 a, v128 b) { d = gv_minu16(a, b); } +void DECODER(VMINUH) { + VMINUH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMINUH); + +void SEMANTIC(VMINUW)(v128 &d, v128 a, v128 b) { d = gv_minu32(a, b); } +void DECODER(VMINUW) { + 
  VMINUW(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]);
+}
+EXPORT_SEMANTIC(VMINUW);
+
+void SEMANTIC(VMLADDUHM)(v128 &d, v128 a, v128 b, v128 c) {
+  d = gv_muladd16(a, b, c);
+}
+void DECODER(VMLADDUHM) {
+  VMLADDUHM(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb],
+            context.vr[inst.vc]);
+}
+EXPORT_SEMANTIC(VMLADDUHM);
+
+void SEMANTIC(VMRGHB)(v128 &d, v128 a, v128 b) { d = gv_unpackhi8(b, a); }
+void DECODER(VMRGHB) {
+  VMRGHB(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]);
+}
+EXPORT_SEMANTIC(VMRGHB);
+
+void SEMANTIC(VMRGHH)(v128 &d, v128 a, v128 &b) { d = gv_unpackhi16(b, a); }
+void DECODER(VMRGHH) {
+  VMRGHH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]);
+}
+EXPORT_SEMANTIC(VMRGHH);
+
+void SEMANTIC(VMRGHW)(v128 &d, v128 a, v128 b) { d = gv_unpackhi32(b, a); }
+void DECODER(VMRGHW) {
+  VMRGHW(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]);
+}
+EXPORT_SEMANTIC(VMRGHW);
+
+void SEMANTIC(VMRGLB)(v128 &d, v128 a, v128 b) { d = gv_unpacklo8(b, a); }
+void DECODER(VMRGLB) {
+  VMRGLB(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]);
+}
+EXPORT_SEMANTIC(VMRGLB);
+
+void SEMANTIC(VMRGLH)(v128 &d, v128 a, v128 b) { d = gv_unpacklo16(b, a); }
+void DECODER(VMRGLH) {
+  VMRGLH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]);
+}
+EXPORT_SEMANTIC(VMRGLH);
+
+void SEMANTIC(VMRGLW)(v128 &d, v128 a, v128 b) { d = gv_unpacklo32(b, a); }
+void DECODER(VMRGLW) {
+  VMRGLW(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]);
+}
+EXPORT_SEMANTIC(VMRGLW);
+
+void SEMANTIC(VMSUMMBM)(v128 &d, v128 a, v128 b, v128 c) {
+  d = gv_dotu8s8x4(b, a, c);
+}
+void DECODER(VMSUMMBM) {
+  VMSUMMBM(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb],
+           context.vr[inst.vc]);
+}
+EXPORT_SEMANTIC(VMSUMMBM);
+
+void SEMANTIC(VMSUMSHM)(v128 &d, v128 a, v128 b, v128 c) {
+  d = gv_dots16x2(a, b, c);
+}
+void DECODER(VMSUMSHM) {
+  VMSUMSHM(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb],
+           context.vr[inst.vc]);
+}
+EXPORT_SEMANTIC(VMSUMSHM);
+
+void SEMANTIC(VMSUMSHS)(v128 &d, v128 a, v128 b, v128 c, v128 &sat) {
+  auto r = gv_dots_s16x2(a, b, c);
+  auto s = gv_dots16x2(a, b, c);
+  sat = gv_or32(gv_xor32(s, r), sat);
+  d = r;
+}
+void DECODER(VMSUMSHS) {
+  VMSUMSHS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb],
+           context.vr[inst.vc], context.sat);
+}
+EXPORT_SEMANTIC(VMSUMSHS);
+
+void SEMANTIC(VMSUMUBM)(v128 &d, v128 a, v128 b, v128 c) {
+  d = gv_dotu8x4(a, b, c);
+}
+void DECODER(VMSUMUBM) {
+  VMSUMUBM(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb],
+           context.vr[inst.vc]);
+}
+EXPORT_SEMANTIC(VMSUMUBM);
+
+void SEMANTIC(VMSUMUHM)(v128 &d, v128 a, v128 b, v128 c) {
+  d = gv_add32(c, gv_dotu16x2(a, b));
+}
+void DECODER(VMSUMUHM) {
+  VMSUMUHM(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb],
+           context.vr[inst.vc]);
+}
+EXPORT_SEMANTIC(VMSUMUHM);
+
+void SEMANTIC(VMSUMUHS)(v128 &d, v128 a, v128 b, v128 c, v128 &sat) {
+  auto m1 = gv_mul_even_u16(a, b);
+  auto m2 = gv_mul_odds_u16(a, b);
+  auto s1 = gv_add32(m1, m2);
+  auto x1 = gv_gtu32(m1, s1);
+  auto s2 = gv_or32(gv_add32(s1, c), x1);
+  auto x2 = gv_gtu32(s1, s2);
+  sat = gv_or32(gv_or32(x1, x2), sat);
+  d = gv_or32(s2, x2);
+}
+void DECODER(VMSUMUHS) {
+  VMSUMUHS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb],
+           context.vr[inst.vc], context.sat);
+}
+EXPORT_SEMANTIC(VMSUMUHS);
+
+void SEMANTIC(VMULESB)(v128 &d, v128 a, v128 b) {
+  d = gv_mul16(gv_sar16(a, 8), gv_sar16(b, 8));
+}
+void DECODER(VMULESB) { + VMULESB(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMULESB); + +void SEMANTIC(VMULESH)(v128 &d, v128 a, v128 b) { d = gv_mul_odds_s16(a, b); } +void DECODER(VMULESH) { + VMULESH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMULESH); + +void SEMANTIC(VMULEUB)(v128 &d, v128 a, v128 b) { + d = gv_mul16(gv_shr16(a, 8), gv_shr16(b, 8)); +} +void DECODER(VMULEUB) { + VMULEUB(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMULEUB); + +void SEMANTIC(VMULEUH)(v128 &d, v128 a, v128 b) { d = gv_mul_odds_u16(a, b); } +void DECODER(VMULEUH) { + VMULEUH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMULEUH); + +void SEMANTIC(VMULOSB)(v128 &d, v128 a, v128 b) { + d = gv_mul16(gv_sar16(gv_shl16(a, 8), 8), gv_sar16(gv_shl16(b, 8), 8)); +} +void DECODER(VMULOSB) { + VMULOSB(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMULOSB); + +void SEMANTIC(VMULOSH)(v128 &d, v128 a, v128 b) { d = gv_mul_even_s16(a, b); } +void DECODER(VMULOSH) { + VMULOSH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMULOSH); + +void SEMANTIC(VMULOUB)(v128 &d, v128 a, v128 b) { + auto mask = gv_bcst16(0x00ff); + d = gv_mul16(gv_and32(a, mask), gv_and32(b, mask)); +} +void DECODER(VMULOUB) { + VMULOUB(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMULOUB); + +void SEMANTIC(VMULOUH)(v128 &d, v128 a, v128 b) { d = gv_mul_even_u16(a, b); } +void DECODER(VMULOUH) { + VMULOUH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VMULOUH); + +void SEMANTIC(VNMSUBFP)(v128 &d, v128 a_, v128 b_, v128 c_, u32 jm_mask) { + // An odd case with (FLT_MIN, FLT_MIN, FLT_MIN) produces FLT_MIN instead of + // 0 + auto s = gv_bcstfs(-0.0f); + auto m = gv_bcst32(jm_mask); + auto a = ppu_flush_denormal(m, a_); + auto b = ppu_flush_denormal(m, b_); + auto c = ppu_flush_denormal(m, c_); + auto r = gv_xorfs(s, gv_fmafs(a, c, gv_xorfs(b, s))); + d = ppu_flush_denormal(m, ppu_set_vnan(r)); +} +void DECODER(VNMSUBFP) { + VNMSUBFP(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.vr[inst.vc], context.jm_mask); +} +EXPORT_SEMANTIC(VNMSUBFP); + +void SEMANTIC(VNOR)(v128 &d, v128 a, v128 b) { d = gv_notfs(gv_orfs(a, b)); } +void DECODER(VNOR) { + VNOR(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VNOR); + +void SEMANTIC(VOR)(v128 &d, v128 a, v128 b) { d = gv_orfs(a, b); } +void DECODER(VOR) { + VOR(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VOR); + +void SEMANTIC(VPERM)(v128 &d, v128 a, v128 b, v128 c) { +#if defined(ARCH_ARM64) + uint8x16x2_t ab; + ab.val[0] = b; + ab.val[1] = a; + d = vqtbl2q_u8(ab, vbicq_u8(vdupq_n_u8(0x1f), c)); +#else + u8 ab[32]; + std::memcpy(ab + 0, &b, 16); + std::memcpy(ab + 16, &a, 16); + + for (u32 i = 0; i < 16; i++) { + d._u8[i] = ab[~c._u8[i] & 0x1f]; + } +#endif +} +void DECODER(VPERM) { + VPERM(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.vr[inst.vc]); +} +EXPORT_SEMANTIC(VPERM); + +void SEMANTIC(VPKPX)(v128 &d, v128 a, v128 b) { + auto a1 = gv_sar32(gv_shl32(a, 7), 7 + 9); + auto b1 = gv_sar32(gv_shl32(b, 7), 7 + 9); + auto a2 = gv_sar32(gv_shl32(a, 16), 16 + 3); + auto b2 = gv_sar32(gv_shl32(b, 16), 16 + 3); + auto p1 = gv_packss_s32(b1, a1); + auto p2 = gv_packss_s32(b2, a2); + d = 
gv_or32(gv_or32(gv_and32(p1, gv_bcst16(0xfc00)),
+                      gv_shl16(gv_and32(p1, gv_bcst16(0x7c)), 3)),
+              gv_and32(p2, gv_bcst16(0x1f)));
+}
+void DECODER(VPKPX) {
+  VPKPX(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]);
+}
+EXPORT_SEMANTIC(VPKPX);
+
+void SEMANTIC(VPKSHSS)(v128 &d, v128 a, v128 b, v128 &sat) {
+  sat = gv_or32(
+      gv_shr16(gv_add16(a, gv_bcst16(0x80)) | gv_add16(b, gv_bcst16(0x80)), 8),
+      sat);
+  d = gv_packss_s16(b, a);
+}
+void DECODER(VPKSHSS) {
+  VPKSHSS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb],
+          context.sat);
+}
+EXPORT_SEMANTIC(VPKSHSS);
+
+void SEMANTIC(VPKSHUS)(v128 &d, v128 a, v128 b, v128 &sat) {
+  sat = gv_or32(gv_shr16(a | b, 8), sat);
+  d = gv_packus_s16(b, a);
+}
+void DECODER(VPKSHUS) {
+  VPKSHUS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb],
+          context.sat);
+}
+EXPORT_SEMANTIC(VPKSHUS);
+
+void SEMANTIC(VPKSWSS)(v128 &d, v128 a, v128 b, v128 &sat) {
+  sat = gv_or32(
+      gv_shr32(gv_add32(a, gv_bcst32(0x8000)) | gv_add32(b, gv_bcst32(0x8000)),
+               16),
+      sat);
+  d = gv_packss_s32(b, a);
+}
+void DECODER(VPKSWSS) {
+  VPKSWSS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb],
+          context.sat);
+}
+EXPORT_SEMANTIC(VPKSWSS);
+
+void SEMANTIC(VPKSWUS)(v128 &d, v128 a, v128 b, v128 &sat) {
+  sat = gv_or32(gv_shr32(a | b, 16), sat);
+  d = gv_packus_s32(b, a);
+}
+void DECODER(VPKSWUS) {
+  VPKSWUS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb],
+          context.sat);
+}
+EXPORT_SEMANTIC(VPKSWUS);
+
+void SEMANTIC(VPKUHUM)(v128 &d, v128 a, v128 b) { d = gv_packtu16(b, a); }
+void DECODER(VPKUHUM) {
+  VPKUHUM(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]);
+}
+EXPORT_SEMANTIC(VPKUHUM);
+
+void SEMANTIC(VPKUHUS)(v128 &d, v128 a, v128 b, v128 &sat) {
+  sat = gv_or32(gv_shr16(a | b, 8), sat);
+  d = gv_packus_u16(b, a);
+}
+void DECODER(VPKUHUS) {
+  VPKUHUS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb],
+          context.sat);
+}
+EXPORT_SEMANTIC(VPKUHUS);
+
+void SEMANTIC(VPKUWUM)(v128 &d, v128 a, v128 b) { d = gv_packtu32(b, a); }
+void DECODER(VPKUWUM) {
+  VPKUWUM(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]);
+}
+EXPORT_SEMANTIC(VPKUWUM);
+
+void SEMANTIC(VPKUWUS)(v128 &d, v128 a, v128 b, v128 &sat) {
+  sat = gv_or32(gv_shr32(a | b, 16), sat);
+  d = gv_packus_u32(b, a);
+}
+void DECODER(VPKUWUS) {
+  VPKUWUS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb],
+          context.sat);
+}
+EXPORT_SEMANTIC(VPKUWUS);
+
+void SEMANTIC(VREFP)(v128 &d, v128 b_, u32 jm_mask) {
+  auto m = gv_bcst32(jm_mask);
+  auto b = ppu_flush_denormal(m, b_);
+  d = ppu_flush_denormal(m, ppu_set_vnan(gv_divfs(gv_bcstfs(1.0f), b), b));
+}
+void DECODER(VREFP) {
+  VREFP(context.vr[inst.vd], context.vr[inst.vb], context.jm_mask);
+}
+EXPORT_SEMANTIC(VREFP);
+
+void SEMANTIC(VRFIM)(v128 &d, v128 b_, u32 jm_mask) {
+  auto m = gv_bcst32(jm_mask);
+  auto b = ppu_flush_denormal(m, b_);
+  d = ppu_flush_denormal(m, ppu_set_vnan(gv_roundfs_floor(b), b));
+}
+void DECODER(VRFIM) {
+  VRFIM(context.vr[inst.vd], context.vr[inst.vb], context.jm_mask);
+}
+EXPORT_SEMANTIC(VRFIM);
+
+void SEMANTIC(VRFIN)(v128 &d, v128 b, u32 jm_mask) {
+  auto m = gv_bcst32(jm_mask);
+  d = ppu_flush_denormal(m, ppu_set_vnan(gv_roundfs_even(b), b));
+}
+void DECODER(VRFIN) {
+  VRFIN(context.vr[inst.vd], context.vr[inst.vb], context.jm_mask);
+}
+EXPORT_SEMANTIC(VRFIN);
+
+void SEMANTIC(VRFIP)(v128 &d, v128 b_, u32 jm_mask) {
+  auto m = gv_bcst32(jm_mask);
+  auto b = ppu_flush_denormal(m, b_);
+  d = ppu_flush_denormal(m,
ppu_set_vnan(gv_roundfs_ceil(b), b)); +} +void DECODER(VRFIP) { + VRFIP(context.vr[inst.vd], context.vr[inst.vb], context.jm_mask); +} +EXPORT_SEMANTIC(VRFIP); + +void SEMANTIC(VRFIZ)(v128 &d, v128 b, u32 jm_mask) { + auto m = gv_bcst32(jm_mask); + d = ppu_flush_denormal(m, ppu_set_vnan(gv_roundfs_trunc(b), b)); +} +void DECODER(VRFIZ) { + VRFIZ(context.vr[inst.vd], context.vr[inst.vb], context.jm_mask); +} +EXPORT_SEMANTIC(VRFIZ); + +void SEMANTIC(VRLB)(v128 &d, v128 a, v128 b) { d = gv_rol8(a, b); } +void DECODER(VRLB) { + VRLB(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VRLB); + +void SEMANTIC(VRLH)(v128 &d, v128 a, v128 b) { d = gv_rol16(a, b); } +void DECODER(VRLH) { + VRLH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VRLH); + +void SEMANTIC(VRLW)(v128 &d, v128 a, v128 b) { d = gv_rol32(a, b); } +void DECODER(VRLW) { + VRLW(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VRLW); + +void SEMANTIC(VRSQRTEFP)(v128 &d, v128 b_, u32 jm_mask) { + auto m = gv_bcst32(jm_mask); + auto b = ppu_flush_denormal(m, b_); + d = ppu_flush_denormal( + m, ppu_set_vnan(gv_divfs(gv_bcstfs(1.0f), gv_sqrtfs(b)), b)); +} +void DECODER(VRSQRTEFP) { + VRSQRTEFP(context.vr[inst.vd], context.vr[inst.vb], context.jm_mask); +} +EXPORT_SEMANTIC(VRSQRTEFP); + +void SEMANTIC(VSEL)(v128 &d, v128 a, v128 b, v128 c) { + auto x = gv_andfs(b, c); + d = gv_orfs(x, gv_andnfs(c, a)); +} +void DECODER(VSEL) { + VSEL(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.vr[inst.vc]); +} +EXPORT_SEMANTIC(VSEL); + +void SEMANTIC(VSL)(v128 &d, v128 a, v128 b) { + d = gv_fshl8(a, gv_shuffle_left<1>(a), b); +} +void DECODER(VSL) { + VSL(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSL); + +void SEMANTIC(VSLB)(v128 &d, v128 a, v128 b) { d = gv_shl8(a, b); } +void DECODER(VSLB) { + VSLB(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSLB); + +template static void VSLDOI_IMPL(v128 &d, v128 a, v128 b) { + d = gv_or32(gv_shuffle_left(a), gv_shuffle_right<16 - Count>(b)); +} +void SEMANTIC(VSLDOI)(v128 &d, v128 a, v128 b, u32 vsh) { + switch (vsh) { + case 0: + VSLDOI_IMPL<0>(d, a, b); + break; + case 1: + VSLDOI_IMPL<1>(d, a, b); + break; + case 2: + VSLDOI_IMPL<2>(d, a, b); + break; + case 3: + VSLDOI_IMPL<3>(d, a, b); + break; + case 4: + VSLDOI_IMPL<4>(d, a, b); + break; + case 5: + VSLDOI_IMPL<5>(d, a, b); + break; + case 6: + VSLDOI_IMPL<6>(d, a, b); + break; + case 7: + VSLDOI_IMPL<7>(d, a, b); + break; + case 8: + VSLDOI_IMPL<8>(d, a, b); + break; + case 9: + VSLDOI_IMPL<9>(d, a, b); + break; + case 10: + VSLDOI_IMPL<10>(d, a, b); + break; + case 11: + VSLDOI_IMPL<11>(d, a, b); + break; + case 12: + VSLDOI_IMPL<12>(d, a, b); + break; + case 13: + VSLDOI_IMPL<13>(d, a, b); + break; + case 14: + VSLDOI_IMPL<14>(d, a, b); + break; + case 15: + VSLDOI_IMPL<15>(d, a, b); + break; + } +} +void DECODER(VSLDOI) { + VSLDOI(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + inst.vsh); +} +EXPORT_SEMANTIC(VSLDOI); + +void SEMANTIC(VSLH)(v128 &d, v128 a, v128 b) { d = gv_shl16(a, b); } +void DECODER(VSLH) { + VSLH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSLH); + +void SEMANTIC(VSLO)(v128 &d, v128 a, v128 b) { + d._u = a._u << (b._u8[0] & 0x78); +} +void DECODER(VSLO) { + VSLO(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSLO); + +void 
SEMANTIC(VSLW)(v128 &d, v128 a, v128 b) { d = gv_shl32(a, b); } +void DECODER(VSLW) { + VSLW(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSLW); + +void SEMANTIC(VSPLTB)(v128 &d, v128 b, std::uint32_t imm) { + d = gv_bcst8(b.u8r[imm & 15]); +} +void DECODER(VSPLTB) { + VSPLTB(context.vr[inst.vd], context.vr[inst.vb], inst.vuimm); +} +EXPORT_SEMANTIC(VSPLTB); + +void SEMANTIC(VSPLTH)(v128 &d, v128 b, std::uint32_t imm) { + d = gv_bcst16(b.u16r[imm & 7]); +} +void DECODER(VSPLTH) { + VSPLTH(context.vr[inst.vd], context.vr[inst.vb], inst.vuimm); +} +EXPORT_SEMANTIC(VSPLTH); + +void SEMANTIC(VSPLTISB)(v128 &d, std::int32_t imm) { d = gv_bcst8(imm); } +void DECODER(VSPLTISB) { VSPLTISB(context.vr[inst.vd], inst.vsimm); } +EXPORT_SEMANTIC(VSPLTISB); + +void SEMANTIC(VSPLTISH)(v128 &d, std::int32_t imm) { d = gv_bcst16(imm); } +void DECODER(VSPLTISH) { VSPLTISH(context.vr[inst.vd], inst.vsimm); } +EXPORT_SEMANTIC(VSPLTISH); + +void SEMANTIC(VSPLTISW)(v128 &d, std::int32_t imm) { d = gv_bcst32(imm); } +void DECODER(VSPLTISW) { VSPLTISW(context.vr[inst.vd], inst.vsimm); } +EXPORT_SEMANTIC(VSPLTISW); + +void SEMANTIC(VSPLTW)(v128 &d, v128 b, u32 imm) { + d = gv_bcst32(b.u32r[imm & 3]); +} +void DECODER(VSPLTW) { + VSPLTW(context.vr[inst.vd], context.vr[inst.vb], inst.vuimm); +} +EXPORT_SEMANTIC(VSPLTW); + +void SEMANTIC(VSR)(v128 &d, v128 a, v128 b) { + d = gv_fshr8(gv_shuffle_right<1>(a), a, b); +} +void DECODER(VSR) { + VSR(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSR); + +void SEMANTIC(VSRAB)(v128 &d, v128 a, v128 b) { d = gv_sar8(a, b); } +void DECODER(VSRAB) { + VSRAB(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSRAB); + +void SEMANTIC(VSRAH)(v128 &d, v128 a, v128 b) { d = gv_sar16(a, b); } +void DECODER(VSRAH) { + VSRAH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSRAH); + +void SEMANTIC(VSRAW)(v128 &d, v128 a, v128 b) { d = gv_sar32(a, b); } +void DECODER(VSRAW) { + VSRAW(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSRAW); + +void SEMANTIC(VSRB)(v128 &d, v128 a, v128 b) { d = gv_shr8(a, b); } +void DECODER(VSRB) { + VSRB(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSRB); + +void SEMANTIC(VSRH)(v128 &d, v128 a, v128 b) { d = gv_shr16(a, b); } +void DECODER(VSRH) { + VSRH(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSRH); + +void SEMANTIC(VSRO)(v128 &d, v128 a, v128 b) { + d._u = a._u >> (b._u8[0] & 0x78); +} +void DECODER(VSRO) { + VSRO(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSRO); + +void SEMANTIC(VSRW)(v128 &d, v128 a, v128 b) { d = gv_shr32(a, b); } +void DECODER(VSRW) { + VSRW(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSRW); + +void SEMANTIC(VSUBCUW)(v128 &d, v128 a, v128 b) { + d = gv_shr32(gv_geu32(a, b), 31); +} +void DECODER(VSUBCUW) { + VSUBCUW(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSUBCUW); + +void SEMANTIC(VSUBFP)(v128 &d, v128 a_, v128 b_, u32 jm_mask) { + auto m = gv_bcst32(jm_mask); + auto a = ppu_flush_denormal(m, a_); + auto b = ppu_flush_denormal(m, b_); + d = ppu_flush_denormal(m, ppu_set_vnan(gv_subfs(a, b), a, b)); +} +void DECODER(VSUBFP) { + VSUBFP(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.jm_mask); +} +EXPORT_SEMANTIC(VSUBFP); + 
+void SEMANTIC(VSUBSBS)(v128 &d, v128 a, v128 b, v128 &sat) { + auto r = gv_subs_s8(a, b); + sat = gv_or32(gv_xor32(gv_sub8(a, b), r), sat); + d = r; +} +void DECODER(VSUBSBS) { + VSUBSBS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VSUBSBS); + +void SEMANTIC(VSUBSHS)(v128 &d, v128 a, v128 b, v128 &sat) { + auto r = gv_subs_s16(a, b); + sat = gv_or32(gv_xor32(gv_sub16(a, b), r), sat); + d = r; +} +void DECODER(VSUBSHS) { + VSUBSHS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VSUBSHS); + +void SEMANTIC(VSUBSWS)(v128 &d, v128 a, v128 b, v128 &sat) { + auto r = gv_subs_s32(a, b); + sat = gv_or32(gv_xor32(gv_sub32(a, b), r), sat); + d = r; +} +void DECODER(VSUBSWS) { + VSUBSWS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VSUBSWS); + +void SEMANTIC(VSUBUBM)(v128 &d, v128 a, v128 b) { d = gv_sub8(a, b); } +void DECODER(VSUBUBM) { + VSUBUBM(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSUBUBM); + +void SEMANTIC(VSUBUBS)(v128 &d, v128 a, v128 b, v128 &sat) { + auto r = gv_subus_u8(a, b); + sat = gv_or32(gv_xor32(gv_sub8(a, b), r), sat); + d = r; +} +void DECODER(VSUBUBS) { + VSUBUBS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VSUBUBS); + +void SEMANTIC(VSUBUHM)(v128 &d, v128 a, v128 b) { d = gv_sub16(a, b); } +void DECODER(VSUBUHM) { + VSUBUHM(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSUBUHM); + +void SEMANTIC(VSUBUHS)(v128 &d, v128 a, v128 b, v128 &sat) { + auto r = gv_subus_u16(a, b); + sat = gv_or32(gv_xor32(gv_sub16(a, b), r), sat); + d = r; +} +void DECODER(VSUBUHS) { + VSUBUHS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VSUBUHS); + +void SEMANTIC(VSUBUWM)(v128 &d, v128 a, v128 b) { d = gv_sub32(a, b); } +void DECODER(VSUBUWM) { + VSUBUWM(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VSUBUWM); + +void SEMANTIC(VSUBUWS)(v128 &d, v128 a, v128 b, v128 &sat) { + auto r = gv_subus_u32(a, b); + sat = gv_or32(gv_xor32(gv_sub32(a, b), r), sat); + d = r; +} +void DECODER(VSUBUWS) { + VSUBUWS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VSUBUWS); + +void SEMANTIC(VSUMSWS)(v128 &d, v128 a, v128 b, v128 &sat) { + s64 sum = s64{b._s32[0]} + a._s32[0] + a._s32[1] + a._s32[2] + a._s32[3]; + if (sum > INT32_MAX) { + sum = u32(INT32_MAX); + sat._bytes[0] = 1; + } else if (sum < INT32_MIN) { + sum = u32(INT32_MIN); + sat._bytes[0] = 1; + } else { + sum = static_cast(sum); + } + + d._u = sum; +} +void DECODER(VSUMSWS) { + VSUMSWS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VSUMSWS); + +void SEMANTIC(VSUM2SWS)(v128 &d, v128 a, v128 b, v128 &sat) { +#if defined(__AVX512VL__) + const auto x = gv_add64(gv_sar64(gv_shl64(a, 32), 32), gv_sar64(a, 32)); + const auto y = gv_add64(x, gv_sar64(gv_shl64(b, 32), 32)); + const auto r = + _mm_unpacklo_epi32(_mm_cvtsepi64_epi32(y), _mm_setzero_si128()); +#elif defined(ARCH_ARM64) + const auto x = + vaddl_s32(vget_low_s32(vuzp1q_s32(a, a)), vget_low_s32(vuzp2q_s32(a, a))); + const auto y = vaddw_s32(x, vget_low_s32(vuzp1q_s32(b, b))); + const auto r = vmovl_u32(uint32x2_t(vqmovn_s64(y))); +#else + v128 y{}; + y._s64[0] = s64{a._s32[0]} + a._s32[1] + b._s32[0]; + y._s64[1] = 
s64{a._s32[2]} + a._s32[3] + b._s32[2]; + v128 r{}; + r._u64[0] = y._s64[0] > INT32_MAX ? INT32_MAX + : y._s64[0] < INT32_MIN ? u32(INT32_MIN) + : static_cast(y._s64[0]); + r._u64[1] = y._s64[1] > INT32_MAX ? INT32_MAX + : y._s64[1] < INT32_MIN ? u32(INT32_MIN) + : static_cast(y._s64[1]); +#endif + sat = gv_or32(gv_shr64(gv_add64(y, gv_bcst64(0x80000000u)), 32), sat); + d = r; +} +void DECODER(VSUM2SWS) { + VSUM2SWS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VSUM2SWS); + +void SEMANTIC(VSUM4SBS)(v128 &d, v128 a, v128 b, v128 &sat) { + auto r = gv_dots_u8s8x4(gv_bcst8(1), a, b); + sat = gv_or32(gv_xor32(gv_hadds8x4(a, b), r), sat); + d = r; +} +void DECODER(VSUM4SBS) { + VSUM4SBS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VSUM4SBS); + +void SEMANTIC(VSUM4SHS)(v128 &d, v128 a, v128 b, v128 &sat) { + auto r = gv_dots_s16x2(a, gv_bcst16(1), b); + sat = gv_or32(gv_xor32(gv_hadds16x2(a, b), r), sat); + d = r; +} +void DECODER(VSUM4SHS) { + VSUM4SHS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VSUM4SHS); + +void SEMANTIC(VSUM4UBS)(v128 &d, v128 a, v128 b, v128 &sat) { + auto x = gv_haddu8x4(a); + auto r = gv_addus_u32(x, b); + sat = gv_or32(gv_xor32(gv_add32(x, b), r), sat); + d = r; +} +void DECODER(VSUM4UBS) { + VSUM4UBS(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb], + context.sat); +} +EXPORT_SEMANTIC(VSUM4UBS); + +void SEMANTIC(VUPKHPX)(v128 &d, v128 b) { + auto x = gv_extend_hi_s16(b); + auto y = gv_or32(gv_and32(gv_shl32(x, 6), gv_bcst32(0x1f0000)), + gv_and32(gv_shl32(x, 3), gv_bcst32(0x1f00))); + d = gv_or32(y, gv_and32(x, gv_bcst32(0xff00001f))); +} +void DECODER(VUPKHPX) { VUPKHPX(context.vr[inst.vd], context.vr[inst.vb]); } +EXPORT_SEMANTIC(VUPKHPX); + +void SEMANTIC(VUPKHSB)(v128 &d, v128 b) { d = gv_extend_hi_s8(b); } +void DECODER(VUPKHSB) { VUPKHSB(context.vr[inst.vd], context.vr[inst.vb]); } +EXPORT_SEMANTIC(VUPKHSB); + +void SEMANTIC(VUPKHSH)(v128 &d, v128 b) { d = gv_extend_hi_s16(b); } +void DECODER(VUPKHSH) { VUPKHSH(context.vr[inst.vd], context.vr[inst.vb]); } +EXPORT_SEMANTIC(VUPKHSH); + +void SEMANTIC(VUPKLPX)(v128 &d, v128 b) { + auto x = gv_extend_lo_s16(b); + auto y = gv_or32(gv_and32(gv_shl32(x, 6), gv_bcst32(0x1f0000)), + gv_and32(gv_shl32(x, 3), gv_bcst32(0x1f00))); + d = gv_or32(y, gv_and32(x, gv_bcst32(0xff00001f))); +} +void DECODER(VUPKLPX) { VUPKLPX(context.vr[inst.vd], context.vr[inst.vb]); } +EXPORT_SEMANTIC(VUPKLPX); + +void SEMANTIC(VUPKLSB)(v128 &d, v128 b) { d = gv_extend_lo_s8(b); } +void DECODER(VUPKLSB) { VUPKLSB(context.vr[inst.vd], context.vr[inst.vb]); } +EXPORT_SEMANTIC(VUPKLSB); + +void SEMANTIC(VUPKLSH)(v128 &d, v128 b) { d = gv_extend_lo_s16(b); } +void DECODER(VUPKLSH) { VUPKLSH(context.vr[inst.vd], context.vr[inst.vb]); } +EXPORT_SEMANTIC(VUPKLSH); + +void SEMANTIC(VXOR)(v128 &d, v128 a, v128 b) { d = gv_xorfs(a, b); } +void DECODER(VXOR) { + VXOR(context.vr[inst.vd], context.vr[inst.va], context.vr[inst.vb]); +} +EXPORT_SEMANTIC(VXOR); + +void SEMANTIC(TDI)(s64 ra, u8 bo, s16 simm16) { + if ((bo & 0x10) && ra < (s64)simm16) { + rpcsx_trap(); + } + + if ((bo & 0x8) && ra > (s64)simm16) { + rpcsx_trap(); + } + + if ((bo & 0x4) && ra == (s64)simm16) { + rpcsx_trap(); + } + + if ((bo & 0x2) && (u64)ra < (u64)simm16) { + rpcsx_trap(); + } + + if ((bo & 0x1) && (u64)ra > (u64)simm16) { + rpcsx_trap(); + } +} +void DECODER(TDI) { TDI(context.gpr[inst.ra], inst.bo, 
inst.simm16); } +EXPORT_SEMANTIC(TDI); + +void SEMANTIC(TWI)(s32 ra, u8 bo, s16 simm16) { + if ((bo & 0x10) && ra < (s32)simm16) { + rpcsx_trap(); + } + + if ((bo & 0x8) && ra > (s32)simm16) { + rpcsx_trap(); + } + + if ((bo & 0x4) && ra == (s32)simm16) { + rpcsx_trap(); + } + + if ((bo & 0x2) && (u32)ra < (u32)simm16) { + rpcsx_trap(); + } + + if ((bo & 0x1) && (u32)ra > (u32)simm16) { + rpcsx_trap(); + } +} + +void DECODER(TWI) { TWI(context.gpr[inst.ra], inst.bo, inst.simm16); } + +EXPORT_SEMANTIC(TWI); + +void SEMANTIC(MULLI)(PPUContext &context, Instruction inst) { + context.gpr[inst.rd] = static_cast(context.gpr[inst.ra]) * inst.simm16; +} +void DECODER(MULLI) { MULLI(context, inst); } +EXPORT_SEMANTIC(MULLI); + +void SEMANTIC(SUBFIC)(PPUContext &context, Instruction inst) { + const u64 a = context.gpr[inst.ra]; + const s64 i = inst.simm16; + const auto r = add64_flags(~a, i, 1); + context.gpr[inst.rd] = r.result; + context.xer_ca = r.carry; +} +void DECODER(SUBFIC) { SUBFIC(context, inst); } +EXPORT_SEMANTIC(SUBFIC); + +void SEMANTIC(CMPLI)(PPUContext &context, Instruction inst) { + if (inst.l10) { + context.cr.fields[inst.crfd].update(context.gpr[inst.ra], inst.uimm16, + context.xer_so); + } else { + context.cr.fields[inst.crfd].update( + static_cast(context.gpr[inst.ra]), inst.uimm16, context.xer_so); + } +} +void DECODER(CMPLI) { CMPLI(context, inst); } +EXPORT_SEMANTIC(CMPLI); + +void SEMANTIC(CMPI)(PPUContext &context, Instruction inst) { + if (inst.l10) { + context.cr.fields[inst.crfd].update(context.gpr[inst.ra], inst.simm16, + context.xer_so); + } else { + context.cr.fields[inst.crfd].update( + static_cast(context.gpr[inst.ra]), inst.simm16, context.xer_so); + } +} +void DECODER(CMPI) { CMPI(context, inst); } +EXPORT_SEMANTIC(CMPI); + +void SEMANTIC(ADDIC)(PPUContext &context, Instruction inst) { + const s64 a = context.gpr[inst.ra]; + const s64 i = inst.simm16; + const auto r = add64_flags(a, i); + context.gpr[inst.rd] = r.result; + context.xer_ca = r.carry; + if (inst.main & 1) [[unlikely]] + context.cr.fields[0].update(r.result, 0, context.xer_so); +} +void DECODER(ADDIC) { ADDIC(context, inst); } +EXPORT_SEMANTIC(ADDIC); + +void SEMANTIC(ADDI)(PPUContext &context, Instruction inst) { + context.gpr[inst.rd] = + inst.ra ? context.gpr[inst.ra] + inst.simm16 : inst.simm16; +} +void DECODER(ADDI) { ADDI(context, inst); } +EXPORT_SEMANTIC(ADDI); + +void SEMANTIC(ADDIS)(PPUContext &context, Instruction inst) { + context.gpr[inst.rd] = inst.ra ? context.gpr[inst.ra] + (inst.simm16 * 65536) + : (inst.simm16 * 65536); +} +void DECODER(ADDIS) { ADDIS(context, inst); } +EXPORT_SEMANTIC(ADDIS); + +void SEMANTIC(BC)(std::uint32_t &cia, std::uint64_t &lr, std::uint64_t &ctr, + u8 bo, u8 crBit, bool lk, std::uint32_t target) { + bool bo0 = (bo & 0x10) != 0; + bool bo1 = (bo & 0x08) != 0; + bool bo2 = (bo & 0x04) != 0; + bool bo3 = (bo & 0x02) != 0; + + ctr -= (bo2 ^ true); + + bool ctr_ok = bo2 | ((ctr != 0) ^ bo3); + bool cond_ok = bo0 | (!!crBit ^ (bo1 ^ true)); + + u32 nextInst = cia + 4; + if (lk) { + lr = nextInst; + } + + if (ctr_ok && cond_ok) { + cia = target; + } else { + cia = nextInst; + } +} +void DECODER(BC) { + BC(context.cia, context.lr, context.ctr, inst.bo, context.cr[inst.bi], + inst.lk, (inst.aa ? 
0 : context.cia) + inst.bt14); +} +EXPORT_SEMANTIC(BC); + +void SEMANTIC(SC)(PPUContext &context, std::uint64_t sysId) { + ppu_execute_syscall(context, sysId); +} +void DECODER(SC) { SC(context, context.gpr[11]); } +EXPORT_SEMANTIC(SC); + +void SEMANTIC(B)(std::uint32_t &cia, std::uint64_t &lr, bool lk, + std::uint32_t target) { + u32 nextInst = cia + 4; + if (lk) { + lr = nextInst; + } + + cia = target; +} +void DECODER(B) { + B(context.cia, context.lr, inst.lk, (inst.aa ? 0 : context.cia) + inst.bt24); +} +EXPORT_SEMANTIC(B); + +void SEMANTIC(MCRF)(PPUContext &context, Instruction inst) { + context.cr.fields[inst.crfd] = context.cr.fields[inst.crfs]; +} +void DECODER(MCRF) { MCRF(context, inst); } +EXPORT_SEMANTIC(MCRF); + +void SEMANTIC(BCLR)(std::uint32_t &cia, std::uint64_t &lr, u64 &ctr, u8 bo, + u8 crBit, bool lk) { + bool bo0 = (bo & 0x10) != 0; + bool bo1 = (bo & 0x08) != 0; + bool bo2 = (bo & 0x04) != 0; + bool bo3 = (bo & 0x02) != 0; + + ctr -= (bo2 ^ true); + + bool ctr_ok = bo2 | ((ctr != 0) ^ bo3); + bool cond_ok = bo0 | (!!crBit ^ (bo1 ^ true)); + + u32 target = static_cast(lr) & ~3; + u32 nextInst = cia + 4; + if (lk) { + lr = nextInst; + } + + if (ctr_ok && cond_ok) { + cia = target; + } else { + cia = nextInst; + } +} +void DECODER(BCLR) { + BCLR(context.cia, context.lr, context.ctr, inst.bo, context.cr[inst.bi], + inst.lk); +} +EXPORT_SEMANTIC(BCLR); + +void SEMANTIC(CRNOR)(PPUContext &context, Instruction inst) { + context.cr[inst.crbd] = + (context.cr[inst.crba] | context.cr[inst.crbb]) ^ true; +} +void DECODER(CRNOR) { CRNOR(context, inst); } +EXPORT_SEMANTIC(CRNOR); + +void SEMANTIC(CRANDC)(PPUContext &context, Instruction inst) { + context.cr[inst.crbd] = + context.cr[inst.crba] & (context.cr[inst.crbb] ^ true); +} +void DECODER(CRANDC) { CRANDC(context, inst); } +EXPORT_SEMANTIC(CRANDC); + +void SEMANTIC(ISYNC)() { std::atomic_thread_fence(std::memory_order::acquire); } +void DECODER(ISYNC) { ISYNC(); } +EXPORT_SEMANTIC(ISYNC); + +void SEMANTIC(CRXOR)(PPUContext &context, Instruction inst) { + context.cr[inst.crbd] = context.cr[inst.crba] ^ context.cr[inst.crbb]; +} +void DECODER(CRXOR) { CRXOR(context, inst); } +EXPORT_SEMANTIC(CRXOR); + +void SEMANTIC(CRNAND)(PPUContext &context, Instruction inst) { + context.cr[inst.crbd] = + (context.cr[inst.crba] & context.cr[inst.crbb]) ^ true; +} +void DECODER(CRNAND) { CRNAND(context, inst); } +EXPORT_SEMANTIC(CRNAND); + +void SEMANTIC(CRAND)(PPUContext &context, Instruction inst) { + context.cr[inst.crbd] = context.cr[inst.crba] & context.cr[inst.crbb]; +} +void DECODER(CRAND) { CRAND(context, inst); } +EXPORT_SEMANTIC(CRAND); + +void SEMANTIC(CREQV)(PPUContext &context, Instruction inst) { + context.cr[inst.crbd] = + (context.cr[inst.crba] ^ context.cr[inst.crbb]) ^ true; +} +void DECODER(CREQV) { CREQV(context, inst); } +EXPORT_SEMANTIC(CREQV); + +void SEMANTIC(CRORC)(PPUContext &context, Instruction inst) { + context.cr[inst.crbd] = + context.cr[inst.crba] | (context.cr[inst.crbb] ^ true); +} +void DECODER(CRORC) { CRORC(context, inst); } +EXPORT_SEMANTIC(CRORC); + +void SEMANTIC(CROR)(PPUContext &context, Instruction inst) { + context.cr[inst.crbd] = context.cr[inst.crba] | context.cr[inst.crbb]; +} +void DECODER(CROR) { CROR(context, inst); } +EXPORT_SEMANTIC(CROR); + +void SEMANTIC(BCCTR)(std::uint32_t &cia, std::uint64_t &lr, std::uint64_t ctr, + u8 bo, u8 crBit, bool lk) { + u32 target = static_cast(ctr) & ~3; + u32 nextInst = cia + 4; + + if (lk) { + lr = nextInst; + } + + if (bo & 0x10 || crBit == ((bo & 0x8) != 
0)) { + cia = target; + } else { + cia = nextInst; + } +} +void DECODER(BCCTR) { + BCCTR(context.cia, context.lr, context.ctr, inst.bo, context.cr[inst.bi], + inst.lk); +} +EXPORT_SEMANTIC(BCCTR); + +void SEMANTIC(RLWIMI)(PPUContext &context, Instruction inst) { + const u64 mask = ppu_rotate_mask(32 + inst.mb32, 32 + inst.me32); + context.gpr[inst.ra] = + (context.gpr[inst.ra] & ~mask) | + (dup32(rol32(static_cast(context.gpr[inst.rs]), inst.sh32)) & mask); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(RLWIMI) { RLWIMI(context, inst); } +EXPORT_SEMANTIC(RLWIMI); + +void SEMANTIC(RLWINM)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = + dup32(rol32(static_cast(context.gpr[inst.rs]), inst.sh32)) & + ppu_rotate_mask(32 + inst.mb32, 32 + inst.me32); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(RLWINM) { RLWINM(context, inst); } +EXPORT_SEMANTIC(RLWINM); + +void SEMANTIC(RLWNM)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = dup32(rol32(static_cast(context.gpr[inst.rs]), + context.gpr[inst.rb] & 0x1f)) & + ppu_rotate_mask(32 + inst.mb32, 32 + inst.me32); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(RLWNM) { RLWNM(context, inst); } +EXPORT_SEMANTIC(RLWNM); + +void SEMANTIC(ORI)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = context.gpr[inst.rs] | inst.uimm16; +} +void DECODER(ORI) { ORI(context, inst); } +EXPORT_SEMANTIC(ORI); + +void SEMANTIC(ORIS)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = context.gpr[inst.rs] | (u64{inst.uimm16} << 16); +} +void DECODER(ORIS) { ORIS(context, inst); } +EXPORT_SEMANTIC(ORIS); + +void SEMANTIC(XORI)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = context.gpr[inst.rs] ^ inst.uimm16; +} +void DECODER(XORI) { XORI(context, inst); } +EXPORT_SEMANTIC(XORI); + +void SEMANTIC(XORIS)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = context.gpr[inst.rs] ^ (u64{inst.uimm16} << 16); +} +void DECODER(XORIS) { XORIS(context, inst); } +EXPORT_SEMANTIC(XORIS); + +void SEMANTIC(ANDI)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = context.gpr[inst.rs] & inst.uimm16; + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); +} +void DECODER(ANDI) { ANDI(context, inst); } +EXPORT_SEMANTIC(ANDI); + +void SEMANTIC(ANDIS)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = context.gpr[inst.rs] & (u64{inst.uimm16} << 16); + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); +} +void DECODER(ANDIS) { ANDIS(context, inst); } +EXPORT_SEMANTIC(ANDIS); + +void SEMANTIC(RLDICL)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = + rol64(context.gpr[inst.rs], inst.sh64) & (~0ull >> inst.mbe64); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(RLDICL) { RLDICL(context, inst); } +EXPORT_SEMANTIC(RLDICL); + +void SEMANTIC(RLDICR)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = + rol64(context.gpr[inst.rs], inst.sh64) & (~0ull << (inst.mbe64 ^ 63)); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(RLDICR) { RLDICR(context, inst); } +EXPORT_SEMANTIC(RLDICR); + +void SEMANTIC(RLDIC)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = rol64(context.gpr[inst.rs], 
inst.sh64) & + ppu_rotate_mask(inst.mbe64, inst.sh64 ^ 63); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(RLDIC) { RLDIC(context, inst); } +EXPORT_SEMANTIC(RLDIC); + +void SEMANTIC(RLDIMI)(PPUContext &context, Instruction inst) { + const u64 mask = ppu_rotate_mask(inst.mbe64, inst.sh64 ^ 63); + context.gpr[inst.ra] = (context.gpr[inst.ra] & ~mask) | + (rol64(context.gpr[inst.rs], inst.sh64) & mask); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(RLDIMI) { RLDIMI(context, inst); } +EXPORT_SEMANTIC(RLDIMI); + +void SEMANTIC(RLDCL)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = + rol64(context.gpr[inst.rs], context.gpr[inst.rb] & 0x3f) & + (~0ull >> inst.mbe64); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(RLDCL) { RLDCL(context, inst); } +EXPORT_SEMANTIC(RLDCL); + +void SEMANTIC(RLDCR)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = + rol64(context.gpr[inst.rs], context.gpr[inst.rb] & 0x3f) & + (~0ull << (inst.mbe64 ^ 63)); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(RLDCR) { RLDCR(context, inst); } +EXPORT_SEMANTIC(RLDCR); + +void SEMANTIC(CMP)(PPUContext &context, Instruction inst) { + if (inst.l10) { + context.cr.fields[inst.crfd].update( + context.gpr[inst.ra], context.gpr[inst.rb], context.xer_so); + } else { + context.cr.fields[inst.crfd].update( + context.gpr[inst.ra], static_cast(context.gpr[inst.rb]), + static_cast(context.xer_so)); + } +} +void DECODER(CMP) { CMP(context, inst); } +EXPORT_SEMANTIC(CMP); + +void SEMANTIC(TW)(s32 ra, u8 bo, s32 rb) { + if ((bo & 0x10) && ra < rb) { + rpcsx_trap(); + } + + if ((bo & 0x8) && ra > rb) { + rpcsx_trap(); + } + + if ((bo & 0x4) && ra == rb) { + rpcsx_trap(); + } + + if ((bo & 0x2) && (u32)ra < (u32)rb) { + rpcsx_trap(); + } + + if ((bo & 0x1) && (u32)ra > (u32)rb) { + rpcsx_trap(); + } +} + +void DECODER(TW) { TW(context.gpr[inst.ra], inst.bo, context.gpr[inst.rb]); } + +EXPORT_SEMANTIC(TW); + +static const v128 s_lvsl_base = + v128::from64r(0x0001020304050607, 0x08090a0b0c0d0e0f); + +static const v128 s_lvsl_consts[16] = { + gv_add8(s_lvsl_base, gv_bcst8(0)), gv_add8(s_lvsl_base, gv_bcst8(1)), + gv_add8(s_lvsl_base, gv_bcst8(2)), gv_add8(s_lvsl_base, gv_bcst8(3)), + gv_add8(s_lvsl_base, gv_bcst8(4)), gv_add8(s_lvsl_base, gv_bcst8(5)), + gv_add8(s_lvsl_base, gv_bcst8(6)), gv_add8(s_lvsl_base, gv_bcst8(7)), + gv_add8(s_lvsl_base, gv_bcst8(8)), gv_add8(s_lvsl_base, gv_bcst8(9)), + gv_add8(s_lvsl_base, gv_bcst8(10)), gv_add8(s_lvsl_base, gv_bcst8(11)), + gv_add8(s_lvsl_base, gv_bcst8(12)), gv_add8(s_lvsl_base, gv_bcst8(13)), + gv_add8(s_lvsl_base, gv_bcst8(14)), gv_add8(s_lvsl_base, gv_bcst8(15)), +}; + +void SEMANTIC(LVSL)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.vr[inst.vd] = s_lvsl_consts[addr % 16]; +} +void DECODER(LVSL) { LVSL(context, inst); } +EXPORT_SEMANTIC(LVSL); + +void SEMANTIC(LVEBX)(PPUContext &context, Instruction inst) { + const u64 addr = (inst.ra ? 
context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]) & + ~0xfull; + context.vr[inst.vd] = ppu_feed_data(context, addr); +} +void DECODER(LVEBX) { LVEBX(context, inst); } +EXPORT_SEMANTIC(LVEBX); + +void SEMANTIC(SUBFC)(PPUContext &context, Instruction inst) { + const u64 RA = context.gpr[inst.ra]; + const u64 RB = context.gpr[inst.rb]; + const auto r = add64_flags(~RA, RB, 1); + context.gpr[inst.rd] = r.result; + context.xer_ca = r.carry; + + if (inst.oe) { + context.setOV((~RA >> 63 == RB >> 63) && + (~RA >> 63 != context.gpr[inst.rd] >> 63)); + } + + if (inst.rc) { + context.cr.fields[0].update(r.result, 0, context.xer_so); + } +} +void DECODER(SUBFC) { SUBFC(context, inst); } +EXPORT_SEMANTIC(SUBFC); + +void SEMANTIC(MULHDU)(PPUContext &context, Instruction inst) { + context.gpr[inst.rd] = umulh64(context.gpr[inst.ra], context.gpr[inst.rb]); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.rd], 0, context.xer_so); + } +} +void DECODER(MULHDU) { MULHDU(context, inst); } +EXPORT_SEMANTIC(MULHDU); + +void SEMANTIC(ADDC)(PPUContext &context, Instruction inst) { + const u64 RA = context.gpr[inst.ra]; + const u64 RB = context.gpr[inst.rb]; + const auto r = add64_flags(RA, RB); + context.gpr[inst.rd] = r.result; + context.xer_ca = r.carry; + + if (inst.oe) { + context.setOV((RA >> 63 == RB >> 63) && + (RA >> 63 != context.gpr[inst.rd] >> 63)); + } + + if (inst.rc) { + context.cr.fields[0].update(r.result, 0, context.xer_so); + } +} +void DECODER(ADDC) { ADDC(context, inst); } +EXPORT_SEMANTIC(ADDC); + +void SEMANTIC(MULHWU)(PPUContext &context, Instruction inst) { + u32 a = static_cast(context.gpr[inst.ra]); + u32 b = static_cast(context.gpr[inst.rb]); + context.gpr[inst.rd] = (u64{a} * b) >> 32; + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.rd], 0, context.xer_so); + } +} +void DECODER(MULHWU) { MULHWU(context, inst); } +EXPORT_SEMANTIC(MULHWU); + +void SEMANTIC(MFCR)(PPUContext &context, std::uint64_t &d) { +#if defined(ARCH_X64) + be_t lane0, lane1; + std::memcpy(&lane0, context.cr.fields, sizeof(v128)); + std::memcpy(&lane1, context.cr.fields + 4, sizeof(v128)); + const u32 mh = _mm_movemask_epi8(_mm_slli_epi64(lane0.value(), 7)); + const u32 ml = _mm_movemask_epi8(_mm_slli_epi64(lane1.value(), 7)); + + d = (mh << 16) | ml; +#else + d = context.cr.pack(); +#endif +} +void DECODER(MFCR) { MFCR(context, context.gpr[inst.rd]); } +EXPORT_SEMANTIC(MFCR); + +void SEMANTIC(MFOCRF)(u64 &d, u32 crIndex, CrField &cr) { + const u32 v = + cr.bits[0] << 3 | cr.bits[1] << 2 | cr.bits[2] << 1 | cr.bits[3] << 0; + + d = v << ((crIndex * 4) ^ 0x1c); +} +void DECODER(MFOCRF) { + if (inst.l11) { + auto crIndex = std::countl_zero(inst.crm) & 7; + MFOCRF(context.gpr[inst.rd], crIndex, context.cr.fields[crIndex]); + } else { + MFCR(context, context.gpr[inst.rd]); + } +} +EXPORT_SEMANTIC(MFOCRF); + +void SEMANTIC(LWARX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_lwarx(context, vm::cast(addr)); +} +void DECODER(LWARX) { LWARX(context, inst); } +EXPORT_SEMANTIC(LWARX); + +void SEMANTIC(LDX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_feed_data(context, addr); +} +void DECODER(LDX) { LDX(context, inst); } +EXPORT_SEMANTIC(LDX); + +void SEMANTIC(LWZX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? 
context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_feed_data(context, addr); +} +void DECODER(LWZX) { LWZX(context, inst); } +EXPORT_SEMANTIC(LWZX); + +void SEMANTIC(SLW)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = + static_cast(context.gpr[inst.rs] << (context.gpr[inst.rb] & 0x3f)); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(SLW) { SLW(context, inst); } +EXPORT_SEMANTIC(SLW); + +void SEMANTIC(CNTLZW)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = + std::countl_zero(static_cast(context.gpr[inst.rs])); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(CNTLZW) { CNTLZW(context, inst); } +EXPORT_SEMANTIC(CNTLZW); + +void SEMANTIC(SLD)(PPUContext &context, Instruction inst) { + const u32 n = context.gpr[inst.rb] & 0x7f; + context.gpr[inst.ra] = n & 0x40 ? 0 : context.gpr[inst.rs] << n; + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(SLD) { SLD(context, inst); } +EXPORT_SEMANTIC(SLD); + +void SEMANTIC(AND)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = context.gpr[inst.rs] & context.gpr[inst.rb]; + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(AND) { AND(context, inst); } +EXPORT_SEMANTIC(AND); + +void SEMANTIC(CMPL)(PPUContext &context, Instruction inst) { + if (inst.l10) { + context.cr.fields[inst.crfd].update( + context.gpr[inst.ra], context.gpr[inst.rb], context.xer_so); + } else { + context.cr.fields[inst.crfd].update( + static_cast(context.gpr[inst.ra]), + static_cast(context.gpr[inst.rb]), context.xer_so); + } +} +void DECODER(CMPL) { CMPL(context, inst); } +EXPORT_SEMANTIC(CMPL); + +static const v128 s_lvsr_consts[16] = { + gv_add8(s_lvsl_base, gv_bcst8(16)), gv_add8(s_lvsl_base, gv_bcst8(15)), + gv_add8(s_lvsl_base, gv_bcst8(14)), gv_add8(s_lvsl_base, gv_bcst8(13)), + gv_add8(s_lvsl_base, gv_bcst8(12)), gv_add8(s_lvsl_base, gv_bcst8(11)), + gv_add8(s_lvsl_base, gv_bcst8(10)), gv_add8(s_lvsl_base, gv_bcst8(9)), + gv_add8(s_lvsl_base, gv_bcst8(8)), gv_add8(s_lvsl_base, gv_bcst8(7)), + gv_add8(s_lvsl_base, gv_bcst8(6)), gv_add8(s_lvsl_base, gv_bcst8(5)), + gv_add8(s_lvsl_base, gv_bcst8(4)), gv_add8(s_lvsl_base, gv_bcst8(3)), + gv_add8(s_lvsl_base, gv_bcst8(2)), gv_add8(s_lvsl_base, gv_bcst8(1)), +}; + +void SEMANTIC(LVSR)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.vr[inst.vd] = s_lvsr_consts[addr % 16]; +} +void DECODER(LVSR) { LVSR(context, inst); } +EXPORT_SEMANTIC(LVSR); + +void SEMANTIC(LVEHX)(PPUContext &context, Instruction inst) { + const u64 addr = (inst.ra ? 
context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]) & + ~0xfull; + context.vr[inst.vd] = ppu_feed_data(context, addr); +} +void DECODER(LVEHX) { LVEHX(context, inst); } +EXPORT_SEMANTIC(LVEHX); + +void SEMANTIC(SUBF)(PPUContext &context, Instruction inst) { + const u64 RA = context.gpr[inst.ra]; + const u64 RB = context.gpr[inst.rb]; + context.gpr[inst.rd] = RB - RA; + + if (inst.oe) { + context.setOV((~RA >> 63 == RB >> 63) && + (~RA >> 63 != context.gpr[inst.rd] >> 63)); + } + + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.rd], 0, context.xer_so); + } +} +void DECODER(SUBF) { SUBF(context, inst); } +EXPORT_SEMANTIC(SUBF); + +void SEMANTIC(LDUX)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_feed_data(context, addr); + context.gpr[inst.ra] = addr; +} +void DECODER(LDUX) { LDUX(context, inst); } +EXPORT_SEMANTIC(LDUX); + +void SEMANTIC(DCBST)() {} +void DECODER(DCBST) { DCBST(); } +EXPORT_SEMANTIC(DCBST); + +void SEMANTIC(LWZUX)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_feed_data(context, addr); + context.gpr[inst.ra] = addr; +} +void DECODER(LWZUX) { LWZUX(context, inst); } +EXPORT_SEMANTIC(LWZUX); + +void SEMANTIC(CNTLZD)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = std::countl_zero(context.gpr[inst.rs]); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(CNTLZD) { CNTLZD(context, inst); } +EXPORT_SEMANTIC(CNTLZD); + +void SEMANTIC(ANDC)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = context.gpr[inst.rs] & ~context.gpr[inst.rb]; + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(ANDC) { ANDC(context, inst); } +EXPORT_SEMANTIC(ANDC); + +void SEMANTIC(TD)(s64 ra, u8 bo, s64 rb) { + if ((bo & 0x10) && ra < rb) { + rpcsx_trap(); + } + + if ((bo & 0x8) && ra > rb) { + rpcsx_trap(); + } + + if ((bo & 0x4) && ra == rb) { + rpcsx_trap(); + } + + if ((bo & 0x2) && (u64)ra < (u64)rb) { + rpcsx_trap(); + } + + if ((bo & 0x1) && (u64)ra > (u64)rb) { + rpcsx_trap(); + } +} +void DECODER(TD) { TD(context.gpr[inst.ra], inst.bo, context.gpr[inst.rb]); } +EXPORT_SEMANTIC(TD); + +void SEMANTIC(LVEWX)(PPUContext &context, Instruction inst) { + const u64 addr = (inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]) & + ~0xfull; + context.vr[inst.vd] = ppu_feed_data(context, addr); +} +void DECODER(LVEWX) { LVEWX(context, inst); } +EXPORT_SEMANTIC(LVEWX); + +void SEMANTIC(MULHD)(PPUContext &context, Instruction inst) { + context.gpr[inst.rd] = mulh64(context.gpr[inst.ra], context.gpr[inst.rb]); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.rd], 0, context.xer_so); + } +} +void DECODER(MULHD) { MULHD(context, inst); } +EXPORT_SEMANTIC(MULHD); + +void SEMANTIC(MULHW)(PPUContext &context, Instruction inst) { + s32 a = static_cast(context.gpr[inst.ra]); + s32 b = static_cast(context.gpr[inst.rb]); + context.gpr[inst.rd] = (s64{a} * b) >> 32; + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.rd], 0, context.xer_so); + } +} +void DECODER(MULHW) { MULHW(context, inst); } +EXPORT_SEMANTIC(MULHW); + +void SEMANTIC(LDARX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? 
context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_ldarx(context, vm::cast(addr)); +} +void DECODER(LDARX) { LDARX(context, inst); } +EXPORT_SEMANTIC(LDARX); + +void SEMANTIC(DCBF)() {} +void DECODER(DCBF) { DCBF(); } +EXPORT_SEMANTIC(DCBF); + +void SEMANTIC(LBZX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_feed_data(context, addr); +} +void DECODER(LBZX) { LBZX(context, inst); } +EXPORT_SEMANTIC(LBZX); + +void SEMANTIC(LVX)(PPUContext &context, Instruction inst) { + const u64 addr = (inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]) & + ~0xfull; + context.vr[inst.vd] = ppu_feed_data(context, addr); +} +void DECODER(LVX) { LVX(context, inst); } +EXPORT_SEMANTIC(LVX); + +void SEMANTIC(NEG)(PPUContext &context, Instruction inst) { + const u64 RA = context.gpr[inst.ra]; + context.gpr[inst.rd] = 0 - RA; + + if (inst.oe) { + // FIXME: verify + context.setOV(RA == (1ull << 63)); + } + + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.rd], 0, context.xer_so); + } +} +void DECODER(NEG) { NEG(context, inst); } +EXPORT_SEMANTIC(NEG); + +void SEMANTIC(LBZUX)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_feed_data(context, addr); + context.gpr[inst.ra] = addr; +} +void DECODER(LBZUX) { LBZUX(context, inst); } +EXPORT_SEMANTIC(LBZUX); + +void SEMANTIC(NOR)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = ~(context.gpr[inst.rs] | context.gpr[inst.rb]); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(NOR) { NOR(context, inst); } +EXPORT_SEMANTIC(NOR); + +void SEMANTIC(STVEBX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? 
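+  // STVEBX (and STVEHX/STVEWX further down) store a single element of the
+  // source vector. Element numbering follows the PPU's big-endian layout, so
+  // the byte offset within the quadword (eb) indexes the vector from the
+  // opposite end: 15 - eb for bytes, 7 - eb for halfwords, 3 - eb for words.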
context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + const u8 eb = addr & 0xf; + vm::write(vm::cast(addr), context.vr[inst.vs]._u8[15 - eb]); +} +void DECODER(STVEBX) { STVEBX(context, inst); } +EXPORT_SEMANTIC(STVEBX); + +void SEMANTIC(SUBFE)(PPUContext &context, Instruction inst) { + const u64 RA = context.gpr[inst.ra]; + const u64 RB = context.gpr[inst.rb]; + const auto r = add64_flags(~RA, RB, context.xer_ca); + context.gpr[inst.rd] = r.result; + context.xer_ca = r.carry; + + if (inst.oe) { + context.setOV((~RA >> 63 == RB >> 63) && + (~RA >> 63 != context.gpr[inst.rd] >> 63)); + } + + if (inst.rc) { + context.cr.fields[0].update(r.result, 0, context.xer_so); + } +} +void DECODER(SUBFE) { SUBFE(context, inst); } +EXPORT_SEMANTIC(SUBFE); + +void SEMANTIC(ADDE)(PPUContext &context, Instruction inst) { + const u64 RA = context.gpr[inst.ra]; + const u64 RB = context.gpr[inst.rb]; + const auto r = add64_flags(RA, RB, context.xer_ca); + context.gpr[inst.rd] = r.result; + context.xer_ca = r.carry; + + if (inst.oe) { + context.setOV((RA >> 63 == RB >> 63) && + (RA >> 63 != context.gpr[inst.rd] >> 63)); + } + + if (inst.rc) { + context.cr.fields[0].update(r.result, 0, context.xer_so); + } +} +void DECODER(ADDE) { ADDE(context, inst); } +EXPORT_SEMANTIC(ADDE); + +void SEMANTIC(MTOCRF)(PPUContext &context, Instruction inst) { + static constexpr CrField s_table[16]{ + CrField::From(false, false, false, false), + CrField::From(false, false, false, true), + CrField::From(false, false, true, false), + CrField::From(false, false, true, true), + CrField::From(false, true, false, false), + CrField::From(false, true, false, true), + CrField::From(false, true, true, false), + CrField::From(false, true, true, true), + CrField::From(true, false, false, false), + CrField::From(true, false, false, true), + CrField::From(true, false, true, false), + CrField::From(true, false, true, true), + CrField::From(true, true, false, false), + CrField::From(true, true, false, true), + CrField::From(true, true, true, false), + CrField::From(true, true, true, true), + }; + + const u64 s = context.gpr[inst.rs]; + + if (inst.l11) { + // MTOCRF + + const u32 n = std::countl_zero(inst.crm) & 7; + const u64 v = (s >> ((n * 4) ^ 0x1c)) & 0xf; + context.cr.fields[n] = s_table[v]; + } else { + // MTCRF + + for (u32 i = 0; i < 8; i++) { + if (inst.crm & (128 >> i)) { + const u64 v = (s >> ((i * 4) ^ 0x1c)) & 0xf; + context.cr.fields[i] = s_table[v]; + } + } + } +} +void DECODER(MTOCRF) { MTOCRF(context, inst); } +EXPORT_SEMANTIC(MTOCRF); + +void SEMANTIC(STDX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + vm::write(vm::cast(addr), context.gpr[inst.rs]); +} +void DECODER(STDX) { STDX(context, inst); } +EXPORT_SEMANTIC(STDX); + +void SEMANTIC(STWCX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.cr.fields[0].set(false, false, + ppu_stwcx(context, vm::cast(addr), + static_cast(context.gpr[inst.rs])), + context.xer_so); +} +void DECODER(STWCX) { STWCX(context, inst); } +EXPORT_SEMANTIC(STWCX); + +void SEMANTIC(STWX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? 
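+  // In MTOCRF/MTCRF above, CR field 0 is the most significant nibble of the
+  // 32-bit CR image, so field n sits at bit offset 28 - 4*n. For n in 0..7
+  // that equals (n * 4) ^ 0x1c, which is why the shift amount is written
+  // with the XOR; e.g. n = 1 gives 4 ^ 0x1c = 24.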
context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + vm::write(vm::cast(addr), static_cast(context.gpr[inst.rs])); +} +void DECODER(STWX) { STWX(context, inst); } +EXPORT_SEMANTIC(STWX); + +void SEMANTIC(STVEHX)(PPUContext &context, Instruction inst) { + const u64 addr = (inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]) & + ~1ULL; + const u8 eb = (addr & 0xf) >> 1; + vm::write(vm::cast(addr), context.vr[inst.vs]._u16[7 - eb]); +} +void DECODER(STVEHX) { STVEHX(context, inst); } +EXPORT_SEMANTIC(STVEHX); + +void SEMANTIC(STDUX)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + context.gpr[inst.rb]; + vm::write(vm::cast(addr), context.gpr[inst.rs]); + context.gpr[inst.ra] = addr; +} +void DECODER(STDUX) { STDUX(context, inst); } +EXPORT_SEMANTIC(STDUX); + +void SEMANTIC(STWUX)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + context.gpr[inst.rb]; + vm::write(vm::cast(addr), static_cast(context.gpr[inst.rs])); + context.gpr[inst.ra] = addr; +} +void DECODER(STWUX) { STWUX(context, inst); } +EXPORT_SEMANTIC(STWUX); + +void SEMANTIC(STVEWX)(PPUContext &context, Instruction inst) { + const u64 addr = (inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]) & + ~3ULL; + const u8 eb = (addr & 0xf) >> 2; + vm::write(vm::cast(addr), context.vr[inst.vs]._u32[3 - eb]); +} +void DECODER(STVEWX) { STVEWX(context, inst); } +EXPORT_SEMANTIC(STVEWX); + +void SEMANTIC(SUBFZE)(PPUContext &context, Instruction inst) { + const u64 RA = context.gpr[inst.ra]; + const auto r = add64_flags(~RA, 0, context.xer_ca); + context.gpr[inst.rd] = r.result; + context.xer_ca = r.carry; + + if (inst.oe) { + context.setOV((~RA >> 63 == 0) && (~RA >> 63 != r.result >> 63)); + } + + if (inst.rc) { + context.cr.fields[0].update(r.result, 0, context.xer_so); + } +} +void DECODER(SUBFZE) { SUBFZE(context, inst); } +EXPORT_SEMANTIC(SUBFZE); + +void SEMANTIC(ADDZE)(PPUContext &context, Instruction inst) { + const u64 RA = context.gpr[inst.ra]; + const auto r = add64_flags(RA, 0, context.xer_ca); + context.gpr[inst.rd] = r.result; + context.xer_ca = r.carry; + + if (inst.oe) { + context.setOV((RA >> 63 == 0) && (RA >> 63 != r.result >> 63)); + } + + if (inst.rc) { + context.cr.fields[0].update(r.result, 0, context.xer_so); + } +} +void DECODER(ADDZE) { ADDZE(context, inst); } +EXPORT_SEMANTIC(ADDZE); + +void SEMANTIC(STDCX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.cr.fields[0].set( + false, false, ppu_stdcx(context, vm::cast(addr), context.gpr[inst.rs]), + context.xer_so); +} +void DECODER(STDCX) { STDCX(context, inst); } +EXPORT_SEMANTIC(STDCX); + +void SEMANTIC(STBX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + vm::write(vm::cast(addr), static_cast(context.gpr[inst.rs])); +} +void DECODER(STBX) { STBX(context, inst); } +EXPORT_SEMANTIC(STBX); + +void SEMANTIC(STVX)(PPUContext &context, Instruction inst) { + const u64 addr = (inst.ra ? 
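+  // STVX moves a full quadword, so its EA is masked to a 16-byte boundary.
+  // The extended-arithmetic forms above (ADDE/SUBFE, ADDZE/SUBFZE) all go
+  // through add64_flags so that XER.CA is produced once and then consumed as
+  // the carry-in of the next instruction in a multi-word arithmetic chain.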
context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]) & + ~0xfull; + vm::write(vm::cast(addr), context.vr[inst.vs]); +} +void DECODER(STVX) { STVX(context, inst); } +EXPORT_SEMANTIC(STVX); + +void SEMANTIC(MULLD)(PPUContext &context, Instruction inst) { + const s64 RA = context.gpr[inst.ra]; + const s64 RB = context.gpr[inst.rb]; + context.gpr[inst.rd] = RA * RB; + if (inst.oe) { + const s64 high = mulh64(RA, RB); + // FIXME: verify + context.setOV(high != s64(context.gpr[inst.rd]) >> 63); + } + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.rd], 0, context.xer_so); + } +} +void DECODER(MULLD) { MULLD(context, inst); } +EXPORT_SEMANTIC(MULLD); + +void SEMANTIC(SUBFME)(PPUContext &context, Instruction inst) { + const u64 RA = context.gpr[inst.ra]; + const auto r = add64_flags(~RA, ~0ull, context.xer_ca); + context.gpr[inst.rd] = r.result; + context.xer_ca = r.carry; + if (inst.oe) { + context.setOV((~RA >> 63 == 1) && (~RA >> 63 != r.result >> 63)); + } + if (inst.rc) { + context.cr.fields[0].update(r.result, 0, context.xer_so); + } +} +void DECODER(SUBFME) { SUBFME(context, inst); } +EXPORT_SEMANTIC(SUBFME); + +void SEMANTIC(ADDME)(PPUContext &context, Instruction inst) { + const s64 RA = context.gpr[inst.ra]; + const auto r = add64_flags(RA, ~0ull, context.xer_ca); + context.gpr[inst.rd] = r.result; + context.xer_ca = r.carry; + if (inst.oe) { + context.setOV((u64(RA) >> 63 == 1) && (u64(RA) >> 63 != r.result >> 63)); + } + if (inst.rc) { + context.cr.fields[0].update(r.result, 0, context.xer_so); + } +} +void DECODER(ADDME) { ADDME(context, inst); } +EXPORT_SEMANTIC(ADDME); + +void SEMANTIC(MULLW)(PPUContext &context, Instruction inst) { + context.gpr[inst.rd] = s64{static_cast(context.gpr[inst.ra])} * + static_cast(context.gpr[inst.rb]); + + if (inst.oe) { + context.setOV(s64(context.gpr[inst.rd]) < INT32_MIN || + s64(context.gpr[inst.rd]) > INT32_MAX); + } + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.rd], 0, context.xer_so); + } +} +void DECODER(MULLW) { MULLW(context, inst); } +EXPORT_SEMANTIC(MULLW); + +void SEMANTIC(DCBTST)() {} +void DECODER(DCBTST) { DCBTST(); } +EXPORT_SEMANTIC(DCBTST); + +void SEMANTIC(STBUX)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + context.gpr[inst.rb]; + vm::write(vm::cast(addr), static_cast(context.gpr[inst.rs])); + context.gpr[inst.ra] = addr; +} +void DECODER(STBUX) { STBUX(context, inst); } +EXPORT_SEMANTIC(STBUX); + +void SEMANTIC(ADD)(PPUContext &context, Instruction inst) { + const u64 RA = context.gpr[inst.ra]; + const u64 RB = context.gpr[inst.rb]; + context.gpr[inst.rd] = RA + RB; + + if (inst.oe) { + context.setOV((RA >> 63 == RB >> 63) && + (RA >> 63 != context.gpr[inst.rd] >> 63)); + } + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.rd], 0, context.xer_so); + } +} +void DECODER(ADD) { ADD(context, inst); } +EXPORT_SEMANTIC(ADD); + +void SEMANTIC(DCBT)() {} +void DECODER(DCBT) { DCBT(); } +EXPORT_SEMANTIC(DCBT); + +void SEMANTIC(LHZX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? 
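+  // Overflow detection in the multiplies above: MULLW forms the full signed
+  // 64-bit product and flags OV when it does not fit in 32 bits, while MULLD
+  // compares the high half of the 128-bit product (mulh64) against the sign
+  // extension of the low half.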
context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_feed_data(context, addr); +} +void DECODER(LHZX) { LHZX(context, inst); } +EXPORT_SEMANTIC(LHZX); + +void SEMANTIC(EQV)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = ~(context.gpr[inst.rs] ^ context.gpr[inst.rb]); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(EQV) { EQV(context, inst); } +EXPORT_SEMANTIC(EQV); + +void SEMANTIC(ECIWX)() { rpcsx_unimplemented_instruction(); } +void DECODER(ECIWX) { ECIWX(); } +EXPORT_SEMANTIC(ECIWX); + +void SEMANTIC(LHZUX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_feed_data(context, addr); + context.gpr[inst.ra] = addr; +} +void DECODER(LHZUX) { LHZUX(context, inst); } +EXPORT_SEMANTIC(LHZUX); + +void SEMANTIC(XOR)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = context.gpr[inst.rs] ^ context.gpr[inst.rb]; + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(XOR) { XOR(context, inst); } +EXPORT_SEMANTIC(XOR); + +void SEMANTIC(MFSPR)(PPUContext &context, Instruction inst) { + const u32 n = (inst.spr >> 5) | ((inst.spr & 0x1f) << 5); + + switch (n) { + case 0x001: + context.gpr[inst.rd] = u32{context.xer_so} << 31 | context.xer_ov << 30 | + context.xer_ca << 29 | context.xer_cnt; + break; + case 0x008: + context.gpr[inst.rd] = context.lr; + break; + case 0x009: + context.gpr[inst.rd] = context.ctr; + break; + case 0x100: + context.gpr[inst.rd] = context.vrsave; + break; + + case 0x10C: + context.gpr[inst.rd] = rpcsx_get_tb(); + break; + case 0x10D: + context.gpr[inst.rd] = rpcsx_get_tb() >> 32; + break; + default: + rpcsx_invalid_instruction(); + } +} +void DECODER(MFSPR) { MFSPR(context, inst); } +EXPORT_SEMANTIC(MFSPR); + +void SEMANTIC(LWAX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_feed_data(context, addr); +} +void DECODER(LWAX) { LWAX(context, inst); } +EXPORT_SEMANTIC(LWAX); + +void SEMANTIC(DST)() {} +void DECODER(DST) { DST(); } +EXPORT_SEMANTIC(DST); + +void SEMANTIC(LHAX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_feed_data(context, addr); +} +void DECODER(LHAX) { LHAX(context, inst); } +EXPORT_SEMANTIC(LHAX); + +void SEMANTIC(LVXL)(PPUContext &context, Instruction inst) { + const u64 addr = (inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]) & + ~0xfull; + context.vr[inst.vd] = ppu_feed_data(context, addr); +} +void DECODER(LVXL) { LVXL(context, inst); } +EXPORT_SEMANTIC(LVXL); + +void SEMANTIC(MFTB)(PPUContext &context, Instruction inst) { + const u32 n = (inst.spr >> 5) | ((inst.spr & 0x1f) << 5); + + switch (n) { + case 0x10C: + context.gpr[inst.rd] = rpcsx_get_tb(); + break; + case 0x10D: + context.gpr[inst.rd] = rpcsx_get_tb() >> 32; + break; + default: + rpcsx_invalid_instruction(); + } +} +void DECODER(MFTB) { MFTB(context, inst); } +EXPORT_SEMANTIC(MFTB); + +void SEMANTIC(LWAUX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? 
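+  // MFSPR/MFTB above un-swap the SPR field before the switch because the
+  // 10-bit SPR number is encoded with its two 5-bit halves exchanged:
+  // n = (spr >> 5) | ((spr & 0x1f) << 5). Only XER (0x001), LR (0x008),
+  // CTR (0x009), VRSAVE (0x100) and the time base halves (0x10C/0x10D) are
+  // handled; any other SPR is treated as an invalid instruction.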
context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_feed_data(context, addr); + context.gpr[inst.ra] = addr; +} +void DECODER(LWAUX) { LWAUX(context, inst); } +EXPORT_SEMANTIC(LWAUX); + +void SEMANTIC(DSTST)() {} +void DECODER(DSTST) { DSTST(); } +EXPORT_SEMANTIC(DSTST); + +void SEMANTIC(LHAUX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_feed_data(context, addr); + context.gpr[inst.ra] = addr; +} +void DECODER(LHAUX) { LHAUX(context, inst); } +EXPORT_SEMANTIC(LHAUX); + +void SEMANTIC(STHX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + vm::write(vm::cast(addr), static_cast(context.gpr[inst.rs])); +} +void DECODER(STHX) { STHX(context, inst); } +EXPORT_SEMANTIC(STHX); + +void SEMANTIC(ORC)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = context.gpr[inst.rs] | ~context.gpr[inst.rb]; + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(ORC) { ORC(context, inst); } +EXPORT_SEMANTIC(ORC); + +void SEMANTIC(ECOWX)() { rpcsx_unimplemented_instruction(); } +void DECODER(ECOWX) { ECOWX(); } +EXPORT_SEMANTIC(ECOWX); + +void SEMANTIC(STHUX)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + context.gpr[inst.rb]; + vm::write(vm::cast(addr), static_cast(context.gpr[inst.rs])); + context.gpr[inst.ra] = addr; +} +void DECODER(STHUX) { STHUX(context, inst); } +EXPORT_SEMANTIC(STHUX); + +void SEMANTIC(OR)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = context.gpr[inst.rs] | context.gpr[inst.rb]; + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(OR) { OR(context, inst); } +EXPORT_SEMANTIC(OR); + +void SEMANTIC(DIVDU)(PPUContext &context, Instruction inst) { + const u64 RA = context.gpr[inst.ra]; + const u64 RB = context.gpr[inst.rb]; + context.gpr[inst.rd] = RB == 0 ? 0 : RA / RB; + + if (inst.oe) { + context.setOV(RB == 0); + } + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.rd], 0, context.xer_so); + } +} +void DECODER(DIVDU) { DIVDU(context, inst); } +EXPORT_SEMANTIC(DIVDU); + +void SEMANTIC(DIVWU)(PPUContext &context, Instruction inst) { + const u32 RA = static_cast(context.gpr[inst.ra]); + const u32 RB = static_cast(context.gpr[inst.rb]); + context.gpr[inst.rd] = RB == 0 ? 
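+  // The PowerPC architecture leaves the quotient undefined for division by
+  // zero (and, in the signed forms, for INT_MIN / -1); these semantics pick
+  // 0 as the result and raise XER.OV only when the instruction's OE bit is
+  // set.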
0 : RA / RB; + if (inst.oe) { + context.setOV(RB == 0); + } + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.rd], 0, context.xer_so); + } +} +void DECODER(DIVWU) { DIVWU(context, inst); } +EXPORT_SEMANTIC(DIVWU); + +void SEMANTIC(MTSPR)(PPUContext &context, Instruction inst) { + const u32 n = (inst.spr >> 5) | ((inst.spr & 0x1f) << 5); + + switch (n) { + case 0x001: { + const u64 value = context.gpr[inst.rs]; + context.xer_so = (value & 0x80000000) != 0; + context.xer_ov = (value & 0x40000000) != 0; + context.xer_ca = (value & 0x20000000) != 0; + context.xer_cnt = value & 0x7f; + break; + } + case 0x008: + context.lr = context.gpr[inst.rs]; + break; + case 0x009: + context.ctr = context.gpr[inst.rs]; + break; + case 0x100: + context.vrsave = static_cast(context.gpr[inst.rs]); + break; + default: + rpcsx_invalid_instruction(); + } +} +void DECODER(MTSPR) { MTSPR(context, inst); } +EXPORT_SEMANTIC(MTSPR); + +void SEMANTIC(DCBI)() {} +void DECODER(DCBI) { DCBI(); } +EXPORT_SEMANTIC(DCBI); + +void SEMANTIC(NAND)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = ~(context.gpr[inst.rs] & context.gpr[inst.rb]); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(NAND) { NAND(context, inst); } +EXPORT_SEMANTIC(NAND); + +void SEMANTIC(STVXL)(PPUContext &context, Instruction inst) { + const u64 addr = (inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]) & + ~0xfull; + vm::write(vm::cast(addr), context.vr[inst.vs]); +} +void DECODER(STVXL) { STVXL(context, inst); } +EXPORT_SEMANTIC(STVXL); + +void SEMANTIC(DIVD)(PPUContext &context, Instruction inst) { + const s64 RA = context.gpr[inst.ra]; + const s64 RB = context.gpr[inst.rb]; + const bool o = RB == 0 || (RA == INT64_MIN && RB == -1); + context.gpr[inst.rd] = o ? 0 : RA / RB; + if (inst.oe) { + context.setOV(o); + } + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.rd], 0, context.xer_so); + } +} +void DECODER(DIVD) { DIVD(context, inst); } +EXPORT_SEMANTIC(DIVD); + +void SEMANTIC(DIVW)(PPUContext &context, Instruction inst) { + const s32 RA = static_cast(context.gpr[inst.ra]); + const s32 RB = static_cast(context.gpr[inst.rb]); + const bool o = RB == 0 || (RA == INT32_MIN && RB == -1); + context.gpr[inst.rd] = o ? 0 : static_cast(RA / RB); + if (inst.oe) { + context.setOV(o); + } + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.rd], 0, context.xer_so); + } +} +void DECODER(DIVW) { DIVW(context, inst); } +EXPORT_SEMANTIC(DIVW); + +void SEMANTIC(LVLX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + const u128 data = ppu_feed_data(context, addr & -16); + context.vr[inst.vd] = data << ((addr & 15) * 8); +} +void DECODER(LVLX) { LVLX(context, inst); } +EXPORT_SEMANTIC(LVLX); + +void SEMANTIC(LDBRX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_feed_data>(context, addr); +} +void DECODER(LDBRX) { LDBRX(context, inst); } +EXPORT_SEMANTIC(LDBRX); + +void SEMANTIC(LSWX)(PPUContext &context, Instruction inst) { + u64 addr = inst.ra ? 
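+  // LSWX transfers XER[CNT] bytes starting at the EA, packing them into
+  // successive GPRs four at a time (big-endian, wrapping from r31 to r0);
+  // a trailing partial word is left-justified and zero-filled, which is what
+  // the (3 ^ byte) * 8 shift in the tail loop produces.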
context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + u32 count = context.xer_cnt & 0x7f; + for (; count >= 4; count -= 4, addr += 4, inst.rd = (inst.rd + 1) & 31) { + context.gpr[inst.rd] = ppu_feed_data(context, addr); + } + if (count) { + u32 value = 0; + for (u32 byte = 0; byte < count; byte++) { + u32 byte_value = ppu_feed_data(context, addr + byte); + value |= byte_value << ((3 ^ byte) * 8); + } + context.gpr[inst.rd] = value; + } +} +void DECODER(LSWX) { LSWX(context, inst); } +EXPORT_SEMANTIC(LSWX); + +void SEMANTIC(LWBRX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_feed_data>(context, addr); +} +void DECODER(LWBRX) { LWBRX(context, inst); } +EXPORT_SEMANTIC(LWBRX); + +void SEMANTIC(LFSX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.fpr[inst.frd] = ppu_feed_data(context, addr); +} +void DECODER(LFSX) { LFSX(context, inst); } +EXPORT_SEMANTIC(LFSX); + +void SEMANTIC(SRW)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = + (context.gpr[inst.rs] & 0xffffffff) >> (context.gpr[inst.rb] & 0x3f); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(SRW) { SRW(context, inst); } +EXPORT_SEMANTIC(SRW); + +void SEMANTIC(SRD)(PPUContext &context, Instruction inst) { + const u32 n = context.gpr[inst.rb] & 0x7f; + context.gpr[inst.ra] = n & 0x40 ? 0 : context.gpr[inst.rs] >> n; + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(SRD) { SRD(context, inst); } +EXPORT_SEMANTIC(SRD); + +void SEMANTIC(LVRX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + + if ((addr & 15) == 0) { + context.vr[inst.vd] = u128(0); + } else { + const auto data = ppu_feed_data(context, addr & -16); + context.vr[inst.vd] = data >> ((~addr & 15) * 8) >> 8; + } +} +void DECODER(LVRX) { LVRX(context, inst); } +EXPORT_SEMANTIC(LVRX); + +void SEMANTIC(LSWI)(PPUContext &context, Instruction inst) { + u64 addr = inst.ra ? context.gpr[inst.ra] : 0; + u64 N = inst.rb ? inst.rb : 32; + u8 reg = inst.rd; + + while (N > 0) { + if (N > 3) { + context.gpr[reg] = ppu_feed_data(context, addr); + addr += 4; + N -= 4; + } else { + u32 buf = 0; + u32 i = 3; + while (N > 0) { + N = N - 1; + buf |= ppu_feed_data(context, addr) << (i * 8); + addr++; + i--; + } + context.gpr[reg] = buf; + } + reg = (reg + 1) % 32; + } +} +void DECODER(LSWI) { LSWI(context, inst); } +EXPORT_SEMANTIC(LSWI); + +void SEMANTIC(LFSUX)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + context.gpr[inst.rb]; + context.fpr[inst.frd] = ppu_feed_data(context, addr); + context.gpr[inst.ra] = addr; +} +void DECODER(LFSUX) { LFSUX(context, inst); } +EXPORT_SEMANTIC(LFSUX); + +void SEMANTIC(SYNC)() { std::atomic_thread_fence(std::memory_order::seq_cst); } +void DECODER(SYNC) { SYNC(); } +EXPORT_SEMANTIC(SYNC); + +void SEMANTIC(LFDX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? 
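+  // LVLX/LVRX above (and the LVLXL/LVRXL variants later) are the two halves
+  // of an unaligned quadword load: LVLX shifts the aligned quadword holding
+  // the EA left by the misalignment, LVRX supplies the remaining bytes
+  // shifted right, and guest code typically merges the two results with a
+  // vector OR or permute.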
context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.fpr[inst.frd] = ppu_feed_data(context, addr); +} +void DECODER(LFDX) { LFDX(context, inst); } +EXPORT_SEMANTIC(LFDX); + +void SEMANTIC(LFDUX)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + context.gpr[inst.rb]; + context.fpr[inst.frd] = ppu_feed_data(context, addr); + context.gpr[inst.ra] = addr; +} +void DECODER(LFDUX) { LFDUX(context, inst); } +EXPORT_SEMANTIC(LFDUX); + +void SEMANTIC(STVLX)(v128 s, std::uint64_t a, std::uint64_t b) { + const u64 addr = a + b; + const u32 tail = u32(addr & 15); + std::uint8_t data[16]; + for (u32 j = 0; j < 16 - tail; j++) + data[j] = s.u8r[j]; + + rpcsx_vm_write(addr, data, 16 - tail); +} +void DECODER(STVLX) { + STVLX(context.vr[inst.vs], inst.ra ? context.gpr[inst.ra] : 0, + context.gpr[inst.rb]); +} +EXPORT_SEMANTIC(STVLX); + +void SEMANTIC(STDBRX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + vm::write>(vm::cast(addr), context.gpr[inst.rs]); +} +void DECODER(STDBRX) { STDBRX(context, inst); } +EXPORT_SEMANTIC(STDBRX); + +void SEMANTIC(STSWX)(PPUContext &context, Instruction inst) { + u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + u32 count = context.xer_cnt & 0x7F; + for (; count >= 4; count -= 4, addr += 4, inst.rs = (inst.rs + 1) & 31) { + vm::write(vm::cast(addr), static_cast(context.gpr[inst.rs])); + } + if (count) { + u32 value = static_cast(context.gpr[inst.rs]); + for (u32 byte = 0; byte < count; byte++) { + u8 byte_value = static_cast(value >> ((3 ^ byte) * 8)); + vm::write(vm::cast(addr + byte), byte_value); + } + } +} +void DECODER(STSWX) { STSWX(context, inst); } +EXPORT_SEMANTIC(STSWX); + +void SEMANTIC(STWBRX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + vm::write>(vm::cast(addr), static_cast(context.gpr[inst.rs])); +} +void DECODER(STWBRX) { STWBRX(context, inst); } +EXPORT_SEMANTIC(STWBRX); + +void SEMANTIC(STFSX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + vm::write(vm::cast(addr), static_cast(context.fpr[inst.frs])); +} +void DECODER(STFSX) { STFSX(context, inst); } +EXPORT_SEMANTIC(STFSX); + +void SEMANTIC(STVRX)(v128 s, std::uint64_t a, std::uint64_t b) { + const u64 addr = a + b; + const u32 tail = u32(addr & 15); + std::uint8_t data[16]; + for (u32 i = 15; i > 15 - tail; i--) + data[i] = s.u8r[i]; + + // FIXME: verify + rpcsx_vm_write(addr - 16, data + 15 - tail, tail + 1); + // u8 *ptr = vm::_ptr(addr - 16); +} +void DECODER(STVRX) { + STVRX(context.vr[inst.vs], inst.ra ? context.gpr[inst.ra] : 0, + context.gpr[inst.rb]); +} +EXPORT_SEMANTIC(STVRX); + +void SEMANTIC(STFSUX)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + context.gpr[inst.rb]; + vm::write(vm::cast(addr), static_cast(context.fpr[inst.frs])); + context.gpr[inst.ra] = addr; +} +void DECODER(STFSUX) { STFSUX(context, inst); } +EXPORT_SEMANTIC(STFSUX); + +void SEMANTIC(STSWI)(PPUContext &context, Instruction inst) { + u64 addr = inst.ra ? context.gpr[inst.ra] : 0; + u64 N = inst.rb ? 
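+  // For the string stores (as for LSWI above) a count field of 0 means 32
+  // bytes, hence the "inst.rb ? inst.rb : 32" being computed here; bytes are
+  // taken from each register most-significant first, wrapping from r31 back
+  // to r0 exactly like the string loads.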
inst.rb : 32; + u8 reg = inst.rd; + + while (N > 0) { + if (N > 3) { + vm::write(vm::cast(addr), static_cast(context.gpr[reg])); + addr += 4; + N -= 4; + } else { + u32 buf = static_cast(context.gpr[reg]); + while (N > 0) { + N = N - 1; + vm::write(vm::cast(addr), (0xFF000000 & buf) >> 24); + buf <<= 8; + addr++; + } + } + reg = (reg + 1) % 32; + } +} +void DECODER(STSWI) { STSWI(context, inst); } +EXPORT_SEMANTIC(STSWI); + +void SEMANTIC(STFDX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + vm::write(vm::cast(addr), context.fpr[inst.frs]); +} +void DECODER(STFDX) { STFDX(context, inst); } +EXPORT_SEMANTIC(STFDX); + +void SEMANTIC(STFDUX)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + context.gpr[inst.rb]; + vm::write(vm::cast(addr), context.fpr[inst.frs]); + context.gpr[inst.ra] = addr; +} +void DECODER(STFDUX) { STFDUX(context, inst); } +EXPORT_SEMANTIC(STFDUX); + +void SEMANTIC(LVLXL)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + const u128 data = ppu_feed_data(context, addr & -16); + context.vr[inst.vd] = data << ((addr & 15) * 8); +} +void DECODER(LVLXL) { LVLXL(context, inst); } +EXPORT_SEMANTIC(LVLXL); + +void SEMANTIC(LHBRX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + context.gpr[inst.rd] = ppu_feed_data>(context, addr); +} +void DECODER(LHBRX) { LHBRX(context, inst); } +EXPORT_SEMANTIC(LHBRX); + +void SEMANTIC(SRAW)(PPUContext &context, Instruction inst) { + s32 RS = static_cast(context.gpr[inst.rs]); + u8 shift = context.gpr[inst.rb] & 63; + if (shift > 31) { + context.gpr[inst.ra] = 0 - (RS < 0); + context.xer_ca = (RS < 0); + } else { + context.gpr[inst.ra] = RS >> shift; + context.xer_ca = + (RS < 0) && ((context.gpr[inst.ra] << shift) != static_cast(RS)); + } + + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(SRAW) { SRAW(context, inst); } +EXPORT_SEMANTIC(SRAW); + +void SEMANTIC(SRAD)(PPUContext &context, Instruction inst) { + s64 RS = context.gpr[inst.rs]; + u8 shift = context.gpr[inst.rb] & 127; + if (shift > 63) { + context.gpr[inst.ra] = 0 - (RS < 0); + context.xer_ca = (RS < 0); + } else { + context.gpr[inst.ra] = RS >> shift; + context.xer_ca = + (RS < 0) && ((context.gpr[inst.ra] << shift) != static_cast(RS)); + } + + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(SRAD) { SRAD(context, inst); } +EXPORT_SEMANTIC(SRAD); + +void SEMANTIC(LVRXL)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? 
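+  // SRAW/SRAD above set XER.CA only when the source is negative and one or
+  // more 1 bits were shifted out; the code checks that by shifting the
+  // result back and comparing it with the original value.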
context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + + if ((addr & 15) == 0) { + context.vr[inst.vd] = u128(0); + } else { + const u128 data = ppu_feed_data(context, addr & -16); + context.vr[inst.vd] = data >> ((~addr & 15) * 8) >> 8; + } +} +void DECODER(LVRXL) { LVRXL(context, inst); } +EXPORT_SEMANTIC(LVRXL); + +void SEMANTIC(DSS)() {} +void DECODER(DSS) { DSS(); } +EXPORT_SEMANTIC(DSS); + +void SEMANTIC(SRAWI)(PPUContext &context, Instruction inst) { + s32 RS = static_cast(context.gpr[inst.rs]); + context.gpr[inst.ra] = RS >> inst.sh32; + context.xer_ca = + (RS < 0) && (static_cast(context.gpr[inst.ra] << inst.sh32) != + static_cast(RS)); + + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(SRAWI) { SRAWI(context, inst); } +EXPORT_SEMANTIC(SRAWI); + +void SEMANTIC(SRADI)(PPUContext &context, Instruction inst) { + auto sh = inst.sh64; + s64 RS = context.gpr[inst.rs]; + context.gpr[inst.ra] = RS >> sh; + context.xer_ca = + (RS < 0) && ((context.gpr[inst.ra] << sh) != static_cast(RS)); + + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(SRADI) { SRADI(context, inst); } +EXPORT_SEMANTIC(SRADI); + +void SEMANTIC(EIEIO)() { std::atomic_thread_fence(std::memory_order::seq_cst); } +void DECODER(EIEIO) { EIEIO(); } +EXPORT_SEMANTIC(EIEIO); + +void SEMANTIC(STVLXL)(v128 s, u64 a, u64 b) { + const u64 addr = a + b; + const u32 tail = u32(addr & 15); + // FIXME + for (u32 j = 0; j < 16 - tail; j++) + vm::write(addr + j, s.u8r[j]); +} +void DECODER(STVLXL) { + STVLXL(context.vr[inst.vs], inst.ra ? context.gpr[inst.ra] : 0, + context.gpr[inst.rb]); +} +EXPORT_SEMANTIC(STVLXL); + +void SEMANTIC(STHBRX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + vm::write>(vm::cast(addr), static_cast(context.gpr[inst.rs])); +} +void DECODER(STHBRX) { STHBRX(context, inst); } +EXPORT_SEMANTIC(STHBRX); + +void SEMANTIC(EXTSH)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = static_cast(context.gpr[inst.rs]); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(EXTSH) { EXTSH(context, inst); } +EXPORT_SEMANTIC(EXTSH); + +void SEMANTIC(STVRXL)(v128 s, u64 a, u64 b) { + const u64 addr = a + b; + const u32 tail = u32(addr & 15); + + // FIXME + for (u32 i = 15; i > 15 - tail; i--) + vm::write(addr - 16 + i, s.u8r[i]); +} +void DECODER(STVRXL) { + STVRXL(context.vr[inst.vs], inst.ra ? context.gpr[inst.ra] : 0, + context.gpr[inst.rb]); +} +EXPORT_SEMANTIC(STVRXL); + +void SEMANTIC(EXTSB)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = static_cast(context.gpr[inst.rs]); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(EXTSB) { EXTSB(context, inst); } +EXPORT_SEMANTIC(EXTSB); + +void SEMANTIC(STFIWX)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? 
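+  // STFIWX stores the low-order 32 bits of the FPR's raw bit pattern with no
+  // float-to-integer conversion, so the value is bit_cast to an integer
+  // first and then truncated.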
context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + vm::write(vm::cast(addr), + static_cast(std::bit_cast(context.fpr[inst.frs]))); +} +void DECODER(STFIWX) { STFIWX(context, inst); } +EXPORT_SEMANTIC(STFIWX); + +void SEMANTIC(EXTSW)(PPUContext &context, Instruction inst) { + context.gpr[inst.ra] = static_cast(context.gpr[inst.rs]); + if (inst.rc) { + context.cr.fields[0].update(context.gpr[inst.ra], 0, context.xer_so); + } +} +void DECODER(EXTSW) { EXTSW(context, inst); } +EXPORT_SEMANTIC(EXTSW); + +void SEMANTIC(ICBI)() {} +void DECODER(ICBI) { ICBI(); } +EXPORT_SEMANTIC(ICBI); + +void SEMANTIC(DCBZ)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + context.gpr[inst.rb] + : context.gpr[inst.rb]; + const u32 addr0 = vm::cast(addr) & ~127; + + alignas(64) static constexpr u8 zero_buf[128]{}; + do_cell_atomic_128_store(addr0, zero_buf); +} +void DECODER(DCBZ) { DCBZ(context, inst); } +EXPORT_SEMANTIC(DCBZ); + +void SEMANTIC(LWZ)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + inst.simm16 : inst.simm16; + context.gpr[inst.rd] = ppu_feed_data(context, addr); +} +void DECODER(LWZ) { LWZ(context, inst); } +EXPORT_SEMANTIC(LWZ); + +void SEMANTIC(LWZU)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + inst.simm16; + context.gpr[inst.rd] = ppu_feed_data(context, addr); + context.gpr[inst.ra] = addr; +} +void DECODER(LWZU) { LWZU(context, inst); } +EXPORT_SEMANTIC(LWZU); + +void SEMANTIC(LBZ)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + inst.simm16 : inst.simm16; + context.gpr[inst.rd] = ppu_feed_data(context, addr); +} +void DECODER(LBZ) { LBZ(context, inst); } +EXPORT_SEMANTIC(LBZ); + +void SEMANTIC(LBZU)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + inst.simm16; + context.gpr[inst.rd] = ppu_feed_data(context, addr); + context.gpr[inst.ra] = addr; +} +void DECODER(LBZU) { LBZU(context, inst); } +EXPORT_SEMANTIC(LBZU); + +void SEMANTIC(STW)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + inst.simm16 : inst.simm16; + const u32 value = static_cast(context.gpr[inst.rs]); + vm::write(vm::cast(addr), value); + + // Insomniac engine v3 & v4 (newer R&C, Fuse, Resitance 3) + // if (value == 0xAAAAAAAA) [[unlikely]] { + // vm::reservation_update(vm::cast(addr)); + // } +} +void DECODER(STW) { STW(context, inst); } +EXPORT_SEMANTIC(STW); + +void SEMANTIC(STWU)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + inst.simm16; + vm::write(vm::cast(addr), static_cast(context.gpr[inst.rs])); + context.gpr[inst.ra] = addr; +} +void DECODER(STWU) { STWU(context, inst); } +EXPORT_SEMANTIC(STWU); + +void SEMANTIC(STB)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + inst.simm16 : inst.simm16; + vm::write(vm::cast(addr), static_cast(context.gpr[inst.rs])); +} +void DECODER(STB) { STB(context, inst); } +EXPORT_SEMANTIC(STB); + +void SEMANTIC(STBU)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + inst.simm16; + vm::write(vm::cast(addr), static_cast(context.gpr[inst.rs])); + context.gpr[inst.ra] = addr; +} +void DECODER(STBU) { STBU(context, inst); } +EXPORT_SEMANTIC(STBU); + +void SEMANTIC(LHZ)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? 
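+  // The D-form loads/stores use the same RA == 0 convention as the indexed
+  // forms, with the sign-extended 16-bit displacement as the offset. DCBZ
+  // above clears the whole 128-byte cache line containing the EA, which is
+  // why its address is masked with ~127 and routed through the atomic
+  // 128-byte store helper.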
context.gpr[inst.ra] + inst.simm16 : inst.simm16; + context.gpr[inst.rd] = ppu_feed_data(context, addr); +} +void DECODER(LHZ) { LHZ(context, inst); } +EXPORT_SEMANTIC(LHZ); + +void SEMANTIC(LHZU)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + inst.simm16; + context.gpr[inst.rd] = ppu_feed_data(context, addr); + context.gpr[inst.ra] = addr; +} +void DECODER(LHZU) { LHZU(context, inst); } +EXPORT_SEMANTIC(LHZU); + +void SEMANTIC(LHA)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + inst.simm16 : inst.simm16; + context.gpr[inst.rd] = ppu_feed_data(context, addr); +} +void DECODER(LHA) { LHA(context, inst); } +EXPORT_SEMANTIC(LHA); + +void SEMANTIC(LHAU)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + inst.simm16; + context.gpr[inst.rd] = ppu_feed_data(context, addr); + context.gpr[inst.ra] = addr; +} +void DECODER(LHAU) { LHAU(context, inst); } +EXPORT_SEMANTIC(LHAU); + +void SEMANTIC(STH)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + inst.simm16 : inst.simm16; + vm::write(vm::cast(addr), static_cast(context.gpr[inst.rs])); +} +void DECODER(STH) { STH(context, inst); } +EXPORT_SEMANTIC(STH); + +void SEMANTIC(STHU)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + inst.simm16; + vm::write(vm::cast(addr), static_cast(context.gpr[inst.rs])); + context.gpr[inst.ra] = addr; +} +void DECODER(STHU) { STHU(context, inst); } +EXPORT_SEMANTIC(STHU); + +void SEMANTIC(LMW)(PPUContext &context, Instruction inst) { + u64 addr = inst.ra ? context.gpr[inst.ra] + inst.simm16 : inst.simm16; + for (u32 i = inst.rd; i < 32; ++i, addr += 4) { + context.gpr[i] = ppu_feed_data(context, addr); + } +} +void DECODER(LMW) { LMW(context, inst); } +EXPORT_SEMANTIC(LMW); + +void SEMANTIC(STMW)(PPUContext &context, Instruction inst) { + u64 addr = inst.ra ? context.gpr[inst.ra] + inst.simm16 : inst.simm16; + for (u32 i = inst.rs; i < 32; ++i, addr += 4) { + vm::write(vm::cast(addr), static_cast(context.gpr[i])); + } +} +void DECODER(STMW) { STMW(context, inst); } +EXPORT_SEMANTIC(STMW); + +void SEMANTIC(LFS)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + inst.simm16 : inst.simm16; + context.fpr[inst.frd] = ppu_feed_data(context, addr); +} +void DECODER(LFS) { LFS(context, inst); } +EXPORT_SEMANTIC(LFS); + +void SEMANTIC(LFSU)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + inst.simm16; + context.fpr[inst.frd] = ppu_feed_data(context, addr); + context.gpr[inst.ra] = addr; +} +void DECODER(LFSU) { LFSU(context, inst); } +EXPORT_SEMANTIC(LFSU); + +void SEMANTIC(LFD)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + inst.simm16 : inst.simm16; + context.fpr[inst.frd] = ppu_feed_data(context, addr); +} +void DECODER(LFD) { LFD(context, inst); } +EXPORT_SEMANTIC(LFD); + +void SEMANTIC(LFDU)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + inst.simm16; + context.fpr[inst.frd] = ppu_feed_data(context, addr); + context.gpr[inst.ra] = addr; +} +void DECODER(LFDU) { LFDU(context, inst); } +EXPORT_SEMANTIC(LFDU); + +void SEMANTIC(STFS)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? 
context.gpr[inst.ra] + inst.simm16 : inst.simm16; + vm::write(vm::cast(addr), static_cast(context.fpr[inst.frs])); +} +void DECODER(STFS) { STFS(context, inst); } +EXPORT_SEMANTIC(STFS); + +void SEMANTIC(STFSU)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + inst.simm16; + vm::write(vm::cast(addr), static_cast(context.fpr[inst.frs])); + context.gpr[inst.ra] = addr; +} +void DECODER(STFSU) { STFSU(context, inst); } +EXPORT_SEMANTIC(STFSU); + +void SEMANTIC(STFD)(PPUContext &context, Instruction inst) { + const u64 addr = inst.ra ? context.gpr[inst.ra] + inst.simm16 : inst.simm16; + vm::write(vm::cast(addr), context.fpr[inst.frs]); +} +void DECODER(STFD) { STFD(context, inst); } +EXPORT_SEMANTIC(STFD); + +void SEMANTIC(STFDU)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + inst.simm16; + vm::write(vm::cast(addr), context.fpr[inst.frs]); + context.gpr[inst.ra] = addr; +} +void DECODER(STFDU) { STFDU(context, inst); } +EXPORT_SEMANTIC(STFDU); + +void SEMANTIC(LD)(PPUContext &context, Instruction inst) { + const u64 addr = (inst.simm16 & ~3) + (inst.ra ? context.gpr[inst.ra] : 0); + context.gpr[inst.rd] = ppu_feed_data(context, addr); +} +void DECODER(LD) { LD(context, inst); } +EXPORT_SEMANTIC(LD); + +void SEMANTIC(LDU)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + (inst.simm16 & ~3); + context.gpr[inst.rd] = ppu_feed_data(context, addr); + context.gpr[inst.ra] = addr; +} +void DECODER(LDU) { LDU(context, inst); } +EXPORT_SEMANTIC(LDU); + +void SEMANTIC(LWA)(PPUContext &context, Instruction inst) { + const u64 addr = (inst.simm16 & ~3) + (inst.ra ? context.gpr[inst.ra] : 0); + context.gpr[inst.rd] = ppu_feed_data(context, addr); +} +void DECODER(LWA) { LWA(context, inst); } +EXPORT_SEMANTIC(LWA); + +void SEMANTIC(STD)(PPUContext &context, Instruction inst) { + const u64 addr = (inst.simm16 & ~3) + (inst.ra ? 
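+  // LD/LDU/LWA/STD/STDU are DS-form: the low two bits of the displacement
+  // field belong to the extended opcode, so the displacement is always a
+  // multiple of 4 and is masked with ~3 before the base is added.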
context.gpr[inst.ra] : 0); + vm::write(vm::cast(addr), context.gpr[inst.rs]); +} +void DECODER(STD) { STD(context, inst); } +EXPORT_SEMANTIC(STD); + +void SEMANTIC(STDU)(PPUContext &context, Instruction inst) { + const u64 addr = context.gpr[inst.ra] + (inst.simm16 & ~3); + vm::write(vm::cast(addr), context.gpr[inst.rs]); + context.gpr[inst.ra] = addr; +} +void DECODER(STDU) { STDU(context, inst); } +EXPORT_SEMANTIC(STDU); + +static void ppu_set_fpcc(PPUContext &context, bool updateCr, f64 a, f64 b, + u64 cr_field = 1) { + static_assert(std::endian::native == std::endian::little, "Not implemented"); + + bool fpcc[4]; +#if defined(ARCH_X64) && !defined(_M_X64) + __asm__("comisd %[b], %[a]\n" + : "=@ccb"(fpcc[0]), "=@cca"(fpcc[1]), "=@ccz"(fpcc[2]), + "=@ccp"(fpcc[3]) + : [a] "x"(a), [b] "x"(b) + : "cc"); + if (fpcc[3]) [[unlikely]] { + fpcc[0] = fpcc[1] = fpcc[2] = false; + } +#else + const auto cmp = a <=> b; + fpcc[0] = cmp == std::partial_ordering::less; + fpcc[1] = cmp == std::partial_ordering::greater; + fpcc[2] = cmp == std::partial_ordering::equivalent; + fpcc[3] = cmp == std::partial_ordering::unordered; +#endif + + auto data = std::bit_cast(fpcc); + + // Write FPCC + context.fpscr.fields[4] = data; + + if (updateCr) { + // Previous behaviour was throwing an exception; TODO + context.cr.fields[cr_field] = data; + } +} + +void SEMANTIC(FDIVS)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = f32(context.fpr[inst.fra] / context.fpr[inst.frb]); + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FDIVS) { FDIVS(context, inst); } +EXPORT_SEMANTIC(FDIVS); + +void SEMANTIC(FSUBS)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = f32(context.fpr[inst.fra] - context.fpr[inst.frb]); + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FSUBS) { FSUBS(context, inst); } +EXPORT_SEMANTIC(FSUBS); + +void SEMANTIC(FADDS)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = f32(context.fpr[inst.fra] + context.fpr[inst.frb]); + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FADDS) { FADDS(context, inst); } +EXPORT_SEMANTIC(FADDS); + +void SEMANTIC(FSQRTS)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = f32(std::sqrt(context.fpr[inst.frb])); + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FSQRTS) { FSQRTS(context, inst); } +EXPORT_SEMANTIC(FSQRTS); + +void SEMANTIC(FRES)(PPUContext &context, Instruction inst) { + const f64 a = context.fpr[inst.frb]; + const u64 b = std::bit_cast(a); + const u64 e = (b >> 52) & 0x7ff; // double exp + const u64 i = (b >> 45) & 0x7f; // mantissa LUT index + const u64 r = e >= (0x3ff + 0x80) + ? 0 + : (0x7ff - 2 - e) << 52 | u64{ppu_fres_mantissas[i]} + << (32 - 3); + + context.fpr[inst.frd] = f32(std::bit_cast( + a == a ? 
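+  // ppu_set_fpcc above derives the FPCC bits (FL, FG, FE, FU: less, greater,
+  // equal, unordered) from one comparison, writes them to the FPSCR and,
+  // when requested, mirrors them into a CR field. Here in FRES, "a == a" is
+  // the usual NaN test: a NaN input skips the reciprocal-estimate table and
+  // is propagated as a quiet NaN instead.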
(b & 0x8000'0000'0000'0000) | r : (0x8'0000'0000'0000 | b))); + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FRES) { FRES(context, inst); } +EXPORT_SEMANTIC(FRES); + +void SEMANTIC(FMULS)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = f32(context.fpr[inst.fra] * context.fpr[inst.frc]); + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FMULS) { FMULS(context, inst); } +EXPORT_SEMANTIC(FMULS); + +void SEMANTIC(FMADDS)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = f32(std::fma( + context.fpr[inst.fra], context.fpr[inst.frc], context.fpr[inst.frb])); + + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FMADDS) { FMADDS(context, inst); } +EXPORT_SEMANTIC(FMADDS); + +void SEMANTIC(FMSUBS)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = f32(std::fma( + context.fpr[inst.fra], context.fpr[inst.frc], -context.fpr[inst.frb])); + + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FMSUBS) { FMSUBS(context, inst); } +EXPORT_SEMANTIC(FMSUBS); + +void SEMANTIC(FNMSUBS)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = f32(-std::fma( + context.fpr[inst.fra], context.fpr[inst.frc], -context.fpr[inst.frb])); + + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FNMSUBS) { FNMSUBS(context, inst); } +EXPORT_SEMANTIC(FNMSUBS); + +void SEMANTIC(FNMADDS)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = f32(-std::fma( + context.fpr[inst.fra], context.fpr[inst.frc], context.fpr[inst.frb])); + + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FNMADDS) { FNMADDS(context, inst); } +EXPORT_SEMANTIC(FNMADDS); + +void SEMANTIC(MTFSB1)(PPUContext &context, Instruction inst) { + const u32 bit = inst.crbd; + context.fpscr.bits[bit] = 1; + context.cr.fields[1].set(context.fpscr.fg, context.fpscr.fl, context.fpscr.fe, + context.fpscr.fu); +} +void DECODER(MTFSB1) { MTFSB1(context, inst); } +EXPORT_SEMANTIC(MTFSB1); + +void SEMANTIC(MCRFS)(PPUContext &context, Instruction inst) { + std::memcpy(context.cr.fields + inst.crfd, context.fpscr.fields + inst.crfs, + sizeof(u32)); +} +void DECODER(MCRFS) { MCRFS(context, inst); } +EXPORT_SEMANTIC(MCRFS); + +void SEMANTIC(MTFSB0)(PPUContext &context, Instruction inst) { + const u32 bit = inst.crbd; + // if (bit < 16 || bit > 19) + // ppu_log.warning("MTFSB0(%d)", bit); + context.fpscr.bits[bit] = 0; + context.cr.fields[1].set(context.fpscr.fg, context.fpscr.fl, context.fpscr.fe, + context.fpscr.fu); +} +void DECODER(MTFSB0) { MTFSB0(context, inst); } +EXPORT_SEMANTIC(MTFSB0); + +void SEMANTIC(MTFSFI)(PPUContext &context, Instruction inst) { + const u32 bf = inst.crfd; + + if (bf != 4) { + // Do nothing on non-FPCC field (TODO) + // ppu_log.warning("MTFSFI(%d)", inst.crfd); + } else { + static constexpr auto all_values = [] { + std::array values{}; + + for (u32 i = 0; i < values.size(); i++) { + u32 value = 0, im = i; + value |= (im & 1) << (8 * 3); + im >>= 1; + value |= (im & 1) << (8 * 2); + im >>= 1; + value |= (im & 1) << (8 * 1); + im >>= 1; + value |= (im & 1) << (8 * 0); + values[i] = std::bit_cast(value); + } + + return values; + }(); + + context.fpscr.fields[bf] = all_values[inst.i]; + } + + if (inst.rc) { + context.cr.fields[1].set(context.fpscr.fg, context.fpscr.fl, + context.fpscr.fe, context.fpscr.fu); + } +} +void DECODER(MTFSFI) { MTFSFI(context, inst); } +EXPORT_SEMANTIC(MTFSFI); + +void SEMANTIC(MFFS)(PPUContext 
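+// FPSCR modelling here is deliberately partial: MTFSB0/MTFSB1/MTFSFI above
+// only track the bits the emulated code actually consumes (chiefly FPCC),
+// and MFFS reconstructs its result from those same four condition bits
+// rather than from a full FPSCR image.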
&context, Instruction inst) { + // ppu_log.warning("MFFS"); + context.fpr[inst.frd] = std::bit_cast( + u64{context.fpscr.fl} << 15 | u64{context.fpscr.fg} << 14 | + u64{context.fpscr.fe} << 13 | u64{context.fpscr.fu} << 12); + if (inst.rc) { + context.cr.fields[1].set(context.fpscr.fg, context.fpscr.fl, + context.fpscr.fe, context.fpscr.fu); + } +} +void DECODER(MFFS) { MFFS(context, inst); } +EXPORT_SEMANTIC(MFFS); + +void SEMANTIC(MTFSF)(PPUContext &context, Instruction inst) { + if (inst.rc) { + context.cr.fields[1].set(context.fpscr.fg, context.fpscr.fl, + context.fpscr.fe, context.fpscr.fu); + } +} +void DECODER(MTFSF) { MTFSF(context, inst); } +EXPORT_SEMANTIC(MTFSF); + +void SEMANTIC(FCMPU)(PPUContext &context, Instruction inst) { + const f64 a = context.fpr[inst.fra]; + const f64 b = context.fpr[inst.frb]; + ppu_set_fpcc(context, true, a, b, inst.crfd); +} +void DECODER(FCMPU) { FCMPU(context, inst); } +EXPORT_SEMANTIC(FCMPU); + +void SEMANTIC(FCTIW)(PPUContext &context, Instruction inst, f64 &d, f64 b) { +#if defined(ARCH_X64) + const auto val = _mm_set_sd(b); + const auto res = _mm_xor_si128( + _mm_cvtpd_epi32(val), + _mm_castpd_si128(_mm_cmpge_pd(val, _mm_set1_pd(0x80000000)))); + d = std::bit_cast(_mm_cvtsi128_si32(res)); +#elif defined(ARCH_ARM64) + d = std::bit_cast(!(b == b) + ? INT32_MIN + : vqmovnd_s64(std::bit_cast(vrndi_f64( + std::bit_cast(b))))); +#endif + ppu_set_fpcc(context, inst.rc, 0., 0.); // undefined (TODO) +} +void DECODER(FCTIW) { + FCTIW(context, inst, context.fpr[inst.frd], context.fpr[inst.frb]); +} +EXPORT_SEMANTIC(FCTIW); + +void SEMANTIC(FCTIWZ)(PPUContext &context, Instruction inst, f64 &d, f64 b) { +#if defined(ARCH_X64) + const auto val = _mm_set_sd(b); + const auto res = _mm_xor_si128( + _mm_cvttpd_epi32(val), + _mm_castpd_si128(_mm_cmpge_pd(val, _mm_set1_pd(0x80000000)))); + d = std::bit_cast(_mm_cvtsi128_si32(res)); +#elif defined(ARCH_ARM64) + d = std::bit_cast(!(b == b) + ? 
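+  // In the x86 path of FCTIW/FCTIWZ, cvt(t)pd_epi32 already yields
+  // 0x80000000 for NaN and out-of-range inputs; XOR-ing with the mask from
+  // cmpge_pd(val, 2^31) flips that to 0x7fffffff for large positive values,
+  // matching the PPU's saturation to INT32_MAX. The ARM64 path relies on a
+  // saturating narrow to 32 bits for the same effect.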
INT32_MIN + : vqmovnd_s64(std::bit_cast(vcvt_s64_f64( + std::bit_cast(b))))); +#endif + ppu_set_fpcc(context, inst.rc, 0., 0.); // undefined (TODO) +} +void DECODER(FCTIWZ) { + FCTIWZ(context, inst, context.fpr[inst.frd], context.fpr[inst.frb]); +} +EXPORT_SEMANTIC(FCTIWZ); + +void SEMANTIC(FRSP)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = f32(context.fpr[inst.frb]); + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FRSP) { FRSP(context, inst); } +EXPORT_SEMANTIC(FRSP); + +void SEMANTIC(FDIV)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = context.fpr[inst.fra] / context.fpr[inst.frb]; + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FDIV) { FDIV(context, inst); } +EXPORT_SEMANTIC(FDIV); + +void SEMANTIC(FSUB)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = context.fpr[inst.fra] - context.fpr[inst.frb]; + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FSUB) { FSUB(context, inst); } +EXPORT_SEMANTIC(FSUB); + +void SEMANTIC(FADD)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = context.fpr[inst.fra] + context.fpr[inst.frb]; + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FADD) { FADD(context, inst); } +EXPORT_SEMANTIC(FADD); + +void SEMANTIC(FSQRT)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = std::sqrt(context.fpr[inst.frb]); + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FSQRT) { FSQRT(context, inst); } +EXPORT_SEMANTIC(FSQRT); + +void SEMANTIC(FSEL)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = context.fpr[inst.fra] >= 0.0 ? context.fpr[inst.frc] + : context.fpr[inst.frb]; + if (inst.rc) { + context.cr.fields[1].set(context.fpscr.fg, context.fpscr.fl, + context.fpscr.fe, context.fpscr.fu); + } +} +void DECODER(FSEL) { FSEL(context, inst); } +EXPORT_SEMANTIC(FSEL); + +void SEMANTIC(FMUL)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = context.fpr[inst.fra] * context.fpr[inst.frc]; + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FMUL) { FMUL(context, inst); } +EXPORT_SEMANTIC(FMUL); + +void SEMANTIC(FRSQRTE)(PPUContext &context, Instruction inst) { + const u64 b = std::bit_cast(context.fpr[inst.frb]); + context.fpr[inst.frd] = + std::bit_cast(u64{ppu_frqrte_lut.data[b >> 49]} << 32); + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FRSQRTE) { FRSQRTE(context, inst); } +EXPORT_SEMANTIC(FRSQRTE); + +void SEMANTIC(FMSUB)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = std::fma(context.fpr[inst.fra], context.fpr[inst.frc], + -context.fpr[inst.frb]); + + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FMSUB) { FMSUB(context, inst); } +EXPORT_SEMANTIC(FMSUB); + +void SEMANTIC(FMADD)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = std::fma(context.fpr[inst.fra], context.fpr[inst.frc], + context.fpr[inst.frb]); + + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FMADD) { FMADD(context, inst); } +EXPORT_SEMANTIC(FMADD); + +void SEMANTIC(FNMSUB)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = -std::fma( + context.fpr[inst.fra], context.fpr[inst.frc], -context.fpr[inst.frb]); + + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FNMSUB) { FNMSUB(context, inst); } +EXPORT_SEMANTIC(FNMSUB); + +void SEMANTIC(FNMADD)(PPUContext 
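+// The double-precision arithmetic above maps directly onto host operations.
+// The *MADD/*MSUB family goes through std::fma so the intermediate product
+// is not rounded, matching the PPU's fused multiply-add, while FRES and
+// FRSQRTE are table-driven estimates rather than exact results, much like
+// the real hardware.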
&context, Instruction inst) { + context.fpr[inst.frd] = -std::fma( + context.fpr[inst.fra], context.fpr[inst.frc], context.fpr[inst.frb]); + + ppu_set_fpcc(context, inst.rc, context.fpr[inst.frd], 0.); +} +void DECODER(FNMADD) { FNMADD(context, inst); } +EXPORT_SEMANTIC(FNMADD); + +void SEMANTIC(FCMPO)(PPUContext &context, Instruction inst) { + const f64 a = context.fpr[inst.fra]; + const f64 b = context.fpr[inst.frb]; + ppu_set_fpcc(context, true, a, b, inst.crfd); +} +void DECODER(FCMPO) { FCMPO(context, inst); } +EXPORT_SEMANTIC(FCMPO); + +void SEMANTIC(FNEG)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = -context.fpr[inst.frb]; + if (inst.rc) { + context.cr.fields[1].set(context.fpscr.fg, context.fpscr.fl, + context.fpscr.fe, context.fpscr.fu); + } +} +void DECODER(FNEG) { FNEG(context, inst); } +EXPORT_SEMANTIC(FNEG); + +void SEMANTIC(FMR)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = context.fpr[inst.frb]; + if (inst.rc) { + context.cr.fields[1].set(context.fpscr.fg, context.fpscr.fl, + context.fpscr.fe, context.fpscr.fu); + } +} +void DECODER(FMR) { FMR(context, inst); } +EXPORT_SEMANTIC(FMR); + +void SEMANTIC(FNABS)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = -std::fabs(context.fpr[inst.frb]); + if (inst.rc) { + context.cr.fields[1].set(context.fpscr.fg, context.fpscr.fl, + context.fpscr.fe, context.fpscr.fu); + } +} +void DECODER(FNABS) { FNABS(context, inst); } +EXPORT_SEMANTIC(FNABS); + +void SEMANTIC(FABS)(PPUContext &context, Instruction inst) { + context.fpr[inst.frd] = std::fabs(context.fpr[inst.frb]); + if (inst.rc) { + context.cr.fields[1].set(context.fpscr.fg, context.fpscr.fl, + context.fpscr.fe, context.fpscr.fu); + } +} +void DECODER(FABS) { FABS(context, inst); } +EXPORT_SEMANTIC(FABS); + +void SEMANTIC(FCTID)(PPUContext &context, Instruction inst, f64 &d, f64 b) { +#if defined(ARCH_X64) + const auto val = _mm_set_sd(b); + const auto res = _mm_xor_si128( + _mm_set1_epi64x(_mm_cvtsd_si64(val)), + _mm_castpd_si128(_mm_cmpge_pd(val, _mm_set1_pd(f64(1ull << 63))))); + d = std::bit_cast(_mm_cvtsi128_si64(res)); +#elif defined(ARCH_ARM64) + d = std::bit_cast( + !(b == b) ? f64{INT64_MIN} + : std::bit_cast(vrndi_f64(std::bit_cast(b)))); +#endif + ppu_set_fpcc(context, inst.rc, 0., 0.); // undefined (TODO) +} +void DECODER(FCTID) { + FCTID(context, inst, context.fpr[inst.frd], context.fpr[inst.frb]); +} +EXPORT_SEMANTIC(FCTID); + +void SEMANTIC(FCTIDZ)(PPUContext &context, Instruction inst, f64 &d, f64 b) { +#if defined(ARCH_X64) + const auto val = _mm_set_sd(b); + const auto res = _mm_xor_si128( + _mm_set1_epi64x(_mm_cvttsd_si64(val)), + _mm_castpd_si128(_mm_cmpge_pd(val, _mm_set1_pd(f64(1ull << 63))))); + d = std::bit_cast(_mm_cvtsi128_si64(res)); +#elif defined(ARCH_ARM64) + d = std::bit_cast(!(b == b) + ? 
int64x1_t{INT64_MIN} + : vcvt_s64_f64(std::bit_cast(b))); +#endif + ppu_set_fpcc(context, inst.rc, 0., 0.); // undefined (TODO) +} +void DECODER(FCTIDZ) { + FCTIDZ(context, inst, context.fpr[inst.frd], context.fpr[inst.frb]); +} +EXPORT_SEMANTIC(FCTIDZ); + +void SEMANTIC(FCFID)(PPUContext &context, Instruction inst, f64 &d, f64 b) { + f64 r = static_cast(std::bit_cast(b)); + d = r; + ppu_set_fpcc(context, inst.rc, r, 0.); +} +void DECODER(FCFID) { + FCFID(context, inst, context.fpr[inst.frd], context.fpr[inst.frb]); +} +EXPORT_SEMANTIC(FCFID); + +void SEMANTIC(RFID)() { rpcsx_unimplemented_instruction(); } +void DECODER(RFID) { RFID(); } +EXPORT_SEMANTIC(RFID); + +void SEMANTIC(RFSCV)() { rpcsx_unimplemented_instruction(); } +void DECODER(RFSCV) { RFSCV(); } +EXPORT_SEMANTIC(RFSCV); + +void SEMANTIC(HRFID)() { rpcsx_unimplemented_instruction(); } +void DECODER(HRFID) { HRFID(); } +EXPORT_SEMANTIC(HRFID); + +void SEMANTIC(STOP)() { rpcsx_unimplemented_instruction(); } +void DECODER(STOP) { STOP(); } +EXPORT_SEMANTIC(STOP); + +void SEMANTIC(URFID)() { rpcsx_unimplemented_instruction(); } +void DECODER(URFID) { URFID(); } +EXPORT_SEMANTIC(URFID); + +void SEMANTIC(SUBFCO)() { rpcsx_unimplemented_instruction(); } +void DECODER(SUBFCO) { SUBFCO(); } +EXPORT_SEMANTIC(SUBFCO); + +void SEMANTIC(ADDCO)() { rpcsx_unimplemented_instruction(); } +void DECODER(ADDCO) { ADDCO(); } +EXPORT_SEMANTIC(ADDCO); + +void SEMANTIC(UNK)() { rpcsx_unimplemented_instruction(); } +void DECODER(UNK) { UNK(); } +EXPORT_SEMANTIC(UNK); + +void SEMANTIC(SUBFEO)() { rpcsx_unimplemented_instruction(); } +void DECODER(SUBFEO) { SUBFEO(); } +EXPORT_SEMANTIC(SUBFEO); + +void SEMANTIC(ADDEO)() { rpcsx_unimplemented_instruction(); } +void DECODER(ADDEO) { ADDEO(); } +EXPORT_SEMANTIC(ADDEO); + +void SEMANTIC(SUBFO)() { rpcsx_unimplemented_instruction(); } +void DECODER(SUBFO) { SUBFO(); } +EXPORT_SEMANTIC(SUBFO); + +void SEMANTIC(NEGO)() { rpcsx_unimplemented_instruction(); } +void DECODER(NEGO) { NEGO(); } +EXPORT_SEMANTIC(NEGO); + +void SEMANTIC(SUBFMEO)() { rpcsx_unimplemented_instruction(); } +void DECODER(SUBFMEO) { SUBFMEO(); } +EXPORT_SEMANTIC(SUBFMEO); + +void SEMANTIC(MULLDO)() { rpcsx_unimplemented_instruction(); } +void DECODER(MULLDO) { MULLDO(); } +EXPORT_SEMANTIC(MULLDO); + +void SEMANTIC(SUBFZEO)() { rpcsx_unimplemented_instruction(); } +void DECODER(SUBFZEO) { SUBFZEO(); } +EXPORT_SEMANTIC(SUBFZEO); + +void SEMANTIC(ADDZEO)() { rpcsx_unimplemented_instruction(); } +void DECODER(ADDZEO) { ADDZEO(); } +EXPORT_SEMANTIC(ADDZEO); + +void SEMANTIC(ADDO)() { rpcsx_unimplemented_instruction(); } +void DECODER(ADDO) { ADDO(); } +EXPORT_SEMANTIC(ADDO); + +void SEMANTIC(DIVDUO)() { rpcsx_unimplemented_instruction(); } +void DECODER(DIVDUO) { DIVDUO(); } +EXPORT_SEMANTIC(DIVDUO); + +void SEMANTIC(ADDMEO)() { rpcsx_unimplemented_instruction(); } +void DECODER(ADDMEO) { ADDMEO(); } +EXPORT_SEMANTIC(ADDMEO); + +void SEMANTIC(MULLWO)() { rpcsx_unimplemented_instruction(); } +void DECODER(MULLWO) { MULLWO(); } +EXPORT_SEMANTIC(MULLWO); + +void SEMANTIC(DIVWO)() { rpcsx_unimplemented_instruction(); } +void DECODER(DIVWO) { DIVWO(); } +EXPORT_SEMANTIC(DIVWO); + +void SEMANTIC(DIVWUO)() { rpcsx_unimplemented_instruction(); } +void DECODER(DIVWUO) { DIVWUO(); } +EXPORT_SEMANTIC(DIVWUO); + +void SEMANTIC(DIVDO)() { rpcsx_unimplemented_instruction(); } +void DECODER(DIVDO) { DIVDO(); } +EXPORT_SEMANTIC(DIVDO); diff --git a/rpcsx/cpu/cell/ppu/src/Decoder.cpp b/rpcsx/cpu/cell/ppu/src/Decoder.cpp new file mode 100644 index 
000000000..11f49f6ed --- /dev/null +++ b/rpcsx/cpu/cell/ppu/src/Decoder.cpp @@ -0,0 +1,501 @@ +#include "Decoder.hpp" +#include "Instruction.hpp" +#include "Opcode.hpp" +#include +#include + +struct InstructionEncodingInfo { + std::uint32_t value; + rx::cell::ppu::Opcode opcode; + rx::cell::ppu::Opcode rcOpcode; + std::uint32_t magn = 0; + + constexpr InstructionEncodingInfo(std::uint32_t value, + rx::cell::ppu::Opcode opcode, + rx::cell::ppu::Opcode rcOpcode) + : value(value), opcode(opcode), rcOpcode(rcOpcode) {} + + constexpr InstructionEncodingInfo(std::uint32_t value, + rx::cell::ppu::Opcode opcode, + rx::cell::ppu::Opcode rcOpcode, + std::uint32_t magn) + : value(value), opcode(opcode), rcOpcode(rcOpcode), magn(magn) {} +}; + +static constexpr rx::cell::ppu::DecoderTable +buildOpcodeTable() { + // Main opcodes (field 0..5) + rx::cell::ppu::DecoderTable result; + result.fill(rx::cell::ppu::Opcode::Invalid); + + auto fill_table = + [&](std::uint32_t main_op, std::uint32_t count, std::uint32_t sh, + std::initializer_list entries) noexcept { + if (sh < 11) { + for (const auto &v : entries) { + for (std::uint32_t i = 0; i < 1u << (v.magn + (11 - sh - count)); + i++) { + for (std::uint32_t j = 0; j < 1u << sh; j++) { + const std::uint32_t k = + (((i << (count - v.magn)) | v.value) << sh) | j; + result[(k << 6) | main_op] = i & 1 ? v.rcOpcode : v.opcode; + } + } + } + } else { + // Main table (special case) + for (const auto &v : entries) { + for (std::uint32_t i = 0; i < 1u << 11; i++) { + result[i << 6 | v.value] = i & 1 ? v.rcOpcode : v.opcode; + } + } + } + }; + +#define GET(name) rx::cell::ppu::Opcode::name, rx::cell::ppu::Opcode::name +#define GETRC(name) rx::cell::ppu::Opcode::name, rx::cell::ppu::Opcode::name##_ + + fill_table( + 0x00, 6, -1, + { + {0x02, GET(TDI)}, {0x03, GET(TWI)}, {0x07, GET(MULLI)}, + {0x08, GET(SUBFIC)}, {0x0a, GET(CMPLI)}, {0x0b, GET(CMPI)}, + {0x0c, GET(ADDIC)}, {0x0d, GET(ADDIC)}, {0x0e, GET(ADDI)}, + {0x0f, GET(ADDIS)}, {0x10, GET(BC)}, {0x11, GET(SC)}, + {0x12, GET(B)}, {0x14, GETRC(RLWIMI)}, {0x15, GETRC(RLWINM)}, + {0x17, GETRC(RLWNM)}, {0x18, GET(ORI)}, {0x19, GET(ORIS)}, + {0x1a, GET(XORI)}, {0x1b, GET(XORIS)}, {0x1c, GET(ANDI)}, + {0x1d, GET(ANDIS)}, {0x20, GET(LWZ)}, {0x21, GET(LWZU)}, + {0x22, GET(LBZ)}, {0x23, GET(LBZU)}, {0x24, GET(STW)}, + {0x25, GET(STWU)}, {0x26, GET(STB)}, {0x27, GET(STBU)}, + {0x28, GET(LHZ)}, {0x29, GET(LHZU)}, {0x2a, GET(LHA)}, + {0x2b, GET(LHAU)}, {0x2c, GET(STH)}, {0x2d, GET(STHU)}, + {0x2e, GET(LMW)}, {0x2f, GET(STMW)}, {0x30, GET(LFS)}, + {0x31, GET(LFSU)}, {0x32, GET(LFD)}, {0x33, GET(LFDU)}, + {0x34, GET(STFS)}, {0x35, GET(STFSU)}, {0x36, GET(STFD)}, + {0x37, GET(STFDU)}, + }); + + // Group 0x04 opcodes (field 21..31) + fill_table(0x04, 11, 0, + { + {0x0, GET(VADDUBM)}, {0x2, GET(VMAXUB)}, + {0x4, GET(VRLB)}, {0x006, GET(VCMPEQUB)}, + {0x406, GET(VCMPEQUB_)}, {0x8, GET(VMULOUB)}, + {0xa, GET(VADDFP)}, {0xc, GET(VMRGHB)}, + {0xe, GET(VPKUHUM)}, + + {0x20, GET(VMHADDSHS), 5}, {0x21, GET(VMHRADDSHS), 5}, + {0x22, GET(VMLADDUHM), 5}, {0x24, GET(VMSUMUBM), 5}, + {0x25, GET(VMSUMMBM), 5}, {0x26, GET(VMSUMUHM), 5}, + {0x27, GET(VMSUMUHS), 5}, {0x28, GET(VMSUMSHM), 5}, + {0x29, GET(VMSUMSHS), 5}, {0x2a, GET(VSEL), 5}, + {0x2b, GET(VPERM), 5}, {0x2c, GET(VSLDOI), 5}, + {0x2e, GET(VMADDFP), 5}, {0x2f, GET(VNMSUBFP), 5}, + + {0x40, GET(VADDUHM)}, {0x42, GET(VMAXUH)}, + {0x44, GET(VRLH)}, {0x046, GET(VCMPEQUH)}, + {0x446, GET(VCMPEQUH_)}, {0x48, GET(VMULOUH)}, + {0x4a, GET(VSUBFP)}, {0x4c, GET(VMRGHH)}, + {0x4e, GET(VPKUWUM)}, 
{0x80, GET(VADDUWM)}, + {0x82, GET(VMAXUW)}, {0x84, GET(VRLW)}, + {0x086, GET(VCMPEQUW)}, {0x486, GET(VCMPEQUW_)}, + {0x8c, GET(VMRGHW)}, {0x8e, GET(VPKUHUS)}, + {0x0c6, GET(VCMPEQFP)}, {0x4c6, GET(VCMPEQFP_)}, + {0xce, GET(VPKUWUS)}, + + {0x102, GET(VMAXSB)}, {0x104, GET(VSLB)}, + {0x108, GET(VMULOSB)}, {0x10a, GET(VREFP)}, + {0x10c, GET(VMRGLB)}, {0x10e, GET(VPKSHUS)}, + {0x142, GET(VMAXSH)}, {0x144, GET(VSLH)}, + {0x148, GET(VMULOSH)}, {0x14a, GET(VRSQRTEFP)}, + {0x14c, GET(VMRGLH)}, {0x14e, GET(VPKSWUS)}, + {0x180, GET(VADDCUW)}, {0x182, GET(VMAXSW)}, + {0x184, GET(VSLW)}, {0x18a, GET(VEXPTEFP)}, + {0x18c, GET(VMRGLW)}, {0x18e, GET(VPKSHSS)}, + {0x1c4, GET(VSL)}, {0x1c6, GET(VCMPGEFP)}, + {0x5c6, GET(VCMPGEFP_)}, {0x1ca, GET(VLOGEFP)}, + {0x1ce, GET(VPKSWSS)}, {0x200, GET(VADDUBS)}, + {0x202, GET(VMINUB)}, {0x204, GET(VSRB)}, + {0x206, GET(VCMPGTUB)}, {0x606, GET(VCMPGTUB_)}, + {0x208, GET(VMULEUB)}, {0x20a, GET(VRFIN)}, + {0x20c, GET(VSPLTB)}, {0x20e, GET(VUPKHSB)}, + {0x240, GET(VADDUHS)}, {0x242, GET(VMINUH)}, + {0x244, GET(VSRH)}, {0x246, GET(VCMPGTUH)}, + {0x646, GET(VCMPGTUH_)}, {0x248, GET(VMULEUH)}, + {0x24a, GET(VRFIZ)}, {0x24c, GET(VSPLTH)}, + {0x24e, GET(VUPKHSH)}, {0x280, GET(VADDUWS)}, + {0x282, GET(VMINUW)}, {0x284, GET(VSRW)}, + {0x286, GET(VCMPGTUW)}, {0x686, GET(VCMPGTUW_)}, + {0x28a, GET(VRFIP)}, {0x28c, GET(VSPLTW)}, + {0x28e, GET(VUPKLSB)}, {0x2c4, GET(VSR)}, + {0x2c6, GET(VCMPGTFP)}, {0x6c6, GET(VCMPGTFP_)}, + {0x2ca, GET(VRFIM)}, {0x2ce, GET(VUPKLSH)}, + {0x300, GET(VADDSBS)}, {0x302, GET(VMINSB)}, + {0x304, GET(VSRAB)}, {0x306, GET(VCMPGTSB)}, + {0x706, GET(VCMPGTSB_)}, {0x308, GET(VMULESB)}, + {0x30a, GET(VCFUX)}, {0x30c, GET(VSPLTISB)}, + {0x30e, GET(VPKPX)}, {0x340, GET(VADDSHS)}, + {0x342, GET(VMINSH)}, {0x344, GET(VSRAH)}, + {0x346, GET(VCMPGTSH)}, {0x746, GET(VCMPGTSH_)}, + {0x348, GET(VMULESH)}, {0x34a, GET(VCFSX)}, + {0x34c, GET(VSPLTISH)}, {0x34e, GET(VUPKHPX)}, + {0x380, GET(VADDSWS)}, {0x382, GET(VMINSW)}, + {0x384, GET(VSRAW)}, {0x386, GET(VCMPGTSW)}, + {0x786, GET(VCMPGTSW_)}, {0x38a, GET(VCTUXS)}, + {0x38c, GET(VSPLTISW)}, {0x3c6, GET(VCMPBFP)}, + {0x7c6, GET(VCMPBFP_)}, {0x3ca, GET(VCTSXS)}, + {0x3ce, GET(VUPKLPX)}, {0x400, GET(VSUBUBM)}, + {0x402, GET(VAVGUB)}, {0x404, GET(VAND)}, + {0x40a, GET(VMAXFP)}, {0x40c, GET(VSLO)}, + {0x440, GET(VSUBUHM)}, {0x442, GET(VAVGUH)}, + {0x444, GET(VANDC)}, {0x44a, GET(VMINFP)}, + {0x44c, GET(VSRO)}, {0x480, GET(VSUBUWM)}, + {0x482, GET(VAVGUW)}, {0x484, GET(VOR)}, + {0x4c4, GET(VXOR)}, {0x502, GET(VAVGSB)}, + {0x504, GET(VNOR)}, {0x542, GET(VAVGSH)}, + {0x580, GET(VSUBCUW)}, {0x582, GET(VAVGSW)}, + {0x600, GET(VSUBUBS)}, {0x604, GET(MFVSCR)}, + {0x608, GET(VSUM4UBS)}, {0x640, GET(VSUBUHS)}, + {0x644, GET(MTVSCR)}, {0x648, GET(VSUM4SHS)}, + {0x680, GET(VSUBUWS)}, {0x688, GET(VSUM2SWS)}, + {0x700, GET(VSUBSBS)}, {0x708, GET(VSUM4SBS)}, + {0x740, GET(VSUBSHS)}, {0x780, GET(VSUBSWS)}, + {0x788, GET(VSUMSWS)}, + }); + + // Group 0x13 opcodes (field 21..30) + fill_table(0x13, 10, 1, + { + {0x000, GET(MCRF)}, + {0x010, GET(BCLR)}, + {0x012, GET(RFID)}, + {0x021, GET(CRNOR)}, + {0x052, GET(RFSCV)}, + {0x081, GET(CRANDC)}, + {0x096, GET(ISYNC)}, + {0x0c1, GET(CRXOR)}, + {0x0e1, GET(CRNAND)}, + {0x101, GET(CRAND)}, + {0x112, GET(HRFID)}, + {0x121, GET(CREQV)}, + {0x132, GET(URFID)}, + {0x172, GET(STOP)}, + {0x1a1, GET(CRORC)}, + {0x1c1, GET(CROR)}, + {0x210, GET(BCCTR)}, + }); + + // Group 0x1e opcodes (field 27..30) + fill_table(0x1e, 4, 1, + { + {0x0, GETRC(RLDICL)}, + {0x1, GETRC(RLDICL)}, + {0x2, GETRC(RLDICR)}, + 
{0x3, GETRC(RLDICR)}, + {0x4, GETRC(RLDIC)}, + {0x5, GETRC(RLDIC)}, + {0x6, GETRC(RLDIMI)}, + {0x7, GETRC(RLDIMI)}, + {0x8, GETRC(RLDCL)}, + {0x9, GETRC(RLDCR)}, + }); + + // Group 0x1f opcodes (field 21..30) + fill_table(0x1f, 10, 1, + { + {0x000, GET(CMP)}, {0x004, GET(TW)}, + {0x006, GET(LVSL)}, {0x007, GET(LVEBX)}, + {0x008, GETRC(SUBFC)}, {0x208, GETRC(SUBFCO)}, + {0x009, GETRC(MULHDU)}, {0x00a, GETRC(ADDC)}, + {0x20a, GETRC(ADDCO)}, {0x00b, GETRC(MULHWU)}, + {0x013, GET(MFOCRF)}, {0x014, GET(LWARX)}, + {0x015, GET(LDX)}, {0x017, GET(LWZX)}, + {0x018, GETRC(SLW)}, {0x01a, GETRC(CNTLZW)}, + {0x01b, GETRC(SLD)}, {0x01c, GETRC(AND)}, + {0x020, GET(CMPL)}, {0x026, GET(LVSR)}, + {0x027, GET(LVEHX)}, {0x028, GETRC(SUBF)}, + {0x228, GETRC(SUBFO)}, {0x035, GET(LDUX)}, + {0x036, GET(DCBST)}, {0x037, GET(LWZUX)}, + {0x03a, GETRC(CNTLZD)}, {0x03c, GETRC(ANDC)}, + {0x044, GET(TD)}, {0x047, GET(LVEWX)}, + {0x049, GETRC(MULHD)}, {0x04b, GETRC(MULHW)}, + {0x054, GET(LDARX)}, {0x056, GET(DCBF)}, + {0x057, GET(LBZX)}, {0x067, GET(LVX)}, + {0x068, GETRC(NEG)}, {0x268, GETRC(NEGO)}, + {0x077, GET(LBZUX)}, {0x07c, GETRC(NOR)}, + {0x087, GET(STVEBX)}, {0x088, GETRC(SUBFE)}, + {0x288, GETRC(SUBFEO)}, {0x08a, GETRC(ADDE)}, + {0x28a, GETRC(ADDEO)}, {0x090, GET(MTOCRF)}, + {0x095, GET(STDX)}, {0x096, GET(STWCX)}, + {0x097, GET(STWX)}, {0x0a7, GET(STVEHX)}, + {0x0b5, GET(STDUX)}, {0x0b7, GET(STWUX)}, + {0x0c7, GET(STVEWX)}, {0x0c8, GETRC(SUBFZE)}, + {0x2c8, GETRC(SUBFZEO)}, {0x0ca, GETRC(ADDZE)}, + {0x2ca, GETRC(ADDZEO)}, {0x0d6, GET(STDCX)}, + {0x0d7, GET(STBX)}, {0x0e7, GET(STVX)}, + {0x0e8, GETRC(SUBFME)}, {0x2e8, GETRC(SUBFMEO)}, + {0x0e9, GETRC(MULLD)}, {0x2e9, GETRC(MULLDO)}, + {0x0ea, GETRC(ADDME)}, {0x2ea, GETRC(ADDMEO)}, + {0x0eb, GETRC(MULLW)}, {0x2eb, GETRC(MULLWO)}, + {0x0f6, GET(DCBTST)}, {0x0f7, GET(STBUX)}, + {0x10a, GETRC(ADD)}, {0x30a, GETRC(ADDO)}, + {0x116, GET(DCBT)}, {0x117, GET(LHZX)}, + {0x11c, GETRC(EQV)}, {0x136, GET(ECIWX)}, + {0x137, GET(LHZUX)}, {0x13c, GETRC(XOR)}, + {0x153, GET(MFSPR)}, {0x155, GET(LWAX)}, + {0x156, GET(DST)}, {0x157, GET(LHAX)}, + {0x167, GET(LVXL)}, {0x173, GET(MFTB)}, + {0x175, GET(LWAUX)}, {0x176, GET(DSTST)}, + {0x177, GET(LHAUX)}, {0x197, GET(STHX)}, + {0x19c, GETRC(ORC)}, {0x1b6, GET(ECOWX)}, + {0x1b7, GET(STHUX)}, {0x1bc, GETRC(OR)}, + {0x1c9, GETRC(DIVDU)}, {0x3c9, GETRC(DIVDUO)}, + {0x1cb, GETRC(DIVWU)}, {0x3cb, GETRC(DIVWUO)}, + {0x1d3, GET(MTSPR)}, {0x1d6, GET(DCBI)}, + {0x1dc, GETRC(NAND)}, {0x1e7, GET(STVXL)}, + {0x1e9, GETRC(DIVD)}, {0x3e9, GETRC(DIVDO)}, + {0x1eb, GETRC(DIVW)}, {0x3eb, GETRC(DIVWO)}, + {0x207, GET(LVLX)}, {0x214, GET(LDBRX)}, + {0x215, GET(LSWX)}, {0x216, GET(LWBRX)}, + {0x217, GET(LFSX)}, {0x218, GETRC(SRW)}, + {0x21b, GETRC(SRD)}, {0x227, GET(LVRX)}, + {0x237, GET(LFSUX)}, {0x255, GET(LSWI)}, + {0x256, GET(SYNC)}, {0x257, GET(LFDX)}, + {0x277, GET(LFDUX)}, {0x287, GET(STVLX)}, + {0x294, GET(STDBRX)}, {0x295, GET(STSWX)}, + {0x296, GET(STWBRX)}, {0x297, GET(STFSX)}, + {0x2a7, GET(STVRX)}, {0x2b7, GET(STFSUX)}, + {0x2d5, GET(STSWI)}, {0x2d7, GET(STFDX)}, + {0x2f7, GET(STFDUX)}, {0x307, GET(LVLXL)}, + {0x316, GET(LHBRX)}, {0x318, GETRC(SRAW)}, + {0x31a, GETRC(SRAD)}, {0x327, GET(LVRXL)}, + {0x336, GET(DSS)}, {0x338, GETRC(SRAWI)}, + {0x33a, GETRC(SRADI)}, {0x33b, GETRC(SRADI)}, + {0x356, GET(EIEIO)}, {0x387, GET(STVLXL)}, + {0x396, GET(STHBRX)}, {0x39a, GETRC(EXTSH)}, + {0x3a7, GET(STVRXL)}, {0x3ba, GETRC(EXTSB)}, + {0x3d7, GET(STFIWX)}, {0x3da, GETRC(EXTSW)}, + {0x3d6, GET(ICBI)}, {0x3f6, GET(DCBZ)}, + }); + + // Group 0x3a opcodes 
(field 30..31) + fill_table(0x3a, 2, 0, + { + {0x0, GET(LD)}, + {0x1, GET(LDU)}, + {0x2, GET(LWA)}, + }); + + // Group 0x3b opcodes (field 21..30) + fill_table(0x3b, 10, 1, + { + {0x12, GETRC(FDIVS), 5}, + {0x14, GETRC(FSUBS), 5}, + {0x15, GETRC(FADDS), 5}, + {0x16, GETRC(FSQRTS), 5}, + {0x18, GETRC(FRES), 5}, + {0x19, GETRC(FMULS), 5}, + {0x1c, GETRC(FMSUBS), 5}, + {0x1d, GETRC(FMADDS), 5}, + {0x1e, GETRC(FNMSUBS), 5}, + {0x1f, GETRC(FNMADDS), 5}, + }); + + // Group 0x3e opcodes (field 30..31) + fill_table(0x3e, 2, 0, + { + {0x0, GET(STD)}, + {0x1, GET(STDU)}, + }); + + // Group 0x3f opcodes (field 21..30) + fill_table(0x3f, 10, 1, + { + {0x026, GETRC(MTFSB1)}, {0x040, GET(MCRFS)}, + {0x046, GETRC(MTFSB0)}, {0x086, GETRC(MTFSFI)}, + {0x247, GETRC(MFFS)}, {0x2c7, GETRC(MTFSF)}, + + {0x000, GET(FCMPU)}, {0x00c, GETRC(FRSP)}, + {0x00e, GETRC(FCTIW)}, {0x00f, GETRC(FCTIWZ)}, + + {0x012, GETRC(FDIV), 5}, {0x014, GETRC(FSUB), 5}, + {0x015, GETRC(FADD), 5}, {0x016, GETRC(FSQRT), 5}, + {0x017, GETRC(FSEL), 5}, {0x019, GETRC(FMUL), 5}, + {0x01a, GETRC(FRSQRTE), 5}, {0x01c, GETRC(FMSUB), 5}, + {0x01d, GETRC(FMADD), 5}, {0x01e, GETRC(FNMSUB), 5}, + {0x01f, GETRC(FNMADD), 5}, + + {0x020, GET(FCMPO)}, {0x028, GETRC(FNEG)}, + {0x048, GETRC(FMR)}, {0x088, GETRC(FNABS)}, + {0x108, GETRC(FABS)}, {0x32e, GETRC(FCTID)}, + {0x32f, GETRC(FCTIDZ)}, {0x34e, GETRC(FCFID)}, + }); + + return result; +} + +rx::cell::ppu::DecoderTable + rx::cell::ppu::g_ppuOpcodeTable = buildOpcodeTable(); + +rx::cell::ppu::Opcode rx::cell::ppu::fixOpcode(Opcode opcode, + std::uint32_t instruction) { + auto inst = std::bit_cast(instruction); + + if (opcode == Opcode::ADDI) { + if (inst.ra == 0) { + return Opcode::LI; + } + + return opcode; + } + + if (opcode == Opcode::ADDIS) { + if (inst.ra == 0) { + return Opcode::LIS; + } + + return opcode; + } + + if (opcode == Opcode::CRNOR) { + if (inst.crba == inst.crbb) { + return Opcode::CRNOT; + } + + return opcode; + } + + if (opcode == Opcode::B) { + if (inst.aa && inst.lk) { + return Opcode::BLA; + } else if (inst.lk) { + return Opcode::BL; + } else if (inst.aa) { + return Opcode::BA; + } + + return opcode; + } + + if (opcode == Opcode::ORI) { + if (inst.rs == 0 && inst.ra == 0 && inst.uimm16 == 0) { + return Opcode::NOP; + } + + if (inst.uimm16 == 0) { + return Opcode::MR; + } + + return opcode; + } + + if (opcode == Opcode::ORIS) { + if (inst.rs == 0 && inst.ra == 0 && inst.uimm16 == 0) { + return Opcode::NOP; + } + + return opcode; + } + + if (opcode == Opcode::RLDICL) { + if (inst.sh64 == 0) { + return Opcode::CLRLDI; + } + + if (inst.mbe64 == 0) { + return Opcode::ROTLDI; + } + + if (inst.mbe64 == 64 - inst.sh64) { + return Opcode::SRDI; + } + + return opcode; + } + + if (opcode == Opcode::CMP) { + if (inst.l10) { + return Opcode::CMPD; + } + return Opcode::CMPW; + } + + if (opcode == Opcode::CMPL) { + if (inst.l10) { + return Opcode::CMPLD; + } + return Opcode::CMPLW; + } + + if (opcode == Opcode::NOR) { + if (inst.rs == inst.rb) { + return Opcode::NOT; + } + + return opcode; + } + + if (opcode == Opcode::MTOCRF) { + if (!inst.l10) { + return Opcode::MTCRF; + } + + return opcode; + } + + if (opcode == Opcode::MFSPR) { + auto n = (inst.spr >> 5) | ((inst.spr & 0x1f) << 5); + + switch (n) { + case 1: + return Opcode::MFXER; + case 8: + return Opcode::MFLR; + case 9: + return Opcode::MFCTR; + } + + return opcode; + } + + if (opcode == Opcode::MFTB) { + auto n = (inst.spr >> 5) | ((inst.spr & 0x1f) << 5); + + switch (n) { + case 268: + return Opcode::MFTB; + case 269: + return 
Opcode::MFTBU; + } + + return opcode; + } + + if (opcode == Opcode::OR) { + if (inst.rs == inst.rb) { + switch (inst.raw) { + case 0x7c210b78: + return Opcode::CCTPL; + case 0x7c421378: + return Opcode::CCTPM; + case 0x7c631b78: + return Opcode::CCTPH; + case 0x7f9ce378: + return Opcode::DB8CYC; + case 0x7fbdeb78: + return Opcode::DB10CYC; + case 0x7fdef378: + return Opcode::DB12CYC; + case 0x7ffffb78: + return Opcode::DB16CYC; + } + + return Opcode::MR; + } + + return opcode; + } + + return opcode; +} diff --git a/rx/include/rx/BitField.h b/rx/include/rx/BitField.h new file mode 100644 index 000000000..a644bfde9 --- /dev/null +++ b/rx/include/rx/BitField.h @@ -0,0 +1,244 @@ +#pragma once + +#include +#include + +#ifndef _MSC_VER +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Weffc++" +#endif + +namespace rx { +template struct BitFieldBase { + using type = T; + using vtype = std::common_type_t; + using utype = std::make_unsigned_t; + + static constexpr bool can_be_packed = + N < (sizeof(int) * 8 + (std::is_unsigned_v ? 1 : 0)) && + sizeof(vtype) > sizeof(int); + using compact_type = std::conditional_t< + can_be_packed, + std::conditional_t, std::size_t, int>, vtype>; + + // Datatype bitsize + static constexpr std::size_t bitmax = sizeof(T) * 8; + static_assert(N - 1 < bitmax, "BitFieldBase<> error: N out of bounds"); + + // Field bitsize + static constexpr std::size_t bitsize = N; + + // All ones mask + static constexpr utype mask1 = static_cast(~static_cast(0)); + + // Value mask + static constexpr utype vmask = mask1 >> (bitmax - bitsize); + +protected: + type m_data; +}; + +// Bitfield accessor (N bits from I position, 0 is LSB) +template +struct BitField : BitFieldBase { + using type = typename BitField::type; + using vtype = typename BitField::vtype; + using utype = typename BitField::utype; + using compact_type = typename BitField::compact_type; + + // Field offset + static constexpr std::size_t bitpos = I; + static_assert(bitpos + N <= BitField::bitmax, + "BitField<> error: I out of bounds"); + + // Get bitmask of size N, at I pos + static constexpr utype data_mask() { + return static_cast( + static_cast(BitField::mask1 >> + (BitField::bitmax - BitField::bitsize)) + << bitpos); + } + + // Bitfield extraction + static constexpr compact_type extract(const T &data) noexcept { + if constexpr (std::is_signed_v) { + // Load signed value (sign-extended) + return static_cast( + static_cast(static_cast(data) + << (BitField::bitmax - bitpos - N)) >> + (BitField::bitmax - N)); + } else { + // Load unsigned value + return static_cast((static_cast(data) >> bitpos) & + BitField::vmask); + } + } + + // Bitfield insertion + static constexpr vtype insert(compact_type value) { + return static_cast((value & BitField::vmask) << bitpos); + } + + // Load bitfield value + constexpr operator compact_type() const noexcept { + return extract(this->m_data); + } + + // Load raw data with mask applied + constexpr T unshifted() const { + return static_cast(this->m_data & data_mask()); + } + + // Optimized bool conversion (must be removed if inappropriate) + explicit constexpr operator bool() const noexcept { + return unshifted() != 0u; + } + + // Store bitfield value + BitField &operator=(compact_type value) noexcept { + this->m_data = + static_cast((this->m_data & ~data_mask()) | insert(value)); + return *this; + } + + compact_type operator++(int) { + compact_type result = *this; + *this = static_cast(result + 1u); + return result; + } + + BitField &operator++() { + return *this = 
static_cast(*this + 1u); + } + + compact_type operator--(int) { + compact_type result = *this; + *this = static_cast(result - 1u); + return result; + } + + BitField &operator--() { + return *this = static_cast(*this - 1u); + } + + BitField &operator+=(compact_type right) { + return *this = static_cast(*this + right); + } + + BitField &operator-=(compact_type right) { + return *this = static_cast(*this - right); + } + + BitField &operator*=(compact_type right) { + return *this = static_cast(*this * right); + } + + BitField &operator&=(compact_type right) { + this->m_data &= static_cast( + ((static_cast(right + 0u) & BitField::vmask) << bitpos) | + ~(BitField::vmask << bitpos)); + return *this; + } + + BitField &operator|=(compact_type right) { + this->m_data |= static_cast( + (static_cast(right + 0u) & BitField::vmask) << bitpos); + return *this; + } + + BitField &operator^=(compact_type right) { + this->m_data ^= static_cast( + (static_cast(right + 0u) & BitField::vmask) << bitpos); + return *this; + } +}; + +// Field pack (concatenated from left to right) +template +struct BitFieldPack + : BitFieldBase::bitsize> { + using type = typename BitFieldPack::type; + using vtype = typename BitFieldPack::vtype; + using utype = typename BitFieldPack::utype; + using compact_type = typename BitFieldPack::compact_type; + + // Get disjunction of all "data" masks of concatenated values + static constexpr vtype data_mask() { + return static_cast(F::data_mask() | + BitFieldPack::data_mask()); + } + + // Extract all bitfields and concatenate + static constexpr compact_type extract(const type &data) { + return static_cast(static_cast(F::extract(data)) + << BitFieldPack::bitsize | + BitFieldPack::extract(data)); + } + + // Split bitfields and insert them + static constexpr vtype insert(compact_type value) { + return static_cast( + F::insert(value >> BitFieldPack::bitsize) | + BitFieldPack::insert(value)); + } + + // Load value + constexpr operator compact_type() const noexcept { + return extract(this->m_data); + } + + // Store value + BitFieldPack &operator=(compact_type value) noexcept { + this->m_data = (this->m_data & ~data_mask()) | insert(value); + return *this; + } +}; + +// Empty field pack (recursion terminator) +template <> struct BitFieldPack { + static constexpr std::size_t bitsize = 0; + + static constexpr std::size_t data_mask() { return 0; } + + template + static constexpr auto extract(const T &) -> decltype(+T()) { + return 0; + } + + template static constexpr T insert(T /*value*/) { return 0; } +}; + +// Fixed field (provides constant values in field pack) +template +struct BitFieldFixed : BitFieldBase { + using type = typename BitFieldFixed::type; + using vtype = typename BitFieldFixed::vtype; + + // Return constant value + static constexpr vtype extract(const type &) { + static_assert((V & BitFieldFixed::vmask) == V, + "BitFieldFixed<> error: V out of bounds"); + return V; + } + + // Get value + constexpr operator vtype() const noexcept { return V; } +}; +} // namespace rx + +template +struct std::common_type, rx::BitField> + : std::common_type {}; + +template +struct std::common_type, T2> + : std::common_type> {}; + +template +struct std::common_type> + : std::common_type, T2> {}; + +#ifndef _MSC_VER +#pragma GCC diagnostic pop +#endif diff --git a/rx/include/rx/BitSet.h b/rx/include/rx/BitSet.h new file mode 100644 index 000000000..55fe47ab1 --- /dev/null +++ b/rx/include/rx/BitSet.h @@ -0,0 +1,268 @@ +#pragma once + +/* +This header implements bs_t<> class for scoped enum types (enum 
class). +To enable bs_t<>, enum scope must contain `__bitset_enum_max` entry. + +enum class flagzz : u32 +{ + flag1, // Bit indices start from zero + flag2, +}; + +This also enables helper operators for this enum type. + +Examples: +`flagzz::flag1 | flagzz::flag2` - bitset union +`flagzz::flag1 & ~flagzz::flag2` - bitset difference +Intersection (&) and symmetric difference (^) is also available. +*/ + +#include "refl.hpp" +#include "types.hpp" + +namespace rx { +template +concept BitSetEnum = + std::is_enum_v && requires(T x) { rx::fieldCount > 0; }; + +template class BitSet; + +namespace detail { +template class InvertedBitSet final { + using underlying_type = std::underlying_type_t; + underlying_type m_data; + constexpr InvertedBitSet(underlying_type data) : m_data(data) {} + friend BitSet; +}; +} // namespace detail + +// Bitset type for enum class with available bits [0, fieldCount) +template class BitSet final { +public: + // Underlying type + using underlying_type = std::underlying_type_t; + +private: + // Underlying value + underlying_type m_data; + + // Value constructor + constexpr explicit BitSet(int, underlying_type data) noexcept + : m_data(data) {} + +public: + static constexpr usz bitmax = sizeof(T) * 8; + static constexpr usz bitsize = + static_cast(rx::fieldCount); + + static_assert(std::is_enum_v, + "BitSet<> error: invalid type (must be enum)"); + static_assert(bitsize <= bitmax, + "BitSet<> error: failed to determine enum field count"); + static_assert(bitsize != bitmax || std::is_unsigned_v, + "BitSet<> error: invalid field count (sign bit)"); + + // Helper function + static constexpr underlying_type shift(T value) { + return static_cast(1) + << static_cast(value); + } + + BitSet() = default; + + // Construct from a single bit + constexpr BitSet(T bit) noexcept : m_data(shift(bit)) {} + + // Test for empty bitset + constexpr explicit operator bool() const noexcept { return m_data != 0; } + + // Extract underlying data + constexpr explicit operator underlying_type() const noexcept { + return m_data; + } + + constexpr detail::InvertedBitSet operator~() const { return {m_data}; } + + constexpr BitSet &operator+=(BitSet rhs) { + m_data |= static_cast(rhs); + return *this; + } + + constexpr BitSet &operator-=(BitSet rhs) { + m_data &= ~static_cast(rhs); + return *this; + } + + constexpr BitSet without(BitSet rhs) const { + BitSet result = *this; + result.m_data &= ~static_cast(rhs); + return result; + } + + constexpr BitSet with(BitSet rhs) const { + BitSet result = *this; + result.m_data |= static_cast(rhs); + return result; + } + + constexpr BitSet &operator&=(BitSet rhs) { + m_data &= static_cast(rhs); + return *this; + } + + constexpr BitSet &operator^=(BitSet rhs) { + m_data ^= static_cast(rhs); + return *this; + } + + [[deprecated("Use operator|")]] friend constexpr BitSet + operator+(BitSet lhs, BitSet rhs) { + return BitSet(0, lhs.m_data | rhs.m_data); + } + + friend constexpr BitSet operator-(BitSet lhs, BitSet rhs) { + return BitSet(0, lhs.m_data & ~rhs.m_data); + } + + friend constexpr BitSet operator|(BitSet lhs, BitSet rhs) { + return BitSet(0, lhs.m_data | rhs.m_data); + } + + friend constexpr BitSet operator&(BitSet lhs, BitSet rhs) { + return BitSet(0, lhs.m_data & rhs.m_data); + } + + friend constexpr BitSet operator&(BitSet lhs, detail::InvertedBitSet rhs) { + return BitSet(0, lhs.m_data & rhs.m_data); + } + + friend constexpr BitSet operator^(BitSet lhs, BitSet rhs) { + return BitSet(0, lhs.m_data ^ rhs.m_data); + } + + constexpr bool operator==(BitSet 
rhs) const noexcept { + return m_data == rhs.m_data; + } + + constexpr bool test_and_set(T bit) { + bool r = (m_data & shift(bit)) != 0; + m_data |= shift(bit); + return r; + } + + constexpr bool test_and_reset(T bit) { + bool r = (m_data & shift(bit)) != 0; + m_data &= ~shift(bit); + return r; + } + + constexpr bool test_and_complement(T bit) { + bool r = (m_data & shift(bit)) != 0; + m_data ^= shift(bit); + return r; + } + + constexpr bool any_of(BitSet arg) const { return (m_data & arg.m_data) != 0; } + + constexpr bool all_of(BitSet arg) const { + return (m_data & arg.m_data) == arg.m_data; + } + + constexpr bool none_of(BitSet arg) const { + return (m_data & arg.m_data) == 0; + } +}; + +namespace bitset { +// Unary '+' operator: promote plain enum value to bitset value +template +[[deprecated("Use toBitSet(bit)")]] constexpr BitSet operator+(T bit) { + return BitSet(bit); +} + +template constexpr BitSet toBitSet(T bit) { + return BitSet(bit); +} + +// Binary '+' operator: bitset union +template + requires(std::is_constructible_v, U>) +[[deprecated("Use operator|")]] constexpr BitSet operator+(T lhs, + const U &rhs) { + return BitSet(lhs) | BitSet(rhs); +} + +// Binary '+' operator: bitset union +template + requires(std::is_constructible_v, U> && !std::is_enum_v) +[[deprecated("Use operator|")]] constexpr BitSet operator+(const U &lhs, + T rhs) { + return BitSet(lhs) | BitSet(rhs); +} + +// Binary '|' operator: bitset union +template + requires(std::is_constructible_v, U>) +constexpr BitSet operator|(T lhs, const U &rhs) { + return BitSet(lhs) | BitSet(rhs); +} + +// Binary '|' operator: bitset union +template + requires(std::is_constructible_v, U> && !std::is_enum_v) +constexpr BitSet operator|(const U &lhs, T rhs) { + return BitSet(lhs) | BitSet(rhs); +} + +// Binary '-' operator: bitset difference +template + requires(std::is_constructible_v, U>) +constexpr BitSet operator-(T lhs, const U &rhs) { + return BitSet(lhs) - BitSet(rhs); +} + +// Binary '-' operator: bitset difference +template + requires(std::is_constructible_v, U> && !std::is_enum_v) +constexpr BitSet operator-(const U &lhs, T rhs) { + return BitSet(lhs) - BitSet(rhs); +} + +// Binary '&' operator: bitset intersection +template + requires(std::is_constructible_v, U>) +constexpr BitSet operator&(T lhs, const U &rhs) { + return BitSet(lhs) & BitSet(rhs); +} + +// Binary '&' operator: bitset intersection +template + requires(std::is_constructible_v, U> && !std::is_enum_v) +constexpr BitSet operator&(const U &lhs, T rhs) { + return BitSet(lhs) & BitSet(rhs); +} + +// Binary '&' operator: bitset intersection +template +constexpr BitSet operator&(T lhs, detail::InvertedBitSet rhs) { + return BitSet(lhs) & rhs; +} + +// Binary '^' operator: bitset symmetric difference +template + requires(std::is_constructible_v, U>) +constexpr BitSet operator^(T lhs, const U &rhs) { + return BitSet(lhs) ^ BitSet(rhs); +} + +// Binary '^' operator: bitset symmetric difference +template + requires(std::is_constructible_v, U> && !std::is_enum_v) +constexpr BitSet operator^(const U &lhs, T rhs) { + return BitSet(lhs) ^ BitSet(rhs); +} +} // namespace bitset +} // namespace rx + +using namespace rx::bitset; diff --git a/rx/include/rx/asm.hpp b/rx/include/rx/asm.hpp new file mode 100644 index 000000000..2faed84ea --- /dev/null +++ b/rx/include/rx/asm.hpp @@ -0,0 +1,358 @@ +#pragma once + +#include "types.hpp" +#include + +extern bool g_use_rtm; +extern u64 g_rtm_tx_limit1; + +#ifdef _M_X64 +#ifdef _MSC_VER +extern "C" { +u32 _xbegin(); +void 
_xend();
+void _mm_pause();
+void _mm_prefetch(const char *, int);
+void _m_prefetchw(const volatile void *);
+
+uchar _rotl8(uchar, uchar);
+ushort _rotl16(ushort, uchar);
+u64 __popcnt64(u64);
+
+s64 __mulh(s64, s64);
+u64 __umulh(u64, u64);
+
+s64 _div128(s64, s64, s64, s64 *);
+u64 _udiv128(u64, u64, u64, u64 *);
+void __debugbreak();
+}
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+#endif
+
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
+namespace rx {
+// Try to prefetch to Level 2 cache since it's not split to data/code on most
+// processors
+template <typename T> constexpr void prefetch_exec(T func) {
+  if (std::is_constant_evaluated()) {
+    return;
+  }
+
+  const u64 value = reinterpret_cast<u64>(func);
+  const void *ptr = reinterpret_cast<const void *>(value);
+
+#ifdef _M_X64
+  return _mm_prefetch(static_cast<const char *>(ptr), _MM_HINT_T1);
+#else
+  return __builtin_prefetch(ptr, 0, 2);
+#endif
+}
+
+// Try to prefetch to Level 1 cache
+constexpr void prefetch_read(const void *ptr) {
+  if (std::is_constant_evaluated()) {
+    return;
+  }
+
+#ifdef _M_X64
+  return _mm_prefetch(static_cast<const char *>(ptr), _MM_HINT_T0);
+#else
+  return __builtin_prefetch(ptr, 0, 3);
+#endif
+}
+
+constexpr void prefetch_write(void *ptr) {
+  if (std::is_constant_evaluated()) {
+    return;
+  }
+
+#if defined(_M_X64) && !defined(__clang__)
+  return _m_prefetchw(ptr);
+#else
+  return __builtin_prefetch(ptr, 1, 0);
+#endif
+}
+
+constexpr u8 rol8(u8 x, u8 n) {
+  if (std::is_constant_evaluated()) {
+    return (x << (n & 7)) | (x >> ((-n & 7)));
+  }
+
+#ifdef _MSC_VER
+  return _rotl8(x, n);
+#elif defined(__clang__)
+  return __builtin_rotateleft8(x, n);
+#elif defined(ARCH_X64)
+  return __builtin_ia32_rolqi(x, n);
+#else
+  return (x << (n & 7)) | (x >> ((-n & 7)));
+#endif
+}
+
+constexpr u16 rol16(u16 x, u16 n) {
+  if (std::is_constant_evaluated()) {
+    return (x << (n & 15)) | (x >> ((-n & 15)));
+  }
+
+#ifdef _MSC_VER
+  return _rotl16(x, static_cast<uchar>(n));
+#elif defined(__clang__)
+  return __builtin_rotateleft16(x, n);
+#elif defined(ARCH_X64)
+  return __builtin_ia32_rolhi(x, n);
+#else
+  return (x << (n & 15)) | (x >> ((-n & 15)));
+#endif
+}
+
+constexpr u32 rol32(u32 x, u32 n) {
+  if (std::is_constant_evaluated()) {
+    return (x << (n & 31)) | (x >> (((0 - n) & 31)));
+  }
+
+#ifdef _MSC_VER
+  return _rotl(x, n);
+#elif defined(__clang__)
+  return __builtin_rotateleft32(x, n);
+#else
+  return (x << (n & 31)) | (x >> (((0 - n) & 31)));
+#endif
+}
+
+constexpr u64 rol64(u64 x, u64 n) {
+  if (std::is_constant_evaluated()) {
+    return (x << (n & 63)) | (x >> (((0 - n) & 63)));
+  }
+
+#ifdef _MSC_VER
+  return _rotl64(x, static_cast<int>(n));
+#elif defined(__clang__)
+  return __builtin_rotateleft64(x, n);
+#else
+  return (x << (n & 63)) | (x >> (((0 - n) & 63)));
+#endif
+}
+
+constexpr u32 popcnt64(u64 v) {
+#if !defined(_MSC_VER) || defined(__SSE4_2__)
+  if (std::is_constant_evaluated())
+#endif
+  {
+    v = (v & 0xaaaaaaaaaaaaaaaa) / 2 + (v & 0x5555555555555555);
+    v = (v & 0xcccccccccccccccc) / 4 + (v & 0x3333333333333333);
+    v = (v & 0xf0f0f0f0f0f0f0f0) / 16 + (v & 0x0f0f0f0f0f0f0f0f);
+    v = (v & 0xff00ff00ff00ff00) / 256 + (v & 0x00ff00ff00ff00ff);
+    v = ((v & 0xffff0000ffff0000) >> 16) + (v & 0x0000ffff0000ffff);
+    return static_cast<u32>((v >> 32) + v);
+  }
+
+#if !defined(_MSC_VER) || defined(__SSE4_2__)
+#ifdef _MSC_VER
+  return static_cast<u32>(__popcnt64(v));
+#else
+  return __builtin_popcountll(v);
+#endif
+#endif
+}
+
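+// The constant-evaluated branch of popcnt64 above is a SWAR reduction: each
+// step adds neighbouring bit fields (1, 2, 4, 8, then 16 bits wide), so the
+// final sum of the two 32-bit halves yields the total number of set bits.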
+constexpr u32 popcnt128(const u128 &v) {
+#ifdef _MSC_VER
+  return popcnt64(v.lo) + popcnt64(v.hi);
+#else
+  return popcnt64(v) + popcnt64(v >> 64);
+#endif
+}
+
+constexpr u64 umulh64(u64 x, u64 y) {
+#ifdef _MSC_VER
+  if (std::is_constant_evaluated())
+#endif
+  {
+    return static_cast<u64>((u128{x} * u128{y}) >> 64);
+  }
+
+#ifdef _MSC_VER
+  return __umulh(x, y);
+#endif
+}
+
+inline s64 mulh64(s64 x, s64 y) {
+#ifdef _MSC_VER
+  return __mulh(x, y);
+#else
+  return (s128{x} * s128{y}) >> 64;
+#endif
+}
+
+inline s64 div128(s64 high, s64 low, s64 divisor, s64 *remainder = nullptr) {
+#ifdef _MSC_VER
+  s64 rem = 0;
+  s64 r = _div128(high, low, divisor, &rem);
+
+  if (remainder) {
+    *remainder = rem;
+  }
+#else
+  const s128 x = (u128{static_cast<u64>(high)} << 64) | u64(low);
+  const s128 r = x / divisor;
+
+  if (remainder) {
+    *remainder = x % divisor;
+  }
+#endif
+  return r;
+}
+
+inline u64 udiv128(u64 high, u64 low, u64 divisor, u64 *remainder = nullptr) {
+#ifdef _MSC_VER
+  u64 rem = 0;
+  u64 r = _udiv128(high, low, divisor, &rem);
+
+  if (remainder) {
+    *remainder = rem;
+  }
+#else
+  const u128 x = (u128{high} << 64) | low;
+  const u128 r = x / divisor;
+
+  if (remainder) {
+    *remainder = x % divisor;
+  }
+#endif
+  return r;
+}
+
+#ifdef _MSC_VER
+inline u128 operator/(u128 lhs, u64 rhs) {
+  u64 rem = 0;
+  return _udiv128(lhs.hi, lhs.lo, rhs, &rem);
+}
+#endif
+
+constexpr u32 ctz128(u128 arg) {
+#ifdef _MSC_VER
+  if (!arg.lo)
+    return std::countr_zero(arg.hi) + 64u;
+  else
+    return std::countr_zero(arg.lo);
+#else
+  if (u64 lo = static_cast<u64>(arg))
+    return std::countr_zero(lo);
+  else
+    return std::countr_zero(arg >> 64) + 64;
+#endif
+}
+
+constexpr u32 clz128(u128 arg) {
+#ifdef _MSC_VER
+  if (arg.hi)
+    return std::countl_zero(arg.hi);
+  else
+    return std::countl_zero(arg.lo) + 64;
+#else
+  if (u64 hi = static_cast<u64>(arg >> 64))
+    return std::countl_zero(hi);
+  else
+    return std::countl_zero(arg) + 64;
+#endif
+}
+
+inline void pause() {
+#if defined(ARCH_ARM64)
+  __asm__ volatile("yield");
+#elif defined(_M_X64)
+  _mm_pause();
+#elif defined(ARCH_X64)
+  __builtin_ia32_pause();
+#else
+#error "Missing pause() implementation"
+#endif
+}
+
+// Align to power of 2
+template <typename T, typename U>
+  requires std::is_unsigned_v<T>
+constexpr std::make_unsigned_t<std::common_type_t<T, U>> align(T value,
+                                                               U align) {
+  return static_cast<std::make_unsigned_t<std::common_type_t<T, U>>>(
+      (value + (align - 1)) & (T{0} - align));
+}
+
+// General purpose aligned division, the result is rounded up not truncated
+template <typename T>
+  requires std::is_unsigned_v<T>
+constexpr T aligned_div(T value, std::type_identity_t<T> align) {
+  return static_cast<T>(value / align + T{!!(value % align)});
+}
+
+// General purpose aligned division, the result is rounded to nearest
+template <typename T>
+  requires std::is_integral_v<T>
+constexpr T rounded_div(T value, std::type_identity_t<T> align) {
+  if constexpr (std::is_unsigned_v<T>) {
+    return static_cast<T>(value / align + T{(value % align) > (align / 2)});
+  }
+
+  return static_cast<T>(value / align +
+                        (value > 0 ? T{(value % align) > (align / 2)}
+                                   : 0 - T{(value % align) < (align / 2)}));
+}
+
+// Multiplying by ratio, semi-resistant to overflows
+template <typename T>
+constexpr T rational_mul(T value, std::type_identity_t<T> numerator,
+                         std::type_identity_t<T> denominator) {
+  if constexpr (sizeof(T) <= sizeof(u64) / 2) {
+    return static_cast<T>(value * u64{numerator} / u64{denominator});
+  }
+
+#if is_u128_emulated
+  if constexpr (sizeof(T) <= sizeof(u128) / 2) {
+    return static_cast<T>(u128_from_mul(value, numerator) / u64{denominator});
+  }
+#endif
+
+  return static_cast<T>(value / denominator * numerator +
+                        (value % denominator) * numerator / denominator);
+}
+
+template <typename T> constexpr T add_saturate(T addend1, T addend2) {
+  return static_cast<T>(~addend1) < addend2 ?
T{umax} + : static_cast(addend1 + addend2); +} + +template constexpr T sub_saturate(T minuend, T subtrahend) { + return minuend < subtrahend ? T{0} : static_cast(minuend - subtrahend); +} + +template constexpr T mul_saturate(T factor1, T factor2) { + return factor1 > 0 && T{umax} / factor1 < factor2 + ? T{umax} + : static_cast(factor1 * factor2); +} + +inline void trigger_write_page_fault(void *ptr) { +#if defined(ARCH_X64) && !defined(_MSC_VER) + __asm__ volatile("lock orl $0, 0(%0)" ::"r"(ptr)); +#elif defined(ARCH_ARM64) && !defined(ANDROID) + u32 value = 0; + u32 *u32_ptr = static_cast(ptr); + __asm__ volatile("ldset %w0, %w0, %1" + : "+r"(value), "=Q"(*u32_ptr) + : "r"(value)); +#else + static_cast *>(ptr)->fetch_or( + 0, std::memory_order::relaxed); +#endif +} +} // namespace rx + +#ifdef _MSC_VER +using rx::operator/; +#endif diff --git a/rx/include/rx/format.hpp b/rx/include/rx/format.hpp index 682999ce3..bed16fbef 100644 --- a/rx/include/rx/format.hpp +++ b/rx/include/rx/format.hpp @@ -218,13 +218,15 @@ struct std::formatter { std::string fieldName; - auto underlying = std::to_underlying(value); + // FIXME: requires C++23 + // auto underlying = std::to_underlying(value); + auto underlying = static_cast(value); if (underlying < 0) { fieldName = queryUnknownField( underlying, std::integral_constant{}, std::make_integer_sequence{}); - } else if (underlying >= rx::fieldCount) { + } else if (static_cast(underlying) >= rx::fieldCount) { fieldName = queryUnknownField( underlying, std::integral_constant>{}, std::make_integer_sequence{}); diff --git a/rx/include/rx/simd.hpp b/rx/include/rx/simd.hpp new file mode 100644 index 000000000..c9ed0feea --- /dev/null +++ b/rx/include/rx/simd.hpp @@ -0,0 +1,2236 @@ +#pragma once + +#include "asm.hpp" +#include "types.hpp" +#include "v128.hpp" + +#if defined(ARCH_X64) +#ifdef _MSC_VER +#include +#else +#include +#endif + +#include +#include +#endif + +#if defined(ARCH_ARM64) +#include +#endif + +#include +#include + +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-parameter" +#elif defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#endif + +namespace rx { +inline v128 gv_select8(const v128 &_cmp, const v128 &_true, const v128 &_false); +inline v128 gv_signselect8(const v128 &bits, const v128 &_true, + const v128 &_false); +inline v128 gv_select16(const v128 &_cmp, const v128 &_true, + const v128 &_false); +inline v128 gv_select32(const v128 &_cmp, const v128 &_true, + const v128 &_false); +inline v128 gv_selectfs(const v128 &_cmp, const v128 &_true, + const v128 &_false); + +inline void gv_set_zeroing_denormals() { +#if defined(ARCH_X64) + u32 cr = _mm_getcsr(); + cr = (cr & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_ZERO_ON; + cr = (cr & ~_MM_DENORMALS_ZERO_MASK) | _MM_DENORMALS_ZERO_ON; + cr = (cr | _MM_MASK_INVALID); + _mm_setcsr(cr); +#elif defined(ARCH_ARM64) + u64 cr; + __asm__ volatile("mrs %0, FPCR" : "=r"(cr)); + cr |= 0x1000000ull; + __asm__ volatile("msr FPCR, %0" ::"r"(cr)); +#else +#error "Not implemented" +#endif +} + +inline void gv_unset_zeroing_denormals() { +#if defined(ARCH_X64) + u32 cr = _mm_getcsr(); + cr = (cr & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_ZERO_OFF; + cr = (cr & ~_MM_DENORMALS_ZERO_MASK) | _MM_DENORMALS_ZERO_OFF; + cr = (cr | _MM_MASK_INVALID); + _mm_setcsr(cr); +#elif defined(ARCH_ARM64) + u64 cr; + __asm__ volatile("mrs %0, FPCR" : "=r"(cr)); + cr &= ~0x1000000ull; + __asm__ volatile("msr FPCR, %0" ::"r"(cr)); +#else +#error 
"Not implemented" +#endif +} + +inline v128 gv_bcst8(u8 value) { +#if defined(ARCH_X64) + return _mm_set1_epi8(value); +#elif defined(ARCH_ARM64) + return vdupq_n_s8(value); +#endif +} + +inline v128 gv_bcst16(u16 value) { +#if defined(ARCH_X64) + return _mm_set1_epi16(value); +#elif defined(ARCH_ARM64) + return vdupq_n_s16(value); +#endif +} + +inline v128 gv_bcst32(u32 value) { +#if defined(ARCH_X64) + return _mm_set1_epi32(value); +#elif defined(ARCH_ARM64) + return vdupq_n_s32(value); +#endif +} + +inline v128 gv_bcst64(u64 value) { +#if defined(ARCH_X64) + return _mm_set1_epi64x(value); +#elif defined(ARCH_ARM64) + return vdupq_n_s64(value); +#endif +} + +inline v128 gv_bcstfs(f32 value) { +#if defined(ARCH_X64) + return _mm_set1_ps(value); +#elif defined(ARCH_ARM64) + return vdupq_n_f32(value); +#endif +} + +inline v128 gv_and32(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_and_si128(a, b); +#elif defined(ARCH_ARM64) + return vandq_s32(a, b); +#endif +} + +inline v128 gv_andfs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_and_ps(a, b); +#elif defined(ARCH_ARM64) + return vandq_s32(a, b); +#endif +} + +inline v128 gv_andn32(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_andnot_si128(a, b); +#elif defined(ARCH_ARM64) + return vbicq_s32(b, a); +#endif +} + +inline v128 gv_andnfs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_andnot_ps(a, b); +#elif defined(ARCH_ARM64) + return vbicq_s32(b, a); +#endif +} + +inline v128 gv_or32(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_or_si128(a, b); +#elif defined(ARCH_ARM64) + return vorrq_s32(a, b); +#endif +} + +inline v128 gv_orfs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_or_ps(a, b); +#elif defined(ARCH_ARM64) + return vorrq_s32(a, b); +#endif +} + +inline v128 gv_xor32(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_xor_si128(a, b); +#elif defined(ARCH_ARM64) + return veorq_s32(a, b); +#endif +} + +inline v128 gv_xorfs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_xor_ps(a, b); +#elif defined(ARCH_ARM64) + return veorq_s32(a, b); +#endif +} + +inline v128 gv_not32(const v128 &a) { +#if defined(ARCH_X64) + return _mm_xor_si128(a, _mm_set1_epi32(-1)); +#elif defined(ARCH_ARM64) + return vmvnq_u32(a); +#endif +} + +inline v128 gv_notfs(const v128 &a) { +#if defined(ARCH_X64) + return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(-1))); +#elif defined(ARCH_ARM64) + return vmvnq_u32(a); +#endif +} + +inline v128 gv_shl16(const v128 &a, u32 count) { + if (count >= 16) + return v128{}; +#if defined(ARCH_X64) + return _mm_slli_epi16(a, count); +#elif defined(ARCH_ARM64) + return vshlq_s16(a, vdupq_n_s16(count)); +#endif +} + +inline v128 gv_shl32(const v128 &a, u32 count) { + if (count >= 32) + return v128{}; +#if defined(ARCH_X64) + return _mm_slli_epi32(a, count); +#elif defined(ARCH_ARM64) + return vshlq_s32(a, vdupq_n_s32(count)); +#endif +} + +inline v128 gv_shl64(const v128 &a, u32 count) { + if (count >= 64) + return v128{}; +#if defined(ARCH_X64) + return _mm_slli_epi64(a, count); +#elif defined(ARCH_ARM64) + return vshlq_s64(a, vdupq_n_s64(count)); +#endif +} + +inline v128 gv_shr16(const v128 &a, u32 count) { + if (count >= 16) + return v128{}; +#if defined(ARCH_X64) + return _mm_srli_epi16(a, count); +#elif defined(ARCH_ARM64) + return vshlq_u16(a, vdupq_n_s16(0 - count)); +#endif +} + +inline v128 gv_shr32(const v128 &a, u32 count) { + if (count >= 32) + return 
v128{}; +#if defined(ARCH_X64) + return _mm_srli_epi32(a, count); +#elif defined(ARCH_ARM64) + return vshlq_u32(a, vdupq_n_s32(0 - count)); +#endif +} + +inline v128 gv_shr64(const v128 &a, u32 count) { + if (count >= 64) + return v128{}; +#if defined(ARCH_X64) + return _mm_srli_epi64(a, count); +#elif defined(ARCH_ARM64) + return vshlq_u64(a, vdupq_n_s64(0 - count)); +#endif +} + +inline v128 gv_sar16(const v128 &a, u32 count) { + if (count >= 16) + count = 15; +#if defined(ARCH_X64) + return _mm_srai_epi16(a, count); +#elif defined(ARCH_ARM64) + return vshlq_s16(a, vdupq_n_s16(0 - count)); +#endif +} + +inline v128 gv_sar32(const v128 &a, u32 count) { + if (count >= 32) + count = 31; +#if defined(ARCH_X64) + return _mm_srai_epi32(a, count); +#elif defined(ARCH_ARM64) + return vshlq_s32(a, vdupq_n_s32(0 - count)); +#endif +} + +inline v128 gv_sar64(const v128 &a, u32 count) { + if (count >= 64) + count = 63; +#if defined(__AVX512VL__) + return _mm_srai_epi64(a, count); +#elif defined(__SSE2__) && !defined(_M_X64) + return static_cast<__v2di>(a) >> count; +#elif defined(ARCH_ARM64) + return vshlq_s64(a, vdupq_n_s64(0 - count)); +#else + v128 r; + r._s64[0] = a._s64[0] >> count; + r._s64[1] = a._s64[1] >> count; + return r; +#endif +} +inline v128 gv_add8(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_add_epi8(a, b); +#elif defined(ARCH_ARM64) + return vaddq_s8(a, b); +#endif +} + +inline v128 gv_add16(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_add_epi16(a, b); +#elif defined(ARCH_ARM64) + return vaddq_s16(a, b); +#endif +} + +inline v128 gv_add32(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_add_epi32(a, b); +#elif defined(ARCH_ARM64) + return vaddq_s32(a, b); +#endif +} + +inline v128 gv_add64(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_add_epi64(a, b); +#elif defined(ARCH_ARM64) + return vaddq_s64(a, b); +#endif +} + +inline v128 gv_adds_s8(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_adds_epi8(a, b); +#elif defined(ARCH_ARM64) + return vqaddq_s8(a, b); +#endif +} + +inline v128 gv_adds_s16(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_adds_epi16(a, b); +#elif defined(ARCH_ARM64) + return vqaddq_s16(a, b); +#endif +} + +inline v128 gv_adds_s32(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + const v128 s = _mm_add_epi32(a, b); + const v128 m = (a ^ s) & (b ^ s); // overflow bit + const v128 x = _mm_srai_epi32(m, 31); // saturation mask + const v128 y = + _mm_srai_epi32(_mm_and_si128(s, m), 31); // positive saturation mask + return _mm_xor_si128(_mm_xor_si128(_mm_srli_epi32(x, 1), y), + _mm_or_si128(s, x)); +#elif defined(ARCH_ARM64) + return vqaddq_s32(a, b); +#endif +} + +inline v128 gv_addus_u8(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_adds_epu8(a, b); +#elif defined(ARCH_ARM64) + return vqaddq_u8(a, b); +#endif +} + +inline v128 gv_addus_u16(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_adds_epu16(a, b); +#elif defined(ARCH_ARM64) + return vqaddq_u16(a, b); +#endif +} + +inline v128 gv_addus_u32(const v128 &a, const v128 &b) { +#if defined(__SSE4_1__) + return _mm_add_epi32(a, _mm_min_epu32(~a, b)); +#elif defined(ARCH_X64) + const v128 s = _mm_add_epi32(a, b); + return _mm_or_si128(s, + _mm_cmpgt_epi32(_mm_xor_si128(b, _mm_set1_epi32(smin)), + _mm_xor_si128(a, _mm_set1_epi32(smax)))); +#elif defined(ARCH_ARM64) + return vqaddq_u32(a, b); +#endif +} + +inline v128 gv_addfs(const v128 &a, const 
v128 &b) { +#if defined(ARCH_X64) + return _mm_add_ps(a, b); +#elif defined(ARCH_ARM64) + return vaddq_f32(a, b); +#endif +} + +inline v128 gv_addfd(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_add_pd(a, b); +#elif defined(ARCH_ARM64) + return vaddq_f64(a, b); +#endif +} + +inline v128 gv_sub8(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_sub_epi8(a, b); +#elif defined(ARCH_ARM64) + return vsubq_s8(a, b); +#endif +} + +inline v128 gv_sub16(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_sub_epi16(a, b); +#elif defined(ARCH_ARM64) + return vsubq_s16(a, b); +#endif +} + +inline v128 gv_sub32(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_sub_epi32(a, b); +#elif defined(ARCH_ARM64) + return vsubq_s32(a, b); +#endif +} + +inline v128 gv_sub64(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_sub_epi64(a, b); +#elif defined(ARCH_ARM64) + return vsubq_s64(a, b); +#endif +} + +inline v128 gv_subs_s8(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_subs_epi8(a, b); +#elif defined(ARCH_ARM64) + return vqsubq_s8(a, b); +#endif +} + +inline v128 gv_subs_s16(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_subs_epi16(a, b); +#elif defined(ARCH_ARM64) + return vqsubq_s16(a, b); +#endif +} + +inline v128 gv_subs_s32(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + const v128 d = _mm_sub_epi32(a, b); + const v128 m = (a ^ b) & (a ^ d); // overflow bit + const v128 x = _mm_srai_epi32(m, 31); + return _mm_or_si128(_mm_andnot_si128(x, d), + _mm_and_si128(x, _mm_xor_si128(_mm_srli_epi32(x, 1), + _mm_srai_epi32(a, 31)))); +#elif defined(ARCH_ARM64) + return vqsubq_s32(a, b); +#endif +} + +inline v128 gv_subus_u8(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_subs_epu8(a, b); +#elif defined(ARCH_ARM64) + return vqsubq_u8(a, b); +#endif +} + +inline v128 gv_subus_u16(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_subs_epu16(a, b); +#elif defined(ARCH_ARM64) + return vqsubq_u16(a, b); +#endif +} + +inline v128 gv_subus_u32(const v128 &a, const v128 &b) { +#if defined(__SSE4_1__) + return _mm_sub_epi32(a, _mm_min_epu32(a, b)); +#elif defined(ARCH_X64) + const auto sign = _mm_set1_epi32(smin); + return _mm_andnot_si128( + _mm_cmpgt_epi32(_mm_xor_si128(b, sign), _mm_xor_si128(a, sign)), + _mm_sub_epi32(a, b)); +#elif defined(ARCH_ARM64) + return vqsubq_u32(a, b); +#endif +} + +inline v128 gv_subfs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_sub_ps(a, b); +#elif defined(ARCH_ARM64) + return vsubq_f32(a, b); +#endif +} + +inline v128 gv_subfd(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_sub_pd(a, b); +#elif defined(ARCH_ARM64) + return vsubq_f64(a, b); +#endif +} + +inline v128 gv_maxu8(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_max_epu8(a, b); +#elif defined(ARCH_ARM64) + return vmaxq_u8(a, b); +#endif +} + +inline v128 gv_maxu16(const v128 &a, const v128 &b) { +#if defined(__SSE4_1__) + return _mm_max_epu16(a, b); +#elif defined(ARCH_X64) + return _mm_add_epi16(_mm_subs_epu16(a, b), b); +#elif defined(ARCH_ARM64) + return vmaxq_u16(a, b); +#endif +} + +inline v128 gv_maxu32(const v128 &a, const v128 &b) { +#if defined(__SSE4_1__) + return _mm_max_epu32(a, b); +#elif defined(ARCH_X64) + const __m128i s = _mm_set1_epi32(smin); + const __m128i m = _mm_cmpgt_epi32(_mm_xor_si128(a, s), _mm_xor_si128(b, s)); + return _mm_or_si128(_mm_and_si128(m, a), 
_mm_andnot_si128(m, b)); +#elif defined(ARCH_ARM64) + return vmaxq_u32(a, b); +#endif +} + +inline v128 gv_maxs8(const v128 &a, const v128 &b) { +#if defined(__SSE4_1__) + return _mm_max_epi8(a, b); +#elif defined(ARCH_X64) + const __m128i m = _mm_cmpgt_epi8(a, b); + return _mm_or_si128(_mm_and_si128(m, a), _mm_andnot_si128(m, b)); +#elif defined(ARCH_ARM64) + return vmaxq_s8(a, b); +#endif +} + +inline v128 gv_maxs16(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_max_epi16(a, b); +#elif defined(ARCH_ARM64) + return vmaxq_s16(a, b); +#endif +} + +inline v128 gv_maxs32(const v128 &a, const v128 &b) { +#if defined(__SSE4_1__) + return _mm_max_epi32(a, b); +#elif defined(ARCH_X64) + const __m128i m = _mm_cmpgt_epi32(a, b); + return _mm_or_si128(_mm_and_si128(m, a), _mm_andnot_si128(m, b)); +#elif defined(ARCH_ARM64) + return vmaxq_s32(a, b); +#endif +} + +inline v128 gv_maxfs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_and_ps(_mm_max_ps(a, b), _mm_max_ps(b, a)); +#elif defined(ARCH_ARM64) + return vmaxq_f32(a, b); +#endif +} + +inline v128 gv_minu8(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_min_epu8(a, b); +#elif defined(ARCH_ARM64) + return vminq_u8(a, b); +#endif +} + +inline v128 gv_minu16(const v128 &a, const v128 &b) { +#if defined(__SSE4_1__) + return _mm_min_epu16(a, b); +#elif defined(ARCH_X64) + return _mm_sub_epi16(a, _mm_subs_epu16(a, b)); +#elif defined(ARCH_ARM64) + return vminq_u16(a, b); +#endif +} + +inline v128 gv_minu32(const v128 &a, const v128 &b) { +#if defined(__SSE4_1__) + return _mm_min_epu32(a, b); +#elif defined(ARCH_X64) + const __m128i s = _mm_set1_epi32(smin); + const __m128i m = _mm_cmpgt_epi32(_mm_xor_si128(a, s), _mm_xor_si128(b, s)); + return _mm_or_si128(_mm_andnot_si128(m, a), _mm_and_si128(m, b)); +#elif defined(ARCH_ARM64) + return vminq_u32(a, b); +#endif +} + +inline v128 gv_mins8(const v128 &a, const v128 &b) { +#if defined(__SSE4_1__) + return _mm_min_epi8(a, b); +#elif defined(ARCH_X64) + const __m128i m = _mm_cmpgt_epi8(a, b); + return _mm_or_si128(_mm_andnot_si128(m, a), _mm_and_si128(m, b)); +#elif defined(ARCH_ARM64) + return vminq_s8(a, b); +#endif +} + +inline v128 gv_mins16(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_min_epi16(a, b); +#elif defined(ARCH_ARM64) + return vminq_s16(a, b); +#endif +} + +inline v128 gv_mins32(const v128 &a, const v128 &b) { +#if defined(__SSE4_1__) + return _mm_min_epi32(a, b); +#elif defined(ARCH_X64) + const __m128i m = _mm_cmpgt_epi32(a, b); + return _mm_or_si128(_mm_andnot_si128(m, a), _mm_and_si128(m, b)); +#elif defined(ARCH_ARM64) + return vminq_s32(a, b); +#endif +} + +inline v128 gv_minfs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_or_ps(_mm_min_ps(a, b), _mm_min_ps(b, a)); +#elif defined(ARCH_ARM64) + return vminq_f32(a, b); +#endif +} + +inline v128 gv_eq8(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_cmpeq_epi8(a, b); +#elif defined(ARCH_ARM64) + return vceqq_s8(a, b); +#endif +} + +inline v128 gv_eq16(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_cmpeq_epi16(a, b); +#elif defined(ARCH_ARM64) + return vceqq_s16(a, b); +#endif +} + +inline v128 gv_eq32(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_cmpeq_epi32(a, b); +#elif defined(ARCH_ARM64) + return vceqq_s32(a, b); +#endif +} + +// Ordered and equal +inline v128 gv_eqfs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_cmpeq_ps(a, b); +#elif 
defined(ARCH_ARM64) + return vceqq_f32(a, b); +#endif +} + +// Unordered or not equal +inline v128 gv_neqfs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_cmpneq_ps(a, b); +#elif defined(ARCH_ARM64) + return ~vceqq_f32(a, b); +#endif +} + +inline v128 gv_gtu8(const v128 &a, const v128 &b) { +#if defined(__AVX512VL__) && defined(__AVX512BW__) + return _mm_movm_epi8(_mm_cmpgt_epu8_mask(a, b)); +#elif defined(ARCH_X64) + return _mm_cmpeq_epi8(_mm_cmpeq_epi8(a, _mm_min_epu8(a, b)), + _mm_setzero_si128()); +#elif defined(ARCH_ARM64) + return vcgtq_u8(a, b); +#endif +} + +inline v128 gv_gtu16(const v128 &a, const v128 &b) { +#if defined(__AVX512VL__) && defined(__AVX512BW__) + return _mm_movm_epi16(_mm_cmpgt_epu16_mask(a, b)); +#elif defined(__SSE4_1__) + return _mm_cmpeq_epi16(_mm_cmpeq_epi16(a, _mm_min_epu16(a, b)), + _mm_setzero_si128()); +#elif defined(ARCH_X64) + return _mm_cmpeq_epi16( + _mm_cmpeq_epi16(_mm_subs_epu16(a, b), _mm_setzero_si128()), + _mm_setzero_si128()); +#elif defined(ARCH_ARM64) + return vcgtq_u16(a, b); +#endif +} + +inline v128 gv_gtu32(const v128 &a, const v128 &b) { +#if defined(__AVX512VL__) && defined(__AVX512DQ__) + return _mm_movm_epi32(_mm_cmpgt_epu32_mask(a, b)); +#elif defined(__SSE4_1__) + return _mm_cmpeq_epi32(_mm_cmpeq_epi32(a, _mm_min_epu32(a, b)), + _mm_setzero_si128()); +#elif defined(ARCH_X64) + const auto sign = _mm_set1_epi32(smin); + return _mm_cmpgt_epi32(_mm_xor_si128(a, sign), _mm_xor_si128(b, sign)); +#elif defined(ARCH_ARM64) + return vcgtq_u32(a, b); +#endif +} + +// Ordered and greater than +inline v128 gv_gtfs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_cmpgt_ps(a, b); +#elif defined(ARCH_ARM64) + return vcgtq_f32(a, b); +#endif +} + +// Ordered and less than +inline v128 gv_ltfs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_cmplt_ps(a, b); +#elif defined(ARCH_ARM64) + return vcltq_f32(a, b); +#endif +} + +// Unordered or less or equal +inline v128 gv_ngtfs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_cmpngt_ps(a, b); +#elif defined(ARCH_ARM64) + return ~vcgtq_f32(a, b); +#endif +} + +// Unordered or greater or equal +inline v128 gv_nlefs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_cmpnle_ps(a, b); +#elif defined(ARCH_ARM64) + return ~vcleq_f32(a, b); +#endif +} + +inline v128 gv_geu8(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_cmpeq_epi8(b, _mm_min_epu8(a, b)); +#elif defined(ARCH_ARM64) + return vcgeq_u8(a, b); +#endif +} + +inline v128 gv_geu16(const v128 &a, const v128 &b) { +#if defined(__SSE4_1__) + return _mm_cmpeq_epi16(b, _mm_min_epu16(a, b)); +#elif defined(ARCH_X64) + return _mm_cmpeq_epi16(_mm_subs_epu16(b, a), _mm_setzero_si128()); +#elif defined(ARCH_ARM64) + return vcgeq_u16(a, b); +#endif +} + +inline v128 gv_geu32(const v128 &a, const v128 &b) { +#if defined(__SSE4_1__) + return _mm_cmpeq_epi32(b, _mm_min_epu32(a, b)); +#elif defined(ARCH_X64) + const auto sign = _mm_set1_epi32(smin); + return _mm_cmpeq_epi32( + _mm_cmpgt_epi32(_mm_xor_si128(b, sign), _mm_xor_si128(a, sign)), + _mm_setzero_si128()); +#elif defined(ARCH_ARM64) + return vcgeq_u32(a, b); +#endif +} + +// Ordered and not less than +inline v128 gv_gefs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_cmpge_ps(a, b); +#elif defined(ARCH_ARM64) + return vcgeq_f32(a, b); +#endif +} + +// Unordered or less than +inline v128 gv_ngefs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_cmpnge_ps(a, 
b); +#elif defined(ARCH_ARM64) + return ~vcgeq_f32(a, b); +#endif +} + +inline v128 gv_gts8(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_cmpgt_epi8(a, b); +#elif defined(ARCH_ARM64) + return vcgtq_s8(a, b); +#endif +} + +inline v128 gv_gts16(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_cmpgt_epi16(a, b); +#elif defined(ARCH_ARM64) + return vcgtq_s16(a, b); +#endif +} + +inline v128 gv_gts32(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_cmpgt_epi32(a, b); +#elif defined(ARCH_ARM64) + return vcgtq_s32(a, b); +#endif +} + +inline v128 gv_avgu8(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_avg_epu8(a, b); +#elif defined(ARCH_ARM64) + return vrhaddq_u8(a, b); +#endif +} + +inline v128 gv_avgu16(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_avg_epu16(a, b); +#elif defined(ARCH_ARM64) + return vrhaddq_u16(a, b); +#endif +} + +inline v128 gv_avgu32(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + const auto ones = _mm_set1_epi32(-1); + const auto summ = gv_sub32(gv_add32(a, b), ones); + const auto carry = _mm_slli_epi32(gv_geu32(a, summ), 31); + return _mm_or_si128(carry, _mm_srli_epi32(summ, 1)); +#elif defined(ARCH_ARM64) + return vrhaddq_u32(a, b); +#endif +} + +inline v128 gv_avgs8(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + const v128 sign = _mm_set1_epi8(smin); + return gv_avgu8(a ^ sign, b ^ sign) ^ sign; +#elif defined(ARCH_ARM64) + return vrhaddq_s8(a, b); +#endif +} + +inline v128 gv_avgs16(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + const v128 sign = _mm_set1_epi16(smin); + return gv_avgu16(a ^ sign, b ^ sign) ^ sign; +#elif defined(ARCH_ARM64) + return vrhaddq_s16(a, b); +#endif +} + +inline v128 gv_avgs32(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + const v128 sign = _mm_set1_epi32(smin); + return gv_avgu32(a ^ sign, b ^ sign) ^ sign; +#elif defined(ARCH_ARM64) + return vrhaddq_s32(a, b); +#endif +} + +inline v128 gv_divfs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_div_ps(a, b); +#elif defined(ARCH_ARM64) + return vdivq_f32(a, b); +#endif +} + +inline v128 gv_sqrtfs(const v128 &a) { +#if defined(ARCH_X64) + return _mm_sqrt_ps(a); +#elif defined(ARCH_ARM64) + return vsqrtq_f32(a); +#endif +} + +inline v128 gv_fmafs(const v128 &a, const v128 &b, const v128 &c) { +#if defined(ARCH_X64) && defined(__FMA__) + return _mm_fmadd_ps(a, b, c); +#elif defined(__FMA4__) + return _mm_macc_ps(a, b, c); +#elif defined(ARCH_X64) + // This is inaccurate implementation +#ifdef __AVX__ + const __m128 r = _mm256_cvtpd_ps( + _mm256_add_pd(_mm256_mul_pd(_mm256_cvtps_pd(a), _mm256_cvtps_pd(b)), + _mm256_cvtps_pd(c))); +#else + const __m128d a0 = _mm_cvtps_pd(a); + const __m128d a1 = _mm_cvtps_pd(_mm_movehl_ps(a, a)); + const __m128d b0 = _mm_cvtps_pd(b); + const __m128d b1 = _mm_cvtps_pd(_mm_movehl_ps(b, b)); + const __m128d c0 = _mm_cvtps_pd(c); + const __m128d c1 = _mm_cvtps_pd(_mm_movehl_ps(c, c)); + const __m128d m0 = _mm_mul_pd(a0, b0); + const __m128d m1 = _mm_mul_pd(a1, b1); + const __m128d r0 = _mm_add_pd(m0, c0); + const __m128d r1 = _mm_add_pd(m1, c1); + const __m128 r = _mm_movelh_ps(_mm_cvtpd_ps(r0), _mm_cvtpd_ps(r1)); +#endif + return r; +#elif defined(ARCH_ARM64) + return vfmaq_f32(c, a, b); +#else + v128 r; + for (int i = 0; i < 4; i++) { + r._f[i] = std::fmaf(a._f[i], b._f[i], c._f[i]); + } + return r; +#endif +} + +// -> ssat((a * b * 2 + (c << 16) + 0x8000) >> 16) +inline v128 gv_rmuladds_hds16(const v128 &a, 
const v128 &b, const v128 &c) { +#if defined(ARCH_ARM64) +#if defined(__ARM_FEATURE_QRDMX) + return vqrdmlahq_s16(c, a, b); +#else + v128 d; + + for (uint h = 0; h < 8; h++) { + s32 result = ((s32)a._s16[h] * (s32)b._s16[h]) + 0x4000; + result = (result >> 15) + (s32)c._s16[h]; + + if (result > INT16_MAX) { + d._s16[h] = (s16)INT16_MAX; + } else if (result < INT16_MIN) { + d._s16[h] = (s16)INT16_MIN; + } else { + d._s16[h] = (s16)result; + } + } + + return d; +#endif +#elif defined(ARCH_X64) + const auto x80 = + _mm_set1_epi16(0x80); // 0x80 * 0x80 = 0x4000, add this to the product + const auto al = _mm_unpacklo_epi16(a, x80); + const auto ah = _mm_unpackhi_epi16(a, x80); + const auto bl = _mm_unpacklo_epi16(b, x80); + const auto bh = _mm_unpackhi_epi16(b, x80); + const auto ml = _mm_srai_epi32(_mm_madd_epi16(al, bl), 15); + const auto mh = _mm_srai_epi32(_mm_madd_epi16(ah, bh), 15); + const auto cl = + _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), c), 16); + const auto ch = + _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), c), 16); + const auto sl = _mm_add_epi32(ml, cl); + const auto sh = _mm_add_epi32(mh, ch); + return _mm_packs_epi32(sl, sh); +#endif +} + +// -> ssat((a * b * 2 + 0x8000) >> 16) +inline v128 gv_rmuls_hds16(const v128 &a, const v128 &b) { +#if defined(ARCH_ARM64) + return vqrdmulhq_s16(a, b); +#elif defined(ARCH_X64) + const auto x80 = + _mm_set1_epi16(0x80); // 0x80 * 0x80 = 0x4000, add this to the product + const auto al = _mm_unpacklo_epi16(a, x80); + const auto ah = _mm_unpackhi_epi16(a, x80); + const auto bl = _mm_unpacklo_epi16(b, x80); + const auto bh = _mm_unpackhi_epi16(b, x80); + const auto ml = _mm_srai_epi32(_mm_madd_epi16(al, bl), 15); + const auto mh = _mm_srai_epi32(_mm_madd_epi16(ah, bh), 15); + return _mm_packs_epi32(ml, mh); +#endif +} + +// -> ssat((a * b * 2) >> 16) +inline v128 gv_muls_hds16(const v128 &a, const v128 &b) { +#if defined(ARCH_ARM64) + return vqdmulhq_s16(a, b); +#elif defined(ARCH_X64) + const auto m = _mm_or_si128(_mm_srli_epi16(_mm_mullo_epi16(a, b), 15), + _mm_slli_epi16(_mm_mulhi_epi16(a, b), 1)); + const auto s = _mm_cmpeq_epi16( + m, _mm_set1_epi16(-0x8000)); // detect special case (positive 0x8000) + return _mm_xor_si128(m, s); +#endif +} + +inline v128 gv_muladd16(const v128 &a, const v128 &b, const v128 &c) { +#if defined(ARCH_X64) + return _mm_add_epi16(_mm_mullo_epi16(a, b), c); +#elif defined(ARCH_ARM64) + return vmlaq_s16(c, a, b); +#endif +} + +inline v128 gv_mul16(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_mullo_epi16(a, b); +#elif defined(ARCH_ARM64) + return vmulq_s16(a, b); +#endif +} + +inline v128 gv_mul32(const v128 &a, const v128 &b) { +#if defined(__SSE4_1__) + return _mm_mullo_epi32(a, b); +#elif defined(ARCH_X64) + const __m128i lows = _mm_shuffle_epi32(_mm_mul_epu32(a, b), 8); + const __m128i highs = _mm_shuffle_epi32( + _mm_mul_epu32(_mm_srli_epi64(a, 32), _mm_srli_epi64(b, 32)), 8); + return _mm_unpacklo_epi32(lows, highs); +#elif defined(ARCH_ARM64) + return vmulq_s32(a, b); +#endif +} + +inline v128 gv_mulfs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_mul_ps(a, b); +#elif defined(ARCH_ARM64) + return vmulq_f32(a, b); +#endif +} + +inline v128 gv_mulfs(const v128 &a, f32 b) { +#if defined(ARCH_X64) + return _mm_mul_ps(a, _mm_set_ps1(b)); +#elif defined(ARCH_ARM64) + return vmulq_n_f32(a, b); +#endif +} + +inline v128 gv_hadds8x2(const v128 &a) { +#if defined(__SSSE3__) + return _mm_maddubs_epi16(_mm_set1_epi8(1), a); +#elif defined(ARCH_X64) + 
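// SSE2 fallback note: _mm_srai_epi16(a, 8) yields the sign-extended odd (high) bytes of each 16-bit lane, the shift-left/shift-right pair sign-extends the even (low) bytes, and the add gives the pairwise s8 sums widened to s16, i.e. r._s16[i] = a._s8[2*i] + a._s8[2*i+1]. + 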
return _mm_add_epi16(_mm_srai_epi16(a, 8), + _mm_srai_epi16(_mm_slli_epi16(a, 8), 8)); +#elif defined(ARCH_ARM64) + return vpaddlq_s8(a); +#endif +} + +inline v128 gv_hadds8x4(const v128 &a, const v128 &c) { +#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) + return _mm_dpbusd_epi32(c, _mm_set1_epi8(1), a); +#elif defined(__SSSE3__) + return _mm_add_epi32( + _mm_madd_epi16(_mm_maddubs_epi16(_mm_set1_epi8(1), a), _mm_set1_epi16(1)), + c); +#elif defined(ARCH_X64) + return _mm_add_epi32( + _mm_madd_epi16(_mm_add_epi16(_mm_srai_epi16(a, 8), + _mm_srai_epi16(_mm_slli_epi16(a, 8), 8)), + _mm_set1_epi16(1)), + c); +#elif defined(ARCH_ARM64) + return vaddq_s32(vpaddlq_s16(vpaddlq_s8(a)), c); +#endif +} + +inline v128 gv_haddu8x2(const v128 &a) { +#if defined(__SSSE3__) + return _mm_maddubs_epi16(a, _mm_set1_epi8(1)); +#elif defined(ARCH_X64) + return _mm_add_epi16(_mm_srli_epi16(a, 8), + _mm_and_si128(a, _mm_set1_epi16(0x00ff))); +#elif defined(ARCH_ARM64) + return vpaddlq_u8(a); +#endif +} + +inline v128 gv_haddu8x4(const v128 &a) { +#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) + return _mm_dpbusd_epi32(_mm_setzero_si128(), a, _mm_set1_epi8(1)); +#elif defined(__SSSE3__) + return _mm_madd_epi16(_mm_maddubs_epi16(a, _mm_set1_epi8(1)), + _mm_set1_epi16(1)); +#elif defined(ARCH_X64) + return _mm_madd_epi16(_mm_add_epi16(_mm_srli_epi16(a, 8), + _mm_and_si128(a, _mm_set1_epi16(0x00ff))), + _mm_set1_epi16(1)); +#elif defined(ARCH_ARM64) + return vpaddlq_u16(vpaddlq_u8(a)); +#endif +} + +inline v128 gv_hadds16x2(const v128 &a, const v128 &c) { +#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) + return _mm_dpwssd_epi32(c, a, _mm_set1_epi8(1)); +#elif defined(ARCH_X64) + return _mm_add_epi32(_mm_madd_epi16(a, _mm_set1_epi16(1)), c); +#elif defined(ARCH_ARM64) + return vaddq_s32(vpaddlq_s16(a), c); +#endif +} + +// Unsigned bytes from a, signed bytes from b, 32-bit accumulator c +inline v128 gv_dotu8s8x4(const v128 &a, const v128 &b, const v128 &c) { +#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) + return _mm_dpbusd_epi32(c, a, b); +#elif defined(ARCH_X64) + const __m128i ah = _mm_srli_epi16(a, 8); + const __m128i al = _mm_and_si128(a, _mm_set1_epi16(0x00ff)); + const __m128i bh = _mm_srai_epi16(b, 8); + const __m128i bl = _mm_srai_epi16(_mm_slli_epi16(b, 8), 8); + const __m128i mh = _mm_madd_epi16(ah, bh); + const __m128i ml = _mm_madd_epi16(al, bl); + const __m128i x = _mm_add_epi32(mh, ml); + return _mm_add_epi32(c, x); +#elif defined(__ARM_FEATURE_MATMUL_INT8) + return vusdotq_s32(c, a, b); +#elif defined(ARCH_ARM64) + const auto l = + vpaddlq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), + vmovl_s8(vget_low_s8(b)))); + const auto h = + vpaddlq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))), + vmovl_s8(vget_high_s8(b)))); + return vaddq_s32(c, vaddq_s32(vuzp1q_s32(l, h), vuzp2q_s32(l, h))); +#endif +} + +inline v128 gv_dotu8x4(const v128 &a, const v128 &b, const v128 &c) { +#if defined(ARCH_X64) + const __m128i ah = _mm_srli_epi16(a, 8); + const __m128i al = _mm_and_si128(a, _mm_set1_epi16(0x00ff)); + const __m128i bh = _mm_srli_epi16(b, 8); + const __m128i bl = _mm_and_si128(b, _mm_set1_epi16(0x00ff)); + const __m128i mh = _mm_madd_epi16(ah, bh); + const __m128i ml = _mm_madd_epi16(al, bl); + const __m128i x = _mm_add_epi32(mh, ml); + return _mm_add_epi32(c, x); +#elif defined(__ARM_FEATURE_DOTPROD) + return vdotq_u32(c, a, b); +#elif 
defined(ARCH_ARM64) + const auto l = vpaddlq_u16( + vmulq_u16(vmovl_u8(vget_low_u8(a)), vmovl_u8(vget_low_u8(b)))); + const auto h = vpaddlq_u16( + vmulq_u16(vmovl_u8(vget_high_u8(a)), vmovl_u8(vget_high_u8(b)))); + return vaddq_u32(c, vaddq_u32(vuzp1q_u32(l, h), vuzp2q_u32(l, h))); +#endif +} + +inline v128 gv_dots16x2(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_madd_epi16(a, b); +#elif defined(ARCH_ARM64) + const auto ml = vmull_s16(vget_low_s16(a), vget_low_s16(b)); + const auto mh = vmull_s16(vget_high_s16(a), vget_high_s16(b)); + const auto sl = vpadd_s32(vget_low_s32(ml), vget_high_s32(ml)); + const auto sh = vpadd_s32(vget_low_s32(mh), vget_high_s32(mh)); + return vcombine_s32(sl, sh); +#endif +} + +// Signed s16 from a and b, 32-bit accumulator c +inline v128 gv_dots16x2(const v128 &a, const v128 &b, const v128 &c) { +#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) + return _mm_dpwssd_epi32(c, a, b); +#else + return gv_add32(c, gv_dots16x2(a, b)); +#endif +} + +inline v128 gv_dotu16x2(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + const auto ml = _mm_mullo_epi16(a, b); // low results + const auto mh = _mm_mulhi_epu16(a, b); // high results + const auto ls = _mm_add_epi32(_mm_srli_epi32(ml, 16), + _mm_and_si128(ml, _mm_set1_epi32(0x0000ffff))); + const auto hs = _mm_add_epi32(_mm_slli_epi32(mh, 16), + _mm_and_si128(mh, _mm_set1_epi32(0xffff0000))); + return _mm_add_epi32(ls, hs); +#elif defined(ARCH_ARM64) + const auto ml = vmull_u16(vget_low_u16(a), vget_low_u16(b)); + const auto mh = vmull_u16(vget_high_u16(a), vget_high_u16(b)); + const auto sl = vpadd_u32(vget_low_u32(ml), vget_high_u32(ml)); + const auto sh = vpadd_u32(vget_low_u32(mh), vget_high_u32(mh)); + return vcombine_u32(sl, sh); +#endif +} + +// Unsigned bytes from a, signed bytes from b, 32-bit accumulator c +inline v128 gv_dots_u8s8x4(const v128 &a, const v128 &b, const v128 &c) { +#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) + return _mm_dpbusds_epi32(c, a, b); +#elif defined(ARCH_X64) + const __m128i ah = _mm_srli_epi16(a, 8); + const __m128i al = _mm_and_si128(a, _mm_set1_epi16(0x00ff)); + const __m128i bh = _mm_srai_epi16(b, 8); + const __m128i bl = _mm_srai_epi16(_mm_slli_epi16(b, 8), 8); + const __m128i mh = _mm_madd_epi16(ah, bh); + const __m128i ml = _mm_madd_epi16(al, bl); + return gv_adds_s32(c, _mm_add_epi32(mh, ml)); +#elif defined(ARCH_ARM64) + const auto l = + vpaddlq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), + vmovl_s8(vget_low_s8(b)))); + const auto h = + vpaddlq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))), + vmovl_s8(vget_high_s8(b)))); + return vqaddq_s32(c, vaddq_s32(vuzp1q_s32(l, h), vuzp2q_s32(l, h))); +#endif +} + +// Signed s16 from a and b, 32-bit accumulator c; signed saturation +inline v128 gv_dots_s16x2(const v128 &a, const v128 &b, const v128 &c) { +#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) + return _mm_dpwssds_epi32(c, a, b); +#else + const auto ab = gv_dots16x2(a, b); + const auto s0 = gv_adds_s32(ab, c); + const auto s1 = + gv_eq32(ab, gv_bcst32(0x80000000)); // +0x80000000, negative c -> + // c^0x80000000; otherwise 0x7fffffff + const auto s2 = + gv_select32(gv_gts32(gv_bcst32(0), c), gv_xor32(c, gv_bcst32(0x80000000)), + gv_bcst32(0x7fffffff)); + return gv_select32(s1, s2, s0); +#endif +} + +// Multiply s16 elements 0, 2, 4, 6 to produce s32 results in corresponding +// lanes +inline v128 gv_mul_even_s16(const 
v128 &a, const v128 &b) { +#if defined(ARCH_X64) + const auto c = _mm_set1_epi32(0x0000ffff); + return _mm_madd_epi16(_mm_and_si128(a, c), _mm_and_si128(b, c)); +#else + // TODO + return gv_mul32(gv_sar32(gv_shl32(a, 16), 16), gv_sar32(gv_shl32(b, 16), 16)); +#endif +} + +// Multiply u16 elements 0, 2, 4, 6 to produce u32 results in corresponding +// lanes +inline v128 gv_mul_even_u16(const v128 &a, const v128 &b) { +#if defined(__SSE4_1__) || defined(ARCH_ARM64) + const auto c = gv_bcst32(0x0000ffff); + return gv_mul32(a & c, b & c); +#elif defined(ARCH_X64) + const auto ml = _mm_mullo_epi16(a, b); + const auto mh = _mm_mulhi_epu16(a, b); + return _mm_or_si128(_mm_and_si128(ml, _mm_set1_epi32(0x0000ffff)), + _mm_slli_epi32(mh, 16)); +#endif +} + +// Multiply s16 elements 1, 3, 5, 7 to produce s32 results in corresponding +// lanes +inline v128 gv_mul_odds_s16(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_madd_epi16(_mm_srli_epi32(a, 16), _mm_srli_epi32(b, 16)); +#else + return gv_mul32(gv_sar32(a, 16), gv_sar32(b, 16)); +#endif +} + +// Multiply u16 elements 1, 3, 5, 7 to produce u32 results in corresponding +// lanes +inline v128 gv_mul_odds_u16(const v128 &a, const v128 &b) { +#if defined(__SSE4_1__) || defined(ARCH_ARM64) + return gv_mul32(gv_shr32(a, 16), gv_shr32(b, 16)); +#elif defined(ARCH_X64) + const auto ml = _mm_mullo_epi16(a, b); + const auto mh = _mm_mulhi_epu16(a, b); + return _mm_or_si128(_mm_and_si128(mh, _mm_set1_epi32(0xffff0000)), + _mm_srli_epi32(ml, 16)); +#endif +} + +inline v128 gv_cvts32_tofs(const v128 &src) { +#if defined(ARCH_X64) + return _mm_cvtepi32_ps(src); +#elif defined(ARCH_ARM64) + return vcvtq_f32_s32(src); +#endif +} + +inline v128 gv_cvtu32_tofs(const v128 &src) { +#if defined(__AVX512VL__) + return _mm_cvtepu32_ps(src); +#elif defined(ARCH_X64) + const auto fix = _mm_and_ps(_mm_castsi128_ps(_mm_srai_epi32(src, 31)), + _mm_set1_ps(0x80000000)); + return _mm_add_ps( + _mm_cvtepi32_ps(_mm_and_si128(src, _mm_set1_epi32(0x7fffffff))), fix); +#elif defined(ARCH_ARM64) + return vcvtq_f32_u32(src); +#endif +} + +inline v128 gv_cvtfs_tos32(const v128 &src) { +#if defined(ARCH_X64) + return _mm_cvttps_epi32(src); +#elif defined(ARCH_ARM64) + return vcvtq_s32_f32(src); +#endif +} + +inline v128 gv_cvtfs_tou32(const v128 &src) { +#if defined(__AVX512VL__) + return _mm_cvttps_epu32(src); +#elif defined(ARCH_X64) + const auto c1 = _mm_cvttps_epi32(src); + const auto s1 = _mm_srai_epi32(c1, 31); + const auto c2 = _mm_cvttps_epi32(_mm_sub_ps(src, _mm_set1_ps(2147483648.))); + return _mm_or_si128(c1, _mm_and_si128(c2, s1)); +#elif defined(ARCH_ARM64) + return vcvtq_u32_f32(src); +#endif +} + +inline f32 roundevenf32(f32 arg) { + u32 val = std::bit_cast(arg); + u32 exp = (val >> 23) & 0xff; + u32 abs = val & 0x7fffffff; + + if (exp >= 127 + 23) { + // Big enough, NaN or INF + return arg; + } + + if (exp >= 127) { + u32 int_pos = (127 + 23) - exp; + u32 half_pos = int_pos - 1; + u32 half_bit = 1u << half_pos; + u32 int_bit = 1u << int_pos; + if (val & (int_bit | (half_bit - 1))) + val += half_bit; + val &= ~(int_bit - 1); + } else if (exp == 126 && abs > 0x3f000000) { + val &= 0x80000000; + val |= 0x3f800000; + } else { + val &= 0x80000000; + } + + return std::bit_cast(val); +} + +#if defined(ARCH_X64) +enum class RoundMode { Even, Floor, Ceil, Trunc }; + +template __m128 sse41_roundf(__m128 a) { + v128 r = a; + for (u32 i = 0; i < 4; i++) + if constexpr (Mode == RoundMode::Even) + r._f[i] = roundevenf32(r._f[i]); + else if constexpr (Mode == 
RoundMode::Floor) + r._f[i] = ::floorf(r._f[i]); + else if constexpr (Mode == RoundMode::Ceil) + r._f[i] = ::ceilf(r._f[i]); + else if constexpr (Mode == RoundMode::Trunc) + r._f[i] = ::truncf(r._f[i]); + return r; +} +#endif + +inline v128 gv_roundfs_even(const v128 &a) { +#if defined(__SSE4_1__) + return _mm_round_ps(a, 8 + 0); +#elif defined(ARCH_ARM64) + return vrndnq_f32(a); +#elif defined(ARCH_X64) + return sse41_roundf(a); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._f[i] = roundevenf32(a._f[i]); + return r; +#endif +} + +inline v128 gv_roundfs_ceil(const v128 &a) { +#if defined(__SSE4_1__) + return _mm_round_ps(a, 8 + 2); +#elif defined(ARCH_ARM64) + return vrndpq_f32(a); +#elif defined(ARCH_X64) + return sse41_roundf(a); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._f[i] = ::ceilf(a._f[i]); + return r; +#endif +} + +inline v128 gv_roundfs_floor(const v128 &a) { +#if defined(__SSE4_1__) + return _mm_round_ps(a, 8 + 1); +#elif defined(ARCH_ARM64) + return vrndmq_f32(a); +#elif defined(ARCH_X64) + return sse41_roundf(a); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._f[i] = ::floorf(a._f[i]); + return r; +#endif +} + +inline v128 gv_roundfs_trunc(const v128 &a) { +#if defined(__SSE4_1__) + return _mm_round_ps(a, 8 + 3); +#elif defined(ARCH_ARM64) + return vrndq_f32(a); +#elif defined(ARCH_X64) + return sse41_roundf(a); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._f[i] = ::truncf(a._f[i]); + return r; +#endif +} + +inline bool gv_testz(const v128 &a) { +#if defined(__SSE4_1__) + return !!_mm_testz_si128(a, a); +#elif defined(ARCH_X64) + return _mm_cvtsi128_si64(_mm_packs_epi32(a, a)) == 0; +#elif defined(ARCH_ARM64) + return std::bit_cast(vqmovn_s32(a)) == 0; +#else + return !(a._u64[0] | a._u64[1]); +#endif +} + +// Same as gv_testz but tuned for pairing with gv_testall1 +inline bool gv_testall0(const v128 &a) { +#if defined(__SSE4_1__) + return !!_mm_testz_si128(a, _mm_set1_epi32(-1)); +#elif defined(ARCH_X64) + return _mm_cvtsi128_si64(_mm_packs_epi32(a, a)) == 0; +#elif defined(ARCH_ARM64) + return std::bit_cast(vqmovn_s32(a)) == 0; +#else + return !(a._u64[0] | a._u64[1]); +#endif +} + +inline bool gv_testall1(const v128 &a) { +#if defined(__SSE4_1__) + return !!_mm_test_all_ones(a); +#elif defined(ARCH_X64) + return _mm_cvtsi128_si64(_mm_packs_epi32(a, a)) == -1; +#elif defined(ARCH_ARM64) + return std::bit_cast(vqmovn_s32(a)) == -1; +#else + return (a._u64[0] & a._u64[1]) == UINT64_MAX; +#endif +} + +// result = (~a) & (b) +inline v128 gv_andn(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_andnot_si128(a, b); +#elif defined(ARCH_ARM64) + return vbicq_s32(b, a); +#endif +} + +// Select elements; _cmp must be result of SIMD comparison; undefined otherwise +FORCE_INLINE v128 gv_select8(const v128 &_cmp, const v128 &_true, + const v128 &_false) { +#if defined(__SSE4_1__) + return _mm_blendv_epi8(_false, _true, _cmp); +#elif defined(ARCH_ARM64) + return vbslq_u8(_cmp, _true, _false); +#else + return (_cmp & _true) | gv_andn(_cmp, _false); +#endif +} + +// Select elements using sign bit only +FORCE_INLINE v128 gv_signselect8(const v128 &bits, const v128 &_true, + const v128 &_false) { +#if defined(__SSE4_1__) + return _mm_blendv_epi8(_false, _true, bits); +#else + return gv_select8(gv_gts8(gv_bcst8(0), bits), _true, _false); +#endif +} + +// Select elements; _cmp must be result of SIMD comparison; undefined otherwise +inline v128 gv_select16(const v128 &_cmp, const v128 &_true, + const v128 &_false) { +#if defined(__SSE4_1__) + return 
_mm_blendv_epi8(_false, _true, _cmp); +#elif defined(ARCH_ARM64) + return vbslq_u16(_cmp, _true, _false); +#else + return (_cmp & _true) | gv_andn(_cmp, _false); +#endif +} + +// Select elements; _cmp must be result of SIMD comparison; undefined otherwise +inline v128 gv_select32(const v128 &_cmp, const v128 &_true, + const v128 &_false) { +#if defined(__SSE4_1__) + return _mm_blendv_epi8(_false, _true, _cmp); +#elif defined(ARCH_ARM64) + return vbslq_u32(_cmp, _true, _false); +#else + return (_cmp & _true) | gv_andn(_cmp, _false); +#endif +} + +// Select elements; _cmp must be result of SIMD comparison; undefined otherwise +inline v128 gv_selectfs(const v128 &_cmp, const v128 &_true, + const v128 &_false) { +#if defined(__SSE4_1__) + return _mm_blendv_ps(_false, _true, _cmp); +#elif defined(ARCH_ARM64) + return vbslq_f32(_cmp, _true, _false); +#else + return _mm_or_ps(_mm_and_ps(_cmp, _true), _mm_andnot_ps(_cmp, _false)); +#endif +} + +inline v128 gv_packss_s16(const v128 &low, const v128 &high) { +#if defined(ARCH_X64) + return _mm_packs_epi16(low, high); +#elif defined(ARCH_ARM64) + return vcombine_s8(vqmovn_s16(low), vqmovn_s16(high)); +#endif +} + +inline v128 gv_packus_s16(const v128 &low, const v128 &high) { +#if defined(ARCH_X64) + return _mm_packus_epi16(low, high); +#elif defined(ARCH_ARM64) + return vcombine_u8(vqmovun_s16(low), vqmovun_s16(high)); +#endif +} + +inline v128 gv_packus_u16(const v128 &low, const v128 &high) { +#if defined(__SSE4_1__) + return _mm_packus_epi16(_mm_min_epu16(low, _mm_set1_epi16(0xff)), + _mm_min_epu16(high, _mm_set1_epi16(0xff))); +#elif defined(ARCH_X64) + return _mm_packus_epi16( + _mm_sub_epi16(low, _mm_subs_epu16(low, _mm_set1_epi16(0xff))), + _mm_sub_epi16(high, _mm_subs_epu16(high, _mm_set1_epi16(0xff)))); +#elif defined(ARCH_ARM64) + return vcombine_u8(vqmovn_u16(low), vqmovn_u16(high)); +#endif +} + +inline v128 gv_packtu16(const v128 &low, const v128 &high) { +#if defined(ARCH_X64) + return _mm_packus_epi16(low & _mm_set1_epi16(0xff), + high & _mm_set1_epi16(0xff)); +#elif defined(ARCH_ARM64) + return vuzp1q_s8(low, high); +#endif +} + +inline v128 gv_packss_s32(const v128 &low, const v128 &high) { +#if defined(ARCH_X64) + return _mm_packs_epi32(low, high); +#elif defined(ARCH_ARM64) + return vcombine_s16(vqmovn_s32(low), vqmovn_s32(high)); +#endif +} + +inline v128 gv_packus_s32(const v128 &low, const v128 &high) { +#if defined(__SSE4_1__) + return _mm_packus_epi32(low, high); +#elif defined(ARCH_X64) + const auto s = _mm_srai_epi16(_mm_packs_epi32(low, high), 15); + const auto r = gv_add16(_mm_packs_epi32(gv_sub32(low, gv_bcst32(0x8000)), + gv_sub32(high, gv_bcst32(0x8000))), + gv_bcst16(0x8000)); + return gv_andn(s, r); +#elif defined(ARCH_ARM64) + return vcombine_u16(vqmovun_s32(low), vqmovun_s32(high)); +#endif +} + +inline v128 gv_packus_u32(const v128 &low, const v128 &high) { +#if defined(__SSE4_1__) + return _mm_packus_epi32(_mm_min_epu32(low, _mm_set1_epi32(0xffff)), + _mm_min_epu32(high, _mm_set1_epi32(0xffff))); +#elif defined(ARCH_X64) + const v128 s = _mm_cmpgt_epi16( + _mm_packs_epi32(_mm_srli_epi32(low, 16), _mm_srli_epi32(high, 16)), + _mm_setzero_si128()); + const v128 r = _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(low, 16), 16), + _mm_srai_epi32(_mm_slli_epi32(high, 16), 16)); + return _mm_or_si128(r, s); +#elif defined(ARCH_ARM64) + return vcombine_u16(vqmovn_u32(low), vqmovn_u32(high)); +#endif +} + +inline v128 gv_packtu32(const v128 &low, const v128 &high) { +#if defined(__SSE4_1__) + return _mm_packus_epi32(low & 
_mm_set1_epi32(0xffff), + high & _mm_set1_epi32(0xffff)); +#elif defined(ARCH_X64) + return _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(low, 16), 16), + _mm_srai_epi32(_mm_slli_epi32(high, 16), 16)); +#elif defined(ARCH_ARM64) + return vuzp1q_s16(low, high); +#endif +} + +inline v128 gv_unpacklo8(const v128 &lows, const v128 &highs) { +#if defined(ARCH_X64) + return _mm_unpacklo_epi8(lows, highs); +#elif defined(ARCH_ARM64) + return vzip1q_s8(lows, highs); +#endif +} + +inline v128 gv_extend_lo_s8(const v128 &vec) { +#if defined(__SSE4_1__) + return _mm_cvtepi8_epi16(vec); +#elif defined(ARCH_X64) + return _mm_srai_epi16(_mm_unpacklo_epi8(vec, vec), 8); +#elif defined(ARCH_ARM64) + return int16x8_t(vmovl_s8(vget_low_s8(vec))); +#endif +} + +inline v128 gv_extend_hi_s8(const v128 &vec) { +#if defined(__SSE4_1__) + return _mm_cvtepi8_epi16(_mm_loadu_si64(vec._bytes + 8)); +#elif defined(ARCH_X64) + return _mm_srai_epi16(_mm_unpackhi_epi8(vec, vec), 8); +#elif defined(ARCH_ARM64) + return int16x8_t(vmovl_s8(vget_high_s8(vec))); +#endif +} + +inline v128 gv_unpacklo16(const v128 &lows, const v128 &highs) { +#if defined(ARCH_X64) + return _mm_unpacklo_epi16(lows, highs); +#elif defined(ARCH_ARM64) + return vzip1q_s16(lows, highs); +#endif +} + +inline v128 gv_extend_lo_s16(const v128 &vec) { +#if defined(__SSE4_1__) + return _mm_cvtepi16_epi32(vec); +#elif defined(ARCH_X64) + return _mm_srai_epi32(_mm_unpacklo_epi16(vec, vec), 16); +#elif defined(ARCH_ARM64) + return int32x4_t(vmovl_s16(vget_low_s16(vec))); +#endif +} + +inline v128 gv_extend_hi_s16(const v128 &vec) { +#if defined(__SSE4_1__) + return _mm_cvtepi16_epi32(_mm_loadu_si64(vec._bytes + 8)); +#elif defined(ARCH_X64) + return _mm_srai_epi32(_mm_unpackhi_epi16(vec, vec), 16); +#elif defined(ARCH_ARM64) + return int32x4_t(vmovl_s16(vget_high_s16(vec))); +#endif +} + +inline v128 gv_unpacklo32(const v128 &lows, const v128 &highs) { +#if defined(ARCH_X64) + return _mm_unpacklo_epi32(lows, highs); +#elif defined(ARCH_ARM64) + return vzip1q_s32(lows, highs); +#endif +} + +inline v128 gv_unpackhi8(const v128 &lows, const v128 &highs) { +#if defined(ARCH_X64) + return _mm_unpackhi_epi8(lows, highs); +#elif defined(ARCH_ARM64) + return vzip2q_s8(lows, highs); +#endif +} + +inline v128 gv_unpackhi16(const v128 &lows, const v128 &highs) { +#if defined(ARCH_X64) + return _mm_unpackhi_epi16(lows, highs); +#elif defined(ARCH_ARM64) + return vzip2q_s16(lows, highs); +#endif +} + +inline v128 gv_unpackhi32(const v128 &lows, const v128 &highs) { +#if defined(ARCH_X64) + return _mm_unpackhi_epi32(lows, highs); +#elif defined(ARCH_ARM64) + return vzip2q_s32(lows, highs); +#endif +} + +inline bool v128::operator==(const v128 &b) const { +#if defined(ARCH_X64) + return gv_testz(_mm_xor_si128(*this, b)); +#else + return gv_testz(*this ^ b); +#endif +} + +inline v128 v128::operator|(const v128 &rhs) const { +#if defined(ARCH_X64) + return _mm_or_si128(*this, rhs); +#elif defined(ARCH_ARM64) + return vorrq_s32(*this, rhs); +#endif +} + +inline v128 v128::operator&(const v128 &rhs) const { +#if defined(ARCH_X64) + return _mm_and_si128(*this, rhs); +#elif defined(ARCH_ARM64) + return vandq_s32(*this, rhs); +#endif +} + +inline v128 v128::operator^(const v128 &rhs) const { +#if defined(ARCH_X64) + return _mm_xor_si128(*this, rhs); +#elif defined(ARCH_ARM64) + return veorq_s32(*this, rhs); +#endif +} + +inline v128 v128::operator~() const { +#if defined(ARCH_X64) + return _mm_xor_si128(*this, _mm_set1_epi32(-1)); +#elif defined(ARCH_ARM64) + return 
vmvnq_u32(*this); +#endif +} + +inline v128 gv_exp2_approxfs(const v128 &a) { + // TODO +#if 0 + const auto x0 = _mm_max_ps(_mm_min_ps(a, _mm_set1_ps(127.4999961f)), _mm_set1_ps(-127.4999961f)); + const auto x1 = _mm_add_ps(x0, _mm_set1_ps(0.5f)); + const auto x2 = _mm_sub_epi32(_mm_cvtps_epi32(x1), _mm_and_si128(_mm_castps_si128(_mm_cmpnlt_ps(_mm_setzero_ps(), x1)), _mm_set1_epi32(1))); + const auto x3 = _mm_sub_ps(x0, _mm_cvtepi32_ps(x2)); + const auto x4 = _mm_mul_ps(x3, x3); + const auto x5 = _mm_mul_ps(x3, _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(x4, _mm_set1_ps(0.023093347705f)), _mm_set1_ps(20.20206567f)), x4), _mm_set1_ps(1513.906801f))); + const auto x6 = _mm_mul_ps(x5, _mm_rcp_ps(_mm_sub_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(233.1842117f), x4), _mm_set1_ps(4368.211667f)), x5))); + return _mm_mul_ps(_mm_add_ps(_mm_add_ps(x6, x6), _mm_set1_ps(1.0f)), _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(x2, _mm_set1_epi32(127)), 23))); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._f[i] = std::exp2f(a._f[i]); + return r; +#endif +} + +inline v128 gv_log2_approxfs(const v128 &a) { + // TODO +#if 0 + const auto _1 = _mm_set1_ps(1.0f); + const auto _c = _mm_set1_ps(1.442695040f); + const auto x0 = _mm_max_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x00800000))); + const auto x1 = _mm_or_ps(_mm_and_ps(x0, _mm_castsi128_ps(_mm_set1_epi32(0x807fffff))), _1); + const auto x2 = _mm_rcp_ps(_mm_add_ps(x1, _1)); + const auto x3 = _mm_mul_ps(_mm_sub_ps(x1, _1), x2); + const auto x4 = _mm_add_ps(x3, x3); + const auto x5 = _mm_mul_ps(x4, x4); + const auto x6 = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(-0.7895802789f), x5), _mm_set1_ps(16.38666457f)), x5), _mm_set1_ps(-64.1409953f)); + const auto x7 = _mm_rcp_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(-35.67227983f), x5), _mm_set1_ps(312.0937664f)), x5), _mm_set1_ps(-769.6919436f))); + const auto x8 = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_castps_si128(x0), 23), _mm_set1_epi32(127))); + return _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_mul_ps(_mm_mul_ps(x5, x6), x7), x4), _c), _mm_add_ps(_mm_mul_ps(x4, _c), x8)); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._f[i] = std::log2f(a._f[i]); + return r; +#endif +} + +// For each 8-bit element, r = a << (b & 7) +inline v128 gv_shl8(const v128 &a, const v128 &b) { +#if defined(ARCH_ARM64) + return vshlq_u8(a, vandq_s8(b, gv_bcst8(7))); +#else + const v128 x1 = gv_add8(a, a); // shift left by 1 + const v128 r1 = gv_signselect8(gv_shl64(b, 7), x1, a); + const v128 x2 = gv_and32(gv_shl64(r1, 2), gv_bcst8(0xfc)); // shift by 2 + const v128 r2 = gv_signselect8(gv_shl64(b, 6), x2, r1); + const v128 x3 = gv_and32(gv_shl64(r2, 4), gv_bcst8(0xf0)); // shift by 4 + return gv_signselect8(gv_shl64(b, 5), x3, r2); +#endif +} + +// For each 16-bit element, r = a << (b & 15) +inline v128 gv_shl16(const v128 &a, const v128 &b) { +#if defined(__AVX512VL__) && defined(__AVX512BW__) + return _mm_sllv_epi16(a, _mm_and_si128(b, _mm_set1_epi16(15))); +#elif defined(ARCH_ARM64) + return vshlq_u16(a, vandq_s16(b, gv_bcst8(15))); +#else + v128 r; + for (u32 i = 0; i < 8; i++) + r._u16[i] = a._u16[i] << (b._u16[i] & 15); + return r; +#endif +} + +// For each 32-bit element, r = a << (b & 31) +inline v128 gv_shl32(const v128 &a, const v128 &b) { +#if defined(__AVX2__) + return _mm_sllv_epi32(a, _mm_and_si128(b, _mm_set1_epi32(31))); +#elif defined(ARCH_ARM64) + return vshlq_u32(a, vandq_s32(b, gv_bcst8(31))); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._u32[i] = a._u32[i] << (b._u32[i] & 
31); + return r; +#endif +} + +// For each unsigned 8-bit element, r = a >> (b & 7) +inline v128 gv_shr8(const v128 &a, const v128 &b) { +#if defined(ARCH_ARM64) + return vshlq_u8(a, vnegq_s8(vandq_s8(b, gv_bcst8(7)))); +#else + const v128 x1 = gv_and32(gv_shr64(a, 1), gv_bcst8(0x7f)); // shift right by 1 + const v128 r1 = gv_signselect8(gv_shl64(b, 7), x1, a); + const v128 x2 = gv_and32(gv_shr64(r1, 2), gv_bcst8(0x3f)); // shift by 2 + const v128 r2 = gv_signselect8(gv_shl64(b, 6), x2, r1); + const v128 x3 = gv_and32(gv_shr64(r2, 4), gv_bcst8(0x0f)); // shift by 4 + return gv_signselect8(gv_shl64(b, 5), x3, r2); +#endif +} + +// For each unsigned 16-bit element, r = a >> (b & 15) +inline v128 gv_shr16(const v128 &a, const v128 &b) { +#if defined(__AVX512VL__) && defined(__AVX512BW__) + return _mm_srlv_epi16(a, _mm_and_si128(b, _mm_set1_epi16(15))); +#elif defined(ARCH_ARM64) + return vshlq_u16(a, vnegq_s16(vandq_s16(b, gv_bcst8(15)))); +#else + v128 r; + for (u32 i = 0; i < 8; i++) + r._u16[i] = a._u16[i] >> (b._u16[i] & 15); + return r; +#endif +} + +// For each unsigned 32-bit element, r = a >> (b & 31) +inline v128 gv_shr32(const v128 &a, const v128 &b) { +#if defined(__AVX2__) + return _mm_srlv_epi32(a, _mm_and_si128(b, _mm_set1_epi32(31))); +#elif defined(ARCH_ARM64) + return vshlq_u32(a, vnegq_s32(vandq_s32(b, gv_bcst8(31)))); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._u32[i] = a._u32[i] >> (b._u32[i] & 31); + return r; +#endif +} + +// For each signed 8-bit element, r = a >> (b & 7) +inline v128 gv_sar8(const v128 &a, const v128 &b) { +#if defined(ARCH_ARM64) + return vshlq_s8(a, vnegq_s8(vandq_s8(b, gv_bcst8(7)))); +#else + v128 r; + for (u32 i = 0; i < 16; i++) + r._s8[i] = a._s8[i] >> (b._s8[i] & 7); + return r; +#endif +} + +// For each signed 16-bit element, r = a >> (b & 15) +inline v128 gv_sar16(const v128 &a, const v128 &b) { +#if defined(__AVX512VL__) && defined(__AVX512BW__) + return _mm_srav_epi16(a, _mm_and_si128(b, _mm_set1_epi16(15))); +#elif defined(ARCH_ARM64) + return vshlq_s16(a, vnegq_s16(vandq_s16(b, gv_bcst8(15)))); +#else + v128 r; + for (u32 i = 0; i < 8; i++) + r._s16[i] = a._s16[i] >> (b._s16[i] & 15); + return r; +#endif +} + +// For each signed 32-bit element, r = a >> (b & 31) +inline v128 gv_sar32(const v128 &a, const v128 &b) { +#if defined(__AVX2__) + return _mm_srav_epi32(a, _mm_and_si128(b, _mm_set1_epi32(31))); +#elif defined(ARCH_ARM64) + return vshlq_s32(a, vnegq_s32(vandq_s32(b, gv_bcst8(31)))); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._s32[i] = a._s32[i] >> (b._s32[i] & 31); + return r; +#endif +} + +// For each 8-bit element, r = rotate a by b +inline v128 gv_rol8(const v128 &a, const v128 &b) { +#if defined(ARCH_ARM64) + const auto amt1 = vandq_s8(b, gv_bcst8(7)); + const auto amt2 = vsubq_s8(amt1, gv_bcst8(8)); + return vorrq_u8(vshlq_u8(a, amt1), vshlq_u8(a, amt2)); +#else + const v128 x1 = + gv_sub8(gv_add8(a, a), gv_gts8(gv_bcst8(0), a)); // rotate left by 1 + const v128 r1 = gv_signselect8(gv_shl64(b, 7), x1, a); + const v128 c2 = gv_bcst8(0x3); + const v128 x2 = gv_or32(gv_and32(gv_shr64(r1, 6), c2), + gv_andn32(c2, gv_shl64(r1, 2))); // rotate by 2 + const v128 r2 = gv_signselect8(gv_shl64(b, 6), x2, r1); + const v128 c3 = gv_bcst8(0xf); + const v128 x3 = gv_or32(gv_and32(gv_shr64(r2, 4), c3), + gv_andn32(c3, gv_shl64(r2, 4))); // rotate by 4 + return gv_signselect8(gv_shl64(b, 5), x3, r2); +#endif +} + +// For each 16-bit element, r = rotate a by b +inline v128 gv_rol16(const v128 &a, const v128 &b) { +#if 
defined(ARCH_ARM64) + const auto amt1 = vandq_s16(b, gv_bcst16(15)); + const auto amt2 = vsubq_s16(amt1, gv_bcst16(16)); + return vorrq_u16(vshlq_u16(a, amt1), vshlq_u16(a, amt2)); +#else + v128 r; + for (u32 i = 0; i < 8; i++) + r._u16[i] = rol16(a._u16[i], b._u16[i]); + return r; +#endif +} + +// For each 16-bit element, r = rotate a by count +template inline v128 gv_rol16(const v128 &a) { + constexpr u8 count = Count & 0xf; +#if defined(ARCH_X64) + return _mm_or_si128(_mm_srli_epi16(a, 16 - count), _mm_slli_epi16(a, count)); +#elif defined(ARCH_ARM64) + return vorrq_u16(vshrq_n_u16(a, 16 - count), vshlq_n_u16(a, count)); +#else + v128 r; + for (u32 i = 0; i < 8; i++) + r._u16[i] = std::rotl(a._u16[i], count); + return r; +#endif +} + +// For each 32-bit element, r = rotate a by b +inline v128 gv_rol32(const v128 &a, const v128 &b) { +#if defined(__AVX512VL__) + return _mm_rolv_epi32(a, b); +#elif defined(ARCH_ARM64) + const auto amt1 = vandq_s32(b, gv_bcst32(31)); + const auto amt2 = vsubq_s32(amt1, gv_bcst32(32)); + return vorrq_u32(vshlq_u32(a, amt1), vshlq_u32(a, amt2)); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._u32[i] = rol32(a._u32[i], b._u32[i]); + return r; +#endif +} + +// For each 32-bit element, r = rotate a by count +template inline v128 gv_rol32(const v128 &a) { + constexpr u8 count = Count & 0x1f; +#if defined(__AVX512VL__) + return _mm_rol_epi32(a, count); +#elif defined(ARCH_X64) + return _mm_or_si128(_mm_srli_epi32(a, 32 - count), _mm_slli_epi32(a, count)); +#elif defined(ARCH_ARM64) + return vorrq_u32(vshrq_n_u32(a, 32 - count), vshlq_n_u32(a, count)); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._u32[i] = rol32(a._u32[i], count); + return r; +#endif +} + +// For each 8-bit element, r = (a << (c & 7)) | (b >> (~c & 7) >> 1) +inline auto gv_fshl8(const v128 &a, const v128 &b, const v128 &c) { +#if defined(ARCH_ARM64) + const auto amt1 = vandq_s8(c, gv_bcst8(7)); + const auto amt2 = vsubq_s8(amt1, gv_bcst8(8)); + return v128(vorrq_u8(vshlq_u8(a, amt1), vshlq_u8(b, amt2))); +#else + auto x1 = gv_sub8(gv_add8(a, a), gv_gts8(gv_bcst8(0), b)); + auto s1 = gv_shl64(c, 7); + auto r1 = gv_signselect8(s1, x1, a); + auto b1 = gv_signselect8(s1, gv_shl64(b, 1), b); + auto c2 = gv_bcst8(0x3); + auto x2 = gv_and32(gv_shr64(b1, 6), c2); + x2 = gv_or32(x2, gv_andn32(c2, gv_shl64(r1, 2))); + auto s2 = gv_shl64(c, 6); + auto r2 = gv_signselect8(s2, x2, r1); + auto b2 = gv_signselect8(s2, gv_shl64(b1, 2), b1); + auto c3 = gv_bcst8(0xf); + auto x3 = gv_and32(gv_shr64(b2, 4), c3); + x3 = gv_or32(x3, gv_andn32(c3, gv_shl64(r2, 4))); + return gv_signselect8(gv_shl64(c, 5), x3, r2); +#endif +} + +// For each 8-bit element, r = (b >> (c & 7)) | (a << (~c & 7) << 1) +inline auto gv_fshr8(const v128 &a, const v128 &b, const v128 &c) { +#if defined(ARCH_ARM64) + const auto amt1 = vandq_s8(c, gv_bcst8(7)); + const auto amt2 = vsubq_s8(gv_bcst8(8), amt1); + return vorrq_u8(vshlq_u8(b, vnegq_s8(amt1)), vshlq_u8(a, amt2)); +#else + auto c1 = gv_bcst8(0x7f); + auto x1 = gv_and32(gv_shr64(b, 1), c1); + x1 = gv_or32(x1, gv_andn32(c1, gv_shl64(a, 7))); + auto s1 = gv_shl64(c, 7); + auto r1 = gv_signselect8(s1, x1, b); + auto a1 = gv_signselect8(s1, gv_shr64(a, 1), a); + auto c2 = gv_bcst8(0x3f); + auto x2 = gv_and32(gv_shr64(r1, 2), c2); + x2 = gv_or32(x2, gv_andn32(c2, gv_shl64(a1, 6))); + auto s2 = gv_shl64(c, 6); + auto r2 = gv_signselect8(s2, x2, r1); + auto a2 = gv_signselect8(s2, gv_shr64(a1, 2), a1); + auto c3 = gv_bcst8(0x0f); + auto x3 = gv_and32(gv_shr64(r2, 4), c3); + x3 = 
gv_or32(x3, gv_andn32(c3, gv_shl64(a2, 4))); + return gv_signselect8(gv_shl64(c, 5), x3, r2); +#endif +} + +// Shift left by byte amount +template inline v128 gv_shuffle_left(const v128 &a) { + if (Count > 15) + return {}; +#if defined(ARCH_X64) + return _mm_slli_si128(a, Count); +#elif defined(ARCH_ARM64) + v128 idx; + for (u32 i = 0; i < 16; i++) + idx._u8[i] = u8(i - Count); + return vqtbl1q_u8(a, idx); +#endif +} + +// Shift right by byte amount +template inline v128 gv_shuffle_right(const v128 &a) { + if (Count > 15) + return {}; +#if defined(ARCH_X64) + return _mm_srli_si128(a, Count); +#elif defined(ARCH_ARM64) + v128 idx; + for (u32 i = 0; i < 16; i++) + idx._u8[i] = u8(i + Count); + return vqtbl1q_u8(a, idx); +#endif +} + +// Load 32-bit integer into the first element of a new vector, set other +// elements to zero +inline v128 gv_loadu32(const void *ptr) { +#if defined(ARCH_X64) + return _mm_loadu_si32(ptr); +#elif defined(ARCH_ARM64) + return vld1q_lane_u32(static_cast(ptr), vdupq_n_u32(0), 0); +#endif +} + +// Load 16-bit integer into an existing vector at the position specified by +// Index +template inline v128 gv_insert16(const v128 &vec, u16 value) { +#if defined(ARCH_X64) + return _mm_insert_epi16(vec, value, Index); +#elif defined(ARCH_ARM64) + return vsetq_lane_u16(value, vec, Index & 0x7); +#endif +} + +// For each 8-bit element, +// if ctrl >= 0 && ctrl < 16 then r = vec[ctrl], +// else if ctrl < 0 then r = 0 +inline v128 gv_shuffle8(const v128 &vec, const v128 &ctrl) { + AUDIT( + std::ranges::none_of( + ctrl._chars, [](s8 i) { return i >= static_cast(sizeof(v128)); }), + "All indices must be in the range [0, 15] or negative, since PSHUFB and " + "TBL behave differently otherwise"); +#if defined(__SSSE3__) + return _mm_shuffle_epi8(vec, ctrl); +#elif defined(ARCH_ARM64) + return vqtbl1q_s8(vec, ctrl); +#else + v128 r; + for (s32 i = 0; i < 16; i++) + r._s8[i] = ctrl._s8[i] < 0 ? 
0 : vec._s8[ctrl._s8[i] & 0xf]; + return r; +#endif +} + +// For each 2-bit index in Control, r = vec[index] +template inline v128 gv_shuffle32(const v128 &vec) { +#if defined(ARCH_X64) + return _mm_shuffle_epi32(vec, Control); +#elif defined(ARCH_ARM64) + constexpr u8 idx0 = (Control & 3) * sizeof(s32); + constexpr u8 idx1 = (Control >> 2 & 3) * sizeof(s32); + constexpr u8 idx2 = (Control >> 4 & 3) * sizeof(s32); + constexpr u8 idx3 = (Control >> 6 & 3) * sizeof(s32); + + constexpr uint8x16_t idx_vec = { + idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, + idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3}; + + return vqtbl1q_s8(vec, idx_vec); +#endif +} + +// For each index, r = vec[index & 3] +template +inline v128 gv_shuffle32(const v128 &vec) { +#if defined(ARCH_X64) + return _mm_shuffle_epi32(vec, (Index0 & 3) | (Index1 & 3) << 2 | + (Index2 & 3) << 4 | (Index3 & 3) << 6); +#elif defined(ARCH_ARM64) + constexpr u8 idx0 = (Index0 & 3) * sizeof(s32); + constexpr u8 idx1 = (Index1 & 3) * sizeof(s32); + constexpr u8 idx2 = (Index2 & 3) * sizeof(s32); + constexpr u8 idx3 = (Index3 & 3) * sizeof(s32); + + constexpr uint8x16_t idx_vec = { + idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, + idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3}; + + return vqtbl1q_s8(vec, idx_vec); +#endif +} + +// For the first two 2-bit indices in Control, r = a[index], +// for the last two indices, r = b[index] +template inline v128 gv_shufflefs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_shuffle_ps(a, b, Control); +#elif defined(ARCH_ARM64) + constexpr u8 idx0 = (Control & 3) * sizeof(s32); + constexpr u8 idx1 = (Control >> 2 & 3) * sizeof(s32); + constexpr u8 idx2 = (Control >> 4 & 3) * sizeof(s32) + sizeof(v128); + constexpr u8 idx3 = (Control >> 6 & 3) * sizeof(s32) + sizeof(v128); + + constexpr uint8x16_t idx_vec = { + idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, + idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3}; + + return vqtbl2q_s8({a, b}, idx_vec); +#endif +} + +// For the first two indices, r = a[index & 3], +// for the last two indices, r = b[index & 3] +template +inline v128 gv_shufflefs(const v128 &a, const v128 &b) { +#if defined(ARCH_X64) + return _mm_shuffle_ps(a, b, + (Index0 & 3) | (Index1 & 3) << 2 | (Index2 & 3) << 4 | + (Index3 & 3) << 6); +#elif defined(ARCH_ARM64) + constexpr u8 idx0 = (Index0 & 3) * sizeof(s32); + constexpr u8 idx1 = (Index1 & 3) * sizeof(s32); + constexpr u8 idx2 = (Index2 & 3) * sizeof(s32) + sizeof(v128); + constexpr u8 idx3 = (Index3 & 3) * sizeof(s32) + sizeof(v128); + + constexpr uint8x16_t idx_vec = { + idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, + idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3}; + + return vqtbl2q_s8({a, b}, idx_vec); +#endif +} + +// For each 32-bit element, reverse byte order +inline v128 gv_rev32(const v128 &vec) { +#if defined(__SSSE3__) + return _mm_shuffle_epi8( + vec, _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12)); +#elif defined(ARCH_ARM64) + return vrev32q_u8(vec); +#else + return gv_rol32<16>(gv_rol16<8>(vec)); +#endif +} + +// For each 32-bit element, convert between big-endian and native-endian +inline v128 gv_to_be32(const v128 &vec) { + if constexpr (std::endian::native == std::endian::little) + return gv_rev32(vec); + return vec; +} + +#if defined(__clang__) +#pragma clang diagnostic pop +#elif 
defined(__GNUC__) +#pragma GCC diagnostic pop +#endif +} // namespace rx diff --git a/rx/include/rx/types.hpp b/rx/include/rx/types.hpp new file mode 100644 index 000000000..c545a3c57 --- /dev/null +++ b/rx/include/rx/types.hpp @@ -0,0 +1,1522 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__SSE2__) || defined(_M_X64) || defined(_M_AMD64) || \ + defined(__x86_64__) || defined(__amd64__) +#define ARCH_X64 1 +#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) +#define ARCH_ARM64 1 +// v8.4a+ gives us atomic 16 byte ld/st +// See Arm C Language Extensions Documentation +// Currently there is no feature macro for LSE2 specifically so we define it +// ourself Unfortunately the __ARM_ARCH integer macro isn't universally defined +// so we use this hack instead +#if defined(__ARM_ARCH_8_4__) || defined(__ARM_ARCH_8_5__) || \ + defined(__ARM_ARCH_8_6__) || defined(__ARM_ARCH_9__) +#define ARM_FEATURE_LSE2 1 +#endif +#endif + +using std::chrono::steady_clock; + +using namespace std::literals; + +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + +#ifdef _MSC_VER +#define SAFE_BUFFERS(...) __declspec(safebuffers) __VA_ARGS__ +#define NEVER_INLINE __declspec(noinline) +#define FORCE_INLINE __forceinline +#else // not _MSC_VER +#ifdef __clang__ +#define SAFE_BUFFERS(...) __attribute__((no_stack_protector)) __VA_ARGS__ +#else +#define SAFE_BUFFERS(...) \ + __VA_ARGS__ __attribute__((__optimize__("no-stack-protector"))) +#endif +#define NEVER_INLINE __attribute__((noinline)) inline +#define FORCE_INLINE __attribute__((always_inline)) inline +#endif // _MSC_VER + +#define CHECK_SIZE(type, size) \ + static_assert(sizeof(type) == size, "Invalid " #type " type size") +#define CHECK_ALIGN(type, align) \ + static_assert(alignof(type) == align, "Invalid " #type " type alignment") +#define CHECK_MAX_SIZE(type, size) \ + static_assert(sizeof(type) <= size, #type " type size is too big") +#define CHECK_SIZE_ALIGN(type, size, align) \ + CHECK_SIZE(type, size); \ + CHECK_ALIGN(type, align) + +#define DECLARE(...) decltype(__VA_ARGS__) __VA_ARGS__ + +#define STR_CASE(...) \ + case __VA_ARGS__: \ + return #__VA_ARGS__ + +#if defined(_DEBUG) || defined(_AUDIT) +#define AUDIT(...) (static_cast(ensure(__VA_ARGS__))) +#else +#define AUDIT(...) (static_cast>(0)) +#endif + +namespace rx { +template struct fn_helper { + F f; + + fn_helper(F &&f) : f(std::forward(f)) {} + + template auto operator()(Args &&...args) const { + if constexpr (sizeof...(Args) == 0) + return f(0, 0, 0, 0); + else if constexpr (sizeof...(Args) == 1) + return f(std::forward(args)..., 0, 0, 0); + else if constexpr (sizeof...(Args) == 2) + return f(std::forward(args)..., 0, 0); + else if constexpr (sizeof...(Args) == 3) + return f(std::forward(args)..., 0); + else if constexpr (sizeof...(Args) == 4) + return f(std::forward(args)...); + else + static_assert(sizeof...(Args) <= 4); + } +}; + +template fn_helper(F &&f) -> fn_helper; +} // namespace rx + +// Shorter lambda. +#define FN(...) 
\ + ::rx::fn_helper([&]([[maybe_unused]] auto &&x, [[maybe_unused]] auto &&y, \ + [[maybe_unused]] auto &&z, \ + [[maybe_unused]] auto &&w) { return (__VA_ARGS__); }) + +#if __cpp_lib_bit_cast < 201806L +namespace std { +template +[[nodiscard]] constexpr To bit_cast(const From &from) noexcept { + return __builtin_bit_cast(To, from); +} +} // namespace std +#endif + +#if defined(__INTELLISENSE__) || (defined(__clang__) && (__clang_major__ <= 16)) +#define consteval constexpr +#define constinit +#endif + +// FIXME: move to ps3 kernel implementation +using schar = signed char; +using uchar = unsigned char; +using ushort = unsigned short; +using uint = unsigned int; +using ulong = unsigned long; +using ullong = unsigned long long; +using llong = long long; + +using uptr = std::uintptr_t; + +using u8 = std::uint8_t; +using u16 = std::uint16_t; +using u32 = std::uint32_t; +using u64 = std::uint64_t; +using usz = std::size_t; + +using s8 = std::int8_t; +using s16 = std::int16_t; +using s32 = std::int32_t; +using s64 = std::int64_t; +using ssz = std::make_signed_t; + +// Get integral type from type size +template struct get_int_impl {}; + +template <> struct get_int_impl { + using utype = u8; +}; + +template <> struct get_int_impl { + using utype = u16; +}; + +template <> struct get_int_impl { + using utype = u32; +}; + +template <> struct get_int_impl { + using utype = u64; +}; + +template using get_uint_t = typename get_int_impl::utype; + +template std::remove_cvref_t as_rvalue(T &&obj) { + return std::forward(obj); +} + +template class se_t; + +// se_t<> with native endianness +template +using nse_t = se_t; + +template +using be_t = se_t; +template +using le_t = se_t; + +// FIXME: remove +template class atomic_t; +template +using atomic_be_t = atomic_t, Align>; +template +using atomic_le_t = atomic_t, Align>; + +// Bool type equivalent +class b8 { + u8 m_value; + +public: + b8() = default; + + using enable_bitcopy = std::true_type; + + constexpr b8(bool value) noexcept : m_value(value) {} + + constexpr operator bool() const noexcept { return m_value != 0; } + + constexpr bool set(bool value) noexcept { + m_value = value; + return value; + } +}; + +#if defined(ARCH_X64) && !defined(_MSC_VER) +using __m128i = long long __attribute__((vector_size(16))); +using __m128d = double __attribute__((vector_size(16))); +using __m128 = float __attribute__((vector_size(16))); +#endif + +#ifndef _MSC_VER +using u128 = __uint128_t; +using s128 = __int128_t; +#else + +extern "C" { +union __m128; +union __m128i; +struct __m128d; + +uchar _addcarry_u64(uchar, u64, u64, u64 *); +uchar _subborrow_u64(uchar, u64, u64, u64 *); +u64 __shiftleft128(u64, u64, uchar); +u64 __shiftright128(u64, u64, uchar); +u64 _umul128(u64, u64, u64 *); +} + +// Unsigned 128-bit integer implementation (TODO) +struct alignas(16) u128 { + u64 lo, hi; + + u128() noexcept = default; + + template + requires std::is_unsigned_v + constexpr u128(T arg) noexcept : lo(arg), hi(0) {} + + template + requires std::is_signed_v + constexpr u128(T arg) noexcept : lo(s64{arg}), hi(s64{arg} >> 63) {} + + constexpr explicit operator bool() const noexcept { return !!(lo | hi); } + + constexpr explicit operator u64() const noexcept { return lo; } + + constexpr explicit operator s64() const noexcept { return lo; } + + constexpr friend u128 operator+(const u128 &l, const u128 &r) { + u128 value = l; + value += r; + return value; + } + + constexpr friend u128 operator-(const u128 &l, const u128 &r) { + u128 value = l; + value -= r; + return value; + } + + 
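// operator* follows the same pattern as operator+ and operator- above: copy the left operand and delegate to operator*=, which computes the low 64x64 product (via _umul128 outside constant evaluation) and adds the cross terms r.hi*lo + r.lo*hi into the high half. + 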
constexpr friend u128 operator*(const u128 &l, const u128 &r) { + u128 value = l; + value *= r; + return value; + } + + constexpr u128 operator+() const { return *this; } + + constexpr u128 operator-() const { + u128 value{}; + value -= *this; + return value; + } + + constexpr u128 &operator++() { + *this += 1; + return *this; + } + + constexpr u128 operator++(int) { + u128 value = *this; + *this += 1; + return value; + } + + constexpr u128 &operator--() { + *this -= 1; + return *this; + } + + constexpr u128 operator--(int) { + u128 value = *this; + *this -= 1; + return value; + } + + constexpr u128 operator<<(u128 shift_value) const { + u128 value = *this; + value <<= shift_value; + return value; + } + + constexpr u128 operator>>(u128 shift_value) const { + u128 value = *this; + value >>= shift_value; + return value; + } + + constexpr u128 operator~() const { + u128 value{}; + value.lo = ~lo; + value.hi = ~hi; + return value; + } + + constexpr friend u128 operator&(const u128 &l, const u128 &r) { + u128 value{}; + value.lo = l.lo & r.lo; + value.hi = l.hi & r.hi; + return value; + } + + constexpr friend u128 operator|(const u128 &l, const u128 &r) { + u128 value{}; + value.lo = l.lo | r.lo; + value.hi = l.hi | r.hi; + return value; + } + + constexpr friend u128 operator^(const u128 &l, const u128 &r) { + u128 value{}; + value.lo = l.lo ^ r.lo; + value.hi = l.hi ^ r.hi; + return value; + } + + constexpr u128 &operator+=(const u128 &r) { + if (std::is_constant_evaluated()) { + lo += r.lo; + hi += r.hi + (lo < r.lo); + } else { + _addcarry_u64(_addcarry_u64(0, r.lo, lo, &lo), r.hi, hi, &hi); + } + + return *this; + } + + constexpr u128 &operator-=(const u128 &r) { + if (std::is_constant_evaluated()) { + hi -= r.hi + (lo < r.lo); + lo -= r.lo; + } else { + _subborrow_u64(_subborrow_u64(0, lo, r.lo, &lo), hi, r.hi, &hi); + } + + return *this; + } + + constexpr u128 &operator*=(const u128 &r) { + const u64 _hi = r.hi * lo + r.lo * hi; + + if (std::is_constant_evaluated()) { + hi = (lo >> 32) * (r.lo >> 32) + + (((lo >> 32) * (r.lo & 0xffffffff)) >> 32) + + (((r.lo >> 32) * (lo & 0xffffffff)) >> 32); + lo = lo * r.lo; + } else { + lo = _umul128(lo, r.lo, &hi); + } + + hi += _hi; + return *this; + } + + constexpr u128 &operator<<=(const u128 &r) { + if (std::is_constant_evaluated()) { + if (r.hi == 0 && r.lo < 64) { + hi = (hi << r.lo) | (lo >> (64 - r.lo)); + lo = (lo << r.lo); + return *this; + } else if (r.hi == 0 && r.lo < 128) { + hi = (lo << (r.lo - 64)); + lo = 0; + return *this; + } + } + + const u64 v0 = lo << (r.lo & 63); + const u64 v1 = __shiftleft128(lo, hi, static_cast(r.lo)); + lo = (r.lo & 64) ? 0 : v0; + hi = (r.lo & 64) ? v0 : v1; + return *this; + } + + constexpr u128 &operator>>=(const u128 &r) { + if (std::is_constant_evaluated()) { + if (r.hi == 0 && r.lo < 64) { + lo = (lo >> r.lo) | (hi << (64 - r.lo)); + hi = (hi >> r.lo); + return *this; + } else if (r.hi == 0 && r.lo < 128) { + lo = (hi >> (r.lo - 64)); + hi = 0; + return *this; + } + } + + const u64 v0 = hi >> (r.lo & 63); + const u64 v1 = __shiftright128(lo, hi, static_cast(r.lo)); + lo = (r.lo & 64) ? v0 : v1; + hi = (r.lo & 64) ? 
0 : v0; + return *this; + } + + constexpr u128 &operator&=(const u128 &r) { + lo &= r.lo; + hi &= r.hi; + return *this; + } + + constexpr u128 &operator|=(const u128 &r) { + lo |= r.lo; + hi |= r.hi; + return *this; + } + + constexpr u128 &operator^=(const u128 &r) { + lo ^= r.lo; + hi ^= r.hi; + return *this; + } +}; + +// Signed 128-bit integer implementation +struct s128 : u128 { + using u128::u128; + + constexpr s128 operator>>(u128 shift_value) const { + s128 value = *this; + value >>= shift_value; + return value; + } + + constexpr s128 &operator>>=(const u128 &r) { + if (std::is_constant_evaluated()) { + if (r.hi == 0 && r.lo < 64) { + lo = (lo >> r.lo) | (hi << (64 - r.lo)); + hi = (static_cast(hi) >> r.lo); + return *this; + } else if (r.hi == 0 && r.lo < 128) { + s64 _lo = static_cast(hi) >> (r.lo - 64); + lo = _lo; + hi = _lo >> 63; + return *this; + } + } + + const u64 v0 = static_cast(hi) >> (r.lo & 63); + const u64 v1 = __shiftright128(lo, hi, static_cast(r.lo)); + lo = (r.lo & 64) ? v0 : v1; + hi = (r.lo & 64) ? static_cast(hi) >> 63 : v0; + return *this; + } +}; +#endif + +// Optimization for u64*u64=u128 +constexpr u128 u128_from_mul(u64 a, u64 b) { +#ifdef _MSC_VER + if (!std::is_constant_evaluated()) { + u64 hi; + u128 result = _umul128(a, b, &hi); + result.hi = hi; + return result; + } +#endif + + return u128{a} * b; +} + +template <> struct get_int_impl<16> { + using utype = u128; + using stype = s128; +}; + +enum class f16 : u16 {}; + +using f32 = float; +using f64 = double; + +template +concept UnsignedInt = std::is_unsigned_v> || + std::is_same_v, u128>; + +template +concept SignedInt = (std::is_signed_v> && + std::is_integral_v>) || + std::is_same_v, s128>; + +template +concept FPInt = std::is_floating_point_v> || + std::is_same_v, f16>; + +template +concept Integral = std::is_integral_v> || + std::is_same_v, u128> || + std::is_same_v, s128>; + +template constexpr T min_v; + +template constexpr std::common_type_t min_v = 0; + +template +constexpr std::common_type_t min_v = + static_cast>(-1) + << (sizeof(std::common_type_t) * 8 - 1); + +template <> constexpr inline f16 min_v{0xfbffu}; + +template <> +constexpr inline f32 min_v = std::bit_cast(0xff'7fffffu); + +template <> +constexpr inline f64 min_v = + std::bit_cast(0xffe'7ffff'ffffffffu); + +template +constexpr std::common_type_t min_v = min_v>; + +template constexpr T max_v; + +template constexpr std::common_type_t max_v = -1; + +template +constexpr std::common_type_t max_v = + static_cast>(~min_v); + +template <> constexpr inline f16 max_v{0x7bffu}; + +template <> +constexpr inline f32 max_v = std::bit_cast(0x7f'7fffffu); + +template <> +constexpr inline f64 max_v = + std::bit_cast(0x7fe'fffff'ffffffffu); + +template +constexpr std::common_type_t max_v = max_v>; + +// Return magic value for any unsigned type +constexpr struct umax_impl_t { + template constexpr bool operator==(const T &rhs) const { + return rhs == max_v; + } + + template + constexpr std::strong_ordering operator<=>(const T &rhs) const { + return rhs == max_v ? std::strong_ordering::equal + : std::strong_ordering::greater; + } + + template constexpr operator T() const { return max_v; } +} umax; + +constexpr struct smin_impl_t { + template constexpr bool operator==(const T &rhs) const { + return rhs == min_v; + } + + template + constexpr std::strong_ordering operator<=>(const T &rhs) const { + return rhs == min_v ? 
std::strong_ordering::equal + : std::strong_ordering::less; + } + + template constexpr operator T() const { return min_v; } +} smin; + +constexpr struct smax_impl_t { + template constexpr bool operator==(const T &rhs) const { + return rhs == max_v; + } + + template + constexpr std::strong_ordering operator<=>(const T &rhs) const { + return rhs == max_v ? std::strong_ordering::equal + : std::strong_ordering::greater; + } + + template constexpr operator T() const { return max_v; } +} smax; + +// Compare signed or unsigned type with its max value +constexpr struct amax_impl_t { + template + requires SignedInt || UnsignedInt + constexpr bool operator==(const T &rhs) const { + return rhs == max_v; + } + + template + requires SignedInt || UnsignedInt + constexpr std::strong_ordering operator<=>(const T &rhs) const { + return max_v <=> rhs; + } + + template + requires SignedInt || UnsignedInt + constexpr operator T() const { + return max_v; + } +} amax; + +// Compare signed or unsigned type with its minimal value (like zero or INT_MIN) +constexpr struct amin_impl_t { + template + requires SignedInt || UnsignedInt + constexpr bool operator==(const T &rhs) const { + return rhs == min_v; + } + + template + requires SignedInt || UnsignedInt + constexpr std::strong_ordering operator<=>(const T &rhs) const { + return min_v <=> rhs; + } + + template + requires SignedInt || UnsignedInt + constexpr operator T() const { + return min_v; + } +} amin; + +namespace rx::detail { +template union UndefinedObject { + T s; + + UndefinedObject() {} + ~UndefinedObject() {} +}; +} // namespace rx::detail + +#define OFFSET_OF(STRUCT, FIELD) \ + ([&] { \ + ::rx::detail::UndefinedObject undefinedObject; \ + return static_cast( \ + std::bit_cast(&(undefinedObject.s.FIELD)) - \ + std::bit_cast(&undefinedObject.s)); \ + }()) + +template inline u32 offset32(T T2::*const mptr) { +#ifdef _MSC_VER + return std::bit_cast(mptr); +#elif __GNUG__ + return std::bit_cast(mptr); +#else + static_assert(sizeof(mptr) == 0, "Unsupported pointer-to-member size"); +#endif +} + +template struct offset32_array { + static_assert(std::is_array_v, + "Invalid pointer-to-member type (array expected)"); + + template static inline u32 index32(const Arg &arg) { + return u32{sizeof(std::remove_extent_t)} * static_cast(arg); + } +}; + +template struct offset32_array> { + template static inline u32 index32(const Arg &arg) { + return u32{sizeof(T)} * static_cast(arg); + } +}; + +template struct offset32_detail; + +template +inline u32 offset32(T T2::*const mptr, const Arg &arg, const Args &...args) { + return offset32_detail::offset32(mptr, arg, args...); +} + +template struct offset32_detail { + template + static inline u32 offset32(T T2::*const mptr, const Arg &arg, + const Args &...args) { + return ::offset32(mptr, args...) 
+ offset32_array::index32(arg); + } +}; + +template struct offset32_detail { + template + static inline u32 offset32(T T2::*const mptr, T3 T4::*const mptr2, + const Args &...args) { + return ::offset32(mptr) + ::offset32(mptr2, args...); + } +}; + +// Convert 0-2-byte string to u16 value like reinterpret_cast does +constexpr u16 operator""_u16(const char *s, usz /*length*/) { + char buf[2]{s[0], s[1]}; + return std::bit_cast(buf); +} + +// Convert 3-4-byte string to u32 value like reinterpret_cast does +constexpr u32 operator""_u32(const char *s, usz /*length*/) { + char buf[4]{s[0], s[1], s[2], s[3]}; + return std::bit_cast(buf); +} + +// Convert 5-8-byte string to u64 value like reinterpret_cast does +constexpr u64 operator""_u64(const char *s, usz len) { + char buf[8]{s[0], + s[1], + s[2], + s[3], + s[4], + (len < 6 ? '\0' : s[5]), + (len < 7 ? '\0' : s[6]), + (len < 8 ? '\0' : s[7])}; + return std::bit_cast(buf); +} + +#if !defined(__INTELLISENSE__) && !__has_builtin(__builtin_COLUMN) && \ + !defined(_MSC_VER) +constexpr unsigned __builtin_COLUMN() { return -1; } +#endif + +template struct const_str_t { + static constexpr usz size = Size; + + char8_t chars[Size + 1]{}; + + constexpr const_str_t(const char (&a)[Size + 1]) { + for (usz i = 0; i <= Size; i++) + chars[i] = a[i]; + } + + constexpr const_str_t(const char8_t (&a)[Size + 1]) { + for (usz i = 0; i <= Size; i++) + chars[i] = a[i]; + } + + operator const char *() const { + return reinterpret_cast(chars); + } + + constexpr operator const char8_t *() const { return chars; } +}; + +template <> struct const_str_t { + const usz size; + + union { + const char8_t *chars; + const char *chars2; + }; + + constexpr const_str_t() : size(0), chars(nullptr) {} + + template + constexpr const_str_t(const char8_t (&a)[N]) : size(N - 1), chars(+a) {} + + template + constexpr const_str_t(const char (&a)[N]) : size(N - 1), chars2(+a) {} + + constexpr operator const char *() const { return std::launder(chars2); } + + constexpr operator const char8_t *() const { return chars; } +}; + +template const_str_t(const char (&a)[Size]) -> const_str_t; + +template +const_str_t(const char8_t (&a)[Size]) -> const_str_t; + +using const_str = const_str_t<>; + +namespace fmt { +[[noreturn]] void raw_verify_error(std::source_location loc, const char8_t *msg, + usz object); +[[noreturn]] void raw_range_error(std::source_location loc, + std::string_view index, usz container_size); +[[noreturn]] void raw_range_error(std::source_location loc, usz index, + usz container_size); +} // namespace fmt + +// No full implementation to ease on header weight +template +std::conditional_t>, usz, + std::string_view> +format_object_simplified(const T &obj) { + using type = std::remove_cvref_t; + + if constexpr (std::is_integral_v || std::is_same_v || + std::is_same_v) { + return obj; + } else if constexpr (std::is_array_v && + std::is_constructible_v) { + return {obj, std::size(obj) - 1}; + } else { + return std::string_view{}; + } +} + +template +constexpr decltype(auto) ensure( + T &&arg, const_str msg = const_str(), + std::source_location src_loc = std::source_location::current()) noexcept { + if (std::forward(arg)) [[likely]] { + return std::forward(arg); + } + + fmt::raw_verify_error(src_loc, msg, 0); +} + +template + requires(std::is_invocable_v) +constexpr decltype(auto) ensure( + T &&arg, F &&pred, const_str msg = const_str(), + std::source_location src_loc = std::source_location::current()) noexcept { + if (std::forward(pred)(std::forward(arg))) [[likely]] { + return 
std::forward(arg); + } + + fmt::raw_verify_error(src_loc, msg, 0); +} + +template + requires( + std::is_integral_v() + std::declval())>) +[[nodiscard]] constexpr To +narrow(const From &value, + std::source_location src_loc = std::source_location::current()) { + // Narrow check + using CommonFrom = std::common_type_t; + using CommonTo = std::common_type_t; + + using UnFrom = std::make_unsigned_t; + using UnTo = std::make_unsigned_t; + + constexpr bool is_from_signed = std::is_signed_v; + constexpr bool is_to_signed = std::is_signed_v; + + constexpr auto from_mask = + (is_from_signed && !is_to_signed) ? UnFrom{umax} >> 1 : UnFrom{umax}; + constexpr auto to_mask = + (is_to_signed && !is_from_signed) ? UnTo{umax} >> 1 : UnTo{umax}; + + constexpr auto mask = ~(from_mask & to_mask); + + // Signed to unsigned always require test + // Otherwise, this is bit-wise narrowing or conversion between types of + // different signedness of the same size + if constexpr ((is_from_signed && !is_to_signed) || to_mask < from_mask) { + // Try to optimize test if both are of the same signedness + if (is_from_signed != is_to_signed ? !!(value & mask) + : static_cast(value) != value) + [[unlikely]] { + fmt::raw_verify_error(src_loc, u8"Narrowing error", +value); + } + } + + return static_cast(value); +} + +// Returns u32 size() for container +template + requires requires(const CT &x) { std::size(x); } +[[nodiscard]] constexpr u32 +size32(const CT &container, + std::source_location src_loc = std::source_location::current()) { + // TODO: Support std::array + constexpr bool is_const = std::is_array_v>; + + if constexpr (is_const) { + constexpr usz Size = sizeof(container) / sizeof(container[0]); + return std::conditional_t{Size}; + } else { + return narrow(container.size(), src_loc); + } +} + +template + requires requires(CT &&x) { + std::size(x); + std::data(x); + } || requires(CT &&x) { + std::size(x); + x.front(); + } +[[nodiscard]] constexpr auto & +at32(CT &&container, T &&index, + std::source_location src_loc = std::source_location::current()) { + // Make sure the index is within u32 range + const std::make_unsigned_t> idx = index; + const u32 csz = ::size32(container, src_loc); + if (csz <= idx) [[unlikely]] + fmt::raw_range_error(src_loc, format_object_simplified(index), csz); + auto it = std::begin(std::forward(container)); + std::advance(it, idx); + return *it; +} + +template + requires requires(CT &&x, T &&y) { + x.count(y); + x.find(y); + } +[[nodiscard]] constexpr auto & +at32(CT &&container, T &&index, + std::source_location src_loc = std::source_location::current()) { + // Associative container + const auto found = container.find(std::forward(index)); + usz csv = umax; + if constexpr ((requires() { container.size(); })) + csv = container.size(); + if (found == container.end()) [[unlikely]] + fmt::raw_range_error(src_loc, format_object_simplified(index), csv); + return found->second; +} + +// Simplified hash algorithm. May be used in std::unordered_(map|set). 
+template struct value_hash { + usz operator()(T value) const { return static_cast(value) >> Shift; } +}; + +template struct fill_array_t { + std::tuple args; + + template + constexpr std::unwrap_reference_t get() const { + return std::get(args); + } + + template + constexpr std::array fill(std::index_sequence, + std::index_sequence) const { + return {(static_cast(Idx), U(get()...))...}; + } + + template constexpr operator std::array() const { + return fill(std::make_index_sequence(), + std::make_index_sequence()); + } +}; + +template constexpr auto fill_array(const T &...args) { + return fill_array_t{{args...}}; +} + +template +concept PtrCastable = requires(const volatile X *x, const volatile Y *y) { + static_cast(x); + static_cast(y); +}; + +template + requires PtrCastable +consteval bool is_same_ptr() { + if constexpr (std::is_void_v || std::is_void_v || + std::is_same_v, std::remove_cv_t>) { + return true; + } else if constexpr (sizeof(X) == sizeof(Y)) { + return true; + } else { + bool result = false; + + if constexpr (sizeof(X) < sizeof(Y)) { + std::allocator a{}; + Y *ptr = a.allocate(1); + result = static_cast(ptr) == static_cast(ptr); + a.deallocate(ptr, 1); + } else { + std::allocator a{}; + X *ptr = a.allocate(1); + result = static_cast(ptr) == static_cast(ptr); + a.deallocate(ptr, 1); + } + + return result; + } +} + +template + requires PtrCastable +constexpr bool is_same_ptr(const volatile Y *ptr) { + return static_cast(ptr) == + static_cast(ptr); +} + +template +concept PtrSame = (is_same_ptr()); + +template struct exact_t { + static_assert(std::is_reference_v || std::is_convertible_v); + + T obj; + + explicit exact_t(T &&_obj) : obj(std::forward(_obj)) {} + exact_t &operator=(const exact_t &) = delete; + + template + requires(std::is_same_v) + operator U &() const noexcept { + return obj; + }; + + template + requires(std::is_same_v) + operator const U &() const noexcept { + return obj; + }; + + template + requires(std::is_same_v && std::is_copy_constructible_v) + operator U() const noexcept { + return obj; + }; +}; + +template exact_t make_exact(T &&obj) noexcept { + return exact_t(static_cast(obj)); +} + +// Read object of type T from raw pointer, array, string, vector, or any +// contiguous container +template +constexpr T read_from_ptr(U &&array, usz pos = 0) { + // TODO: ensure array element types are trivial + static_assert(sizeof(T) % sizeof(array[0]) == 0); + std::decay_t buf[sizeof(T) / sizeof(array[0])]; + if (!std::is_constant_evaluated()) + std::memcpy(+buf, &array[pos], sizeof(buf)); + else + for (usz i = 0; i < pos; buf[i] = array[pos + i], i++) + ; + return std::bit_cast(buf); +} + +template +constexpr void write_to_ptr(U &&array, usz pos, const T &value) { + static_assert(sizeof(T) % sizeof(array[0]) == 0); + if (!std::is_constant_evaluated()) + std::memcpy(static_cast(&array[pos]), &value, sizeof(value)); + else + ensure(!"Unimplemented"); +} + +template +constexpr void write_to_ptr(U &&array, const T &value) { + static_assert(sizeof(T) % sizeof(array[0]) == 0); + if (!std::is_constant_evaluated()) + std::memcpy(&array[0], &value, sizeof(value)); + else + ensure(!"Unimplemented"); +} + +constexpr struct aref_tag_t { +} aref_tag{}; + +template class aref final { + U *m_ptr; + + static_assert(sizeof(std::decay_t) % sizeof(U) == 0); + +public: + aref() = delete; + + constexpr aref(const aref &) = default; + + explicit constexpr aref(aref_tag_t, U *ptr) : m_ptr(ptr) {} + + constexpr T value() const { return read_from_ptr(m_ptr); } + + constexpr operator T() 
const { return read_from_ptr(m_ptr); } + + aref &operator=(const aref &) = delete; + + constexpr aref &operator=(const T &value) const { + write_to_ptr(m_ptr, value); + return *this; + } +}; + +template class aref { + U *m_ptr; + + static_assert(sizeof(std::decay_t) % sizeof(U) == 0); + +public: + aref() = delete; + + constexpr aref(const aref &) = default; + + explicit constexpr aref(aref_tag_t, U *ptr) : m_ptr(ptr) {} + + aref &operator=(const aref &) = delete; + + constexpr aref operator[](usz index) const { + return aref(aref_tag, m_ptr + index * (sizeof(T) / sizeof(U))); + } +}; + +template class aref { + U *m_ptr; + + static_assert(sizeof(std::decay_t) % sizeof(U) == 0); + +public: + aref() = delete; + + constexpr aref(const aref &) = default; + + explicit constexpr aref(aref_tag_t, U *ptr) : m_ptr(ptr) {} + + aref &operator=(const aref &) = delete; + + constexpr aref operator[](usz index) const { + return aref(aref_tag, m_ptr + index * (sizeof(T) / sizeof(U))); + } +}; + +// Reference object of type T, see read_from_ptr +template +constexpr auto ref_ptr(U &&array, usz pos = 0) + -> aref> { + return aref>(aref_tag, &array[pos]); +} + +template +struct se_storage { + struct type8 { + alignas(Align > alignof(T) ? alignof(T) : Align) uchar data[sizeof(T)]; + }; + + struct type64 { + alignas(8) u64 data[sizeof(T) < 8 ? 1 : sizeof(T) / 8]; + }; + + using type = + std::conditional_t<(Align >= 8 && sizeof(T) % 8 == 0), type64, type8>; + + // Possibly unoptimized generic byteswap for unaligned data + static constexpr type swap(const type &src) noexcept; +}; + +template struct se_storage { + using type = u16; + + static constexpr u16 swap(u16 src) noexcept { +#if __cpp_lib_byteswap >= 202110L + return std::byteswap(src); +#elif defined(__GNUG__) + return __builtin_bswap16(src); +#else + if (std::is_constant_evaluated()) { + return (src >> 8) | (src << 8); + } + + return _byteswap_ushort(src); +#endif + } +}; + +template struct se_storage { + using type = u32; + + static constexpr u32 swap(u32 src) noexcept { +#if __cpp_lib_byteswap >= 202110L + return std::byteswap(src); +#elif defined(__GNUG__) + return __builtin_bswap32(src); +#else + if (std::is_constant_evaluated()) { + const u32 v0 = ((src << 8) & 0xff00ff00) | ((src >> 8) & 0x00ff00ff); + return (v0 << 16) | (v0 >> 16); + } + + return _byteswap_ulong(src); +#endif + } +}; + +template struct se_storage { + using type = u64; + + static constexpr u64 swap(u64 src) noexcept { +#if __cpp_lib_byteswap >= 202110L + return std::byteswap(src); +#elif defined(__GNUG__) + return __builtin_bswap64(src); +#else + if (std::is_constant_evaluated()) { + const u64 v0 = + ((src << 8) & 0xff00ff00ff00ff00) | ((src >> 8) & 0x00ff00ff00ff00ff); + const u64 v1 = + ((v0 << 16) & 0xffff0000ffff0000) | ((v0 >> 16) & 0x0000ffff0000ffff); + return (v1 << 32) | (v1 >> 32); + } + + return _byteswap_uint64(src); +#endif + } +}; + +template +constexpr typename se_storage::type +se_storage::swap(const type &src) noexcept { + // Try to keep u16/u32/u64 optimizations at the cost of more bitcasts + if constexpr (sizeof(T) == 1) { + return src; + } else if constexpr (sizeof(T) == 2) { + return std::bit_cast(se_storage::swap(std::bit_cast(src))); + } else if constexpr (sizeof(T) == 4) { + return std::bit_cast(se_storage::swap(std::bit_cast(src))); + } else if constexpr (sizeof(T) == 8) { + return std::bit_cast(se_storage::swap(std::bit_cast(src))); + } else if constexpr (sizeof(T) % 8 == 0) { + type64 tmp = std::bit_cast(src); + type64 dst{}; + + // Swap u64 blocks + for 
(usz i = 0; i < sizeof(T) / 8; i++) { + dst.data[i] = se_storage::swap(tmp.data[sizeof(T) / 8 - 1 - i]); + } + + return std::bit_cast(dst); + } else { + type dst{}; + + // Swap by moving every byte + for (usz i = 0; i < sizeof(T); i++) { + dst.data[i] = src.data[sizeof(T) - 1 - i]; + } + + return dst; + } +} + +// Endianness support template +template +class alignas(Align) se_t { + using type = std::remove_cv_t; + using stype = typename se_storage::type; + using storage = se_storage; + + stype m_data; + + static_assert(!std::is_pointer_v, + "se_t<> error: invalid type (pointer)"); + static_assert(!std::is_reference_v, + "se_t<> error: invalid type (reference)"); + static_assert(!std::is_array_v, "se_t<> error: invalid type (array)"); + static_assert(sizeof(type) == alignof(type), + "se_t<> error: unexpected alignment"); + + static constexpr stype to_data(type value) noexcept { + if constexpr (Swap) { + return storage::swap(std::bit_cast(value)); + } else { + return std::bit_cast(value); + } + } + + static constexpr auto int_or_enum() { + if constexpr (std::is_enum_v) { + return std::underlying_type_t{}; + } else { + return type{}; + } + } + + using under = decltype(int_or_enum()); + +public: + se_t() noexcept = default; + + constexpr se_t(type value) noexcept : m_data(to_data(value)) {} + + constexpr type value() const noexcept { + if constexpr (Swap) { + return std::bit_cast(storage::swap(m_data)); + } else { + return std::bit_cast(m_data); + } + } + + constexpr type get() const noexcept { return value(); } + + constexpr se_t &operator=(type value) noexcept { + m_data = to_data(value); + return *this; + } + + constexpr operator type() const noexcept { return value(); } + +#ifdef _MSC_VER + explicit constexpr operator bool() const noexcept { + static_assert(!type{}); + static_assert(!std::is_floating_point_v); + return !!std::bit_cast(m_data); + } +#endif + + constexpr auto operator~() const noexcept { + if constexpr ((std::is_integral_v || std::is_enum_v) && + std::is_convertible_v) { + // Return se_t of integral type if possible. Promotion to int is omitted + // on purpose (a compromise). 
+ return std::bit_cast>( + static_cast(~std::bit_cast(m_data))); + } else { + return ~value(); + } + } + +private: + // Compatible bit pattern cast + template + static constexpr To right_arg_cast(const T2 &rhs) noexcept { + return std::bit_cast(static_cast>(rhs)); + } + + template + static constexpr To + right_arg_cast(const se_t &rhs) noexcept { + if constexpr ((std::is_integral_v || std::is_enum_v) && + std::is_convertible_v && sizeof(R) == sizeof(T)) { + // Optimization: allow to reuse bit pattern of any se_t with + // bit-compatible type + return std::bit_cast(rhs); + } else { + return std::bit_cast(static_cast>(rhs.value())); + } + } + +public: + template + requires requires(const T2 &t2) { +t2; } + constexpr bool operator==(const T2 &rhs) const noexcept { + using R = std::common_type_t; + + if constexpr ((std::is_integral_v || std::is_enum_v) && + (std::is_integral_v || std::is_enum_v)) { + if constexpr (sizeof(T) >= sizeof(R)) { + if constexpr (std::is_convertible_v && + std::is_convertible_v) { + return std::bit_cast(m_data) == right_arg_cast(rhs); + } else { + // Compare with strict type on the right side (possibly scoped enum) + return std::bit_cast(m_data) == right_arg_cast(rhs); + } + } + } + + // Keep outside of if constexpr to make sure it fails on invalid comparison + return value() == rhs; + } + +private: + template static constexpr bool check_args_for_bitwise_op() { + using R = std::common_type_t; + + if constexpr ((std::is_integral_v || std::is_enum_v) && + (std::is_integral_v || std::is_enum_v)) { + if constexpr (std::is_convertible_v && + std::is_convertible_v && sizeof(T) >= sizeof(R)) { + return true; + } + } + + return false; + } + +public: + template + constexpr auto operator&(const T2 &rhs) const noexcept { + if constexpr (check_args_for_bitwise_op()) { + return std::bit_cast>(static_cast( + std::bit_cast(m_data) & right_arg_cast(rhs))); + } else { + return value() & rhs; + } + } + + template + constexpr auto operator|(const T2 &rhs) const noexcept { + if constexpr (check_args_for_bitwise_op()) { + return std::bit_cast>(static_cast( + std::bit_cast(m_data) | right_arg_cast(rhs))); + } else { + return value() | rhs; + } + } + + template + constexpr auto operator^(const T2 &rhs) const noexcept { + if constexpr (check_args_for_bitwise_op()) { + return std::bit_cast>(static_cast( + std::bit_cast(m_data) ^ right_arg_cast(rhs))); + } else { + return value() ^ rhs; + } + } + + template constexpr se_t &operator+=(const T1 &rhs) { + *this = value() + rhs; + return *this; + } + + template constexpr se_t &operator-=(const T1 &rhs) { + *this = value() - rhs; + return *this; + } + + template constexpr se_t &operator*=(const T1 &rhs) { + *this = value() * rhs; + return *this; + } + + template constexpr se_t &operator/=(const T1 &rhs) { + *this = value() / rhs; + return *this; + } + + template constexpr se_t &operator%=(const T1 &rhs) { + *this = value() % rhs; + return *this; + } + + template constexpr se_t &operator&=(const T1 &rhs) { + if constexpr (std::is_integral_v) { + m_data = std::bit_cast(static_cast( + std::bit_cast(m_data) & right_arg_cast(rhs))); + return *this; + } + + *this = value() & rhs; + return *this; + } + + template constexpr se_t &operator|=(const T1 &rhs) { + if constexpr (std::is_integral_v) { + m_data = std::bit_cast(static_cast( + std::bit_cast(m_data) | right_arg_cast(rhs))); + return *this; + } + + *this = value() | rhs; + return *this; + } + + template constexpr se_t &operator^=(const T1 &rhs) { + if constexpr (std::is_integral_v) { + m_data = 
std::bit_cast(static_cast( + std::bit_cast(m_data) ^ right_arg_cast(rhs))); + return *this; + } + + *this = value() ^ rhs; + return *this; + } + + template constexpr se_t &operator<<=(const T1 &rhs) { + *this = value() << rhs; + return *this; + } + + template constexpr se_t &operator>>=(const T1 &rhs) { + *this = value() >> rhs; + return *this; + } + + constexpr se_t &operator++() { + T value = *this; + *this = ++value; + return *this; + } + + constexpr se_t &operator--() { + T value = *this; + *this = --value; + return *this; + } + + constexpr T operator++(int) { + T value = *this; + T result = value++; + *this = value; + return result; + } + + constexpr T operator--(int) { + T value = *this; + T result = value--; + *this = value; + return result; + } +}; + +// Specializations + +template +struct std::common_type, se_t> + : std::common_type {}; + +template +struct std::common_type, T2> + : std::common_type> {}; + +template +struct std::common_type> + : std::common_type, T2> {}; + +#define UNUSED(expr) \ + do { \ + (void)(expr); \ + } while (0) diff --git a/rx/include/rx/v128.hpp b/rx/include/rx/v128.hpp new file mode 100644 index 000000000..3e477a1cc --- /dev/null +++ b/rx/include/rx/v128.hpp @@ -0,0 +1,187 @@ +#pragma once // No BOM and only basic ASCII in this header, or a neko will die + +#include "types.hpp" + +namespace rx { +template +concept Vector128 = (sizeof(T) == 16) && (std::is_trivial_v); + +// 128-bit vector type +union alignas(16) v128 { + using enable_bitcopy = std::true_type; // FIXME: remove + + uchar _bytes[16]; + char _chars[16]; + + template + struct masked_array_t // array type accessed as (index ^ M) + { + T data[N]; + + T &operator[](usz index) { return data[index ^ M]; } + const T &operator[](usz index) const { return data[index ^ M]; } + }; + + template + using normal_array_t = + masked_array_t; + template + using reversed_array_t = + masked_array_t; + + normal_array_t _u64; + normal_array_t _s64; + reversed_array_t u64r; + reversed_array_t s64r; + + normal_array_t _u32; + normal_array_t _s32; + reversed_array_t u32r; + reversed_array_t s32r; + + normal_array_t _u16; + normal_array_t _s16; + reversed_array_t u16r; + reversed_array_t s16r; + + normal_array_t _u8; + normal_array_t _s8; + reversed_array_t u8r; + reversed_array_t s8r; + + normal_array_t _f; + normal_array_t _d; + reversed_array_t fr; + reversed_array_t dr; + + u128 _u; + s128 _s; + + v128() = default; + + constexpr v128(const v128 &) noexcept = default; + + template + constexpr v128(const T &rhs) noexcept : v128(std::bit_cast(rhs)) {} + + constexpr v128 &operator=(const v128 &) noexcept = default; + + template constexpr operator T() const noexcept { + return std::bit_cast(*this); + } + + static v128 from64(u64 _0, u64 _1 = 0) { + v128 ret; + ret._u64[0] = _0; + ret._u64[1] = _1; + return ret; + } + + static v128 from64r(u64 _1, u64 _0 = 0) { return from64(_0, _1); } + + static v128 from64p(u64 value) { + v128 ret; + ret._u64[0] = value; + ret._u64[1] = value; + return ret; + } + + static v128 from32(u32 _0, u32 _1 = 0, u32 _2 = 0, u32 _3 = 0) { + v128 ret; + ret._u32[0] = _0; + ret._u32[1] = _1; + ret._u32[2] = _2; + ret._u32[3] = _3; + return ret; + } + + static v128 from32r(u32 _3, u32 _2 = 0, u32 _1 = 0, u32 _0 = 0) { + return from32(_0, _1, _2, _3); + } + + static v128 from32p(u32 value) { + v128 ret; + ret._u32[0] = value; + ret._u32[1] = value; + ret._u32[2] = value; + ret._u32[3] = value; + return ret; + } + + static v128 fromf32p(f32 value) { + v128 ret; + ret._f[0] = value; + ret._f[1] = 
value; + ret._f[2] = value; + ret._f[3] = value; + return ret; + } + + static v128 from16p(u16 value) { + v128 ret; + ret._u16[0] = value; + ret._u16[1] = value; + ret._u16[2] = value; + ret._u16[3] = value; + ret._u16[4] = value; + ret._u16[5] = value; + ret._u16[6] = value; + ret._u16[7] = value; + return ret; + } + + static v128 from8p(u8 value) { + v128 ret; + std::memset(&ret, value, sizeof(ret)); + return ret; + } + + static v128 undef() { +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#elif _MSC_VER +#pragma warning(push) +#pragma warning(disable : 6001) +#endif + v128 ret; + return ret; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#elif _MSC_VER +#pragma warning(pop) +#endif + } + + // Unaligned load with optional index offset + static v128 loadu(const void *ptr, usz index = 0) { + v128 ret; + std::memcpy(&ret, static_cast(ptr) + index * sizeof(v128), + sizeof(v128)); + return ret; + } + + // Unaligned store with optional index offset + static void storeu(v128 value, void *ptr, usz index = 0) { + std::memcpy(static_cast(ptr) + index * sizeof(v128), &value, + sizeof(v128)); + } + + v128 operator|(const v128 &) const; + v128 operator&(const v128 &) const; + v128 operator^(const v128 &) const; + v128 operator~() const; + + bool operator==(const v128 &right) const; + + void clear() { *this = {}; } +}; +} // namespace rx + +template <> struct std::hash { + usz operator()(const rx::v128 &key) const { + return key._u64[0] ^ (key._u64[1] << 1); + } +}; diff --git a/rx/src/debug.cpp b/rx/src/debug.cpp index c8ed6855c..d8a2d9d32 100644 --- a/rx/src/debug.cpp +++ b/rx/src/debug.cpp @@ -5,12 +5,47 @@ #include #include -#ifdef __GNUC__ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#ifdef _WIN32 +#include +#else + +#ifdef __linux__ #include #include +#endif #include +#endif + bool rx::isDebuggerPresent() { +#ifdef _WIN32 + return ::IsDebuggerPresent(); +#elif defined(__APPLE__) || defined(__DragonFly__) || defined(__FreeBSD__) || \ + defined(__NetBSD__) || defined(__OpenBSD__) + int mib[] = { + CTL_KERN, + KERN_PROC, + KERN_PROC_PID, + getpid(), +#if defined(__NetBSD__) || defined(__OpenBSD__) + sizeof(struct kinfo_proc), + 1, +#endif + }; + u_int miblen = std::size(mib); + struct kinfo_proc info; + usz size = sizeof(info); + + if (sysctl(mib, miblen, &info, &size, NULL, 0)) { + return false; + } + + return info.KP_FLAGS & P_TRACED; +#elif defined(__linux__) std::ifstream in("/proc/self/status"); std::string line; while (std::getline(in, line)) { @@ -30,6 +65,7 @@ bool rx::isDebuggerPresent() { } return false; +#endif } void rx::waitForDebugger() { @@ -49,6 +85,7 @@ void rx::waitForDebugger() { } void rx::runDebugger() { +#ifdef __linux__ int pid = ::getpid(); char path[PATH_MAX]; ::readlink("/proc/self/exe", path, sizeof(path)); @@ -78,19 +115,20 @@ void rx::runDebugger() { argv.push_back(nullptr); execv(gdbPath, (char **)argv.data()); -} - -#else -bool rx::isDebuggerPresent() { return false; } -void rx::waitForDebugger() {} -void rx::runDebugger() {} #endif +} void rx::breakpoint() { #if __has_builtin(__builtin_debugtrap) __builtin_debugtrap(); -#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) +#elif defined(__GNUC__) +#if defined(__i386__) || defined(__x86_64__) __asm__ volatile("int3"); +#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) + __asm__ volatile("brk 0x42"); +#endif +#elif defined(_M_X64) + __debugbreak(); #endif } diff --git 
a/rx/src/mem.cpp b/rx/src/mem.cpp
index 69afd161b..c3e225fa7 100644
--- a/rx/src/mem.cpp
+++ b/rx/src/mem.cpp
@@ -1,4 +1,7 @@
 #include "mem.hpp"
+
+#ifdef __linux__
+
 #include
 #include
 #include
@@ -44,3 +47,4 @@ void rx::mem::printStats() {
   free(line);
   fclose(maps);
 }
+#endif
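// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the patch): how the checked helpers
// from rx/include/rx/types.hpp above are typically combined. narrow<> traps
// lossy integer conversions, size32() returns a u32 element count, and at32()
// performs a bounds-checked element access. Function and variable names here
// are hypothetical.
#include <algorithm>
#include <vector>

static u32 sum_first_elements(const std::vector<u64> &values, usz count) {
  // narrow<u32> calls fmt::raw_verify_error if the value does not fit.
  const u32 wanted = narrow<u32>(count);
  const u32 limit = std::min(wanted, ::size32(values));

  u32 sum = 0;
  for (u32 i = 0; i < limit; i++) {
    // at32 calls fmt::raw_range_error on an out-of-range index.
    sum += narrow<u32>(::at32(values, i));
  }

  return sum;
}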
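// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the patch): the _u16/_u32/_u64 string
// literals above turn short ASCII tags into integers with the same bit pattern
// a reinterpret_cast of the bytes would give, which pairs naturally with
// read_from_ptr<> for FourCC-style magic checks. The tag and function names
// are hypothetical.
constexpr u32 k_riff_tag = "RIFF"_u32;

static bool is_riff_header(const void *data) {
  // read_from_ptr<u32> copies 4 bytes out of the buffer (see types.hpp above).
  return read_from_ptr<u32>(static_cast<const uchar *>(data)) == k_riff_tag;
}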
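// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the patch): reading and writing a
// big-endian guest value through se_t<>. This assumes the template parameter
// order se_t<T, Swap>; the be_sketch alias is local to this example, and the
// real project may expose a different spelling (e.g. a be_t alias).
#include <bit>
#include <cstring>

template <typename T>
using be_sketch = se_t<T, std::endian::native == std::endian::little>;

static u32 read_guest_counter(const void *guest_mem) {
  be_sketch<u32> counter;
  std::memcpy(&counter, guest_mem, sizeof(counter)); // raw big-endian bytes
  return counter.value() + 1; // value() swaps to host order only when needed
}

static void write_guest_counter(void *guest_mem, u32 host_value) {
  const be_sketch<u32> out = host_value; // stored byte-swapped on LE hosts
  std::memcpy(guest_mem, &out, sizeof(out));
}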
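// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the patch): rx::v128 from
// rx/include/rx/v128.hpp models a 128-bit value with unaligned load/store
// helpers indexed in units of sizeof(v128). The buffer names are hypothetical;
// the lane-wise XOR is written via the _u64 arrays so the sketch does not
// depend on the out-of-line operator^ definition.
static void xor_blocks(unsigned char *dst, const unsigned char *key,
                       usz block_count) {
  for (usz i = 0; i < block_count; i++) {
    const rx::v128 a = rx::v128::loadu(dst, i);
    const rx::v128 k = rx::v128::loadu(key, i);

    rx::v128 r = rx::v128::undef();
    r._u64[0] = a._u64[0] ^ k._u64[0];
    r._u64[1] = a._u64[1] ^ k._u64[1];

    rx::v128::storeu(r, dst, i);
  }
}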