#pragma once // No BOM and only basic ASCII in this header, or a neko will die #include "util/types.hpp" // 128-bit vector type union alignas(16) v128 { uchar _bytes[16]; char _chars[16]; template struct masked_array_t // array type accessed as (index ^ M) { T m_data[N]; public: T& operator[](usz index) { return m_data[index ^ M]; } const T& operator[](usz index) const { return m_data[index ^ M]; } }; template using normal_array_t = masked_array_t; template using reversed_array_t = masked_array_t; normal_array_t _u64; normal_array_t _s64; reversed_array_t u64r; reversed_array_t s64r; normal_array_t _u32; normal_array_t _s32; reversed_array_t u32r; reversed_array_t s32r; normal_array_t _u16; normal_array_t _s16; reversed_array_t u16r; reversed_array_t s16r; normal_array_t _u8; normal_array_t _s8; reversed_array_t u8r; reversed_array_t s8r; normal_array_t _f; normal_array_t _d; reversed_array_t fr; reversed_array_t dr; u128 _u; //s128 _s; #ifdef _MSC_VER template struct opaque_wrapper { u128 m_data; opaque_wrapper() = default; opaque_wrapper(const T& value) : m_data(std::bit_cast(value)) { } opaque_wrapper& operator=(const T& value) { m_data = std::bit_cast(value); return *this; } operator T() const { return std::bit_cast(m_data); } }; opaque_wrapper<__m128> vf; opaque_wrapper<__m128i> vi; opaque_wrapper<__m128d> vd; #else __m128 vf; __m128i vi; __m128d vd; #endif static v128 from64(u64 _0, u64 _1 = 0) { v128 ret; ret._u64[0] = _0; ret._u64[1] = _1; return ret; } static v128 from64r(u64 _1, u64 _0 = 0) { return from64(_0, _1); } static v128 from32(u32 _0, u32 _1 = 0, u32 _2 = 0, u32 _3 = 0) { v128 ret; ret._u32[0] = _0; ret._u32[1] = _1; ret._u32[2] = _2; ret._u32[3] = _3; return ret; } static v128 from32r(u32 _3, u32 _2 = 0, u32 _1 = 0, u32 _0 = 0) { return from32(_0, _1, _2, _3); } static v128 from32p(u32 value) { v128 ret; ret._u32[0] = value; ret._u32[1] = value; ret._u32[2] = value; ret._u32[3] = value; return ret; } static v128 from16p(u16 value) { v128 ret; ret._u16[0] = value; ret._u16[1] = value; ret._u16[2] = value; ret._u16[3] = value; ret._u16[4] = value; ret._u16[5] = value; ret._u16[6] = value; ret._u16[7] = value; return ret; } static v128 from8p(u8 value) { v128 ret; std::memset(&ret, value, sizeof(ret)); return ret; } static inline v128 fromV(const __m128i& value); static inline v128 fromF(const __m128& value); static inline v128 fromD(const __m128d& value); // Unaligned load with optional index offset static v128 loadu(const void* ptr, usz index = 0) { v128 ret; std::memcpy(&ret, static_cast(ptr) + index * sizeof(v128), sizeof(v128)); return ret; } // Unaligned store with optional index offset static void storeu(v128 value, void* ptr, usz index = 0) { std::memcpy(static_cast(ptr) + index * sizeof(v128), &value, sizeof(v128)); } static inline v128 add8(const v128& left, const v128& right); static inline v128 add16(const v128& left, const v128& right); static inline v128 add32(const v128& left, const v128& right); static inline v128 addfs(const v128& left, const v128& right); static inline v128 addfd(const v128& left, const v128& right); static inline v128 sub8(const v128& left, const v128& right); static inline v128 sub16(const v128& left, const v128& right); static inline v128 sub32(const v128& left, const v128& right); static inline v128 subfs(const v128& left, const v128& right); static inline v128 subfd(const v128& left, const v128& right); static inline v128 maxu8(const v128& left, const v128& right); static inline v128 minu8(const v128& left, const v128& right); static inline v128 eq8(const v128& left, const v128& right); static inline v128 eq16(const v128& left, const v128& right); static inline v128 eq32(const v128& left, const v128& right); static inline v128 eq32f(const v128& left, const v128& right); static inline v128 fma32f(v128 a, const v128& b, const v128& c); bool operator==(const v128& right) const; bool operator!=(const v128& right) const; // result = (~left) & (right) static inline v128 andnot(const v128& left, const v128& right); void clear() { *this = {}; } }; template struct offset32_array> { template static inline u32 index32(const Arg& arg) { return u32{sizeof(T)} * (static_cast(arg) ^ static_cast(M)); } };