xenia/src/xenia/gpu/xenos.h
Triang3l a06be03f1b [GPU] Cleanup definitions of some registers
VS/PS_NUM_REG is 6-bit on Adreno 200, and games aren't seen using the
bit 7 to indicate that no GPRs are used. It's not clear why Freedreno
configures it this way.

Some texture fetch fields were deprecated or moved during the development
of the Xenos, reflect that in the comments.

Add definitions of the registers configuring the conversion of vertex
positions to fixed-point. Although there isn't much that can be done with
it when emulating using PC GPU APIs, there are some places in Xenia that
wrongly (though sometimes deliberately, for results closer to the behavior
of the host GPU) assume that the conversion works like in Direct3D 10+,
however the Xenos supports only up to 4 subpixel bits rather than 8. The
effects of this difference are largely negligible, though.

Also add more detailed info about register references and differences from
other ATI/AMD GPUs for potential future contributors.
2025-08-06 13:21:19 +03:00

1671 lines
65 KiB
C++

/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2020 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#ifndef XENIA_GPU_XENOS_H_
#define XENIA_GPU_XENOS_H_
#include <algorithm>
#include "xenia/base/assert.h"
#include "xenia/base/byte_order.h"
#include "xenia/base/math.h"
#include "xenia/base/memory.h"
#include "xenia/base/platform.h"
namespace xe {
namespace gpu {
namespace xenos {
// enum types used in the GPU registers or the microcode must be : uint32_t or
// : int32_t, as Visual C++ restarts bit field packing when a field requires
// different alignment than the previous one, so only 32-bit types must be used
// in bit fields (registers are 32-bit, and the microcode consists of triples of
// 32-bit words).
constexpr fourcc_t kSwapSignature = make_fourcc("SWAP");
enum class ShaderType : uint32_t {
kVertex = 0,
kPixel = 1,
};
// Only the lower 24 bits of the vertex index are used (tested on an Adreno 200
// phone using a GL_UNSIGNED_INT element array buffer with junk in the upper 8
// bits that had no effect on drawing).
constexpr uint32_t kVertexIndexBits = 24;
constexpr uint32_t kVertexIndexMask = (uint32_t(1) << kVertexIndexBits) - 1;
enum class PrimitiveType : uint32_t {
kNone = 0x00,
kPointList = 0x01,
kLineList = 0x02,
kLineStrip = 0x03,
kTriangleList = 0x04,
kTriangleFan = 0x05,
kTriangleStrip = 0x06,
kTriangleWithWFlags = 0x07,
kRectangleList = 0x08,
kLineLoop = 0x0C,
kQuadList = 0x0D,
kQuadStrip = 0x0E,
kPolygon = 0x0F,
// Starting with this primitive type, explicit major mode is assumed (in the
// R6xx/R7xx registers, k2DCopyRectListV0 is 22, and implicit major mode is
// only used for primitive types 0 through 21) - and tessellation patches use
// the range that starts from k2DCopyRectListV0.
// TODO(Triang3l): Verify if this is also true for the Xenos.
kExplicitMajorModeForceStart = 0x10,
k2DCopyRectListV0 = 0x10,
k2DCopyRectListV1 = 0x11,
k2DCopyRectListV2 = 0x12,
k2DCopyRectListV3 = 0x13,
k2DFillRectList = 0x14,
k2DLineStrip = 0x15,
k2DTriStrip = 0x16,
// Tessellation patches when VGT_OUTPUT_PATH_CNTL::path_select is
// VGTOutputPath::kTessellationEnable. The vertex shader receives the patch
// index rather than control point indices.
// With non-adaptive tessellation, VGT_DRAW_INITIATOR::num_indices is the
// patch count (4D5307F1 draws single ground patches by passing 1 as the index
// count). VGT_INDX_OFFSET is also applied to the patch index - 4D5307F1 uses
// auto-indexed patches with a nonzero VGT_INDX_OFFSET, which contains the
// base patch index there.
// With adaptive tessellation, however, num_indices is the number of
// tessellation factors in the "index buffer" reused for tessellation factors,
// which is the patch count multiplied by the edge count (if num_indices is
// multiplied further by 4 for quad patches for the ground in 4D5307F2, for
// example, some incorrect patches are drawn, so Xenia shouldn't do that; also
// 4D5307E6 draws water triangle patches with the number of indices that is 3
// times the invocation count of the memexporting shader that calculates the
// tessellation factors for a single patch for each "point").
kLinePatch = 0x10,
kTrianglePatch = 0x11,
kQuadPatch = 0x12,
};
// For the texture fetch constant (not the tfetch instruction), stacked stored
// as 2D.
enum class DataDimension : uint32_t {
k1D = 0,
k2DOrStacked = 1,
k3D = 2,
kCube = 3,
};
enum class ClampMode : uint32_t {
kRepeat = 0,
kMirroredRepeat = 1,
kClampToEdge = 2,
kMirrorClampToEdge = 3,
kClampToHalfway = 4,
kMirrorClampToHalfway = 5,
kClampToBorder = 6,
kMirrorClampToBorder = 7,
};
constexpr bool ClampModeUsesBorder(ClampMode clamp_mode) {
return clamp_mode == ClampMode::kClampToBorder ||
clamp_mode == ClampMode::kMirrorClampToBorder;
}
// TEX_FORMAT_COMP, known as GPUSIGN on the Xbox 360.
enum class TextureSign : uint32_t {
kUnsigned = 0,
// Two's complement texture data.
kSigned = 1,
// 2*color-1 - https://xboxforums.create.msdn.com/forums/t/107374.aspx
kUnsignedBiased = 2,
// Linearized when sampled.
kGamma = 3,
};
enum class TextureFilter : uint32_t {
kPoint = 0,
kLinear = 1,
// Only applicable to the mip filter - like OpenGL minification filters
// GL_NEAREST / GL_LINEAR without MIPMAP_NEAREST / MIPMAP_LINEAR.
kBaseMap = 2,
kUseFetchConst = 3,
};
enum class AnisoFilter : uint32_t {
kDisabled = 0,
kMax_1_1 = 1,
kMax_2_1 = 2,
kMax_4_1 = 3,
kMax_8_1 = 4,
kMax_16_1 = 5,
kUseFetchConst = 7,
};
enum class BorderColor : uint32_t {
// (0.0, 0.0, 0.0)
// TODO(Triang3l): Is the alpha 0 or 1?
k_ABGR_Black = 0,
// (1.0, 1.0, 1.0, 1.0)
k_ABGR_White = 1,
// Unknown precisely, but likely (0.5, 0.0, 0.5) for unsigned (Cr, Y, Cb)
// TODO(Triang3l): Real hardware border color, and is the alpha 0 or 1?
k_ACBYCR_Black = 2,
// Unknown precisely, but likely (0.0, 0.5, 0.5) for unsigned (Y, Cr, Cb)
// TODO(Triang3l): Real hardware border color, and is the alpha 0 or 1?
k_ACBCRY_Black = 3,
};
// For the tfetch instruction (not the fetch constant) and related instructions,
// stacked accessed using tfetch3D.
enum class FetchOpDimension : uint32_t {
k1D = 0,
k2D = 1,
k3DOrStacked = 2,
kCube = 3,
};
inline int GetFetchOpDimensionComponentCount(FetchOpDimension dimension) {
switch (dimension) {
case FetchOpDimension::k1D:
return 1;
case FetchOpDimension::k2D:
return 2;
case FetchOpDimension::k3DOrStacked:
case FetchOpDimension::kCube:
return 3;
default:
assert_unhandled_case(dimension);
return 1;
}
}
enum class SampleLocation : uint32_t {
kCentroid = 0,
kCenter = 1,
};
enum class Endian : uint32_t {
kNone = 0,
k8in16 = 1,
k8in32 = 2,
k16in32 = 3,
};
enum class Endian128 : uint32_t {
kNone = 0,
k8in16 = 1,
k8in32 = 2,
k16in32 = 3,
k8in64 = 4,
k8in128 = 5,
};
enum class IndexFormat : uint32_t {
kInt16,
// Not very common, but used for some world draws in 545407E0.
kInt32,
};
// SurfaceNumberX from yamato_enum.h.
enum class SurfaceNumberFormat : uint32_t {
kUnsignedRepeatingFraction = 0,
// Microsoft-style, scale factor (2^(n-1))-1.
kSignedRepeatingFraction = 1,
kUnsignedInteger = 2,
kSignedInteger = 3,
kFloat = 7,
};
// The EDRAM is an opaque block of memory accessible by the RB (render backend)
// pipeline stage of the GPU, which performs output-merger functionality (color
// render target writing and blending, depth and stencil testing) and resolve
// (copy) operations.
//
// Data in the 10 MiB of EDRAM is laid out as 2048 tiles on 80x16 32bpp MSAA
// samples. With 2x MSAA, one pixel consists of 1x2 samples, and with 4x, it
// consists of 2x2 samples. Thus, for a 32bpp render target, one tile contains
// 80x16 pixels without MSAA, samples of 80x8 pixels with 2x MSAA, or samples of
// 40x8 pixels with 4x MSAA. The base is specified in tiles, the pitch is also
// treated as tiles (so a 256x single-sampled surface will be stored in the
// EDRAM as 320x).
//
// XGSurfaceSize code in game executables calculates the size in tiles in the
// following order:
// 1) If MSAA is >=2x, multiply the height by 2.
// 2) If MSAA is 4x, multiply the width by 2.
// 3) 80x16-align width and height in samples.
// 4) Multiply width*height in samples by 4 or 8 depending on the pixel format.
// 5) Divide the byte size by 5120.
// This means that when working with layout of surfaces in the EDRAM, it should
// be assumed that a multisampled surface is the same as a single-sampled
// surface with 2x height and (with 4x MSAA) width - however, format size
// doesn't effect the dimensions, 64bpp surfaces take twice as many tiles as
// 32bpp surfaces.
//
// From this, it follows that the tile row pitch in tiles can be multiplied by
// 64bpp too. In the formula for calculating the tile count:
// (height rounded up to 16) * (width rounded up to 80) * (4 or 8) / 5120
// the fraction can be reduced because the numerator is always divisible by
// 5120 - it changes in 80 * 16 * 4 = 5120 increments - in tile increments -
// resulting in:
// (height in tiles) * (width in tiles) * (1 or 2)
// Here we get only multiplication, which (disregarding the variable size) is
// associative for integers, so:
// ((height in tiles) * (width in tiles)) * (1 or 2)
// is identical to:
// (height in tiles) * ((width in tiles) * (1 or 2))
//
// Depth surfaces are also stored as 32bpp tiles, however, as opposed to color
// surfaces, 40x16-sample halves of each tile are swapped - game shaders (for
// example, in 4D5307E6 main menu, 545407F2) perform this swapping when writing
// specific depth/stencil values by drawing to a depth buffer's memory through a
// color render target (to reupload a depth/stencil surface previously evicted
// from the EDRAM to the main memory, for instance).
//
// EDRAM addressing is circular - a render target may be backed by a EDRAM range
// that extends beyond 2048 tiles, in which case, what would go to the tile 2048
// will actually be in tile 0, tile 2049 will go to tile 1, and so on. 4D5307F1
// heavily relies on this behavior for its depth buffer. Specifically, it's used
// the following way:
// - First, a depth-only 1120x720 2xMSAA pass is performed with the depth buffer
// in tiles [1008, 2268), or [1008, 2048) and [0, 220).
// - Then, the depth buffer in [1008, 2268) is resolved into a texture, later
// used in screen-space effects.
// - The upper 1120x576 bin is drawn into the color buffer in [0, 1008), using
// the [1008, 2016) portion of the previously populated depth buffer for early
// depth testing (there seems to be no true early Z on the Xenos, only early
// hi-Z, but still it possibly needs to be in sync with the per-sample depth
// buffer), and overwriting the tail of the previously filled depth buffer in
// [0, 220).
// - The lower 1120x144 bin is drawn without the pregenerated depth buffer data.
enum class MsaaSamples : uint32_t {
k1X = 0,
k2X = 1,
k4X = 2,
};
constexpr uint32_t kMsaaSamplesBits = 2;
constexpr uint32_t kColorRenderTargetIndexBits = 2;
constexpr uint32_t kMaxColorRenderTargets = 4;
enum class ColorRenderTargetFormat : uint32_t {
k_8_8_8_8 = 0,
k_8_8_8_8_GAMMA = 1,
k_2_10_10_10 = 2,
// 7e3 [0, 32) RGB, unorm alpha.
// http://fileadmin.cs.lth.se/cs/Personal/Michael_Doggett/talks/eg05-xenos-doggett.pdf
k_2_10_10_10_FLOAT = 3,
// Fixed point -32...32.
// http://www.students.science.uu.nl/~3220516/advancedgraphics/papers/inferred_lighting.pdf
k_16_16 = 4,
// Fixed point -32...32.
k_16_16_16_16 = 5,
k_16_16_FLOAT = 6,
k_16_16_16_16_FLOAT = 7,
k_2_10_10_10_AS_10_10_10_10 = 10,
// 16-bit fixed point at half speed, with full blending.
// http://fileadmin.cs.lth.se/cs/Personal/Michael_Doggett/talks/unc-xenos-doggett.pdf
k_2_10_10_10_FLOAT_AS_16_16_16_16 = 12,
k_32_FLOAT = 14,
k_32_32_FLOAT = 15,
};
const char* GetColorRenderTargetFormatName(ColorRenderTargetFormat format);
constexpr bool IsColorRenderTargetFormat64bpp(ColorRenderTargetFormat format) {
return format == ColorRenderTargetFormat::k_16_16_16_16 ||
format == ColorRenderTargetFormat::k_16_16_16_16_FLOAT ||
format == ColorRenderTargetFormat::k_32_32_FLOAT;
}
inline uint32_t GetColorRenderTargetFormatComponentCount(
ColorRenderTargetFormat format) {
switch (format) {
case ColorRenderTargetFormat::k_8_8_8_8:
case ColorRenderTargetFormat::k_8_8_8_8_GAMMA:
case ColorRenderTargetFormat::k_2_10_10_10:
case ColorRenderTargetFormat::k_2_10_10_10_FLOAT:
case ColorRenderTargetFormat::k_16_16_16_16:
case ColorRenderTargetFormat::k_16_16_16_16_FLOAT:
case ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10:
case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16:
return 4;
case ColorRenderTargetFormat::k_16_16:
case ColorRenderTargetFormat::k_16_16_FLOAT:
case ColorRenderTargetFormat::k_32_32_FLOAT:
return 2;
case ColorRenderTargetFormat::k_32_FLOAT:
return 1;
default:
assert_unhandled_case(format);
return 0;
}
}
// Returns the version of the format with the same packing and meaning of values
// stored in it, but without blending precision modifiers.
constexpr ColorRenderTargetFormat GetStorageColorFormat(
ColorRenderTargetFormat format) {
switch (format) {
case ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10:
return ColorRenderTargetFormat::k_2_10_10_10;
case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16:
return ColorRenderTargetFormat::k_2_10_10_10_FLOAT;
default:
return format;
}
}
enum class DepthRenderTargetFormat : uint32_t {
kD24S8 = 0,
// 20e4 [0, 2).
kD24FS8 = 1,
};
const char* GetDepthRenderTargetFormatName(DepthRenderTargetFormat format);
float PWLGammaToLinear(float gamma);
float LinearToPWLGamma(float linear);
// Converts Xenos floating-point 7e3 color value in bits 0:9 (not clamping) to
// an IEEE-754 32-bit floating-point number.
float Float7e3To32(uint32_t f10);
// Converts 24-bit unorm depth in the value (not clamping) to an IEEE-754 32-bit
// floating-point number.
// Converts an IEEE-754 32-bit floating-point number to Xenos floating-point
// depth, rounding to the nearest even or towards zero.
uint32_t Float32To20e4(float f32, bool round_to_nearest_even);
// Converts Xenos floating-point depth in bits 0:23 (not clamping) to an
// IEEE-754 32-bit floating-point number.
float Float20e4To32(uint32_t f24);
// Converts 24-bit unorm depth in the value (not clamping) to an IEEE-754 32-bit
// floating-point number.
constexpr float UNorm24To32(uint32_t n24) {
// Not 1.0f / 16777215.0f as that gives an incorrect result (like for a very
// common 0xC00000 which clears 2_10_10_10 to 0001). Division by 2^24 is just
// an exponent shift though, thus exact.
// Division by 16777215.0f behaves this way.
return float(n24 + (n24 >> 23)) * (1.0f / float(1 << 24));
}
// Scale for conversion of slope scales from PA_SU_POLY_OFFSET_FRONT/BACK_SCALE
// units to those used when the slope is computed from the difference between
// adjacent pixels, for conversion from the guest to common host APIs or to
// calculation using max(|ddx(z)|, |ddy(z)|).
// "slope computed in subpixels (1/12 or 1/16)" - R5xx Acceleration.
// But the correct scale for conversion of the slope scale from subpixels to
// pixels is likely 1/16 according to:
// https://github.com/mesa3d/mesa/blob/54ad9b444c8e73da498211870e785239ad3ff1aa/src/gallium/drivers/radeonsi/si_state.c#L946
constexpr float kPolygonOffsetScaleSubpixelUnit = 1.0f / 16.0f;
constexpr uint32_t kColorRenderTargetFormatBits = 4;
constexpr uint32_t kDepthRenderTargetFormatBits = 1;
constexpr uint32_t kRenderTargetFormatBits =
std::max(kColorRenderTargetFormatBits, kDepthRenderTargetFormatBits);
constexpr uint32_t kEdramTileWidthSamples = 80;
constexpr uint32_t kEdramTileHeightSamples = 16;
constexpr uint32_t kEdramTileCount = 2048;
constexpr uint32_t kEdramSizeBytes = kEdramTileCount * kEdramTileHeightSamples *
kEdramTileWidthSamples * sizeof(uint32_t);
// RB_SURFACE_INFO::surface_pitch width.
constexpr uint32_t kEdramPitchPixelsBits = 14;
// The part of RB_COLOR_INFO::color_base and RB_DEPTH_INFO::depth_base width
// usable on the Xenos, which has periodic 11-bit EDRAM tile addressing.
constexpr uint32_t kEdramBaseTilesBits = 11;
constexpr uint32_t GetSurfacePitchTiles(uint32_t pitch_pixels,
MsaaSamples msaa_samples,
bool is_64bpp) {
uint32_t pitch_samples = pitch_pixels
<< uint32_t(msaa_samples >= MsaaSamples::k4X);
uint32_t pitch_tiles =
(pitch_samples + (kEdramTileWidthSamples - 1)) / kEdramTileWidthSamples;
if (is_64bpp) {
pitch_tiles <<= 1;
}
return pitch_tiles;
}
// log2_ceil of the maximum value of GetSurfacePitchTiles, assuming 16383 being
// the maximum pitch in pixels (not sure about the validity of values above
// 8192, but to avoid bounds checking).
// log2_ceil of 16383, multiplied by 2 for 4x MSAA, rounded to 80 samples,
// multiplied by 2 for 64bpp.
constexpr uint32_t kEdramPitchTilesBits = 10;
constexpr uint32_t kFormatBits = 6;
// a2xx_sq_surfaceformat +
// https://github.com/indirivacua/RAGE-Console-Texture-Editor/blob/master/Console.Xbox360.Graphics.pas
enum class TextureFormat : uint32_t {
k_1_REVERSE = 0,
k_1 = 1,
k_8 = 2,
k_1_5_5_5 = 3,
k_5_6_5 = 4,
k_6_5_5 = 5,
k_8_8_8_8 = 6,
k_2_10_10_10 = 7,
// Possibly similar to k_8, but may be storing alpha instead of red when
// resolving/memexporting, though not exactly known. From the point of view of
// sampling, it should be treated the same as k_8 (given that textures have
// the last - and single-component textures have the only - component
// replicated into all the remaining ones before the swizzle).
// Used as:
// - Texture in 4B4E083C - text, starting from the "Loading..." and the "This
// game saves data automatically" messages. The swizzle in the fetch
// constant is 111W (suggesting that internally the only component may be
// the alpha one, not red).
// TODO(Triang3l): Investigate how k_8_A and k_8_B work in resolves and
// memexports, whether they store alpha/blue of the input or red.
k_8_A = 8,
k_8_B = 9,
k_8_8 = 10,
// Though it's unknown what exactly REP means, likely it's "repeating
// fraction" (the term used for normalized fixed-point formats, UNORM in
// particular for unsigned signedness - 0.0 to 1.0 range, like in
// Direct3D 10+, unlike the 0.0 to 255.0 range for D3DFMT_R8G8_B8G8 and
// D3DFMT_G8R8_G8B8 in Direct3D 9). 54540829 uses k_Y1_Cr_Y0_Cb_REP directly
// as UNORM.
k_Cr_Y1_Cb_Y0_REP = 11,
// Used for videos in 54540829.
k_Y1_Cr_Y0_Cb_REP = 12,
k_16_16_EDRAM = 13,
// Likely same as k_8_8_8_8.
// Used as:
// - Memexport destination in 4D5308BC - multiple small draws when looking
// back at the door behind the player in the first room of gameplay.
// - Memexport destination in 4D53085B and 4D530919 - in 4D53085B, in a frame
// between the intro video and the main menu, in a 8192-point draw.
k_8_8_8_8_A = 14,
k_4_4_4_4 = 15,
k_10_11_11 = 16,
k_11_11_10 = 17,
k_DXT1 = 18,
k_DXT2_3 = 19,
k_DXT4_5 = 20,
k_16_16_16_16_EDRAM = 21,
k_24_8 = 22,
k_24_8_FLOAT = 23,
k_16 = 24,
k_16_16 = 25,
k_16_16_16_16 = 26,
k_16_EXPAND = 27,
k_16_16_EXPAND = 28,
k_16_16_16_16_EXPAND = 29,
k_16_FLOAT = 30,
k_16_16_FLOAT = 31,
k_16_16_16_16_FLOAT = 32,
k_32 = 33,
k_32_32 = 34,
k_32_32_32_32 = 35,
k_32_FLOAT = 36,
k_32_32_FLOAT = 37,
k_32_32_32_32_FLOAT = 38,
k_32_AS_8 = 39,
k_32_AS_8_8 = 40,
k_16_MPEG = 41,
k_16_16_MPEG = 42,
k_8_INTERLACED = 43,
k_32_AS_8_INTERLACED = 44,
k_32_AS_8_8_INTERLACED = 45,
k_16_INTERLACED = 46,
k_16_MPEG_INTERLACED = 47,
k_16_16_MPEG_INTERLACED = 48,
k_DXN = 49,
k_8_8_8_8_AS_16_16_16_16 = 50,
k_DXT1_AS_16_16_16_16 = 51,
k_DXT2_3_AS_16_16_16_16 = 52,
k_DXT4_5_AS_16_16_16_16 = 53,
k_2_10_10_10_AS_16_16_16_16 = 54,
k_10_11_11_AS_16_16_16_16 = 55,
k_11_11_10_AS_16_16_16_16 = 56,
k_32_32_32_FLOAT = 57,
k_DXT3A = 58,
k_DXT5A = 59,
k_CTX1 = 60,
k_DXT3A_AS_1_1_1_1 = 61,
k_8_8_8_8_GAMMA_EDRAM = 62,
k_2_10_10_10_FLOAT_EDRAM = 63,
};
// Subset of a2xx_sq_surfaceformat - formats that RTs can be resolved to.
enum class ColorFormat : uint32_t {
k_8 = 2,
k_1_5_5_5 = 3,
k_5_6_5 = 4,
k_6_5_5 = 5,
k_8_8_8_8 = 6,
k_2_10_10_10 = 7,
k_8_A = 8,
k_8_B = 9,
k_8_8 = 10,
k_8_8_8_8_A = 14,
k_4_4_4_4 = 15,
k_10_11_11 = 16,
k_11_11_10 = 17,
k_16 = 24,
k_16_16 = 25,
k_16_16_16_16 = 26,
k_16_FLOAT = 30,
k_16_16_FLOAT = 31,
k_16_16_16_16_FLOAT = 32,
k_32_FLOAT = 36,
k_32_32_FLOAT = 37,
k_32_32_32_32_FLOAT = 38,
k_8_8_8_8_AS_16_16_16_16 = 50,
k_2_10_10_10_AS_16_16_16_16 = 54,
k_10_11_11_AS_16_16_16_16 = 55,
k_11_11_10_AS_16_16_16_16 = 56,
};
// Resolve writes unsigned data for fixed-point formats (so k_16_16 and
// k_16_16_16_16 render target formats, which are signed and also have a
// different range, are not equivalent to the respective texture formats).
constexpr bool IsColorResolveFormatBitwiseEquivalent(
ColorRenderTargetFormat render_target_format, ColorFormat color_format) {
switch (render_target_format) {
case ColorRenderTargetFormat::k_8_8_8_8:
// Shaders fetch data copied from k_8_8_8_8_GAMMA with TextureSign::kGamma.
case ColorRenderTargetFormat::k_8_8_8_8_GAMMA:
// TODO(Triang3l): Investigate k_8_8_8_8_A.
return color_format == ColorFormat::k_8_8_8_8 ||
color_format == ColorFormat::k_8_8_8_8_A ||
color_format == ColorFormat::k_8_8_8_8_AS_16_16_16_16;
case ColorRenderTargetFormat::k_2_10_10_10:
case ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10:
return color_format == ColorFormat::k_2_10_10_10 ||
color_format == ColorFormat::k_2_10_10_10_AS_16_16_16_16;
case ColorRenderTargetFormat::k_16_16_FLOAT:
return color_format == ColorFormat::k_16_16_FLOAT;
case ColorRenderTargetFormat::k_16_16_16_16_FLOAT:
return color_format == ColorFormat::k_16_16_16_16_FLOAT;
case ColorRenderTargetFormat::k_32_FLOAT:
return color_format == ColorFormat::k_32_FLOAT;
case ColorRenderTargetFormat::k_32_32_FLOAT:
return color_format == ColorFormat::k_32_32_FLOAT;
default:
return false;
}
}
enum class VertexFormat : uint32_t {
kUndefined = 0,
k_8_8_8_8 = 6,
k_2_10_10_10 = 7,
k_10_11_11 = 16,
k_11_11_10 = 17,
k_16_16 = 25,
k_16_16_16_16 = 26,
k_16_16_FLOAT = 31,
k_16_16_16_16_FLOAT = 32,
k_32 = 33,
k_32_32 = 34,
k_32_32_32_32 = 35,
k_32_FLOAT = 36,
k_32_32_FLOAT = 37,
k_32_32_32_32_FLOAT = 38,
k_32_32_32_FLOAT = 57,
};
inline int GetVertexFormatComponentCount(VertexFormat format) {
switch (format) {
case VertexFormat::k_32:
case VertexFormat::k_32_FLOAT:
return 1;
case VertexFormat::k_16_16:
case VertexFormat::k_16_16_FLOAT:
case VertexFormat::k_32_32:
case VertexFormat::k_32_32_FLOAT:
return 2;
case VertexFormat::k_10_11_11:
case VertexFormat::k_11_11_10:
case VertexFormat::k_32_32_32_FLOAT:
return 3;
case VertexFormat::k_8_8_8_8:
case VertexFormat::k_2_10_10_10:
case VertexFormat::k_16_16_16_16:
case VertexFormat::k_16_16_16_16_FLOAT:
case VertexFormat::k_32_32_32_32:
case VertexFormat::k_32_32_32_32_FLOAT:
return 4;
default:
assert_unhandled_case(format);
return 0;
}
}
inline uint32_t GetVertexFormatNeededWords(VertexFormat format,
uint32_t used_components) {
assert_zero(used_components & ~uint32_t(0b1111));
if (!used_components) {
return 0;
}
switch (format) {
case VertexFormat::k_8_8_8_8:
case VertexFormat::k_2_10_10_10:
return 0b0001;
case VertexFormat::k_10_11_11:
case VertexFormat::k_11_11_10:
return (used_components & 0b0111) ? 0b0001 : 0b0000;
case VertexFormat::k_16_16:
case VertexFormat::k_16_16_FLOAT:
return (used_components & 0b0011) ? 0b0001 : 0b0000;
case VertexFormat::k_16_16_16_16:
case VertexFormat::k_16_16_16_16_FLOAT:
return ((used_components & 0b0011) ? 0b0001 : 0b0000) |
((used_components & 0b1100) ? 0b0010 : 0b0000);
case VertexFormat::k_32:
case VertexFormat::k_32_FLOAT:
return used_components & 0b0001;
case VertexFormat::k_32_32:
case VertexFormat::k_32_32_FLOAT:
return used_components & 0b0011;
case VertexFormat::k_32_32_32_32:
case VertexFormat::k_32_32_32_32_FLOAT:
return used_components;
case VertexFormat::k_32_32_32_FLOAT:
return used_components & 0b0111;
default:
assert_unhandled_case(format);
return 0b0000;
}
}
enum class CompareFunction : uint32_t {
kNever = 0b000,
kLess = 0b001,
kEqual = 0b010,
kLessEqual = 0b011,
kGreater = 0b100,
kNotEqual = 0b101,
kGreaterEqual = 0b110,
kAlways = 0b111,
};
enum class StencilOp : uint32_t {
kKeep = 0,
kZero = 1,
kReplace = 2,
kIncrementClamp = 3,
kDecrementClamp = 4,
kInvert = 5,
kIncrementWrap = 6,
kDecrementWrap = 7,
};
// adreno_rb_blend_factor
enum class BlendFactor : uint32_t {
kZero = 0,
kOne = 1,
kSrcColor = 4,
kOneMinusSrcColor = 5,
kSrcAlpha = 6,
kOneMinusSrcAlpha = 7,
kDstColor = 8,
kOneMinusDstColor = 9,
kDstAlpha = 10,
kOneMinusDstAlpha = 11,
kConstantColor = 12,
kOneMinusConstantColor = 13,
kConstantAlpha = 14,
kOneMinusConstantAlpha = 15,
kSrcAlphaSaturate = 16,
// SRC1 added on Adreno.
};
enum class BlendOp : uint32_t {
kAdd = 0,
kSubtract = 1,
kMin = 2,
kMax = 3,
kRevSubtract = 4,
};
typedef enum {
XE_GPU_INVALIDATE_MASK_VERTEX_SHADER = 1 << 8,
XE_GPU_INVALIDATE_MASK_PIXEL_SHADER = 1 << 9,
XE_GPU_INVALIDATE_MASK_ALL = 0x7FFF,
} XE_GPU_INVALIDATE_MASK;
// VGT_DRAW_INITIATOR::DI_SRC_SEL_*
enum class SourceSelect : uint32_t {
kDMA,
kImmediate,
kAutoIndex,
};
// VGT_DRAW_INITIATOR::DI_MAJOR_MODE_*
enum class MajorMode : uint32_t {
kImplicit,
kExplicit,
};
inline bool IsMajorModeExplicit(MajorMode major_mode,
PrimitiveType primitive_type) {
return major_mode != MajorMode::kImplicit ||
primitive_type >= PrimitiveType::kExplicitMajorModeForceStart;
}
enum class SignedRepeatingFractionMode : uint32_t {
// Microsoft-style representation with two -1 representations (one is slightly
// past -1 but clamped).
kZeroClampMinusOne,
// OpenGL "alternate mapping" format lacking representation for zero.
kNoZero,
};
// Arbitrary filter is still present in the Code Aurora Forum release of the
// Adreno 200 programming interface, but is deprecated according to the
// IPR2015-00325 R400 Document Library Folder History:
// "Change 124923 on 2003/10/03 by jhoule@jhoule_doc_lt
// [...]
// Deprecated the ARBITRARY_FILTER fields from TFetch instr+const."
enum class ArbitraryFilter : uint32_t {
k2x4Sym = 0,
k2x4Asym = 1,
k4x2Sym = 2,
k4x2Asym = 3,
k4x4Sym = 4,
k4x4Asym = 5,
kUseFetchConst = 7,
};
constexpr uint32_t kMaxShaderTempRegistersLog2 = 6;
constexpr uint32_t kMaxShaderTempRegisters = UINT32_C(1)
<< kMaxShaderTempRegistersLog2;
// a2xx_sq_ps_vtx_mode
enum class VertexShaderExportMode : uint32_t {
kPosition1Vector = 0,
kPosition2VectorsSprite = 2,
kPosition2VectorsEdge = 3,
kPosition2VectorsKill = 4,
kPosition2VectorsSpriteKill = 5,
kPosition2VectorsEdgeKill = 6,
// Vertex shader outputs are ignored (kill all primitives) - see
// SX_MISC::MULTIPASS on R6xx/R7xx.
kMultipass = 7,
};
constexpr uint32_t kMaxInterpolators = 16;
enum class SampleControl : uint32_t {
kCentroidsOnly = 0,
kCentersOnly = 1,
kCentroidsAndCenters = 2,
};
// - msaa_samples is RB_SURFACE_INFO::msaa_samples.
// - sample_control is SQ_CONTEXT_MISC::sc_sample_cntl.
// - interpolator_control_sampling_pattern is
// SQ_INTERPOLATOR_CNTL::sampling_pattern.
// Centroid interpolation can be tested in 5454082B. If the GPU host backend
// implements guest MSAA properly, using host MSAA, with everything interpolated
// at centers, the Monument Valley start screen background may have a few
// distinctly bright pixels on the mesas/buttes, where extrapolation happens.
// Interpolating certain values (ones that aren't used for gradient calculation,
// not texture coordinates) at centroids fixes this issue.
inline uint32_t GetInterpolatorSamplingPattern(
MsaaSamples msaa_samples, SampleControl sample_control,
uint32_t interpolator_control_sampling_pattern) {
if (msaa_samples == MsaaSamples::k1X ||
sample_control == SampleControl::kCentersOnly) {
return ((1 << kMaxInterpolators) - 1) * uint32_t(SampleLocation::kCenter);
}
if (sample_control == SampleControl::kCentroidsOnly) {
return ((1 << kMaxInterpolators) - 1) * uint32_t(SampleLocation::kCentroid);
}
assert_true(sample_control == SampleControl::kCentroidsAndCenters);
return interpolator_control_sampling_pattern;
}
enum class VGTOutputPath : uint32_t {
kVertexReuse = 0,
kTessellationEnable = 1,
kPassthru = 2,
};
enum class TessellationMode : uint32_t {
kDiscrete = 0,
kContinuous = 1,
kAdaptive = 2,
};
enum class PolygonModeEnable : uint32_t {
kDisabled = 0, // Render triangles.
kDualMode = 1, // Send 2 sets of 3 polygons with the specified polygon type.
// 4541096E uses 2 for triangles, which is "reserved" on R6xx and not defined
// on Adreno 2xx, but polymode_front/back_ptype are 0 (points) in this case in
// 4541096E, which should not be respected for non-kDualMode as the title
// wants to draw filled triangles.
};
enum class PolygonType : uint32_t {
kPoints = 0,
kLines = 1,
kTriangles = 2,
};
enum class PixelCenter : uint32_t {
// Pixel center at vertex positions .0, like in Direct3D 9.
// Commonly used in Xbox 360 games.
kD3DZero = 0,
// Pixel center at vertex positions .5, like in OpenGL.
// Used in 415607E6.
kOGLHalf = 1,
};
enum class VertexRounding : uint32_t {
kTruncate = 0, // OpenGL.
kRound = 1,
kRoundToEven = 2, // Direct3D. Common in Xbox 360 games.
kRoundToOdd = 3,
};
enum class VertexQuantization : uint32_t {
k_1_16th = 0,
k_1_8th = 1,
k_1_4th = 2,
k_1_2 = 3,
k_1 = 4,
// 1/256th was added in R600. On the Xbox 360, games normally use 1/16th.
};
enum class EdramMode : uint32_t {
kNoOperation = 0,
kColorDepth = 4,
// TODO(Triang3l): Verify whether kDepthOnly means the pixel shader is ignored
// completely even if it writes depth, exports to memory or kills pixels.
// Hints suggesting that it should be completely ignored (which is desirable
// on real hardware to avoid scheduling the pixel shader at all and waiting
// for it especially since the Xbox 360 doesn't have early per-sample depth /
// stencil, only early hi-Z / hi-stencil, and other registers possibly
// toggling pixel shader execution are yet to be found):
// - Most of depth pre-pass draws in 415607E6 use the kDepthOnly more with a
// `oC0 = tfetch2D(tf0, r0.xy) * r1` shader, some use `oC0 = r0` though.
// However, when alphatested surfaces are drawn, kColorDepth is explicitly
// used with the same shader performing the texture fetch.
// - 5454082B has some kDepthOnly draws with alphatest enabled, but the shader
// is `oC0 = r0`, which makes no sense (alphatest based on an interpolant
// from the vertex shader) as no texture alpha cutout is involved.
// - 5454082B also has kDepthOnly draws with pretty complex shaders clearly
// for use only in the color pass - even fetching and filtering a shadowmap.
// For now, based on these, let's assume the pixel shader is never used with
// kDepthOnly.
kDepthOnly = 5,
kCopy = 6,
};
// Xenos copies EDRAM contents to a tiled 2D or 3D texture (resolves - from
// "MSAA resolve", but this name is also used for single-sampled copying) by
// drawing primitives with the EDRAM mode EdramMode::kCopy. Pixels covered by
// the drawn geometry are copied. It's likely that only rectangular regions can
// be resolved.
//
// Resolve operation can write color data in ColorFormat formats, with or
// without MSAA color sample averaging, endian swap, red/blue swap, and exponent
// bias. Depth resolving likely has a lot more restrictions, considering sample
// averaging, red/blue swap and exponent bias would be pretty meaningless for it
// (also, Direct3D 9 specifies k_8_8_8_8 as RB_COPY_DEST_INFO::copy_dest_format
// for depth, which is clearly not true - the right format would be k_24_8 or
// k_24_8_FLOAT, so depth resolving likely doesn't support format conversion),
// though endian swap is supported.
//
// In addition, a resolve draw may clear the region it copies (this feature is
// commonly used when going to the next tile with predicated tiling). While one
// resolve draw call may copy just one color or depth buffer, it may clear both
// color and depth at once (or just color or depth, or nothing) if copying a
// color buffer (the color render target cleared is the same as the one copied -
// however, depth resolves have RB_COPY_CONTROL::copy_src_select 4, so they
// can't clear color).
//
// Direct3D 9 does resolving by drawing kRectangleList with 3 vertices with a
// vertex shader that accepts k_32_32_FLOAT vertices with k8in32 endianness in
// SHADER_CONSTANT_FETCH_00_0, with the half-pixel offset, according to the
// PA_SU_VTX_CNTL::pix_center setting, pre-applied to the vertices (for Direct3D
// 9 pixel centers, 0.5 must be added to the vertex positions to get the
// coordinates of the corners).
//
// The rectangle is used for both the source render target and the destination
// texture, according to how it's used in 4E4D07E9.
//
// Direct3D 9 gives the rectangle in source render target coordinates (for
// example, in 4D5307E6, the sniper rifle scope has a (128,64)->(448,256)
// rectangle). It doesn't adjust the EDRAM base pointer, otherwise (taking into
// account that 4x MSAA is used for the scope) it would have been
// (8,0)->(328,192), but it's not. However, it adjusts the destination texture
// address so (0,0) relative to the destination address is (0,0) relative to
// the render target (if resolving a part of a render target to the top-left
// corner of a texture, Direct3D 9 actually moves the destination pointer before
// the start of the texture, with tiled offset internally calculated for a
// negative offset). When copying, the pointer needs to be adjusted to the first
// 32x32 tile that will actually be modified, by adding the value of
// XGAddress2D/3DTiledOffset called for left/top & ~31.
//
// RB_COPY_DEST_PITCH's purpose appears to be not clamping or something like
// that, but just specifying pitch for going between rows, and height for going
// between 3D texture slices. copy_dest_pitch is rounded to 32 by Direct3D 9,
// copy_dest_height is not. In the 4D5307E6 sniper rifle scope example,
// copy_dest_pitch is 320, and copy_dest_height is 192 - the same as the resolve
// rectangle size (resolving from a 320x192 portion of the surface at 128,64 to
// the whole texture, at 0,0). Relative to RB_COPY_DEST_BASE, the height should
// have been 256, but it's not. Adreno doesn't have copy_dest_height at all (as
// well as RB_COPY_DEST_INFO::copy_dest_slice), suggesting (alongside the name
// of the register) that it exists purely to be able to go between 3D texture
// slices.
//
// Window scissor must also be applied - in the jigsaw puzzle in 58410955, there
// are 1280x720 resolve rectangles, but only the scissored 1280x256 needs to be
// copied, otherwise it overflows even beyond the EDRAM, and the depth buffer is
// visible on the screen. It also ensures the coordinates are not negative (in
// 565507D9, for example, the right tile is resolved with vertices
// (-640,0)->(640,720), however, the destination texture pointer is adjusted
// properly to the right half of the texture, and the source render target has a
// pitch of 800).
// Granularity of offset and size in resolve operations is 8x8 pixels
// (GPU_RESOLVE_ALIGNMENT - for example, 4D5307E6 resolves a 24x16 region for a
// 18x10 texture, 8x8 region for a 1x1 texture).
// https://github.com/jmfauvel/CSGO-SDK/blob/master/game/client/view.cpp#L944
// https://github.com/stanriders/hl2-asw-port/blob/master/src/game/client/vgui_int.cpp#L901
constexpr uint32_t kResolveAlignmentPixelsLog2 = 3;
constexpr uint32_t kResolveAlignmentPixels = 1 << kResolveAlignmentPixelsLog2;
// Same as RB_SURFACE_INFO::surface_pitch, RB_COPY_DEST_PITCH::copy_dest_pitch
// and RB_COPY_DEST_PITCH::copy_dest_height.
constexpr uint32_t kResolveSizeBits = 14;
constexpr uint32_t kMaxResolveSize =
(1 << kResolveSizeBits) - kResolveAlignmentPixels;
enum class CopyCommand : uint32_t {
kRaw = 0,
kConvert = 1,
kConstantOne = 2,
kNull = 3, // ?
};
// a2xx_rb_copy_sample_select
enum class CopySampleSelect : uint32_t {
k0,
k1,
k2,
k3,
k01,
k23,
k0123,
};
constexpr bool IsSingleCopySampleSelected(CopySampleSelect copy_sample_select) {
return copy_sample_select >= CopySampleSelect::k0 &&
copy_sample_select <= CopySampleSelect::k3;
}
#define XE_GPU_MAKE_TEXTURE_SWIZZLE(x, y, z, w) \
(((xe::gpu::xenos::XE_GPU_TEXTURE_SWIZZLE_##x) << 0) | \
((xe::gpu::xenos::XE_GPU_TEXTURE_SWIZZLE_##y) << 3) | \
((xe::gpu::xenos::XE_GPU_TEXTURE_SWIZZLE_##z) << 6) | \
((xe::gpu::xenos::XE_GPU_TEXTURE_SWIZZLE_##w) << 9))
typedef enum {
XE_GPU_TEXTURE_SWIZZLE_X = 0,
XE_GPU_TEXTURE_SWIZZLE_R = 0,
XE_GPU_TEXTURE_SWIZZLE_Y = 1,
XE_GPU_TEXTURE_SWIZZLE_G = 1,
XE_GPU_TEXTURE_SWIZZLE_Z = 2,
XE_GPU_TEXTURE_SWIZZLE_B = 2,
XE_GPU_TEXTURE_SWIZZLE_W = 3,
XE_GPU_TEXTURE_SWIZZLE_A = 3,
XE_GPU_TEXTURE_SWIZZLE_0 = 4,
XE_GPU_TEXTURE_SWIZZLE_1 = 5,
XE_GPU_TEXTURE_SWIZZLE_RRRR = XE_GPU_MAKE_TEXTURE_SWIZZLE(R, R, R, R),
XE_GPU_TEXTURE_SWIZZLE_RGGG = XE_GPU_MAKE_TEXTURE_SWIZZLE(R, G, G, G),
XE_GPU_TEXTURE_SWIZZLE_RGBB = XE_GPU_MAKE_TEXTURE_SWIZZLE(R, G, B, B),
XE_GPU_TEXTURE_SWIZZLE_RGBA = XE_GPU_MAKE_TEXTURE_SWIZZLE(R, G, B, A),
XE_GPU_TEXTURE_SWIZZLE_0000 = XE_GPU_MAKE_TEXTURE_SWIZZLE(0, 0, 0, 0),
} XE_GPU_TEXTURE_SWIZZLE;
inline uint16_t GpuSwap(uint16_t value, Endian endianness) {
switch (endianness) {
case Endian::kNone:
// No swap.
return value;
case Endian::k8in16:
// Swap bytes in half words.
return ((value << 8) & 0xFF00FF00) | ((value >> 8) & 0x00FF00FF);
default:
assert_unhandled_case(endianness);
return value;
}
}
inline uint32_t GpuSwap(uint32_t value, Endian endianness) {
switch (endianness) {
default:
case Endian::kNone:
// No swap.
return value;
case Endian::k8in16:
// Swap bytes in half words.
return ((value << 8) & 0xFF00FF00) | ((value >> 8) & 0x00FF00FF);
case Endian::k8in32:
// Swap bytes.
// NOTE: we are likely doing two swaps here. Wasteful. Oh well.
return xe::byte_swap(value);
case Endian::k16in32:
// Swap half words.
return ((value >> 16) & 0xFFFF) | (value << 16);
}
}
inline float GpuSwap(float value, Endian endianness) {
union {
uint32_t i;
float f;
} v;
v.f = value;
v.i = GpuSwap(v.i, endianness);
return v.f;
}
inline uint32_t GpuToCpu(uint32_t p) { return p; }
inline uint32_t CpuToGpu(uint32_t p) { return p & 0x1FFFFFFF; }
// XE_GPU_REG_SHADER_CONSTANT_LOOP_*
union alignas(uint32_t) LoopConstant {
uint32_t value;
struct {
uint32_t count : 8; // +0
// Address (aL) start and step.
// The resulting aL is `iterator * step + start`, 10-bit, and has the real
// range of [-256, 256], according to the IPR2015-00325 sequencer
// specification.
uint32_t start : 8; // +8
int32_t step : 8; // +16
uint32_t _pad_24 : 8; // +24
};
};
static_assert_size(LoopConstant, sizeof(uint32_t));
// SQ_TEX_VTX_INVALID/VALID_TEXTURE/BUFFER
enum class FetchConstantType : uint32_t {
kInvalidTexture,
kInvalidVertex,
kTexture,
kVertex,
};
constexpr uint32_t kTextureFetchConstantCount = 32;
constexpr uint32_t kVertexFetchConstantCount = 3 * kTextureFetchConstantCount;
// XE_GPU_REG_SHADER_CONSTANT_FETCH_*
union alignas(uint32_t) xe_gpu_vertex_fetch_t {
struct {
uint32_t dword_0;
uint32_t dword_1;
};
struct {
FetchConstantType type : 2; // +0
uint32_t address : 30; // +2 address in dwords
Endian endian : 2; // +0
uint32_t size : 24; // +2 size in words
uint32_t _pad_1_26 : 6; // +26
};
};
static_assert_size(xe_gpu_vertex_fetch_t, sizeof(uint32_t) * 2);
// Byte alignment of texture subresources in memory - of each mip and stack
// slice / cube face (and of textures themselves), this number of bits is also
// omitted from base_address and mip_address.
constexpr uint32_t kTextureSubresourceAlignmentBytesLog2 = 12;
constexpr uint32_t kTextureSubresourceAlignmentBytes =
1 << kTextureSubresourceAlignmentBytesLog2;
// Texture fetch constant size field widths.
constexpr uint32_t kTexture1DMaxWidthLog2 = 24;
constexpr uint32_t kTexture1DMaxWidth = 1 << kTexture1DMaxWidthLog2;
constexpr uint32_t kTexture2DCubeMaxWidthHeightLog2 = 13;
constexpr uint32_t kTexture2DCubeMaxWidthHeight =
1 << kTexture2DCubeMaxWidthHeightLog2;
constexpr uint32_t kTexture2DMaxStackDepthLog2 = 6;
constexpr uint32_t kTexture2DMaxStackDepth = 1 << kTexture2DMaxStackDepthLog2;
constexpr uint32_t kTexture3DMaxWidthHeightLog2 = 11;
constexpr uint32_t kTexture3DMaxWidthHeight = 1 << kTexture3DMaxWidthHeightLog2;
constexpr uint32_t kTexture3DMaxDepthLog2 = 10;
constexpr uint32_t kTexture3DMaxDepth = 1 << kTexture3DMaxDepthLog2;
constexpr uint32_t kTextureMaxMips =
std::max(kTexture2DCubeMaxWidthHeightLog2, kTexture3DMaxWidthHeightLog2) +
1;
constexpr uint32_t kTextureTileWidthHeightLog2 = 5;
constexpr uint32_t kTextureTileWidthHeight = 1 << kTextureTileWidthHeightLog2;
// 3D tiled texture slices 0:3 and 4:7 are stored separately in memory, in
// non-overlapping ranges, but addressing in 4:7 is different than in 0:3.
constexpr uint32_t kTextureTileDepthLog2 = 2;
constexpr uint32_t kTextureTileDepth = 1 << kTextureTileDepthLog2;
// Texture tile address function periods:
// - 2D 1bpb: 128x128
// - 2D 2bpb: 64x64
// - 2D 4bpb+: 32x32
// - 3D 1bpb: 64x32x8
// - 3D 2bpb+: 32x32x8
constexpr uint32_t GetTextureTiledXBaseGranularityLog2(
bool is_3d, uint32_t bytes_per_block_log2) {
return 7 - std::min(UINT32_C(2), bytes_per_block_log2 + uint32_t(is_3d));
}
constexpr uint32_t GetTextureTiledYBaseGranularityLog2(
bool is_3d, uint32_t bytes_per_block_log2) {
return is_3d ? 5 : (7 - std::min(UINT32_C(2), bytes_per_block_log2));
}
constexpr uint32_t kTextureTiledZBaseGranularityLog2 = 3;
constexpr uint32_t kTextureTiledZBaseGranularity =
1 << kTextureTiledZBaseGranularityLog2;
// Row pitch alignment of non-tiled textures.
constexpr uint32_t kTextureLinearRowAlignmentBytesLog2 = 8;
constexpr uint32_t kTextureLinearRowAlignmentBytes =
1 << kTextureLinearRowAlignmentBytesLog2;
// XE_GPU_REG_SHADER_CONSTANT_FETCH_*
union alignas(uint32_t) xe_gpu_texture_fetch_t {
struct {
uint32_t dword_0;
uint32_t dword_1;
uint32_t dword_2;
uint32_t dword_3;
uint32_t dword_4;
uint32_t dword_5;
};
struct {
FetchConstantType type : 2; // +0 dword_0
// The signedness applies to the data components (before the swizzle, which
// is the destination selection).
// Signed repeating fraction formats always use the kZeroClampMinusOne mode,
// according to the IPR2015-00325 R400 Document Library Folder History:
// "Change 133990 on 2003/11/25 by jhoule@jhoule_doc_lt
// v1.80 - Indicated that NO_ZERO srf mode is unsupported for Xenos (will
// currently only work in the VC path)"
TextureSign sign_x : 2; // +2
TextureSign sign_y : 2; // +4
TextureSign sign_z : 2; // +6
TextureSign sign_w : 2; // +8
ClampMode clamp_x : 3; // +10
ClampMode clamp_y : 3; // +13
ClampMode clamp_z : 3; // +16
uint32_t _pad_0_19 : 3; // +19
// Base row pitch in pixels (not blocks) >> 5. For linear textures, this is
// provided by Direct3D 9 in a way that every row of blocks ends up aligned
// to kTextureLinearRowAlignmentBytes (the GPU requires 256-byte alignment
// of linear texture block rows for all textures).
// Mips are always stored with padding to the `max(next_pow2(base width or
// height) >> level, 1)` or a 32x32x4 tile (whichever is larger), so this
// pitch is irrelevant to them (but the 256-byte alignment requirement still
// applies to linear textures).
// Examples of pitch > aligned width:
// - 584109FF (loading screen and menu backgrounds, 1408 for a 1280x linear
// k_DXT4_5 texture, which corresponds to 22 * 256 bytes rather than
// 20 * 256 for just 1280x).
uint32_t pitch : 9; // +22
uint32_t tiled : 1; // +31
TextureFormat format : 6; // +0 dword_1
Endian endianness : 2; // +6
uint32_t request_size : 2; // +8
uint32_t stacked : 1; // +10
uint32_t nearest_clamp_policy : 1; // +11 d3d/opengl
uint32_t base_address : 20; // +12 base address >> 12
// Size is stored with 1 subtracted from each component.
union { // dword_2
struct {
uint32_t width : 24;
uint32_t _pad_size_1d : 8;
} size_1d;
struct {
uint32_t width : 13;
uint32_t height : 13;
// Should be 0 for k2D and 5 for kCube if not stacked, but not very
// meaningful in this case, likely should be ignored for non-stacked.
uint32_t stack_depth : 6;
} size_2d;
struct {
uint32_t width : 11;
uint32_t height : 11;
uint32_t depth : 10;
} size_3d;
};
uint32_t num_format : 1; // +0 dword_3 frac/int
// xyzw, 3b each (XE_GPU_TEXTURE_SWIZZLE)
uint32_t swizzle : 12; // +1
int32_t exp_adjust : 6; // +13
TextureFilter mag_filter : 2; // +19
TextureFilter min_filter : 2; // +21
TextureFilter mip_filter : 2; // +23
AnisoFilter aniso_filter : 3; // +25
ArbitraryFilter arbitrary_filter : 3; // +28
uint32_t border_size : 1; // +31
uint32_t vol_mag_filter : 1; // +0 dword_4
uint32_t vol_min_filter : 1; // +1
uint32_t mip_min_level : 4; // +2
uint32_t mip_max_level : 4; // +6
uint32_t mag_aniso_walk : 1; // +10
uint32_t min_aniso_walk : 1; // +11
// 5 fractional bits (A2XX_SQ_TEX_4_LOD_BIAS).
int32_t lod_bias : 10; // +12
// Also known as LodBiasH/V in sys2gmem.
int32_t grad_exp_adjust_h : 5; // +22
int32_t grad_exp_adjust_v : 5; // +27
BorderColor border_color : 2; // +0 dword_5
uint32_t force_bc_w_to_max : 1; // +2
// Also known as TriJuice.
uint32_t tri_clamp : 2; // +3
int32_t aniso_bias : 4; // +5
DataDimension dimension : 2; // +9
uint32_t packed_mips : 1; // +11
uint32_t mip_address : 20; // +12 mip address >> 12
};
};
static_assert_size(xe_gpu_texture_fetch_t, sizeof(uint32_t) * 6);
// XE_GPU_REG_SHADER_CONSTANT_FETCH_*
union alignas(uint32_t) xe_gpu_fetch_group_t {
struct {
uint32_t dword_0;
uint32_t dword_1;
uint32_t dword_2;
uint32_t dword_3;
uint32_t dword_4;
uint32_t dword_5;
};
xe_gpu_texture_fetch_t texture_fetch;
struct {
xe_gpu_vertex_fetch_t vertex_fetch_0;
xe_gpu_vertex_fetch_t vertex_fetch_1;
xe_gpu_vertex_fetch_t vertex_fetch_2;
};
struct {
uint32_t type_0 : 2;
uint32_t data_0_a : 30;
uint32_t data_0_b : 32;
uint32_t type_1 : 2;
uint32_t data_1_a : 30;
uint32_t data_1_b : 32;
uint32_t type_2 : 2;
uint32_t data_2_a : 30;
uint32_t data_2_b : 32;
};
};
static_assert_size(xe_gpu_fetch_group_t, sizeof(uint32_t) * 6);
// Shader memory export (memexport) allows for writing of arbitrary formatted
// data with random access / scatter capabilities. It provides functionality
// largely similar to resolving - format packing, supporting arbitrary color
// formats, from sub-dword ones such as k_8 in 58410B86, to 128-bit ones, with
// endian swap similar to how it's performed in resolves (up to 128-bit);
// specifying the number format, swapping red and blue channels - though with no
// exponent biasing. Unlike resolving, however, instead of writing to tiled
// textures, it exports the data to up to 5 elements (the eM# shader registers,
// each corresponding to `base address + element size * (offset + 0...4)`) in a
// stream defined by a stream constant and an offset in elements written to eA -
// a shader, however, can write to multiple streams with different or the same
// stream constants, by performing `alloc export` multiple times. It's used
// mostly in vertex shaders (most commonly in improvised "compute shaders" done
// by executing a vertex shader for a number of point-type primitives covering
// nothing), though usage in pixel shaders is also possible - an example is
// provided in the "Advanced Screenspace Antialiasing" presentation by Arne
// Schober.
// https://ubm-twvideo01.s3.amazonaws.com/o1/vault/gdceurope2010/slides/A_Schober_Advanced_Screenspace_Antialiasing.pdf
//
// Unlike fetch constants, which are passed via special registers, a memory
// export stream is configured by writing the stream constant and the offset to
// a shader export register (eA) allocated by the shader - similar to more
// conventional exports like oPos, o#, oC#. Therefore, in general, it's not
// possible to know what its value will be without running the shader. For
// emulation, this means that the memory range referenced by an export - that
// needs to be validated - requires running the shader on the CPU in general.
// Thankfully, however, the usual way of setting up eA is by executing:
// `mad eA, r#, const0100, c#`
// where c# is the stream float4 constant from the float constant registers, and
// const0100 is a literal (0.0f, 1.0f, 0.0f, 0.0f) constant, also from the float
// constant registers, used for placing the element index (r#) in the correct
// component of eA. This allows for easy gathering of memexport stream
// constants, which contain both the base address and the size of the
// destination buffer for bounds checking, from the shader code and the float
// constant registers, as long as the guest uses this instruction pattern to
// write to eA.
//
// The Xenos doesn't have an integer ALU, and denormals are treated as zero and
// are flushed. However, eA contains integers and bit fields. A stream constant
// is thus structured in a way that allows for packing integers in normalized
// floating-point numbers.
//
// X contains the base address of the stream in dwords as integer bits in the
// lower 30 bits, and bits 0b01 in the top. The 0b01 bits make the exponent
// nonzero, so the number is considered normalized, and therefore isn't flushed
// to zero. With only 512 MB of the physical memory on the Xbox 360, the
// exponent can't become 0b11111111, so X also won't be NaN for any valid Xbox
// 360 physical address (though in general the GPU supports 32-bit addresses,
// but this is originally an Xbox 360-specific feature, that was later, however,
// likely reused for GL_QCOM_writeonly_rendering).
//
// TODO(Triang3l): Verify whether GL_QCOM_writeonly_rendering is actually
// memexport on the Adreno 2xx using GL_OES_get_program_binary - it's also
// interesting to see how alphatest interacts with it, whether it's still true
// fixed-function alphatest, as it's claimed to be supported as usual by the
// extension specification.
//
// Y of eA contains the offset in elements - this is what shaders are supposed
// to calculate from something like the vertex index. Again, it's specified as
// an integer in the low bits, not as a truly floating-point number. For this
// purpose, stream constants contain the value 2^23 - when a whole
// floating-point number smaller than 2^23 is added as floating-point to 2^23,
// its integer representation becomes the mantissa bits of a number with an
// exponent of 23. Via multiply-add, `offset * 1.0f + exp2f(23)` is written here
// by the shader, allowing for element offsets of up to 2^23 - 1.
//
// Z is a bit field with the information about the formatting of the data. It's
// also packed as a normalized floating-point number, but in a cleaner way than
// X because not as many bits are required - just like Y, it has an exponent of
// 23 (possibly to let shaders build these values manually using floating-point
// multiply-add like integer shift-or, and finally to add 2^23, though that's
// not a case easy to handle in emulation, unlike prebuilt stream constants).
//
// W contains the number of elements in the stream. It's also packed with the
// full 23 exponent just like Y and Z, there's no way to index more than 2^23
// elements using packing via addition to 2^23, so this field also doesn't need
// more bits than that.
//
// According to the sequencer specification from IPR2015-00325 (where memexport
// is called "pass thru export"):
// - Pass thru exports can occur anywhere in the shader program.
// - There can be any number of pass thru exports.
// - The address register is not kept across clause boundaries, so it must be
// refreshed after any Serialize (or yield), allocate instruction or resource
// change.
// - The write to eM# may be predicated if the export is not needed.
// - Exports are dropped if:
// - The index is above the maximum.
// - The index sign bit is 1.
// - The exponent of the index is not 23.
// The requirement that eM4 must be written if any eM# other than eM0 is also
// written doesn't apply to the final Xenos, it's likely an outdated note in the
// specification considering that it's very preliminary.
//
// According to Microsoft's shader validator:
// - eA can be written only by `mad`.
// - A single eM# can be written by any number of instruction, including with
// write masking.
// - eA must be written before eM#.
// - Any alloc instruction or a `serialize` terminates the current memory
// export. This doesn't apply to `exec Yield=true`, however, and it's not
// clear if that's an oversight or if that's not considered a yield that
// terminates the export.
//
// From the emulation perspective, this means that:
// - Alloc instructions (`alloc export` mandatorily, other allocs optionally),
// and optionally `serialize` instructions within `exec`, should be treated as
// the locations where the currently open export should be flushed to the
// memory. It should be taken into account that an export may be in looping
// control flow, and in this case it must be performed at every iteration.
// - Whether each eM# was written to must be tracked at shader execution time,
// as predication can disable the export of an element.
//
// TODO(Triang3l): Investigate how memory export interacts with pixel killing.
// Given that eM# writes disabled by predication don't cause an export, it's
// possible that killed invocations are treated as inactive (invalid in Xenos
// terms) overall, and thus new memory exports from them shouldn't be done, but
// that's not verified. However, given that on Direct3D 11+, OpenGL and Vulkan
// hosts, discarding disables subsequent storage resource writes, on the host,
// it would be natural to perform all outstanding memory exports before
// discarding if the kill condition passes.
//
// Memory exports can be performed to any ColorFormat, including 8bpp and 16bpp
// ones. Hosts, however, may have the memory bound as a 32bpp buffer (for
// instance, due to the minimum resource view size limitation on Direct3D 11).
// In this case, bytes and shorts aren't addressable directly. However, taking
// into account that memory accesses are coherent within one shader invocation
// on Direct3D 11+, OpenGL and Vulkan and thus are done in order relatively to
// each other, it should be possible to implement them by clearing the bits via
// an atomic AND, and writing the new value using an atomic OR. This will, of
// course, make the entire write operation non-atomic, and in case of a race
// between writes to the same location, the final result may not even be just a
// value from one of the invocations, but rather, it can be OR of the values
// from any invocations involved. However, on the Xenos, there doesn't seem to
// be any possibility of meaningfully accessing the same location from multiple
// invocations if any of them is writing, memory exports are out-of-order, so
// such an implementation shouldn't be causing issues in reality. Atomic
// compare-exchange, however, should not be used for this purpose, as it may
// result in an infinite loop if different invocations want to write different
// values to the same memory location.
//
// Examples of setup in titles (Z from MSB to LSB):
//
// 4D5307E6 particles (different VS invocation counts, like 1, 2, 4):
// There is a passthrough shader - useful for verification as it simply writes
// directly what it reads via vfetch of various formats. Another shader (with
// different c# numbers, but same formats) does complicated math to process the
// particles.
// c152: Z = 010010110000|0|111|00|100110|00000|010, count = 35840
// 8in32, 32_32_32_32_FLOAT, float, RGBA - from 32_32_32_32_FLOAT vfetch
// c154, 162: Z = 010010110000|0|111|00|100000|00000|001, count = 71680
// 8in16, 16_16_16_16_FLOAT, float, RGBA - from 16_16_16_16_FLOAT vfetch
// c156, 158, 160: Z = 010010110000|0|000|00|011010|00000|001, count = 71680
// 8in16, 16_16_16_16, unorm, RGBA - from 16_16_16_16 unorm vfetch
// c164: Z = 010010110000|0|111|00|011111|00000|001, count = 143360
// 8in16, 16_16_FLOAT, float, RGBA - from 16_16_FLOAT vfetch
// c166: Z = 010010110000|0|000|00|011001|00000|001, count = 143360
// 8in16, 16_16, unorm, RGBA - from 16_16 unorm vfetch
// c168: Z = 010010110000|0|001|00|000111|00000|010, count = 143360
// 8in32, 2_10_10_10, snorm, RGBA - from 2_10_10_10 snorm vfetch
// c170, c172: Z = 010010110000|1|000|00|000110|00000|010, count = 143360
// 8in32, 8_8_8_8, unorm, BGRA - from 8_8_8_8 unorm vfetch with .zyxw swizzle
//
// 4D5307E6 water simulation (2048 VS invocations):
// c130: Z = 010010110000|0|111|00|100110|00000|010, count = 16384
// 8in32, 32_32_32_32_FLOAT, float, RGBA
// The shader has 5 memexports of this kind and 6 32_32_32_32_FLOAT vfetches.
//
// 4D5307E6 water tessellation factors (1 VS invocation per triangle patch):
// c130: Z = 010010110000|0|111|11|100100|11111|010, count = patch count * 3
// 8in32, 32_FLOAT, float, RGBA
//
// 41560817 texture memory copying (64 bytes per invocation, two eA, eight eM#):
// c0: Z = 010010110000|0|010|11|011010|00011|001
// 8in16, 16_16_16_16, uint, RGBA - from 16_16_16_16 uint vfetch
// (16_16_16_16 is the largest color format without special values)
//
// 58410B86 hierarchical depth buffer occlusion culling with the result read on
// the CPU (15000 VS invocations in the main menu):
// c8: Z = 010010110000|0|010|00|000010|00000|000, count = invocation count
// No endian swap, 8, uint, RGBA
union alignas(uint32_t) xe_gpu_memexport_stream_t {
struct {
uint32_t dword_0;
uint32_t dword_1;
uint32_t dword_2;
uint32_t dword_3;
};
struct {
uint32_t base_address : 30; // +0 dword_0 physical address >> 2
uint32_t const_0x1 : 2; // +30
uint32_t const_0x4b000000; // +0 dword_1
Endian128 endianness : 3; // +0 dword_2
uint32_t unused_0 : 5; // +3
ColorFormat format : 6; // +8
uint32_t unused_1 : 2; // +14
SurfaceNumberFormat num_format : 3; // +16
uint32_t red_blue_swap : 1; // +19
uint32_t const_0x4b0 : 12; // +20
uint32_t index_count : 23; // +0 dword_3
uint32_t const_0x96 : 9; // +23
};
};
static_assert_size(xe_gpu_memexport_stream_t, sizeof(uint32_t) * 4);
struct alignas(uint32_t) xe_gpu_depth_sample_counts {
// This is little endian as it is swapped in D3D code.
// Corresponding A and B values are summed up by D3D.
// Occlusion there is calculated by substracting begin from end struct.
le<uint32_t> Total_A;
le<uint32_t> Total_B;
le<uint32_t> ZFail_A;
le<uint32_t> ZFail_B;
le<uint32_t> ZPass_A;
le<uint32_t> ZPass_B;
le<uint32_t> StencilFail_A;
le<uint32_t> StencilFail_B;
};
static_assert_size(xe_gpu_depth_sample_counts, sizeof(uint32_t) * 8);
// Enum of event values used for VGT_EVENT_INITIATOR
enum Event {
VS_DEALLOC = 0,
PS_DEALLOC = 1,
VS_DONE_TS = 2,
PS_DONE_TS = 3,
CACHE_FLUSH_TS = 4,
CONTEXT_DONE = 5,
CACHE_FLUSH = 6,
VIZQUERY_START = 7,
VIZQUERY_END = 8,
SC_WAIT_WC = 9,
MPASS_PS_CP_REFETCH = 10,
MPASS_PS_RST_START = 11,
MPASS_PS_INCR_START = 12,
RST_PIX_CNT = 13,
RST_VTX_CNT = 14,
TILE_FLUSH = 15,
CACHE_FLUSH_AND_INV_TS_EVENT = 20,
ZPASS_DONE = 21,
CACHE_FLUSH_AND_INV_EVENT = 22,
PERFCOUNTER_START = 23,
PERFCOUNTER_STOP = 24,
SCREEN_EXT_INIT = 25,
SCREEN_EXT_RPT = 26,
VS_FETCH_DONE_TS = 27,
};
// Opcodes (IT_OPCODE) for Type-3 commands in the ringbuffer.
// https://github.com/freedreno/amd-gpu/blob/master/include/api/gsl_pm4types.h
// Not sure if all of these are used.
// clang-format off
enum Type3Opcode {
PM4_ME_INIT = 0x48, // initialize CP's micro-engine
PM4_NOP = 0x10, // skip N 32-bit words to get to the next packet
PM4_INDIRECT_BUFFER = 0x3f, // indirect buffer dispatch. prefetch parser uses this packet type to determine whether to pre-fetch the IB
PM4_INDIRECT_BUFFER_PFD = 0x37, // indirect buffer dispatch. same as IB, but init is pipelined
PM4_WAIT_FOR_IDLE = 0x26, // wait for the IDLE state of the engine
PM4_WAIT_REG_MEM = 0x3c, // wait until a register or memory location is a specific value
PM4_WAIT_REG_EQ = 0x52, // wait until a register location is equal to a specific value
PM4_WAIT_REG_GTE = 0x53, // wait until a register location is >= a specific value
PM4_WAIT_UNTIL_READ = 0x5c, // wait until a read completes
PM4_WAIT_IB_PFD_COMPLETE = 0x5d, // wait until all base/size writes from an IB_PFD packet have completed
PM4_REG_RMW = 0x21, // register read/modify/write
PM4_REG_TO_MEM = 0x3e, // reads register in chip and writes to memory
PM4_MEM_WRITE = 0x3d, // write N 32-bit words to memory
PM4_MEM_WRITE_CNTR = 0x4f, // write CP_PROG_COUNTER value to memory
PM4_COND_EXEC = 0x44, // conditional execution of a sequence of packets
PM4_COND_WRITE = 0x45, // conditional write to memory or register
PM4_EVENT_WRITE = 0x46, // generate an event that creates a write to memory when completed
PM4_EVENT_WRITE_SHD = 0x58, // generate a VS|PS_done event
PM4_EVENT_WRITE_CFL = 0x59, // generate a cache flush done event
PM4_EVENT_WRITE_EXT = 0x5a, // generate a screen extent event
PM4_EVENT_WRITE_ZPD = 0x5b, // generate a z_pass done event
PM4_DRAW_INDX = 0x22, // initiate fetch of index buffer and draw
PM4_DRAW_INDX_2 = 0x36, // draw using supplied indices in packet
PM4_DRAW_INDX_BIN = 0x34, // initiate fetch of index buffer and binIDs and draw
PM4_DRAW_INDX_2_BIN = 0x35, // initiate fetch of bin IDs and draw using supplied indices
PM4_VIZ_QUERY = 0x23, // begin/end initiator for viz query extent processing
PM4_SET_STATE = 0x25, // fetch state sub-blocks and initiate shader code DMAs
PM4_SET_CONSTANT = 0x2d, // load constant into chip and to memory
PM4_SET_CONSTANT2 = 0x55, // INCR_UPDATE_STATE
PM4_SET_SHADER_CONSTANTS = 0x56, // INCR_UPDT_CONST
PM4_LOAD_ALU_CONSTANT = 0x2f, // load constants from memory
PM4_IM_LOAD = 0x27, // load sequencer instruction memory (pointer-based)
PM4_IM_LOAD_IMMEDIATE = 0x2b, // load sequencer instruction memory (code embedded in packet)
PM4_LOAD_CONSTANT_CONTEXT = 0x2e, // load constants from a location in memory
PM4_INVALIDATE_STATE = 0x3b, // selective invalidation of state pointers
PM4_SET_SHADER_BASES = 0x4A, // dynamically changes shader instruction memory partition
PM4_SET_BIN_BASE_OFFSET = 0x4B, // program an offset that will added to the BIN_BASE value of the 3D_DRAW_INDX_BIN packet
PM4_SET_BIN_MASK = 0x50, // sets the 64-bit BIN_MASK register in the PFP
PM4_SET_BIN_SELECT = 0x51, // sets the 64-bit BIN_SELECT register in the PFP
PM4_CONTEXT_UPDATE = 0x5e, // updates the current context, if needed
PM4_INTERRUPT = 0x54, // generate interrupt from the command stream
PM4_XE_SWAP = 0x64, // Xenia only: VdSwap uses this to trigger a swap.
PM4_IM_STORE = 0x2c, // copy sequencer instruction memory to system memory
// Tiled rendering:
// https://www.google.com/patents/US20060055701
PM4_SET_BIN_MASK_LO = 0x60,
PM4_SET_BIN_MASK_HI = 0x61,
PM4_SET_BIN_SELECT_LO = 0x62,
PM4_SET_BIN_SELECT_HI = 0x63,
};
// clang-format on
inline uint32_t MakePacketType0(uint16_t index, uint16_t count,
bool one_reg = false) {
// ttcccccc cccccccc oiiiiiii iiiiiiii
assert(index <= 0x7FFF);
assert(count >= 1 && count <= 0x4000);
return (0u << 30) | (((count - 1) & 0x3FFF) << 16) | (index & 0x7FFF);
}
inline uint32_t MakePacketType1(uint16_t index_1, uint16_t index_2) {
// tt?????? ??222222 22222111 11111111
assert(index_1 <= 0x7FF);
assert(index_2 <= 0x7FF);
return (1u << 30) | ((index_2 & 0x7FF) << 11) | (index_1 & 0x7FF);
}
constexpr inline uint32_t MakePacketType2() {
// tt?????? ???????? ???????? ????????
return (2u << 30);
}
inline uint32_t MakePacketType3(Type3Opcode opcode, uint16_t count,
bool predicate = false) {
// ttcccccc cccccccc ?ooooooo ???????p
assert(opcode <= 0x7F);
assert(count >= 1 && count <= 0x4000);
return (3u << 30) | (((count - 1) & 0x3FFF) << 16) | ((opcode & 0x7F) << 8) |
(predicate ? 1 : 0);
}
} // namespace xenos
} // namespace gpu
} // namespace xe
#endif // XENIA_GPU_XENOS_H_