R"(
#version 450
layout(local_size_x = %ws, local_size_y = 1, local_size_z = 1) in;

// RSX tiled <-> linear memory conversion kernel.
// MEMORY_OP selects the direction: MEMORY_OP_TILE copies linear_data into
// tiled_data, MEMORY_OP_DETILE copies tiled_data into linear_data.
// NOTE(review): the percent-prefixed tokens in this file are presumably
// substituted by the host before compilation - confirm against the loader.

#define SSBO_LOCATION(x) (x + %loc)

#define MEMORY_OP %op
#define MEMORY_OP_DETILE 0
#define MEMORY_OP_TILE 1

// The source buffer of the selected operation is declared readonly.
#if (MEMORY_OP == MEMORY_OP_TILE)
	#define TILED_DATA_MODIFIER
	#define LINEAR_DATA_MODIFIER readonly
#else
	#define TILED_DATA_MODIFIER readonly
	#define LINEAR_DATA_MODIFIER
#endif

layout(%set, binding=SSBO_LOCATION(0), std430) TILED_DATA_MODIFIER restrict buffer TiledDataBlock
{
	uint tiled_data[];
};

layout(%set, binding=SSBO_LOCATION(1), std430) LINEAR_DATA_MODIFIER restrict buffer LinearDataBlock
{
	uint linear_data[];
};

#ifdef VULKAN
layout(%push_block) uniform Configuration
{
	uint prime;               /* Prime factor derived from the number of tiles per row */
	uint factor;              /* Counterpart to the prime value. prime * factor = tiles per row. */
	uint num_tiles_per_row;   /* Pitch / tile-width. Each "tile" is 256 bytes long */
	uint tile_base_address;   /* Base address for this tile. */
	uint tile_size;           /* Size of the whole tile. */
	uint tile_address_offset; /* Address offset where the texture region sits. */
	uint tile_rw_offset;      /* Access offset. If we load the entire tile then this is 0, but can be a multiple of pitch if we skip some rows for performance reasons. */
	uint tile_pitch;          /* Row length in bytes for every line in the tile and consequently the image. */
	uint tile_bank;           /* Bank sense offset. Acts as a memory-subsystem bias so that different FBOs can make use of different parts of the circuitry */
	uint image_width;         /* Width of the linear 2D region we're encoding/decoding */
	uint image_height;        /* Height of the linear 2D region to encode/decode */
	uint image_pitch;         /* Image pitch. The incoming data may be from a GPU operation with packed pixels which can have a different pitch than the tile we're writing from/to */
	uint image_bpp;           /* Texel width of the image format, in bytes (the accessors below handle 1, 2, 4, 8 and 16). */
};
#else
	uniform uint prime;               /* Prime factor derived from the number of tiles per row */
	uniform uint factor;              /* Counterpart to the prime value. prime * factor = tiles per row. */
	uniform uint num_tiles_per_row;   /* Pitch / tile-width. Each "tile" is 256 bytes long */
	uniform uint tile_base_address;   /* Base address for this tile. */
	uniform uint tile_size;           /* Size of the whole tile. */
	uniform uint tile_address_offset; /* Address offset where the texture region sits. */
	uniform uint tile_rw_offset;      /* Access offset. If we load the entire tile then this is 0, but can be a multiple of pitch if we skip some rows for performance reasons. */
	uniform uint tile_pitch;          /* Row length in bytes for every line in the tile and consequently the image. */
	uniform uint tile_bank;           /* Bank sense offset. Acts as a memory-subsystem bias so that different FBOs can make use of different parts of the circuitry */
	uniform uint image_width;         /* Width of the linear 2D region we're encoding/decoding */
	uniform uint image_height;        /* Height of the linear 2D region to encode/decode */
	uniform uint image_pitch;         /* Image pitch. The incoming data may be from a GPU operation with packed pixels which can have a different pitch than the tile we're writing from/to */
	uniform uint image_bpp;           /* Texel width of the image format, in bytes (the accessors below handle 1, 2, 4, 8 and 16). */
#endif

// Hard constants, set by hardware
#define RSX_TILE_WIDTH 256
#define RSX_TILE_HEIGHT 64

// Texel accessors.
// 'offset' is a texel index (units of image_bpp bytes) into the uint-typed buffer.
// Texels wider than 4 bytes span several consecutive words; texels narrower than
// 4 bytes are packed into one word and are extracted/inserted with bitfield ops.
// An unsupported image_bpp reads as zero and writes nothing.
// Sub-word writes are a plain read-modify-write of the containing word (no atomics).

#if (MEMORY_OP == MEMORY_OP_TILE)

// Fetch one texel from the linear buffer. Unused components are zero.
uvec4 read_linear(const in uint offset)
{
	switch (image_bpp)
	{
	case 16:
	{
		return uvec4(
			linear_data[offset * 4],
			linear_data[offset * 4 + 1],
			linear_data[offset * 4 + 2],
			linear_data[offset * 4 + 3]);
	}
	case 8:
	{
		return uvec4(
			linear_data[offset * 2],
			linear_data[offset * 2 + 1],
			0,
			0);
	}
	case 4:
	{
		return uvec4(linear_data[offset], 0, 0, 0);
	}
	case 2:
	{
		// Two 16-bit texels per word; the low index bit selects the half.
		const uint word = linear_data[offset >> 1];
		const int shift = int(offset & 1) << 4;
		return uvec4(bitfieldExtract(word, shift, 16), 0, 0, 0);
	}
	case 1:
	{
		// Four 8-bit texels per word; the low two index bits select the byte.
		const uint word = linear_data[offset >> 2];
		const int shift = int(offset & 3) << 3;
		return uvec4(bitfieldExtract(word, shift, 8), 0, 0, 0);
	}
	default:
		return uvec4(0);
	}
}

// Store one texel into the tiled buffer. Only the components covered by image_bpp are written.
void write_tiled(const in uint offset, const in uvec4 value)
{
	switch (image_bpp)
	{
	case 16:
	{
		tiled_data[offset * 4] = value.x;
		tiled_data[offset * 4 + 1] = value.y;
		tiled_data[offset * 4 + 2] = value.z;
		tiled_data[offset * 4 + 3] = value.w;
		break;
	}
	case 8:
	{
		tiled_data[offset * 2] = value.x;
		tiled_data[offset * 2 + 1] = value.y;
		break;
	}
	case 4:
	{
		tiled_data[offset] = value.x;
		break;
	}
	case 2:
	{
		const uint word_offset = offset >> 1;
		const uint word = tiled_data[word_offset];
		const int shift = int(offset & 1) << 4;
		tiled_data[word_offset] = bitfieldInsert(word, value.x, shift, 16);
		break;
	}
	case 1:
	{
		const uint word_offset = offset >> 2;
		const uint word = tiled_data[word_offset];
		const int shift = int(offset & 3) << 3;
		tiled_data[word_offset] = bitfieldInsert(word, value.x, shift, 8);
		break;
	}
	default:
		break;
	}
}

#else

// Fetch one texel from the tiled buffer. Unused components are zero.
uvec4 read_tiled(const in uint offset)
{
	switch (image_bpp)
	{
	case 16:
	{
		return uvec4(
			tiled_data[offset * 4],
			tiled_data[offset * 4 + 1],
			tiled_data[offset * 4 + 2],
			tiled_data[offset * 4 + 3]);
	}
	case 8:
	{
		return uvec4(
			tiled_data[offset * 2],
			tiled_data[offset * 2 + 1],
			0,
			0);
	}
	case 4:
	{
		return uvec4(tiled_data[offset], 0, 0, 0);
	}
	case 2:
	{
		// Two 16-bit texels per word; the low index bit selects the half.
		const uint word = tiled_data[offset >> 1];
		const int shift = int(offset & 1) << 4;
		return uvec4(bitfieldExtract(word, shift, 16), 0, 0, 0);
	}
	case 1:
	{
		// Four 8-bit texels per word; the low two index bits select the byte.
		const uint word = tiled_data[offset >> 2];
		const int shift = int(offset & 3) << 3;
		return uvec4(bitfieldExtract(word, shift, 8), 0, 0, 0);
	}
	default:
		return uvec4(0);
	}
}

// Store one texel into the linear buffer. Only the components covered by image_bpp are written.
void write_linear(const in uint offset, const in uvec4 value)
{
	switch (image_bpp)
	{
	case 16:
	{
		linear_data[offset * 4] = value.x;
		linear_data[offset * 4 + 1] = value.y;
		linear_data[offset * 4 + 2] = value.z;
		linear_data[offset * 4 + 3] = value.w;
		break;
	}
	case 8:
	{
		linear_data[offset * 2] = value.x;
		linear_data[offset * 2 + 1] = value.y;
		break;
	}
	case 4:
	{
		linear_data[offset] = value.x;
		break;
	}
	case 2:
	{
		const uint word_offset = offset >> 1;
		const uint word = linear_data[word_offset];
		const int shift = int(offset & 1) << 4;
		linear_data[word_offset] = bitfieldInsert(word, value.x, shift, 16);
		break;
	}
	case 1:
	{
		const uint word_offset = offset >> 2;
		const uint word = linear_data[word_offset];
		const int shift = int(offset & 3) << 3;
		linear_data[word_offset] = bitfieldInsert(word, value.x, shift, 8);
		break;
	}
	default:
		break;
	}
}

#endif

// Transfer the single texel at (row, col) of the linear image to or from its tiled location.
// The tiled byte address is assembled from a row address, bank selector, column
// selector and partition selector derived from the texel's linear address.
// Texels whose tiled address lands at or beyond tile_size (relative to
// tile_base_address) are skipped.
void do_memory_op(const in uint row, const in uint col)
{
	const uint row_offset = (row * tile_pitch) + tile_base_address + tile_address_offset;
	const uint this_address = row_offset + (col * image_bpp);

	// 1. Calculate row_addr
	// Despite its name, texel_offset counts RSX_TILE_WIDTH-byte units from the tile base.
	const uint texel_offset = (this_address - tile_base_address) / RSX_TILE_WIDTH;
	// Calculate coordinate of the tile grid we're supposed to be in
	const uint tile_x = texel_offset % num_tiles_per_row;
	const uint tile_y = (texel_offset / num_tiles_per_row) / RSX_TILE_HEIGHT;
	// Calculate the grid offset for the tile selected and add the base offset. It's supposed to affect the bank stuff in the next step
	const uint tile_id = tile_y * num_tiles_per_row + tile_x;
	const uint tile_selector = (tile_id + (tile_base_address >> 14)) & 0x3ffff;
	// Calculate row address
	const uint row_address = (tile_selector >> 2) & 0xffff;

	// 2. Calculate bank selector
	// There's a lot of weird math here, but it's just a variant of (tile_selector % 4) to pick a value between [0..3]
	// Any factor not matched below (0 or 3) leaves the selector at 0 before the bank bias is added.
	uint bank_selector = 0;
	const uint bank_distribution_lookup[16] = { 0, 1, 2, 3, 2, 3, 0, 1, 1, 2, 3, 0, 3, 0, 1, 2 };

	if (factor == 1)
	{
		bank_selector = (tile_selector & 3);
	}
	else if (factor == 2)
	{
		const uint idx = ((tile_selector + ((tile_y & 1) << 1)) & 3) * 4 + (tile_y & 3);
		bank_selector = bank_distribution_lookup[idx];
	}
	else if (factor >= 4)
	{
		const uint idx = (tile_selector & 3) * 4 + (tile_y & 3);
		bank_selector = bank_distribution_lookup[idx];
	}
	bank_selector = (bank_selector + tile_bank) % 4;

	// 3. Calculate column selector
	uint column_selector = 0;
	const uint line_offset_in_tile = (texel_offset / num_tiles_per_row) % RSX_TILE_HEIGHT;
	// Calculate column_selector by bit-twiddling line offset and the other calculated parameter bits:
	// column_selector[9:7] = line_offset_in_tile[5:3]
	// column_selector[6:4] = this_address[7:5]
	// column_selector[3:2] = line_offset_in_tile[1:0]
	// column_selector[1:0] = 0
	column_selector |= ((line_offset_in_tile >> 3) & 0x7) << 7;
	column_selector |= ((this_address >> 5) & 0x7) << 4;
	column_selector |= ((line_offset_in_tile >> 0) & 0x3) << 2;

	// 4. Calculate partition selector (0 or 1)
	const uint partition_selector = (((line_offset_in_tile >> 2) & 1) + ((this_address >> 6) & 1)) & 1;

	// 5. Build tiled address
	uint tile_address = 0;
	// tile_address[31:16] = row_adr[15:0]
	// tile_address[15:14] = bank_sel[1:0]
	// tile_address[13:8] = column_sel[9:4]
	// tile_address[7:7] = partition_sel[0:0]
	// tile_address[6:5] = column_sel[3:2]
	// tile_address[4:0] = this_address[4:0]
	tile_address |= ((row_address >> 0) & 0xFFFF) << 16;
	tile_address |= ((bank_selector >> 0) & 0x3) << 14;
	tile_address |= ((column_selector >> 4) & 0x3F) << 8;
	tile_address |= ((partition_selector >> 0) & 0x1) << 7;
	tile_address |= ((column_selector >> 2) & 0x3) << 5;
	tile_address |= ((this_address >> 0) & 0x1F) << 0;
	// Twiddle bits 9 and 10
	tile_address ^= (((tile_address >> 12) ^ ((bank_selector ^ tile_selector) & 1) ^ (tile_address >> 14)) & 1) << 9;
	tile_address ^= ((tile_address >> 11) & 1) << 10;

	// Calculate relative addresses and sample
	uint linear_image_offset = (row * image_pitch) + (col * image_bpp);
	uint tile_base_offset = tile_address - tile_base_address; // Distance from tile base address
	uint tile_data_offset = tile_base_offset - tile_rw_offset; // Distance from data base address

	if (tile_base_offset >= tile_size)
	{
		// Do not touch anything out of bounds
		return;
	}

	// Convert to texel addresses for data access
	linear_image_offset /= image_bpp;
	tile_data_offset /= image_bpp;

#if (MEMORY_OP == MEMORY_OP_DETILE)
	// Write to linear from tiled
	write_linear(linear_image_offset, read_tiled(tile_data_offset));
#else
	// Opposite. Write to tile from linear
	write_tiled(tile_data_offset, read_linear(linear_image_offset));
#endif
}

// Entry point. gl_GlobalInvocationID.y is the image row and gl_GlobalInvocationID.x
// the first column handled by this invocation. For texels narrower than 4 bytes the
// invocation walks (4 / image_bpp) consecutive columns; otherwise exactly one.
void main()
{
	// The 2D coordinates are retrieved from gl_GlobalInvocationID
	const uint num_iterations = (image_bpp < 4) ? (4 / image_bpp) : 1;
	const uint row = gl_GlobalInvocationID.y;
	const uint col0 = gl_GlobalInvocationID.x;

	if (row >= image_height)
	{
		// Out of bounds
		return;
	}

	// NOTE(review): col0 is not scaled by num_iterations here, so with a dispatch of one
	// invocation per column neighbouring invocations cover overlapping columns - confirm
	// the host-side dispatch width matches this mapping.
	for (uint col = col0; col < (col0 + num_iterations); ++col)
	{
		if (col >= image_width)
		{
			// Out of bounds. Columns only increase, so nothing further can be in range.
			return;
		}

		// Use the loop column, not col0, so every column in the span is processed once.
		do_memory_op(row, col);
	}
}
)"