diff --git a/src/xenia/base/utf8.cc b/src/xenia/base/utf8.cc index 943e293b1..6405aa2f8 100644 --- a/src/xenia/base/utf8.cc +++ b/src/xenia/base/utf8.cc @@ -12,6 +12,7 @@ #include #include #include +#include #define UTF_CPP_CPLUSPLUS 201703L #include "third_party/utfcpp/source/utf8.h" diff --git a/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc b/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc index 8c541e531..8ac3f921c 100644 --- a/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc +++ b/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc @@ -1413,8 +1413,9 @@ bool D3D12RenderTargetCache::Resolve(const Memory& memory, if (copy_dest_committed) { // Write the descriptors and transition the resources. // Full shared memory without resolution scaling, range of the scaled - // resolve buffer with scaling because only 128 R32 elements can be - // addressed on Nvidia. + // resolve buffer with scaling because only 128 * 2^20 R32 elements are + // guaranteed to be addressable + // (D3D12_REQ_BUFFER_RESOURCE_TEXEL_COUNT_2_TO_EXP). ui::d3d12::util::DescriptorCpuGpuHandlePair descriptor_dest; ui::d3d12::util::DescriptorCpuGpuHandlePair descriptor_source; ui::d3d12::util::DescriptorCpuGpuHandlePair descriptors[2]; diff --git a/src/xenia/gpu/d3d12/d3d12_shared_memory.h b/src/xenia/gpu/d3d12/d3d12_shared_memory.h index dfd1e52c2..abf069447 100644 --- a/src/xenia/gpu/d3d12/d3d12_shared_memory.h +++ b/src/xenia/gpu/d3d12/d3d12_shared_memory.h @@ -76,8 +76,8 @@ class D3D12SharedMemory : public SharedMemory { void WriteRawSRVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle); void WriteRawUAVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle); - // Due to the Nvidia 128 megatexel limitation, the smallest supported formats - // are 32-bit. + // Due to the D3D12_REQ_BUFFER_RESOURCE_TEXEL_COUNT_2_TO_EXP limitation, the + // smallest supported formats are 32-bit. 
void WriteUintPow2SRVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle, uint32_t element_size_bytes_pow2); void WriteUintPow2UAVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle, diff --git a/src/xenia/gpu/d3d12/d3d12_texture_cache.cc b/src/xenia/gpu/d3d12/d3d12_texture_cache.cc index ad9b320fc..74682680c 100644 --- a/src/xenia/gpu/d3d12/d3d12_texture_cache.cc +++ b/src/xenia/gpu/d3d12/d3d12_texture_cache.cc @@ -1715,9 +1715,10 @@ bool D3D12TextureCache::LoadTextureDataFromResidentMemoryImpl(Texture& texture, } // Begin loading. - // May use different buffers for scaled base and mips, and also can't address - // more than 128 megatexels directly on Nvidia - need two separate UAV - // descriptors for base and mips. + // May use different buffers for scaled base and mips, and also addressability + // of more than 128 * 2^20 (2^D3D12_REQ_BUFFER_RESOURCE_TEXEL_COUNT_2_TO_EXP) + // texels is not mandatory - need two separate UAV descriptors for base and + // mips. // Destination. uint32_t descriptor_count = 1; if (texture_resolution_scaled) { @@ -1820,7 +1821,8 @@ bool D3D12TextureCache::LoadTextureDataFromResidentMemoryImpl(Texture& texture, if (texture_resolution_scaled) { // Offset already applied in the buffer because more than 512 MB can't be - // directly addresses on Nvidia as R32. + // directly addressed as R32 on some hardware (above + // 2^D3D12_REQ_BUFFER_RESOURCE_TEXEL_COUNT_2_TO_EXP). 
load_constants.guest_offset = 0; } else { load_constants.guest_offset = guest_address; diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index ea9deb591..e52457d1a 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -975,11 +975,11 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, dest_height = rb_copy_dest_pitch.copy_dest_height; // The pointer is only adjusted to Z / 8, but the texture may have a depth // of (N % 8) <= 4, like 4, 12, 20 when rounded up to 4 - // (xenos::kTextureTiledDepthGranularity), so provide Z + 1 to measure the - // size of the texture conservatively, but without going out of the upper - // bound (though this still may go out of bounds a bit probably if - // resolving to non-zero XY, but not sure if that really happens and - // actually causes issues). + // (xenos::kTextureTileDepth), so provide Z + 1 to measure the size of the + // texture conservatively, but without going out of the upper bound + // (though this still may go out of bounds a bit probably if resolving to + // non-zero XY, but not sure if that really happens and actually causes + // issues). dest_depth = rb_copy_dest_info.copy_dest_slice + 1; } else { copy_dest_base_adjusted += texture_util::GetTiledOffset2D( diff --git a/src/xenia/gpu/draw_util.h b/src/xenia/gpu/draw_util.h index 7009e9d3e..883193074 100644 --- a/src/xenia/gpu/draw_util.h +++ b/src/xenia/gpu/draw_util.h @@ -299,6 +299,9 @@ union ResolveAddressPackedInfo { // taking 8x8 granularity into account) if the offset of the 160x32 region // itself, and the offset of the texture tile, are pre-added to the bases. + // TODO(Triang3l): Tiled address repeats every up to 128x128 blocks (for 2D + // 1bpb textures) - change the range to 640x128. + // In the EDRAM source, the whole offset is relative to the base. // In the texture, & 31 of the offset is relative to the base (the base is // adjusted to 32x32 tiles). 
@@ -374,7 +377,8 @@ struct ResolveCopyShaderInfo { // shader (at least 2). uint32_t source_bpe_log2; // Log2 of bytes per element of the type of the destination buffer bound to - // the shader (at least 2 because of Nvidia's 128 megatexel limit that + // the shader (at least 2 because of the 128 megatexel minimum requirement on + // Direct3D 10+ - D3D12_REQ_BUFFER_RESOURCE_TEXEL_COUNT_2_TO_EXP - that // prevents binding the entire shared memory buffer with smaller element // sizes). uint32_t dest_bpe_log2; diff --git a/src/xenia/gpu/dxbc_shader_translator_memexport.cc b/src/xenia/gpu/dxbc_shader_translator_memexport.cc index b345f12f4..8d1295ee7 100644 --- a/src/xenia/gpu/dxbc_shader_translator_memexport.cc +++ b/src/xenia/gpu/dxbc_shader_translator_memexport.cc @@ -18,12 +18,13 @@ namespace gpu { using namespace ucode; // TODO(Triang3l): Support sub-dword memexports (like k_8 in 58410B86). This -// would require four 128 MB R8_UINT UAVs due to the Nvidia addressing limit. -// Need to be careful with resource binding tiers, however. Resource binding -// tier 1 on feature level 11_0 allows only 8 UAVs _across all stages_. -// RWByteAddressBuffer + 4 typed buffers is 5 per stage already, would need 10 -// for both VS and PS, or even 11 with the eDRAM ROV. Need to drop draw commands -// doing memexport in both VS and PS on FL 11_0 resource binding tier 1. +// would require four 128 MB R8_UINT UAVs due to +// D3D12_REQ_BUFFER_RESOURCE_TEXEL_COUNT_2_TO_EXP. Need to be careful with +// resource binding tiers, however. Resource binding tier 1 on feature level +// 11_0 allows only 8 UAVs _across all stages_. RWByteAddressBuffer + 4 typed +// buffers is 5 per stage already, would need 10 for both VS and PS, or even 11 +// with the eDRAM ROV. Need to drop draw commands doing memexport in both VS and +// PS on FL 11_0 resource binding tier 1. 
void DxbcShaderTranslator::ExportToMemory_PackFixed32( const uint32_t* eM_temps, uint32_t eM_count, const uint32_t bits[4], diff --git a/src/xenia/gpu/primitive_processor.cc b/src/xenia/gpu/primitive_processor.cc index 0e0a53c3f..5c91abaa8 100644 --- a/src/xenia/gpu/primitive_processor.cc +++ b/src/xenia/gpu/primitive_processor.cc @@ -703,9 +703,9 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) { // Does not need indirection on backends not supporting full 32-bit // indices. if (guest_primitive_reset_index_guest_endian != UINT16_MAX) { - // If primitive reset is with a non-0xFFFF index is used, replace - // with 0xFFFF if 0xFFFF is not used as a real index, or with - // 0xFFFFFFFF if it is. + // If primitive reset with a non-0xFFFF index is used, replace with + // 0xFFFF if 0xFFFF is not used as a real index, or with 0xFFFFFFFF + // if it is. // Writing to the trace irrespective of the cache lookup result // because cache behavior depends on runtime configuration and // state. diff --git a/src/xenia/gpu/texture_util.cc b/src/xenia/gpu/texture_util.cc index 218ad1133..75278053d 100644 --- a/src/xenia/gpu/texture_util.cc +++ b/src/xenia/gpu/texture_util.cc @@ -2,7 +2,7 @@ ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * ****************************************************************************** - * Copyright 2018 Ben Vanik. All rights reserved. * + * Copyright 2022 Ben Vanik. All rights reserved. * * Released under the BSD license - see LICENSE in the root for more details. 
* ****************************************************************************** */ @@ -346,12 +346,10 @@ TextureGuestLayout GetGuestTextureLayout( uint32_t z_stride_bytes = level_layout.array_slice_stride_bytes; if (dimension == xenos::DataDimension::k3D) { level_layout.array_slice_stride_bytes *= - xe::align(depth_or_array_size, xenos::kTextureTiledDepthGranularity); + xe::align(depth_or_array_size, xenos::kTextureTileDepth); } - uint32_t array_slice_stride_bytes_non_4kb_aligned = - level_layout.array_slice_stride_bytes; level_layout.array_slice_stride_bytes = - xe::align(array_slice_stride_bytes_non_4kb_aligned, + xe::align(level_layout.array_slice_stride_bytes, xenos::kTextureSubresourceAlignmentBytes); // Estimate the memory amount actually referenced by the texture, which may @@ -374,34 +372,68 @@ TextureGuestLayout GetGuestTextureLayout( xe::align(level_width_blocks, xenos::kTextureTileWidthHeight); level_layout.y_extent_blocks = xe::align(level_height_blocks, xenos::kTextureTileWidthHeight); + uint32_t bytes_per_block_log2 = xe::log2_floor(bytes_per_block); if (dimension == xenos::DataDimension::k3D) { level_layout.z_extent = - xe::align(level_depth, xenos::kTextureTiledDepthGranularity); - // 3D texture addressing is pretty complex, so it's hard to determine - // the memory extent of a subregion - just use `pitch_tiles * - // height_tiles * depth_tiles * bytes_per_tile` at least for now, until - // we find a case where it causes issues. `width > pitch` is a very - // weird edge case anyway, and is extremely unlikely. - assert_true(level_layout.x_extent_blocks <= - row_pitch_blocks_tile_aligned); - level_layout.array_slice_data_extent_bytes = - array_slice_stride_bytes_non_4kb_aligned; + xe::align(level_depth, xenos::kTextureTileDepth); + // 32-block-row x 4 slice portions laid out sequentially (4-slice-major, + // 32-block-row-minor), address extent within a 32x32x4 tile depends on + // the pitch. 
Origins of 32x32x4 tiles grow monotonically, first along + // Z, then along Y, then along X. + level_layout.array_slice_data_extent_bytes = uint32_t(GetTiledOffset3D( + int32_t(level_layout.x_extent_blocks - + xenos::kTextureTileWidthHeight), + int32_t(level_layout.y_extent_blocks - + xenos::kTextureTileWidthHeight), + int32_t(level_layout.z_extent - xenos::kTextureTileDepth), + row_pitch_blocks_tile_aligned, level_layout.y_extent_blocks, + bytes_per_block_log2)); + switch (bytes_per_block_log2) { + case 0: + // 64x32x8 portions have independent addressing. + // Extent relative to the 32x32x4 tile origin: + // - Pitch = 32, 96, 160...: (Pitch / 64) * 0x1000 + 0x1000 + // - Pitch = 64, 128, 192...: (Pitch / 64) * 0x1000 + 0xC00 + level_layout.array_slice_data_extent_bytes += + ((row_pitch_blocks_tile_aligned >> 6) << 12) + 0xC00 + + ((row_pitch_blocks_tile_aligned & (1 << 5)) << (10 - 5)); + break; + default: + // 32x32x8 portions have independent addressing. + // Extent: ((Pitch / 32) * 0x1000 + 0x1000) * (BPB / 2) + // Or: ((Pitch / 32) * 0x1000 / 2 + 0x1000 / 2) * BPB + level_layout.array_slice_data_extent_bytes += + ((row_pitch_blocks_tile_aligned << (12 - 5 - 1)) + + (0x1000 >> 1)) + << bytes_per_block_log2; + break; + } } else { level_layout.z_extent = 1; - // 2D 32x32-block tiles are laid out linearly in the texture. - // Calculate the extent as ((all rows except for the last * pitch in - // tiles + last row length in tiles) * bytes per tile). - // FIXME(Triang3l): This is wrong for 1bpb and 2bpb. At 1bpb (32x32 is - // 1024 bytes), offset for X + 32 minus offset for X is 512, not 1024, - // but offset for X + 128 minus offset for X + 96 is 2560. Also, for - // XY = 0...31, the extent of the addresses is 2560, not 1024. At 2bpb, - // addressing repeats every 64x64, and the extent for XY = 0...31 is - // 3072, not 2048. 
- level_layout.array_slice_data_extent_bytes = - (level_layout.y_extent_blocks - xenos::kTextureTileWidthHeight) * - level_layout.row_pitch_bytes + - bytes_per_block * level_layout.x_extent_blocks * - xenos::kTextureTileWidthHeight; + // Origins of 32x32 tiles grow monotonically, first along Y, then along + // X. + level_layout.array_slice_data_extent_bytes = uint32_t(GetTiledOffset2D( + int32_t(level_layout.x_extent_blocks - + xenos::kTextureTileWidthHeight), + int32_t(level_layout.y_extent_blocks - + xenos::kTextureTileWidthHeight), + row_pitch_blocks_tile_aligned, bytes_per_block_log2)); + switch (bytes_per_block_log2) { + case 0: + // Independent addressing within 128x128 portions, but the extent is + // 0xA00 bytes from the 32x32 tile origin. + level_layout.array_slice_data_extent_bytes += 0xA00; + break; + case 1: + // Independent addressing within 64x64 portions, but the extent is + // 0xC00 bytes from the 32x32 tile origin. + level_layout.array_slice_data_extent_bytes += 0xC00; + break; + default: + level_layout.array_slice_data_extent_bytes += + UINT32_C(0x400) << bytes_per_block_log2; + break; + } } } else { if (level == layout.packed_level) { diff --git a/src/xenia/gpu/texture_util.h b/src/xenia/gpu/texture_util.h index e1f849b42..855132f49 100644 --- a/src/xenia/gpu/texture_util.h +++ b/src/xenia/gpu/texture_util.h @@ -2,7 +2,7 @@ ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * ****************************************************************************** - * Copyright 2018 Ben Vanik. All rights reserved. * + * Copyright 2022 Ben Vanik. All rights reserved. * * Released under the BSD license - see LICENSE in the root for more details. 
* ****************************************************************************** */ @@ -198,22 +198,71 @@ void GetTextureTotalSize(xenos::DataDimension dimension, bool has_packed_mips, uint32_t* base_size_out, uint32_t* mip_size_out); -// Notes about tiled addresses that can be useful for simplifying and optimizing -// tiling/untiling: -// - Offset2D(X * 32 + x, Y * 32 + y) == -// Offset2D(X * 32, Y * 32) + Offset2D(x, y) -// (true for negative offsets too). -// - Offset3D(X * 32 + x, Y * 32 + y, Z * 8 + z) == -// Offset3D(X * 32, Y * 32, Z * 8) + Offset3D(x, y, z) -// (true for negative offsets too). -// - 2D 32x32 tiles are laid out linearly. -// FIXME(Triang3l): This is wrong for 1bpb and 2bpb. At 1bpb (32x32 is 1024 -// bytes), offset for X + 32 minus offset for X is 512, not 1024, but offset for -// X + 128 minus offset for X + 96 is 2560. Also, for XY = 0...31, the extent of -// the addresses is 2560, not 1024. At 2bpb, addressing repeats every 64x64, and -// the extent for XY = 0...31 is 3072, not 2048. -// - 3D tiled texture slices 0:3 and 4:7 are stored separately in memory, in -// non-overlapping ranges, but addressing in 4:7 is different than in 0:3. +// Notes about tiled addresses: +// - The tiled address calculation functions work for both positive and negative +// offsets, so they can be used to go both from the origin of the texture to a +// region inside it and back (as long as the coordinates are a multiple of the +// period of the tiled address function in each direction - depends on whether +// the texture is 2D or 3D, and on the number of bytes per block). This is, in +// particular, used by Direct3D 9 inside resolving to allow resolving with an +// offset in the texture, so the rectangle coordinates are relative to both +// the render target and the region (with the appropriate alignment) in the +// texture at the same time. 
+// - 2D: +// - Origins of 32x32-block tiles grow monotonically as Y/32 (in blocks) +// increases, and in each tile row, as X/32 (in blocks) increases. +// - In each 32x32 tile, the block at (0, 0) within the tile has the address +// that matches the origin of the tile itself. This is not true for the +// block (31, 31), however - its address will be somewhere within the memory +// extent of the tile. +// - 1bpb: +// - The tiled address sequence repeats every 128 blocks along X or Y. +// - 32x32 tiles have their origins 0x200-bytes-aligned, and the addresses +// of the blocks within a 32x32 tile span 0xA00 bytes. +// - Note that 32x32x1bpb is 0x400 bytes, but addresses of blocks within a +// tile span the range of 0xA00 bytes - so 32x32 tiles are stored in +// memory ranges that may overlap (even across 128x128 - with the pitch of +// 192 blocks, the tile at (96, 32)...(127, 63) spans 0x2200...0x2BFF, +// while the tile at (128, 32)...(159, 63) spans 0x2400...0x2DFF). +// - All blocks within a 32x32 tile are located in the same 4KB-aligned +// region. +// - 2bpb: +// - The approach to storage is conceptually similar to that of 1bpb, with +// some quantitative differences. +// - The tiled address sequence repeats every 64 blocks along X or Y. +// - 32x32 tiles have their origins 0x400-bytes-aligned, and the addresses +// of the blocks within a 32x32 tile span 0xC00 bytes. +// - 4bpb and larger: +// - 32x32 tiles (which themselves are 4 KB or larger in this case) are +// stored simply in a tile-row-major way, separately from each other in +// memory, with independent addressing within each tile. +// - 3D: +// - Origins of 32x32x4-block tiles grow monotonically as Z/4 increases, and +// in each 4-slice portion, as Y/32 (in blocks) increases, and in each tile +// row, as X/32 (in blocks) increases. +// - Along Z, addressing repeats every 8 slices. Along Y, addressing repeats +// every 32 blocks regardless of the number of bytes per block. 
+// - 32-block-row x 4-slice portions are stored in disjoint 4KB-aligned ranges +// in memory (thus every 4 slices are also stored in disjoint ranges). +// - Addresses within a 32x32x4-block tile span widely throughout the X pitch, +// with a lot of overlap between 32x32x4 tiles with different X. +// - 1bpb: +// - The tiled address sequence repeats every 64 blocks along X. +// - Origins of 32x32x4-block tiles within 32-block-row x 4-slice portions: +// - X = 0, 64, 128...: (X / 64) * 0x1000 +// - X = 32, 96, 160...: (X / 64) * 0x1000 + 0x400 +// - Or: ((X >> 6) << 12) | (((X >> 5) & 1) << 10) +// - Span of the addresses within a 32x32x4-block tile: +// - Pitch = 32, 96, 160...: (Pitch / 64) * 0x1000 + 0x1000 +// - Pitch = 64, 128, 192...: (Pitch / 64) * 0x1000 + 0xC00 +// - Or: ((Pitch >> 6) << 12) + 0xC00 + (((Pitch >> 5) & 1) << 10) +// - Or: ((Pitch >> 6) << 12) + 0xC00 + ((Pitch & (1 << 5)) << (10 - 5)) +// - 2bpb and larger: +// - The tiled address sequence repeats every 32 blocks along X. +// - Origins of 32x32x4-block tiles within 32-block-row x 4-slice portions: +// (X / 32) * 0x1000 * (BPB / 2) +// - Span of the addresses within a 32x32x4-block tile: +// ((Pitch / 32) * 0x1000 + 0x1000) * (BPB / 2) // - Addressing of blocks that are contiguous along X (for tiling/untiling of // larger portions at once): // - 1bpb - each 8 blocks are laid out sequentially, odd 8 blocks = diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h index e9865946c..58bb4b045 100644 --- a/src/xenia/gpu/xenos.h +++ b/src/xenia/gpu/xenos.h @@ -1049,19 +1049,12 @@ constexpr uint32_t kTextureMaxMips = std::max(kTexture2DCubeMaxWidthHeightLog2, kTexture3DMaxWidthHeightLog2) + 1; -// Tiled texture sizes are in 32x32 increments for 2D, 32x32x4 for 3D. 
-// 2DTiledOffset(X * 32 + x, Y * 32 + y) == -// 2DTiledOffset(X * 32, Y * 32) + 2DTiledOffset(x, y) -// 3DTiledOffset(X * 32 + x, Y * 32 + y, Z * 8 + z) == -// 3DTiledOffset(X * 32, Y * 32, Z * 8) + 3DTiledOffset(x, y, z) -// Both are true for negative offsets too. constexpr uint32_t kTextureTileWidthHeightLog2 = 5; constexpr uint32_t kTextureTileWidthHeight = 1 << kTextureTileWidthHeightLog2; // 3D tiled texture slices 0:3 and 4:7 are stored separately in memory, in // non-overlapping ranges, but addressing in 4:7 is different than in 0:3. -constexpr uint32_t kTextureTiledDepthGranularityLog2 = 2; -constexpr uint32_t kTextureTiledDepthGranularity = - 1 << kTextureTiledDepthGranularityLog2; +constexpr uint32_t kTextureTileDepthLog2 = 2; +constexpr uint32_t kTextureTileDepth = 1 << kTextureTileDepthLog2; constexpr uint32_t kTextureTiledZBaseGranularityLog2 = 3; constexpr uint32_t kTextureTiledZBaseGranularity = 1 << kTextureTiledZBaseGranularityLog2;