#include "amdgpu/tiler_cpu.hpp"
#include "amdgpu/tiler.hpp"
#include "gnm/gnm.hpp"

constexpr std::uint64_t
getTiledOffset1D(gnm::TextureType texType, bool isPow2Padded,
                 gnm::DataFormat dfmt, amdgpu::TileMode tileMode, int mipLevel,
                 int arraySlice, int numFragments, int width, int height,
                 int depth, int pitch, int x, int y, int z) {

  using namespace amdgpu;
  bool isCubemap = texType == gnm::TextureType::Cube;
  bool isVolume = texType == gnm::TextureType::Dim3D;

  auto bitsPerFragment = getBitsPerElement(dfmt);
  uint32_t arraySliceCount = depth;

  if (isCubemap) {
    arraySliceCount *= 6;
  } else if (isVolume) {
    arraySliceCount = 1;
  }

  auto numFragmentsPerPixel = 1 << numFragments;
  auto isBlockCompressed = getTexelsPerElement(dfmt) > 1;
  auto arrayMode = tileMode.arrayMode();

  auto bitsPerElement = bitsPerFragment;
  auto paddedWidth = std::max((mipLevel != 0 ? pitch : width) >> mipLevel, 1);
  auto paddedHeight = std::max(height >> mipLevel, 1);

  auto tileThickness = (arrayMode == amdgpu::kArrayMode1dTiledThick) ? 4 : 1;

  if (isBlockCompressed) {
    switch (bitsPerFragment) {
    case 1:
      bitsPerElement *= 8;
      paddedWidth = std::max((paddedWidth + 7) / 8, 1);
      break;
    case 4:
    case 8:
      bitsPerElement *= 16;
      paddedWidth = std::max((paddedWidth + 3) / 4, 1);
      paddedHeight = std::max((paddedHeight + 3) / 4, 1);
      break;
    case 16:
      std::abort();
      break;

    default:
      std::abort();
      break;
    }
  }

  if (isPow2Padded) {
    arraySliceCount = std::bit_ceil(arraySliceCount);
    paddedWidth = std::bit_ceil(unsigned(paddedWidth));
    paddedHeight = std::bit_ceil(unsigned(paddedHeight));
  }

  uint64_t finalSurfaceOffset = 0;
  uint64_t finalSurfaceSize = 0;

  auto thickness = getMicroTileThickness(arrayMode);

  for (int i = 0; i <= mipLevel; i++) {
    finalSurfaceOffset += arraySliceCount * finalSurfaceSize;

    std::uint32_t elemWidth =
        std::max<std::uint64_t>((i > 0 ? pitch : width) >> i, 1);
    std::uint32_t elemHeight = std::max<std::uint64_t>(height >> i, 1);
    std::uint32_t elemDepth =
        std::max<std::uint64_t>((isVolume ? depth : 1) >> i, 1);

    if (isBlockCompressed) {
      switch (bitsPerFragment) {
      case 1:
        elemWidth = std::max<std::uint64_t>((elemWidth + 7) / 8, 1);
        break;
      case 4:
      case 8:
        elemWidth = std::max<std::uint64_t>((elemWidth + 3) / 4, 1);
        elemHeight = std::max<std::uint64_t>((elemHeight + 3) / 4, 1);
        break;
      case 16:
        std::abort();
        break;

      default:
        std::abort();
        break;
      }
    }

    if (isPow2Padded) {
      elemWidth = std::bit_ceil(elemWidth);
      elemHeight = std::bit_ceil(elemHeight);
      elemDepth = std::bit_ceil(elemDepth);
    }

    elemWidth = (elemWidth + kMicroTileWidth - 1) & ~(kMicroTileWidth - 1);
    elemHeight = (elemHeight + kMicroTileHeight - 1) & ~(kMicroTileHeight - 1);
    elemDepth = (elemDepth + thickness - 1) & ~(thickness - 1);

    std::uint32_t tempPitch = elemWidth;
    std::uint64_t logicalSliceSizeBytes = std::uint64_t(tempPitch) *
                                          elemHeight * bitsPerElement *
                                          numFragmentsPerPixel;
    logicalSliceSizeBytes = (logicalSliceSizeBytes + 7) / 8;

    uint64_t physicalSliceSizeBytes = logicalSliceSizeBytes * thickness;
    while ((physicalSliceSizeBytes % kPipeInterleaveBytes) != 0) {
      tempPitch += 8;
      logicalSliceSizeBytes = std::uint64_t(tempPitch) * elemHeight *
                              bitsPerElement * numFragmentsPerPixel;
      logicalSliceSizeBytes = (logicalSliceSizeBytes + 7) / 8;
      physicalSliceSizeBytes = logicalSliceSizeBytes * thickness;
    }

    finalSurfaceSize = logicalSliceSizeBytes * elemDepth;
  }

  finalSurfaceOffset += finalSurfaceSize * (uint64_t)arraySlice;

  auto tileBytes =
      (kMicroTileWidth * kMicroTileHeight * tileThickness * bitsPerElement +
       7) /
      8;
  auto tilesPerRow = paddedWidth / kMicroTileWidth;
  auto tilesPerSlice =
      std::max(tilesPerRow * (paddedHeight / kMicroTileHeight), 1U);

  uint64_t elementIndex = getElementIndex(x, y, z, bitsPerElement,
                                          tileMode.microTileMode(), arrayMode);

  uint64_t sliceOffset = (z / tileThickness) * tilesPerSlice * tileBytes;

  uint64_t tileRowIndex = y / kMicroTileHeight;
  uint64_t tileColumnIndex = x / kMicroTileWidth;
  uint64_t tileOffset =
      (tileRowIndex * tilesPerRow + tileColumnIndex) * tileBytes;

  uint64_t elementOffset = elementIndex * bitsPerElement;
  uint64_t finalOffset = (sliceOffset + tileOffset) * 8 + elementOffset;

  return finalOffset + finalSurfaceOffset * 8;
}

constexpr std::uint64_t getTiledOffsetLinear(gnm::DataFormat dfmt, int height,
                                             int pitch, int x, int y, int z) {
  auto bitsPerFragment = getBitsPerElement(dfmt);

  auto bitsPerElement = bitsPerFragment;
  auto paddedHeight = height;
  auto paddedWidth = pitch;

  if (bitsPerFragment == 1) {
    bitsPerElement *= 8;
    paddedWidth = std::max((paddedWidth + 7) / 8, 1);
  }

  uint64_t tiledRowSizeBits = bitsPerElement * paddedWidth;
  uint64_t tiledSliceBits = paddedWidth * paddedHeight * bitsPerElement;
  return tiledSliceBits * z + tiledRowSizeBits * y + bitsPerElement * x;
}

constexpr std::uint64_t
getTiledOffset2D(gnm::TextureType texType, bool isPow2Padded,
                 gnm::DataFormat dfmt, amdgpu::TileMode tileMode,
                 amdgpu::MacroTileMode macroTileMode, int mipLevel,
                 int arraySlice, int numFragments, int width, int height,
                 int depth, int pitch, int x, int y, int z, int fragmentIndex) {
  using namespace amdgpu;

  bool isCubemap = texType == gnm::TextureType::Cube;
  bool isVolume = texType == gnm::TextureType::Dim3D;
  auto m_bitsPerFragment = getBitsPerElement(dfmt);

  auto m_isBlockCompressed = getTexelsPerElement(dfmt) > 1;
  auto tileSwizzleMask = 0;
  auto numFragmentsPerPixel = 1 << numFragments;
  auto arrayMode = tileMode.arrayMode();

  auto tileThickness = 1;

  switch (arrayMode) {
  case amdgpu::kArrayMode2dTiledThin:
  case amdgpu::kArrayMode3dTiledThin:
  case amdgpu::kArrayModeTiledThinPrt:
  case amdgpu::kArrayMode2dTiledThinPrt:
  case amdgpu::kArrayMode3dTiledThinPrt:
    tileThickness = 1;
    break;
  case amdgpu::kArrayMode1dTiledThick:
  case amdgpu::kArrayMode2dTiledThick:
  case amdgpu::kArrayMode3dTiledThick:
  case amdgpu::kArrayModeTiledThickPrt:
  case amdgpu::kArrayMode2dTiledThickPrt:
  case amdgpu::kArrayMode3dTiledThickPrt:
    tileThickness = 4;
    break;
  case amdgpu::kArrayMode2dTiledXThick:
  case amdgpu::kArrayMode3dTiledXThick:
    tileThickness = 8;
    break;
  default:
    break;
  }

  auto bitsPerElement = m_bitsPerFragment;
  auto paddedWidth = pitch;
  auto paddedHeight = height;

  if (m_isBlockCompressed) {
    switch (m_bitsPerFragment) {
    case 1:
      bitsPerElement *= 8;
      paddedWidth = std::max((paddedWidth + 7) / 8, 1);
      break;
    case 4:
    case 8:
      bitsPerElement *= 16;
      paddedWidth = std::max((paddedWidth + 3) / 4, 1);
      paddedHeight = std::max((paddedHeight + 3) / 4, 1);
      break;
    case 16:
      std::abort();
      break;
    default:
      std::abort();
      break;
    }
  }

  auto bankWidthHW = macroTileMode.bankWidth();
  auto bankHeightHW = macroTileMode.bankHeight();
  auto macroAspectHW = macroTileMode.macroTileAspect();
  auto numBanksHW = macroTileMode.numBanks();

  auto bankWidth = 1 << bankWidthHW;
  auto bankHeight = 1 << bankHeightHW;
  unsigned numBanks = 2 << numBanksHW;
  auto macroTileAspect = 1 << macroAspectHW;

  uint32_t tileBytes1x =
      (tileThickness * bitsPerElement * kMicroTileWidth * kMicroTileHeight +
       7) /
      8;

  auto sampleSplitHw = tileMode.sampleSplit();
  auto tileSplitHw = tileMode.tileSplit();
  uint32_t sampleSplit = 1 << sampleSplitHw;
  uint32_t tileSplitC =
      (tileMode.microTileMode() == amdgpu::kMicroTileModeDepth)
          ? (64 << tileSplitHw)
          : std::max(256U, tileBytes1x * sampleSplit);

  auto tileSplitBytes = std::min(kDramRowSize, tileSplitC);

  auto numPipes = getPipeCount(tileMode.pipeConfig());
  auto pipeInterleaveBits = std::countr_zero(kPipeInterleaveBytes);
  auto pipeInterleaveMask = (1 << pipeInterleaveBits) - 1;
  auto pipeBits = std::countr_zero(numPipes);
  auto bankBits = std::countr_zero(numBanks);
  // auto pipeMask = (numPipes - 1) << pipeInterleaveBits;
  auto bankSwizzleMask = tileSwizzleMask;
  auto pipeSwizzleMask = 0;
  auto macroTileWidth =
      (kMicroTileWidth * bankWidth * numPipes) * macroTileAspect;
  auto macroTileHeight =
      (kMicroTileHeight * bankHeight * numBanks) / macroTileAspect;

  auto microTileMode = tileMode.microTileMode();

  uint64_t elementIndex =
      getElementIndex(x, y, z, bitsPerElement, microTileMode, arrayMode);

  uint32_t xh = x, yh = y;
  if (arrayMode == amdgpu::kArrayModeTiledThinPrt ||
      arrayMode == amdgpu::kArrayModeTiledThickPrt) {
    xh %= macroTileWidth;
    yh %= macroTileHeight;
  }
  uint64_t pipe = getPipeIndex(xh, yh, tileMode.pipeConfig());
  uint64_t bank =
      getBankIndex(xh, yh, bankWidth, bankHeight, numBanks, numPipes);

  uint32_t tileBytes = (kMicroTileWidth * kMicroTileHeight * tileThickness *
                            bitsPerElement * numFragmentsPerPixel +
                        7) /
                       8;

  uint64_t elementOffset = 0;
  if (microTileMode == amdgpu::kMicroTileModeDepth) {
    uint64_t pixelOffset = elementIndex * bitsPerElement * numFragmentsPerPixel;
    elementOffset = pixelOffset + (fragmentIndex * bitsPerElement);
  } else {
    uint64_t fragmentOffset =
        fragmentIndex * (tileBytes / numFragmentsPerPixel) * 8;
    elementOffset = fragmentOffset + (elementIndex * bitsPerElement);
  }

  uint64_t slicesPerTile = 1;
  uint64_t tileSplitSlice = 0;
  if (tileBytes > tileSplitBytes && tileThickness == 1) {
    slicesPerTile = tileBytes / tileSplitBytes;
    tileSplitSlice = elementOffset / (tileSplitBytes * 8);
    elementOffset %= (tileSplitBytes * 8);
    tileBytes = tileSplitBytes;
  }

  uint64_t macroTileBytes = (macroTileWidth / kMicroTileWidth) *
                            (macroTileHeight / kMicroTileHeight) * tileBytes /
                            (numPipes * numBanks);
  uint64_t macroTilesPerRow = paddedWidth / macroTileWidth;
  uint64_t macroTileRowIndex = y / macroTileHeight;
  uint64_t macroTileColumnIndex = x / macroTileWidth;
  uint64_t macroTileIndex =
      (macroTileRowIndex * macroTilesPerRow) + macroTileColumnIndex;
  uint64_t macro_tile_offset = macroTileIndex * macroTileBytes;
  uint64_t macroTilesPerSlice =
      macroTilesPerRow * (paddedHeight / macroTileHeight);
  uint64_t sliceBytes = macroTilesPerSlice * macroTileBytes;

  uint32_t slice = z;
  uint64_t sliceOffset =
      (tileSplitSlice + slicesPerTile * slice / tileThickness) * sliceBytes;
  if (arraySlice != 0) {
    slice = arraySlice;
  }

  uint64_t tileRowIndex = (y / kMicroTileHeight) % bankHeight;
  uint64_t tileColumnIndex = ((x / kMicroTileWidth) / numPipes) % bankWidth;
  uint64_t tileIndex = (tileRowIndex * bankWidth) + tileColumnIndex;
  uint64_t tileOffset = tileIndex * tileBytes;

  uint64_t bankSwizzle = bankSwizzleMask;
  uint64_t pipeSwizzle = pipeSwizzleMask;

  uint64_t pipeSliceRotation = 0;
  switch (arrayMode) {
  case amdgpu::kArrayMode3dTiledThin:
  case amdgpu::kArrayMode3dTiledThick:
  case amdgpu::kArrayMode3dTiledXThick:
    pipeSliceRotation =
        std::max(1UL, (numPipes / 2UL) - 1UL) * (slice / tileThickness);
    break;
  default:
    break;
  }
  pipeSwizzle += pipeSliceRotation;
  pipeSwizzle &= (numPipes - 1);
  pipe = pipe ^ pipeSwizzle;

  uint32_t sliceRotation = 0;
  switch (arrayMode) {
  case amdgpu::kArrayMode2dTiledThin:
  case amdgpu::kArrayMode2dTiledThick:
  case amdgpu::kArrayMode2dTiledXThick:
    sliceRotation = ((numBanks / 2) - 1) * (slice / tileThickness);
    break;
  case amdgpu::kArrayMode3dTiledThin:
  case amdgpu::kArrayMode3dTiledThick:
  case amdgpu::kArrayMode3dTiledXThick:
    sliceRotation = std::max(1UL, (numPipes / 2UL) - 1UL) *
                    (slice / tileThickness) / numPipes;
    break;
  default:
    break;
  }
  uint64_t tileSplitSliceRotation = 0;
  switch (arrayMode) {
  case amdgpu::kArrayMode2dTiledThin:
  case amdgpu::kArrayMode3dTiledThin:
  case amdgpu::kArrayMode2dTiledThinPrt:
  case amdgpu::kArrayMode3dTiledThinPrt:
    tileSplitSliceRotation = ((numBanks / 2) + 1) * tileSplitSlice;
    break;
  default:
    break;
  }
  bank ^= bankSwizzle + sliceRotation;
  bank ^= tileSplitSliceRotation;
  bank &= (numBanks - 1);

  uint64_t totalOffset =
      (sliceOffset + macro_tile_offset + tileOffset) * 8 + elementOffset;
  uint64_t bitOffset = totalOffset & 0x7;
  totalOffset /= 8;

  uint64_t pipeInterleaveOffset = totalOffset & pipeInterleaveMask;
  uint64_t offset = totalOffset >> pipeInterleaveBits;

  uint64_t finalByteOffset =
      pipeInterleaveOffset | (pipe << (pipeInterleaveBits)) |
      (bank << (pipeInterleaveBits + pipeBits)) |
      (offset << (pipeInterleaveBits + pipeBits + bankBits));
  return (finalByteOffset << 3) | bitOffset;
}

std::uint64_t amdgpu::getTiledOffset(gnm::TextureType texType,
                                     bool isPow2Padded, int numFragments,
                                     gnm::DataFormat dfmt,
                                     amdgpu::TileMode tileMode,
                                     amdgpu::MacroTileMode macroTileMode,
                                     int mipLevel, int arraySlice, int width,
                                     int height, int depth, int pitch, int x,
                                     int y, int z, int fragmentIndex) {
  switch (tileMode.arrayMode()) {
  case amdgpu::kArrayModeLinearGeneral:
  case amdgpu::kArrayModeLinearAligned:
    return getTiledOffsetLinear(dfmt, height, pitch, x, y, z);

  case amdgpu::kArrayMode1dTiledThin:
  case amdgpu::kArrayMode1dTiledThick: {
    return getTiledOffset1D(texType, isPow2Padded, dfmt, tileMode, mipLevel,
                            arraySlice, numFragments, width, height, depth,
                            pitch, x, y, z);
  }

  case amdgpu::kArrayMode2dTiledThin:
  case amdgpu::kArrayMode2dTiledThick:
  case amdgpu::kArrayMode2dTiledXThick:
  case amdgpu::kArrayMode3dTiledThin:
  case amdgpu::kArrayMode3dTiledThick:
  case amdgpu::kArrayMode3dTiledXThick:
  case amdgpu::kArrayModeTiledThinPrt:
  case amdgpu::kArrayModeTiledThickPrt:
  case amdgpu::kArrayMode2dTiledThinPrt:
  case amdgpu::kArrayMode2dTiledThickPrt:
  case amdgpu::kArrayMode3dTiledThinPrt:
  case amdgpu::kArrayMode3dTiledThickPrt:
    return getTiledOffset2D(texType, isPow2Padded, dfmt, tileMode,
                            macroTileMode, mipLevel, arraySlice, numFragments,
                            width, height, depth, pitch, x, y, z,
                            fragmentIndex);
  }

  std::abort();
}