/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2014 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#include "xenia/gpu/gl4/gl4_command_processor.h"
#include <algorithm>
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/profiling.h"
#include "xenia/gpu/gl4/gl4_gpu_flags.h"
#include "xenia/gpu/gl4/gl4_graphics_system.h"
#include "xenia/gpu/gpu_flags.h"
#include "xenia/gpu/sampler_info.h"
#include "xenia/gpu/texture_info.h"
#include "xenia/gpu/xenos.h"
#include "third_party/xxhash/xxhash.h"
DEFINE_bool(draw_all_framebuffers, false,
"Copy all render targets to screen on swap");
namespace xe {
namespace gpu {
namespace gl4 {
using namespace xe::gpu::xenos;
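// Sentinel render target handle: treated as a wildcard that matches any
// target when looking up cached framebuffers.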
const GLuint kAnyTarget = UINT_MAX;
// All uncached vertex/index data goes here. If it fills up we need to sync
// with the GPU, so this should be large enough to prevent that in a normal
// frame.
const size_t kScratchBufferCapacity = 256 * 1024 * 1024;
const size_t kScratchBufferAlignment = 256;
GL4CommandProcessor::CachedPipeline::CachedPipeline()
: vertex_program(0), fragment_program(0), handles({0}) {}
GL4CommandProcessor::CachedPipeline::~CachedPipeline() {
glDeleteProgramPipelines(1, &handles.default_pipeline);
glDeleteProgramPipelines(1, &handles.point_list_pipeline);
glDeleteProgramPipelines(1, &handles.rect_list_pipeline);
glDeleteProgramPipelines(1, &handles.quad_list_pipeline);
glDeleteProgramPipelines(1, &handles.line_quad_list_pipeline);
}
GL4CommandProcessor::GL4CommandProcessor(GL4GraphicsSystem* graphics_system,
kernel::KernelState* kernel_state)
: CommandProcessor(graphics_system, kernel_state),
shader_translator_(GlslShaderTranslator::Dialect::kGL45),
draw_batcher_(graphics_system_->register_file()),
scratch_buffer_(kScratchBufferCapacity, kScratchBufferAlignment),
shader_cache_(&shader_translator_) {}
GL4CommandProcessor::~GL4CommandProcessor() = default;
void GL4CommandProcessor::ClearCaches() {
texture_cache()->Clear();
for (auto& cached_framebuffer : cached_framebuffers_) {
glDeleteFramebuffers(1, &cached_framebuffer.framebuffer);
}
cached_framebuffers_.clear();
for (auto& cached_color_render_target : cached_color_render_targets_) {
glDeleteTextures(1, &cached_color_render_target.texture);
}
cached_color_render_targets_.clear();
for (auto& cached_depth_render_target : cached_depth_render_targets_) {
glDeleteTextures(1, &cached_depth_render_target.texture);
}
cached_depth_render_targets_.clear();
CommandProcessor::ClearCaches();
}
bool GL4CommandProcessor::SetupContext() {
if (!CommandProcessor::SetupContext()) {
XELOGE("Unable to initialize base command processor context");
return false;
}
// Circular buffer holding scratch vertex/index data.
if (!scratch_buffer_.Initialize()) {
XELOGE("Unable to initialize scratch buffer");
return false;
}
// Command buffer.
if (!draw_batcher_.Initialize(&scratch_buffer_)) {
XELOGE("Unable to initialize command buffer");
return false;
}
// Texture cache that keeps track of any textures/samplers used.
if (!texture_cache_.Initialize(memory_, &scratch_buffer_)) {
XELOGE("Unable to initialize texture cache");
return false;
}
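// Shared preamble for the geometry shaders used to emulate Xenos-only
// primitive types (point sprites, rect lists, quad lists).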
const std::string geometry_header =
"#version 450\n"
"#extension all : warn\n"
"#extension GL_ARB_explicit_uniform_location : require\n"
"#extension GL_ARB_shading_language_420pack : require\n"
"in gl_PerVertex {\n"
" vec4 gl_Position;\n"
" float gl_PointSize;\n"
" float gl_ClipDistance[];\n"
"} gl_in[];\n"
"out gl_PerVertex {\n"
" vec4 gl_Position;\n"
" float gl_PointSize;\n"
" float gl_ClipDistance[];\n"
"};\n"
"struct VertexData {\n"
" vec4 o[16];\n"
"};\n"
"\n"
"layout(location = 1) in VertexData in_vtx[];\n"
"layout(location = 1) out VertexData out_vtx;\n";
// TODO(benvanik): fetch default point size from register and use that if
// the VS doesn't write oPointSize.
// TODO(benvanik): clamp to min/max.
// TODO(benvanik): figure out how to see which interpolator gets adjusted.
std::string point_list_shader =
geometry_header +
"layout(points) in;\n"
"layout(triangle_strip, max_vertices = 4) out;\n"
"void main() {\n"
" const vec2 offsets[4] = {\n"
" vec2(-1.0, 1.0),\n"
" vec2( 1.0, 1.0),\n"
" vec2(-1.0, -1.0),\n"
" vec2( 1.0, -1.0),\n"
" };\n"
" vec4 pos = gl_in[0].gl_Position;\n"
" float psize = gl_in[0].gl_PointSize;\n"
" for (int i = 0; i < 4; ++i) {\n"
" gl_Position = vec4(pos.xy + offsets[i] * psize, pos.zw);\n"
" out_vtx = in_vtx[0];\n"
" EmitVertex();\n"
" }\n"
" EndPrimitive();\n"
"}\n";
std::string rect_list_shader =
geometry_header +
"layout(triangles) in;\n"
"layout(triangle_strip, max_vertices = 6) out;\n"
"void main() {\n"
// Most games use the left-aligned form.
" bool left_aligned = gl_in[0].gl_Position.x == \n"
" gl_in[2].gl_Position.x;\n"
" if (left_aligned) {\n"
// 0 ------ 1
// | - |
// | // |
// | - |
// 2 ----- [3]
" gl_Position = gl_in[0].gl_Position;\n"
" gl_PointSize = gl_in[0].gl_PointSize;\n"
" out_vtx = in_vtx[0];\n"
" EmitVertex();\n"
" gl_Position = gl_in[1].gl_Position;\n"
" gl_PointSize = gl_in[1].gl_PointSize;\n"
" out_vtx = in_vtx[1];\n"
" EmitVertex();\n"
" gl_Position = gl_in[2].gl_Position;\n"
" gl_PointSize = gl_in[2].gl_PointSize;\n"
" out_vtx = in_vtx[2];\n"
" EmitVertex();\n"
" EndPrimitive();\n"
" gl_Position = gl_in[2].gl_Position;\n"
" gl_PointSize = gl_in[2].gl_PointSize;\n"
" out_vtx = in_vtx[2];\n"
" EmitVertex();\n"
" gl_Position = gl_in[1].gl_Position;\n"
" gl_PointSize = gl_in[1].gl_PointSize;\n"
" out_vtx = in_vtx[1];\n"
" EmitVertex();\n"
" gl_Position = \n"
" (gl_in[1].gl_Position + gl_in[2].gl_Position) - \n"
" gl_in[0].gl_Position;\n"
" gl_PointSize = gl_in[2].gl_PointSize;\n"
" for (int i = 0; i < 16; ++i) {\n"
" out_vtx.o[i] = -in_vtx[0].o[i] + in_vtx[1].o[i] + \n"
" in_vtx[2].o[i];\n"
" }\n"
" EmitVertex();\n"
" EndPrimitive();\n"
" } else {\n"
// 0 ------ 1
// | - |
// | \\ |
// | - |
// [3] ----- 2
" gl_Position = gl_in[0].gl_Position;\n"
" gl_PointSize = gl_in[0].gl_PointSize;\n"
" out_vtx = in_vtx[0];\n"
" EmitVertex();\n"
" gl_Position = gl_in[1].gl_Position;\n"
" gl_PointSize = gl_in[1].gl_PointSize;\n"
" out_vtx = in_vtx[1];\n"
" EmitVertex();\n"
" gl_Position = gl_in[2].gl_Position;\n"
" gl_PointSize = gl_in[2].gl_PointSize;\n"
" out_vtx = in_vtx[2];\n"
" EmitVertex();\n"
" EndPrimitive();\n"
" gl_Position = gl_in[0].gl_Position;\n"
" gl_PointSize = gl_in[0].gl_PointSize;\n"
" out_vtx = in_vtx[0];\n"
" EmitVertex();\n"
" gl_Position = gl_in[2].gl_Position;\n"
" gl_PointSize = gl_in[2].gl_PointSize;\n"
" out_vtx = in_vtx[2];\n"
" EmitVertex();\n"
" gl_Position = (gl_in[0].gl_Position + gl_in[2].gl_Position) - \n"
" gl_in[1].gl_Position;\n"
" gl_PointSize = gl_in[2].gl_PointSize;\n"
" for (int i = 0; i < 16; ++i) {\n"
" out_vtx.o[i] = in_vtx[0].o[i] + -in_vtx[1].o[i] + \n"
" in_vtx[2].o[i];\n"
" }\n"
" EmitVertex();\n"
" EndPrimitive();\n"
" }\n"
"}\n";
std::string quad_list_shader =
geometry_header +
"layout(lines_adjacency) in;\n"
"layout(triangle_strip, max_vertices = 4) out;\n"
"void main() {\n"
" const int order[4] = { 0, 1, 3, 2 };\n"
" for (int i = 0; i < 4; ++i) {\n"
" int input_index = order[i];\n"
" gl_Position = gl_in[input_index].gl_Position;\n"
" gl_PointSize = gl_in[input_index].gl_PointSize;\n"
" out_vtx = in_vtx[input_index];\n"
" EmitVertex();\n"
" }\n"
" EndPrimitive();\n"
"}\n";
std::string line_quad_list_shader =
geometry_header +
"layout(lines_adjacency) in;\n"
"layout(line_strip, max_vertices = 5) out;\n"
"void main() {\n"
" gl_Position = gl_in[0].gl_Position;\n"
" gl_PointSize = gl_in[0].gl_PointSize;\n"
" out_vtx = in_vtx[0];\n"
" EmitVertex();\n"
" gl_Position = gl_in[1].gl_Position;\n"
" gl_PointSize = gl_in[1].gl_PointSize;\n"
" out_vtx = in_vtx[1];\n"
" EmitVertex();\n"
" gl_Position = gl_in[2].gl_Position;\n"
" gl_PointSize = gl_in[2].gl_PointSize;\n"
" out_vtx = in_vtx[2];\n"
" EmitVertex();\n"
" gl_Position = gl_in[3].gl_Position;\n"
" gl_PointSize = gl_in[3].gl_PointSize;\n"
" out_vtx = in_vtx[3];\n"
" EmitVertex();\n"
" gl_Position = gl_in[0].gl_Position;\n"
" gl_PointSize = gl_in[0].gl_PointSize;\n"
" out_vtx = in_vtx[0];\n"
" EmitVertex();\n"
" EndPrimitive();\n"
"}\n";
point_list_geometry_program_ = CreateGeometryProgram(point_list_shader);
rect_list_geometry_program_ = CreateGeometryProgram(rect_list_shader);
quad_list_geometry_program_ = CreateGeometryProgram(quad_list_shader);
line_quad_list_geometry_program_ =
CreateGeometryProgram(line_quad_list_shader);
if (!point_list_geometry_program_ || !rect_list_geometry_program_ ||
!quad_list_geometry_program_ || !line_quad_list_geometry_program_) {
return false;
}
glEnable(GL_SCISSOR_TEST);
glClipControl(GL_UPPER_LEFT, GL_ZERO_TO_ONE);
glPointParameteri(GL_POINT_SPRITE_COORD_ORIGIN, GL_UPPER_LEFT);
return true;
}
GLuint GL4CommandProcessor::CreateGeometryProgram(const std::string& source) {
auto source_str = source.c_str();
GLuint program = glCreateShaderProgramv(GL_GEOMETRY_SHADER, 1, &source_str);
// Get error log, if we failed to link.
GLint link_status = 0;
glGetProgramiv(program, GL_LINK_STATUS, &link_status);
if (!link_status) {
GLint log_length = 0;
glGetProgramiv(program, GL_INFO_LOG_LENGTH, &log_length);
std::string info_log;
// GL_INFO_LOG_LENGTH includes the terminating null; guard against empty logs.
info_log.resize(log_length > 1 ? log_length - 1 : 0);
glGetProgramInfoLog(program, log_length, &log_length,
const_cast<char*>(info_log.data()));
XELOGE("Unable to link program: %s", info_log.c_str());
glDeleteProgram(program);
return 0;
}
return program;
}
void GL4CommandProcessor::ShutdownContext() {
glDeleteProgram(point_list_geometry_program_);
glDeleteProgram(rect_list_geometry_program_);
glDeleteProgram(quad_list_geometry_program_);
glDeleteProgram(line_quad_list_geometry_program_);
texture_cache_.Shutdown();
draw_batcher_.Shutdown();
scratch_buffer_.Shutdown();
all_pipelines_.clear();
shader_cache_.Reset();
CommandProcessor::ShutdownContext();
}
void GL4CommandProcessor::MakeCoherent() {
RegisterFile* regs = register_file_;
auto status_host = regs->values[XE_GPU_REG_COHER_STATUS_HOST].u32;
CommandProcessor::MakeCoherent();
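// Bit 31 of COHER_STATUS_HOST indicates a pending coherency request; the
// guest may have touched memory we have cached, so drop cached scratch
// allocations and let them be re-uploaded.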
if (status_host & 0x80000000ul) {
scratch_buffer_.ClearCache();
}
}
void GL4CommandProcessor::PrepareForWait() {
SCOPE_profile_cpu_f("gpu");
CommandProcessor::PrepareForWait();
// TODO(benvanik): fences and fancy stuff. We should figure out a way to
// make interrupt callbacks from the GPU so that we don't have to do a full
// synchronize here.
glFlush();
// glFinish();
if (FLAGS_thread_safe_gl) {
context_->ClearCurrent();
}
}
void GL4CommandProcessor::ReturnFromWait() {
if (FLAGS_thread_safe_gl) {
context_->MakeCurrent();
}
CommandProcessor::ReturnFromWait();
}
void GL4CommandProcessor::PerformSwap(uint32_t frontbuffer_ptr,
uint32_t frontbuffer_width,
uint32_t frontbuffer_height) {
// Ensure we issue any pending draws.
draw_batcher_.Flush(DrawBatcher::FlushMode::kMakeCoherent);
// One-time initialization.
// TODO(benvanik): move someplace more sane?
if (!swap_state_.front_buffer_texture) {
std::lock_guard<std::mutex> lock(swap_state_.mutex);
swap_state_.width = frontbuffer_width;
swap_state_.height = frontbuffer_height;
GLuint front_buffer_texture;
GLuint back_buffer_texture;
glCreateTextures(GL_TEXTURE_2D, 1, &front_buffer_texture);
glCreateTextures(GL_TEXTURE_2D, 1, &back_buffer_texture);
swap_state_.front_buffer_texture = front_buffer_texture;
swap_state_.back_buffer_texture = back_buffer_texture;
glTextureStorage2D(front_buffer_texture, 1, GL_RGBA8, swap_state_.width,
swap_state_.height);
glTextureStorage2D(back_buffer_texture, 1, GL_RGBA8, swap_state_.width,
swap_state_.height);
}
// Lookup the framebuffer in the recently-resolved list.
// TODO(benvanik): make this much more sophisticated.
// TODO(benvanik): handle not found cases.
// TODO(benvanik): handle dirty cases (resolved to sysmem, touched).
// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
// HACK: just use whatever our current framebuffer is.
GLuint framebuffer_texture = last_framebuffer_texture_;
if (last_framebuffer_texture_ == 0) {
framebuffer_texture =
active_framebuffer_ ? active_framebuffer_->color_targets[0] : 0;
}
// Copy the given framebuffer to the current backbuffer.
Rect2D src_rect(0, 0, frontbuffer_width ? frontbuffer_width : 1280,
frontbuffer_height ? frontbuffer_height : 720);
Rect2D dest_rect(0, 0, swap_state_.width, swap_state_.height);
if (framebuffer_texture != 0) {
reinterpret_cast<xe::ui::gl::GLContext*>(context_.get())
->blitter()
->CopyColorTexture2D(
framebuffer_texture, src_rect,
static_cast<GLuint>(swap_state_.back_buffer_texture), dest_rect,
GL_LINEAR, true);
}
if (FLAGS_draw_all_framebuffers) {
int32_t offsetx = (1280 - (1280 / 5));
int32_t offsety = 0;
int32_t doffsetx = 0;
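// Debug overlay: tile each target at 1/5 scale of a 1280x720 screen -
// color targets stacked along the right edge, depth targets along the
// bottom.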
for (int i = 0; i < cached_framebuffers_.size(); i++) {
bool has_colortargets = false;
// Copy color targets to top right corner
for (int j = 0; j < 4; j++) {
GLuint tex = cached_framebuffers_[i].color_targets[j];
if (!tex) {
continue;
}
has_colortargets = true;
dest_rect = {offsetx, offsety, 1280 / 5, 720 / 5};
reinterpret_cast<ui::gl::GLContext*>(context_.get())
->blitter()
->CopyColorTexture2D(
tex, src_rect,
static_cast<GLuint>(swap_state_.back_buffer_texture), dest_rect,
GL_LINEAR, true);
offsety += 720 / 5;
}
if (has_colortargets) {
offsetx -= 1280 / 5;
}
offsety = 0;
GLuint tex = cached_framebuffers_[i].depth_target;
if (!tex) {
continue;
}
// Copy depth targets to bottom left corner of screen
dest_rect = {doffsetx, (int32_t)swap_state_.height - (720 / 5), 1280 / 5,
720 / 5};
reinterpret_cast<ui::gl::GLContext*>(context_.get())
->blitter()
->CopyColorTexture2D(
tex, src_rect,
static_cast<GLuint>(swap_state_.back_buffer_texture), dest_rect,
GL_LINEAR, false);
doffsetx += 1280 / 5;
}
}
// Need to finish to be sure the other context sees the right data.
// TODO(benvanik): prevent this? fences?
glFinish();
if (context_->WasLost()) {
// We've lost the context due to a TDR.
// TODO: Dump the current commands to a tracefile.
assert_always();
}
// Remove any dead textures, etc.
texture_cache_.Scavenge();
}
Shader* GL4CommandProcessor::LoadShader(ShaderType shader_type,
uint32_t guest_address,
const uint32_t* host_address,
uint32_t dword_count) {
return shader_cache_.LookupOrInsertShader(shader_type, host_address,
dword_count);
}
bool GL4CommandProcessor::IssueDraw(PrimitiveType prim_type,
uint32_t index_count,
IndexBufferInfo* index_buffer_info) {
#if FINE_GRAINED_DRAW_SCOPES
SCOPE_profile_cpu_f("gpu");
#endif // FINE_GRAINED_DRAW_SCOPES
bool draw_valid;
if (index_buffer_info) {
draw_valid = draw_batcher_.BeginDrawElements(prim_type, index_count,
index_buffer_info->format);
} else {
draw_valid = draw_batcher_.BeginDrawArrays(prim_type, index_count);
}
if (!draw_valid) {
return false;
}
auto& regs = *register_file_;
auto enable_mode =
static_cast<ModeControl>(regs[XE_GPU_REG_RB_MODECONTROL].u32 & 0x7);
if (enable_mode == ModeControl::kIgnore) {
// Ignored.
draw_batcher_.DiscardDraw();
return true;
} else if (enable_mode == ModeControl::kCopy) {
// Special copy handling.
draw_batcher_.DiscardDraw();
return IssueCopy();
}
#define CHECK_ISSUE_UPDATE_STATUS(status, mismatch, error_message) \
{ \
if (status == UpdateStatus::kError) { \
XELOGE(error_message); \
draw_batcher_.DiscardDraw(); \
return false; \
} else if (status == UpdateStatus::kMismatch) { \
mismatch = true; \
} \
}
UpdateStatus status;
bool mismatch = false;
status = UpdateShaders(draw_batcher_.prim_type());
CHECK_ISSUE_UPDATE_STATUS(status, mismatch, "Unable to prepare draw shaders");
status = UpdateRenderTargets();
CHECK_ISSUE_UPDATE_STATUS(status, mismatch, "Unable to setup render targets");
if (!active_framebuffer_) {
// No framebuffer, so nothing we do will actually have an effect.
// Treat it as a no-op.
// TODO(benvanik): if we have a vs export, still allow it to go.
draw_batcher_.DiscardDraw();
return true;
}
status = UpdateState(draw_batcher_.prim_type());
CHECK_ISSUE_UPDATE_STATUS(status, mismatch, "Unable to setup render state");
status = PopulateSamplers();
CHECK_ISSUE_UPDATE_STATUS(status, mismatch,
"Unable to prepare draw samplers");
status = PopulateIndexBuffer(index_buffer_info);
CHECK_ISSUE_UPDATE_STATUS(status, mismatch, "Unable to setup index buffer");
status = PopulateVertexBuffers();
CHECK_ISSUE_UPDATE_STATUS(status, mismatch, "Unable to setup vertex buffers");
if (!draw_batcher_.CommitDraw()) {
return false;
}
// TODO(benvanik): find a way to get around glVertexArrayVertexBuffer below.
draw_batcher_.Flush(DrawBatcher::FlushMode::kMakeCoherent);
if (context_->WasLost()) {
// This draw lost us the context. This typically isn't hit.
assert_always();
return false;
}
return true;
}
bool GL4CommandProcessor::SetShadowRegister(uint32_t* dest,
uint32_t register_name) {
uint32_t value = register_file_->values[register_name].u32;
if (*dest == value) {
return false;
}
*dest = value;
return true;
}
bool GL4CommandProcessor::SetShadowRegister(float* dest,
uint32_t register_name) {
float value = register_file_->values[register_name].f32;
if (*dest == value) {
return false;
}
*dest = value;
return true;
}
GL4CommandProcessor::UpdateStatus GL4CommandProcessor::UpdateShaders(
PrimitiveType prim_type) {
auto& regs = update_shaders_regs_;
// These are the constant base addresses/ranges for shaders.
// We have these hardcoded right now because nothing seems to differ.
assert_true(register_file_->values[XE_GPU_REG_SQ_VS_CONST].u32 ==
0x000FF000 ||
register_file_->values[XE_GPU_REG_SQ_VS_CONST].u32 == 0x00000000);
assert_true(register_file_->values[XE_GPU_REG_SQ_PS_CONST].u32 ==
0x000FF100 ||
register_file_->values[XE_GPU_REG_SQ_PS_CONST].u32 == 0x00000000);
bool dirty = false;
dirty |= SetShadowRegister(&regs.pa_su_sc_mode_cntl,
XE_GPU_REG_PA_SU_SC_MODE_CNTL);
dirty |= SetShadowRegister(&regs.sq_program_cntl, XE_GPU_REG_SQ_PROGRAM_CNTL);
dirty |= SetShadowRegister(&regs.sq_context_misc, XE_GPU_REG_SQ_CONTEXT_MISC);
dirty |= regs.vertex_shader != active_vertex_shader_;
dirty |= regs.pixel_shader != active_pixel_shader_;
dirty |= regs.prim_type != prim_type;
if (!dirty) {
return UpdateStatus::kCompatible;
}
regs.vertex_shader = static_cast<GL4Shader*>(active_vertex_shader_);
regs.pixel_shader = static_cast<GL4Shader*>(active_pixel_shader_);
regs.prim_type = prim_type;
SCOPE_profile_cpu_f("gpu");
draw_batcher_.Flush(DrawBatcher::FlushMode::kStateChange);
xe_gpu_program_cntl_t program_cntl;
program_cntl.dword_0 = regs.sq_program_cntl;
// Populate a register in the pixel shader with frag coord.
int ps_param_gen = (regs.sq_context_misc >> 8) & 0xFF;
draw_batcher_.set_ps_param_gen(program_cntl.param_gen ? ps_param_gen : -1);
// Normal vertex shaders only, for now.
// TODO(benvanik): transform feedback/memexport.
// https://github.com/freedreno/freedreno/blob/master/includes/a2xx.xml.h
// 0 = normal
// 2 = point size
assert_true(program_cntl.vs_export_mode == 0 ||
program_cntl.vs_export_mode == 2);
if (!regs.vertex_shader->is_valid()) {
XELOGE("Vertex shader invalid");
return UpdateStatus::kError;
}
if (!regs.pixel_shader->is_valid()) {
XELOGE("Pixel shader invalid");
return UpdateStatus::kError;
}
GLuint vertex_program = regs.vertex_shader->program();
GLuint fragment_program = regs.pixel_shader->program();
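// Pipeline cache key: VS program handle in the high 32 bits, FS program
// handle in the low 32 bits.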
uint64_t key = (uint64_t(vertex_program) << 32) | fragment_program;
CachedPipeline* cached_pipeline = nullptr;
auto it = cached_pipelines_.find(key);
if (it == cached_pipelines_.end()) {
// Existing pipeline for these programs not found - create it.
auto new_pipeline = std::make_unique<CachedPipeline>();
new_pipeline->vertex_program = vertex_program;
new_pipeline->fragment_program = fragment_program;
new_pipeline->handles.default_pipeline = 0;
cached_pipeline = new_pipeline.get();
all_pipelines_.emplace_back(std::move(new_pipeline));
cached_pipelines_.insert({key, cached_pipeline});
} else {
// Found a pipeline container - it may or may not have what we want.
cached_pipeline = it->second;
}
if (!cached_pipeline->handles.default_pipeline) {
// Perhaps it's a bit wasteful to do all of these, but oh well.
GLuint pipelines[5];
glCreateProgramPipelines(GLsizei(xe::countof(pipelines)), pipelines);
glUseProgramStages(pipelines[0], GL_VERTEX_SHADER_BIT, vertex_program);
glUseProgramStages(pipelines[0], GL_FRAGMENT_SHADER_BIT, fragment_program);
cached_pipeline->handles.default_pipeline = pipelines[0];
glUseProgramStages(pipelines[1], GL_VERTEX_SHADER_BIT, vertex_program);
glUseProgramStages(pipelines[1], GL_GEOMETRY_SHADER_BIT,
point_list_geometry_program_);
glUseProgramStages(pipelines[1], GL_FRAGMENT_SHADER_BIT, fragment_program);
cached_pipeline->handles.point_list_pipeline = pipelines[1];
glUseProgramStages(pipelines[2], GL_VERTEX_SHADER_BIT, vertex_program);
glUseProgramStages(pipelines[2], GL_GEOMETRY_SHADER_BIT,
rect_list_geometry_program_);
glUseProgramStages(pipelines[2], GL_FRAGMENT_SHADER_BIT, fragment_program);
cached_pipeline->handles.rect_list_pipeline = pipelines[2];
glUseProgramStages(pipelines[3], GL_VERTEX_SHADER_BIT, vertex_program);
glUseProgramStages(pipelines[3], GL_GEOMETRY_SHADER_BIT,
quad_list_geometry_program_);
glUseProgramStages(pipelines[3], GL_FRAGMENT_SHADER_BIT, fragment_program);
cached_pipeline->handles.quad_list_pipeline = pipelines[3];
glUseProgramStages(pipelines[4], GL_VERTEX_SHADER_BIT, vertex_program);
glUseProgramStages(pipelines[4], GL_GEOMETRY_SHADER_BIT,
line_quad_list_geometry_program_);
glUseProgramStages(pipelines[4], GL_FRAGMENT_SHADER_BIT, fragment_program);
cached_pipeline->handles.line_quad_list_pipeline = pipelines[4];
// This can be set once, as the buffer never changes.
glVertexArrayElementBuffer(regs.vertex_shader->vao(),
scratch_buffer_.handle());
}
bool line_mode = false;
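// If the polygon mode override draws front faces as lines, quad lists must
// be expanded with the line-strip geometry shader instead.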
if (((regs.pa_su_sc_mode_cntl >> 3) & 0x3) != 0) {
uint32_t front_poly_mode = (regs.pa_su_sc_mode_cntl >> 5) & 0x7;
if (front_poly_mode == 1) {
line_mode = true;
}
}
GLuint pipeline;
switch (regs.prim_type) {
default:
// Default pipeline used.
pipeline = cached_pipeline->handles.default_pipeline;
break;
case PrimitiveType::kPointList:
pipeline = cached_pipeline->handles.point_list_pipeline;
break;
case PrimitiveType::kRectangleList:
pipeline = cached_pipeline->handles.rect_list_pipeline;
break;
case PrimitiveType::kQuadList: {
if (line_mode) {
pipeline = cached_pipeline->handles.line_quad_list_pipeline;
} else {
pipeline = cached_pipeline->handles.quad_list_pipeline;
}
break;
}
}
draw_batcher_.ReconfigurePipeline(regs.vertex_shader, regs.pixel_shader,
pipeline);
glBindProgramPipeline(pipeline);
glBindVertexArray(regs.vertex_shader->vao());
return UpdateStatus::kMismatch;
}
GL4CommandProcessor::UpdateStatus GL4CommandProcessor::UpdateRenderTargets() {
auto& regs = update_render_targets_regs_;
bool dirty = false;
dirty |= SetShadowRegister(&regs.rb_modecontrol, XE_GPU_REG_RB_MODECONTROL);
dirty |= SetShadowRegister(&regs.rb_surface_info, XE_GPU_REG_RB_SURFACE_INFO);
dirty |= SetShadowRegister(&regs.rb_color_info, XE_GPU_REG_RB_COLOR_INFO);
dirty |= SetShadowRegister(&regs.rb_color1_info, XE_GPU_REG_RB_COLOR1_INFO);
dirty |= SetShadowRegister(&regs.rb_color2_info, XE_GPU_REG_RB_COLOR2_INFO);
dirty |= SetShadowRegister(&regs.rb_color3_info, XE_GPU_REG_RB_COLOR3_INFO);
dirty |= SetShadowRegister(&regs.rb_color_mask, XE_GPU_REG_RB_COLOR_MASK);
dirty |= SetShadowRegister(&regs.rb_depthcontrol, XE_GPU_REG_RB_DEPTHCONTROL);
dirty |=
SetShadowRegister(&regs.rb_stencilrefmask, XE_GPU_REG_RB_STENCILREFMASK);
dirty |= SetShadowRegister(&regs.rb_depth_info, XE_GPU_REG_RB_DEPTH_INFO);
if (!dirty) {
return UpdateStatus::kCompatible;
}
SCOPE_profile_cpu_f("gpu");
draw_batcher_.Flush(DrawBatcher::FlushMode::kStateChange);
auto enable_mode = static_cast<ModeControl>(regs.rb_modecontrol & 0x7);
// RB_SURFACE_INFO
// http://fossies.org/dox/MesaLib-10.3.5/fd2__gmem_8c_source.html
uint32_t surface_pitch = regs.rb_surface_info & 0x3FFF;
auto surface_msaa =
static_cast<MsaaSamples>((regs.rb_surface_info >> 16) & 0x3);
// Get/create all color render targets, if we are using them.
// In depth-only mode we don't need them.
// Note that write mask may be more permissive than we want, so we mix that
// with the actual targets the pixel shader writes to.
GLenum draw_buffers[4] = {GL_NONE, GL_NONE, GL_NONE, GL_NONE};
GLuint color_targets[4] = {kAnyTarget, kAnyTarget, kAnyTarget, kAnyTarget};
if (enable_mode == ModeControl::kColorDepth) {
uint32_t color_info[4] = {
regs.rb_color_info, regs.rb_color1_info, regs.rb_color2_info,
regs.rb_color3_info,
};
// A2XX_RB_COLOR_MASK_WRITE_* == D3DRS_COLORWRITEENABLE
for (int n = 0; n < xe::countof(color_info); n++) {
uint32_t write_mask = (regs.rb_color_mask >> (n * 4)) & 0xF;
if (!write_mask || !active_pixel_shader_->writes_color_target(n)) {
// Unused, so keep disabled and set to wildcard so we'll take any
// framebuffer that has it.
continue;
}
uint32_t color_base = color_info[n] & 0xFFF;
auto color_format =
static_cast<ColorRenderTargetFormat>((color_info[n] >> 16) & 0xF);
color_targets[n] = GetColorRenderTarget(surface_pitch, surface_msaa,
color_base, color_format);
draw_buffers[n] = GL_COLOR_ATTACHMENT0 + n;
glColorMaski(n, !!(write_mask & 0x1), !!(write_mask & 0x2),
!!(write_mask & 0x4), !!(write_mask & 0x8));
}
}
// Get/create depth buffer, but only if we are going to use it.
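// RB_DEPTHCONTROL: bit 0 = stencil enable, bit 1 = Z enable, bit 2 = Z
// write enable (the same A2XX flags tested in UpdateDepthStencilState).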
bool uses_depth = (regs.rb_depthcontrol & 0x00000002) ||
(regs.rb_depthcontrol & 0x00000004);
uint32_t stencil_write_mask = (regs.rb_stencilrefmask & 0x00FF0000) >> 16;
bool uses_stencil =
(regs.rb_depthcontrol & 0x00000001) || (stencil_write_mask != 0);
GLuint depth_target = kAnyTarget;
if (uses_depth || uses_stencil) {
uint32_t depth_base = regs.rb_depth_info & 0xFFF;
auto depth_format =
static_cast<DepthRenderTargetFormat>((regs.rb_depth_info >> 16) & 0x1);
depth_target = GetDepthRenderTarget(surface_pitch, surface_msaa, depth_base,
depth_format);
// TODO(benvanik): when a game switches render targets, does it expect to
// keep the same depth buffer contents?
}
// Get/create a framebuffer with the required targets.
// Note that none may be returned if we really don't need one.
auto cached_framebuffer = GetFramebuffer(color_targets, depth_target);
active_framebuffer_ = cached_framebuffer;
if (active_framebuffer_) {
// Setup just the targets we want.
glNamedFramebufferDrawBuffers(cached_framebuffer->framebuffer, 4,
draw_buffers);
// Make active.
// TODO(benvanik): can we do this all named?
// TODO(benvanik): do we want this on READ too?
glBindFramebuffer(GL_DRAW_FRAMEBUFFER, cached_framebuffer->framebuffer);
}
return UpdateStatus::kMismatch;
}
GL4CommandProcessor::UpdateStatus GL4CommandProcessor::UpdateState(
PrimitiveType prim_type) {
bool mismatch = false;
#define CHECK_UPDATE_STATUS(status, mismatch, error_message) \
{ \
if (status == UpdateStatus::kError) { \
XELOGE(error_message); \
return status; \
} else if (status == UpdateStatus::kMismatch) { \
mismatch = true; \
} \
}
UpdateStatus status;
status = UpdateViewportState();
CHECK_UPDATE_STATUS(status, mismatch, "Unable to update viewport state");
status = UpdateRasterizerState(prim_type);
CHECK_UPDATE_STATUS(status, mismatch, "Unable to update rasterizer state");
status = UpdateBlendState();
CHECK_UPDATE_STATUS(status, mismatch, "Unable to update blend state");
status = UpdateDepthStencilState();
CHECK_UPDATE_STATUS(status, mismatch, "Unable to update depth/stencil state");
return mismatch ? UpdateStatus::kMismatch : UpdateStatus::kCompatible;
}
GL4CommandProcessor::UpdateStatus GL4CommandProcessor::UpdateViewportState() {
auto& regs = update_viewport_state_regs_;
bool dirty = false;
// dirty |= SetShadowRegister(&state_regs.pa_cl_clip_cntl,
// XE_GPU_REG_PA_CL_CLIP_CNTL);
dirty |= SetShadowRegister(&regs.rb_surface_info, XE_GPU_REG_RB_SURFACE_INFO);
dirty |= SetShadowRegister(&regs.pa_cl_vte_cntl, XE_GPU_REG_PA_CL_VTE_CNTL);
dirty |= SetShadowRegister(&regs.pa_su_sc_mode_cntl,
XE_GPU_REG_PA_SU_SC_MODE_CNTL);
dirty |= SetShadowRegister(&regs.pa_sc_window_offset,
XE_GPU_REG_PA_SC_WINDOW_OFFSET);
dirty |= SetShadowRegister(&regs.pa_sc_window_scissor_tl,
XE_GPU_REG_PA_SC_WINDOW_SCISSOR_TL);
dirty |= SetShadowRegister(&regs.pa_sc_window_scissor_br,
XE_GPU_REG_PA_SC_WINDOW_SCISSOR_BR);
dirty |= SetShadowRegister(&regs.pa_cl_vport_xoffset,
XE_GPU_REG_PA_CL_VPORT_XOFFSET);
dirty |= SetShadowRegister(&regs.pa_cl_vport_yoffset,
XE_GPU_REG_PA_CL_VPORT_YOFFSET);
dirty |= SetShadowRegister(&regs.pa_cl_vport_zoffset,
XE_GPU_REG_PA_CL_VPORT_ZOFFSET);
dirty |= SetShadowRegister(&regs.pa_cl_vport_xscale,
XE_GPU_REG_PA_CL_VPORT_XSCALE);
dirty |= SetShadowRegister(&regs.pa_cl_vport_yscale,
XE_GPU_REG_PA_CL_VPORT_YSCALE);
dirty |= SetShadowRegister(&regs.pa_cl_vport_zscale,
XE_GPU_REG_PA_CL_VPORT_ZSCALE);
// Much of this state machine is extracted from:
// https://github.com/freedreno/mesa/blob/master/src/mesa/drivers/dri/r200/r200_state.c
// http://fossies.org/dox/MesaLib-10.3.5/fd2__gmem_8c_source.html
// http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf
// VTX_XY_FMT = true: the incoming X, Y have already been multiplied by 1/W0.
// = false: multiply the X, Y coordinates by 1/W0.
// VTX_Z_FMT = true: the incoming Z has already been multiplied by 1/W0.
// = false: multiply the Z coordinate by 1/W0.
// VTX_W0_FMT = true: the incoming W0 is not 1/W0. Perform the reciprocal to
// get 1/W0.
draw_batcher_.set_vtx_fmt((regs.pa_cl_vte_cntl >> 8) & 0x1 ? 1.0f : 0.0f,
(regs.pa_cl_vte_cntl >> 9) & 0x1 ? 1.0f : 0.0f,
(regs.pa_cl_vte_cntl >> 10) & 0x1 ? 1.0f : 0.0f);
// Done in VS, no need to flush state.
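// 2560 appears to be the maximum guest surface dimension; when the viewport
// transform is disabled, positions are normalized by it in the vertex
// shader.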
if ((regs.pa_cl_vte_cntl & (1 << 0)) > 0) {
draw_batcher_.set_window_scalar(1.0f, 1.0f);
} else {
draw_batcher_.set_window_scalar(1.0f / 2560.0f, -1.0f / 2560.0f);
}
if (!dirty) {
return UpdateStatus::kCompatible;
}
draw_batcher_.Flush(DrawBatcher::FlushMode::kStateChange);
// Clipping.
// https://github.com/freedreno/amd-gpu/blob/master/include/reg/yamato/14/yamato_genenum.h#L1587
// bool clip_enabled = ((regs.pa_cl_clip_cntl >> 17) & 0x1) == 0;
// bool dx_clip = ((regs.pa_cl_clip_cntl >> 19) & 0x1) == 0x1;
//// TODO(benvanik): depth range?
// if (dx_clip) {
// glClipControl(GL_UPPER_LEFT, GL_ZERO_TO_ONE);
//} else {
// glClipControl(GL_LOWER_LEFT, GL_NEGATIVE_ONE_TO_ONE);
//}
// Window parameters.
// http://ftp.tku.edu.tw/NetBSD/NetBSD-current/xsrc/external/mit/xf86-video-ati/dist/src/r600_reg_auto_r6xx.h
// See r200UpdateWindow:
// https://github.com/freedreno/mesa/blob/master/src/mesa/drivers/dri/r200/r200_state.c
int16_t window_offset_x = 0;
int16_t window_offset_y = 0;
if ((regs.pa_su_sc_mode_cntl >> 16) & 1) {
window_offset_x = regs.pa_sc_window_offset & 0x7FFF;
window_offset_y = (regs.pa_sc_window_offset >> 16) & 0x7FFF;
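// The offsets are 15-bit signed values; bit 14 is the sign bit, so
// manually sign-extend into the 16-bit storage.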
if (window_offset_x & 0x4000) {
window_offset_x |= 0x8000;
}
if (window_offset_y & 0x4000) {
window_offset_y |= 0x8000;
}
}
GLint ws_x = regs.pa_sc_window_scissor_tl & 0x7FFF;
GLint ws_y = (regs.pa_sc_window_scissor_tl >> 16) & 0x7FFF;
GLsizei ws_w = (regs.pa_sc_window_scissor_br & 0x7FFF) - ws_x;
GLsizei ws_h = ((regs.pa_sc_window_scissor_br >> 16) & 0x7FFF) - ws_y;
ws_x += window_offset_x;
ws_y += window_offset_y;
glScissorIndexed(0, ws_x, ws_y, ws_w, ws_h);
// HACK: no clue where to get these values.
// RB_SURFACE_INFO
auto surface_msaa =
static_cast<MsaaSamples>((regs.rb_surface_info >> 16) & 0x3);
// TODO(benvanik): ??
float window_width_scalar = 1;
float window_height_scalar = 1;
switch (surface_msaa) {
case MsaaSamples::k1X:
break;
case MsaaSamples::k2X:
window_width_scalar = 2;
break;
case MsaaSamples::k4X:
window_width_scalar = 2;
window_height_scalar = 2;
break;
}
// Whether each of the viewport settings are enabled.
// http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf
bool vport_xscale_enable = (regs.pa_cl_vte_cntl & (1 << 0)) > 0;
bool vport_xoffset_enable = (regs.pa_cl_vte_cntl & (1 << 1)) > 0;
bool vport_yscale_enable = (regs.pa_cl_vte_cntl & (1 << 2)) > 0;
bool vport_yoffset_enable = (regs.pa_cl_vte_cntl & (1 << 3)) > 0;
bool vport_zscale_enable = (regs.pa_cl_vte_cntl & (1 << 4)) > 0;
bool vport_zoffset_enable = (regs.pa_cl_vte_cntl & (1 << 5)) > 0;
assert_true(vport_xscale_enable == vport_yscale_enable &&
vport_yscale_enable == vport_zscale_enable &&
vport_zscale_enable == vport_xoffset_enable &&
vport_xoffset_enable == vport_yoffset_enable &&
vport_yoffset_enable == vport_zoffset_enable);
if (vport_xscale_enable) {
float texel_offset_x = 0.0f;
float texel_offset_y = 0.0f;
float vox = vport_xoffset_enable ? regs.pa_cl_vport_xoffset : 0;
float voy = vport_yoffset_enable ? regs.pa_cl_vport_yoffset : 0;
float vsx = vport_xscale_enable ? regs.pa_cl_vport_xscale : 1;
float vsy = vport_yscale_enable ? regs.pa_cl_vport_yscale : 1;
window_width_scalar = window_height_scalar = 1;
float vpw = 2 * window_width_scalar * vsx;
float vph = -2 * window_height_scalar * vsy;
float vpx = window_width_scalar * vox - vpw / 2 + window_offset_x;
float vpy = window_height_scalar * voy - vph / 2 + window_offset_y;
glViewportIndexedf(0, vpx + texel_offset_x, vpy + texel_offset_y, vpw, vph);
// TODO(benvanik): depth range adjustment?
// float voz = vport_zoffset_enable ? regs.pa_cl_vport_zoffset : 0;
// float vsz = vport_zscale_enable ? regs.pa_cl_vport_zscale : 1;
} else {
float texel_offset_x = 0.0f;
float texel_offset_y = 0.0f;
float vpw = 2 * 2560.0f * window_width_scalar;
float vph = 2 * 2560.0f * window_height_scalar;
float vpx = -2560.0f * window_width_scalar + window_offset_x;
float vpy = -2560.0f * window_height_scalar + window_offset_y;
glViewportIndexedf(0, vpx + texel_offset_x, vpy + texel_offset_y, vpw, vph);
}
float voz = vport_zoffset_enable ? regs.pa_cl_vport_zoffset : 0;
float vsz = vport_zscale_enable ? regs.pa_cl_vport_zscale : 1;
glDepthRangef(voz, voz + vsz);
return UpdateStatus::kMismatch;
}
GL4CommandProcessor::UpdateStatus GL4CommandProcessor::UpdateRasterizerState(
PrimitiveType prim_type) {
auto& regs = update_rasterizer_state_regs_;
bool dirty = false;
dirty |= SetShadowRegister(&regs.pa_su_sc_mode_cntl,
XE_GPU_REG_PA_SU_SC_MODE_CNTL);
dirty |= SetShadowRegister(&regs.pa_sc_screen_scissor_tl,
XE_GPU_REG_PA_SC_SCREEN_SCISSOR_TL);
dirty |= SetShadowRegister(&regs.pa_sc_screen_scissor_br,
XE_GPU_REG_PA_SC_SCREEN_SCISSOR_BR);
dirty |= SetShadowRegister(&regs.multi_prim_ib_reset_index,
XE_GPU_REG_VGT_MULTI_PRIM_IB_RESET_INDX);
dirty |= SetShadowRegister(&regs.pa_sc_viz_query, XE_GPU_REG_PA_SC_VIZ_QUERY);
dirty |= regs.prim_type != prim_type;
if (!dirty) {
return UpdateStatus::kCompatible;
}
regs.prim_type = prim_type;
SCOPE_profile_cpu_f("gpu");
draw_batcher_.Flush(DrawBatcher::FlushMode::kStateChange);
// viz query enabled
// assert_zero(regs.pa_sc_viz_query & 0x01);
// Kill pix post early-z test
// assert_zero(regs.pa_sc_viz_query & 0x80);
// Scissoring.
// TODO(benvanik): is this used? we are using scissoring for window scissor.
if (regs.pa_sc_screen_scissor_tl != 0 &&
regs.pa_sc_screen_scissor_br != 0x20002000) {
assert_always();
// glEnable(GL_SCISSOR_TEST);
// TODO(benvanik): signed?
int32_t screen_scissor_x = regs.pa_sc_screen_scissor_tl & 0x7FFF;
int32_t screen_scissor_y = (regs.pa_sc_screen_scissor_tl >> 16) & 0x7FFF;
int32_t screen_scissor_w =
(regs.pa_sc_screen_scissor_br & 0x7FFF) - screen_scissor_x;
int32_t screen_scissor_h =
((regs.pa_sc_screen_scissor_br >> 16) & 0x7FFF) - screen_scissor_y;
glScissor(screen_scissor_x, screen_scissor_y, screen_scissor_w,
screen_scissor_h);
} else {
// glDisable(GL_SCISSOR_TEST);
}
switch (regs.pa_su_sc_mode_cntl & 0x3) {
case 0:
glDisable(GL_CULL_FACE);
break;
case 1:
glEnable(GL_CULL_FACE);
glCullFace(GL_FRONT);
break;
case 2:
glEnable(GL_CULL_FACE);
glCullFace(GL_BACK);
break;
}
if (regs.pa_su_sc_mode_cntl & 0x4) {
glFrontFace(GL_CW);
} else {
glFrontFace(GL_CCW);
}
if (prim_type == PrimitiveType::kRectangleList) {
// Rectangle lists aren't culled. There may be other things they skip too.
glDisable(GL_CULL_FACE);
}
static const GLenum kFillModes[3] = {
GL_POINT, GL_LINE, GL_FILL,
};
bool poly_mode = ((regs.pa_su_sc_mode_cntl >> 3) & 0x3) != 0;
if (poly_mode) {
uint32_t front_poly_mode = (regs.pa_su_sc_mode_cntl >> 5) & 0x7;
uint32_t back_poly_mode = (regs.pa_su_sc_mode_cntl >> 8) & 0x7;
// GL only supports both matching.
assert_true(front_poly_mode == back_poly_mode);
glPolygonMode(GL_FRONT_AND_BACK, kFillModes[front_poly_mode]);
} else {
glPolygonMode(GL_FRONT_AND_BACK, GL_FILL);
}
if (regs.pa_su_sc_mode_cntl & (1 << 19)) {
glProvokingVertex(GL_LAST_VERTEX_CONVENTION);
} else {
glProvokingVertex(GL_FIRST_VERTEX_CONVENTION);
}
if (regs.pa_su_sc_mode_cntl & (1 << 21)) {
glEnable(GL_PRIMITIVE_RESTART);
} else {
glDisable(GL_PRIMITIVE_RESTART);
}
glPrimitiveRestartIndex(regs.multi_prim_ib_reset_index);
return UpdateStatus::kMismatch;
}
GL4CommandProcessor::UpdateStatus GL4CommandProcessor::UpdateBlendState() {
auto& reg_file = *register_file_;
auto& regs = update_blend_state_regs_;
// Alpha testing -- ALPHAREF, ALPHAFUNC, ALPHATESTENABLE
// Deprecated in GL, implemented in shader.
// if(ALPHATESTENABLE && frag_out.a [<=/ALPHAFUNC] ALPHAREF) discard;
uint32_t color_control = reg_file[XE_GPU_REG_RB_COLORCONTROL].u32;
draw_batcher_.set_alpha_test((color_control & 0x8) != 0, // ALPHATESTENABLE
color_control & 0x7, // ALPHAFUNC
reg_file[XE_GPU_REG_RB_ALPHA_REF].f32);
bool dirty = false;
dirty |=
SetShadowRegister(&regs.rb_blendcontrol[0], XE_GPU_REG_RB_BLENDCONTROL_0);
dirty |=
SetShadowRegister(&regs.rb_blendcontrol[1], XE_GPU_REG_RB_BLENDCONTROL_1);
dirty |=
SetShadowRegister(&regs.rb_blendcontrol[2], XE_GPU_REG_RB_BLENDCONTROL_2);
dirty |=
SetShadowRegister(&regs.rb_blendcontrol[3], XE_GPU_REG_RB_BLENDCONTROL_3);
dirty |= SetShadowRegister(&regs.rb_blend_rgba[0], XE_GPU_REG_RB_BLEND_RED);
dirty |= SetShadowRegister(&regs.rb_blend_rgba[1], XE_GPU_REG_RB_BLEND_GREEN);
dirty |= SetShadowRegister(&regs.rb_blend_rgba[2], XE_GPU_REG_RB_BLEND_BLUE);
dirty |= SetShadowRegister(&regs.rb_blend_rgba[3], XE_GPU_REG_RB_BLEND_ALPHA);
if (!dirty) {
return UpdateStatus::kCompatible;
}
SCOPE_profile_cpu_f("gpu");
draw_batcher_.Flush(DrawBatcher::FlushMode::kStateChange);
static const GLenum blend_map[] = {
/* 0 */ GL_ZERO,
/* 1 */ GL_ONE,
/* 2 */ GL_ZERO, // ?
/* 3 */ GL_ZERO, // ?
/* 4 */ GL_SRC_COLOR,
/* 5 */ GL_ONE_MINUS_SRC_COLOR,
/* 6 */ GL_SRC_ALPHA,
/* 7 */ GL_ONE_MINUS_SRC_ALPHA,
/* 8 */ GL_DST_COLOR,
/* 9 */ GL_ONE_MINUS_DST_COLOR,
/* 10 */ GL_DST_ALPHA,
/* 11 */ GL_ONE_MINUS_DST_ALPHA,
/* 12 */ GL_CONSTANT_COLOR,
/* 13 */ GL_ONE_MINUS_CONSTANT_COLOR,
/* 14 */ GL_CONSTANT_ALPHA,
/* 15 */ GL_ONE_MINUS_CONSTANT_ALPHA,
/* 16 */ GL_SRC_ALPHA_SATURATE,
};
static const GLenum blend_op_map[] = {
/* 0 */ GL_FUNC_ADD,
/* 1 */ GL_FUNC_SUBTRACT,
/* 2 */ GL_MIN,
/* 3 */ GL_MAX,
/* 4 */ GL_FUNC_REVERSE_SUBTRACT,
};
for (int i = 0; i < xe::countof(regs.rb_blendcontrol); ++i) {
uint32_t blend_control = regs.rb_blendcontrol[i];
// A2XX_RB_BLEND_CONTROL_COLOR_SRCBLEND
auto src_blend = blend_map[(blend_control & 0x0000001F) >> 0];
// A2XX_RB_BLEND_CONTROL_COLOR_DESTBLEND
auto dest_blend = blend_map[(blend_control & 0x00001F00) >> 8];
// A2XX_RB_BLEND_CONTROL_COLOR_COMB_FCN
auto blend_op = blend_op_map[(blend_control & 0x000000E0) >> 5];
// A2XX_RB_BLEND_CONTROL_ALPHA_SRCBLEND
auto src_blend_alpha = blend_map[(blend_control & 0x001F0000) >> 16];
// A2XX_RB_BLEND_CONTROL_ALPHA_DESTBLEND
auto dest_blend_alpha = blend_map[(blend_control & 0x1F000000) >> 24];
// A2XX_RB_BLEND_CONTROL_ALPHA_COMB_FCN
auto blend_op_alpha = blend_op_map[(blend_control & 0x00E00000) >> 21];
// A2XX_RB_COLORCONTROL_BLEND_DISABLE ?? Can't find this!
// Just guess based on actions.
// bool blend_enable =
// !((src_blend == GL_ONE) && (dest_blend == GL_ZERO) &&
// (blend_op == GL_FUNC_ADD) && (src_blend_alpha == GL_ONE) &&
// (dest_blend_alpha == GL_ZERO) && (blend_op_alpha == GL_FUNC_ADD));
bool blend_enable = !(color_control & 0x20);
if (blend_enable) {
glEnablei(GL_BLEND, i);
glBlendEquationSeparatei(i, blend_op, blend_op_alpha);
glBlendFuncSeparatei(i, src_blend, dest_blend, src_blend_alpha,
dest_blend_alpha);
} else {
glDisablei(GL_BLEND, i);
}
}
glBlendColor(regs.rb_blend_rgba[0], regs.rb_blend_rgba[1],
regs.rb_blend_rgba[2], regs.rb_blend_rgba[3]);
return UpdateStatus::kMismatch;
}
GL4CommandProcessor::UpdateStatus
GL4CommandProcessor::UpdateDepthStencilState() {
auto& regs = update_depth_stencil_state_regs_;
bool dirty = false;
dirty |= SetShadowRegister(&regs.rb_depthcontrol, XE_GPU_REG_RB_DEPTHCONTROL);
dirty |=
SetShadowRegister(&regs.rb_stencilrefmask, XE_GPU_REG_RB_STENCILREFMASK);
if (!dirty) {
return UpdateStatus::kCompatible;
}
SCOPE_profile_cpu_f("gpu");
draw_batcher_.Flush(DrawBatcher::FlushMode::kStateChange);
static const GLenum compare_func_map[] = {
/* 0 */ GL_NEVER,
/* 1 */ GL_LESS,
/* 2 */ GL_EQUAL,
/* 3 */ GL_LEQUAL,
/* 4 */ GL_GREATER,
/* 5 */ GL_NOTEQUAL,
/* 6 */ GL_GEQUAL,
/* 7 */ GL_ALWAYS,
};
static const GLenum stencil_op_map[] = {
/* 0 */ GL_KEEP,
/* 1 */ GL_ZERO,
/* 2 */ GL_REPLACE,
/* 3 */ GL_INCR_WRAP,
/* 4 */ GL_DECR_WRAP,
/* 5 */ GL_INVERT,
/* 6 */ GL_INCR,
/* 7 */ GL_DECR,
};
// A2XX_RB_DEPTHCONTROL_Z_ENABLE
if (regs.rb_depthcontrol & 0x00000002) {
glEnable(GL_DEPTH_TEST);
} else {
glDisable(GL_DEPTH_TEST);
}
// glDisable(GL_DEPTH_TEST);
// A2XX_RB_DEPTHCONTROL_Z_WRITE_ENABLE
glDepthMask((regs.rb_depthcontrol & 0x00000004) ? GL_TRUE : GL_FALSE);
// A2XX_RB_DEPTHCONTROL_EARLY_Z_ENABLE
// ?
// A2XX_RB_DEPTHCONTROL_ZFUNC
glDepthFunc(compare_func_map[(regs.rb_depthcontrol & 0x00000070) >> 4]);
// A2XX_RB_DEPTHCONTROL_STENCIL_ENABLE
if (regs.rb_depthcontrol & 0x00000001) {
glEnable(GL_STENCIL_TEST);
} else {
glDisable(GL_STENCIL_TEST);
}
// RB_STENCILREFMASK_STENCILREF
uint32_t stencil_ref = (regs.rb_stencilrefmask & 0x000000FF);
// RB_STENCILREFMASK_STENCILMASK
uint32_t stencil_read_mask = (regs.rb_stencilrefmask & 0x0000FF00) >> 8;
// RB_STENCILREFMASK_STENCILWRITEMASK
glStencilMask((regs.rb_stencilrefmask & 0x00FF0000) >> 16);
// A2XX_RB_DEPTHCONTROL_BACKFACE_ENABLE
bool backface_enabled = (regs.rb_depthcontrol & 0x00000080) != 0;
if (backface_enabled) {
// A2XX_RB_DEPTHCONTROL_STENCILFUNC
glStencilFuncSeparate(
GL_FRONT, compare_func_map[(regs.rb_depthcontrol & 0x00000700) >> 8],
stencil_ref, stencil_read_mask);
// A2XX_RB_DEPTHCONTROL_STENCILFAIL
// A2XX_RB_DEPTHCONTROL_STENCILZFAIL
// A2XX_RB_DEPTHCONTROL_STENCILZPASS
glStencilOpSeparate(
GL_FRONT, stencil_op_map[(regs.rb_depthcontrol & 0x00003800) >> 11],
stencil_op_map[(regs.rb_depthcontrol & 0x000E0000) >> 17],
stencil_op_map[(regs.rb_depthcontrol & 0x0001C000) >> 14]);
// A2XX_RB_DEPTHCONTROL_STENCILFUNC_BF
glStencilFuncSeparate(
GL_BACK, compare_func_map[(regs.rb_depthcontrol & 0x00700000) >> 20],
stencil_ref, stencil_read_mask);
// A2XX_RB_DEPTHCONTROL_STENCILFAIL_BF
// A2XX_RB_DEPTHCONTROL_STENCILZFAIL_BF
// A2XX_RB_DEPTHCONTROL_STENCILZPASS_BF
glStencilOpSeparate(
GL_BACK, stencil_op_map[(regs.rb_depthcontrol & 0x03800000) >> 23],
stencil_op_map[(regs.rb_depthcontrol & 0xE0000000) >> 29],
stencil_op_map[(regs.rb_depthcontrol & 0x1C000000) >> 26]);
} else {
// Backfaces disabled - treat backfaces as frontfaces.
glStencilFunc(compare_func_map[(regs.rb_depthcontrol & 0x00000700) >> 8],
stencil_ref, stencil_read_mask);
glStencilOp(stencil_op_map[(regs.rb_depthcontrol & 0x00003800) >> 11],
stencil_op_map[(regs.rb_depthcontrol & 0x000E0000) >> 17],
stencil_op_map[(regs.rb_depthcontrol & 0x0001C000) >> 14]);
}
return UpdateStatus::kMismatch;
}
GL4CommandProcessor::UpdateStatus GL4CommandProcessor::PopulateIndexBuffer(
IndexBufferInfo* index_buffer_info) {
auto& regs = *register_file_;
if (!index_buffer_info || !index_buffer_info->guest_base) {
// No index buffer or auto draw.
return UpdateStatus::kCompatible;
}
auto& info = *index_buffer_info;
#if FINE_GRAINED_DRAW_SCOPES
SCOPE_profile_cpu_f("gpu");
#endif // FINE_GRAINED_DRAW_SCOPES
// Min/max index ranges for clamping. This is often [0,FFFF|FFFFFF].
// All indices should be clamped to [min,max]. There may be a way to do this
// in GL.
uint32_t min_index = regs[XE_GPU_REG_VGT_MIN_VTX_INDX].u32;
uint32_t max_index = regs[XE_GPU_REG_VGT_MAX_VTX_INDX].u32;
assert_true(min_index == 0);
assert_true(max_index == 0xFFFF || max_index == 0xFFFFFF);
assert_true(info.endianness == Endian::k8in16 ||
info.endianness == Endian::k8in32);
trace_writer_.WriteMemoryRead(info.guest_base, info.length);
size_t total_size =
info.count * (info.format == IndexFormat::kInt32 ? sizeof(uint32_t)
: sizeof(uint16_t));
CircularBuffer::Allocation allocation;
if (!scratch_buffer_.AcquireCached(info.guest_base, total_size,
&allocation)) {
if (info.format == IndexFormat::kInt32) {
auto dest = reinterpret_cast<uint32_t*>(allocation.host_ptr);
auto src = memory_->TranslatePhysical<const uint32_t*>(info.guest_base);
xe::copy_and_swap_32_aligned(dest, src, info.count);
} else {
auto dest = reinterpret_cast<uint16_t*>(allocation.host_ptr);
auto src = memory_->TranslatePhysical<const uint16_t*>(info.guest_base);
xe::copy_and_swap_16_aligned(dest, src, info.count);
}
draw_batcher_.set_index_buffer(allocation);
scratch_buffer_.Commit(std::move(allocation));
} else {
draw_batcher_.set_index_buffer(allocation);
}
return UpdateStatus::kCompatible;
}
GL4CommandProcessor::UpdateStatus GL4CommandProcessor::PopulateVertexBuffers() {
#if FINE_GRAINED_DRAW_SCOPES
SCOPE_profile_cpu_f("gpu");
#endif // FINE_GRAINED_DRAW_SCOPES
auto& regs = *register_file_;
assert_not_null(active_vertex_shader_);
for (const auto& vertex_binding : active_vertex_shader_->vertex_bindings()) {
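// Vertex fetch constants are packed three to a six-dword fetch group;
// compute the group's register base, then select the entry within it.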
int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 +
(vertex_binding.fetch_constant / 3) * 6;
const auto group = reinterpret_cast<xe_gpu_fetch_group_t*>(&regs.values[r]);
const xe_gpu_vertex_fetch_t* fetch = nullptr;
switch (vertex_binding.fetch_constant % 3) {
case 0:
fetch = &group->vertex_fetch_0;
break;
case 1:
fetch = &group->vertex_fetch_1;
break;
case 2:
fetch = &group->vertex_fetch_2;
break;
}
assert_true(fetch->endian == 2);
size_t valid_range = size_t(fetch->size * 4);
trace_writer_.WriteMemoryRead(fetch->address << 2, valid_range);
auto vertex_shader = static_cast<GL4Shader*>(active_vertex_shader_);
CircularBuffer::Allocation allocation;
if (!scratch_buffer_.AcquireCached(fetch->address << 2, valid_range,
&allocation)) {
// Copy and byte swap the entire buffer.
// We could be smart about this to save GPU bandwidth by building a CRC
// as we copy and only if it differs from the previous value committing
// it (and if it matches just discard and reuse).
xe::copy_and_swap_32_aligned(
reinterpret_cast<uint32_t*>(allocation.host_ptr),
memory_->TranslatePhysical<const uint32_t*>(fetch->address << 2),
valid_range / 4);
// TODO(benvanik): if we could find a way to avoid this, we could use
// multidraw without flushing.
glVertexArrayVertexBuffer(
vertex_shader->vao(),
static_cast<GLuint>(vertex_binding.binding_index),
scratch_buffer_.handle(), allocation.offset,
vertex_binding.stride_words * 4);
scratch_buffer_.Commit(std::move(allocation));
} else {
// TODO(benvanik): if we could find a way to avoid this, we could use
// multidraw without flushing.
glVertexArrayVertexBuffer(
vertex_shader->vao(),
static_cast<GLuint>(vertex_binding.binding_index),
scratch_buffer_.handle(), allocation.offset,
vertex_binding.stride_words * 4);
}
}
return UpdateStatus::kCompatible;
}
GL4CommandProcessor::UpdateStatus GL4CommandProcessor::PopulateSamplers() {
#if FINE_GRAINED_DRAW_SCOPES
SCOPE_profile_cpu_f("gpu");
#endif // FINE_GRAINED_DRAW_SCOPES
bool mismatch = false;
// VS and PS samplers are shared, but may be used exclusively.
// We walk each and setup lazily.
bool has_setup_sampler[32] = {false};
// Vertex texture samplers.
for (auto& texture_binding : active_vertex_shader_->texture_bindings()) {
if (has_setup_sampler[texture_binding.fetch_constant]) {
continue;
}
has_setup_sampler[texture_binding.fetch_constant] = true;
auto status = PopulateSampler(texture_binding);
if (status == UpdateStatus::kError) {
return status;
} else if (status == UpdateStatus::kMismatch) {
mismatch = true;
}
}
// Pixel shader texture samplers.
for (auto& texture_binding : active_pixel_shader_->texture_bindings()) {
if (has_setup_sampler[texture_binding.fetch_constant]) {
continue;
}
has_setup_sampler[texture_binding.fetch_constant] = true;
auto status = PopulateSampler(texture_binding);
if (status == UpdateStatus::kError) {
return UpdateStatus::kError;
} else if (status == UpdateStatus::kMismatch) {
mismatch = true;
}
}
return mismatch ? UpdateStatus::kMismatch : UpdateStatus::kCompatible;
}
GL4CommandProcessor::UpdateStatus GL4CommandProcessor::PopulateSampler(
const Shader::TextureBinding& texture_binding) {
auto& regs = *register_file_;
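// Each texture fetch constant occupies a full six-dword fetch group.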
int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 +
texture_binding.fetch_constant * 6;
auto group = reinterpret_cast<const xe_gpu_fetch_group_t*>(&regs.values[r]);
auto& fetch = group->texture_fetch;
// Reset slot.
// If we fail, we still draw but with an invalid texture.
draw_batcher_.set_texture_sampler(texture_binding.fetch_constant, 0, 0);
if (FLAGS_disable_textures) {
return UpdateStatus::kCompatible;
}
// Fetch constant seems uninitialized - leave the slot reset.
if (!fetch.type) {
return UpdateStatus::kCompatible;
}
assert_true(fetch.type == 0x2);
TextureInfo texture_info;
if (!TextureInfo::Prepare(fetch, &texture_info)) {
XELOGE("Unable to parse texture fetcher info");
return UpdateStatus::kCompatible; // invalid texture used
}
SamplerInfo sampler_info;
if (!SamplerInfo::Prepare(fetch, texture_binding.fetch_instr,
&sampler_info)) {
XELOGE("Unable to parse sampler info");
return UpdateStatus::kCompatible; // invalid texture used
}
trace_writer_.WriteMemoryRead(texture_info.guest_address,
texture_info.input_length);
auto entry_view = texture_cache_.Demand(texture_info, sampler_info);
if (!entry_view) {
// Unable to create/fetch/etc.
XELOGE("Failed to demand texture");
return UpdateStatus::kCompatible;
}
// Shaders will use bindless to fetch right from it.
draw_batcher_.set_texture_sampler(texture_binding.fetch_constant,
entry_view->texture_sampler_handle,
fetch.swizzle);
return UpdateStatus::kCompatible;
}
bool GL4CommandProcessor::IssueCopy() {
SCOPE_profile_cpu_f("gpu");
auto& regs = *register_file_;
// This is used to resolve surfaces, taking them from EDRAM render targets
// to system memory. It can optionally clear color/depth surfaces, too.
// The command buffer has stuff for actually doing this by drawing; however,
// we should be able to do it much more easily without that.
uint32_t copy_control = regs[XE_GPU_REG_RB_COPY_CONTROL].u32;
// Render targets 0-3, 4 = depth
uint32_t copy_src_select = copy_control & 0x7;
bool color_clear_enabled = (copy_control >> 8) & 0x1;
bool depth_clear_enabled = (copy_control >> 9) & 0x1;
auto copy_command = static_cast<CopyCommand>((copy_control >> 20) & 0x3);
uint32_t copy_dest_info = regs[XE_GPU_REG_RB_COPY_DEST_INFO].u32;
auto copy_dest_endian = static_cast<Endian128>(copy_dest_info & 0x7);
uint32_t copy_dest_array = (copy_dest_info >> 3) & 0x1;
assert_true(copy_dest_array == 0);
uint32_t copy_dest_slice = (copy_dest_info >> 4) & 0x7;
assert_true(copy_dest_slice == 0);
auto copy_dest_format =
static_cast<ColorFormat>((copy_dest_info >> 7) & 0x3F);
uint32_t copy_dest_number = (copy_dest_info >> 13) & 0x7;
// assert_true(copy_dest_number == 0); // ?
uint32_t copy_dest_bias = (copy_dest_info >> 16) & 0x3F;
// assert_true(copy_dest_bias == 0);
uint32_t copy_dest_swap = (copy_dest_info >> 25) & 0x1;
uint32_t copy_dest_base = regs[XE_GPU_REG_RB_COPY_DEST_BASE].u32;
uint32_t copy_dest_pitch = regs[XE_GPU_REG_RB_COPY_DEST_PITCH].u32;
uint32_t copy_dest_height = (copy_dest_pitch >> 16) & 0x3FFF;
copy_dest_pitch &= 0x3FFF;
// None of this is supported yet:
uint32_t copy_surface_slice = regs[XE_GPU_REG_RB_COPY_SURFACE_SLICE].u32;
assert_true(copy_surface_slice == 0);
uint32_t copy_func = regs[XE_GPU_REG_RB_COPY_FUNC].u32;
assert_true(copy_func == 0);
uint32_t copy_ref = regs[XE_GPU_REG_RB_COPY_REF].u32;
assert_true(copy_ref == 0);
uint32_t copy_mask = regs[XE_GPU_REG_RB_COPY_MASK].u32;
assert_true(copy_mask == 0);
// RB_SURFACE_INFO
// http://fossies.org/dox/MesaLib-10.3.5/fd2__gmem_8c_source.html
uint32_t surface_info = regs[XE_GPU_REG_RB_SURFACE_INFO].u32;
uint32_t surface_pitch = surface_info & 0x3FFF;
auto surface_msaa = static_cast<MsaaSamples>((surface_info >> 16) & 0x3);
// Depending on the source, pick the buffer we'll be sourcing.
// We then query for a cached framebuffer setup with that buffer active.
TextureFormat src_format = TextureFormat::kUnknown;
GLuint color_targets[4] = {kAnyTarget, kAnyTarget, kAnyTarget, kAnyTarget};
GLuint depth_target = kAnyTarget;
if (copy_src_select <= 3 || color_clear_enabled) {
// Source from a color target.
uint32_t color_info[4] = {
regs[XE_GPU_REG_RB_COLOR_INFO].u32, regs[XE_GPU_REG_RB_COLOR1_INFO].u32,
regs[XE_GPU_REG_RB_COLOR2_INFO].u32,
regs[XE_GPU_REG_RB_COLOR3_INFO].u32,
};
uint32_t color_base = color_info[copy_src_select] & 0xFFF;
auto color_format = static_cast<ColorRenderTargetFormat>(
(color_info[copy_src_select] >> 16) & 0xF);
color_targets[copy_src_select] = GetColorRenderTarget(
surface_pitch, surface_msaa, color_base, color_format);
if (copy_src_select <= 3) {
src_format = ColorRenderTargetToTextureFormat(color_format);
}
}
// Grab the depth/stencil if we're sourcing from it or clear is enabled.
if (copy_src_select > 3 || depth_clear_enabled) {
uint32_t depth_info = regs[XE_GPU_REG_RB_DEPTH_INFO].u32;
uint32_t depth_base = depth_info & 0xFFF;
auto depth_format =
static_cast<DepthRenderTargetFormat>((depth_info >> 16) & 0x1);
depth_target = GetDepthRenderTarget(surface_pitch, surface_msaa, depth_base,
depth_format);
if (copy_src_select > 3) {
src_format = DepthRenderTargetToTextureFormat(depth_format);
}
}
auto source_framebuffer = GetFramebuffer(color_targets, depth_target);
if (!source_framebuffer) {
// If we get here we are likely missing some state checks.
assert_always("No framebuffer for copy source? no-op copy?");
XELOGE("No framebuffer for copy source");
return false;
}
active_framebuffer_ = source_framebuffer;
GLenum read_format;
GLenum read_type;
size_t read_size = 0;
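// read_size is in bits per pixel; it's converted to bytes below when
// computing the total copy size.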
switch (copy_dest_format) {
case ColorFormat::k_1_5_5_5:
read_format = GL_RGB5_A1;
read_type = GL_UNSIGNED_SHORT_1_5_5_5_REV;
read_size = 16;
break;
case ColorFormat::k_2_10_10_10:
read_format = GL_RGB10_A2;
read_type = GL_UNSIGNED_INT_10_10_10_2;
read_size = 32;
break;
case ColorFormat::k_4_4_4_4:
read_format = GL_RGBA4;
read_type = GL_UNSIGNED_SHORT_4_4_4_4;
read_size = 16;
break;
case ColorFormat::k_5_6_5:
read_format = GL_RGB565;
read_type = GL_UNSIGNED_SHORT_5_6_5;
read_size = 16;
break;
case ColorFormat::k_8:
read_format = GL_R8;
read_type = GL_UNSIGNED_BYTE;
read_size = 8;
break;
case ColorFormat::k_8_8:
read_format = GL_RG8;
read_type = GL_UNSIGNED_BYTE;
read_size = 16;
break;
case ColorFormat::k_8_8_8_8:
read_format = copy_dest_swap ? GL_BGRA : GL_RGBA;
read_type = GL_UNSIGNED_BYTE;
read_size = 32;
break;
case ColorFormat::k_16:
read_format = GL_R16;
read_type = GL_UNSIGNED_SHORT;
read_size = 16;
break;
case ColorFormat::k_16_FLOAT:
read_format = GL_R16F;
read_type = GL_HALF_FLOAT;
read_size = 16;
break;
case ColorFormat::k_16_16:
read_format = GL_RG16;
read_type = GL_UNSIGNED_SHORT;
read_size = 32;
break;
case ColorFormat::k_16_16_FLOAT:
read_format = GL_RG16F;
read_type = GL_HALF_FLOAT;
read_size = 32;
break;
case ColorFormat::k_16_16_16_16:
read_format = GL_RGBA16;
read_type = GL_UNSIGNED_SHORT;
read_size = 32;
break;
case ColorFormat::k_16_16_16_16_FLOAT:
read_format = GL_RGBA16F;
read_type = GL_HALF_FLOAT;
read_size = 32;
break;
case ColorFormat::k_32_FLOAT:
read_format = GL_R32F;
read_type = GL_FLOAT;
read_size = 32;
break;
case ColorFormat::k_32_32_FLOAT:
read_format = GL_RG32F;
read_type = GL_FLOAT;
read_size = 64;
break;
case ColorFormat::k_32_32_32_32_FLOAT:
read_format = GL_RGBA32F;
read_type = GL_FLOAT;
read_size = 128;
break;
case ColorFormat::k_10_11_11:
case ColorFormat::k_11_11_10:
read_format = GL_R11F_G11F_B10F;
read_type = GL_UNSIGNED_INT_10F_11F_11F_REV;
read_size = 32;
break;
default:
assert_unhandled_case(copy_dest_format);
return false;
}
// TODO(benvanik): swap channel ordering on copy_dest_swap
// Can we use GL swizzles for this?
// Swap byte order during read.
// TODO(benvanik): handle other endian modes.
switch (copy_dest_endian) {
case Endian128::kUnspecified:
glPixelStorei(GL_PACK_SWAP_BYTES, GL_FALSE);
break;
case Endian128::k8in32:
glPixelStorei(GL_PACK_SWAP_BYTES, GL_TRUE);
break;
default:
// assert_unhandled_case(copy_dest_endian);
glPixelStorei(GL_PACK_SWAP_BYTES, GL_TRUE);
break;
}
// TODO(benvanik): tweak alignments/strides.
// glPixelStorei(GL_PACK_ALIGNMENT, 1);
// glPixelStorei(GL_PACK_ROW_LENGTH, 0);
// glPixelStorei(GL_PACK_IMAGE_HEIGHT, 0);
// TODO(benvanik): any way to scissor this? a200 has:
// REG_A2XX_RB_COPY_DEST_OFFSET = A2XX_RB_COPY_DEST_OFFSET_X(tile->xoff) |
// A2XX_RB_COPY_DEST_OFFSET_Y(tile->yoff);
// but I can't seem to find something similar.
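  // Round the logical destination size up to whole 32x32 tiles, the
  // granularity the tiled destination surface is laid out in.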
uint32_t dest_logical_width = copy_dest_pitch;
uint32_t dest_logical_height = copy_dest_height;
uint32_t dest_block_width = xe::round_up(dest_logical_width, 32);
uint32_t dest_block_height = xe::round_up(dest_logical_height, 32);
uint32_t window_offset = regs[XE_GPU_REG_PA_SC_WINDOW_OFFSET].u32;
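  // Window offsets are 15-bit signed values; sign-extend them into int16_t.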
int16_t window_offset_x = window_offset & 0x7FFF;
int16_t window_offset_y = (window_offset >> 16) & 0x7FFF;
if (window_offset_x & 0x4000) {
window_offset_x |= 0x8000;
}
if (window_offset_y & 0x4000) {
window_offset_y |= 0x8000;
}
// HACK: vertices to use are always in vf0.
int copy_vertex_fetch_slot = 0;
int r =
XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + (copy_vertex_fetch_slot / 3) * 6;
const auto group = reinterpret_cast<xe_gpu_fetch_group_t*>(&regs.values[r]);
const xe_gpu_vertex_fetch_t* fetch = nullptr;
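  // Each fetch constant group packs three vertex fetch constants; select the
  // one the slot index refers to.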
switch (copy_vertex_fetch_slot % 3) {
case 0:
fetch = &group->vertex_fetch_0;
break;
case 1:
fetch = &group->vertex_fetch_1;
break;
case 2:
fetch = &group->vertex_fetch_2;
break;
}
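  // The resolve fetch constant should describe three float2 XY positions:
  // type 3 = vertex fetch constant, endian 2 = 8-in-32 swap, size 6 = six
  // dwords (three XY pairs, as read below).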
assert_true(fetch->type == 3);
assert_true(fetch->endian == 2);
assert_true(fetch->size == 6);
const uint8_t* vertex_addr = memory_->TranslatePhysical(fetch->address << 2);
trace_writer_.WriteMemoryRead(fetch->address << 2, fetch->size * 4);
int32_t dest_min_x = int32_t((std::min(
std::min(
GpuSwap(xe::load<float>(vertex_addr + 0), Endian(fetch->endian)),
GpuSwap(xe::load<float>(vertex_addr + 8), Endian(fetch->endian))),
GpuSwap(xe::load<float>(vertex_addr + 16), Endian(fetch->endian)))));
int32_t dest_max_x = int32_t((std::max(
std::max(
GpuSwap(xe::load<float>(vertex_addr + 0), Endian(fetch->endian)),
GpuSwap(xe::load<float>(vertex_addr + 8), Endian(fetch->endian))),
GpuSwap(xe::load<float>(vertex_addr + 16), Endian(fetch->endian)))));
int32_t dest_min_y = int32_t((std::min(
std::min(
GpuSwap(xe::load<float>(vertex_addr + 4), Endian(fetch->endian)),
GpuSwap(xe::load<float>(vertex_addr + 12), Endian(fetch->endian))),
GpuSwap(xe::load<float>(vertex_addr + 20), Endian(fetch->endian)))));
int32_t dest_max_y = int32_t((std::max(
std::max(
GpuSwap(xe::load<float>(vertex_addr + 4), Endian(fetch->endian)),
GpuSwap(xe::load<float>(vertex_addr + 12), Endian(fetch->endian))),
GpuSwap(xe::load<float>(vertex_addr + 20), Endian(fetch->endian)))));
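  // The destination rectangle is the bounding box of the three resolve
  // vertices; the source rectangle mirrors its size at the origin.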
Rect2D dest_rect(dest_min_x, dest_min_y, dest_max_x - dest_min_x,
dest_max_y - dest_min_y);
Rect2D src_rect(0, 0, dest_rect.width, dest_rect.height);
  // The GPU writes the copy data at the window offset within the destination
  // surface, so offset the base address to match; otherwise texture cache
  // lookups would miss the data.
// TODO(benvanik): allow texture cache to lookup partial textures.
// TODO(benvanik): change based on format.
int32_t dest_offset = window_offset_y * copy_dest_pitch * 4;
dest_offset += window_offset_x * 32 * 4;
copy_dest_base += dest_offset;
  // Destination pointer in guest memory. GL would write bytes directly into
  // it, but the readback below is currently disabled.
  // TODO(benvanik): copy to staging texture then PBO back?
void* ptr = memory_->TranslatePhysical(copy_dest_base);
size_t size = copy_dest_pitch * copy_dest_height * (read_size / 8);
auto blitter = static_cast<xe::ui::gl::GLContext*>(context_.get())->blitter();
  // The source framebuffer is active, so glReadPixels (when readback is
  // enabled) reads from it.
switch (copy_command) {
case CopyCommand::kRaw: {
// This performs a byte-for-byte copy of the textures from src to dest
// with no conversion. Byte swapping may still occur.
if (copy_src_select <= 3) {
// Source from a bound render target.
// TODO(benvanik): RAW copy.
last_framebuffer_texture_ = texture_cache_.CopyTexture(
blitter, copy_dest_base, dest_logical_width, dest_logical_height,
dest_block_width, dest_block_height,
ColorFormatToTextureFormat(copy_dest_format),
copy_dest_swap ? true : false, color_targets[copy_src_select],
src_rect, dest_rect);
if (!FLAGS_disable_framebuffer_readback) {
// std::memset(ptr, 0xDE,
// copy_dest_pitch * copy_dest_height * (read_size / 8));
// glReadPixels(0, 0, copy_dest_pitch, copy_dest_height, read_format,
// read_type, ptr);
}
} else {
// Source from the bound depth/stencil target.
// TODO(benvanik): RAW copy.
texture_cache_.CopyTexture(
blitter, copy_dest_base, dest_logical_width, dest_logical_height,
dest_block_width, dest_block_height, src_format,
copy_dest_swap ? true : false, depth_target, src_rect, dest_rect);
if (!FLAGS_disable_framebuffer_readback) {
// std::memset(ptr, 0xDE,
// copy_dest_pitch * copy_dest_height * (read_size / 8));
// glReadPixels(0, 0, copy_dest_pitch, copy_dest_height,
// GL_DEPTH_STENCIL, read_type, ptr);
}
}
break;
}
case CopyCommand::kConvert: {
if (copy_src_select <= 3) {
// Source from a bound render target.
// Either copy the readbuffer into an existing texture or create a new
// one in the cache so we can service future upload requests.
last_framebuffer_texture_ = texture_cache_.ConvertTexture(
blitter, copy_dest_base, dest_logical_width, dest_logical_height,
dest_block_width, dest_block_height,
ColorFormatToTextureFormat(copy_dest_format),
copy_dest_swap ? true : false, color_targets[copy_src_select],
src_rect, dest_rect);
if (!FLAGS_disable_framebuffer_readback) {
// std::memset(ptr, 0xDE,
// copy_dest_pitch * copy_dest_height * (read_size / 8));
// glReadPixels(0, 0, copy_dest_pitch, copy_dest_height, read_format,
// read_type, ptr);
}
} else {
// Source from the bound depth/stencil target.
texture_cache_.ConvertTexture(
blitter, copy_dest_base, dest_logical_width, dest_logical_height,
dest_block_width, dest_block_height, src_format,
copy_dest_swap ? true : false, depth_target, src_rect, dest_rect);
if (!FLAGS_disable_framebuffer_readback) {
// std::memset(ptr, 0xDE,
// copy_dest_pitch * copy_dest_height * (read_size / 8));
// glReadPixels(0, 0, copy_dest_pitch, copy_dest_height,
// GL_DEPTH_STENCIL, read_type, ptr);
}
}
break;
}
case CopyCommand::kConstantOne:
case CopyCommand::kNull:
default:
// assert_unhandled_case(copy_command);
return false;
}
// Perform any requested clears.
uint32_t copy_depth_clear = regs[XE_GPU_REG_RB_DEPTH_CLEAR].u32;
uint32_t copy_color_clear = regs[XE_GPU_REG_RB_COLOR_CLEAR].u32;
uint32_t copy_color_clear_low = regs[XE_GPU_REG_RB_COLOR_CLEAR_LOW].u32;
assert_true(copy_color_clear == copy_color_clear_low);
if (color_clear_enabled) {
// Clear the render target we selected for copy.
    assert_true(copy_src_select <= 3);
// TODO(benvanik): verify color order.
float color[] = {(copy_color_clear & 0xFF) / 255.0f,
((copy_color_clear >> 8) & 0xFF) / 255.0f,
((copy_color_clear >> 16) & 0xFF) / 255.0f,
((copy_color_clear >> 24) & 0xFF) / 255.0f};
// TODO(benvanik): remove query.
GLboolean old_color_mask[4];
glGetBooleani_v(GL_COLOR_WRITEMASK, copy_src_select, old_color_mask);
glColorMaski(copy_src_select, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
glClearNamedFramebufferfv(source_framebuffer->framebuffer, GL_COLOR,
copy_src_select, color);
glColorMaski(copy_src_select, old_color_mask[0], old_color_mask[1],
old_color_mask[2], old_color_mask[3]);
}
if (depth_clear_enabled && depth_target != kAnyTarget) {
// Clear the current depth buffer.
// TODO(benvanik): verify format.
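    // RB_DEPTH_CLEAR packs depth in the upper 24 bits and stencil in the
    // low 8 bits.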
GLfloat depth = {(copy_depth_clear & 0xFFFFFF00) /
static_cast<float>(0xFFFFFF00)};
GLint stencil = copy_depth_clear & 0xFF;
GLint old_draw_framebuffer;
GLboolean old_depth_mask;
GLint old_stencil_mask;
glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &old_draw_framebuffer);
glGetBooleanv(GL_DEPTH_WRITEMASK, &old_depth_mask);
glGetIntegerv(GL_STENCIL_WRITEMASK, &old_stencil_mask);
glDepthMask(GL_TRUE);
glStencilMask(0xFF);
// HACK: this should work, but throws INVALID_ENUM on nvidia drivers.
// GLEW signature differs from OpenGL docs?
// glClearNamedFramebufferfi(source_framebuffer->framebuffer,
// GL_DEPTH_STENCIL, depth, stencil);
glBindFramebuffer(GL_DRAW_FRAMEBUFFER, source_framebuffer->framebuffer);
glClearBufferfi(GL_DEPTH_STENCIL, 0, depth, stencil);
glBindFramebuffer(GL_DRAW_FRAMEBUFFER, old_draw_framebuffer);
glDepthMask(old_depth_mask);
glStencilMask(old_stencil_mask);
}
return true;
}
GLuint GL4CommandProcessor::GetColorRenderTarget(
uint32_t pitch, MsaaSamples samples, uint32_t base,
ColorRenderTargetFormat format) {
  // Because we don't know the height of anything, we allocate at full res.
  // At 2560x2560 the target is larger than EDRAM could ever hold, so this
  // covers the worst case.
uint32_t width = 2560;
uint32_t height = 2560;
// NOTE: we strip gamma formats down to normal ones.
if (format == ColorRenderTargetFormat::k_8_8_8_8_GAMMA) {
format = ColorRenderTargetFormat::k_8_8_8_8;
}
for (auto it = cached_color_render_targets_.begin();
it != cached_color_render_targets_.end(); ++it) {
if (it->base == base && it->width == width && it->height == height &&
it->format == format) {
return it->texture;
}
}
cached_color_render_targets_.push_back(CachedColorRenderTarget());
auto cached = &cached_color_render_targets_.back();
cached->base = base;
cached->width = width;
cached->height = height;
cached->format = format;
GLenum internal_format;
switch (format) {
case ColorRenderTargetFormat::k_8_8_8_8:
case ColorRenderTargetFormat::k_8_8_8_8_GAMMA:
internal_format = GL_RGBA8;
break;
case ColorRenderTargetFormat::k_2_10_10_10:
case ColorRenderTargetFormat::k_2_10_10_10_unknown:
internal_format = GL_RGB10_A2UI;
break;
case ColorRenderTargetFormat::k_2_10_10_10_FLOAT:
case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_unknown:
internal_format = GL_RGB10_A2;
break;
case ColorRenderTargetFormat::k_16_16:
internal_format = GL_RG16;
break;
case ColorRenderTargetFormat::k_16_16_FLOAT:
internal_format = GL_RG16F;
break;
case ColorRenderTargetFormat::k_16_16_16_16:
internal_format = GL_RGBA16;
break;
case ColorRenderTargetFormat::k_16_16_16_16_FLOAT:
internal_format = GL_RGBA16F;
break;
case ColorRenderTargetFormat::k_32_FLOAT:
internal_format = GL_R32F;
break;
case ColorRenderTargetFormat::k_32_32_FLOAT:
internal_format = GL_RG32F;
break;
default:
assert_unhandled_case(format);
return 0;
}
glCreateTextures(GL_TEXTURE_2D, 1, &cached->texture);
glTextureStorage2D(cached->texture, 1, internal_format, width, height);
return cached->texture;
}
GLuint GL4CommandProcessor::GetDepthRenderTarget(
uint32_t pitch, MsaaSamples samples, uint32_t base,
DepthRenderTargetFormat format) {
uint32_t width = 2560;
uint32_t height = 2560;
for (auto it = cached_depth_render_targets_.begin();
it != cached_depth_render_targets_.end(); ++it) {
if (it->base == base && it->width == width && it->height == height &&
it->format == format) {
return it->texture;
}
}
cached_depth_render_targets_.push_back(CachedDepthRenderTarget());
auto cached = &cached_depth_render_targets_.back();
cached->base = base;
cached->width = width;
cached->height = height;
cached->format = format;
GLenum internal_format;
switch (format) {
case DepthRenderTargetFormat::kD24S8:
internal_format = GL_DEPTH24_STENCIL8;
break;
    case DepthRenderTargetFormat::kD24FS8:
      // TODO(benvanik): 24-bit float (20e4) depth has no direct GL
      // equivalent; approximate with D24S8 for now.
      internal_format = GL_DEPTH24_STENCIL8;
break;
default:
assert_unhandled_case(format);
return 0;
}
glCreateTextures(GL_TEXTURE_2D, 1, &cached->texture);
glTextureStorage2D(cached->texture, 1, internal_format, width, height);
return cached->texture;
}
GL4CommandProcessor::CachedFramebuffer* GL4CommandProcessor::GetFramebuffer(
GLuint color_targets[4], GLuint depth_target) {
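  // kAnyTarget attachments act as wildcards: reuse any cached framebuffer
  // whose remaining attachments match.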
for (auto it = cached_framebuffers_.begin(); it != cached_framebuffers_.end();
++it) {
if ((depth_target == kAnyTarget || it->depth_target == depth_target) &&
(color_targets[0] == kAnyTarget ||
it->color_targets[0] == color_targets[0]) &&
(color_targets[1] == kAnyTarget ||
it->color_targets[1] == color_targets[1]) &&
(color_targets[2] == kAnyTarget ||
it->color_targets[2] == color_targets[2]) &&
(color_targets[3] == kAnyTarget ||
it->color_targets[3] == color_targets[3])) {
return &*it;
}
}
GLuint real_color_targets[4];
bool any_set = false;
for (int i = 0; i < 4; ++i) {
if (color_targets[i] == kAnyTarget) {
real_color_targets[i] = 0;
} else {
any_set = true;
real_color_targets[i] = color_targets[i];
}
}
GLuint real_depth_target;
if (depth_target == kAnyTarget) {
real_depth_target = 0;
} else {
any_set = true;
real_depth_target = depth_target;
}
if (!any_set) {
// No framebuffer required.
return nullptr;
}
cached_framebuffers_.push_back(CachedFramebuffer());
auto cached = &cached_framebuffers_.back();
glCreateFramebuffers(1, &cached->framebuffer);
for (int i = 0; i < 4; ++i) {
cached->color_targets[i] = real_color_targets[i];
glNamedFramebufferTexture(cached->framebuffer, GL_COLOR_ATTACHMENT0 + i,
real_color_targets[i], 0);
}
cached->depth_target = real_depth_target;
glNamedFramebufferTexture(cached->framebuffer, GL_DEPTH_STENCIL_ATTACHMENT,
real_depth_target, 0);
return cached;
}
} // namespace gl4
} // namespace gpu
} // namespace xe