diff --git a/src/alloy/string_buffer.cc b/src/alloy/string_buffer.cc index d6cebc2c4..912690fdc 100644 --- a/src/alloy/string_buffer.cc +++ b/src/alloy/string_buffer.cc @@ -48,7 +48,7 @@ void StringBuffer::AppendVarargs(const char* format, va_list args) { auto offset = buffer_.size(); Grow(length + 1); buffer_.resize(buffer_.size() + length); - vsnprintf(buffer_.data() + offset, buffer_.capacity() - 1, format, args); + vsnprintf(buffer_.data() + offset, buffer_.capacity(), format, args); buffer_[buffer_.size()] = 0; } @@ -62,7 +62,7 @@ void StringBuffer::AppendBytes(const uint8_t* buffer, size_t length) { const char* StringBuffer::GetString() const { return buffer_.data(); } -std::string StringBuffer::to_string() { return std::string(buffer_.data()); } +std::string StringBuffer::to_string() { return std::string(buffer_.data(), buffer_.size()); } char* StringBuffer::ToString() { return strdup(buffer_.data()); } diff --git a/src/xenia/gpu/gl4/command_processor.cc b/src/xenia/gpu/gl4/command_processor.cc index 426a0f3dc..dd1fef43f 100644 --- a/src/xenia/gpu/gl4/command_processor.cc +++ b/src/xenia/gpu/gl4/command_processor.cc @@ -1661,15 +1661,6 @@ bool CommandProcessor::UpdateRenderTargets(DrawCommand* draw_command) { // TODO(benvanik): do we want this on READ too? glBindFramebuffer(GL_DRAW_FRAMEBUFFER, cached_framebuffer->framebuffer); - // TEST TEST TEST TEST TEST TEST TEST TEST TEST TEST - // Pretend we are drawing. - // glEnable(GL_SCISSOR_TEST); - // glScissor(100, 100, 100, 100); - // float red[] = {rand() / (float)RAND_MAX, 0, 0, 1.0f}; - // glClearNamedFramebufferfv(active_framebuffer_->framebuffer, GL_COLOR, 0, - // red); - // glDisable(GL_SCISSOR_TEST); - return true; } @@ -1691,8 +1682,7 @@ bool CommandProcessor::UpdateShaders(DrawCommand* draw_command) { } if (!active_pixel_shader_->has_prepared()) { - if (!active_pixel_shader_->PreparePixelShader(program_cntl, - active_vertex_shader_)) { + if (!active_pixel_shader_->PreparePixelShader(program_cntl)) { XELOGE("Unable to prepare pixel shader"); return false; } diff --git a/src/xenia/gpu/gl4/gl4_shader.cc b/src/xenia/gpu/gl4/gl4_shader.cc index 80702cf3a..2994ab627 100644 --- a/src/xenia/gpu/gl4/gl4_shader.cc +++ b/src/xenia/gpu/gl4/gl4_shader.cc @@ -92,22 +92,17 @@ bool GL4Shader::PrepareVertexShader( " float gl_PointSize;\n" " float gl_ClipDistance[];\n" "};\n" - "layout(location = 0) in vec3 iF0;\n" - "layout(location = 1) in vec4 iF1;\n" "layout(location = 0) out VertexData vtx;\n" - "void main() {\n" - //" vec4 oPos = vec4(iF0.xy, 0.0, 1.0);\n" - " vec4 oPos = iF0.xxxx * state->float_consts[0];\n" - " oPos = (iF0.yyyy * state->float_consts[1]) + oPos;\n" - " oPos = (iF0.zzzz * state->float_consts[2]) + oPos;\n" - " oPos = (vec4(1.0, 1.0, 1.0, 1.0) * state->float_consts[3]) + oPos;\n" - //" gl_PointSize = 1.0;\n" + "void processVertex();\n" + "void main() {\n" + + (alloc_counts().positions ? " gl_Position = vec4(0.0, 0.0, 0.0, 1.0);\n" + : "") + + (alloc_counts().point_size ? " gl_PointSize = 1.0;\n" : "") + " for (int i = 0; i < vtx.o.length(); ++i) {\n" - " vtx.o[0] = vec4(0.0, 0.0, 0.0, 0.0);\n" + " vtx.o[i] = vec4(0.0, 0.0, 0.0, 0.0);\n" " }\n" - " vtx.o[0] = iF1;\n" - " gl_Position = applyViewport(oPos);\n" - //" gl_Position = oPos;\n" + " processVertex();\n" + " gl_Position = applyViewport(gl_Position);\n" "}\n"; std::string translated_source = @@ -116,6 +111,7 @@ bool GL4Shader::PrepareVertexShader( PLOGE("Vertex shader failed translation"); return false; } + source += translated_source; if (!CompileProgram(source)) { return false; @@ -126,31 +122,34 @@ bool GL4Shader::PrepareVertexShader( } bool GL4Shader::PreparePixelShader( - const xenos::xe_gpu_program_cntl_t& program_cntl, - GL4Shader* vertex_shader) { + const xenos::xe_gpu_program_cntl_t& program_cntl) { if (has_prepared_) { return is_valid_; } has_prepared_ = true; - std::string source = header + - "layout(location = 0) in VertexData vtx;\n" - "layout(location = 0) out vec4 oC[4];\n" - "void main() {\n" - " for (int i = 0; i < oC.length(); ++i) {\n" - " oC[i] = vec4(1.0, 0.0, 0.0, 1.0);\n" - " }\n" - " oC[0] = vtx.o[0];\n" - //" gl_FragDepth = 0.0;\n" - "}\n"; + std::string source = + header + + "layout(location = 0) in VertexData vtx;\n" + "layout(location = 0) out vec4 oC[4];\n" + "void processFragment();\n" + "void main() {\n" + " for (int i = 0; i < oC.length(); ++i) {\n" + " oC[i] = vec4(0.0, 0.0, 0.0, 0.0);\n" + " }\n" + + (program_cntl.ps_export_depth ? " gl_FragDepth = 0.0\n" : "") + + " processFragment();\n" + "}\n"; - std::string translated_source = shader_translator_.TranslatePixelShader( - this, program_cntl, vertex_shader->alloc_counts()); + std::string translated_source = + shader_translator_.TranslatePixelShader(this, program_cntl); if (translated_source.empty()) { PLOGE("Pixel shader failed translation"); return false; } + source += translated_source; + if (!CompileProgram(source)) { return false; } @@ -166,12 +165,13 @@ bool GL4Shader::CompileProgram(std::string source) { const char* source_str = translated_disassembly_.c_str(); // Save to disk, if we asked for it. + auto base_path = FLAGS_dump_shaders.c_str(); + char file_name[poly::max_path]; + snprintf(file_name, poly::countof(file_name), "%s/gl4_gen_%.16llX.%s", + base_path, data_hash_, + shader_type_ == ShaderType::kVertex ? "vert" : "frag"); if (FLAGS_dump_shaders.size()) { - auto base_path = FLAGS_dump_shaders.c_str(); - char file_name[poly::max_path]; - snprintf(file_name, poly::countof(file_name), "%s/gl4_gen_%.16llX.%s", - base_path, data_hash_, - shader_type_ == ShaderType::kVertex ? "vert" : "frag"); + // Note that we put the translated source first so we get good line numbers. FILE* f = fopen(file_name, "w"); fprintf(f, translated_disassembly_.c_str()); fprintf(f, "\n\n"); @@ -190,6 +190,7 @@ bool GL4Shader::CompileProgram(std::string source) { return false; } + // Get error log, if we failed to link. GLint link_status = 0; glGetProgramiv(program_, GL_LINK_STATUS, &link_status); if (!link_status) { @@ -205,6 +206,50 @@ bool GL4Shader::CompileProgram(std::string source) { return false; } + // Get program binary, if it's available. + GLint binary_length = 0; + glGetProgramiv(program_, GL_PROGRAM_BINARY_LENGTH, &binary_length); + if (binary_length) { + translated_binary_.resize(binary_length); + GLenum binary_format; + glGetProgramBinary(program_, binary_length, &binary_length, &binary_format, + translated_binary_.data()); + + // Append to shader dump. + if (FLAGS_dump_shaders.size()) { + // If we are on nvidia, we can find the disassembly string. + // I haven't been able to figure out from the format how to do this + // without a search like this. + const char* disasm_start = nullptr; + size_t search_offset = 0; + char* search_start = reinterpret_cast(translated_binary_.data()); + while (true) { + auto p = reinterpret_cast( + memchr(translated_binary_.data() + search_offset, '!', + translated_binary_.size() - search_offset)); + if (!p) { + break; + } + if (p[0] == '!' && p[1] == '!' && p[2] == 'N' && p[3] == 'V') { + disasm_start = p; + break; + } + search_offset = p - search_start; + ++search_offset; + } + + if (disasm_start) { + FILE* f = fopen(file_name, "a"); + fprintf(f, "\n\n/*\n"); + fprintf(f, disasm_start); + fprintf(f, "\n*/\n"); + fclose(f); + } else { + PLOGW("Got program binary but unable to find disassembly"); + } + } + } + return true; } diff --git a/src/xenia/gpu/gl4/gl4_shader.h b/src/xenia/gpu/gl4/gl4_shader.h index 94489d766..da3c3df78 100644 --- a/src/xenia/gpu/gl4/gl4_shader.h +++ b/src/xenia/gpu/gl4/gl4_shader.h @@ -27,8 +27,7 @@ class GL4Shader : public Shader { GLuint program() const { return program_; } bool PrepareVertexShader(const xenos::xe_gpu_program_cntl_t& program_cntl); - bool PreparePixelShader(const xenos::xe_gpu_program_cntl_t& program_cntl, - GL4Shader* vertex_shader); + bool PreparePixelShader(const xenos::xe_gpu_program_cntl_t& program_cntl); protected: bool CompileProgram(std::string source); diff --git a/src/xenia/gpu/gl4/gl4_shader_translator.cc b/src/xenia/gpu/gl4/gl4_shader_translator.cc index d61437d49..f0b0c5bed 100644 --- a/src/xenia/gpu/gl4/gl4_shader_translator.cc +++ b/src/xenia/gpu/gl4/gl4_shader_translator.cc @@ -33,35 +33,27 @@ const char* GetVertexFormatTypeName(const GL4Shader::BufferDescElement& el) { return "float"; case VertexFormat::k_16_16: case VertexFormat::k_32_32: - if (el.is_normalized) { - return el.is_signed ? "snorm float2" : "unorm float2"; - } else { - return el.is_signed ? "int2" : "uint2"; - } + return el.is_signed ? "ivec2" : "uvec2"; case VertexFormat::k_16_16_FLOAT: case VertexFormat::k_32_32_FLOAT: - return "float2"; + return "vec2"; case VertexFormat::k_10_11_11: case VertexFormat::k_11_11_10: return "int3"; // ? case VertexFormat::k_32_32_32_FLOAT: - return "float3"; + return "vec3"; case VertexFormat::k_8_8_8_8: case VertexFormat::k_2_10_10_10: case VertexFormat::k_16_16_16_16: case VertexFormat::k_32_32_32_32: - if (el.is_normalized) { - return el.is_signed ? "snorm float4" : "unorm float4"; - } else { - return el.is_signed ? "int4" : "uint4"; - } + return el.is_signed ? "ivec4" : "uvec4"; case VertexFormat::k_16_16_16_16_FLOAT: case VertexFormat::k_32_32_32_32_FLOAT: - return "float4"; + return "vec4"; default: XELOGE("Unknown vertex format: %d", el.format); assert_always(); - return "float4"; + return "vec4"; } } @@ -81,45 +73,12 @@ std::string GL4ShaderTranslator::TranslateVertexShader( GL4Shader* vertex_shader, const xe_gpu_program_cntl_t& program_cntl) { Reset(vertex_shader); - // Add constants buffers. - // We could optimize this by only including used buffers, but the compiler - // seems to do a good job of doing this for us. - // It also does read detection, so c[512] can end up c[4] in the asm - - // instead of doing this optimization ourselves we could maybe just query - // this from the compiler. - Append( - "cbuffer float_consts : register(b0) {\n" - " float4 c[512];\n" - "};\n"); - // TODO(benvanik): add bool/loop constants. + // Normal shaders only, for now. + assert_true(program_cntl.vs_export_mode == 0); AppendTextureHeader(vertex_shader->sampler_inputs()); - // Transform utilities. We adjust the output position in various ways - // as we can't do this via D3D11 APIs. - Append( - "cbuffer vs_consts : register(b3) {\n" - " float4 window;\n" // x,y,w,h - " float4 viewport_z_enable;\n" // min,(max - min),?,enabled - " float4 viewport_size;\n" // x,y,w,h - "};" - "float4 applyViewport(float4 pos) {\n" - " if (viewport_z_enable.w) {\n" - //" pos.x = (pos.x + 1) * viewport_size.z * 0.5 + viewport_size.x;\n" - //" pos.y = (1 - pos.y) * viewport_size.w * 0.5 + viewport_size.y;\n" - //" pos.z = viewport_z_enable.x + pos.z * viewport_z_enable.y;\n" - // w? - " } else {\n" - " pos.xy = pos.xy / float2(window.z / 2.0, -window.w / 2.0) + " - "float2(-1.0, 1.0);\n" - " pos.zw = float2(0.0, 1.0);\n" - " }\n" - " pos.xy += window.xy;\n" - " return pos;\n" - "}\n"); - // Add vertex shader input. - Append("struct VS_INPUT {\n"); uint32_t el_index = 0; const auto& buffer_inputs = vertex_shader->buffer_inputs(); for (uint32_t n = 0; n < buffer_inputs.count; n++) { @@ -129,55 +88,23 @@ std::string GL4ShaderTranslator::TranslateVertexShader( const char* type_name = GetVertexFormatTypeName(el); const auto& fetch = el.vtx_fetch; uint32_t fetch_slot = fetch.const_index * 3 + fetch.const_index_sel; - Append(" %s vf%u_%d : XE_VF%u;\n", type_name, fetch_slot, fetch.offset, - el_index); + Append("layout(location = %d) in %s vf%u_%d;\n", el_index, type_name, + fetch_slot, fetch.offset); el_index++; } } - Append("};\n"); - // Add vertex shader output (pixel shader input). const auto& alloc_counts = vertex_shader->alloc_counts(); - Append("struct VS_OUTPUT {\n"); - if (alloc_counts.positions) { - assert_true(alloc_counts.positions == 1); - Append(" float4 oPos : SV_POSITION;\n"); - } - if (alloc_counts.params) { - Append(" float4 o[%d] : XE_O;\n", kMaxInterpolators); - } - if (alloc_counts.point_size) { - Append(" float4 oPointSize : PSIZE;\n"); - } - Append("};\n"); // Vertex shader main() header. - Append( - "VS_OUTPUT main(VS_INPUT i) {\n" - " VS_OUTPUT o;\n"); - - // Always write position, as some shaders seem to only write certain values. - if (alloc_counts.positions) { - Append(" o.oPos = float4(0.0, 0.0, 0.0, 1.0);\n"); - } - if (alloc_counts.point_size) { - Append(" o.oPointSize = float4(1.0, 0.0, 0.0, 0.0);\n"); - } - - // TODO(benvanik): remove this, if possible (though the compiler may be smart - // enough to do it for us). - if (alloc_counts.params) { - for (uint32_t n = 0; n < kMaxInterpolators; n++) { - Append(" o.o[%d] = float4(0.0, 0.0, 0.0, 0.0);\n", n); - } - } + Append("void processVertex() {\n"); // Add temporaries for any registers we may use. uint32_t temp_regs = program_cntl.vs_regs + program_cntl.ps_regs; for (uint32_t n = 0; n <= temp_regs; n++) { - Append(" float4 r%d = c[%d];\n", n, n); + Append(" vec4 r%d = state->float_consts[%d];\n", n, n); } - Append(" float4 t;\n"); + Append(" vec4 t;\n"); // Execute blocks. const auto& execs = vertex_shader->execs(); @@ -189,20 +116,12 @@ std::string GL4ShaderTranslator::TranslateVertexShader( } } - // main footer. - if (alloc_counts.positions) { - Append(" o.oPos = applyViewport(o.oPos);\n"); - } - Append( - " return o;\n" - "};\n"); - + Append("}\n"); return output_.to_string(); } std::string GL4ShaderTranslator::TranslatePixelShader( - GL4Shader* pixel_shader, const xe_gpu_program_cntl_t& program_cntl, - const GL4Shader::AllocCounts& alloc_counts) { + GL4Shader* pixel_shader, const xe_gpu_program_cntl_t& program_cntl) { Reset(pixel_shader); // We need an input VS to make decisions here. @@ -210,63 +129,22 @@ std::string GL4ShaderTranslator::TranslatePixelShader( // If the same PS is used with different VS that output different amounts // (and less than the number of required registers), things may die. - // Add constants buffers. - // We could optimize this by only including used buffers, but the compiler - // seems to do a good job of doing this for us. - // It also does read detection, so c[512] can end up c[4] in the asm - - // instead of doing this optimization ourselves we could maybe just query - // this from the compiler. - Append( - "cbuffer float_consts : register(b0) {\n" - " float4 c[512];\n" - "};\n"); - // TODO(benvanik): add bool/loop constants. - AppendTextureHeader(pixel_shader->sampler_inputs()); - // Add vertex shader output (pixel shader input). - Append("struct VS_OUTPUT {\n"); - if (alloc_counts.positions) { - assert_true(alloc_counts.positions == 1); - Append(" float4 oPos : SV_POSITION;\n"); - } - if (alloc_counts.params) { - Append(" float4 o[%d] : XE_O;\n", kMaxInterpolators); - } - Append("};\n"); - - // Add pixel shader output. - Append("struct PS_OUTPUT {\n"); - for (uint32_t n = 0; n < alloc_counts.params; n++) { - Append(" float4 oC%d : SV_TARGET%d;\n", n, n); - if (program_cntl.ps_export_depth) { - // Is this per render-target? - Append(" float oD%d : SV_DEPTH%d;\n", n, n); - } - } - Append("};\n"); - // Pixel shader main() header. - Append( - "PS_OUTPUT main(VS_OUTPUT i) {\n" - " PS_OUTPUT o;\n"); - for (uint32_t n = 0; n < alloc_counts.params; n++) { - Append(" o.oC%d = float4(1.0, 0.0, 0.0, 1.0);\n", n); - } + Append("void processFragment() {\n"); // Add temporary registers. uint32_t temp_regs = program_cntl.vs_regs + program_cntl.ps_regs; for (uint32_t n = 0; n <= std::max(15u, temp_regs); n++) { - Append(" float4 r%d = c[%d];\n", n, n + 256); + Append(" vec4 r%d = state->float_consts[%d];\n", n, n + 256); } - Append(" float4 t;\n"); + Append(" vec4 t;\n"); Append(" float s;\n"); // scalar result (used for RETAIN_PREV) // Bring registers local. - if (alloc_counts.params) { - for (uint32_t n = 0; n < kMaxInterpolators; n++) { - Append(" r%d = i.o[%d];\n", n, n); - } + for (uint32_t n = 0; n < kMaxInterpolators; n++) { + Append(" r%d = vtx.o[%d];\n", n, n); } // Execute blocks. @@ -279,11 +157,7 @@ std::string GL4ShaderTranslator::TranslatePixelShader( } } - // main footer. - Append( - " return o;\n" - "}\n"); - + Append("}\n"); return output_.to_string(); } @@ -343,7 +217,7 @@ void GL4ShaderTranslator::AppendSrcReg(uint32_t num, uint32_t type, if (abs_constants) { Append("abs("); } - Append("c[%u]", is_pixel_shader() ? num + 256 : num); + Append("state->float_consts[%u]", is_pixel_shader() ? num + 256 : num); if (abs_constants) { Append(")"); } @@ -367,14 +241,14 @@ void GL4ShaderTranslator::AppendDestRegName(uint32_t num, uint32_t dst_exp) { case ShaderType::kVertex: switch (num) { case 62: - Append("o.oPos"); + Append("gl_Position"); break; case 63: - Append("o.oPointSize"); + Append("gl_PointSize"); break; default: // Varying. - Append("o.o[%u]", num); + Append("vtx.o[%u]", num); ; break; } @@ -382,7 +256,7 @@ void GL4ShaderTranslator::AppendDestRegName(uint32_t num, uint32_t dst_exp) { case ShaderType::kPixel: switch (num) { case 0: - Append("o.oC0"); + Append("oC[0]"); break; default: // TODO(benvanik): other render targets? @@ -412,7 +286,7 @@ void GL4ShaderTranslator::AppendDestRegPost(uint32_t num, uint32_t mask, // Masking. Append(" "); AppendDestRegName(num, dst_exp); - Append(" = float4("); + Append(" = vec4("); for (int i = 0; i < 4; i++) { // TODO(benvanik): mask out values? mix in old value as temp? // Append("%c", (mask & 0x1) ? chan_names[i] : 'w'); @@ -487,6 +361,9 @@ void GL4ShaderTranslator::PrintExportComment(uint32_t num) { case 63: name = "gl_PointSize"; break; + default: + name = "??"; + break; } break; case ShaderType::kPixel: @@ -494,6 +371,9 @@ void GL4ShaderTranslator::PrintExportComment(uint32_t num) { case 0: name = "gl_FragColor"; break; + default: + name = "??"; + break; } break; } @@ -509,7 +389,7 @@ bool GL4ShaderTranslator::TranslateALU_ADDv(const instr_alu_t& alu) { AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); Append(" = "); if (alu.vector_clamp) { - Append("saturate("); + Append("clamp("); } Append("("); AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, @@ -530,7 +410,7 @@ bool GL4ShaderTranslator::TranslateALU_MULv(const instr_alu_t& alu) { AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); Append(" = "); if (alu.vector_clamp) { - Append("saturate("); + Append("clamp("); } Append("("); AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, @@ -540,7 +420,7 @@ bool GL4ShaderTranslator::TranslateALU_MULv(const instr_alu_t& alu) { alu.abs_constants); Append(")"); if (alu.vector_clamp) { - Append(")"); + Append(", 0.0, 1.0)"); } Append(";\n"); AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); @@ -551,7 +431,7 @@ bool GL4ShaderTranslator::TranslateALU_MAXv(const instr_alu_t& alu) { AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); Append(" = "); if (alu.vector_clamp) { - Append("saturate("); + Append("clamp("); } if (alu.src1_reg == alu.src2_reg && alu.src1_sel == alu.src2_sel && alu.src1_swiz == alu.src2_swiz && @@ -569,7 +449,7 @@ bool GL4ShaderTranslator::TranslateALU_MAXv(const instr_alu_t& alu) { Append(")"); } if (alu.vector_clamp) { - Append(")"); + Append(", 0.0, 1.0)"); } Append(";\n"); AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); @@ -580,7 +460,7 @@ bool GL4ShaderTranslator::TranslateALU_MINv(const instr_alu_t& alu) { AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); Append(" = "); if (alu.vector_clamp) { - Append("saturate("); + Append("clamp("); } Append("min("); AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, @@ -590,7 +470,7 @@ bool GL4ShaderTranslator::TranslateALU_MINv(const instr_alu_t& alu) { alu.abs_constants); Append(")"); if (alu.vector_clamp) { - Append(")"); + Append(", 0.0, 1.0)"); } Append(";\n"); AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); @@ -602,9 +482,9 @@ bool GL4ShaderTranslator::TranslateALU_SETXXv(const instr_alu_t& alu, AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); Append(" = "); if (alu.vector_clamp) { - Append("saturate("); + Append("clamp("); } - Append("float4(("); + Append("vec4(("); AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.abs_constants); Append(").x %s (", op); @@ -630,7 +510,7 @@ bool GL4ShaderTranslator::TranslateALU_SETXXv(const instr_alu_t& alu, alu.abs_constants); Append(").w ? 1.0 : 0.0)"); if (alu.vector_clamp) { - Append(")"); + Append(", 0.0, 1.0)"); } Append(";\n"); AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); @@ -653,14 +533,14 @@ bool GL4ShaderTranslator::TranslateALU_FRACv(const instr_alu_t& alu) { AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); Append(" = "); if (alu.vector_clamp) { - Append("saturate("); + Append("clamp("); } Append("frac("); AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.abs_constants); Append(")"); if (alu.vector_clamp) { - Append(")"); + Append(", 0.0, 1.0)"); } Append(";\n"); AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); @@ -671,14 +551,14 @@ bool GL4ShaderTranslator::TranslateALU_TRUNCv(const instr_alu_t& alu) { AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); Append(" = "); if (alu.vector_clamp) { - Append("saturate("); + Append("clamp("); } Append("trunc("); AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.abs_constants); Append(")"); if (alu.vector_clamp) { - Append(")"); + Append(", 0.0, 1.0)"); } Append(";\n"); AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); @@ -689,14 +569,14 @@ bool GL4ShaderTranslator::TranslateALU_FLOORv(const instr_alu_t& alu) { AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); Append(" = "); if (alu.vector_clamp) { - Append("saturate("); + Append("clamp("); } Append("floor("); AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.abs_constants); Append(")"); if (alu.vector_clamp) { - Append(")"); + Append(", 0.0, 1.0)"); } Append(";\n"); AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); @@ -707,20 +587,19 @@ bool GL4ShaderTranslator::TranslateALU_MULADDv(const instr_alu_t& alu) { AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); Append(" = "); if (alu.vector_clamp) { - Append("saturate("); + Append("clamp("); } - Append("mad("); + Append("("); AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.abs_constants); - Append(", "); + Append(" * "); AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.abs_constants); - Append(", "); + Append(") + "); AppendSrcReg(alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.abs_constants); - Append(")"); if (alu.vector_clamp) { - Append(")"); + Append(", 0.0, 1.0)"); } Append(";\n"); AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); @@ -732,11 +611,11 @@ bool GL4ShaderTranslator::TranslateALU_CNDXXv(const instr_alu_t& alu, AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); Append(" = "); if (alu.vector_clamp) { - Append("saturate("); + Append("clamp("); } // TODO(benvanik): check argument order - could be 3 as compare and 1 and 2 as // values. - Append("float4(("); + Append("vec4(("); AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.abs_constants); Append(").x %s 0.0 ? (", op); @@ -774,7 +653,7 @@ bool GL4ShaderTranslator::TranslateALU_CNDXXv(const instr_alu_t& alu, alu.abs_constants); Append(").w)"); if (alu.vector_clamp) { - Append(")"); + Append(", 0.0, 1.0)"); } Append(";\n"); AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); @@ -794,7 +673,7 @@ bool GL4ShaderTranslator::TranslateALU_DOT4v(const instr_alu_t& alu) { AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); Append(" = "); if (alu.vector_clamp) { - Append("saturate("); + Append("clamp("); } Append("dot("); AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, @@ -804,7 +683,7 @@ bool GL4ShaderTranslator::TranslateALU_DOT4v(const instr_alu_t& alu) { alu.abs_constants); Append(")"); if (alu.vector_clamp) { - Append(")"); + Append(", 0.0, 1.0)"); } Append(";\n"); AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); @@ -815,17 +694,17 @@ bool GL4ShaderTranslator::TranslateALU_DOT3v(const instr_alu_t& alu) { AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); Append(" = "); if (alu.vector_clamp) { - Append("saturate("); + Append("clamp("); } - Append("dot(float4("); + Append("dot(vec4("); AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.abs_constants); - Append(").xyz, float4("); + Append(").xyz, vec4("); AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.abs_constants); Append(").xyz)"); if (alu.vector_clamp) { - Append(")"); + Append(", 0.0, 1.0)"); } Append(";\n"); AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); @@ -836,12 +715,12 @@ bool GL4ShaderTranslator::TranslateALU_DOT2ADDv(const instr_alu_t& alu) { AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); Append(" = "); if (alu.vector_clamp) { - Append("saturate("); + Append("clamp("); } - Append("dot(float4("); + Append("dot(vec4("); AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.abs_constants); - Append(").xy, float4("); + Append(").xy, vec4("); AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.abs_constants); Append(").xy) + "); @@ -849,7 +728,7 @@ bool GL4ShaderTranslator::TranslateALU_DOT2ADDv(const instr_alu_t& alu) { alu.abs_constants); Append(".x"); if (alu.vector_clamp) { - Append(")"); + Append(", 0.0, 1.0)"); } Append(";\n"); AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); @@ -862,7 +741,7 @@ bool GL4ShaderTranslator::TranslateALU_MAX4v(const instr_alu_t& alu) { AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); Append(" = "); if (alu.vector_clamp) { - Append("saturate("); + Append("clamp("); } Append("max("); Append("max("); @@ -880,7 +759,7 @@ bool GL4ShaderTranslator::TranslateALU_MAX4v(const instr_alu_t& alu) { alu.abs_constants); Append(".w)"); if (alu.vector_clamp) { - Append(")"); + Append(", 0.0, 1.0)"); } Append(";\n"); AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); @@ -894,7 +773,7 @@ bool GL4ShaderTranslator::TranslateALU_MAXs(const instr_alu_t& alu) { alu.export_data); Append(" = "); if (alu.scalar_clamp) { - Append("saturate("); + Append("clamp("); } if ((alu.src3_swiz & 0x3) == (((alu.src3_swiz >> 2) + 1) & 0x3)) { // This is a mov. @@ -910,7 +789,7 @@ bool GL4ShaderTranslator::TranslateALU_MAXs(const instr_alu_t& alu) { Append(".y).xxxx"); } if (alu.scalar_clamp) { - Append(")"); + Append(", 0.0, 1.0)"); } Append(";\n"); AppendDestRegPost(get_alu_scalar_dest(alu), alu.scalar_write_mask, @@ -923,7 +802,7 @@ bool GL4ShaderTranslator::TranslateALU_MINs(const instr_alu_t& alu) { alu.export_data); Append(" = "); if (alu.scalar_clamp) { - Append("saturate("); + Append("clamp("); } Append("min("); AppendSrcReg(alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, @@ -933,7 +812,7 @@ bool GL4ShaderTranslator::TranslateALU_MINs(const instr_alu_t& alu) { alu.abs_constants); Append(".y).xxxx"); if (alu.scalar_clamp) { - Append(")"); + Append(", 0.0, 1.0)"); } Append(";\n"); AppendDestRegPost(get_alu_scalar_dest(alu), alu.scalar_write_mask, @@ -947,14 +826,14 @@ bool GL4ShaderTranslator::TranslateALU_SETXXs(const instr_alu_t& alu, alu.export_data); Append(" = "); if (alu.scalar_clamp) { - Append("saturate("); + Append("clamp("); } Append("(("); AppendSrcReg(alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.abs_constants); Append(".x %s 0.0) ? 1.0 : 0.0).xxxx", op); if (alu.scalar_clamp) { - Append(")"); + Append(", 0.0, 1.0)"); } Append(";\n"); AppendDestRegPost(get_alu_scalar_dest(alu), alu.scalar_write_mask, @@ -979,14 +858,14 @@ bool GL4ShaderTranslator::TranslateALU_RECIP_IEEE(const instr_alu_t& alu) { alu.export_data); Append(" = "); if (alu.scalar_clamp) { - Append("saturate("); + Append("clamp("); } Append("(1.0 / "); AppendSrcReg(alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.abs_constants); Append(")"); if (alu.scalar_clamp) { - Append(")"); + Append(", 0.0, 1.0)"); } Append(";\n"); AppendDestRegPost(get_alu_scalar_dest(alu), alu.scalar_write_mask, @@ -999,7 +878,7 @@ bool GL4ShaderTranslator::TranslateALU_MUL_CONST_0(const instr_alu_t& alu) { alu.export_data); Append(" = "); if (alu.scalar_clamp) { - Append("saturate("); + Append("clamp("); } uint32_t src3_swiz = alu.src3_swiz & ~0x3C; uint32_t swiz_a = ((src3_swiz >> 6) - 1) & 0x3; @@ -1013,7 +892,7 @@ bool GL4ShaderTranslator::TranslateALU_MUL_CONST_0(const instr_alu_t& alu) { Append(".%c", chan_names[swiz_b]); Append(").xxxx"); if (alu.scalar_clamp) { - Append(")"); + Append(", 0.0, 1.0)"); } Append(";\n"); AppendDestRegPost(get_alu_scalar_dest(alu), alu.scalar_write_mask, @@ -1029,7 +908,7 @@ bool GL4ShaderTranslator::TranslateALU_ADD_CONST_0(const instr_alu_t& alu) { alu.export_data); Append(" = "); if (alu.scalar_clamp) { - Append("saturate("); + Append("clamp("); } uint32_t src3_swiz = alu.src3_swiz & ~0x3C; uint32_t swiz_a = ((src3_swiz >> 6) - 1) & 0x3; @@ -1043,7 +922,7 @@ bool GL4ShaderTranslator::TranslateALU_ADD_CONST_0(const instr_alu_t& alu) { Append(".%c", chan_names[swiz_b]); Append(").xxxx"); if (alu.scalar_clamp) { - Append(")"); + Append(", 0.0, 1.0)"); } Append(";\n"); AppendDestRegPost(get_alu_scalar_dest(alu), alu.scalar_write_mask, @@ -1059,7 +938,7 @@ bool GL4ShaderTranslator::TranslateALU_SUB_CONST_0(const instr_alu_t& alu) { alu.export_data); Append(" = "); if (alu.scalar_clamp) { - Append("saturate("); + Append("clamp("); } uint32_t src3_swiz = alu.src3_swiz & ~0x3C; uint32_t swiz_a = ((src3_swiz >> 6) - 1) & 0x3; @@ -1073,7 +952,7 @@ bool GL4ShaderTranslator::TranslateALU_SUB_CONST_0(const instr_alu_t& alu) { Append(".%c", chan_names[swiz_b]); Append(").xxxx"); if (alu.scalar_clamp) { - Append(")"); + Append(", 0.0, 1.0)"); } Append(";\n"); AppendDestRegPost(get_alu_scalar_dest(alu), alu.scalar_write_mask, @@ -1491,10 +1370,10 @@ bool GL4ShaderTranslator::TranslateVertexFetch(const instr_fetch_vtx_t* vtx, // Translate. Append(" "); Append("r%u.xyzw", vtx->dst_reg); - Append(" = float4("); + Append(" = vec4("); uint32_t fetch_slot = vtx->const_index * 3 + vtx->const_index_sel; // TODO(benvanik): detect xyzw = xyzw, etc. - // TODO(benvanik): detect and set as rN = float4(samp.xyz, 1.0); / etc + // TODO(benvanik): detect and set as rN = vec4(samp.xyz, 1.0); / etc uint32_t component_count = GetVertexFormatComponentCount(static_cast(vtx->format)); uint32_t dst_swiz = vtx->dst_swiz; @@ -1509,8 +1388,7 @@ bool GL4ShaderTranslator::TranslateVertexFetch(const instr_fetch_vtx_t* vtx, } else if ((dst_swiz & 0x7) == 7) { Append("r%u.%c", vtx->dst_reg, chan_names[i]); } else { - Append("i.vf%u_%d.%c", fetch_slot, vtx->offset, - chan_names[dst_swiz & 0x3]); + Append("vf%u_%d.%c", fetch_slot, vtx->offset, chan_names[dst_swiz & 0x3]); } if (i < 3) { Append(", "); @@ -1633,7 +1511,7 @@ bool GL4ShaderTranslator::TranslateTextureFetch(const instr_fetch_tex_t* tex, } Append(");\n"); - Append(" r%u.xyzw = float4(", tex->dst_reg); + Append(" r%u.xyzw = vec4(", tex->dst_reg); uint32_t dst_swiz = tex->dst_swiz; for (int i = 0; i < 4; i++) { if (i) { diff --git a/src/xenia/gpu/gl4/gl4_shader_translator.h b/src/xenia/gpu/gl4/gl4_shader_translator.h index 22a9cdfbb..64da30b04 100644 --- a/src/xenia/gpu/gl4/gl4_shader_translator.h +++ b/src/xenia/gpu/gl4/gl4_shader_translator.h @@ -34,8 +34,8 @@ class GL4ShaderTranslator { GL4Shader* vertex_shader, const xenos::xe_gpu_program_cntl_t& program_cntl); std::string TranslatePixelShader( - GL4Shader* pixel_shader, const xenos::xe_gpu_program_cntl_t& program_cntl, - const GL4Shader::AllocCounts& alloc_counts); + GL4Shader* pixel_shader, + const xenos::xe_gpu_program_cntl_t& program_cntl); protected: ShaderType shader_type_; diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h index 820080133..5e8eacc04 100644 --- a/src/xenia/gpu/shader.h +++ b/src/xenia/gpu/shader.h @@ -11,6 +11,7 @@ #define XENIA_GPU_SHADER_H_ #include +#include #include #include @@ -29,6 +30,7 @@ class Shader { const std::string& translated_disassembly() const { return translated_disassembly_; } + const std::vector translated_binary() { return translated_binary_; } const uint32_t* data() const { return data_.data(); } @@ -93,6 +95,7 @@ class Shader { std::string ucode_disassembly_; std::string translated_disassembly_; + std::vector translated_binary_; std::string error_log_; AllocCounts alloc_counts_;