diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 09315be09..5c7c6722f 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -2193,7 +2193,6 @@ D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound( REFRESH_MSVC_RANGE(); DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, 65536, REGULAR_WRITE_CALLBACK); - } template XE_FORCEINLINE void @@ -2799,11 +2798,17 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, // todo: use SIMD for getscissor + scaling here, should reduce code size more draw_util::Scissor scissor; draw_util::GetScissor(regs, scissor); +#if XE_ARCH_AMD64 == 1 + __m128i* scisp = (__m128i*)&scissor; + *scisp = _mm_mullo_epi32( + *scisp, _mm_setr_epi32(draw_resolution_scale_x, draw_resolution_scale_y, + draw_resolution_scale_x, draw_resolution_scale_y)); +#else scissor.offset[0] *= draw_resolution_scale_x; scissor.offset[1] *= draw_resolution_scale_y; scissor.extent[0] *= draw_resolution_scale_x; scissor.extent[1] *= draw_resolution_scale_y; - +#endif // Update viewport, scissor, blend factor and stencil reference. 
UpdateFixedFunctionState(viewport_info, scissor, primitive_polygonal, normalized_depth_control); diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index 98d2802ee..e6461e8bd 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -552,8 +552,90 @@ void GetHostViewportInfo(GetViewportInfoArgs* XE_RESTRICT args, } } template -XE_NOINLINE static void GetScissorTmpl(const RegisterFile& XE_RESTRICT regs, - Scissor& XE_RESTRICT scissor_out) { +static inline +void GetScissorTmpl(const RegisterFile& XE_RESTRICT regs, + Scissor& XE_RESTRICT scissor_out) { +#if XE_ARCH_AMD64 == 1 + auto pa_sc_window_scissor_tl = regs.Get(); + auto pa_sc_window_scissor_br = regs.Get(); + auto pa_sc_window_offset = regs.Get(); + auto pa_sc_screen_scissor_tl = regs.Get(); + auto pa_sc_screen_scissor_br = regs.Get(); + uint32_t surface_pitch = 0; + if constexpr (clamp_to_surface_pitch) { + surface_pitch = regs.Get().surface_pitch; + } + uint32_t pa_sc_window_scissor_tl_tl_x = pa_sc_window_scissor_tl.tl_x, + pa_sc_window_scissor_tl_tl_y = pa_sc_window_scissor_tl.tl_y, + pa_sc_window_scissor_br_br_x = pa_sc_window_scissor_br.br_x, + pa_sc_window_scissor_br_br_y = pa_sc_window_scissor_br.br_y, + pa_sc_window_offset_window_x_offset = + pa_sc_window_offset.window_x_offset, + pa_sc_window_offset_window_y_offset = + pa_sc_window_offset.window_y_offset, + pa_sc_screen_scissor_tl_tl_x = pa_sc_screen_scissor_tl.tl_x, + pa_sc_screen_scissor_tl_tl_y = pa_sc_screen_scissor_tl.tl_y, + pa_sc_screen_scissor_br_br_x = pa_sc_screen_scissor_br.br_x, + pa_sc_screen_scissor_br_br_y = pa_sc_screen_scissor_br.br_y; + + int32_t tl_x = int32_t(pa_sc_window_scissor_tl_tl_x); + int32_t tl_y = int32_t(pa_sc_window_scissor_tl_tl_y); + + int32_t br_x = int32_t(pa_sc_window_scissor_br_br_x); + int32_t br_y = int32_t(pa_sc_window_scissor_br_br_y); + + __m128i tmp1 = _mm_setr_epi32(tl_x, tl_y, br_x, br_y); + __m128i pa_sc_scissor = _mm_setr_epi32( + pa_sc_screen_scissor_tl_tl_x, 
pa_sc_screen_scissor_tl_tl_y, + pa_sc_screen_scissor_br_br_x, pa_sc_screen_scissor_br_br_y); + __m128i xyoffsetadd = _mm_cvtsi64x_si128( + static_cast(pa_sc_window_offset_window_x_offset) | + (static_cast(pa_sc_window_offset_window_y_offset) + << 32)); + xyoffsetadd = _mm_unpacklo_epi64(xyoffsetadd, xyoffsetadd); + // chrispy: put this here to make it clear that the shift by 31 is extracting + // this field + XE_MAYBE_UNUSED + uint32_t window_offset_disable_reference = + pa_sc_window_scissor_tl.window_offset_disable; + + __m128i offset_disable_mask = _mm_set1_epi32(pa_sc_window_scissor_tl.value); + + __m128i addend = _mm_blendv_epi8(xyoffsetadd, _mm_setzero_si128(), + _mm_srai_epi32(offset_disable_mask, 31)); + + tmp1 = _mm_add_epi32(tmp1, addend); + + //} + // Screen scissor is not used by Direct3D 9 (always 0, 0 to 8192, 8192), but + // still handled here for completeness. + __m128i lomax = _mm_max_epi32(tmp1, pa_sc_scissor); + __m128i himin = _mm_min_epi32(tmp1, pa_sc_scissor); + + tmp1 = _mm_blend_epi16(lomax, himin, 0b11110000); + + if constexpr (clamp_to_surface_pitch) { + // Clamp the horizontal scissor to surface_pitch for safety, in case that's + // not done by the guest for some reason (it's not when doing draws without + // clipping in Direct3D 9, for instance), to prevent overflow - this is + // important for host implementations, both based on target-independent + // rasterization without render target width at all (pixel shader + // interlock-based custom RB implementations) and using conventional render + // targets, but padded to EDRAM tiles. 
+ tmp1 = _mm_blend_epi16( + tmp1, _mm_min_epi32(tmp1, _mm_set1_epi32(surface_pitch)), + 0b00110011); + } + + tmp1 = _mm_max_epi32(tmp1, _mm_setzero_si128()); + + __m128i tl_in_high = _mm_unpacklo_epi64(tmp1, tmp1); + + __m128i final_br = _mm_max_epi32(tmp1, tl_in_high); + final_br = _mm_sub_epi32(final_br, tl_in_high); + __m128i scissor_res = _mm_blend_epi16(tmp1, final_br, 0b11110000); + _mm_storeu_si128((__m128i*)&scissor_out, scissor_res); +#else auto pa_sc_window_scissor_tl = regs.Get(); auto pa_sc_window_scissor_br = regs.Get(); auto pa_sc_window_offset = regs.Get(); @@ -629,6 +711,7 @@ XE_NOINLINE static void GetScissorTmpl(const RegisterFile& XE_RESTRICT regs, scissor_out.offset[1] = uint32_t(tl_y); scissor_out.extent[0] = uint32_t(br_x - tl_x); scissor_out.extent[1] = uint32_t(br_y - tl_y); +#endif } void GetScissor(const RegisterFile& XE_RESTRICT regs,