Partially vectorized GetScissor (loading and unpacking the bitfields from the registers is still scalar)

This commit is contained in:
chss95cs@gmail.com 2022-12-14 09:33:14 -08:00
parent ab6d9dade0
commit 080b6f4cbd
2 changed files with 92 additions and 4 deletions

View file

@ -2193,7 +2193,6 @@ D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
REFRESH_MSVC_RANGE();
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, 65536,
REGULAR_WRITE_CALLBACK);
}
template <uint32_t register_lower_bound, uint32_t register_upper_bound>
XE_FORCEINLINE void
@ -2799,11 +2798,17 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
// todo: use SIMD for getscissor + scaling here, should reduce code size more
draw_util::Scissor scissor;
draw_util::GetScissor(regs, scissor);
#if XE_ARCH_AMD64 == 1
// Apply the draw resolution scale to offset.xy and extent.xy with a single
// 4-lane 32-bit multiply (lanes: offset x, offset y, extent x, extent y - the
// same four fields the scalar fallback below scales).
// The scissor is not known to be 16-byte aligned here, so it must not be
// dereferenced directly as an __m128i (the compiler may then emit an aligned
// 16-byte access). Use explicit unaligned load/store instead.
__m128i* scisp = reinterpret_cast<__m128i*>(&scissor);
_mm_storeu_si128(
    scisp,
    _mm_mullo_epi32(
        _mm_loadu_si128(scisp),
        _mm_setr_epi32(draw_resolution_scale_x, draw_resolution_scale_y,
                       draw_resolution_scale_x, draw_resolution_scale_y)));
#else
// Scalar fallback: scale each component individually.
scissor.offset[0] *= draw_resolution_scale_x;
scissor.offset[1] *= draw_resolution_scale_y;
scissor.extent[0] *= draw_resolution_scale_x;
scissor.extent[1] *= draw_resolution_scale_y;
#endif
// Update viewport, scissor, blend factor and stencil reference.
UpdateFixedFunctionState(viewport_info, scissor, primitive_polygonal,
normalized_depth_control);

View file

@ -552,8 +552,90 @@ void GetHostViewportInfo(GetViewportInfoArgs* XE_RESTRICT args,
}
}
// Computes the scissor rectangle from the PA_SC window scissor, window offset
// and screen scissor registers and writes it to scissor_out as an offset
// (top-left) and an extent (bottom-right minus top-left).
// When clamp_to_surface_pitch is true, the horizontal bounds are additionally
// clamped to RB_SURFACE_INFO.surface_pitch.
template <bool clamp_to_surface_pitch>
XE_NOINLINE static void GetScissorTmpl(const RegisterFile& XE_RESTRICT regs,
Scissor& XE_RESTRICT scissor_out) {
static inline
void GetScissorTmpl(const RegisterFile& XE_RESTRICT regs,
Scissor& XE_RESTRICT scissor_out) {
#if XE_ARCH_AMD64 == 1
// Vectorized path: the rectangle is kept in one vector with lanes
// (tl_x, tl_y, br_x, br_y). Loading the registers and unpacking their
// bitfields is still done with scalar code.
// NOTE: _mm_max_epi32 / _mm_min_epi32 / _mm_blendv_epi8 / _mm_blend_epi16 are
// SSE4.1 intrinsics.
auto pa_sc_window_scissor_tl = regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>();
auto pa_sc_window_scissor_br = regs.Get<reg::PA_SC_WINDOW_SCISSOR_BR>();
auto pa_sc_window_offset = regs.Get<reg::PA_SC_WINDOW_OFFSET>();
auto pa_sc_screen_scissor_tl = regs.Get<reg::PA_SC_SCREEN_SCISSOR_TL>();
auto pa_sc_screen_scissor_br = regs.Get<reg::PA_SC_SCREEN_SCISSOR_BR>();
uint32_t surface_pitch = 0;
if constexpr (clamp_to_surface_pitch) {
surface_pitch = regs.Get<reg::RB_SURFACE_INFO>().surface_pitch;
}
// Copy every bitfield into a plain 32-bit local before packing into vectors.
uint32_t pa_sc_window_scissor_tl_tl_x = pa_sc_window_scissor_tl.tl_x,
pa_sc_window_scissor_tl_tl_y = pa_sc_window_scissor_tl.tl_y,
pa_sc_window_scissor_br_br_x = pa_sc_window_scissor_br.br_x,
pa_sc_window_scissor_br_br_y = pa_sc_window_scissor_br.br_y,
pa_sc_window_offset_window_x_offset =
pa_sc_window_offset.window_x_offset,
pa_sc_window_offset_window_y_offset =
pa_sc_window_offset.window_y_offset,
pa_sc_screen_scissor_tl_tl_x = pa_sc_screen_scissor_tl.tl_x,
pa_sc_screen_scissor_tl_tl_y = pa_sc_screen_scissor_tl.tl_y,
pa_sc_screen_scissor_br_br_x = pa_sc_screen_scissor_br.br_x,
pa_sc_screen_scissor_br_br_y = pa_sc_screen_scissor_br.br_y;
int32_t tl_x = int32_t(pa_sc_window_scissor_tl_tl_x);
int32_t tl_y = int32_t(pa_sc_window_scissor_tl_tl_y);
int32_t br_x = int32_t(pa_sc_window_scissor_br_br_x);
int32_t br_y = int32_t(pa_sc_window_scissor_br_br_y);
// Window scissor rectangle, lanes (tl_x, tl_y, br_x, br_y).
__m128i tmp1 = _mm_setr_epi32(tl_x, tl_y, br_x, br_y);
// Screen scissor rectangle in the same lane order.
__m128i pa_sc_scissor = _mm_setr_epi32(
pa_sc_screen_scissor_tl_tl_x, pa_sc_screen_scissor_tl_tl_y,
pa_sc_screen_scissor_br_br_x, pa_sc_screen_scissor_br_br_y);
// Pack (x_offset, y_offset) into the low 64 bits, then duplicate them into the
// high 64 bits so the same offset lines up with both the tl and the br lanes.
__m128i xyoffsetadd = _mm_cvtsi64x_si128(
static_cast<unsigned long long>(pa_sc_window_offset_window_x_offset) |
(static_cast<unsigned long long>(pa_sc_window_offset_window_y_offset)
<< 32));
xyoffsetadd = _mm_unpacklo_epi64(xyoffsetadd, xyoffsetadd);
// chrispy: put this here to make it clear that the shift by 31 is extracting
// this field
XE_MAYBE_UNUSED
uint32_t window_offset_disable_reference =
pa_sc_window_scissor_tl.window_offset_disable;
// Broadcast the raw register value; the arithmetic shift right by 31 smears the
// top bit over each lane, giving an all-ones mask when it is set. blendv then
// picks zero instead of the window offset in that case, so the add below is a
// no-op when the window offset is disabled.
__m128i offset_disable_mask = _mm_set1_epi32(pa_sc_window_scissor_tl.value);
__m128i addend = _mm_blendv_epi8(xyoffsetadd, _mm_setzero_si128(),
_mm_srai_epi32(offset_disable_mask, 31));
tmp1 = _mm_add_epi32(tmp1, addend);
//}
// Screen scissor is not used by Direct3D 9 (always 0, 0 to 8192, 8192), but
// still handled here for completeness.
// Intersect with the screen scissor: the top-left takes the per-lane max, the
// bottom-right takes the per-lane min. Blend mask 0b11110000 selects the upper
// four 16-bit words (the br lanes) from himin and the tl lanes from lomax.
__m128i lomax = _mm_max_epi32(tmp1, pa_sc_scissor);
__m128i himin = _mm_min_epi32(tmp1, pa_sc_scissor);
tmp1 = _mm_blend_epi16(lomax, himin, 0b11110000);
if constexpr (clamp_to_surface_pitch) {
// Clamp the horizontal scissor to surface_pitch for safety, in case that's
// not done by the guest for some reason (it's not when doing draws without
// clipping in Direct3D 9, for instance), to prevent overflow - this is
// important for host implementations, both based on target-independent
// rasterization without render target width at all (pixel shader
// interlock-based custom RB implementations) and using conventional render
// targets, but padded to EDRAM tiles.
// Blend mask 0b00110011 selects the 16-bit words of lanes 0 and 2 (tl_x and
// br_x) from the clamped vector, leaving the y lanes untouched.
tmp1 = _mm_blend_epi16(
tmp1, _mm_min_epi32(tmp1, _mm_set1_epi32(surface_pitch)),
0b00110011);
}
// Clamp all four coordinates to be non-negative.
tmp1 = _mm_max_epi32(tmp1, _mm_setzero_si128());
// Duplicate tl into the upper half so that br can be forced to be >= tl and
// then turned into an extent (br - tl) with lane-wise operations.
__m128i tl_in_high = _mm_unpacklo_epi64(tmp1, tmp1);
__m128i final_br = _mm_max_epi32(tmp1, tl_in_high);
final_br = _mm_sub_epi32(final_br, tl_in_high);
// Result lanes: (tl_x, tl_y, br_x - tl_x, br_y - tl_y).
__m128i scissor_res = _mm_blend_epi16(tmp1, final_br, 0b11110000);
// Unaligned 16-byte store over the whole structure.
// NOTE(review): relies on Scissor being exactly offset[0], offset[1],
// extent[0], extent[1] as consecutive 32-bit values (the fields the scalar
// path assigns) - confirm against the Scissor declaration.
_mm_storeu_si128((__m128i*)&scissor_out, scissor_res);
#else
// Scalar fallback path.
auto pa_sc_window_scissor_tl = regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>();
auto pa_sc_window_scissor_br = regs.Get<reg::PA_SC_WINDOW_SCISSOR_BR>();
auto pa_sc_window_offset = regs.Get<reg::PA_SC_WINDOW_OFFSET>();
@ -629,6 +711,7 @@ XE_NOINLINE static void GetScissorTmpl(const RegisterFile& XE_RESTRICT regs,
scissor_out.offset[1] = uint32_t(tl_y);
scissor_out.extent[0] = uint32_t(br_x - tl_x);
scissor_out.extent[1] = uint32_t(br_y - tl_y);
#endif
}
void GetScissor(const RegisterFile& XE_RESTRICT regs,