gpu: shader: implement more instructions

add v_mad_u64_u32, v_mad_i64_i32, image_sample_l
stub s_ttracedata
fix s_andn2_saveexec_b64, s_orn2_saveexec_b64, s_subb_u32
This commit is contained in:
DH 2024-10-17 04:42:08 +03:00
parent f135c0d4b7
commit 32a2c58441

View file

@ -497,7 +497,7 @@ uint32_t v_mbcnt_hi_u32_b32(uint32_t x, uint32_t y) {
return (thread_id > 32 ? bitCount(x & ((1 << (thread_id - 32)) - 1)) : 0) + y;
}
uint32_t v_add_i32(inout uint64_t sdst, int32_t x, int32_t y) {
uint64_t result = uint64_t(x) + uint64_t(y);
uint64_t result = uint64_t(int64_t(x) + int64_t(y));
if (result > 0xffffffff) {
sdst |= exec & (uint64_t(1) << thread_id);
@ -714,7 +714,7 @@ bool v_cmp_class_f64(float64_t x, uint vftypemask) { return CMP_CLASS(x, vftypem
// Legacy multiply-add: when either multiplicand compares equal to zero the
// product is treated as zero and only the addend `c` is returned; otherwise
// the result is fma(a, b, c).
float32_t v_mad_legacy_f32(float32_t a, float32_t b, float32_t c) {
    if (a == 0 || b == 0) {
        return c;
    }
    return fma(a, b, c);
}
// Multiply-add: returns fma(a, b, c), i.e. a * b + c.
float32_t v_mad_f32(float32_t a, float32_t b, float32_t c) {
    float32_t fused = fma(a, b, c);
    return fused;
}
// Signed 24-bit multiply-add: low part of the 24-bit product of `a` and `b`
// (via mul24lo) plus the 32-bit addend `c`.
// Only one definition may exist: a second one differing solely in its return
// type is a redefinition, so the signed variant is the one kept.
int32_t v_mad_i32_i24(int32_t a, int32_t b, int32_t c) {
    return mul24lo(a, b) + c;
}
// Unsigned 24-bit multiply-add: adds `c` to the low part of the 24-bit
// product of `a` and `b` (computed by mul24lo).
uint32_t v_mad_u32_u24(uint32_t a, uint32_t b, uint32_t c) {
    return c + mul24lo(a, b);
}
float32_t v_cubeid_f32(float32_t a, float32_t b, float32_t c) {
if (abs(c) >= abs(a) && abs(c) >= abs(b)) {
@ -964,13 +964,38 @@ uint32_t v_msad_u8(uint32_t x, uint32_t y, uint32_t z) {
// }
// void v_mqsad_u32_u8() {}
// void v_mad_u64_u32() {}
// void v_mad_i64_i32() {}
// V_MAD_U64_U32: returns a * b + c, where a * b is the full 64-bit unsigned
// product of the two 32-bit operands and c is a 64-bit addend.
// Side effect: this thread's bit in `vcc` is set (if the thread is enabled in
// `exec`) when the 64-bit addition carries out, and cleared otherwise.
uint64_t v_mad_u64_u32(uint32_t a, uint32_t b, uint64_t c) {
    // Widen BEFORE multiplying: a uint32_t multiply would discard the upper
    // 32 bits of the product.
    uint64_t mulResult = uint64_t(a) * uint64_t(b);
    uint64_t result = mulResult + c;
    uint64_t thread_mask = uint64_t(1) << thread_id;

    // An unsigned sum wrapped around iff it is smaller than one of its addends.
    if (result < c) {
        vcc |= thread_mask & exec;
    } else {
        vcc &= ~thread_mask;
    }

    return result;
}
// V_MAD_I64_I32: returns a * b + c, where a * b is the full 64-bit signed
// product of the two 32-bit operands and c is a 64-bit addend.
// Side effect: this thread's bit in `vcc` is set (if the thread is enabled in
// `exec`) when the 64-bit signed addition overflows, and cleared otherwise.
int64_t v_mad_i64_i32(int32_t a, int32_t b, int64_t c) {
    // Widen BEFORE multiplying so the product is not truncated to 32 bits.
    int64_t mulResult = int64_t(a) * int64_t(b);
    int64_t result = mulResult + c;
    uint64_t thread_mask = uint64_t(1) << thread_id;

    // Signed overflow happens only when both addends have the same sign and
    // the sign of the sum differs from it. Compare sign bits explicitly
    // rather than comparing a sign against the value itself.
    bool mulNegative = mulResult < 0;
    bool addNegative = c < 0;
    bool resultNegative = result < 0;
    bool overflow = (mulNegative == addNegative) && (resultNegative != mulNegative);

    if (overflow) {
        vcc |= thread_mask & exec;
    } else {
        vcc &= ~thread_mask;
    }

    return result;
}
// SOP
// Scalar condition flag shared by the SOP helpers: the s_cmp_* functions store
// their comparison result here, and s_addc_u32 / s_subb_u32 read it as the
// incoming carry/borrow and overwrite it with the outgoing one.
bool scc;
// Stub for s_ttracedata: accepts a 64-bit operand and intentionally does nothing.
void s_ttracedata(uint64_t) {}
// Sets scc to true when the two signed operands are equal.
void s_cmp_eq_i32(int32_t a, int32_t b) {
    scc = (a == b);
}
// Sets scc to true when a is greater than or equal to b (signed compare).
void s_cmp_ge_i32(int32_t a, int32_t b) {
    scc = !(a < b);
}
// Sets scc to true when a is strictly greater than b (signed compare).
void s_cmp_gt_i32(int32_t a, int32_t b) {
    scc = (b < a);
}
@ -1128,13 +1153,13 @@ uint64_t s_xor_saveexec_b64(uint64_t x) {
}
// S_ANDN2_SAVEEXEC_B64: saves the current exec mask, then sets
// exec = x & ~(old exec). Returns the saved (old) exec mask.
// scc reflects whether the NEW exec mask is non-zero.
uint64_t s_andn2_saveexec_b64(uint64_t x) {
    uint64_t result = exec;
    // Operand order matters: the source is kept, the old exec is inverted.
    exec = x & ~result;
    scc = exec != 0;
    return result;
}
// S_ORN2_SAVEEXEC_B64: saves the current exec mask, then sets
// exec = x | ~(old exec). Returns the saved (old) exec mask.
// scc reflects whether the NEW exec mask is non-zero.
uint64_t s_orn2_saveexec_b64(uint64_t x) {
    uint64_t result = exec;
    // Operand order matters: the source is kept, the old exec is inverted.
    exec = x | ~result;
    scc = exec != 0;
    return result;
}
@ -1197,18 +1222,14 @@ int32_t s_sub_i32(int32_t x, int32_t y) {
return result;
}
// S_ADDC_U32: returns the low 32 bits of x + y + scc (scc is the carry-in).
// Side effect: scc is overwritten with the carry-out of that addition.
uint32_t s_addc_u32(uint32_t x, uint32_t y) {
    // Do the whole sum in 64 bits so the carry ends up in bit 32.
    uint64_t carryIn = scc ? uint64_t(1) : uint64_t(0);
    uint64_t result = uint64_t(x) + uint64_t(y) + carryIn;
    // Test the upper half directly; this does not depend on how an unsuffixed
    // 0xffffffff literal would be promoted in a 64-bit comparison.
    scc = (result >> 32) != 0;
    return uint32_t(result);
}
// S_SUBB_U32: returns x - y - scc modulo 2^32 (scc is the borrow-in).
// Side effect: scc is overwritten with the unsigned borrow-out, i.e. whether
// y + borrow-in exceeds x when evaluated without wrap-around.
uint32_t s_subb_u32(uint32_t x, uint32_t y) {
    // Latch the incoming borrow first; scc is overwritten below.
    uint32_t borrowIn = scc ? 1u : 0u;
    uint32_t result = x - y - borrowIn;
    // Evaluate the subtrahend in 64 bits: in 32 bits y + borrowIn wraps to 0
    // for y == 0xffffffff with a borrow-in, which would hide the borrow.
    scc = (uint64_t(y) + uint64_t(borrowIn)) > uint64_t(x);
    return result;
}
int32_t s_min_i32(int32_t x, int32_t y) {
@ -2779,7 +2800,58 @@ void image_sample(inout f32vec4 vdata, f32vec3 vaddr, int32_t textureIndexHint,
// image_sample_cl
// image_sample_d
// image_sample_d_cl
// image_sample_l
// image_sample_l: samples a texture at an explicit level of detail.
//
// The texture type is read from the descriptor `tbuffer`; it selects which
// texture array is sampled and how `vaddr` is split into coordinates and LOD:
//   1D family: coordinate vaddr.x,   LOD vaddr.y
//   2D family: coordinate vaddr.xy,  LOD vaddr.z
//   3D:        coordinate vaddr.xyz, LOD vaddr.w
// The sampled texel is swizzled with the descriptor's dst_sel fields, and the
// channels enabled in `dmask` are written to consecutive components of `vdata`.
// For an unrecognised texture type the function returns without touching `vdata`.
void image_sample_l(inout f32vec4 vdata, f32vec4 vaddr, int32_t textureIndexHint, uint32_t tbuffer[8], int32_t samplerIndexHint, u32vec4 ssampler, uint32_t dmask) {
uint8_t textureType = tbuffer_type(tbuffer);
f32vec4 result;
switch (uint(textureType)) {
case kTextureType1D:
case kTextureTypeArray1D:
// NOTE(review): the array variant is sampled as a plain 1D texture here; no
// layer coordinate is consumed - confirm this is intended.
result = textureLod(
sampler1D(
textures1D[findTexture1DIndex(textureIndexHint, tbuffer)],
samplers[findSamplerIndex(samplerIndexHint, ssampler)]
), vaddr.x, vaddr.y);
break;
case kTextureType2D:
case kTextureTypeCube:
case kTextureTypeArray2D:
case kTextureTypeMsaa2D:
case kTextureTypeMsaaArray2D:
// NOTE(review): cube, array and MSAA types all go through sampler2D with only
// vaddr.xy as coordinate; no face/layer/sample index is consumed - confirm.
result = textureLod(
sampler2D(
textures2D[findTexture2DIndex(textureIndexHint, tbuffer)],
samplers[findSamplerIndex(samplerIndexHint, ssampler)]
), vaddr.xy, vaddr.z);
break;
case kTextureType3D:
result = textureLod(
sampler3D(
textures3D[findTexture3DIndex(textureIndexHint, tbuffer)],
samplers[findSamplerIndex(samplerIndexHint, ssampler)]
), vaddr.xyz, vaddr.w);
break;
default:
// Unknown texture type: leave vdata unchanged.
return;
}
// Reorder the sampled channels according to the descriptor's destination selects.
result = swizzle(result,
tbuffer_dst_sel_x(tbuffer),
tbuffer_dst_sel_y(tbuffer),
tbuffer_dst_sel_z(tbuffer),
tbuffer_dst_sel_w(tbuffer));
// Pack only the channels enabled in dmask, in order, into vdata[0..].
int vdataIndex = 0;
for (int i = 0; i < 4; ++i) {
if ((dmask & (1 << i)) != 0) {
vdata[vdataIndex++] = result[i];
}
}
}
// image_sample_b
// image_sample_b_cl
// image_sample_lz