gpu2: shaders: implement initial values for cs

This commit is contained in:
DH 2024-10-01 19:29:33 +03:00
parent 59946fe927
commit d099439a32
3 changed files with 154 additions and 14 deletions

View file

@ -16,6 +16,9 @@ enum Op {
PS_INPUT_VGPR,
PS_COMP_SWAP,
VS_GET_INDEX,
CS_INPUT_SGPR,
CS_SET_INITIAL_EXEC,
CS_SET_THREAD_ID,
RESOURCE_PHI,
OpCount,
@ -49,6 +52,12 @@ inline const char *getInstructionName(unsigned op) {
return "ps_comp_swap";
case VS_GET_INDEX:
return "vs_get_index";
case CS_INPUT_SGPR:
return "cs_input_sgpr";
case CS_SET_INITIAL_EXEC:
return "cs_set_initial_exec";
case CS_SET_THREAD_ID:
return "cs_set_thread_id";
case RESOURCE_PHI:
return "resource_phi";
}

View file

@ -241,6 +241,60 @@ float32_t ps_input_vgpr(int32_t index, f32vec4 fragCoord, bool frontFace) {
return 0;
}
uint32_t cs_input_sgpr(int32_t index, u32vec3 localInvocationId) {
if (index == 0) {
return localInvocationId.x;
}
if (index == 1) {
return localInvocationId.y;
}
if (index == 2) {
return localInvocationId.z;
}
return 0;
}
void cs_set_initial_exec(u32vec3 localInvocationId, u32vec3 workgroupSize) {
uint32_t totalWorkgroupSize = workgroupSize.x * workgroupSize.y * workgroupSize.z;
if (totalWorkgroupSize == 64) {
exec = ~uint64_t(0);
return;
}
if (totalWorkgroupSize < 64) {
exec = (uint64_t(1) << totalWorkgroupSize) - 1;
return;
}
uint32_t waveCount = totalWorkgroupSize / 64;
uint32_t totalInvocationIndex = localInvocationId.x +
localInvocationId.y * workgroupSize.x +
localInvocationId.z * workgroupSize.x * workgroupSize.y;
uint32_t waveIndex = (totalInvocationIndex + 63) / 64;
if (waveIndex + 1 < waveCount) {
exec = ~uint64_t(0);
return;
}
uint32_t lastWaveLen = totalWorkgroupSize % 64;
exec = lastWaveLen == 0 ? ~uint64_t(0) : ((uint64_t(1) << lastWaveLen) - 1);
}
void cs_set_thread_id(u32vec3 localInvocationId, u32vec3 workgroupSize) {
uint32_t totalInvocationIndex = localInvocationId.x +
localInvocationId.y * workgroupSize.x +
localInvocationId.z * workgroupSize.x * workgroupSize.y;
thread_id = totalInvocationIndex % 64;
}
const uint32_t kPrimTypeQuadList = 0x13;
const uint32_t kPrimTypeQuadStrip = 0x14;

View file

@ -1231,7 +1231,7 @@ static void instructionsToSpv(GcnConverter &converter, gcn::Import &importer,
auto memorySSA = buildMemorySSA(cfg, &moduleInfo);
spv::Import resourceImporter;
memorySSA.print(std::cerr, body, context.ns);
// memorySSA.print(std::cerr, body, context.ns);
ResourcesBuilder resourcesBuilder;
std::map<ir::Value, std::int32_t> resourceConfigSlots;
@ -1324,8 +1324,8 @@ static void instructionsToSpv(GcnConverter &converter, gcn::Import &importer,
}
}
static void createEntryPoint(gcn::Context &context, gcn::Stage stage,
ir::Region &&body) {
static void createEntryPoint(gcn::Context &context, const gcn::Environment &env,
gcn::Stage stage, ir::Region &&body) {
auto executionModel = ir::spv::ExecutionModel::GLCompute;
switch (stage) {
@ -1408,6 +1408,17 @@ static void createEntryPoint(gcn::Context &context, gcn::Stage stage,
mainFn.getLocation(), mainFn,
ir::spv::ExecutionMode::OriginUpperLeft());
}
if (executionModel == ir::spv::ExecutionModel::GLCompute) {
auto executionModes = gcn::Builder::createAppend(
context, context.layout.getOrCreateExecutionModes(context));
executionModes.createSpvExecutionMode(
mainFn.getLocation(), mainFn,
ir::spv::ExecutionMode::LocalSize(env.numThreadX, env.numThreadY,
env.numThreadZ));
}
entryPoints.createSpvEntryPoint(mainFn.getLocation(), executionModel, mainFn,
"main", interfaceList);
}
@ -1457,13 +1468,9 @@ static void createInitialValues(GcnConverter &converter,
} else if (stage == gcn::Stage::Ps) {
auto boolT = context.getTypeBool();
auto f32T = context.getTypeFloat32();
auto s32T = context.getTypeSInt32();
auto f32x3 = context.getTypeVector(f32T, 3);
auto f32x4 = context.getTypeVector(f32T, 4);
auto boolPT = context.getTypePointer(ir::spv::StorageClass::Input, boolT);
auto s32PT = context.getTypePointer(ir::spv::StorageClass::Input, s32T);
auto f32x3PT = context.getTypePointer(ir::spv::StorageClass::Input, f32x3);
auto f32x4PT = context.getTypePointer(ir::spv::StorageClass::Input, f32x4);
auto globals = gcn::Builder::createAppend(
@ -1517,16 +1524,86 @@ static void createInitialValues(GcnConverter &converter,
builder.createSpvBitcast(loc, context.getTypeSInt32(), runtimeIndex));
auto vgprValue = builder.createValue(loc, ir::amdgpu::PS_INPUT_VGPR,
std::span<const ir::Operand>{{
context.getTypeFloat32(),
indexLocal,
fragCoord,
frontFace,
}});
context.getTypeFloat32(), indexLocal,
fragCoord, frontFace);
context.writeReg(loc, builder, gcn::RegId::Vgpr, i, vgprValue);
}
}
if (stage == gcn::Stage::Cs) {
auto uintT = context.getTypeUInt32();
auto uvec3T = context.getTypeVector(uintT, 3);
auto pInputUVec3T =
context.getTypePointer(ir::spv::StorageClass::Input, uvec3T);
auto globals = gcn::Builder::createAppend(
context, context.layout.getOrCreateGlobals(context));
auto annotations = gcn::Builder::createAppend(
context, context.layout.getOrCreateAnnotations(context));
auto workGroupIdVar = globals.createSpvVariable(
loc, pInputUVec3T, ir::spv::StorageClass::Input);
annotations.createSpvDecorate(
loc, workGroupIdVar,
ir::spv::Decoration::BuiltIn(ir::spv::BuiltIn::WorkgroupId));
auto localInvocationIdVar = globals.createSpvVariable(
loc, pInputUVec3T, ir::spv::StorageClass::Input);
annotations.createSpvDecorate(
loc, localInvocationIdVar,
ir::spv::Decoration::BuiltIn(ir::spv::BuiltIn::LocalInvocationId));
auto workGroupId = builder.createSpvLoad(loc, uvec3T, workGroupIdVar);
auto workGroupIdLocalVar =
converter.createLocalVariable(builder, loc, workGroupId);
auto localInvocationId =
builder.createSpvLoad(loc, uvec3T, localInvocationIdVar);
auto localInvocationIdLocVar =
converter.createLocalVariable(builder, loc, localInvocationId);
{
auto indexLocal =
converter.createLocalVariable(builder, loc, context.simm32(0));
int end = env.sgprCount;
end = std::min<int>(end, env.userSgprs.size() +
static_cast<int>(gcn::CsSGprInput::Count));
for (int i = env.userSgprs.size(); i < end; ++i) {
std::uint32_t slot =
info.create(gcn::ConfigType::CsInputSGpr, i - env.userSgprs.size());
auto runtimeIndex = converter.createReadConfig(stage, builder, slot);
builder.createSpvStore(loc, indexLocal,
builder.createSpvBitcast(
loc, context.getTypeSInt32(), runtimeIndex));
auto sgprValue = builder.createValue(loc, ir::amdgpu::CS_INPUT_SGPR,
context.getTypeUInt32(),
indexLocal, workGroupIdLocalVar);
context.writeReg(loc, builder, gcn::RegId::Sgpr, i, sgprValue);
}
}
for (std::int32_t i = 0; i < 3; ++i) {
auto value = builder.createSpvCompositeExtract(loc, uintT,
localInvocationId, {{i}});
context.writeReg(loc, builder, gcn::RegId::Vgpr, i, value);
}
auto workgroupSize = builder.createSpvCompositeConstruct(
loc, uvec3T,
{{context.imm32(env.numThreadX), context.imm32(env.numThreadY),
context.imm32(env.numThreadZ)}});
auto workgroupSizeLocVar =
converter.createLocalVariable(builder, loc, workgroupSize);
builder.createValue(loc, ir::amdgpu::CS_SET_INITIAL_EXEC,
context.getTypeVoid(), localInvocationIdLocVar,
workgroupSizeLocVar);
builder.createValue(loc, ir::amdgpu::CS_SET_THREAD_ID,
context.getTypeVoid(), localInvocationIdLocVar,
workgroupSizeLocVar);
}
context.writeReg(loc, builder, gcn::RegId::Vcc, 0, context.imm64(0));
for (int word = 0; word < 2; ++word) {
@ -1561,7 +1638,7 @@ gcn::convertToSpv(Context &context, ir::Region body,
context.imm32(0));
}
createEntryPoint(context, stage, std::move(body));
createEntryPoint(context, env, stage, std::move(body));
for (int userSgpr = std::countr_zero(context.requiredUserSgprs);
userSgpr < 32;