#ifdef LLVM_AVAILABLE

#include "CPUTranslator.h"

#include "util/v128.hpp"
#include "util/simd.hpp"

// Process-wide LLVM context; the cpu_translator constructor binds m_context to it.
llvm::LLVMContext g_llvm_ctx;

// Strip any chain of BitCast instructions from `arg` and return the underlying value.
// A null `arg` is returned unchanged (dyn_cast_or_null tolerates it).
llvm::Value* peek_through_bitcasts(llvm::Value* arg)
{
	llvm::CastInst* i;

	while ((i = llvm::dyn_cast_or_null<llvm::CastInst>(arg)) && i->getOpcode() == llvm::Instruction::BitCast)
	{
		arg = i->getOperand(0);
	}

	return arg;
}

// Construct the translator for `_module` and register the lowering callbacks for the
// pseudo-intrinsics "x86_pshufb" and "any_select_by_bit4". The callbacks run later, from
// replace_intrinsics(), with m_ir positioned at the call being replaced.
cpu_translator::cpu_translator(llvm::Module* _module, bool is_be)
	: m_context(g_llvm_ctx)
	, m_module(_module)
	, m_is_be(is_be)
{
	register_intrinsic("x86_pshufb", [&](llvm::CallInst* ci) -> llvm::Value*
	{
		const auto data0 = ci->getOperand(0);
		const auto index = ci->getOperand(1);
		const auto zeros = llvm::ConstantAggregateZero::get(get_type<u8[16]>());

		if (m_use_ssse3)
		{
#if defined(ARCH_X64)
			return m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::x86_ssse3_pshuf_b_128), {data0, index});
#elif defined(ARCH_ARM64)
			// Modified from sse2neon
			// movi    v2.16b, #143
			// and     v1.16b, v1.16b, v2.16b
			// tbl     v0.16b, { v0.16b }, v1.16b
			auto mask = llvm::ConstantInt::get(get_type<u8[16]>(), 0x8F);
			auto and_mask = llvm::ConstantInt::get(get_type<bool[16]>(), true);
			auto vec_len = llvm::ConstantInt::get(get_type<u32>(), 16);
			auto index_masked = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::vp_and), {index, mask, and_mask, vec_len});
			return m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl1), {data0, index_masked});
#else
#error "Unimplemented"
#endif
		}
		else
		{
			// Emulate PSHUFB (TODO)
			// Builds a 16-iteration loop that gathers data0[index[i] & 0xf] into lane i,
			// then zeroes every lane whose index byte is negative.
			const auto mask = m_ir->CreateAnd(index, 0xf);
			const auto loop = llvm::BasicBlock::Create(m_context, "", m_ir->GetInsertBlock()->getParent());
			const auto prev = ci->getParent();
			const auto next = prev->splitBasicBlock(ci->getNextNode());

			// splitBasicBlock() terminated `prev` with a branch to `next`; retarget it to the loop
			llvm::cast<llvm::BranchInst>(m_ir->GetInsertBlock()->getTerminator())->setOperand(0, loop);

			llvm::Value* result;
			//m_ir->CreateBr(loop);
			m_ir->SetInsertPoint(loop);
			const auto i = m_ir->CreatePHI(get_type<u32>(), 2);
			const auto v = m_ir->CreatePHI(get_type<u8[16]>(), 2);
			const auto i_next = m_ir->CreateAdd(i, m_ir->getInt32(1));
			i->addIncoming(m_ir->getInt32(0), prev);
			i->addIncoming(i_next, loop);
			v->addIncoming(zeros, prev);
			result = m_ir->CreateInsertElement(v, m_ir->CreateExtractElement(data0, m_ir->CreateExtractElement(mask, i)), i);
			v->addIncoming(result, loop);

			// Branch on the incremented index: testing `i` itself would run a 17th pass with
			// i == 16, and an out-of-range extract/insert turns the whole result into poison.
			m_ir->CreateCondBr(m_ir->CreateICmpULT(i_next, m_ir->getInt32(16)), loop, next);
			m_ir->SetInsertPoint(next->getFirstNonPHI());
			result = m_ir->CreateSelect(m_ir->CreateICmpSLT(index, zeros), zeros, result);

			return result;
		}
	});

	register_intrinsic("any_select_by_bit4", [&](llvm::CallInst* ci) -> llvm::Value*
	{
		// Shifting left by 3 moves bit 4 of every byte into that byte's sign bit,
		// so a signed "< 0" compare per byte selects on bit 4 of operand 0.
		const auto s = bitcast<u8[16]>(m_ir->CreateShl(bitcast<u64[2]>(ci->getOperand(0)), 3));
		const auto a = bitcast<u8[16]>(ci->getOperand(1));
		const auto b = bitcast<u8[16]>(ci->getOperand(2));
		return m_ir->CreateSelect(m_ir->CreateICmpSLT(s, llvm::ConstantAggregateZero::get(get_type<u8[16]>())), b, a);
	});
}

// Bind the translator to `context` / `engine` and derive the m_use_* feature flags from the
// target CPU *name* reported by the engine's target machine (not from real feature bits,
// hence the TODO markers below).
void cpu_translator::initialize(llvm::LLVMContext& context, llvm::ExecutionEngine& engine)
{
	m_context = context;
	m_engine = &engine;

	const auto cpu = m_engine->getTargetMachine()->getTargetCPU();

	// Test SSSE3 feature (TODO)
	if (cpu == "generic" ||
		cpu == "k8" ||
		cpu == "opteron" ||
		cpu == "athlon64" ||
		cpu == "athlon-fx" ||
		cpu == "k8-sse3" ||
		cpu == "opteron-sse3" ||
		cpu == "athlon64-sse3" ||
		cpu == "amdfam10" ||
		cpu == "barcelona")
	{
		m_use_ssse3 = false;
	}

	// Test AVX feature (TODO)
	if (cpu == "sandybridge" ||
		cpu == "ivybridge" ||
		cpu == "bdver1")
	{
		m_use_avx = true;
	}

	// Test FMA feature (TODO)
	if (cpu == "haswell" ||
		cpu == "broadwell" ||
		cpu == "skylake" ||
		cpu == "alderlake" ||
		cpu == "raptorlake" ||
		cpu == "meteorlake" ||
		cpu == "bdver2" ||
		cpu == "bdver3" ||
		cpu == "bdver4" ||
		cpu == "znver1" ||
		cpu == "znver2" ||
		cpu == "znver3")
	{
		m_use_fma = true;
		m_use_avx = true;
	}

	// Test AVX-512 feature (TODO)
	if (cpu == "skylake-avx512" ||
		cpu == "cascadelake" ||
		cpu == "cannonlake" ||
		cpu == "cooperlake")
	{
		m_use_avx = true;
		m_use_fma = true;
		m_use_avx512 = true;
	}

	// Test VNNI feature (TODO)
	if (cpu == "cascadelake" ||
		cpu == "cooperlake" ||
		cpu == "alderlake" ||
		cpu == "raptorlake" ||
		cpu == "meteorlake")
	{
		m_use_vnni = true;
	}

	// Test GFNI feature (TODO)
	if (cpu == "tremont" ||
		cpu == "gracemont" ||
		cpu == "alderlake" ||
		cpu == "raptorlake" ||
		cpu == "meteorlake")
	{
		m_use_gfni = true;
	}

	// Test AVX-512_icelake features (TODO)
	if (cpu == "icelake" ||
		cpu == "icelake-client" ||
		cpu == "icelake-server" ||
		cpu == "tigerlake" ||
		cpu == "rocketlake" ||
		cpu == "sapphirerapids" ||
		(cpu.startswith("znver") && cpu != "znver1" && cpu != "znver2" && cpu != "znver3"))
	{
		m_use_avx = true;
		m_use_fma = true;
		m_use_avx512 = true;
		m_use_avx512_icl = true;
		m_use_vnni = true;
		m_use_gfni = true;
	}

	// Aarch64 CPUs
	if (cpu == "cyclone" || cpu.contains("cortex"))
	{
		m_use_fma = true;
		// AVX does not use intrinsics so far
		m_use_avx = true;
	}
}

// Reinterpret `val` as `type`. Both types must have the same total bit width (scalar width
// times element count for vectors), otherwise an exception is thrown. Constants are folded
// immediately; everything else gets a BitCast instruction at the current insert point.
llvm::Value* cpu_translator::bitcast(llvm::Value* val, llvm::Type* type) const
{
	uint s1 = type->getScalarSizeInBits();
	uint s2 = val->getType()->getScalarSizeInBits();

	if (type->isVectorTy())
		s1 *= llvm::cast<llvm::FixedVectorType>(type)->getNumElements();
	if (val->getType()->isVectorTy())
		s2 *= llvm::cast<llvm::FixedVectorType>(val->getType())->getNumElements();

	if (s1 != s2)
	{
		fmt::throw_exception("cpu_translator::bitcast(): incompatible type sizes (%u vs %u)", s1, s2);
	}

	if (const auto c1 = llvm::dyn_cast<llvm::Constant>(val))
	{
		return ensure(llvm::ConstantFoldCastOperand(llvm::Instruction::BitCast, c1, type, m_module->getDataLayout()));
	}

	return m_ir->CreateBitCast(val, type);
}

// Try to read `c` as a 128-bit constant.
// Returns {false, zero} when `c` is not a constant at all, {true, value} on success.
// Throws (tagged with `_pos` / `_line`) when `c` is a constant of an unsupported shape:
// a non-vector other than a 128-bit integer, a vector that is not 128 bits wide, a constant
// expression, or an unsupported element type.
template <>
std::pair<bool, v128> cpu_translator::get_const_vector<v128>(llvm::Value* c, u32 _pos, u32 _line)
{
	v128 result{};

	if (!llvm::isa<llvm::Constant>(c))
	{
		return {false, result};
	}

	const auto t = c->getType();

	if (!t->isVectorTy())
	{
		if (const auto ci = llvm::dyn_cast<llvm::ConstantInt>(c); ci && ci->getBitWidth() == 128)
		{
			const auto& cv = ci->getValue();

			result._u64[0] = cv.extractBitsAsZExtValue(64, 0);
			result._u64[1] = cv.extractBitsAsZExtValue(64, 64);

			return {true, result};
		}

		fmt::throw_exception("[0x%x, %u] Not a vector", _pos, _line);
	}

	if (auto v = llvm::cast<llvm::FixedVectorType>(t); v->getScalarSizeInBits() * v->getNumElements() != 128)
	{
		fmt::throw_exception("[0x%x, %u] Bad vector size: i%ux%u", _pos, _line, v->getScalarSizeInBits(), v->getNumElements());
	}

	const auto cv = llvm::dyn_cast<llvm::ConstantDataVector>(c);

	if (!cv)
	{
		if (llvm::isa<llvm::ConstantAggregateZero>(c))
		{
			// All-zero vector: `result` is already value-initialized
			return {true, result};
		}

		// Render the offending constant for the error message
		std::string dump;
		llvm::raw_string_ostream out(dump);
		c->print(out, true);
		out.flush();

		if (llvm::isa<llvm::ConstantExpr>(c))
		{
			// Sorry, if we cannot evaluate it we cannot use it
			fmt::throw_exception("[0x%x, %u] Constant Expression!\n%s", _pos, _line, dump);
		}

		fmt::throw_exception("[0x%x, %u] Unexpected constant type!\n%s", _pos, _line, dump);
	}

	const auto sct = t->getScalarType();

	if (sct->isIntegerTy(8))
	{
		for (u32 i = 0; i < 16; i++)
		{
			result._u8[i] = static_cast<u8>(cv->getElementAsInteger(i));
		}
	}
	else if (sct->isIntegerTy(16))
	{
		for (u32 i = 0; i < 8; i++)
		{
			result._u16[i] = static_cast<u16>(cv->getElementAsInteger(i));
		}
	}
	else if (sct->isIntegerTy(32))
	{
		for (u32 i = 0; i < 4; i++)
		{
			result._u32[i] = static_cast<u32>(cv->getElementAsInteger(i));
		}
	}
	else if (sct->isIntegerTy(64))
	{
		for (u32 i = 0; i < 2; i++)
		{
			result._u64[i] = cv->getElementAsInteger(i);
		}
	}
	else if (sct->isFloatTy())
	{
		for (u32 i = 0; i < 4; i++)
		{
			result._f[i] = cv->getElementAsFloat(i);
		}
	}
	else if (sct->isDoubleTy())
	{
		for (u32 i = 0; i < 2; i++)
		{
			result._d[i] = cv->getElementAsDouble(i);
		}
	}
	else
	{
		fmt::throw_exception("[0x%x, %u] Unexpected vector element type", _pos, _line);
	}

	return {true, result};
}

// Build an LLVM constant of type `t` from the 128 bits in `v`.
// `t` must be either a 128-bit integer type or a 128-bit vector of i8/i16/i32/i64/float/double;
// any other element type throws, reporting `_line`.
template <>
llvm::Constant* cpu_translator::make_const_vector<v128>(v128 v, llvm::Type* t, u32 _line)
{
	if (const auto ct = llvm::dyn_cast<llvm::IntegerType>(t); ct && ct->getBitWidth() == 128)
	{
		return llvm::ConstantInt::get(t, llvm::APInt(128, llvm::ArrayRef(reinterpret_cast<const u64*>(v._bytes), 2)));
	}

	ensure(t->isVectorTy());
	ensure(128 == t->getScalarSizeInBits() * llvm::cast<llvm::FixedVectorType>(t)->getNumElements());

	const auto sct = t->getScalarType();

	if (sct->isIntegerTy(8))
	{
		return llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(v._bytes), 16));
	}
	if (sct->isIntegerTy(16))
	{
		return llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u16*>(v._bytes), 8));
	}
	if (sct->isIntegerTy(32))
	{
		return llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u32*>(v._bytes), 4));
	}
	if (sct->isIntegerTy(64))
	{
		return llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u64*>(v._bytes), 2));
	}
	if (sct->isFloatTy())
	{
		return llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const float*>(v._bytes), 4));
	}
	if (sct->isDoubleTy())
	{
		return llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const double*>(v._bytes), 2));
	}

	fmt::throw_exception("[line %u] No supported constant type", _line);
}

// Replace every direct call in `f` whose callee name has a registered lowering callback with
// the value that callback produces, and erase the call.
void cpu_translator::replace_intrinsics(llvm::Function& f)
{
	for (auto& bb : f)
	{
		for (auto bit = bb.begin(); bit != bb.end();)
		{
			if (auto ci = llvm::dyn_cast<llvm::CallInst>(&*bit))
			{
				if (auto cf = ci->getCalledFunction())
				{
					if (auto it = m_intrinsics.find(std::string_view(cf->getName().data(), cf->getName().size())); it != m_intrinsics.end())
					{
						// Callbacks emit their replacement code right before the call
						m_ir->SetInsertPoint(ci);
						ci->replaceAllUsesWith(it->second(ci));

						// eraseFromParent() hands back the following instruction, so no ++bit here
						bit = ci->eraseFromParent();
						continue;
					}
				}
			}

			++bit;
		}
	}
}

// Erase every store instruction that uses one of `args`, either directly or through a chain
// of BitCast instructions.
//
// NOTE(review): the previous implementation dereferenced use iterators into the *used* value
// instead of the user, so it never reached a store (and could spin forever when an argument
// was itself a bitcast). Callers now get real erasure - confirm that is what they expect.
void cpu_translator::erase_stores(llvm::ArrayRef<llvm::Value*> args)
{
	for (llvm::Value* root : args)
	{
		llvm::SmallVector<llvm::Value*, 8> pending;
		llvm::SmallVector<llvm::StoreInst*, 8> stores;

		pending.push_back(root);

		// Walk through bitcasts, collecting stores first so that no use list is modified
		// while it is being iterated
		while (!pending.empty())
		{
			llvm::Value* cur = pending.pop_back_val();

			for (llvm::User* user : cur->users())
			{
				if (const auto bci = llvm::dyn_cast<llvm::CastInst>(user); bci && bci->getOpcode() == llvm::Instruction::BitCast)
				{
					pending.push_back(bci);
				}
				else if (const auto si = llvm::dyn_cast<llvm::StoreInst>(user); si && !llvm::is_contained(stores, si))
				{
					// A store may use the same value for both operands; keep it only once
					stores.push_back(si);
				}
			}
		}

		for (llvm::StoreInst* si : stores)
		{
			si->eraseFromParent();
		}
	}
}

#endif