SPU: Detect reduced loop

This commit is contained in:
Elad 2025-12-13 17:20:07 +02:00
parent 9b6bc7c1b4
commit a863e94c25
4 changed files with 1726 additions and 42 deletions

View file

@ -303,6 +303,8 @@ struct spu_itype
}
};
using spu_itype_t = spu_itype::type;
struct spu_iflag
{
enum
@ -528,6 +530,8 @@ struct spu_iflag
}
};
using spu_iflag_t = spu_iflag::flag;
#define NAME(x) static constexpr const char& x = *#x
struct spu_iname

File diff suppressed because it is too large Load diff

View file

@ -24,6 +24,20 @@ union spu_opcode_t
bf_t<u32, 7, 16> i16; // 9..24
bf_t<s32, 7, 16> si16; // 9..24, signed
bf_t<u32, 7, 18> i18; // 7..24
// For 16-bit (halfword) instructions viewed in a 32-bit context:
// truncates si10 to 16 bits and replicates that halfword into both halves of a 32-bit word
u32 duplicate_si10() const
{
	const s16 narrowed = static_cast<s16>(si10);
	const u32 halfword = static_cast<u16>(narrowed);

	return halfword | (halfword << 16);
}
// For 8-bit (byte) instructions viewed in a 32-bit context:
// takes the low byte of si10 and replicates it into all four bytes of a 32-bit word
u32 duplicate_duplicate_si10() const
{
	const u32 low_byte = static_cast<u8>(si10 & 0xff);
	const u32 byte_pair = low_byte | (low_byte << 8);

	return byte_pair | (byte_pair << 16);
}
};
constexpr u32 spu_branch_target(u32 pc, u32 imm = 0)

View file

@ -4,6 +4,7 @@
#include "Utilities/lockless.h"
#include "Utilities/address_range.h"
#include "SPUThread.h"
#include "SPUAnalyser.h"
#include <vector>
#include <bitset>
#include <memory>
@ -201,6 +202,25 @@ public:
__bitset_enum_max
};
// Comparison kinds, encoded as a base comparison plus two modifier bits:
// bit 0 (CMP_TURNAROUND_FLAG) and bit 8 (CMP_NEGATE_FLAG).
// NOTE(review): the S/L prefixes presumably mean signed/logical (unsigned) - confirm against the users of this enum
enum compare_direction : u32
{
	// Modifier bits
	CMP_TURNAROUND_FLAG = 0x1,
	CMP_NEGATE_FLAG = 0x100,

	// Base comparisons occupy even values; the odd neighbour is the same comparison with CMP_TURNAROUND_FLAG set
	CMP_SLESS = 0,
	CMP_SGREATER = CMP_SLESS | CMP_TURNAROUND_FLAG,
	CMP_EQUAL = 2,
	CMP_EQUAL2 = CMP_EQUAL | CMP_TURNAROUND_FLAG,
	CMP_LLESS = 4,
	CMP_LGREATER = CMP_LLESS | CMP_TURNAROUND_FLAG,

	// Negated forms (base | CMP_NEGATE_FLAG)
	CMP_SGREATER_EQUAL = CMP_SLESS | CMP_NEGATE_FLAG,
	CMP_SLOWER_EQUAL = CMP_SGREATER | CMP_NEGATE_FLAG,
	CMP_NOT_EQUAL = CMP_EQUAL | CMP_NEGATE_FLAG,
	CMP_NOT_EQUAL2 = CMP_NOT_EQUAL | CMP_TURNAROUND_FLAG,
	CMP_LGREATER_EQUAL = CMP_LLESS | CMP_NEGATE_FLAG,
	CMP_LLOWER_EQUAL = CMP_LGREATER | CMP_NEGATE_FLAG,

	// Sentinel: one past the last valid encoding (0x106)
	CMP_UNKNOWN = CMP_LLOWER_EQUAL + 1,
};
struct reg_state_t
{
bs_t<vf> flag{+vf::is_null};
@ -273,6 +293,399 @@ public:
static u32 alloc_tag(bool reset = false) noexcept;
};
struct reduced_loop_t
{
// --- Detection state ---
bool active = false; // Single block loop detected
// NOTE(review): presumably set when detection was attempted and rejected - confirm against the analyser
bool failed = false;
// Loop bounds; SPU_LS_SIZE is the initial "not set" value
u32 loop_pc = SPU_LS_SIZE;
u32 loop_end = SPU_LS_SIZE;
// False: single-block loop
// True: loop with a trailing block of aftermath (iteration update) stuff (like for (u32 i = 0; i < 10; /*update*/ i++))
bool is_two_block_loop = false;
bool has_cond_state = false;
// Loop stay-in state requirement
// cond_val_min is adjusted by cond_val_incr and masked with cond_val_mask
// in take_cond_val_incr_before_cond_into_account()
u64 cond_val_mask = umax;
u64 cond_val_min = 0;
u64 cond_val_size = 0;
compare_direction cond_val_compare{};
u64 cond_val_incr = 0;
bool cond_val_incr_is_immediate = false;
// Register indices; umax is the initial "not set" value
u64 cond_val_register_argument_idx = umax;
u64 cond_val_register_idx = umax;
// The pair below gates take_cond_val_incr_before_cond_into_account() so the adjustment is applied at most once
bool cond_val_incr_before_cond = false;
bool cond_val_incr_before_cond_taken_in_account = false;
bool cond_val_is_immediate = false;
// Loop attributes
bool is_constant_expression = false;
bool is_secret = false;
// One supplemental (inner) loop condition: a comparison against an immediate value
struct supplemental_condition_t
{
u64 immediate_value = umax;
u64 type_size = 0;
compare_direction val_compare{};
};
// Supplemental loop conditions:
// Inner conditions that depend on external values (not produced inside the loop).
// All of them must evaluate to false for the optimization to work (at the moment),
// so the succeeding code can be treated linearly
u64 expected_sup_conds = 0;
u64 current_sup_conds_index = 0;
std::vector<supplemental_condition_t> sup_conds;
void take_cond_val_incr_before_cond_into_account()
{
if (cond_val_is_immediate && cond_val_incr_before_cond_taken_in_account && !cond_val_incr_before_cond_taken_in_account)
{
cond_val_min -= cond_val_incr;
cond_val_min &= cond_val_mask;
cond_val_incr_before_cond_taken_in_account = true;
}
}
// Per-register bit sets for the loop (one bit per register index below s_reg_max).
// NOTE(review): meanings inferred from names only - presumably loop inputs, loop "dictators"
// (registers steering the loop condition, cf. origin_t::is_loop_dictator) and registers written - confirm
std::bitset<s_reg_max> loop_args;
std::bitset<s_reg_max> loop_dicts;
std::bitset<s_reg_max> loop_writes;
struct origin_t
{
// Set of source registers recorded for this value (see add_register_origin / join_with_this)
std::bitset<s_reg_max> regs{};
// Number of instruction modifiers recorded by add_instruction_modifier; 0 means none
u32 modified = 0;
// Recorded modifier instruction types in application order (slot 1 is the oldest); read them via reverseN_type()
spu_itype_t mod1_type = spu_itype::UNK;
spu_itype_t mod2_type = spu_itype::UNK;
spu_itype_t mod3_type = spu_itype::UNK;
// Immediate recorded by add_instruction_modifier; set to umax when the modification is marked unknown
u32 IMM = 0;
private:
// Internal: fetch the modifier stored in 1-based slot i.
// Yields spu_itype::UNK for slot 0, for slots above 3 and for slots beyond the recorded count.
// Callers should use the reverseN_type() accessors, which apply a fixed order.
spu_itype_t access_type(u32 i) const
{
	if (i <= modified)
	{
		if (i == 1)
		{
			return mod1_type;
		}

		if (i == 2)
		{
			return mod2_type;
		}

		if (i == 3)
		{
			return mod3_type;
		}
	}

	return spu_itype::UNK;
}
public:
// Modifier accessors in reverse application order.
// They are const so they can be used through the const origin_t* returned by reduced_loop_t::find_reg() const.

// Most recently recorded modifier (spu_itype::UNK if none)
spu_itype_t reverse1_type() const
{
	return access_type(modified);
}

// Modifier recorded before the most recent one (spu_itype::UNK if there is none)
spu_itype_t reverse2_type() const
{
	// With modified == 0 the index wraps to a huge value, which access_type rejects as out of range
	return access_type(modified - 1);
}

// Modifier recorded two steps before the most recent one (spu_itype::UNK if there is none)
spu_itype_t reverse3_type() const
{
	// Same wrap-around behaviour as above for modified < 2
	return access_type(modified - 2);
}
// Merge the source registers of another origin into this one (modifier state is left untouched)
origin_t& join_with_this(const origin_t& rhs)
{
	regs = regs | rhs.regs;
	return *this;
}

// Merge a single source register (by index) into this origin
origin_t& join_with_this(u32 rhs)
{
	return add_register_origin(rhs);
}
// Record register index reg_val as one of the sources of this value; returns *this for chaining
origin_t& add_register_origin(u32 reg_val)
{
	regs.set(reg_val, true);
	return *this;
}
// True when no modifier has been recorded, or when reg_val is the one and only recorded source register
bool is_single_reg_access(u32 reg_val) const
{
	return !modified || (regs.count() == 1 && regs.test(reg_val));
}
// Tests whether this origin is a modified value that depends on register reg_val.
// test_predictable:   false - the dependency alone is enough, return true
//                     true  - additionally classify the modification as predictable or not
// should_predictable: which classification makes the function return true (only used when test_predictable)
// A modification counts as predictable only when it is a single modifier that is either
//  - A with exactly two source registers, or
//  - AI/AHI with a non-zero immediate and exactly one source register
bool is_loop_dictator(u32 reg_val, bool test_predictable = false, bool should_predictable = true) const
{
	// Must be modified and must depend on the requested register
	if (!modified || regs.count() < 1 || !regs.test(reg_val))
	{
		return false;
	}

	if (!test_predictable)
	{
		return true;
	}

	bool predictable = false;

	// Chains of more than one modifier are never predictable
	if (modified == 1)
	{
		switch (mod1_type)
		{
		case spu_itype::A:
		{
			predictable = regs.count() == 2;
			break;
		}
		case spu_itype::AI:
		case spu_itype::AHI:
		{
			predictable = IMM && regs.count() == 1;
			break;
		}
		default: break;
		}
	}

	return predictable == should_predictable;
}
// Loop dictator on reg_val whose modification is classified as predictable (see is_loop_dictator)
bool is_predictable_loop_dictator(u32 reg_val) const
{
	constexpr bool classify = true;
	constexpr bool want_predictable = true;

	return is_loop_dictator(reg_val, classify, want_predictable);
}

// Loop dictator on reg_val whose modification is classified as NOT predictable (see is_loop_dictator)
bool is_non_predictable_loop_dictator(u32 reg_val) const
{
	constexpr bool classify = true;
	constexpr bool want_predictable = false;

	return is_loop_dictator(reg_val, classify, want_predictable);
}
// True when the value is unmodified and has no source register other than reg_val itself
bool is_null(u32 reg_val) const noexcept
{
	if (modified)
	{
		return false;
	}

	// Number of recorded sources accounted for by reg_val itself (0 or 1)
	const std::size_t self_refs = regs.test(reg_val) ? 1 : 0;

	return regs.count() == self_refs;
}
// Record that the value was transformed by an instruction of type inst_type (raw opcode/immediate in imm).
// Up to three modifiers are tracked (mod1_type..mod3_type, oldest first). Returns *this for chaining.
//
// First modifier: always accepted; IMM is decoded from imm according to the instruction's immediate field.
// Further modifiers are accepted only for these chains, anything else marks the value as unknown:
//  - a compare (CEQ*/CGT*/CLGT*, register or immediate form) directly after a single AI/AHI
//  - XSBH directly after a byte compare (CEQB/CEQBI/CGTB/CGTBI/CLGTB/CLGTBI)
//  - ANDI whose immediate keeps the whole low byte (0xff), directly after a byte compare
// "Unknown" is encoded as modified == 1 with all modifier types UNK and IMM == umax.
//
// Fixes relative to the previous revision:
//  - the chain branch was guarded by "modified == 1", which made the depth-limit check dead and caused a
//    modifier arriving at depth 2 to silently restart the chain instead of extending/rejecting it
//  - XSBH/ANDI used "is_ok &= ..." on a variable initialised to false, so they could never be accepted,
//    and tested mod1_type instead of the immediately preceding modifier
origin_t& add_instruction_modifier(spu_itype_t inst_type, u32 imm = 0)
{
	// Collapse the modifier chain into the "unknown modification" state
	const auto set_unknown = [this]() -> origin_t&
	{
		mod1_type = spu_itype::UNK;
		mod2_type = spu_itype::UNK;
		mod3_type = spu_itype::UNK;
		IMM = umax;
		modified = 1;
		return *this;
	};

	if (inst_type == spu_itype::UNK)
	{
		return set_unknown();
	}

	if (modified)
	{
		// No free slot left
		if (modified >= 3)
		{
			return set_unknown();
		}

		// The modifier applied immediately before this one
		const spu_itype_t prev_type = modified == 1 ? mod1_type : mod2_type;

		const bool prev_is_byte_compare =
			prev_type == spu_itype::CEQB || prev_type == spu_itype::CEQBI ||
			prev_type == spu_itype::CGTB || prev_type == spu_itype::CGTBI ||
			prev_type == spu_itype::CLGTB || prev_type == spu_itype::CLGTBI;

		const bool single_add_imm = modified == 1 && (mod1_type == spu_itype::AI || mod1_type == spu_itype::AHI);

		bool is_ok = false;

		switch (inst_type)
		{
		case spu_itype::XSBH:
		{
			is_ok = prev_is_byte_compare;
			break;
		}
		case spu_itype::ANDI:
		{
			// The mask must preserve the full byte produced by the compare
			is_ok = prev_is_byte_compare && (spu_opcode_t{imm}.si10 & 0xff) == 0xff;
			break;
		}
		case spu_itype::CEQ:
		case spu_itype::CEQH:
		case spu_itype::CEQB:
		case spu_itype::CGT:
		case spu_itype::CGTH:
		case spu_itype::CGTB:
		case spu_itype::CLGT:
		case spu_itype::CLGTH:
		case spu_itype::CLGTB:
		{
			is_ok = single_add_imm;
			IMM = imm;
			break;
		}
		case spu_itype::CEQI:
		case spu_itype::CEQHI:
		case spu_itype::CEQBI:
		case spu_itype::CGTI:
		case spu_itype::CGTHI:
		case spu_itype::CGTBI:
		case spu_itype::CLGTI:
		case spu_itype::CLGTHI:
		case spu_itype::CLGTBI:
		{
			is_ok = single_add_imm;
			IMM = spu_opcode_t{imm}.si10;
			break;
		}
		default:
		{
			// Any other instruction cannot extend a chain
			break;
		}
		}

		if (!is_ok)
		{
			return set_unknown();
		}

		(modified == 1 ? mod2_type : mod3_type) = inst_type;
		modified++;
		return *this;
	}

	// First modifier: store it and decode the immediate field that matches the instruction format
	mod1_type = inst_type;
	modified = 1;

	switch (inst_type)
	{
	case spu_itype::AHI:
	{
		// Halfword add: replicate the immediate into both halfwords
		IMM = spu_opcode_t{imm}.duplicate_si10();
		return *this;
	}
	case spu_itype::AI:
	case spu_itype::ORI:
	case spu_itype::XORI:
	case spu_itype::ANDI:
	case spu_itype::CEQI:
	case spu_itype::CEQHI:
	case spu_itype::CEQBI:
	case spu_itype::CGTI:
	case spu_itype::CGTHI:
	case spu_itype::CGTBI:
	case spu_itype::CLGTI:
	case spu_itype::CLGTHI:
	case spu_itype::CLGTBI:
	{
		IMM = spu_opcode_t{imm}.si10;
		return *this;
	}
	case spu_itype::ILA:
	{
		IMM = spu_opcode_t{imm}.i18;
		return *this;
	}
	case spu_itype::IOHL:
	case spu_itype::ILH:
	case spu_itype::ILHU:
	{
		IMM = spu_opcode_t{imm}.i16;
		return *this;
	}
	default:
	{
		// No known immediate layout: keep the raw value
		IMM = imm;
		break;
	}
	}

	return *this;
}
};
// Build an unmodified origin whose only source register is reg_val
static origin_t make_reg(u32 reg_val) noexcept
{
	origin_t fresh{};
	fresh.regs.set(reg_val);
	return fresh;
}
// Linear search of the tracked registers; returns the origin stored for reg_val, or nullptr if it is not tracked
const origin_t* find_reg(u32 reg_val) const noexcept
{
	for (auto it = regs.begin(); it != regs.end(); ++it)
	{
		if (it->first == reg_val)
		{
			return &it->second;
		}
	}

	return nullptr;
}
// Mutable overload: reuses the const lookup and strips constness from the result
// (safe because *this is non-const here)
origin_t* find_reg(u32 reg_val) noexcept
{
	const auto& self = std::as_const(*this);
	const origin_t* const found = self.find_reg(reg_val);

	return const_cast<origin_t*>(found);
}
// True when reg_val is not tracked at all, or when its tracked origin is null (see origin_t::is_null)
bool is_reg_null(u32 reg_val) const noexcept
{
	const origin_t* const tracked = find_reg(reg_val);

	return !tracked || tracked->is_null(reg_val);
}
// Return a copy of the origin tracked for reg_val.
// If the register is not tracked yet, a default (empty, unmodified) origin is appended for it first.
origin_t get_reg(u32 reg_val) noexcept
{
	if (const auto org = find_reg(reg_val))
	{
		return *org;
	}

	return regs.emplace_back(reg_val, origin_t{}).second;
}
std::vector<std::pair<u8, origin_t>> regs;
// Reset this object to its default state and return the previous state (for error reporting).
// The old state is moved out rather than copied: a const local could not be moved on return,
// which forced two deep copies of the owned vectors per call.
reduced_loop_t discard()
{
	reduced_loop_t old = std::move(*this);

	// Re-initialise every member, including those left in a moved-from state
	*this = reduced_loop_t{};
	return old;
}
};
protected:
spu_runtime* m_spurt{};
@ -391,18 +804,23 @@ protected:
putllc16,
putllc0,
rchcnt_loop,
reduced_loop,
};
std::vector<inst_attr> m_inst_attrs;
// Payload stored for each entry of m_patterns and supplied through add_pattern()
struct pattern_info
{
u64 info;
// Payload as a plain integer value
u64 info{};
// Optional extra payload behind a type-erased shared pointer
// NOTE(review): the concrete pointee type is not visible here - it must be known by whoever reads it back
std::shared_ptr<void> info_ptr;
};
std::unordered_map<u32, pattern_info> m_patterns;
std::map<u32, pattern_info> m_patterns;
void add_pattern(inst_attr attr, u32 start, u64 info);
void add_pattern(inst_attr attr, u32 start, u64 info, std::shared_ptr<void> info_ptr = nullptr);
private:
// For private use
@ -435,7 +853,7 @@ public:
spu_program analyse(const be_t<u32>* ls, u32 entry_point, std::map<u32, std::vector<u32>>* out_target_list = nullptr);
// Print analyser internal state
void dump(const spu_program& result, std::string& out);
void dump(const spu_program& result, std::string& out, u32 block_min = 0, u32 block_max = SPU_LS_SIZE);
// Get SPU Runtime
spu_runtime& get_runtime()