// rpcsx/rpcs3/Emu/Cell/Modules/cellSpursSpu.cpp
#include "stdafx.h"
#include "Loader/ELF.h"
#include "Emu/Cell/PPUModule.h"
#include "Emu/Memory/vm_reservation.h"
#include "Emu/Cell/SPUThread.h"
#include "Emu/Cell/SPURecompiler.h"
#include "Emu/Cell/lv2/sys_lwmutex.h"
#include "Emu/Cell/lv2/sys_lwcond.h"
#include "Emu/Cell/lv2/sys_spu.h"
#include "cellSpurs.h"
#include <thread>
#include <mutex>
LOG_CHANNEL(cellSpurs);
//----------------------------------------------------------------------------
// Function prototypes
//----------------------------------------------------------------------------
//
// SPURS utility functions
//
static void cellSpursModulePutTrace(CellSpursTracePacket* packet, u32 dmaTagId);
static u32 cellSpursModulePollStatus(spu_thread& spu, u32* status);
static void cellSpursModuleExit(spu_thread& spu);
static bool spursDma(spu_thread& spu, u32 cmd, u64 ea, u32 lsa, u32 size, u32 tag);
static u32 spursDmaGetCompletionStatus(spu_thread& spu, u32 tagMask);
static u32 spursDmaWaitForCompletion(spu_thread& spu, u32 tagMask, bool waitForAll = true);
static void spursHalt(spu_thread& spu);
//
// SPURS kernel functions
//
static bool spursKernel1SelectWorkload(spu_thread& spu);
static bool spursKernel2SelectWorkload(spu_thread& spu);
static void spursKernelDispatchWorkload(spu_thread& spu, u64 widAndPollStatus);
static bool spursKernelWorkloadExit(spu_thread& spu);
bool spursKernelEntry(spu_thread& spu);
//
// SPURS system workload functions
//
static bool spursSysServiceEntry(spu_thread& spu);
// TODO: Exit
static void spursSysServiceIdleHandler(spu_thread& spu, SpursKernelContext* ctxt);
static void spursSysServiceMain(spu_thread& spu, u32 pollStatus);
static void spursSysServiceProcessRequests(spu_thread& spu, SpursKernelContext* ctxt);
static void spursSysServiceActivateWorkload(spu_thread& spu, SpursKernelContext* ctxt);
// TODO: Deactivate workload
static void spursSysServiceUpdateShutdownCompletionEvents(spu_thread& spu, SpursKernelContext* ctxt, u32 wklShutdownBitSet);
static void spursSysServiceTraceSaveCount(spu_thread& spu, SpursKernelContext* ctxt);
static void spursSysServiceTraceUpdate(spu_thread& spu, SpursKernelContext* ctxt, u32 arg2, u32 arg3, u32 forceNotify);
// TODO: Deactivate trace
// TODO: System workload entry
static void spursSysServiceCleanupAfterSystemWorkload(spu_thread& spu, SpursKernelContext* ctxt);
//
// SPURS taskset policy module functions
//
static bool spursTasksetEntry(spu_thread& spu);
static bool spursTasksetSyscallEntry(spu_thread& spu);
static void spursTasksetResumeTask(spu_thread& spu);
static void spursTasksetStartTask(spu_thread& spu, CellSpursTaskArgument& taskArgs);
static s32 spursTasksetProcessRequest(spu_thread& spu, s32 request, u32* taskId, u32* isWaiting);
static void spursTasksetProcessPollStatus(spu_thread& spu, u32 pollStatus);
static bool spursTasksetPollStatus(spu_thread& spu);
static void spursTasksetExit(spu_thread& spu);
static void spursTasksetOnTaskExit(spu_thread& spu, u64 addr, u32 taskId, s32 exitCode, u64 args);
static s32 spursTasketSaveTaskContext(spu_thread& spu);
static void spursTasksetDispatch(spu_thread& spu);
static s32 spursTasksetProcessSyscall(spu_thread& spu, u32 syscallNum, u32 args);
static void spursTasksetInit(spu_thread& spu, u32 pollStatus);
static s32 spursTasksetLoadElf(spu_thread& spu, u32* entryPoint, u32* lowestLoadAddr, u64 elfAddr, bool skipWriteableSegments);
//
// SPURS jobchain policy module functions
//
bool spursJobChainEntry(spu_thread& spu);
void spursJobchainPopUrgentCommand(spu_thread& spu);
//----------------------------------------------------------------------------
// SPURS utility functions
//----------------------------------------------------------------------------
// Output trace information
void cellSpursModulePutTrace(CellSpursTracePacket* packet, u32 dmaTagId)
{
// TODO: Implement this
}
// Check for execution right requests
u32 cellSpursModulePollStatus(spu_thread& spu, u32* status)
{
auto ctxt = spu._ptr<SpursKernelContext>(0x100);
spu.gpr[3]._u32[3] = 1;
if (ctxt->spurs->flags1 & SF1_32_WORKLOADS)
{
spursKernel2SelectWorkload(spu);
}
else
{
spursKernel1SelectWorkload(spu);
}
auto result = spu.gpr[3]._u64[1];
if (status)
{
*status = static_cast<u32>(result);
}
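// The upper 32 bits of the result contain the id of the selected workload and the lower 32 bits the poll status (see spursKernel1SelectWorkload/spursKernel2SelectWorkload)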
u32 wklId = result >> 32;
return wklId == ctxt->wklCurrentId ? 0 : 1;
}
// Exit current workload
void cellSpursModuleExit(spu_thread& spu)
{
auto ctxt = spu._ptr<SpursKernelContext>(0x100);
spu.pc = ctxt->exitToKernelAddr;
// TODO: use g_escape for actual long jump
//throw SpursModuleExit();
}
// Execute a DMA operation
bool spursDma(spu_thread& spu, const spu_mfc_cmd& args)
{
spu.ch_mfc_cmd = args;
if (!spu.process_mfc_cmd())
{
spu_runtime::g_escape(&spu);
}
if (args.cmd == MFC_GETLLAR_CMD || args.cmd == MFC_PUTLLC_CMD || args.cmd == MFC_PUTLLUC_CMD)
{
return static_cast<u32>(spu.get_ch_value(MFC_RdAtomicStat)) != MFC_PUTLLC_FAILURE;
}
return true;
}
// Execute a DMA operation
bool spursDma(spu_thread& spu, u32 cmd, u64 ea, u32 lsa, u32 size, u32 tag)
{
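// Pack the arguments into an MFC command; the effective address is split into its lower and upper 32-bit halves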
return spursDma(spu, {MFC(cmd), static_cast<u8>(tag & 0x1f), static_cast<u16>(size & 0x7fff), lsa, static_cast<u32>(ea), static_cast<u32>(ea >> 32)});
}
// Get the status of DMA operations
u32 spursDmaGetCompletionStatus(spu_thread& spu, u32 tagMask)
{
spu.set_ch_value(MFC_WrTagMask, tagMask);
spu.set_ch_value(MFC_WrTagUpdate, MFC_TAG_UPDATE_IMMEDIATE);
return static_cast<u32>(spu.get_ch_value(MFC_RdTagStat));
}
// Wait for DMA operations to complete
u32 spursDmaWaitForCompletion(spu_thread& spu, u32 tagMask, bool waitForAll)
{
spu.set_ch_value(MFC_WrTagMask, tagMask);
spu.set_ch_value(MFC_WrTagUpdate, waitForAll ? MFC_TAG_UPDATE_ALL : MFC_TAG_UPDATE_ANY);
return static_cast<u32>(spu.get_ch_value(MFC_RdTagStat));
}
// Halt the SPU
void spursHalt(spu_thread& spu)
{
spu.halt();
}
void sys_spu_thread_exit(spu_thread& spu, s32 status)
{
// Cancel any pending status update requests
spu.set_ch_value(MFC_WrTagUpdate, 0);
while (spu.get_ch_count(MFC_RdTagStat) != 1);
spu.get_ch_value(MFC_RdTagStat);
// Wait for all pending DMA operations to complete
spu.set_ch_value(MFC_WrTagMask, 0xFFFFFFFF);
spu.set_ch_value(MFC_WrTagUpdate, MFC_TAG_UPDATE_ALL);
spu.get_ch_value(MFC_RdTagStat);
spu.set_ch_value(SPU_WrOutMbox, status);
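// Stop and signal with code 0x102 (sys_spu_thread_exit)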
spu.stop_and_signal(0x102);
}
void sys_spu_thread_group_exit(spu_thread& spu, s32 status)
{
// Cancel any pending status update requests
spu.set_ch_value(MFC_WrTagUpdate, 0);
while (spu.get_ch_count(MFC_RdTagStat) != 1);
spu.get_ch_value(MFC_RdTagStat);
// Wait for all pending DMA operations to complete
spu.set_ch_value(MFC_WrTagMask, 0xFFFFFFFF);
spu.set_ch_value(MFC_WrTagUpdate, MFC_TAG_UPDATE_ALL);
spu.get_ch_value(MFC_RdTagStat);
spu.set_ch_value(SPU_WrOutMbox, status);
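// Stop and signal with code 0x101 (sys_spu_thread_group_exit)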
spu.stop_and_signal(0x101);
}
s32 sys_spu_thread_send_event(spu_thread& spu, u8 spup, u32 data0, u32 data1)
{
if (spup > 0x3F)
{
return CELL_EINVAL;
}
if (spu.get_ch_count(SPU_RdInMbox))
{
return CELL_EBUSY;
}
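// data1 goes to the outbound mailbox; the interrupt mailbox word encodes the target SPU port (spup) in its upper 8 bits and data0 in the lower 24 bits. The result is then read back from the inbound mailbox.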
spu.set_ch_value(SPU_WrOutMbox, data1);
spu.set_ch_value(SPU_WrOutIntrMbox, (spup << 24) | (data0 & 0x00FFFFFF));
return static_cast<u32>(spu.get_ch_value(SPU_RdInMbox));
}
s32 sys_spu_thread_switch_system_module(spu_thread& spu, u32 status)
{
if (spu.get_ch_count(SPU_RdInMbox))
{
return CELL_EBUSY;
}
u32 result;
// Cancel any pending status update requests
spu.set_ch_value(MFC_WrTagUpdate, 0);
while (spu.get_ch_count(MFC_RdTagStat) != 1);
spu.get_ch_value(MFC_RdTagStat);
// Wait for all pending DMA operations to complete
spu.set_ch_value(MFC_WrTagMask, 0xFFFFFFFF);
spu.set_ch_value(MFC_WrTagUpdate, MFC_TAG_UPDATE_ALL);
spu.get_ch_value(MFC_RdTagStat);
do
{
spu.set_ch_value(SPU_WrOutMbox, status);
spu.stop_and_signal(0x120);
result = static_cast<u32>(spu.get_ch_value(SPU_RdInMbox));
}
while (result == CELL_EBUSY);
return result;
}
//----------------------------------------------------------------------------
// SPURS kernel functions
//----------------------------------------------------------------------------
// Select a workload to run
bool spursKernel1SelectWorkload(spu_thread& spu)
{
const auto ctxt = spu._ptr<SpursKernelContext>(0x100);
// The first and only argument to this function is a boolean that is set to false if the function
// is called by the SPURS kernel and set to true if called by cellSpursModulePollStatus.
// If the first argument is true then the shared data is not updated with the result.
const auto isPoll = spu.gpr[3]._u32[3];
u32 wklSelectedId;
u32 pollStatus;
//vm::reservation_op(vm::cast(ctxt->spurs.addr(), HERE), 128, [&]()
{
// lock the first 0x80 bytes of spurs
auto spurs = ctxt->spurs.get_ptr();
// Calculate the contention (number of SPUs used) for each workload
u8 contention[CELL_SPURS_MAX_WORKLOAD];
u8 pendingContention[CELL_SPURS_MAX_WORKLOAD];
for (u32 i = 0; i < CELL_SPURS_MAX_WORKLOAD; i++)
{
contention[i] = spurs->wklCurrentContention[i] - ctxt->wklLocContention[i];
// If this is a poll request then the number of SPUs waiting to context switch is also added to the contention,
// presumably to prevent unnecessary jumps to the kernel
if (isPoll)
{
pendingContention[i] = spurs->wklPendingContention[i] - ctxt->wklLocPendingContention[i];
if (i != ctxt->wklCurrentId)
{
contention[i] += pendingContention[i];
}
}
}
wklSelectedId = CELL_SPURS_SYS_SERVICE_WORKLOAD_ID;
pollStatus = 0;
// The system service has the highest priority. Select the system service if
// the system service message bit for this SPU is set.
if (spurs->sysSrvMessage & (1 << ctxt->spuNum))
{
ctxt->spuIdling = 0;
if (!isPoll || ctxt->wklCurrentId == CELL_SPURS_SYS_SERVICE_WORKLOAD_ID)
{
// Clear the message bit
spurs->sysSrvMessage.raw() &= ~(1 << ctxt->spuNum);
}
}
else
{
// Calculate the scheduling weight for each workload
u16 maxWeight = 0;
for (u32 i = 0; i < CELL_SPURS_MAX_WORKLOAD; i++)
{
u16 runnable = ctxt->wklRunnable1 & (0x8000 >> i);
u16 wklSignal = spurs->wklSignal1.load() & (0x8000 >> i);
u8 wklFlag = spurs->wklFlag.flag.load() == 0u ? spurs->wklFlagReceiver == i ? 1 : 0 : 0;
u8 readyCount = spurs->wklReadyCount1[i] > CELL_SPURS_MAX_SPU ? CELL_SPURS_MAX_SPU : spurs->wklReadyCount1[i].load();
u8 idleSpuCount = spurs->wklIdleSpuCountOrReadyCount2[i] > CELL_SPURS_MAX_SPU ? CELL_SPURS_MAX_SPU : spurs->wklIdleSpuCountOrReadyCount2[i].load();
u8 requestCount = readyCount + idleSpuCount;
// For a workload to be considered for scheduling:
// 1. Its priority must not be 0
// 2. The number of SPUs used by it must be less than the max contention for that workload
// 3. The workload should be in runnable state
// 4. The number of SPUs allocated to it must be less than the number of SPUs requested (i.e. readyCount)
// OR the workload must be signalled
// OR the workload flag is 0 and the workload is configured as the workload flag receiver
if (runnable && ctxt->priority[i] != 0 && spurs->wklMaxContention[i] > contention[i])
{
if (wklFlag || wklSignal || (readyCount != 0 && requestCount > contention[i]))
{
// The scheduling weight of the workload is formed from the following parameters in decreasing order of priority:
// 1. Workload signal set or workload flag or ready count > contention
// 2. Priority of the workload on the SPU
// 3. Is the workload the last selected workload
// 4. Minimum contention of the workload
// 5. Number of SPUs that are being used by the workload (the fewer the SPUs, the greater the weight)
// 6. Is the workload executable the same as the currently loaded executable
// 7. The workload id (the lower the id, the greater the weight)
u16 weight = (wklFlag || wklSignal || (readyCount > contention[i])) ? 0x8000 : 0;
weight |= (ctxt->priority[i] & 0x7F) << 8; // TODO: was shifted << 16
weight |= i == ctxt->wklCurrentId ? 0x80 : 0x00;
weight |= (contention[i] > 0 && spurs->wklMinContention[i] > contention[i]) ? 0x40 : 0x00;
weight |= ((CELL_SPURS_MAX_SPU - contention[i]) & 0x0F) << 2;
weight |= ctxt->wklUniqueId[i] == ctxt->wklCurrentId ? 0x02 : 0x00;
weight |= 0x01;
// In case of a tie the lower numbered workload is chosen
if (weight > maxWeight)
{
wklSelectedId = i;
maxWeight = weight;
pollStatus = readyCount > contention[i] ? CELL_SPURS_MODULE_POLL_STATUS_READYCOUNT : 0;
pollStatus |= wklSignal ? CELL_SPURS_MODULE_POLL_STATUS_SIGNAL : 0;
pollStatus |= wklFlag ? CELL_SPURS_MODULE_POLL_STATUS_FLAG : 0;
}
}
}
}
// Not sure what this does. Possibly mark the SPU as idle/in use.
ctxt->spuIdling = wklSelectedId == CELL_SPURS_SYS_SERVICE_WORKLOAD_ID ? 1 : 0;
if (!isPoll || wklSelectedId == ctxt->wklCurrentId)
{
// Clear workload signal for the selected workload
spurs->wklSignal1.raw() &= ~(0x8000 >> wklSelectedId);
spurs->wklSignal2.raw() &= ~(0x80000000u >> wklSelectedId);
// If the selected workload is the wklFlag workload then pull the wklFlag to all 1s
if (wklSelectedId == spurs->wklFlagReceiver)
{
spurs->wklFlag.flag = -1;
}
}
}
if (!isPoll)
{
// Called by kernel
// Increment the contention for the selected workload
if (wklSelectedId != CELL_SPURS_SYS_SERVICE_WORKLOAD_ID)
{
contention[wklSelectedId]++;
}
for (u32 i = 0; i < CELL_SPURS_MAX_WORKLOAD; i++)
{
spurs->wklCurrentContention[i] = contention[i];
spurs->wklPendingContention[i] = spurs->wklPendingContention[i] - ctxt->wklLocPendingContention[i];
ctxt->wklLocContention[i] = 0;
ctxt->wklLocPendingContention[i] = 0;
}
if (wklSelectedId != CELL_SPURS_SYS_SERVICE_WORKLOAD_ID)
{
ctxt->wklLocContention[wklSelectedId] = 1;
}
ctxt->wklCurrentId = wklSelectedId;
}
else if (wklSelectedId != ctxt->wklCurrentId)
{
// Not called by kernel but a context switch is required
// Increment the pending contention for the selected workload
if (wklSelectedId != CELL_SPURS_SYS_SERVICE_WORKLOAD_ID)
{
pendingContention[wklSelectedId]++;
}
for (u32 i = 0; i < CELL_SPURS_MAX_WORKLOAD; i++)
{
spurs->wklPendingContention[i] = pendingContention[i];
ctxt->wklLocPendingContention[i] = 0;
}
if (wklSelectedId != CELL_SPURS_SYS_SERVICE_WORKLOAD_ID)
{
ctxt->wklLocPendingContention[wklSelectedId] = 1;
}
}
else
{
// Not called by kernel and no context switch is required
for (u32 i = 0; i < CELL_SPURS_MAX_WORKLOAD; i++)
{
spurs->wklPendingContention[i] = spurs->wklPendingContention[i] - ctxt->wklLocPendingContention[i];
ctxt->wklLocPendingContention[i] = 0;
}
}
std::memcpy(ctxt, spurs, 128);
}//);
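// Pack the selected workload id into the upper 32 bits of the result and the poll status into the lower 32 bits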
u64 result = u64{wklSelectedId} << 32;
result |= pollStatus;
spu.gpr[3]._u64[1] = result;
return true;
}
// Select a workload to run
bool spursKernel2SelectWorkload(spu_thread& spu)
{
const auto ctxt = spu._ptr<SpursKernelContext>(0x100);
// The first and only argument to this function is a boolean that is set to false if the function
// is called by the SPURS kernel and set to true if called by cellSpursModulePollStatus.
// If the first argument is true then the shared data is not updated with the result.
const auto isPoll = spu.gpr[3]._u32[3];
u32 wklSelectedId;
u32 pollStatus;
//vm::reservation_op(vm::cast(ctxt->spurs.addr(), HERE), 128, [&]()
{
// lock the first 0x80 bytes of spurs
auto spurs = ctxt->spurs.get_ptr();
// Calculate the contention (number of SPUs used) for each workload
u8 contention[CELL_SPURS_MAX_WORKLOAD2];
u8 pendingContention[CELL_SPURS_MAX_WORKLOAD2];
for (u32 i = 0; i < CELL_SPURS_MAX_WORKLOAD2; i++)
{
contention[i] = spurs->wklCurrentContention[i & 0x0F] - ctxt->wklLocContention[i & 0x0F];
contention[i] = i + 0u < CELL_SPURS_MAX_WORKLOAD ? contention[i] & 0x0F : contention[i] >> 4;
// If this is a poll request then the number of SPUs waiting to context switch is also added to the contention,
// presumably to prevent unnecessary jumps to the kernel
if (isPoll)
{
pendingContention[i] = spurs->wklPendingContention[i & 0x0F] - ctxt->wklLocPendingContention[i & 0x0F];
pendingContention[i] = i + 0u < CELL_SPURS_MAX_WORKLOAD ? pendingContention[i] & 0x0F : pendingContention[i] >> 4;
if (i != ctxt->wklCurrentId)
{
contention[i] += pendingContention[i];
}
}
}
wklSelectedId = CELL_SPURS_SYS_SERVICE_WORKLOAD_ID;
pollStatus = 0;
// The system service has the highest priority. Select the system service if
// the system service message bit for this SPU is set.
if (spurs->sysSrvMessage & (1 << ctxt->spuNum))
{
// Not sure what this does. Possibly mark the SPU as in use.
ctxt->spuIdling = 0;
if (!isPoll || ctxt->wklCurrentId == CELL_SPURS_SYS_SERVICE_WORKLOAD_ID)
{
// Clear the message bit
spurs->sysSrvMessage.raw() &= ~(1 << ctxt->spuNum);
}
}
else
{
// Calculate the scheduling weight for each workload
u8 maxWeight = 0;
for (u32 i = 0; i < CELL_SPURS_MAX_WORKLOAD2; i++)
{
u32 j = i & 0x0f;
u16 runnable = i < CELL_SPURS_MAX_WORKLOAD ? ctxt->wklRunnable1 & (0x8000 >> j) : ctxt->wklRunnable2 & (0x8000 >> j);
u8 priority = i < CELL_SPURS_MAX_WORKLOAD ? ctxt->priority[j] & 0x0F : ctxt->priority[j] >> 4;
u8 maxContention = i < CELL_SPURS_MAX_WORKLOAD ? spurs->wklMaxContention[j] & 0x0F : spurs->wklMaxContention[j] >> 4;
u16 wklSignal = i < CELL_SPURS_MAX_WORKLOAD ? spurs->wklSignal1.load() & (0x8000 >> j) : spurs->wklSignal2.load() & (0x8000 >> j);
u8 wklFlag = spurs->wklFlag.flag.load() == 0u ? spurs->wklFlagReceiver == i ? 1 : 0 : 0;
u8 readyCount = i < CELL_SPURS_MAX_WORKLOAD ? spurs->wklReadyCount1[j] : spurs->wklIdleSpuCountOrReadyCount2[j];
// For a workload to be considered for scheduling:
// 1. Its priority must be greater than 0
// 2. The number of SPUs used by it must be less than the max contention for that workload
// 3. The workload should be in runnable state
// 4. The number of SPUs allocated to it must be less than the number of SPUs requested (i.e. readyCount)
// OR the workload must be signalled
// OR the workload flag is 0 and the workload is configured as the workload flag receiver
if (runnable && priority > 0 && maxContention > contention[i])
{
if (wklFlag || wklSignal || readyCount > contention[i])
{
// The scheduling weight of the workload is equal to the priority of the workload for the SPU.
// The current workload is given a slightly higher weight presumably to reduce the number of context switches.
// In case of a tie the lower numbered workload is chosen.
u8 weight = priority << 4;
if (ctxt->wklCurrentId == i)
{
weight |= 0x04;
}
if (weight > maxWeight)
{
wklSelectedId = i;
maxWeight = weight;
pollStatus = readyCount > contention[i] ? CELL_SPURS_MODULE_POLL_STATUS_READYCOUNT : 0;
pollStatus |= wklSignal ? CELL_SPURS_MODULE_POLL_STATUS_SIGNAL : 0;
pollStatus |= wklFlag ? CELL_SPURS_MODULE_POLL_STATUS_FLAG : 0;
}
}
}
}
// Not sure what this does. Possibly mark the SPU as idle/in use.
ctxt->spuIdling = wklSelectedId == CELL_SPURS_SYS_SERVICE_WORKLOAD_ID ? 1 : 0;
if (!isPoll || wklSelectedId == ctxt->wklCurrentId)
{
// Clear workload signal for the selected workload
spurs->wklSignal1.raw() &= ~(0x8000 >> wklSelectedId);
spurs->wklSignal2.raw() &= ~(0x80000000u >> wklSelectedId);
// If the selected workload is the wklFlag workload then pull the wklFlag to all 1s
if (wklSelectedId == spurs->wklFlagReceiver)
{
spurs->wklFlag.flag = -1;
}
}
}
if (!isPoll)
{
// Called by kernel
// Increment the contention for the selected workload
if (wklSelectedId != CELL_SPURS_SYS_SERVICE_WORKLOAD_ID)
{
contention[wklSelectedId]++;
}
for (u32 i = 0; i < (CELL_SPURS_MAX_WORKLOAD2 >> 1); i++)
{
spurs->wklCurrentContention[i] = contention[i] | (contention[i + 0x10] << 4);
spurs->wklPendingContention[i] = spurs->wklPendingContention[i] - ctxt->wklLocPendingContention[i];
ctxt->wklLocContention[i] = 0;
ctxt->wklLocPendingContention[i] = 0;
}
ctxt->wklLocContention[wklSelectedId & 0x0F] = wklSelectedId < CELL_SPURS_MAX_WORKLOAD ? 0x01 : wklSelectedId < CELL_SPURS_MAX_WORKLOAD2 ? 0x10 : 0;
ctxt->wklCurrentId = wklSelectedId;
}
else if (wklSelectedId != ctxt->wklCurrentId)
{
// Not called by kernel but a context switch is required
// Increment the pending contention for the selected workload
if (wklSelectedId != CELL_SPURS_SYS_SERVICE_WORKLOAD_ID)
{
pendingContention[wklSelectedId]++;
}
for (u32 i = 0; i < (CELL_SPURS_MAX_WORKLOAD2 >> 1); i++)
{
spurs->wklPendingContention[i] = pendingContention[i] | (pendingContention[i + 0x10] << 4);
ctxt->wklLocPendingContention[i] = 0;
}
ctxt->wklLocPendingContention[wklSelectedId & 0x0F] = wklSelectedId < CELL_SPURS_MAX_WORKLOAD ? 0x01 : wklSelectedId < CELL_SPURS_MAX_WORKLOAD2 ? 0x10 : 0;
}
else
{
// Not called by kernel and no context switch is required
for (u32 i = 0; i < CELL_SPURS_MAX_WORKLOAD; i++)
{
spurs->wklPendingContention[i] = spurs->wklPendingContention[i] - ctxt->wklLocPendingContention[i];
ctxt->wklLocPendingContention[i] = 0;
}
}
std::memcpy(ctxt, spurs, 128);
}//);
u64 result = u64{wklSelectedId} << 32;
result |= pollStatus;
spu.gpr[3]._u64[1] = result;
return true;
}
// SPURS kernel dispatch workload
void spursKernelDispatchWorkload(spu_thread& spu, u64 widAndPollStatus)
{
const auto ctxt = spu._ptr<SpursKernelContext>(0x100);
auto isKernel2 = ctxt->spurs->flags1 & SF1_32_WORKLOADS ? true : false;
auto pollStatus = static_cast<u32>(widAndPollStatus);
auto wid = static_cast<u32>(widAndPollStatus >> 32);
// DMA in the workload info for the selected workload
auto wklInfoOffset = wid < CELL_SPURS_MAX_WORKLOAD ? &ctxt->spurs->wklInfo1[wid] :
wid < CELL_SPURS_MAX_WORKLOAD2 && isKernel2 ? &ctxt->spurs->wklInfo2[wid & 0xf] :
&ctxt->spurs->wklInfoSysSrv;
const auto wklInfo = spu._ptr<CellSpurs::WorkloadInfo>(0x3FFE0);
std::memcpy(wklInfo, wklInfoOffset, 0x20);
// Load the workload to LS
if (ctxt->wklCurrentAddr != wklInfo->addr)
{
switch (wklInfo->addr.addr())
{
case SPURS_IMG_ADDR_SYS_SRV_WORKLOAD:
//spu.RegisterHleFunction(0xA00, spursSysServiceEntry);
break;
case SPURS_IMG_ADDR_TASKSET_PM:
//spu.RegisterHleFunction(0xA00, spursTasksetEntry);
break;
default:
std::memcpy(spu._ptr<void>(0xA00), wklInfo->addr.get_ptr(), wklInfo->size);
break;
}
ctxt->wklCurrentAddr = wklInfo->addr;
ctxt->wklCurrentUniqueId = wklInfo->uniqueId;
}
if (!isKernel2)
{
ctxt->moduleId[0] = 0;
ctxt->moduleId[1] = 0;
}
// Run workload
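// r0 = return address (exit back to the kernel), r1 = stack pointer, r3 = LS address of the kernel context, r4 = workload argument, r5 = poll status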
spu.gpr[0]._u32[3] = ctxt->exitToKernelAddr;
spu.gpr[1]._u32[3] = 0x3FFB0;
spu.gpr[3]._u32[3] = 0x100;
spu.gpr[4]._u64[1] = wklInfo->arg;
spu.gpr[5]._u32[3] = pollStatus;
spu.pc = 0xA00;
}
// SPURS kernel workload exit
bool spursKernelWorkloadExit(spu_thread& spu)
{
const auto ctxt = spu._ptr<SpursKernelContext>(0x100);
auto isKernel2 = ctxt->spurs->flags1 & SF1_32_WORKLOADS ? true : false;
// Select next workload to run
spu.gpr[3].clear();
if (isKernel2)
{
spursKernel2SelectWorkload(spu);
}
else
{
spursKernel1SelectWorkload(spu);
}
spursKernelDispatchWorkload(spu, spu.gpr[3]._u64[1]);
return false;
}
// SPURS kernel entry point
bool spursKernelEntry(spu_thread& spu)
{
const auto ctxt = spu._ptr<SpursKernelContext>(0x100);
memset(ctxt, 0, sizeof(SpursKernelContext));
// Save arguments
ctxt->spuNum = spu.gpr[3]._u32[3];
ctxt->spurs.set(spu.gpr[4]._u64[1]);
auto isKernel2 = ctxt->spurs->flags1 & SF1_32_WORKLOADS ? true : false;
// Initialise the SPURS context to its initial values
ctxt->dmaTagId = CELL_SPURS_KERNEL_DMA_TAG_ID;
ctxt->wklCurrentUniqueId = 0x20;
ctxt->wklCurrentId = CELL_SPURS_SYS_SERVICE_WORKLOAD_ID;
ctxt->exitToKernelAddr = isKernel2 ? CELL_SPURS_KERNEL2_EXIT_ADDR : CELL_SPURS_KERNEL1_EXIT_ADDR;
ctxt->selectWorkloadAddr = isKernel2 ? CELL_SPURS_KERNEL2_SELECT_WORKLOAD_ADDR : CELL_SPURS_KERNEL1_SELECT_WORKLOAD_ADDR;
if (!isKernel2)
{
ctxt->x1F0 = 0xF0020000;
ctxt->x200 = 0x20000;
ctxt->guid[0] = 0x423A3A02;
ctxt->guid[1] = 0x43F43A82;
ctxt->guid[2] = 0x43F26502;
ctxt->guid[3] = 0x420EB382;
}
else
{
ctxt->guid[0] = 0x43A08402;
ctxt->guid[1] = 0x43FB0A82;
ctxt->guid[2] = 0x435E9302;
ctxt->guid[3] = 0x43A3C982;
}
// Register SPURS kernel HLE functions
//spu.UnregisterHleFunctions(0, 0x40000/*LS_BOTTOM*/);
//spu.RegisterHleFunction(isKernel2 ? CELL_SPURS_KERNEL2_ENTRY_ADDR : CELL_SPURS_KERNEL1_ENTRY_ADDR, spursKernelEntry);
//spu.RegisterHleFunction(ctxt->exitToKernelAddr, spursKernelWorkloadExit);
//spu.RegisterHleFunction(ctxt->selectWorkloadAddr, isKernel2 ? spursKernel2SelectWorkload : spursKernel1SelectWorkload);
// Start the system service
spursKernelDispatchWorkload(spu, u64{CELL_SPURS_SYS_SERVICE_WORKLOAD_ID} << 32);
return false;
}
//----------------------------------------------------------------------------
// SPURS system workload functions
//----------------------------------------------------------------------------
// Entry point of the system service
bool spursSysServiceEntry(spu_thread& spu)
{
const auto ctxt = spu._ptr<SpursKernelContext>(spu.gpr[3]._u32[3]);
auto arg = spu.gpr[4]._u64[1];
auto pollStatus = spu.gpr[5]._u32[3];
{
if (ctxt->wklCurrentId == CELL_SPURS_SYS_SERVICE_WORKLOAD_ID)
{
spursSysServiceMain(spu, pollStatus);
}
else
{
// TODO: If we reach here it means the current workload was preempted to start the
// system workload. Need to implement this.
}
cellSpursModuleExit(spu);
}
return false;
}
// Wait for an external event or exit the SPURS thread group if no workloads can be scheduled
void spursSysServiceIdleHandler(spu_thread& spu, SpursKernelContext* ctxt)
{
bool shouldExit;
while (true)
{
const auto spurs = spu._ptr<CellSpurs>(0x100);
//vm::reservation_acquire(spurs, vm::cast(ctxt->spurs.addr(), HERE), 128);
// Find the number of SPUs that are idling in this SPURS instance
u32 nIdlingSpus = 0;
for (u32 i = 0; i < 8; i++)
{
if (spurs->spuIdling & (1 << i))
{
nIdlingSpus++;
}
}
bool allSpusIdle = nIdlingSpus == spurs->nSpus ? true : false;
bool exitIfNoWork = spurs->flags1 & SF1_EXIT_IF_NO_WORK ? true : false;
shouldExit = allSpusIdle && exitIfNoWork;
// Check if any workloads can be scheduled
bool foundReadyWorkload = false;
if (spurs->sysSrvMessage & (1 << ctxt->spuNum))
{
foundReadyWorkload = true;
}
else
{
if (spurs->flags1 & SF1_32_WORKLOADS)
{
for (u32 i = 0; i < CELL_SPURS_MAX_WORKLOAD2; i++)
{
u32 j = i & 0x0F;
u16 runnable = i < CELL_SPURS_MAX_WORKLOAD ? ctxt->wklRunnable1 & (0x8000 >> j) : ctxt->wklRunnable2 & (0x8000 >> j);
u8 priority = i < CELL_SPURS_MAX_WORKLOAD ? ctxt->priority[j] & 0x0F : ctxt->priority[j] >> 4;
u8 maxContention = i < CELL_SPURS_MAX_WORKLOAD ? spurs->wklMaxContention[j] & 0x0F : spurs->wklMaxContention[j] >> 4;
u8 contention = i < CELL_SPURS_MAX_WORKLOAD ? spurs->wklCurrentContention[j] & 0x0F : spurs->wklCurrentContention[j] >> 4;
u16 wklSignal = i < CELL_SPURS_MAX_WORKLOAD ? spurs->wklSignal1.load() & (0x8000 >> j) : spurs->wklSignal2.load() & (0x8000 >> j);
u8 wklFlag = spurs->wklFlag.flag.load() == 0u ? spurs->wklFlagReceiver == i ? 1 : 0 : 0;
u8 readyCount = i < CELL_SPURS_MAX_WORKLOAD ? spurs->wklReadyCount1[j] : spurs->wklIdleSpuCountOrReadyCount2[j];
if (runnable && priority > 0 && maxContention > contention)
{
if (wklFlag || wklSignal || readyCount > contention)
{
foundReadyWorkload = true;
break;
}
}
}
}
else
{
for (u32 i = 0; i < CELL_SPURS_MAX_WORKLOAD; i++)
{
u16 runnable = ctxt->wklRunnable1 & (0x8000 >> i);
u16 wklSignal = spurs->wklSignal1.load() & (0x8000 >> i);
u8 wklFlag = spurs->wklFlag.flag.load() == 0u ? spurs->wklFlagReceiver == i ? 1 : 0 : 0;
u8 readyCount = spurs->wklReadyCount1[i] > CELL_SPURS_MAX_SPU ? CELL_SPURS_MAX_SPU : spurs->wklReadyCount1[i].load();
u8 idleSpuCount = spurs->wklIdleSpuCountOrReadyCount2[i] > CELL_SPURS_MAX_SPU ? CELL_SPURS_MAX_SPU : spurs->wklIdleSpuCountOrReadyCount2[i].load();
u8 requestCount = readyCount + idleSpuCount;
if (runnable && ctxt->priority[i] != 0 && spurs->wklMaxContention[i] > spurs->wklCurrentContention[i])
{
if (wklFlag || wklSignal || (readyCount != 0 && requestCount > spurs->wklCurrentContention[i]))
{
foundReadyWorkload = true;
break;
}
}
}
}
}
bool spuIdling = spurs->spuIdling & (1 << ctxt->spuNum) ? true : false;
if (foundReadyWorkload && shouldExit == false)
{
spurs->spuIdling &= ~(1 << ctxt->spuNum);
}
else
{
spurs->spuIdling |= 1 << ctxt->spuNum;
}
// If all SPUs are idling and the exit_if_no_work flag is set then the SPU thread group must exit. Otherwise wait for external events.
if (spuIdling && shouldExit == false && foundReadyWorkload == false)
{
// The system service blocks by making a reservation and waiting on the lock line reservation lost event.
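// The HLE implementation approximates this with a short timed wait instead of an actual reservation-lost event.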
thread_ctrl::wait_for(1000);
continue;
}
//if (vm::reservation_update(vm::cast(ctxt->spurs.addr(), HERE), spu._ptr<void>(0x100), 128) && (shouldExit || foundReadyWorkload))
{
break;
}
}
if (shouldExit)
{
// TODO: exit spu thread group
}
}
// Main function for the system service
void spursSysServiceMain(spu_thread& spu, u32 pollStatus)
{
const auto ctxt = spu._ptr<SpursKernelContext>(0x100);
if (!ctxt->spurs.aligned())
{
spu_log.error("spursSysServiceMain(): invalid spurs alignment");
spursHalt(spu);
}
// Initialise the system service if this is the first time it's being started on this SPU
if (ctxt->sysSrvInitialised == 0)
{
ctxt->sysSrvInitialised = 1;
//vm::reservation_acquire(ctxt, vm::cast(ctxt->spurs.addr(), HERE), 128);
//vm::reservation_op(ctxt->spurs.ptr(&CellSpurs::wklState1).addr(), 128, [&]()
{
auto spurs = ctxt->spurs.get_ptr();
// Halt if already initialised
if (spurs->sysSrvOnSpu & (1 << ctxt->spuNum))
{
spu_log.error("spursSysServiceMain(): already initialized");
spursHalt(spu);
}
spurs->sysSrvOnSpu |= 1 << ctxt->spuNum;
std::memcpy(spu._ptr<void>(0x2D80), spurs->wklState1, 128);
}//);
ctxt->traceBuffer = 0;
ctxt->traceMsgCount = -1;
spursSysServiceTraceUpdate(spu, ctxt, 1, 1, 0);
spursSysServiceCleanupAfterSystemWorkload(spu, ctxt);
// Trace - SERVICE: INIT
CellSpursTracePacket pkt{};
pkt.header.tag = CELL_SPURS_TRACE_TAG_SERVICE;
pkt.data.service.incident = CELL_SPURS_TRACE_SERVICE_INIT;
cellSpursModulePutTrace(&pkt, ctxt->dmaTagId);
}
// Trace - START: Module='SYS '
CellSpursTracePacket pkt{};
pkt.header.tag = CELL_SPURS_TRACE_TAG_START;
std::memcpy(pkt.data.start._module, "SYS ", 4);
pkt.data.start.level = 1; // Policy module
pkt.data.start.ls = 0xA00 >> 2;
cellSpursModulePutTrace(&pkt, ctxt->dmaTagId);
while (true)
{
// Process requests for the system service
spursSysServiceProcessRequests(spu, ctxt);
poll:
if (cellSpursModulePollStatus(spu, nullptr))
{
// Trace - SERVICE: EXIT
CellSpursTracePacket pkt{};
pkt.header.tag = CELL_SPURS_TRACE_TAG_SERVICE;
pkt.data.service.incident = CELL_SPURS_TRACE_SERVICE_EXIT;
cellSpursModulePutTrace(&pkt, ctxt->dmaTagId);
// Trace - STOP: GUID
pkt = {};
pkt.header.tag = CELL_SPURS_TRACE_TAG_STOP;
pkt.data.stop = SPURS_GUID_SYS_WKL;
cellSpursModulePutTrace(&pkt, ctxt->dmaTagId);
//spursDmaWaitForCompletion(spu, 1 << ctxt->dmaTagId);
break;
}
// If we reach here it means that either there are more system service messages to be processed
// or there are no workloads that can be scheduled.
// If the SPU is not idling then process the remaining system service messages
if (ctxt->spuIdling == 0)
{
continue;
}
// If we reach here it means that the SPU is idling
// Trace - SERVICE: WAIT
CellSpursTracePacket pkt{};
pkt.header.tag = CELL_SPURS_TRACE_TAG_SERVICE;
pkt.data.service.incident = CELL_SPURS_TRACE_SERVICE_WAIT;
cellSpursModulePutTrace(&pkt, ctxt->dmaTagId);
spursSysServiceIdleHandler(spu, ctxt);
goto poll;
}
}
// Process any requests
void spursSysServiceProcessRequests(spu_thread& spu, SpursKernelContext* ctxt)
{
bool updateTrace = false;
bool updateWorkload = false;
bool terminate = false;
//vm::reservation_op(vm::cast(ctxt->spurs.addr() + OFFSET_32(CellSpurs, wklState1), HERE), 128, [&]()
{
auto spurs = ctxt->spurs.get_ptr();
// Terminate request
if (spurs->sysSrvMsgTerminate & (1 << ctxt->spuNum))
{
spurs->sysSrvOnSpu &= ~(1 << ctxt->spuNum);
terminate = true;
}
// Update workload message
if (spurs->sysSrvMsgUpdateWorkload & (1 << ctxt->spuNum))
{
spurs->sysSrvMsgUpdateWorkload &= ~(1 << ctxt->spuNum);
updateWorkload = true;
}
// Update trace message
if (spurs->sysSrvTrace.load().sysSrvMsgUpdateTrace & (1 << ctxt->spuNum))
{
updateTrace = true;
}
std::memcpy(spu._ptr<void>(0x2D80), spurs->wklState1, 128);
}//);
// Process update workload message
if (updateWorkload)
{
spursSysServiceActivateWorkload(spu, ctxt);
}
// Process update trace message
if (updateTrace)
{
spursSysServiceTraceUpdate(spu, ctxt, 1, 0, 0);
}
// Process terminate request
if (terminate)
{
// TODO: Rest of the terminate processing
}
}
// Activate a workload
void spursSysServiceActivateWorkload(spu_thread& spu, SpursKernelContext* ctxt)
{
const auto spurs = spu._ptr<CellSpurs>(0x100);
std::memcpy(spu._ptr<void>(0x30000), ctxt->spurs->wklInfo1, 0x200);
if (spurs->flags1 & SF1_32_WORKLOADS)
{
std::memcpy(spu._ptr<void>(0x30200), ctxt->spurs->wklInfo2, 0x200);
}
u32 wklShutdownBitSet = 0;
ctxt->wklRunnable1 = 0;
ctxt->wklRunnable2 = 0;
for (u32 i = 0; i < CELL_SPURS_MAX_WORKLOAD; i++)
{
const auto wklInfo1 = spu._ptr<CellSpurs::WorkloadInfo>(0x30000);
// Copy the priority of the workload for this SPU and its unique id to the LS
ctxt->priority[i] = wklInfo1[i].priority[ctxt->spuNum] == 0 ? 0 : 0x10 - wklInfo1[i].priority[ctxt->spuNum];
ctxt->wklUniqueId[i] = wklInfo1[i].uniqueId;
if (spurs->flags1 & SF1_32_WORKLOADS)
{
const auto wklInfo2 = spu._ptr<CellSpurs::WorkloadInfo>(0x30200);
// Copy the priority of the workload for this SPU to the LS
if (wklInfo2[i].priority[ctxt->spuNum])
{
ctxt->priority[i] |= (0x10 - wklInfo2[i].priority[ctxt->spuNum]) << 4;
}
}
}
//vm::reservation_op(ctxt->spurs.ptr(&CellSpurs::wklState1).addr(), 128, [&]()
{
auto spurs = ctxt->spurs.get_ptr();
for (u32 i = 0; i < CELL_SPURS_MAX_WORKLOAD; i++)
{
// Update workload status and runnable flag based on the workload state
auto wklStatus = spurs->wklStatus1[i];
if (spurs->wklState1[i] == SPURS_WKL_STATE_RUNNABLE)
{
spurs->wklStatus1[i] |= 1 << ctxt->spuNum;
ctxt->wklRunnable1 |= 0x8000 >> i;
}
else
{
spurs->wklStatus1[i] &= ~(1 << ctxt->spuNum);
}
// If the workload is shutting down and if this is the last SPU from which it is being removed then
// add it to the shutdown bit set
if (spurs->wklState1[i] == SPURS_WKL_STATE_SHUTTING_DOWN)
{
if (((wklStatus & (1 << ctxt->spuNum)) != 0) && (spurs->wklStatus1[i] == 0))
{
spurs->wklState1[i] = SPURS_WKL_STATE_REMOVABLE;
wklShutdownBitSet |= 0x80000000u >> i;
}
}
if (spurs->flags1 & SF1_32_WORKLOADS)
{
// Update workload status and runnable flag based on the workload state
wklStatus = spurs->wklStatus2[i];
if (spurs->wklState2[i] == SPURS_WKL_STATE_RUNNABLE)
{
spurs->wklStatus2[i] |= 1 << ctxt->spuNum;
ctxt->wklRunnable2 |= 0x8000 >> i;
}
else
{
spurs->wklStatus2[i] &= ~(1 << ctxt->spuNum);
}
// If the workload is shutting down and if this is the last SPU from which it is being removed then
// add it to the shutdown bit set
if (spurs->wklState2[i] == SPURS_WKL_STATE_SHUTTING_DOWN)
{
if (((wklStatus & (1 << ctxt->spuNum)) != 0) && (spurs->wklStatus2[i] == 0))
{
spurs->wklState2[i] = SPURS_WKL_STATE_REMOVABLE;
wklShutdownBitSet |= 0x8000 >> i;
}
}
}
}
std::memcpy(spu._ptr<void>(0x2D80), spurs->wklState1, 128);
}//);
if (wklShutdownBitSet)
{
spursSysServiceUpdateShutdownCompletionEvents(spu, ctxt, wklShutdownBitSet);
}
}
// Update shutdown completion events
void spursSysServiceUpdateShutdownCompletionEvents(spu_thread& spu, SpursKernelContext* ctxt, u32 wklShutdownBitSet)
{
// Mark the workloads in wklShutdownBitSet as completed and also generate a bit set of the completed
// workloads that have a shutdown completion hook registered
u32 wklNotifyBitSet;
u8 spuPort;
//vm::reservation_op(ctxt->spurs.ptr(&CellSpurs::wklState1).addr(), 128, [&]()
{
auto spurs = ctxt->spurs.get_ptr();
wklNotifyBitSet = 0;
spuPort = spurs->spuPort;
for (u32 i = 0; i < CELL_SPURS_MAX_WORKLOAD; i++)
{
if (wklShutdownBitSet & (0x80000000u >> i))
{
spurs->wklEvent1[i] |= 0x01;
if (spurs->wklEvent1[i] & 0x02 || spurs->wklEvent1[i] & 0x10)
{
wklNotifyBitSet |= 0x80000000u >> i;
}
}
if (wklShutdownBitSet & (0x8000 >> i))
{
spurs->wklEvent2[i] |= 0x01;
if (spurs->wklEvent2[i] & 0x02 || spurs->wklEvent2[i] & 0x10)
{
wklNotifyBitSet |= 0x8000 >> i;
}
}
}
std::memcpy(spu._ptr<void>(0x2D80), spurs->wklState1, 128);
}//);
if (wklNotifyBitSet)
{
// TODO: sys_spu_thread_send_event(spuPort, 0, wklNotifyMask);
}
}
// Update the trace count for this SPU
void spursSysServiceTraceSaveCount(spu_thread& spu, SpursKernelContext* ctxt)
{
if (ctxt->traceBuffer)
{
auto traceInfo = vm::ptr<CellSpursTraceInfo>::make(vm::cast(ctxt->traceBuffer - (ctxt->spurs->traceStartIndex[ctxt->spuNum] << 4)));
traceInfo->count[ctxt->spuNum] = ctxt->traceMsgCount;
}
}
// Update trace control
void spursSysServiceTraceUpdate(spu_thread& spu, SpursKernelContext* ctxt, u32 arg2, u32 arg3, u32 forceNotify)
{
bool notify;
u8 sysSrvMsgUpdateTrace;
//vm::reservation_op(ctxt->spurs.ptr(&CellSpurs::wklState1).addr(), 128, [&]()
{
auto spurs = ctxt->spurs.get_ptr();
auto& trace = spurs->sysSrvTrace.raw();
sysSrvMsgUpdateTrace = trace.sysSrvMsgUpdateTrace;
trace.sysSrvMsgUpdateTrace &= ~(1 << ctxt->spuNum);
trace.sysSrvTraceInitialised &= ~(1 << ctxt->spuNum);
trace.sysSrvTraceInitialised |= arg2 << ctxt->spuNum;
notify = false;
if (((sysSrvMsgUpdateTrace & (1 << ctxt->spuNum)) != 0) && (spurs->sysSrvTrace.load().sysSrvMsgUpdateTrace == 0) && (spurs->sysSrvTrace.load().sysSrvNotifyUpdateTraceComplete != 0))
{
trace.sysSrvNotifyUpdateTraceComplete = 0;
notify = true;
}
if (forceNotify && spurs->sysSrvTrace.load().sysSrvNotifyUpdateTraceComplete != 0)
{
trace.sysSrvNotifyUpdateTraceComplete = 0;
notify = true;
}
std::memcpy(spu._ptr<void>(0x2D80), spurs->wklState1, 128);
}//);
// Get trace parameters from CellSpurs and store them in the LS
if (((sysSrvMsgUpdateTrace & (1 << ctxt->spuNum)) != 0) || (arg3 != 0))
{
//vm::reservation_acquire(spu._ptr<void>(0x80), ctxt->spurs.ptr(&CellSpurs::traceBuffer).addr(), 128);
auto spurs = spu._ptr<CellSpurs>(0x80 - offset32(&CellSpurs::traceBuffer));
if (ctxt->traceMsgCount != 0xffu || spurs->traceBuffer.addr() == 0u)
{
spursSysServiceTraceSaveCount(spu, ctxt);
}
else
{
const auto traceBuffer = spu._ptr<CellSpursTraceInfo>(0x2C00);
std::memcpy(traceBuffer, vm::base(vm::cast(spurs->traceBuffer.addr(), HERE) & -0x4), 0x80);
ctxt->traceMsgCount = traceBuffer->count[ctxt->spuNum];
}
ctxt->traceBuffer = spurs->traceBuffer.addr() + (spurs->traceStartIndex[ctxt->spuNum] << 4);
ctxt->traceMaxCount = spurs->traceStartIndex[1] - spurs->traceStartIndex[0];
if (ctxt->traceBuffer == 0u)
{
ctxt->traceMsgCount = 0u;
}
}
if (notify)
{
auto spurs = spu._ptr<CellSpurs>(0x2D80 - offset32(&CellSpurs::wklState1));
sys_spu_thread_send_event(spu, spurs->spuPort, 2, 0);
}
}
// Restore state after executing the system workload
void spursSysServiceCleanupAfterSystemWorkload(spu_thread& spu, SpursKernelContext* ctxt)
{
u8 wklId;
bool do_return = false;
//vm::reservation_op(ctxt->spurs.ptr(&CellSpurs::wklState1).addr(), 128, [&]()
{
auto spurs = ctxt->spurs.get_ptr();
if (spurs->sysSrvPreemptWklId[ctxt->spuNum] == 0xFF)
{
do_return = true;
return;
}
wklId = spurs->sysSrvPreemptWklId[ctxt->spuNum];
spurs->sysSrvPreemptWklId[ctxt->spuNum] = 0xFF;
std::memcpy(spu._ptr<void>(0x2D80), spurs->wklState1, 128);
}//);
if (do_return) return;
spursSysServiceActivateWorkload(spu, ctxt);
//vm::reservation_op(vm::cast(ctxt->spurs.addr(), HERE), 128, [&]()
{
auto spurs = ctxt->spurs.get_ptr();
if (wklId >= CELL_SPURS_MAX_WORKLOAD)
{
spurs->wklCurrentContention[wklId & 0x0F] -= 0x10;
spurs->wklReadyCount1[wklId & 0x0F].raw() -= 1;
}
else
{
spurs->wklCurrentContention[wklId & 0x0F] -= 0x01;
spurs->wklIdleSpuCountOrReadyCount2[wklId & 0x0F].raw() -= 1;
}
std::memcpy(spu._ptr<void>(0x100), spurs, 128);
}//);
// Set the current workload id to the id of the pre-empted workload since cellSpursModulePutTrace
// uses the current workload id to determine the workload to which the trace belongs
auto wklIdSaved = ctxt->wklCurrentId;
ctxt->wklCurrentId = wklId;
// Trace - STOP: GUID
CellSpursTracePacket pkt{};
pkt.header.tag = CELL_SPURS_TRACE_TAG_STOP;
pkt.data.stop = SPURS_GUID_SYS_WKL;
cellSpursModulePutTrace(&pkt, ctxt->dmaTagId);
ctxt->wklCurrentId = wklIdSaved;
}
//----------------------------------------------------------------------------
// SPURS taskset policy module functions
//----------------------------------------------------------------------------
enum SpursTasksetRequest
{
SPURS_TASKSET_REQUEST_POLL_SIGNAL = -1,
SPURS_TASKSET_REQUEST_DESTROY_TASK = 0,
SPURS_TASKSET_REQUEST_YIELD_TASK = 1,
SPURS_TASKSET_REQUEST_WAIT_SIGNAL = 2,
SPURS_TASKSET_REQUEST_POLL = 3,
SPURS_TASKSET_REQUEST_WAIT_WKL_FLAG = 4,
SPURS_TASKSET_REQUEST_SELECT_TASK = 5,
SPURS_TASKSET_REQUEST_RECV_WKL_FLAG = 6,
};
// Taskset PM entry point
bool spursTasksetEntry(spu_thread& spu)
{
auto ctxt = spu._ptr<SpursTasksetContext>(0x2700);
auto kernelCtxt = spu._ptr<SpursKernelContext>(spu.gpr[3]._u32[3]);
auto arg = spu.gpr[4]._u64[1];
auto pollStatus = spu.gpr[5]._u32[3];
// Initialise memory and save args
memset(ctxt, 0, sizeof(*ctxt));
ctxt->taskset.set(arg);
memcpy(ctxt->moduleId, "SPURSTASK MODULE", sizeof(ctxt->moduleId));
ctxt->kernelMgmtAddr = spu.gpr[3]._u32[3];
ctxt->syscallAddr = CELL_SPURS_TASKSET_PM_SYSCALL_ADDR;
ctxt->spuNum = kernelCtxt->spuNum;
ctxt->dmaTagId = kernelCtxt->dmaTagId;
ctxt->taskId = 0xFFFFFFFF;
// Register SPURS taskset policy module HLE functions
//spu.UnregisterHleFunctions(CELL_SPURS_TASKSET_PM_ENTRY_ADDR, 0x40000/*LS_BOTTOM*/);
//spu.RegisterHleFunction(CELL_SPURS_TASKSET_PM_ENTRY_ADDR, spursTasksetEntry);
//spu.RegisterHleFunction(ctxt->syscallAddr, spursTasksetSyscallEntry);
{
// Initialise the taskset policy module
spursTasksetInit(spu, pollStatus);
// Dispatch
spursTasksetDispatch(spu);
}
return false;
}
// Entry point into the Taskset PM for task syscalls
bool spursTasksetSyscallEntry(spu_thread& spu)
{
auto ctxt = spu._ptr<SpursTasksetContext>(0x2700);
{
// Save task context
ctxt->savedContextLr = spu.gpr[0];
ctxt->savedContextSp = spu.gpr[1];
for (auto i = 0; i < 48; i++)
{
ctxt->savedContextR80ToR127[i] = spu.gpr[80 + i];
}
// Handle the syscall
spu.gpr[3]._u32[3] = spursTasksetProcessSyscall(spu, spu.gpr[3]._u32[3], spu.gpr[4]._u32[3]);
// Resume the previously executing task if the syscall did not cause a context switch
fmt::throw_exception("Broken (TODO)" HERE);
//if (spu.m_is_branch == false) {
// spursTasksetResumeTask(spu);
//}
}
return false;
}
// Resume a task
void spursTasksetResumeTask(spu_thread& spu)
{
auto ctxt = spu._ptr<SpursTasksetContext>(0x2700);
// Restore task context
spu.gpr[0] = ctxt->savedContextLr;
spu.gpr[1] = ctxt->savedContextSp;
for (auto i = 0; i < 48; i++)
{
spu.gpr[80 + i] = ctxt->savedContextR80ToR127[i];
}
spu.pc = spu.gpr[0]._u32[3];
}
// Start a task
void spursTasksetStartTask(spu_thread& spu, CellSpursTaskArgument& taskArgs)
{
auto ctxt = spu._ptr<SpursTasksetContext>(0x2700);
auto taskset = spu._ptr<CellSpursTaskset>(0x2700);
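// Set up the task's initial register state: r3 = task argument, r4 = SPURS address and taskset argument; r2 and r5-r127 are cleared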
spu.gpr[2].clear();
spu.gpr[3] = v128::from64r(taskArgs._u64[0], taskArgs._u64[1]);
spu.gpr[4]._u64[1] = taskset->args;
spu.gpr[4]._u64[0] = taskset->spurs.addr();
for (auto i = 5; i < 128; i++)
{
spu.gpr[i].clear();
}
spu.pc = ctxt->savedContextLr.value()._u32[3];
}
// Process a request and update the state of the taskset
s32 spursTasksetProcessRequest(spu_thread& spu, s32 request, u32* taskId, u32* isWaiting)
{
auto kernelCtxt = spu._ptr<SpursKernelContext>(0x100);
auto ctxt = spu._ptr<SpursTasksetContext>(0x2700);
s32 rc = CELL_OK;
s32 numNewlyReadyTasks = 0;
//vm::reservation_op(vm::cast(ctxt->taskset.addr(), HERE), 128, [&]()
{
auto taskset = ctxt->taskset;
v128 waiting = vm::_ref<v128>(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::waiting));
v128 running = vm::_ref<v128>(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::running));
v128 ready = vm::_ref<v128>(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::ready));
v128 pready = vm::_ref<v128>(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::pending_ready));
v128 enabled = vm::_ref<v128>(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::enabled));
v128 signalled = vm::_ref<v128>(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::signalled));
// Verify taskset state is valid
if ((waiting & running) != v128{} || (ready & pready) != v128{} ||
(v128::andnot(enabled, running | ready | pready | signalled | waiting) != v128{}))
{
spu_log.error("Invalid taskset state");
spursHalt(spu);
}
// Find the number of tasks that have become ready since the last iteration
{
auto newlyReadyTasks = v128::andnot(ready, signalled | pready);
// TODO: Optimize this with std::popcount once it's known to be fixed
for (auto i = 0; i < 128; i++)
{
if (newlyReadyTasks._bit[i])
{
numNewlyReadyTasks++;
}
}
}
v128 readyButNotRunning;
u8 selectedTaskId;
v128 signalled0 = (signalled & (ready | pready));
v128 ready0 = (signalled | ready | pready);
switch (request)
{
case SPURS_TASKSET_REQUEST_POLL_SIGNAL:
{
rc = signalled0._bit[ctxt->taskId] ? 1 : 0;
signalled0._bit[ctxt->taskId] = false;
break;
}
case SPURS_TASKSET_REQUEST_DESTROY_TASK:
{
numNewlyReadyTasks--;
running._bit[ctxt->taskId] = false;
enabled._bit[ctxt->taskId] = false;
signalled0._bit[ctxt->taskId] = false;
ready0._bit[ctxt->taskId] = false;
break;
}
case SPURS_TASKSET_REQUEST_YIELD_TASK:
{
running._bit[ctxt->taskId] = false;
waiting._bit[ctxt->taskId] = true;
break;
}
case SPURS_TASKSET_REQUEST_WAIT_SIGNAL:
{
if (signalled0._bit[ctxt->taskId] == false)
{
numNewlyReadyTasks--;
running._bit[ctxt->taskId] = false;
waiting._bit[ctxt->taskId] = true;
signalled0._bit[ctxt->taskId] = false;
ready0._bit[ctxt->taskId] = false;
}
break;
}
case SPURS_TASKSET_REQUEST_POLL:
{
readyButNotRunning = v128::andnot(running, ready0);
if (taskset->wkl_flag_wait_task < CELL_SPURS_MAX_TASK)
{
readyButNotRunning._bit[taskset->wkl_flag_wait_task] = false;
}
rc = readyButNotRunning != v128{} ? 1 : 0;
break;
}
case SPURS_TASKSET_REQUEST_WAIT_WKL_FLAG:
{
if (taskset->wkl_flag_wait_task == 0x81)
{
// A workload flag is already pending so consume it
taskset->wkl_flag_wait_task = 0x80;
rc = 0;
}
else if (taskset->wkl_flag_wait_task == 0x80)
{
// No tasks are waiting for the workload flag. Mark this task as waiting for the workload flag.
taskset->wkl_flag_wait_task = ctxt->taskId;
running._bit[ctxt->taskId] = false;
waiting._bit[ctxt->taskId] = true;
rc = 1;
numNewlyReadyTasks--;
}
else
{
// Another task is already waiting for the workload signal
rc = CELL_SPURS_TASK_ERROR_BUSY;
}
break;
}
case SPURS_TASKSET_REQUEST_SELECT_TASK:
{
readyButNotRunning = v128::andnot(running, ready0);
if (taskset->wkl_flag_wait_task < CELL_SPURS_MAX_TASK)
{
readyButNotRunning._bit[taskset->wkl_flag_wait_task] = false;
}
// Select a task from the readyButNotRunning set to run. Start from the task after the last scheduled task to ensure fairness.
for (selectedTaskId = taskset->last_scheduled_task + 1; selectedTaskId < 128; selectedTaskId++)
{
if (readyButNotRunning._bit[selectedTaskId])
{
break;
}
}
if (selectedTaskId == 128)
{
for (selectedTaskId = 0; selectedTaskId < taskset->last_scheduled_task + 1; selectedTaskId++)
{
if (readyButNotRunning._bit[selectedTaskId])
{
break;
}
}
if (selectedTaskId == taskset->last_scheduled_task + 1)
{
selectedTaskId = CELL_SPURS_MAX_TASK;
}
}
*taskId = selectedTaskId;
*isWaiting = waiting._bit[selectedTaskId < CELL_SPURS_MAX_TASK ? selectedTaskId : 0] ? 1 : 0;
if (selectedTaskId != CELL_SPURS_MAX_TASK)
{
taskset->last_scheduled_task = selectedTaskId;
running._bit[selectedTaskId] = true;
waiting._bit[selectedTaskId] = false;
}
break;
}
case SPURS_TASKSET_REQUEST_RECV_WKL_FLAG:
{
if (taskset->wkl_flag_wait_task < CELL_SPURS_MAX_TASK)
{
// There is a task waiting for the workload flag
taskset->wkl_flag_wait_task = 0x80;
rc = 1;
numNewlyReadyTasks++;
}
else
{
// No tasks are waiting for the workload flag
taskset->wkl_flag_wait_task = 0x81;
rc = 0;
}
break;
}
default:
spu_log.error("Unknown taskset request");
spursHalt(spu);
}
vm::_ref<v128>(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::waiting)) = waiting;
vm::_ref<v128>(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::running)) = running;
vm::_ref<v128>(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::ready)) = ready;
vm::_ref<v128>(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::pending_ready)) = v128{};
vm::_ref<v128>(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::enabled)) = enabled;
vm::_ref<v128>(ctxt->taskset.addr() + ::offset32(&CellSpursTaskset::signalled)) = signalled;
std::memcpy(spu._ptr<void>(0x2700), spu._ptr<void>(0x100), 128); // Copy data
}//);
// Increment the ready count of the workload by the number of tasks that have become ready
if (numNewlyReadyTasks)
{
auto spurs = kernelCtxt->spurs;
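// Clamp the ready count to [0, 255] since it is stored as an 8-bit value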
vm::light_op(spurs->readyCount(kernelCtxt->wklCurrentId), [&](atomic_t<u8>& val)
{
val.fetch_op([&](u8& val)
{
const s32 _new = val + numNewlyReadyTasks;
val = static_cast<u8>(std::clamp<s32>(_new, 0, 0xFF));
});
});
}
return rc;
}
// Process pollStatus received from the SPURS kernel
void spursTasksetProcessPollStatus(spu_thread& spu, u32 pollStatus)
{
if (pollStatus & CELL_SPURS_MODULE_POLL_STATUS_FLAG)
{
spursTasksetProcessRequest(spu, SPURS_TASKSET_REQUEST_RECV_WKL_FLAG, nullptr, nullptr);
}
}
// Check execution rights
bool spursTasksetPollStatus(spu_thread& spu)
{
u32 pollStatus;
if (cellSpursModulePollStatus(spu, &pollStatus))
{
return true;
}
spursTasksetProcessPollStatus(spu, pollStatus);
return false;
}
// Exit the Taskset PM
void spursTasksetExit(spu_thread& spu)
{
auto ctxt = spu._ptr<SpursTasksetContext>(0x2700);
// Trace - STOP
CellSpursTracePacket pkt{};
pkt.header.tag = 0x54; // It's not clear what this tag means exactly, but it seems similar to CELL_SPURS_TRACE_TAG_STOP
pkt.data.stop = SPURS_GUID_TASKSET_PM;
cellSpursModulePutTrace(&pkt, ctxt->dmaTagId);
// Not sure why this check exists. Perhaps to check for memory corruption.
if (memcmp(ctxt->moduleId, "SPURSTASK MODULE", 16) != 0)
{
spu_log.error("spursTasksetExit(): memory corruption");
spursHalt(spu);
}
cellSpursModuleExit(spu);
}
// Invoked when a task exits
void spursTasksetOnTaskExit(spu_thread& spu, u64 addr, u32 taskId, s32 exitCode, u64 args)
{
auto ctxt = spu._ptr<SpursTasksetContext>(0x2700);
std::memcpy(spu._ptr<void>(0x10000), vm::base(addr & -0x80), (addr & 0x7F) << 11);
spu.gpr[3]._u64[1] = ctxt->taskset.addr();
spu.gpr[4]._u32[3] = taskId;
spu.gpr[5]._u32[3] = exitCode;
spu.gpr[6]._u64[1] = args;
spu.fast_call(0x10000);
}
// Save the context of a task
s32 spursTasketSaveTaskContext(spu_thread& spu)
{
auto ctxt = spu._ptr<SpursTasksetContext>(0x2700);
auto taskInfo = spu._ptr<CellSpursTaskset::TaskInfo>(0x2780);
//spursDmaWaitForCompletion(spu, 0xFFFFFFFF);
if (taskInfo->context_save_storage_and_alloc_ls_blocks == 0u)
{
return CELL_SPURS_TASK_ERROR_STAT;
}
u32 allocLsBlocks = static_cast<u32>(taskInfo->context_save_storage_and_alloc_ls_blocks & 0x7F);
u32 lsBlocks = 0;
v128 ls_pattern = v128::from64r(taskInfo->ls_pattern._u64[0], taskInfo->ls_pattern._u64[1]);
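// Each bit of ls_pattern corresponds to a 2 KiB block of local storage (hence the << 11 shifts below)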
for (auto i = 0; i < 128; i++)
{
if (ls_pattern._bit[i])
{
lsBlocks++;
}
}
if (lsBlocks > allocLsBlocks)
{
return CELL_SPURS_TASK_ERROR_STAT;
}
// Make sure the stack area is included in the ls pattern
for (auto i = (ctxt->savedContextSp.value()._u32[3]) >> 11; i < 128; i++)
{
if (ls_pattern._bit[i] == false)
{
return CELL_SPURS_TASK_ERROR_STAT;
}
}
// Get the processor context
v128 r;
spu.fpscr.Read(r);
ctxt->savedContextFpscr = r;
ctxt->savedSpuWriteEventMask = static_cast<u32>(spu.get_ch_value(SPU_RdEventMask));
ctxt->savedWriteTagGroupQueryMask = static_cast<u32>(spu.get_ch_value(MFC_RdTagMask));
// Store the processor context
const u32 contextSaveStorage = vm::cast(taskInfo->context_save_storage_and_alloc_ls_blocks & -0x80, HERE);
std::memcpy(vm::base(contextSaveStorage), spu._ptr<void>(0x2C80), 0x380);
// Save LS context
for (auto i = 6; i < 128; i++)
{
if (ls_pattern._bit[i])
{
// TODO: Combine DMA requests for consecutive blocks into a single request
std::memcpy(vm::base(contextSaveStorage + 0x400 + ((i - 6) << 11)), spu._ptr<void>(CELL_SPURS_TASK_TOP + ((i - 6) << 11)), 0x800);
}
}
//spursDmaWaitForCompletion(spu, 1 << ctxt->dmaTagId);
return CELL_OK;
}
// Taskset dispatcher
void spursTasksetDispatch(spu_thread& spu)
{
const auto ctxt = spu._ptr<SpursTasksetContext>(0x2700);
const auto taskset = spu._ptr<CellSpursTaskset>(0x2700);
u32 taskId;
u32 isWaiting;
spursTasksetProcessRequest(spu, SPURS_TASKSET_REQUEST_SELECT_TASK, &taskId, &isWaiting);
if (taskId >= CELL_SPURS_MAX_TASK)
{
spursTasksetExit(spu);
return;
}
ctxt->taskId = taskId;
// DMA in the task info for the selected task
const auto taskInfo = spu._ptr<CellSpursTaskset::TaskInfo>(0x2780);
std::memcpy(taskInfo, &ctxt->taskset->task_info[taskId], sizeof(CellSpursTaskset::TaskInfo));
auto elfAddr = taskInfo->elf.addr().value();
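// The low bits of the ELF address double as flags (tested below via elfAddr & 5 and elfAddr & 2); mask them off to get the real address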
taskInfo->elf.set(taskInfo->elf.addr() & 0xFFFFFFFFFFFFFFF8);
// Trace - Task: Incident=dispatch
CellSpursTracePacket pkt{};
pkt.header.tag = CELL_SPURS_TRACE_TAG_TASK;
pkt.data.task.incident = CELL_SPURS_TRACE_TASK_DISPATCH;
pkt.data.task.taskId = taskId;
cellSpursModulePutTrace(&pkt, CELL_SPURS_KERNEL_DMA_TAG_ID);
if (isWaiting == 0)
{
// If we reach here it means that the task is being started and not being resumed
std::memset(spu._ptr<void>(CELL_SPURS_TASK_TOP), 0, CELL_SPURS_TASK_BOTTOM - CELL_SPURS_TASK_TOP);
ctxt->guidAddr = CELL_SPURS_TASK_TOP;
u32 entryPoint;
u32 lowestLoadAddr;
if (spursTasksetLoadElf(spu, &entryPoint, &lowestLoadAddr, taskInfo->elf.addr(), false) != CELL_OK)
{
spu_log.error("spursTaskLoadElf() failed");
spursHalt(spu);
}
//spursDmaWaitForCompletion(spu, 1 << ctxt->dmaTagId);
ctxt->savedContextLr = v128::from32r(entryPoint);
ctxt->guidAddr = lowestLoadAddr;
ctxt->tasksetMgmtAddr = 0x2700;
ctxt->x2FC0 = 0;
ctxt->taskExitCode = isWaiting;
ctxt->x2FD4 = elfAddr & 5; // TODO: Figure this out
if ((elfAddr & 5) == 1)
{
std::memcpy(spu._ptr<void>(0x2FC0), &vm::_ptr<CellSpursTaskset2>(vm::cast(ctxt->taskset.addr()))->task_exit_code[taskId], 0x10);
}
// Trace - GUID
pkt = {};
pkt.header.tag = CELL_SPURS_TRACE_TAG_GUID;
pkt.data.guid = 0; // TODO: Put GUID of taskId here
cellSpursModulePutTrace(&pkt, 0x1F);
if (elfAddr & 2)
{
// TODO: Figure this out
spu_runtime::g_escape(&spu);
}
spursTasksetStartTask(spu, taskInfo->args);
}
else
{
if (taskset->enable_clear_ls)
{
std::memset(spu._ptr<void>(CELL_SPURS_TASK_TOP), 0, CELL_SPURS_TASK_BOTTOM - CELL_SPURS_TASK_TOP);
}
// If the entire LS is saved then there is no need to load the ELF as it will be saved in the context save area as well
v128 ls_pattern = v128::from64r(taskInfo->ls_pattern._u64[0], taskInfo->ls_pattern._u64[1]);
if (ls_pattern != v128::from64r(0x03FFFFFFFFFFFFFFull, 0xFFFFFFFFFFFFFFFFull))
{
// Load the ELF
u32 entryPoint;
if (spursTasksetLoadElf(spu, &entryPoint, nullptr, taskInfo->elf.addr(), true) != CELL_OK)
{
spu_log.error("spursTasksetLoadElf() failed");
spursHalt(spu);
}
}
// Load saved context from main memory to LS
const u32 contextSaveStorage = vm::cast(taskInfo->context_save_storage_and_alloc_ls_blocks & -0x80, HERE);
std::memcpy(spu._ptr<void>(0x2C80), vm::base(contextSaveStorage), 0x380);
for (auto i = 6; i < 128; i++)
{
if (ls_pattern._bit[i])
{
// TODO: Combine DMA requests for consecutive blocks into a single request
std::memcpy(spu._ptr<void>(CELL_SPURS_TASK_TOP + ((i - 6) << 11)), vm::base(contextSaveStorage + 0x400 + ((i - 6) << 11)), 0x800);
}
}
//spursDmaWaitForCompletion(spu, 1 << ctxt->dmaTagId);
// Restore saved registers
spu.fpscr.Write(ctxt->savedContextFpscr.value());
spu.set_ch_value(MFC_WrTagMask, ctxt->savedWriteTagGroupQueryMask);
spu.set_ch_value(SPU_WrEventMask, ctxt->savedSpuWriteEventMask);
// Trace - GUID
pkt = {};
pkt.header.tag = CELL_SPURS_TRACE_TAG_GUID;
pkt.data.guid = 0; // TODO: Put GUID of taskId here
cellSpursModulePutTrace(&pkt, 0x1F);
if (elfAddr & 2)
{
// TODO: Figure this out
spu_runtime::g_escape(&spu);
}
spu.gpr[3].clear();
spursTasksetResumeTask(spu);
}
}
// Process a syscall request
s32 spursTasksetProcessSyscall(spu_thread& spu, u32 syscallNum, u32 args)
{
auto ctxt = spu._ptr<SpursTasksetContext>(0x2700);
auto taskset = spu._ptr<CellSpursTaskset>(0x2700);
// If the 0x10 bit is set in syscallNum then it's the 2nd version of the
// syscall (e.g. cellSpursYield2 instead of cellSpursYield) and so don't wait
// for DMA completion
if ((syscallNum & 0x10) == 0)
{
//spursDmaWaitForCompletion(spu, 0xFFFFFFFF);
}
s32 rc = 0;
u32 incident = 0;
switch (syscallNum & 0x0F)
{
case CELL_SPURS_TASK_SYSCALL_EXIT:
if (ctxt->x2FD4 == 4u || (ctxt->x2FC0 & 0xffffffffu) != 0u)
{ // TODO: Figure this out
if (ctxt->x2FD4 != 4u)
{
spursTasksetProcessRequest(spu, SPURS_TASKSET_REQUEST_DESTROY_TASK, nullptr, nullptr);
}
const u64 addr = ctxt->x2FD4 == 4u ? +taskset->x78 : +ctxt->x2FC0;
const u64 args = ctxt->x2FD4 == 4u ? 0 : +ctxt->x2FC8;
spursTasksetOnTaskExit(spu, addr, ctxt->taskId, ctxt->taskExitCode, args);
}
incident = CELL_SPURS_TRACE_TASK_EXIT;
break;
case CELL_SPURS_TASK_SYSCALL_YIELD:
if (spursTasksetPollStatus(spu) || spursTasksetProcessRequest(spu, SPURS_TASKSET_REQUEST_POLL, nullptr, nullptr))
{
// If we reach here then it means that either another task can be scheduled or another workload can be scheduled
// Save the context of the current task
rc = spursTasketSaveTaskContext(spu);
if (rc == CELL_OK)
{
spursTasksetProcessRequest(spu, SPURS_TASKSET_REQUEST_YIELD_TASK, nullptr, nullptr);
incident = CELL_SPURS_TRACE_TASK_YIELD;
}
}
break;
case CELL_SPURS_TASK_SYSCALL_WAIT_SIGNAL:
if (spursTasksetProcessRequest(spu, SPURS_TASKSET_REQUEST_POLL_SIGNAL, nullptr, nullptr) == 0)
{
rc = spursTasketSaveTaskContext(spu);
if (rc == CELL_OK)
{
if (spursTasksetProcessRequest(spu, SPURS_TASKSET_REQUEST_WAIT_SIGNAL, nullptr, nullptr) == 0)
{
incident = CELL_SPURS_TRACE_TASK_WAIT;
}
}
}
break;
case CELL_SPURS_TASK_SYSCALL_POLL:
rc = spursTasksetPollStatus(spu) ? CELL_SPURS_TASK_POLL_FOUND_WORKLOAD : 0;
rc |= spursTasksetProcessRequest(spu, SPURS_TASKSET_REQUEST_POLL, nullptr, nullptr) ? CELL_SPURS_TASK_POLL_FOUND_TASK : 0;
break;
case CELL_SPURS_TASK_SYSCALL_RECV_WKL_FLAG:
if (args == 0)
{ // TODO: Figure this out
spu_log.error("args == 0");
//spursHalt(spu);
}
if (spursTasksetPollStatus(spu) || spursTasksetProcessRequest(spu, SPURS_TASKSET_REQUEST_WAIT_WKL_FLAG, nullptr, nullptr) != 1)
{
rc = spursTasketSaveTaskContext(spu);
if (rc == CELL_OK)
{
incident = CELL_SPURS_TRACE_TASK_WAIT;
}
}
break;
default:
rc = CELL_SPURS_TASK_ERROR_NOSYS;
break;
}
if (incident)
{
// Trace - TASK
CellSpursTracePacket pkt{};
pkt.header.tag = CELL_SPURS_TRACE_TAG_TASK;
pkt.data.task.incident = incident;
pkt.data.task.taskId = ctxt->taskId;
cellSpursModulePutTrace(&pkt, ctxt->dmaTagId);
// Clear the GUID of the task
std::memset(spu._ptr<void>(ctxt->guidAddr), 0, 0x10);
if (spursTasksetPollStatus(spu))
{
spursTasksetExit(spu);
}
else
{
spursTasksetDispatch(spu);
}
}
return rc;
}
// Initialise the Taskset PM
void spursTasksetInit(spu_thread& spu, u32 pollStatus)
{
auto ctxt = spu._ptr<SpursTasksetContext>(0x2700);
auto kernelCtxt = spu._ptr<SpursKernelContext>(0x100);
kernelCtxt->moduleId[0] = 'T';
kernelCtxt->moduleId[1] = 'K';
// Trace - START: Module='TKST'
CellSpursTracePacket pkt{};
pkt.header.tag = 0x52; // It's not clear what this tag means exactly, but it seems similar to CELL_SPURS_TRACE_TAG_START
std::memcpy(pkt.data.start._module, "TKST", 4);
pkt.data.start.level = 2;
pkt.data.start.ls = 0xA00 >> 2;
cellSpursModulePutTrace(&pkt, ctxt->dmaTagId);
spursTasksetProcessPollStatus(spu, pollStatus);
}
// Load an ELF
s32 spursTasksetLoadElf(spu_thread& spu, u32* entryPoint, u32* lowestLoadAddr, u64 elfAddr, bool skipWriteableSegments)
{
if (elfAddr == 0 || (elfAddr & 0x0F) != 0)
{
return CELL_SPURS_TASK_ERROR_INVAL;
}
const spu_exec_object obj(fs::file(vm::base(vm::cast(elfAddr, HERE)), u32(0 - elfAddr)));
if (obj != elf_error::ok)
{
return CELL_SPURS_TASK_ERROR_NOEXEC;
}
u32 _lowestLoadAddr = CELL_SPURS_TASK_BOTTOM;
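// First pass: check that every PT_LOAD segment lies within the task area and find the lowest load address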
for (const auto& prog : obj.progs)
{
if (prog.p_paddr >= CELL_SPURS_TASK_BOTTOM)
{
break;
}
if (prog.p_type == 1u /* PT_LOAD */)
{
if (skipWriteableSegments == false || (prog.p_flags & 2u /*PF_W*/ ) == 0u)
{
if (prog.p_vaddr < CELL_SPURS_TASK_TOP || prog.p_vaddr + prog.p_memsz > CELL_SPURS_TASK_BOTTOM)
{
return CELL_SPURS_TASK_ERROR_FAULT;
}
if (_lowestLoadAddr > prog.p_vaddr) _lowestLoadAddr = prog.p_vaddr;
}
}
}
for (const auto& prog : obj.progs)
{
if (prog.p_paddr >= CELL_SPURS_TASK_BOTTOM) // ???
{
break;
}
if (prog.p_type == 1u)
{
if (skipWriteableSegments == false || (prog.p_flags & 2u) == 0u)
{
std::memcpy(spu._ptr<void>(prog.p_vaddr), prog.bin.data(), prog.p_filesz);
}
}
}
*entryPoint = obj.header.e_entry;
if (lowestLoadAddr) *lowestLoadAddr = _lowestLoadAddr;
return CELL_OK;
}
//----------------------------------------------------------------------------
// SPURS jobchain policy module functions
//----------------------------------------------------------------------------
bool spursJobChainEntry(spu_thread& spu)
{
const auto ctxt = spu._ptr<SpursJobChainContext>(0x4a00);
auto kernelCtxt = spu._ptr<SpursKernelContext>(spu.gpr[3]._u32[3]);
auto arg = spu.gpr[4]._u64[1];
auto pollStatus = spu.gpr[5]._u32[3];
// TODO
return false;
}
void spursJobchainPopUrgentCommand(spu_thread& spu)
{
const auto ctxt = spu._ptr<SpursJobChainContext>(0x4a00);
const auto jc = vm::unsafe_ptr_cast<CellSpursJobChain_x00>(+ctxt->jobChain);
const bool alterQueue = ctxt->unkFlag0;
vm::reservation_op(spu, jc, [&](CellSpursJobChain_x00& op)
{
const auto ls = reinterpret_cast<CellSpursJobChain_x00*>(ctxt->tempAreaJobChain);
struct alignas(16) { v128 first, second; } data;
std::memcpy(&data, &op.urgentCmds, sizeof(op.urgentCmds));
if (!alterQueue)
{
// Read the queue, do not modify it
}
else
{
// Move FIFO queue contents one command up
data.first._u64[0] = data.first._u64[1];
data.first._u64[1] = data.second._u64[0];
data.second._u64[0] = data.second._u64[1];
data.second._u64[1] = 0;
}
// Writeback
std::memcpy(&ls->urgentCmds, &data, sizeof(op.urgentCmds));
std::memcpy(&ls->isHalted, &op.unk0[0], 1); // Maybe intended to set it to false
ls->unk5 = 0;
ls->sizeJobDescriptor = op.maxGrabbedJob;
std::memcpy(&op, ls, 128);
});
}