chore: review-cleanup fleet (audit + fix + hardening) (#5158)

2026-04-20 22:23:37 +00:00 · 2026-04-16 19:02:59 -05:00 · 2026-04-16 19:02:59 -05:00 · 17e69c6d4c
commit 17e69c6d4c
parent 872c566ef1
68 changed files with 784 additions and 459 deletions
--- a/core/service/src/commonMain/kotlin/org/meshtastic/core/service/MeshServiceOrchestrator.kt
+++ b/core/service/src/commonMain/kotlin/org/meshtastic/core/service/MeshServiceOrchestrator.kt
@ -19,13 +19,15 @@ package org.meshtastic.core.service
 import co.touchlab.kermit.Logger
 import co.touchlab.kermit.Severity
 import kotlinx.coroutines.CoroutineScope
-import kotlinx.coroutines.Job
+import kotlinx.coroutines.SupervisorJob
+import kotlinx.coroutines.cancel
 import kotlinx.coroutines.flow.launchIn
 import kotlinx.coroutines.flow.onEach
-import org.koin.core.annotation.Named
+import kotlinx.coroutines.isActive
 import org.koin.core.annotation.Single
 import org.meshtastic.core.common.database.DatabaseManager
 import org.meshtastic.core.common.util.handledLaunch
+import org.meshtastic.core.di.CoroutineDispatchers
 import org.meshtastic.core.repository.MeshConnectionManager
 import org.meshtastic.core.repository.MeshMessageProcessor
 import org.meshtastic.core.repository.MeshRouter
@ -59,18 +61,15 @@ class MeshServiceOrchestrator(
    private val takPrefs: TakPrefs,
    private val databaseManager: DatabaseManager,
    private val connectionManager: MeshConnectionManager,
-    @Named("ServiceScope") private val scope: CoroutineScope,
+    private val dispatchers: CoroutineDispatchers,
 ) {
-    private var serviceJob: Job? = null
-    private var takJob: Job? = null
-
-    /** The coroutine scope for the service. */
-    val serviceScope: CoroutineScope
-        get() = scope
+    // Per-start coroutine scope. A fresh scope is created on each start() and cancelled on stop(), so all collectors
+    // launched from start() are torn down cleanly and do not accumulate across start/stop/start cycles.
+    private var scope: CoroutineScope? = null

    /** Whether the orchestrator is currently running. */
    val isRunning: Boolean
-        get() = serviceJob?.isActive == true
+        get() = scope?.isActive == true

    /**
     * Starts the mesh service components and wires up data flows.
@ -85,27 +84,31 @@ class MeshServiceOrchestrator(
        }

        Logger.i { "Starting mesh service orchestrator" }
-        val job = Job()
-        serviceJob = job
+        val newScope = CoroutineScope(SupervisorJob() + dispatchers.default)
+        scope = newScope
+
+        // Drop any bytes that piled up in the service's receivedData channel since the last stop(). The channel
+        // outlives the orchestrator's per-start scope, so without this drain a stop/start cycle would replay stale
+        // packets ahead of the fresh session's firmware handshake.
+        radioInterfaceService.resetReceivedBuffer()

        serviceNotifications.initChannels()
        connectionManager.updateStatusNotification()

        // Observe TAK server pref to start/stop
-        takJob =
-            takPrefs.isTakServerEnabled
-                .onEach { isEnabled ->
-                    if (isEnabled && !takServerManager.isRunning.value) {
-                        Logger.i { "TAK Server enabled by preference, starting integration" }
-                        takMeshIntegration.start(scope)
-                    } else if (!isEnabled && takServerManager.isRunning.value) {
-                        Logger.i { "TAK Server disabled by preference, stopping integration" }
-                        takMeshIntegration.stop()
-                    }
+        takPrefs.isTakServerEnabled
+            .onEach { isEnabled ->
+                if (isEnabled && !takServerManager.isRunning.value) {
+                    Logger.i { "TAK Server enabled by preference, starting integration" }
+                    takMeshIntegration.start(newScope)
+                } else if (!isEnabled && takServerManager.isRunning.value) {
+                    Logger.i { "TAK Server disabled by preference, stopping integration" }
+                    takMeshIntegration.stop()
                }
-                .launchIn(scope)
+            }
+            .launchIn(newScope)

-        scope.handledLaunch {
+        newScope.handledLaunch {
            // Ensure the per-device database is active before the radio connects.
            // On Android this is handled by MeshUtilApplication.init(); on Desktop (and any
            // future KMP host) the orchestrator is the first entry point, so it must initialize
@ -119,18 +122,18 @@ class MeshServiceOrchestrator(

        radioInterfaceService.receivedData
            .onEach { bytes -> messageProcessor.handleFromRadio(bytes, nodeManager.myNodeNum.value) }
-            .launchIn(scope)
+            .launchIn(newScope)

        radioInterfaceService.connectionError
            .onEach { errorMessage -> serviceRepository.setErrorMessage(errorMessage, Severity.Warn) }
-            .launchIn(scope)
+            .launchIn(newScope)

        // Each action is dispatched in its own supervised coroutine so that a failure in one
        // action (e.g. a timeout in sendAdminAwait) cannot terminate the collector and silently
        // drop all subsequent service actions for the rest of the session.
        serviceRepository.serviceAction
-            .onEach { action -> scope.handledLaunch { router.actionHandler.onServiceAction(action) } }
-            .launchIn(scope)
+            .onEach { action -> newScope.handledLaunch { router.actionHandler.onServiceAction(action) } }
+            .launchIn(newScope)

        nodeManager.loadCachedNodeDB()
    }
@ -142,13 +145,11 @@ class MeshServiceOrchestrator(
     */
    fun stop() {
        Logger.i { "Stopping mesh service orchestrator" }
-        takJob?.cancel()
-        takJob = null
        // Guard stop() so we don't emit a spurious "stopped" log when TAK was never started
        if (takServerManager.isRunning.value) {
            takMeshIntegration.stop()
        }
-        serviceJob?.cancel()
-        serviceJob = null
+        scope?.cancel()
+        scope = null
    }
 }
--- a/core/service/src/commonMain/kotlin/org/meshtastic/core/service/SharedRadioInterfaceService.kt
+++ b/core/service/src/commonMain/kotlin/org/meshtastic/core/service/SharedRadioInterfaceService.kt
@ -25,7 +25,9 @@ import kotlinx.coroutines.Job
 import kotlinx.coroutines.SupervisorJob
 import kotlinx.coroutines.cancel
 import kotlinx.coroutines.channels.BufferOverflow
+import kotlinx.coroutines.channels.Channel
 import kotlinx.coroutines.delay
+import kotlinx.coroutines.flow.Flow
 import kotlinx.coroutines.flow.MutableSharedFlow
 import kotlinx.coroutines.flow.MutableStateFlow
 import kotlinx.coroutines.flow.SharedFlow
@ -35,6 +37,7 @@ import kotlinx.coroutines.flow.asStateFlow
 import kotlinx.coroutines.flow.catch
 import kotlinx.coroutines.flow.launchIn
 import kotlinx.coroutines.flow.onEach
+import kotlinx.coroutines.flow.receiveAsFlow
 import kotlinx.coroutines.launch
 import kotlinx.coroutines.sync.Mutex
 import kotlinx.coroutines.sync.withLock
@ -42,7 +45,7 @@ import org.koin.core.annotation.Named
 import org.koin.core.annotation.Single
 import org.meshtastic.core.ble.BluetoothRepository
 import org.meshtastic.core.common.util.handledLaunch
-import org.meshtastic.core.common.util.ignoreException
+import org.meshtastic.core.common.util.ignoreExceptionSuspend
 import org.meshtastic.core.common.util.nowMillis
 import org.meshtastic.core.di.CoroutineDispatchers
 import org.meshtastic.core.model.ConnectionState
@ -95,8 +98,13 @@ class SharedRadioInterfaceService(
    private val _currentDeviceAddressFlow = MutableStateFlow<String?>(radioPrefs.devAddr.value)
    override val currentDeviceAddressFlow: StateFlow<String?> = _currentDeviceAddressFlow.asStateFlow()

-    private val _receivedData = MutableSharedFlow<ByteArray>(extraBufferCapacity = 64)
-    override val receivedData: SharedFlow<ByteArray> = _receivedData
+    // Unbounded Channel preserves strict FIFO delivery of incoming radio bytes, which the
+    // firmware handshake depends on (initial config packet ordering). A SharedFlow with
+    // `launch { emit() }` per packet reorders under concurrent dispatch and breaks config load.
+    // trySend on an UNLIMITED channel never suspends and never drops, so handleFromRadio can
+    // remain a non-suspend synchronous callback.
+    private val _receivedData = Channel<ByteArray>(Channel.UNLIMITED)
+    override val receivedData: Flow<ByteArray> = _receivedData.receiveAsFlow()

    private val _meshActivity =
        MutableSharedFlow<MeshActivity>(extraBufferCapacity = 64, onBufferOverflow = BufferOverflow.DROP_OLDEST)
@ -148,6 +156,7 @@ class SharedRadioInterfaceService(
                            }
                        }
                    }
+                    .catch { Logger.e(it) { "devAddr flow crashed" } }
                    .launchIn(processLifecycle.coroutineScope)

                bluetoothRepository.state
@ -216,7 +225,7 @@ class SharedRadioInterfaceService(

        processLifecycle.coroutineScope.launch {
            transportMutex.withLock {
-                ignoreException { stopTransportLocked() }
+                ignoreExceptionSuspend { stopTransportLocked() }
                startTransportLocked()
            }
        }
@ -245,7 +254,7 @@ class SharedRadioInterfaceService(
    }

    /** Must be called under [transportMutex]. */
-    private fun stopTransportLocked() {
+    private suspend fun stopTransportLocked() {
        val currentTransport = radioTransport
        Logger.i { "Stopping transport $currentTransport" }
        isStarted = false
@ -322,13 +331,28 @@ class SharedRadioInterfaceService(
    override fun handleFromRadio(bytes: ByteArray) {
        try {
            lastDataReceivedMillis = nowMillis
-            processLifecycle.coroutineScope.launch(dispatchers.io) { _receivedData.emit(bytes) }
+            // trySend synchronously onto the unbounded Channel so packet order matches arrival
+            // order. The previous `launch { emit() }` pattern dispatched each packet onto a
+            // fresh coroutine, letting the scheduler reorder them — which broke the firmware
+            // config handshake (see PhoneAPI.cpp initial-handshake sequence).
+            val result = _receivedData.trySend(bytes)
+            if (result.isFailure) {
+                Logger.e(result.exceptionOrNull()) { "Failed to enqueue ${bytes.size} received bytes; dropping packet" }
+            }
            _meshActivity.tryEmit(MeshActivity.Receive)
        } catch (t: Throwable) {
            Logger.e(t) { "handleFromRadio failed while emitting data" }
        }
    }

+    override fun resetReceivedBuffer() {
+        // Drain any bytes buffered while no collector was attached. Without this, a stop/start cycle
+        // would replay stale bytes ahead of the next session's firmware handshake, since the channel
+        // outlives the orchestrator's per-start scope.
+        @Suppress("EmptyWhileBlock", "ControlFlowWithEmptyBody")
+        while (_receivedData.tryReceive().isSuccess) Unit
+    }
+
    override fun onConnect() {
        // MutableStateFlow.value is thread-safe (backed by atomics) — assign directly rather than
        // launching a coroutine. The async launch pattern introduced a window where a concurrent
--- a/core/service/src/commonTest/kotlin/org/meshtastic/core/service/MeshServiceOrchestratorTest.kt
+++ b/core/service/src/commonTest/kotlin/org/meshtastic/core/service/MeshServiceOrchestratorTest.kt
@ -23,13 +23,15 @@ import dev.mokkery.every
 import dev.mokkery.matcher.any
 import dev.mokkery.mock
 import dev.mokkery.verify
+import dev.mokkery.verify.VerifyMode.Companion.atLeast
 import dev.mokkery.verify.VerifyMode.Companion.exactly
 import dev.mokkery.verifySuspend
-import kotlinx.coroutines.CoroutineScope
+import kotlinx.coroutines.ExperimentalCoroutinesApi
 import kotlinx.coroutines.flow.MutableSharedFlow
 import kotlinx.coroutines.flow.MutableStateFlow
 import kotlinx.coroutines.test.UnconfinedTestDispatcher
 import org.meshtastic.core.common.database.DatabaseManager
+import org.meshtastic.core.di.CoroutineDispatchers
 import org.meshtastic.core.model.Node
 import org.meshtastic.core.model.service.ServiceAction
 import org.meshtastic.core.repository.CommandSender
@ -70,8 +72,11 @@ class MeshServiceOrchestratorTest {
    private val databaseManager: DatabaseManager = mock(MockMode.autofill)
    private val connectionManager: MeshConnectionManager = mock(MockMode.autofill)

+    @OptIn(ExperimentalCoroutinesApi::class)
    private val testDispatcher = UnconfinedTestDispatcher()
-    private val testScope = CoroutineScope(testDispatcher)
+
+    @OptIn(ExperimentalCoroutinesApi::class)
+    private val dispatchers = CoroutineDispatchers(io = testDispatcher, main = testDispatcher, default = testDispatcher)

    /** Stubs the shared flow dependencies used by every test and returns an orchestrator. */
    private fun createOrchestrator(
@ -114,7 +119,7 @@ class MeshServiceOrchestratorTest {
            takPrefs = takPrefs,
            databaseManager = databaseManager,
            connectionManager = connectionManager,
-            scope = testScope,
+            dispatchers = dispatchers,
        )
    }

@ -217,4 +222,79 @@ class MeshServiceOrchestratorTest {
        orchestrator.stop()
        assertFalse(orchestrator.isRunning)
    }
+
+    /**
+     * Regression test for a bug where `stop()` did not actually tear down the FromRadio collectors. Collectors were
+     * attached to an injected process-wide ServiceScope rather than a per-start scope, so `start() -> stop() ->
+     * start()` caused duplicate collectors and every FromRadio packet was handled 2x (then 3x, etc.).
+     */
+    @Test
+    fun testFromRadioCollectorsTornDownOnStopAndRestartedCleanlyOnStart() {
+        val receivedData = MutableSharedFlow<ByteArray>(extraBufferCapacity = 8)
+        val orchestrator = createOrchestrator(receivedData = receivedData)
+        every { nodeManager.myNodeNum } returns MutableStateFlow(null)
+
+        orchestrator.start()
+        val packet1 = byteArrayOf(1, 2, 3)
+        receivedData.tryEmit(packet1)
+        verifySuspend(exactly(1)) { messageProcessor.handleFromRadio(packet1, null) }
+
+        orchestrator.stop()
+        val packet2 = byteArrayOf(4, 5, 6)
+        receivedData.tryEmit(packet2)
+        // After stop(), the collector must be gone - the handler should not be invoked for packet2.
+        verifySuspend(exactly(0)) { messageProcessor.handleFromRadio(packet2, null) }
+
+        orchestrator.start()
+        val packet3 = byteArrayOf(7, 8, 9)
+        receivedData.tryEmit(packet3)
+        // After restart, a single fresh collector must process packet3 exactly once (not twice).
+        verifySuspend(exactly(1)) { messageProcessor.handleFromRadio(packet3, null) }
+
+        orchestrator.stop()
+    }
+
+    /**
+     * Regression test for a channel-buffer-replay bug: the production [RadioInterfaceService] buffers inbound bytes in
+     * a process-lifetime `Channel(UNLIMITED)`. Between `stop()` and the next `start()`, any bytes that arrive sit in
+     * the channel and would be replayed to the fresh collector — prepending stale packets to the next session's
+     * firmware handshake. `start()` must call [RadioInterfaceService.resetReceivedBuffer] before attaching the
+     * collector.
+     */
+    @Test
+    fun testStartDrainsReceivedBufferBeforeAttachingCollector() {
+        val orchestrator = createOrchestrator()
+        every { nodeManager.myNodeNum } returns MutableStateFlow(null)
+
+        orchestrator.start()
+        orchestrator.stop()
+        orchestrator.start()
+
+        // resetReceivedBuffer must be invoked at least once per start() (twice total for two starts).
+        verify(atLeast(2)) { radioInterfaceService.resetReceivedBuffer() }
+
+        orchestrator.stop()
+    }
+
+    /** Additional regression: after many start/stop cycles, collectors must not accumulate. */
+    @Test
+    fun testRepeatedStartStopDoesNotAccumulateCollectors() {
+        val receivedData = MutableSharedFlow<ByteArray>(extraBufferCapacity = 8)
+        val orchestrator = createOrchestrator(receivedData = receivedData)
+        every { nodeManager.myNodeNum } returns MutableStateFlow(null)
+
+        repeat(5) {
+            orchestrator.start()
+            orchestrator.stop()
+        }
+
+        orchestrator.start()
+        val packet = byteArrayOf(42)
+        receivedData.tryEmit(packet)
+
+        // Despite six total start() calls, only the most recent collector is live.
+        verifySuspend(exactly(1)) { messageProcessor.handleFromRadio(packet, null) }
+
+        orchestrator.stop()
+    }
 }