chore: review-cleanup fleet (audit + fix + hardening) (#5158)

This commit is contained in:
James Rich 2026-04-16 19:02:59 -05:00 committed by GitHub
parent 872c566ef1
commit 17e69c6d4c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
68 changed files with 784 additions and 459 deletions

View file

@ -19,13 +19,15 @@ package org.meshtastic.core.service
import co.touchlab.kermit.Logger
import co.touchlab.kermit.Severity
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Job
import kotlinx.coroutines.SupervisorJob
import kotlinx.coroutines.cancel
import kotlinx.coroutines.flow.launchIn
import kotlinx.coroutines.flow.onEach
import org.koin.core.annotation.Named
import kotlinx.coroutines.isActive
import org.koin.core.annotation.Single
import org.meshtastic.core.common.database.DatabaseManager
import org.meshtastic.core.common.util.handledLaunch
import org.meshtastic.core.di.CoroutineDispatchers
import org.meshtastic.core.repository.MeshConnectionManager
import org.meshtastic.core.repository.MeshMessageProcessor
import org.meshtastic.core.repository.MeshRouter
@ -59,18 +61,15 @@ class MeshServiceOrchestrator(
private val takPrefs: TakPrefs,
private val databaseManager: DatabaseManager,
private val connectionManager: MeshConnectionManager,
@Named("ServiceScope") private val scope: CoroutineScope,
private val dispatchers: CoroutineDispatchers,
) {
private var serviceJob: Job? = null
private var takJob: Job? = null
/** The coroutine scope for the service. */
val serviceScope: CoroutineScope
get() = scope
// Per-start coroutine scope. A fresh scope is created on each start() and cancelled on stop(), so all collectors
// launched from start() are torn down cleanly and do not accumulate across start/stop/start cycles.
private var scope: CoroutineScope? = null
/** Whether the orchestrator is currently running. */
val isRunning: Boolean
get() = serviceJob?.isActive == true
get() = scope?.isActive == true
/**
* Starts the mesh service components and wires up data flows.
@ -85,27 +84,31 @@ class MeshServiceOrchestrator(
}
Logger.i { "Starting mesh service orchestrator" }
val job = Job()
serviceJob = job
val newScope = CoroutineScope(SupervisorJob() + dispatchers.default)
scope = newScope
// Drop any bytes that piled up in the service's receivedData channel since the last stop(). The channel
// outlives the orchestrator's per-start scope, so without this drain a stop/start cycle would replay stale
// packets ahead of the fresh session's firmware handshake.
radioInterfaceService.resetReceivedBuffer()
serviceNotifications.initChannels()
connectionManager.updateStatusNotification()
// Observe TAK server pref to start/stop
takJob =
takPrefs.isTakServerEnabled
.onEach { isEnabled ->
if (isEnabled && !takServerManager.isRunning.value) {
Logger.i { "TAK Server enabled by preference, starting integration" }
takMeshIntegration.start(scope)
} else if (!isEnabled && takServerManager.isRunning.value) {
Logger.i { "TAK Server disabled by preference, stopping integration" }
takMeshIntegration.stop()
}
takPrefs.isTakServerEnabled
.onEach { isEnabled ->
if (isEnabled && !takServerManager.isRunning.value) {
Logger.i { "TAK Server enabled by preference, starting integration" }
takMeshIntegration.start(newScope)
} else if (!isEnabled && takServerManager.isRunning.value) {
Logger.i { "TAK Server disabled by preference, stopping integration" }
takMeshIntegration.stop()
}
.launchIn(scope)
}
.launchIn(newScope)
scope.handledLaunch {
newScope.handledLaunch {
// Ensure the per-device database is active before the radio connects.
// On Android this is handled by MeshUtilApplication.init(); on Desktop (and any
// future KMP host) the orchestrator is the first entry point, so it must initialize
@ -119,18 +122,18 @@ class MeshServiceOrchestrator(
radioInterfaceService.receivedData
.onEach { bytes -> messageProcessor.handleFromRadio(bytes, nodeManager.myNodeNum.value) }
.launchIn(scope)
.launchIn(newScope)
radioInterfaceService.connectionError
.onEach { errorMessage -> serviceRepository.setErrorMessage(errorMessage, Severity.Warn) }
.launchIn(scope)
.launchIn(newScope)
// Each action is dispatched in its own supervised coroutine so that a failure in one
// action (e.g. a timeout in sendAdminAwait) cannot terminate the collector and silently
// drop all subsequent service actions for the rest of the session.
serviceRepository.serviceAction
.onEach { action -> scope.handledLaunch { router.actionHandler.onServiceAction(action) } }
.launchIn(scope)
.onEach { action -> newScope.handledLaunch { router.actionHandler.onServiceAction(action) } }
.launchIn(newScope)
nodeManager.loadCachedNodeDB()
}
@ -142,13 +145,11 @@ class MeshServiceOrchestrator(
*/
fun stop() {
Logger.i { "Stopping mesh service orchestrator" }
takJob?.cancel()
takJob = null
// Guard stop() so we don't emit a spurious "stopped" log when TAK was never started
if (takServerManager.isRunning.value) {
takMeshIntegration.stop()
}
serviceJob?.cancel()
serviceJob = null
scope?.cancel()
scope = null
}
}

View file

@ -25,7 +25,9 @@ import kotlinx.coroutines.Job
import kotlinx.coroutines.SupervisorJob
import kotlinx.coroutines.cancel
import kotlinx.coroutines.channels.BufferOverflow
import kotlinx.coroutines.channels.Channel
import kotlinx.coroutines.delay
import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.MutableSharedFlow
import kotlinx.coroutines.flow.MutableStateFlow
import kotlinx.coroutines.flow.SharedFlow
@ -35,6 +37,7 @@ import kotlinx.coroutines.flow.asStateFlow
import kotlinx.coroutines.flow.catch
import kotlinx.coroutines.flow.launchIn
import kotlinx.coroutines.flow.onEach
import kotlinx.coroutines.flow.receiveAsFlow
import kotlinx.coroutines.launch
import kotlinx.coroutines.sync.Mutex
import kotlinx.coroutines.sync.withLock
@ -42,7 +45,7 @@ import org.koin.core.annotation.Named
import org.koin.core.annotation.Single
import org.meshtastic.core.ble.BluetoothRepository
import org.meshtastic.core.common.util.handledLaunch
import org.meshtastic.core.common.util.ignoreException
import org.meshtastic.core.common.util.ignoreExceptionSuspend
import org.meshtastic.core.common.util.nowMillis
import org.meshtastic.core.di.CoroutineDispatchers
import org.meshtastic.core.model.ConnectionState
@ -95,8 +98,13 @@ class SharedRadioInterfaceService(
private val _currentDeviceAddressFlow = MutableStateFlow<String?>(radioPrefs.devAddr.value)
override val currentDeviceAddressFlow: StateFlow<String?> = _currentDeviceAddressFlow.asStateFlow()
private val _receivedData = MutableSharedFlow<ByteArray>(extraBufferCapacity = 64)
override val receivedData: SharedFlow<ByteArray> = _receivedData
// Unbounded Channel preserves strict FIFO delivery of incoming radio bytes, which the
// firmware handshake depends on (initial config packet ordering). A SharedFlow with
// `launch { emit() }` per packet reorders under concurrent dispatch and breaks config load.
// trySend on an UNLIMITED channel never suspends and never drops, so handleFromRadio can
// remain a non-suspend synchronous callback.
private val _receivedData = Channel<ByteArray>(Channel.UNLIMITED)
override val receivedData: Flow<ByteArray> = _receivedData.receiveAsFlow()
private val _meshActivity =
MutableSharedFlow<MeshActivity>(extraBufferCapacity = 64, onBufferOverflow = BufferOverflow.DROP_OLDEST)
@ -148,6 +156,7 @@ class SharedRadioInterfaceService(
}
}
}
.catch { Logger.e(it) { "devAddr flow crashed" } }
.launchIn(processLifecycle.coroutineScope)
bluetoothRepository.state
@ -216,7 +225,7 @@ class SharedRadioInterfaceService(
processLifecycle.coroutineScope.launch {
transportMutex.withLock {
ignoreException { stopTransportLocked() }
ignoreExceptionSuspend { stopTransportLocked() }
startTransportLocked()
}
}
@ -245,7 +254,7 @@ class SharedRadioInterfaceService(
}
/** Must be called under [transportMutex]. */
private fun stopTransportLocked() {
private suspend fun stopTransportLocked() {
val currentTransport = radioTransport
Logger.i { "Stopping transport $currentTransport" }
isStarted = false
@ -322,13 +331,28 @@ class SharedRadioInterfaceService(
override fun handleFromRadio(bytes: ByteArray) {
try {
lastDataReceivedMillis = nowMillis
processLifecycle.coroutineScope.launch(dispatchers.io) { _receivedData.emit(bytes) }
// trySend synchronously onto the unbounded Channel so packet order matches arrival
// order. The previous `launch { emit() }` pattern dispatched each packet onto a
// fresh coroutine, letting the scheduler reorder them — which broke the firmware
// config handshake (see PhoneAPI.cpp initial-handshake sequence).
val result = _receivedData.trySend(bytes)
if (result.isFailure) {
Logger.e(result.exceptionOrNull()) { "Failed to enqueue ${bytes.size} received bytes; dropping packet" }
}
_meshActivity.tryEmit(MeshActivity.Receive)
} catch (t: Throwable) {
Logger.e(t) { "handleFromRadio failed while emitting data" }
}
}
override fun resetReceivedBuffer() {
// Drain any bytes buffered while no collector was attached. Without this, a stop/start cycle
// would replay stale bytes ahead of the next session's firmware handshake, since the channel
// outlives the orchestrator's per-start scope.
@Suppress("EmptyWhileBlock", "ControlFlowWithEmptyBody")
while (_receivedData.tryReceive().isSuccess) Unit
}
override fun onConnect() {
// MutableStateFlow.value is thread-safe (backed by atomics) — assign directly rather than
// launching a coroutine. The async launch pattern introduced a window where a concurrent

View file

@ -23,13 +23,15 @@ import dev.mokkery.every
import dev.mokkery.matcher.any
import dev.mokkery.mock
import dev.mokkery.verify
import dev.mokkery.verify.VerifyMode.Companion.atLeast
import dev.mokkery.verify.VerifyMode.Companion.exactly
import dev.mokkery.verifySuspend
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.ExperimentalCoroutinesApi
import kotlinx.coroutines.flow.MutableSharedFlow
import kotlinx.coroutines.flow.MutableStateFlow
import kotlinx.coroutines.test.UnconfinedTestDispatcher
import org.meshtastic.core.common.database.DatabaseManager
import org.meshtastic.core.di.CoroutineDispatchers
import org.meshtastic.core.model.Node
import org.meshtastic.core.model.service.ServiceAction
import org.meshtastic.core.repository.CommandSender
@ -70,8 +72,11 @@ class MeshServiceOrchestratorTest {
private val databaseManager: DatabaseManager = mock(MockMode.autofill)
private val connectionManager: MeshConnectionManager = mock(MockMode.autofill)
@OptIn(ExperimentalCoroutinesApi::class)
private val testDispatcher = UnconfinedTestDispatcher()
private val testScope = CoroutineScope(testDispatcher)
@OptIn(ExperimentalCoroutinesApi::class)
private val dispatchers = CoroutineDispatchers(io = testDispatcher, main = testDispatcher, default = testDispatcher)
/** Stubs the shared flow dependencies used by every test and returns an orchestrator. */
private fun createOrchestrator(
@ -114,7 +119,7 @@ class MeshServiceOrchestratorTest {
takPrefs = takPrefs,
databaseManager = databaseManager,
connectionManager = connectionManager,
scope = testScope,
dispatchers = dispatchers,
)
}
@ -217,4 +222,79 @@ class MeshServiceOrchestratorTest {
orchestrator.stop()
assertFalse(orchestrator.isRunning)
}
/**
* Regression test for a bug where `stop()` did not actually tear down the FromRadio collectors. Collectors were
* attached to an injected process-wide ServiceScope rather than a per-start scope, so `start() -> stop() ->
* start()` caused duplicate collectors and every FromRadio packet was handled 2x (then 3x, etc.).
*/
@Test
fun testFromRadioCollectorsTornDownOnStopAndRestartedCleanlyOnStart() {
val receivedData = MutableSharedFlow<ByteArray>(extraBufferCapacity = 8)
val orchestrator = createOrchestrator(receivedData = receivedData)
every { nodeManager.myNodeNum } returns MutableStateFlow(null)
orchestrator.start()
val packet1 = byteArrayOf(1, 2, 3)
receivedData.tryEmit(packet1)
verifySuspend(exactly(1)) { messageProcessor.handleFromRadio(packet1, null) }
orchestrator.stop()
val packet2 = byteArrayOf(4, 5, 6)
receivedData.tryEmit(packet2)
// After stop(), the collector must be gone - the handler should not be invoked for packet2.
verifySuspend(exactly(0)) { messageProcessor.handleFromRadio(packet2, null) }
orchestrator.start()
val packet3 = byteArrayOf(7, 8, 9)
receivedData.tryEmit(packet3)
// After restart, a single fresh collector must process packet3 exactly once (not twice).
verifySuspend(exactly(1)) { messageProcessor.handleFromRadio(packet3, null) }
orchestrator.stop()
}
/**
* Regression test for a channel-buffer-replay bug: the production [RadioInterfaceService] buffers inbound bytes in
* a process-lifetime `Channel(UNLIMITED)`. Between `stop()` and the next `start()`, any bytes that arrive sit in
* the channel and would be replayed to the fresh collector prepending stale packets to the next session's
* firmware handshake. `start()` must call [RadioInterfaceService.resetReceivedBuffer] before attaching the
* collector.
*/
@Test
fun testStartDrainsReceivedBufferBeforeAttachingCollector() {
val orchestrator = createOrchestrator()
every { nodeManager.myNodeNum } returns MutableStateFlow(null)
orchestrator.start()
orchestrator.stop()
orchestrator.start()
// resetReceivedBuffer must be invoked at least once per start() (twice total for two starts).
verify(atLeast(2)) { radioInterfaceService.resetReceivedBuffer() }
orchestrator.stop()
}
/** Additional regression: after many start/stop cycles, collectors must not accumulate. */
@Test
fun testRepeatedStartStopDoesNotAccumulateCollectors() {
val receivedData = MutableSharedFlow<ByteArray>(extraBufferCapacity = 8)
val orchestrator = createOrchestrator(receivedData = receivedData)
every { nodeManager.myNodeNum } returns MutableStateFlow(null)
repeat(5) {
orchestrator.start()
orchestrator.stop()
}
orchestrator.start()
val packet = byteArrayOf(42)
receivedData.tryEmit(packet)
// Despite six total start() calls, only the most recent collector is live.
verifySuspend(exactly(1)) { messageProcessor.handleFromRadio(packet, null) }
orchestrator.stop()
}
}