fix: resolve bugs across connection, PKI, admin, packet flow, and stability subsystems (#5011)

This commit is contained in:
James Rich 2026-04-09 08:20:06 -05:00 committed by GitHub
parent cd9f1c0600
commit 60cc2f4237
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
24 changed files with 413 additions and 45 deletions

View file

@ -98,11 +98,12 @@ class CommandSenderImpl(
/**
* Resolves the correct channel index for sending a packet to [toNum].
*
* When both the local node and the destination support PKC, returns [DataPacket.PKC_CHANNEL_INDEX] so that
* [buildMeshPacket] enables PKI encryption. Otherwise falls back to the node's heard-on channel (for general
* packets) or the dedicated admin channel (for admin packets).
* PKI encryption ([DataPacket.PKC_CHANNEL_INDEX]) is only used for **admin** packets, where end-to-end encryption
* is appropriate. Protocol-level requests (traceroute, telemetry, position, nodeinfo, neighborinfo) must NOT use
* PKI because relay nodes need to read and/or modify the inner payload (e.g. traceroute appends each hop's node
* number). These requests fall back to the node's heard-on channel.
*/
private fun getChannelIndex(toNum: Int, isAdmin: Boolean = false): Int {
private fun getAdminChannelIndex(toNum: Int): Int {
val myNum = nodeManager.myNodeNum.value ?: return 0
val myNode = nodeManager.nodeDBbyNodeNum[myNum]
val destNode = nodeManager.nodeDBbyNodeNum[toNum]
@ -110,15 +111,18 @@ class CommandSenderImpl(
return when {
myNum == toNum -> 0
myNode?.hasPKC == true && destNode?.hasPKC == true -> DataPacket.PKC_CHANNEL_INDEX
isAdmin ->
else ->
channelSet.value.settings
.indexOfFirst { it.name.equals(ADMIN_CHANNEL_NAME, ignoreCase = true) }
.coerceAtLeast(0)
else -> destNode?.channel ?: 0
}
}
private fun getAdminChannelIndex(toNum: Int): Int = getChannelIndex(toNum, isAdmin = true)
/**
* Returns the heard-on channel for a non-admin request to [toNum]. Does NOT use PKI protocol-level requests need
* clear inner payloads.
*/
private fun getChannelIndex(toNum: Int): Int = nodeManager.nodeDBbyNodeNum[toNum]?.channel ?: 0
override fun sendData(p: DataPacket) {
if (p.id == 0) p.id = generatePacketId()

View file

@ -89,6 +89,12 @@ class FromRadioPacketHandlerImpl(
fileInfo != null -> router.value.configFlowManager.handleFileInfo(fileInfo)
xmodemPacket != null -> router.value.xmodemManager.handleIncomingXModem(xmodemPacket)
clientNotification != null -> handleClientNotification(clientNotification)
// Firmware rebooted without a transport-level disconnect (common on serial/TCP).
// Re-handshake immediately rather than waiting for the 30s stall guard.
proto.rebooted != null -> {
Logger.w { "Firmware rebooted (rebooted=${proto.rebooted}), re-initiating handshake" }
router.value.configFlowManager.triggerWantConfig()
}
}
}

View file

@ -248,6 +248,11 @@ class MeshActionHandlerImpl(
override fun handleSetRemoteConfig(id: Int, destNum: Int, payload: ByteArray) {
val c = Config.ADAPTER.decode(payload)
commandSender.sendAdmin(destNum, id) { AdminMessage(set_config = c) }
// When targeting the local node, optimistically persist the config so the
// UI reflects changes immediately (matching handleSetConfig behaviour).
if (destNum == nodeManager.myNodeNum.value) {
scope.handledLaunch { radioConfigRepository.setLocalConfig(c) }
}
}
override fun handleGetRemoteConfig(id: Int, destNum: Int, config: Int) {
@ -310,6 +315,11 @@ class MeshActionHandlerImpl(
if (payload != null) {
val c = Channel.ADAPTER.decode(payload)
commandSender.sendAdmin(destNum, id) { AdminMessage(set_channel = c) }
// When targeting the local node, optimistically persist the channel so
// the UI reflects changes immediately (matching handleSetChannel behaviour).
if (destNum == nodeManager.myNodeNum.value) {
scope.handledLaunch { radioConfigRepository.updateChannelSettings(c) }
}
}
}

View file

@ -17,6 +17,7 @@
package org.meshtastic.core.data.manager
import co.touchlab.kermit.Logger
import kotlinx.atomicfu.atomic
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.delay
import okio.IOException
@ -59,6 +60,9 @@ class MeshConfigFlowManagerImpl(
private lateinit var scope: CoroutineScope
private val wantConfigDelay = 100L
/** Monotonically increasing generation so async clears from a stale handshake are discarded. */
private val handshakeGeneration = atomic(0L)
override fun start(scope: CoroutineScope) {
this.scope = scope
}
@ -203,12 +207,18 @@ class MeshConfigFlowManagerImpl(
handshakeState = HandshakeState.ReceivingConfig(rawMyNodeInfo = myInfo)
nodeManager.setMyNodeNum(myInfo.my_node_num)
// Bump the generation so that a pending clear from a prior (interrupted) handshake
// will see a stale snapshot and skip its writes, preventing it from wiping config
// that was saved by this (newer) handshake's incoming packets.
val gen = handshakeGeneration.incrementAndGet()
// Clear persisted radio config so the new handshake starts from a clean slate.
// DataStore serializes its own writes, so the clear will precede subsequent
// setLocalConfig / updateChannelSettings calls dispatched by later packets in this
// session (handleFromRadio processes packets sequentially, so later dispatches always
// occur after this one returns).
scope.handledLaunch {
if (handshakeGeneration.value != gen) return@handledLaunch // Stale handshake; skip.
radioConfigRepository.clearChannelSet()
radioConfigRepository.clearLocalConfig()
radioConfigRepository.clearLocalModuleConfig()

View file

@ -205,6 +205,7 @@ class MeshConnectionManagerImpl(
private fun tearDownConnection() {
packetHandler.stopPacketQueue()
commandSender.setSessionPasskey(okio.ByteString.EMPTY) // Prevent stale passkey on reconnect.
locationManager.stop()
mqttManager.stop()
}
@ -227,8 +228,11 @@ class MeshConnectionManagerImpl(
scope.handledLaunch {
try {
val localConfig = radioConfigRepository.localConfigFlow.first()
val timeout = (localConfig.power?.ls_secs ?: 0) + DEVICE_SLEEP_TIMEOUT_SECONDS
Logger.d { "Waiting for sleeping device, timeout=$timeout secs" }
val rawTimeout = (localConfig.power?.ls_secs ?: 0) + DEVICE_SLEEP_TIMEOUT_SECONDS
// Cap the timeout so routers or power-saving configs (ls_secs=3600) don't
// leave the UI stuck in DeviceSleep for over an hour.
val timeout = rawTimeout.coerceAtMost(MAX_SLEEP_TIMEOUT_SECONDS)
Logger.d { "Waiting for sleeping device, timeout=$timeout secs (raw=$rawTimeout)" }
delay(timeout.seconds)
Logger.w { "Device timed out, setting disconnected" }
onConnectionChanged(ConnectionState.Disconnected)
@ -354,6 +358,12 @@ class MeshConnectionManagerImpl(
companion object {
private const val DEVICE_SLEEP_TIMEOUT_SECONDS = 30
// Maximum time (in seconds) to wait for a sleeping device before declaring it
// disconnected, regardless of the device's ls_secs configuration. Without this
// cap, routers (ls_secs=3600) leave the UI in DeviceSleep for over an hour.
private const val MAX_SLEEP_TIMEOUT_SECONDS = 300
private val HANDSHAKE_TIMEOUT = 30.seconds
// Shorter window for the retry attempt: if the device genuinely didn't receive the

View file

@ -41,6 +41,7 @@ import org.meshtastic.proto.FromRadio
import org.meshtastic.proto.LogRecord
import org.meshtastic.proto.MeshPacket
import org.meshtastic.proto.PortNum
import kotlin.concurrent.Volatile
import kotlin.uuid.Uuid
/** Implementation of [MeshMessageProcessor] that handles raw radio messages and prepares mesh packets for routing. */
@ -59,6 +60,13 @@ class MeshMessageProcessorImpl(
private val logUuidByPacketId = mutableMapOf<Int, String>()
private val logInsertJobByPacketId = mutableMapOf<Int, Job>()
/**
* Epoch-millisecond timestamp of the last local-node `lastHeard` DB write. Used to throttle updates to at most once
* per [LOCAL_NODE_REFRESH_INTERVAL_MS] so that high-frequency FromRadio variants (log records, queue status) don't
* flood the DB.
*/
@Volatile private var lastLocalNodeRefreshMs = 0L
private val earlyMutex = Mutex()
private val earlyReceivedPackets = kotlin.collections.ArrayDeque<MeshPacket>()
private val maxEarlyPacketBuffer = 10240
@ -95,6 +103,9 @@ class MeshMessageProcessorImpl(
}
private fun processFromRadio(proto: FromRadio, myNodeNum: Int?) {
// Any decoded FromRadio proves the radio link is alive — keep the local node fresh.
refreshLocalNodeLastHeard()
// Audit log every incoming variant
logVariant(proto)
@ -253,5 +264,33 @@ class MeshMessageProcessorImpl(
}
}
/**
* Refreshes the local node's [Node.lastHeard] to prove the radio link is alive.
*
* Without this, [lastHeard] is only set when a [MeshPacket] arrives from another node (see
* [processReceivedMeshPacket]). On a quiet mesh the heartbeat cycle still exchanges data with the firmware (ToRadio
* heartbeat FromRadio queueStatus every 30 s), but that data never touched [lastHeard], causing the local node to
* appear stale in the UI even though the connection is healthy.
*
* To avoid flooding the DB on high-frequency variants (log records arrive many times per second when debug logging
* is enabled), writes are throttled to at most once per [LOCAL_NODE_REFRESH_INTERVAL_MS].
*/
private fun refreshLocalNodeLastHeard() {
val now = nowMillis
if (now - lastLocalNodeRefreshMs < LOCAL_NODE_REFRESH_INTERVAL_MS) return
lastLocalNodeRefreshMs = now
val myNum = nodeManager.myNodeNum.value ?: return
nodeManager.updateNode(myNum, withBroadcast = false) { node: Node -> node.copy(lastHeard = nowSeconds.toInt()) }
}
private fun insertMeshLog(log: MeshLog): Job = scope.handledLaunch { meshLogRepository.value.insert(log) }
companion object {
/**
* Minimum interval between local-node `lastHeard` DB writes, in milliseconds. Aligned with the heartbeat
* interval (30 s) so that one write per heartbeat cycle keeps the node fresh without unnecessary DB churn.
*/
private const val LOCAL_NODE_REFRESH_INTERVAL_MS = 30_000L
}
}

View file

@ -110,6 +110,7 @@ class PacketHandlerImpl(
override fun sendToRadio(packet: MeshPacket) {
scope.launch {
queueMutex.withLock {
queueStopped = false // Allow queue to resume after a disconnect/reconnect cycle.
queuedPackets.add(packet)
startPacketQueueLocked()
}
@ -123,6 +124,7 @@ class PacketHandlerImpl(
val deferred = CompletableDeferred<Boolean>()
responseMutex.withLock { queueResponse[packet.id] = deferred }
queueMutex.withLock {
queueStopped = false // Allow queue to resume after a disconnect/reconnect cycle.
queuedPackets.add(packet)
startPacketQueueLocked()
}
@ -199,15 +201,18 @@ class PacketHandlerImpl(
Logger.d { "queueJob packet id=${packet.id.toUInt()} success $success" }
} catch (e: TimeoutCancellationException) {
Logger.d { "queueJob packet id=${packet.id.toUInt()} timeout" }
// Clean up the deferred for this packet. sendToRadioAndAwait callers
// also clean up in their own finally block (idempotent remove).
responseMutex.withLock { queueResponse.remove(packet.id) }
} catch (e: CancellationException) {
throw e // Preserve structured concurrency cancellation propagation.
} catch (e: Exception) {
Logger.d { "queueJob packet id=${packet.id.toUInt()} failed" }
responseMutex.withLock { queueResponse.remove(packet.id) }
}
// Do NOT remove from queueResponse here. Removal is owned by:
// - handleQueueStatus (normal completion path)
// - sendToRadioAndAwait's finally block (for await-style callers)
// - stopPacketQueue (bulk cleanup on disconnect)
// Deferred cleanup is now handled in the catch blocks above.
// handleQueueStatus (normal success) and stopPacketQueue (bulk cleanup)
// also remove entries, and these removals are idempotent.
}
} finally {
// Hold queueMutex so that clearing queueJob and the restart decision are

View file

@ -21,6 +21,7 @@ import kotlinx.coroutines.channels.BufferOverflow
import kotlinx.coroutines.flow.MutableSharedFlow
import kotlinx.coroutines.flow.asSharedFlow
import org.koin.core.annotation.Single
import org.meshtastic.core.common.util.nowMillis
import org.meshtastic.core.repository.PacketHandler
import org.meshtastic.core.repository.XModemFile
import org.meshtastic.core.repository.XModemManager
@ -59,6 +60,8 @@ class XModemManagerImpl(private val packetHandler: PacketHandler) : XModemManage
@Volatile private var transferName = ""
@Volatile private var expectedSeq = INITIAL_SEQ
@Volatile private var lastActivityMillis = 0L
private val blocks = mutableListOf<ByteArray>()
override fun setTransferName(name: String) {
@ -66,6 +69,17 @@ class XModemManagerImpl(private val packetHandler: PacketHandler) : XModemManage
}
override fun handleIncomingXModem(packet: XModem) {
// If blocks have accumulated but no activity for INACTIVITY_TIMEOUT_MS,
// the previous transfer is stale (firmware crash, BLE disconnect, etc.).
if (blocks.isNotEmpty() && lastActivityMillis > 0L) {
val elapsed = nowMillis - lastActivityMillis
if (elapsed > INACTIVITY_TIMEOUT_MS) {
Logger.w { "XModem: inactivity timeout (${elapsed}ms) — resetting stale transfer" }
reset()
}
}
lastActivityMillis = nowMillis
when (packet.control) {
XModem.Control.SOH,
XModem.Control.STX,
@ -135,6 +149,7 @@ class XModemManagerImpl(private val packetHandler: PacketHandler) : XModemManage
expectedSeq = INITIAL_SEQ
blocks.clear()
transferName = ""
lastActivityMillis = 0L
}
// CRC-CCITT: polynomial 0x1021, initial value 0x0000 (XModem variant)
@ -157,5 +172,6 @@ class XModemManagerImpl(private val packetHandler: PacketHandler) : XModemManage
private const val CTRLZ = 0x1A.toByte()
private const val CRC_POLY = 0x1021
private const val BITS_PER_BYTE = 8
private const val INACTIVITY_TIMEOUT_MS = 30_000L
}
}

View file

@ -28,6 +28,7 @@ import kotlinx.coroutines.ExperimentalCoroutinesApi
import kotlinx.coroutines.flow.MutableStateFlow
import kotlinx.coroutines.flow.flowOf
import kotlinx.coroutines.test.UnconfinedTestDispatcher
import kotlinx.coroutines.test.advanceTimeBy
import kotlinx.coroutines.test.advanceUntilIdle
import kotlinx.coroutines.test.runTest
import org.meshtastic.core.model.ConnectionState
@ -267,4 +268,46 @@ class MeshConnectionManagerImplTest {
verify { mqttManager.start(any(), true, true) }
verify { historyManager.requestHistoryReplay(any(), any(), any(), any()) }
}
@Test
fun `DeviceSleep timeout is capped at MAX_SLEEP_TIMEOUT_SECONDS for high ls_secs`() = runTest(testDispatcher) {
// Router with ls_secs=3600 — previously this created a 3630s timeout.
// With the cap, it should be clamped to 300s.
val config =
LocalConfig(
power = Config.PowerConfig(is_power_saving = true, ls_secs = 3600),
device = Config.DeviceConfig(role = Config.DeviceConfig.Role.ROUTER),
)
every { radioConfigRepository.localConfigFlow } returns flowOf(config)
every { packetHandler.sendToRadio(any<org.meshtastic.proto.ToRadio>()) } returns Unit
every { serviceNotifications.updateServiceStateNotification(any(), any()) } returns Unit
every { packetHandler.stopPacketQueue() } returns Unit
every { locationManager.stop() } returns Unit
every { mqttManager.stop() } returns Unit
every { nodeManager.nodeDBbyNodeNum } returns emptyMap()
manager.start(backgroundScope)
advanceUntilIdle()
// Transition to Connected then DeviceSleep
radioConnectionState.value = ConnectionState.Connected
advanceUntilIdle()
radioConnectionState.value = ConnectionState.DeviceSleep
advanceUntilIdle()
assertEquals(
ConnectionState.DeviceSleep,
serviceRepository.connectionState.value,
"Should be in DeviceSleep initially",
)
// Advance 300 seconds (the cap) + 1 second to trigger the timeout.
advanceTimeBy(301_000L)
assertEquals(
ConnectionState.Disconnected,
serviceRepository.connectionState.value,
"Should transition to Disconnected after capped timeout (300s), not the raw 3630s",
)
}
}