fix: resolve bugs across connection, PKI, admin, packet flow, and stability subsystems (#5011)

This commit is contained in:
James Rich 2026-04-09 08:20:06 -05:00 committed by GitHub
parent cd9f1c0600
commit 60cc2f4237
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
24 changed files with 413 additions and 45 deletions

View file

@ -22,6 +22,8 @@ import org.meshtastic.core.network.repository.SerialConnection
import org.meshtastic.core.network.repository.SerialConnectionListener
import org.meshtastic.core.network.repository.UsbRepository
import org.meshtastic.core.repository.RadioInterfaceService
import org.meshtastic.proto.Heartbeat
import org.meshtastic.proto.ToRadio
import java.util.concurrent.atomic.AtomicReference
/** An interface that assumes we are talking to a meshtastic device via USB serial */
@ -119,7 +121,14 @@ class SerialInterface(
}
/**
 * Keeps the serial link alive by sending a `ToRadio` heartbeat to the device.
 *
 * The heartbeat makes the firmware reset its idle timer and answer with a `FromRadio` queueStatus,
 * which proves the serial link is still alive. Without it the serial transport cannot detect a
 * silently dead device (battery depleted, firmware crash without the `rebooted` flag). The
 * queueStatus response also feeds MeshMessageProcessorImpl.refreshLocalNodeLastHeard(), keeping
 * the local node's lastHeard timestamp current.
 */
override fun keepAlive() {
    // One log line per keep-alive is enough; the bare "Serial keepAlive" message duplicated this one.
    Logger.d { "[$address] Serial keepAlive — sending heartbeat" }
    handleSendToRadio(ToRadio(heartbeat = Heartbeat()).encode())
}
override fun sendBytes(p: ByteArray) {

View file

@ -65,6 +65,18 @@ private const val CONNECTION_TIMEOUT_MS = 15_000L
// Consecutive failures after which the reconnect loop reports a non-permanent disconnect (DeviceSleep).
private const val RECONNECT_FAILURE_THRESHOLD = 3
// NOTE(review): presumably the starting reconnect backoff (tests expect a 5 s first delay) — confirm against computeReconnectBackoffMs.
private const val RECONNECT_BASE_DELAY_MS = 5_000L
// NOTE(review): presumably the reconnect backoff ceiling (tests expect delays to cap at 60 s) — confirm against computeReconnectBackoffMs.
private const val RECONNECT_MAX_DELAY_MS = 60_000L
// Consecutive failures after which the reconnect loop gives up and reports a permanent disconnect.
private const val RECONNECT_MAX_FAILURES = 10
/**
* Minimum milliseconds a BLE connection must stay up before we consider it "stable" and reset
* [BleRadioInterface.consecutiveFailures]. Without this, a device at the edge of BLE range can repeatedly connect for a
* fraction of a second and drop; each brief connection resets the failure counter, so [RECONNECT_FAILURE_THRESHOLD] is
* never reached and the app never signals [ConnectionState.DeviceSleep].
*
* The value (5 s) is long enough that only connections that survive past the initial GATT setup are treated as genuine,
* but short enough that normal reconnects after light-sleep still reset the counter promptly.
*/
private const val MIN_STABLE_CONNECTION_MS = 5_000L
/**
* Returns the reconnect backoff delay in milliseconds for a given consecutive failure count.
@ -181,7 +193,7 @@ class BleRadioInterface(
throw RadioNotConnectedException("Device not found at address $address")
}
@Suppress("LongMethod")
@Suppress("LongMethod", "CyclomaticComplexMethod")
private fun connect() {
connectionJob =
connectionScope.launch {
@ -231,8 +243,9 @@ class BleRadioInterface(
throw RadioNotConnectedException("Failed to connect to device at address $address")
}
// Connection succeeded — reset failure counter
consecutiveFailures = 0
// Connection succeeded — only reset the failure counter if the
// connection stays up long enough. See MIN_STABLE_CONNECTION_MS.
val gattConnectedAt = nowMillis
isFullyConnected = true
onConnected()
@ -257,6 +270,39 @@ class BleRadioInterface(
}
Logger.i { "[$address] BLE connection dropped, preparing to reconnect" }
// Only reset the failure counter if the connection was stable (lasted
// longer than MIN_STABLE_CONNECTION_MS). A connection that drops within
// seconds typically means the device is at the edge of BLE range or
// powered off — the Android BLE stack may briefly "connect" to a cached
// GATT profile before realising the device is gone. Without this guard,
// the failure counter resets on every brief connect, preventing us from
// ever reaching RECONNECT_FAILURE_THRESHOLD and signalling DeviceSleep.
val connectionUptime = nowMillis - gattConnectedAt
if (connectionUptime >= MIN_STABLE_CONNECTION_MS) {
consecutiveFailures = 0
} else {
consecutiveFailures++
Logger.w {
"[$address] Connection lasted only ${connectionUptime}ms " +
"(< ${MIN_STABLE_CONNECTION_MS}ms) — treating as failure " +
"(consecutive failures: $consecutiveFailures)"
}
if (consecutiveFailures >= RECONNECT_MAX_FAILURES) {
Logger.e { "[$address] Giving up after $consecutiveFailures unstable connections" }
service.onDisconnect(
isPermanent = true,
errorMessage = "Device unreachable (unstable connection)",
)
return@launch
}
if (consecutiveFailures >= RECONNECT_FAILURE_THRESHOLD) {
service.onDisconnect(
isPermanent = false,
errorMessage = "Device unreachable (unstable connection)",
)
}
}
} catch (e: kotlinx.coroutines.CancellationException) {
Logger.d { "[$address] BLE connection coroutine cancelled" }
throw e
@ -268,10 +314,19 @@ class BleRadioInterface(
"(consecutive failures: $consecutiveFailures)"
}
// At the failure threshold, signal DeviceSleep so MeshConnectionManagerImpl can
// start its sleep timeout. Use == (not >=) to fire exactly once; repeated
// onDisconnect signals would reset upstream state machines unnecessarily.
if (consecutiveFailures == RECONNECT_FAILURE_THRESHOLD) {
// After exceeding the max failure limit, give up permanently to stop
// draining battery on a device that is genuinely offline. The user
// must manually reconnect from the connections screen.
if (consecutiveFailures >= RECONNECT_MAX_FAILURES) {
Logger.e { "[$address] Giving up after $consecutiveFailures consecutive failures" }
val (_, msg) = e.toDisconnectReason()
service.onDisconnect(isPermanent = true, errorMessage = msg)
return@launch
}
// At the failure threshold, signal DeviceSleep so
// MeshConnectionManagerImpl can start its sleep timeout.
if (consecutiveFailures >= RECONNECT_FAILURE_THRESHOLD) {
handleFailure(e)
}
@ -312,10 +367,11 @@ class BleRadioInterface(
"Packets RX: $packetsReceived ($bytesReceived bytes), " +
"Packets TX: $packetsSent ($bytesSent bytes)"
}
// Do NOT call service.onDisconnect() here. The reconnect while-loop handles retries
// internally. Emitting DeviceSleep on every transient disconnect creates competing state
// transitions with MeshConnectionManagerImpl's sleep timeout. Instead, handleFailure()
// is called from the catch block after RECONNECT_FAILURE_THRESHOLD consecutive failures.
// Signal DeviceSleep immediately so the UI reflects the disconnect while the
// reconnect loop continues in the background. The previous approach suppressed
// this signal until RECONNECT_FAILURE_THRESHOLD consecutive failures, leaving the
// UI stuck on "Connected" for 35+ seconds after the device disappeared.
service.onDisconnect(isPermanent = false)
}
private suspend fun discoverServicesAndSetupCharacteristics() {

View file

@ -17,6 +17,8 @@
package org.meshtastic.core.network.radio
import dev.mokkery.MockMode
import dev.mokkery.answering.returns
import dev.mokkery.every
import dev.mokkery.matcher.any
import dev.mokkery.mock
import dev.mokkery.verify
@ -124,4 +126,52 @@ class BleRadioInterfaceTest {
// Cancel the reconnect loop so runTest can complete.
bleInterface.close()
}
/**
 * Verifies that the reconnect loop gives up once [RECONNECT_MAX_FAILURES] (10) consecutive connection attempts have
 * failed, and reports a permanent disconnect so a genuinely offline device cannot drain the battery forever.
 *
 * Virtual-time budget for 10 failures with a bonded device (no scan):
 * - each iteration = 1 s settle + connectAndAwait throwing + backoff
 * - backoffs: 5 s, 10 s, 20 s, 40 s, 60 s, 60 s, 60 s, 60 s, 60 s (the loop exits at failure 10 before backing off)
 * - total = 10 × 1 s settle + (5 + 10 + 20 + 40 + 60 × 5) s = 10 s + 375 s = 385 s (385_000 ms)
 *
 * The test advances a generous 400_000 ms (plus 1 ms) of virtual time to absorb any timing variance.
 */
@Test
fun `reconnect loop stops after RECONNECT_MAX_FAILURES with permanent disconnect`() = runTest {
    // Arrange: a bonded device whose every connection attempt fails with the same simulated error.
    val bondedDevice = FakeBleDevice(address = address, name = "Test Device")
    bluetoothRepository.bond(bondedDevice)
    connection.connectException = RadioNotConnectedException("simulated failure")
    every { service.onDisconnect(any(), any()) } returns Unit

    val radio =
        BleRadioInterface(
            serviceScope = this,
            scanner = scanner,
            bluetoothRepository = bluetoothRepository,
            connectionFactory = connectionFactory,
            service = service,
            address = address,
        )

    // Act: let virtual time run long enough for all 10 failures to occur.
    advanceTimeBy(400_001L)

    // Assert: at least one disconnect (the final one) must have been flagged as permanent.
    verify { service.onDisconnect(isPermanent = true, errorMessage = any()) }

    radio.close()
}
/**
 * Pins the reconnect backoff schedule: 5 s for failure counts 0 and 1, doubling for each further failure,
 * and capped at 60 s from the fifth failure onwards.
 */
@Test
fun `computeReconnectBackoffMs returns correct backoff values`() {
    // failure count -> expected backoff in milliseconds
    val expectedBackoffByFailureCount =
        listOf(
            0 to 5_000L,
            1 to 5_000L,
            2 to 10_000L,
            3 to 20_000L,
            4 to 40_000L,
            5 to 60_000L,
            10 to 60_000L,
            100 to 60_000L,
        )
    for ((failureCount, expectedBackoffMs) in expectedBackoffByFailureCount) {
        assertEquals(expectedBackoffMs, computeReconnectBackoffMs(failureCount))
    }
}
}

View file

@ -25,6 +25,8 @@ import kotlinx.coroutines.isActive
import kotlinx.coroutines.launch
import org.meshtastic.core.network.radio.StreamInterface
import org.meshtastic.core.repository.RadioInterfaceService
import org.meshtastic.proto.Heartbeat
import org.meshtastic.proto.ToRadio
import java.io.File
/**
@ -137,7 +139,11 @@ private constructor(
}
/**
 * Keeps the serial link alive by sending a `ToRadio` heartbeat to the device.
 *
 * The heartbeat makes the firmware reset its idle timer and answer with a `FromRadio` queueStatus,
 * proving the serial link is alive. Without it the serial transport has no way to detect a silently
 * dead device.
 */
override fun keepAlive() {
    Logger.d { "[$portName] Serial keepAlive — sending heartbeat" }
    val heartbeatFrame = ToRadio(heartbeat = Heartbeat()).encode()
    handleSendToRadio(heartbeatFrame)
}
private fun closePortResources() {