Enhance health check logic for BLE connections

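- Introduced a grace period for health checks on BLE connections (configurable via HEALTH_CHECK_GRACE_PERIOD, default 2), allowing a limited number of consecutive failed or timed-out checks, while the link still reports as connected, before considering the connection degraded.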
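- Adjusted health check timeout and retry logic for BLE and TCP connections to improve reliability: BLE now uses a 12 s timeout with 3 retries, TCP with SDK auto-reconnect keeps its 8 s timeout with 2 retries, all other connections keep the 5 s timeout with 2 retries, and the delay between retries increases from 0.2 s to 0.3 s.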
- Reset the health check failure count after every successful connection, reconnection, or health check, ensuring accurate tracking of connection status.
- Improved logging for health check failures and timeouts to aid in debugging and monitoring connection health.
This commit is contained in:
agessaman 2025-12-18 08:48:32 -08:00
parent 6d030af555
commit a83bc27eec

View file

@ -228,6 +228,10 @@ class PacketCapture:
self.connection_retry_jitter = self.get_env_bool('CONNECTION_RETRY_JITTER', True)
self.health_check_interval = self.get_env_int('HEALTH_CHECK_INTERVAL', 30)
# Health check grace period for BLE connections
self.health_check_grace_period = self.get_env_int('HEALTH_CHECK_GRACE_PERIOD', 2) # Allow 2 consecutive failures
self.health_check_failure_count = 0 # Track consecutive health check failures
# MQTT connection
self.mqtt_clients = [] # List of MQTT client info dictionaries
self.mqtt_connected = False
@ -1185,22 +1189,43 @@ class PacketCapture:
return False
# 3. Try a lightweight command with timeout and retry
# Use longer timeout for TCP with SDK auto-reconnect (device might be busy)
health_check_timeout = 8.0 if (self.connection_type == 'tcp' and self.tcp_sdk_auto_reconnect_enabled) else 5.0
# Use longer timeout for BLE connections (Linux BLE can be slow) and TCP with SDK auto-reconnect
if self.connection_type == 'ble':
health_check_timeout = 12.0 # Longer timeout for BLE on Linux
health_check_retries = 3 # More retries for BLE
elif self.connection_type == 'tcp' and self.tcp_sdk_auto_reconnect_enabled:
health_check_timeout = 8.0 # Longer timeout for TCP with SDK auto-reconnect
health_check_retries = 2
else:
health_check_timeout = 5.0 # Default for serial/TCP without SDK auto-reconnect
health_check_retries = 2
try:
result = await self.retryable_device_command(
lambda: self.meshcore.commands.send_device_query(),
"send_device_query (health check)",
timeout=health_check_timeout,
max_retries=2, # Fewer retries for health checks
retry_delay=0.2
max_retries=health_check_retries,
retry_delay=0.3 # Slightly longer delay for health checks
)
if result and hasattr(result, 'type') and result.type != EventType.ERROR:
# Success - reset failure count
self.health_check_failure_count = 0
return True
else:
if self.debug:
self.logger.debug(f"Health check device query failed: {result}")
# For BLE, if is_connected is True, we might still consider it healthy
# (BLE can have slow responses but connection might still be valid)
if self.connection_type == 'ble' and self.meshcore and self.meshcore.is_connected:
self.health_check_failure_count += 1
if self.health_check_failure_count <= self.health_check_grace_period:
if self.debug:
self.logger.debug(f"Health check query failed but BLE connection appears active (grace period: {self.health_check_failure_count}/{self.health_check_grace_period})")
return True # Allow grace period for BLE
else:
self.logger.warning(f"Health check failed {self.health_check_failure_count} times consecutively - connection may be degraded")
return False
return False
except asyncio.TimeoutError:
# For TCP with SDK auto-reconnect, timeout might just mean device is busy
@ -1209,6 +1234,18 @@ class PacketCapture:
if self.debug:
self.logger.debug("Health check timed out, but SDK auto-reconnect is active")
return False
# For BLE, allow grace period even on timeout if connection appears active
if self.connection_type == 'ble' and self.meshcore and self.meshcore.is_connected:
self.health_check_failure_count += 1
if self.health_check_failure_count <= self.health_check_grace_period:
if self.debug:
self.logger.debug(f"Health check timed out but BLE connection appears active (grace period: {self.health_check_failure_count}/{self.health_check_grace_period})")
return True # Allow grace period for BLE
else:
self.logger.warning(f"Health check timed out {self.health_check_failure_count} times consecutively - connection may be degraded")
return False
self.logger.warning("Health check timed out")
return False
except Exception as e:
@ -1430,6 +1467,8 @@ class PacketCapture:
if self.meshcore and self.meshcore.is_connected:
self.connected = True
# Reset health check failure count on successful connection
self.health_check_failure_count = 0
# Reset SDK reconnect exhaustion flag on successful connection
if self.connection_type == 'tcp':
self.sdk_reconnect_exhausted = False
@ -1678,11 +1717,14 @@ class PacketCapture:
await self.meshcore.start_auto_message_fetching()
# Reset consecutive failures on successful reconnection
self.reset_consecutive_failures("connection")
# Reset health check failure count
self.health_check_failure_count = 0
# Skip health check and reconnect logic - let SDK handle it
continue
# For other connection types or after SDK has exhausted, do normal health check
needs_reconnection = not self.connected or not await self.check_connection_health()
health_check_passed = await self.check_connection_health()
needs_reconnection = not self.connected or not health_check_passed
if needs_reconnection:
@ -1698,6 +1740,8 @@ class PacketCapture:
# Reset consecutive failures on successful reconnection
self.reset_consecutive_failures("connection")
# Reset health check failure count
self.health_check_failure_count = 0
# Reset SDK reconnect exhaustion flag on successful reconnect
if self.connection_type == 'tcp':
self.sdk_reconnect_exhausted = False