Enhancement - optional transparent homoglyph encoding of a few characters in certain languages for more compact and efficient text messages (#4491)

2026-04-20 22:23:37 +00:00 · 2026-02-07 21:49:35 +03:00 · 2026-02-07 21:49:35 +03:00 · 4303bfaac4
commit 4303bfaac4
parent 6ec2ed76ca
10 changed files with 297 additions and 4 deletions
--- a/feature/messaging/src/main/kotlin/org/meshtastic/feature/messaging/HomoglyphCharacterStringTransformer.kt
+++ b/feature/messaging/src/main/kotlin/org/meshtastic/feature/messaging/HomoglyphCharacterStringTransformer.kt
@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2026 Meshtastic LLC
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+package org.meshtastic.feature.messaging
+
+/**
+ * Utility that reduces the UTF-8 byte size of transmitted text message strings. It replaces certain characters of
+ * national alphabets with Latin characters that have an identical appearance (homoglyphs), for example: Cyrillic
+ * "А", "С", "у" -> Latin "A", "C", "y", etc. According to statistics, such letters can make up about 20-25% of the
+ * total number of letters in an average text. Replacing them with Latin characters reduces the binary size of the
+ * transmitted message, so an average message can fit around ~140-145 characters instead of ~115-120.
+ *
+ * Only the forward (national alphabet -> Latin) substitution is implemented here.
+ */
+internal object HomoglyphCharacterStringTransformer {
+
+    /**
+     * Unicode characters from the basic Cyrillic block (U+0400-U+04FF), each of which occupies 2 bytes in UTF-8
+     * (https://www.compart.com/en/unicode/block/U+0400), mapped to the identically written Latin characters, each of
+     * which occupies 1 byte in UTF-8.
+     *
+     * Please note that only 100% "reliable", completely visually identical characters are listed here. Characters
+     * that look like Latin ones but carry descenders, hooks, strokes, etc. are not replaced with a "simplified"
+     * Latin appearance and remain 2-byte characters, as usual.
+     */
+    private val homoglyphCharactersSubstitutionMapping: Map<Char, Char> =
+        mapOf(
+            '\u0405' to 'S', // https://www.compart.com/en/unicode/U+0405 - Cyrillic Capital Letter Dze
+            '\u0406' to
+                'I', // https://www.compart.com/en/unicode/U+0406 - Cyrillic Capital Letter Byelorussian-Ukrainian I
+            '\u0408' to 'J', // https://www.compart.com/en/unicode/U+0408 - Cyrillic Capital Letter Je
+            '\u0410' to 'A', // https://www.compart.com/en/unicode/U+0410 - Cyrillic Capital Letter A
+            '\u0412' to 'B', // https://www.compart.com/en/unicode/U+0412 - Cyrillic Capital Letter Ve
+            '\u0415' to 'E', // https://www.compart.com/en/unicode/U+0415 - Cyrillic Capital Letter Ie
+            '\u041A' to 'K', // https://www.compart.com/en/unicode/U+041A - Cyrillic Capital Letter Ka
+            '\u041C' to 'M', // https://www.compart.com/en/unicode/U+041C - Cyrillic Capital Letter Em
+            '\u041D' to 'H', // https://www.compart.com/en/unicode/U+041D - Cyrillic Capital Letter En
+            '\u041E' to 'O', // https://www.compart.com/en/unicode/U+041E - Cyrillic Capital Letter O
+            '\u0420' to 'P', // https://www.compart.com/en/unicode/U+0420 - Cyrillic Capital Letter Er
+            '\u0421' to 'C', // https://www.compart.com/en/unicode/U+0421 - Cyrillic Capital Letter Es
+            '\u0422' to 'T', // https://www.compart.com/en/unicode/U+0422 - Cyrillic Capital Letter Te
+            '\u0425' to 'X', // https://www.compart.com/en/unicode/U+0425 - Cyrillic Capital Letter Ha
+            '\u0430' to 'a', // https://www.compart.com/en/unicode/U+0430 - Cyrillic Small Letter A
+            '\u0435' to 'e', // https://www.compart.com/en/unicode/U+0435 - Cyrillic Small Letter Ie
+            '\u043E' to 'o', // https://www.compart.com/en/unicode/U+043E - Cyrillic Small Letter O
+            '\u0440' to 'p', // https://www.compart.com/en/unicode/U+0440 - Cyrillic Small Letter Er
+            '\u0441' to 'c', // https://www.compart.com/en/unicode/U+0441 - Cyrillic Small Letter Es
+            '\u0443' to 'y', // https://www.compart.com/en/unicode/U+0443 - Cyrillic Small Letter U
+            '\u0445' to 'x', // https://www.compart.com/en/unicode/U+0445 - Cyrillic Small Letter Ha
+            '\u0455' to 's', // https://www.compart.com/en/unicode/U+0455 - Cyrillic Small Letter Dze
+            '\u0456' to
+                'i', // https://www.compart.com/en/unicode/U+0456 - Cyrillic Small Letter Byelorussian-Ukrainian I
+            '\u0458' to 'j', // https://www.compart.com/en/unicode/U+0458 - Cyrillic Small Letter Je
+            '\u04AE' to 'Y', // https://www.compart.com/en/unicode/U+04AE - Cyrillic Capital Letter Straight U
+            // Note that capital "Ze" below is a bit special: it is replaced with the digit "three", not a letter.
+            // The glyphs look the same across different fonts, and the core idea is unchanged: a 2-byte Cyrillic
+            // letter is replaced with a character that occupies 1 byte in UTF-8. It is pointed out here to avoid
+            // confusion.
+            '\u0417' to '3', // https://www.compart.com/en/unicode/U+0417 - Cyrillic Capital Letter Ze
+        )
+
+    /**
+     * Returns a copy of [value] in which every character found in [homoglyphCharactersSubstitutionMapping] is replaced
+     * with its Latin look-alike, so that the text takes up fewer bytes in UTF-8 and is more compact for transmission.
+     * Characters without a mapping are appended unchanged; an empty input yields an empty string.
+     *
+     * @param value original string value.
+     * @return optimized string value.
+     */
+    fun optimizeUtf8StringWithHomoglyphs(value: String): String =
+        buildString(value.length) {
+            for (c in value) append(homoglyphCharactersSubstitutionMapping[c] ?: c)
+        }
+}
--- a/feature/messaging/src/main/kotlin/org/meshtastic/feature/messaging/Message.kt
+++ b/feature/messaging/src/main/kotlin/org/meshtastic/feature/messaging/Message.kt
@ -178,6 +178,7 @@ fun MessageScreen(
    val quickChatActions by viewModel.quickChatActions.collectAsStateWithLifecycle(initialValue = emptyList())
    val pagedMessages = viewModel.getMessagesFromPaged(contactKey).collectAsLazyPagingItems()
    val contactSettings by viewModel.contactSettings.collectAsStateWithLifecycle(initialValue = emptyMap())
+    val homoglyphEncodingEnabled by viewModel.homoglyphEncodingEnabled.collectAsStateWithLifecycle(initialValue = false)

    // UI State managed within this Composable
    var replyingToPacketId by rememberSaveable { mutableStateOf<Int?>(null) }
@ -469,6 +470,7 @@ fun MessageScreen(
            )
            MessageInput(
                isEnabled = connectionState.isConnected(),
+                isHomoglyphEncodingEnabled = homoglyphEncodingEnabled,
                textFieldState = messageInputState,
                onSendMessage = {
                    val messageText = messageInputState.text.toString().trim()
@ -938,12 +940,21 @@ private const val MAX_LINES = 3
@Composable
 private fun MessageInput(
    isEnabled: Boolean,
+    isHomoglyphEncodingEnabled: Boolean,
    textFieldState: TextFieldState,
    modifier: Modifier = Modifier,
    maxByteSize: Int = MESSAGE_CHARACTER_LIMIT_BYTES,
    onSendMessage: () -> Unit,
 ) {
-    val currentText = textFieldState.text.toString()
+    val currentTextRaw = textFieldState.text.toString()
+
+    val currentText =
+        if (isHomoglyphEncodingEnabled) {
+            HomoglyphCharacterStringTransformer.optimizeUtf8StringWithHomoglyphs(currentTextRaw)
+        } else {
+            currentTextRaw
+        }
+
    val currentByteLength =
        remember(currentText) {
            // Recalculate only when text changes
@ -1000,12 +1011,23 @@ private fun MessageInputPreview() {
    AppTheme {
        Surface {
            Column(modifier = Modifier.padding(8.dp)) {
-                MessageInput(isEnabled = true, textFieldState = rememberTextFieldState("Hello"), onSendMessage = {})
+                MessageInput(
+                    isEnabled = true,
+                    isHomoglyphEncodingEnabled = false,
+                    textFieldState = rememberTextFieldState("Hello"),
+                    onSendMessage = {},
+                )
                Spacer(Modifier.size(16.dp))
-                MessageInput(isEnabled = false, textFieldState = rememberTextFieldState("Disabled"), onSendMessage = {})
+                MessageInput(
+                    isEnabled = false,
+                    isHomoglyphEncodingEnabled = false,
+                    textFieldState = rememberTextFieldState("Disabled"),
+                    onSendMessage = {},
+                )
                Spacer(Modifier.size(16.dp))
                MessageInput(
                    isEnabled = true,
+                    isHomoglyphEncodingEnabled = false,
                    textFieldState =
                    rememberTextFieldState(
                        "A very long message that might exceed the byte limit " +
@ -1018,6 +1040,7 @@ private fun MessageInputPreview() {
                // Test Japanese characters (multi-byte)
                MessageInput(
                    isEnabled = true,
+                    isHomoglyphEncodingEnabled = false,
                    textFieldState = rememberTextFieldState("こんにちは世界"), // Hello World in Japanese
                    onSendMessage = {},
                    maxByteSize = 10,
--- a/feature/messaging/src/main/kotlin/org/meshtastic/feature/messaging/MessageViewModel.kt
+++ b/feature/messaging/src/main/kotlin/org/meshtastic/feature/messaging/MessageViewModel.kt
@ -44,6 +44,7 @@ import org.meshtastic.core.database.model.Node
 import org.meshtastic.core.model.Capabilities
 import org.meshtastic.core.model.DataPacket
 import org.meshtastic.core.prefs.emoji.CustomEmojiPrefs
+import org.meshtastic.core.prefs.homoglyph.HomoglyphPrefs
 import org.meshtastic.core.prefs.ui.UiPrefs
 import org.meshtastic.core.service.MeshServiceNotifications
 import org.meshtastic.core.service.ServiceAction
@ -67,6 +68,7 @@ constructor(
    private val packetRepository: PacketRepository,
    private val uiPrefs: UiPrefs,
    private val customEmojiPrefs: CustomEmojiPrefs,
+    private val homoglyphEncodingPrefs: HomoglyphPrefs,
    private val meshServiceNotifications: MeshServiceNotifications,
 ) : ViewModel() {
    private val _title = MutableStateFlow("")
@ -122,6 +124,8 @@ constructor(
                ?.map { it.first }
                ?.take(6) ?: listOf("👍", "👎", "😂", "🔥", "❤️", "😮")

+    val homoglyphEncodingEnabled = homoglyphEncodingPrefs.getHomoglyphEncodingEnabledChangesFlow()
+
    init {
        val contactKey = savedStateHandle.get<String>("contactKey")
        if (contactKey != null) {
@ -204,8 +208,20 @@ constructor(
                }
            }
        }
+
+        // Apply homoglyph encoding to the transmitted string if the user has activated the feature.
+        // In most cases the value of the "str" parameter will already contain the correct
+        // transformed string from the text input. This call is added here to make sure that
+        // the feature is effective across all possible message paths (quick-chat, reply, etc.)
+        val dataPacketText: String =
+            if (homoglyphEncodingPrefs.homoglyphEncodingEnabled) {
+                HomoglyphCharacterStringTransformer.optimizeUtf8StringWithHomoglyphs(str)
+            } else {
+                str
+            }
+
        val p =
-            DataPacket(dest, channel ?: 0, str, replyId).apply {
+            DataPacket(dest, channel ?: 0, dataPacketText, replyId).apply {
                from = ourNodeInfo.value?.user?.id ?: DataPacket.ID_LOCAL
            }
        sendDataPacket(p)
--- a/feature/messaging/src/test/kotlin/org/meshtastic/feature/messaging/HomoglyphCharacterTransformTest.kt
+++ b/feature/messaging/src/test/kotlin/org/meshtastic/feature/messaging/HomoglyphCharacterTransformTest.kt
@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2026 Meshtastic LLC
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+package org.meshtastic.feature.messaging
+
+import org.junit.Assert.assertEquals
+import org.junit.Assert.assertTrue
+import org.junit.Test
+
+/**
+ * Unit tests for [HomoglyphCharacterStringTransformer.optimizeUtf8StringWithHomoglyphs].
+ *
+ * All `assertEquals` calls follow the JUnit `(expected, actual)` argument order so that failure messages report the
+ * two values the right way round.
+ */
+class HomoglyphCharacterTransformTest {
+
+    @Test
+    fun `optimizeUtf8StringWithHomoglyphs shrinks binary size of cyrillic text containing some homoglyphs`() {
+        val testString = "Мештастик - это проект с открытым исходным кодом"
+        val transformedTestString = HomoglyphCharacterStringTransformer.optimizeUtf8StringWithHomoglyphs(testString)
+        val testStringBytes = testString.toByteArray(charset = Charsets.UTF_8)
+        val transformedTestStringBytes = transformedTestString.toByteArray(charset = Charsets.UTF_8)
+        val transformedStringBinarySizeShrunk = transformedTestStringBytes.size < testStringBytes.size
+        assertTrue(transformedStringBinarySizeShrunk)
+    }
+
+    @Test
+    fun `optimizeUtf8StringWithHomoglyphs shrinks binary size in half of cyrillic text containing only homoglyphs`() {
+        val testString = "Косуха"
+        val transformedTestString = HomoglyphCharacterStringTransformer.optimizeUtf8StringWithHomoglyphs(testString)
+        val testStringBytes = testString.toByteArray(charset = Charsets.UTF_8)
+        val transformedTestStringBytes = transformedTestString.toByteArray(charset = Charsets.UTF_8)
+        // Every letter of the input has a mapping, so the result is pure Latin text at 1 byte per character
+        assertEquals("Kocyxa", transformedTestString)
+        assertEquals(testStringBytes.size / 2, transformedTestStringBytes.size)
+    }
+
+    @Test
+    fun `optimizeUtf8StringWithHomoglyphs does not transform cyrillic text without any homoglyphs`() {
+        val testString = "Близкий"
+        val transformedTestString = HomoglyphCharacterStringTransformer.optimizeUtf8StringWithHomoglyphs(testString)
+        assertEquals(testString, transformedTestString)
+    }
+
+    @Test
+    fun `optimizeUtf8StringWithHomoglyphs does not transform latin text message`() {
+        val testString = "Meshtastic is an open source, off-grid, decentralized mesh network"
+        val transformedTestString = HomoglyphCharacterStringTransformer.optimizeUtf8StringWithHomoglyphs(testString)
+        assertEquals(testString, transformedTestString)
+    }
+
+    @Test
+    fun `optimizeUtf8StringWithHomoglyphs does not transform characters impossible to present by latin letters`() {
+        val testString = "ميشتاستيك هو مصدر مفتوح ، خارج الشبكة ، شبكة شبكة"
+        val transformedTestString = HomoglyphCharacterStringTransformer.optimizeUtf8StringWithHomoglyphs(testString)
+        assertEquals(testString, transformedTestString)
+    }
+}