Enhancement - optional transparent homoglyph encoding of a few characters in certain languages for more compact and efficient text messages (#4491)

This commit is contained in:
Pavel Vasiliev 2026-02-07 21:49:35 +03:00 committed by GitHub
parent 6ec2ed76ca
commit 4303bfaac4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 297 additions and 4 deletions

View file

@ -0,0 +1,87 @@
/*
* Copyright (c) 2026 Meshtastic LLC
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package org.meshtastic.feature.messaging
/**
 * Utility that reduces the UTF-8 size of transmitted text messages. It replaces certain characters from national
 * alphabets with Latin characters that have an identical appearance (homoglyphs), for example:
 * cyrillic "А", "С", "у" -> latin "A", "C", "y", etc.
 *
 * According to statistics, such letters can make up about 20-25% of the total number of letters in an average text.
 * Replacing them with Latin characters reduces the binary size of the transmitted message, so an average message can
 * fit around ~140-145 characters instead of ~115-120.
 *
 * The substitution is one-way: this object only maps national characters to Latin ones and offers no reverse mapping.
 */
internal object HomoglyphCharacterStringTransformer {
    /**
     * Characters from the basic Cyrillic block (U+0400-U+04FF, https://www.compart.com/en/unicode/block/U+0400), each
     * of which occupies 2 bytes in UTF-8, mapped to the identically drawn Latin characters, each of which occupies
     * 1 byte in UTF-8.
     *
     * Only 100% "reliable", completely visually identical characters are listed here. Characters that look like Latin
     * ones but carry descenders, hooks, strokes, etc. are not replaced with a "simplified" Latin appearance and remain
     * 2-byte characters, as usual.
     */
    private val homoglyphCharactersSubstitutionMapping: Map<Char, Char> =
        mapOf(
            '\u0405' to 'S', // https://www.compart.com/en/unicode/U+0405 - Cyrillic Capital Letter Dze
            '\u0406' to
                'I', // https://www.compart.com/en/unicode/U+0406 - Cyrillic Capital Letter Byelorussian-Ukrainian I
            '\u0408' to 'J', // https://www.compart.com/en/unicode/U+0408 - Cyrillic Capital Letter Je
            '\u0410' to 'A', // https://www.compart.com/en/unicode/U+0410 - Cyrillic Capital Letter A
            '\u0412' to 'B', // https://www.compart.com/en/unicode/U+0412 - Cyrillic Capital Letter Ve
            '\u0415' to 'E', // https://www.compart.com/en/unicode/U+0415 - Cyrillic Capital Letter Ie
            '\u041A' to 'K', // https://www.compart.com/en/unicode/U+041A - Cyrillic Capital Letter Ka
            '\u041C' to 'M', // https://www.compart.com/en/unicode/U+041C - Cyrillic Capital Letter Em
            '\u041D' to 'H', // https://www.compart.com/en/unicode/U+041D - Cyrillic Capital Letter En
            '\u041E' to 'O', // https://www.compart.com/en/unicode/U+041E - Cyrillic Capital Letter O
            '\u0420' to 'P', // https://www.compart.com/en/unicode/U+0420 - Cyrillic Capital Letter Er
            '\u0421' to 'C', // https://www.compart.com/en/unicode/U+0421 - Cyrillic Capital Letter Es
            '\u0422' to 'T', // https://www.compart.com/en/unicode/U+0422 - Cyrillic Capital Letter Te
            '\u0425' to 'X', // https://www.compart.com/en/unicode/U+0425 - Cyrillic Capital Letter Ha
            '\u0430' to 'a', // https://www.compart.com/en/unicode/U+0430 - Cyrillic Small Letter A
            '\u0435' to 'e', // https://www.compart.com/en/unicode/U+0435 - Cyrillic Small Letter Ie
            '\u043E' to 'o', // https://www.compart.com/en/unicode/U+043E - Cyrillic Small Letter O
            '\u0440' to 'p', // https://www.compart.com/en/unicode/U+0440 - Cyrillic Small Letter Er
            '\u0441' to 'c', // https://www.compart.com/en/unicode/U+0441 - Cyrillic Small Letter Es
            '\u0443' to 'y', // https://www.compart.com/en/unicode/U+0443 - Cyrillic Small Letter U
            '\u0445' to 'x', // https://www.compart.com/en/unicode/U+0445 - Cyrillic Small Letter Ha
            '\u0455' to 's', // https://www.compart.com/en/unicode/U+0455 - Cyrillic Small Letter Dze
            '\u0456' to
                'i', // https://www.compart.com/en/unicode/U+0456 - Cyrillic Small Letter Byelorussian-Ukrainian I
            '\u0458' to 'j', // https://www.compart.com/en/unicode/U+0458 - Cyrillic Small Letter Je
            '\u04AE' to 'Y', // https://www.compart.com/en/unicode/U+04AE - Cyrillic Capital Letter Straight U
            // Note that capital "ze" below is a bit special - it technically transforms to the digit "three".
            // The visuals are the same across different fonts, and the core idea is unchanged:
            // we are still replacing a 2-byte letter with a character that occupies 1 byte in UTF-8.
            // It is pointed out explicitly to avoid confusion.
            '\u0417' to '3', // https://www.compart.com/en/unicode/U+0417 - Cyrillic Capital Letter Ze
        )

    /**
     * Returns a copy of [value] in which every character present in the homoglyph mapping is replaced with its
     * identical-looking Latin character, so that the text takes up fewer bytes when encoded as UTF-8. All other
     * characters are copied unchanged, so the result always has the same number of characters as the input.
     *
     * @param value original string value.
     * @return optimized string value.
     */
    fun optimizeUtf8StringWithHomoglyphs(value: String): String =
        // The substitution is strictly char-for-char, so the builder can be sized exactly up front.
        buildString(value.length) {
            for (c in value) append(homoglyphCharactersSubstitutionMapping[c] ?: c)
        }
}

View file

@ -178,6 +178,7 @@ fun MessageScreen(
val quickChatActions by viewModel.quickChatActions.collectAsStateWithLifecycle(initialValue = emptyList())
val pagedMessages = viewModel.getMessagesFromPaged(contactKey).collectAsLazyPagingItems()
val contactSettings by viewModel.contactSettings.collectAsStateWithLifecycle(initialValue = emptyMap())
val homoglyphEncodingEnabled by viewModel.homoglyphEncodingEnabled.collectAsStateWithLifecycle(initialValue = false)
// UI State managed within this Composable
var replyingToPacketId by rememberSaveable { mutableStateOf<Int?>(null) }
@ -469,6 +470,7 @@ fun MessageScreen(
)
MessageInput(
isEnabled = connectionState.isConnected(),
isHomoglyphEncodingEnabled = homoglyphEncodingEnabled,
textFieldState = messageInputState,
onSendMessage = {
val messageText = messageInputState.text.toString().trim()
@ -938,12 +940,21 @@ private const val MAX_LINES = 3
@Composable
private fun MessageInput(
isEnabled: Boolean,
isHomoglyphEncodingEnabled: Boolean,
textFieldState: TextFieldState,
modifier: Modifier = Modifier,
maxByteSize: Int = MESSAGE_CHARACTER_LIMIT_BYTES,
onSendMessage: () -> Unit,
) {
val currentText = textFieldState.text.toString()
val currentTextRaw = textFieldState.text.toString()
val currentText =
if (isHomoglyphEncodingEnabled) {
HomoglyphCharacterStringTransformer.optimizeUtf8StringWithHomoglyphs(currentTextRaw)
} else {
currentTextRaw
}
val currentByteLength =
remember(currentText) {
// Recalculate only when text changes
@ -1000,12 +1011,23 @@ private fun MessageInputPreview() {
AppTheme {
Surface {
Column(modifier = Modifier.padding(8.dp)) {
MessageInput(isEnabled = true, textFieldState = rememberTextFieldState("Hello"), onSendMessage = {})
MessageInput(
isEnabled = true,
isHomoglyphEncodingEnabled = false,
textFieldState = rememberTextFieldState("Hello"),
onSendMessage = {},
)
Spacer(Modifier.size(16.dp))
MessageInput(isEnabled = false, textFieldState = rememberTextFieldState("Disabled"), onSendMessage = {})
MessageInput(
isEnabled = false,
isHomoglyphEncodingEnabled = false,
textFieldState = rememberTextFieldState("Disabled"),
onSendMessage = {},
)
Spacer(Modifier.size(16.dp))
MessageInput(
isEnabled = true,
isHomoglyphEncodingEnabled = false,
textFieldState =
rememberTextFieldState(
"A very long message that might exceed the byte limit " +
@ -1018,6 +1040,7 @@ private fun MessageInputPreview() {
// Test Japanese characters (multi-byte)
MessageInput(
isEnabled = true,
isHomoglyphEncodingEnabled = false,
textFieldState = rememberTextFieldState("こんにちは世界"), // Hello World in Japanese
onSendMessage = {},
maxByteSize = 10,

View file

@ -44,6 +44,7 @@ import org.meshtastic.core.database.model.Node
import org.meshtastic.core.model.Capabilities
import org.meshtastic.core.model.DataPacket
import org.meshtastic.core.prefs.emoji.CustomEmojiPrefs
import org.meshtastic.core.prefs.homoglyph.HomoglyphPrefs
import org.meshtastic.core.prefs.ui.UiPrefs
import org.meshtastic.core.service.MeshServiceNotifications
import org.meshtastic.core.service.ServiceAction
@ -67,6 +68,7 @@ constructor(
private val packetRepository: PacketRepository,
private val uiPrefs: UiPrefs,
private val customEmojiPrefs: CustomEmojiPrefs,
private val homoglyphEncodingPrefs: HomoglyphPrefs,
private val meshServiceNotifications: MeshServiceNotifications,
) : ViewModel() {
private val _title = MutableStateFlow("")
@ -122,6 +124,8 @@ constructor(
?.map { it.first }
?.take(6) ?: listOf("👍", "👎", "😂", "🔥", "❤️", "😮")
val homoglyphEncodingEnabled = homoglyphEncodingPrefs.getHomoglyphEncodingEnabledChangesFlow()
init {
val contactKey = savedStateHandle.get<String>("contactKey")
if (contactKey != null) {
@ -204,8 +208,20 @@ constructor(
}
}
}
// Apply homoglyph encoding to the transmitted string if the user has activated the feature.
// In most cases the value in the "str" parameter will already contain the correct
// transformed string from the text input. This call is made here to ensure that
// the feature is effective across all possible message paths (quick-chat, reply, etc.)
val dataPacketText: String =
if (homoglyphEncodingPrefs.homoglyphEncodingEnabled) {
HomoglyphCharacterStringTransformer.optimizeUtf8StringWithHomoglyphs(str)
} else {
str
}
val p =
DataPacket(dest, channel ?: 0, str, replyId).apply {
DataPacket(dest, channel ?: 0, dataPacketText, replyId).apply {
from = ourNodeInfo.value?.user?.id ?: DataPacket.ID_LOCAL
}
sendDataPacket(p)

View file

@ -0,0 +1,64 @@
/*
* Copyright (c) 2026 Meshtastic LLC
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package org.meshtastic.feature.messaging
import org.junit.Assert.assertEquals
import org.junit.Assert.assertTrue
import org.junit.Test
/**
 * Unit tests for [HomoglyphCharacterStringTransformer.optimizeUtf8StringWithHomoglyphs].
 *
 * All assertions follow JUnit's `(expected, actual)` argument order so that failure messages report the values
 * the right way round.
 */
class HomoglyphCharacterTransformTest {
    /** Number of bytes this string occupies when encoded as UTF-8. */
    private fun String.utf8Size(): Int = toByteArray(charset = Charsets.UTF_8).size

    @Test
    fun `optimizeUtf8StringWithHomoglyphs shrinks binary size of cyrillic text containing some homoglyphs`() {
        val testString = "Мештастик - это проект с открытым исходным кодом"

        val transformedTestString = HomoglyphCharacterStringTransformer.optimizeUtf8StringWithHomoglyphs(testString)

        val originalSize = testString.utf8Size()
        val transformedSize = transformedTestString.utf8Size()
        assertTrue(
            "expected fewer than $originalSize bytes but was $transformedSize",
            transformedSize < originalSize,
        )
    }

    @Test
    fun `optimizeUtf8StringWithHomoglyphs shrinks binary size in half of cyrillic text containing only homoglyphs`() {
        val testString = "Косуха"

        val transformedTestString = HomoglyphCharacterStringTransformer.optimizeUtf8StringWithHomoglyphs(testString)

        // Every character is a 2-byte homoglyph, so each one becomes a 1-byte Latin letter.
        assertEquals(testString.utf8Size() / 2, transformedTestString.utf8Size())
        // Pin the exact substitutions (pure ASCII expected value), not just the resulting size.
        assertEquals("Kocyxa", transformedTestString)
    }

    @Test
    fun `optimizeUtf8StringWithHomoglyphs does not transform cyrillic text without any homoglyphs`() {
        val testString = "Близкий"

        val transformedTestString = HomoglyphCharacterStringTransformer.optimizeUtf8StringWithHomoglyphs(testString)

        assertEquals(testString, transformedTestString)
    }

    @Test
    fun `optimizeUtf8StringWithHomoglyphs does not transform latin text message`() {
        val testString = "Meshtastic is an open source, off-grid, decentralized mesh network"

        val transformedTestString = HomoglyphCharacterStringTransformer.optimizeUtf8StringWithHomoglyphs(testString)

        assertEquals(testString, transformedTestString)
    }

    @Test
    fun `optimizeUtf8StringWithHomoglyphs does not transform characters impossible to present by latin letters`() {
        val testString = "ميشتاستيك هو مصدر مفتوح ، خارج الشبكة ، شبكة شبكة"

        val transformedTestString = HomoglyphCharacterStringTransformer.optimizeUtf8StringWithHomoglyphs(testString)

        assertEquals(testString, transformedTestString)
    }
}