diff --git a/CPP/7zip/Archive/LzhHandler.cpp b/CPP/7zip/Archive/LzhHandler.cpp index 8959300..e3cb9e7 100644 --- a/CPP/7zip/Archive/LzhHandler.cpp +++ b/CPP/7zip/Archive/LzhHandler.cpp @@ -463,7 +463,12 @@ Z7_COM7F_IMF(CHandler::GetProperty(UInt32 index, PROPID propID, PROPVARIANT *val { case kpidPath: { - UString s = NItemName::WinPathToOsPath(MultiByteToUnicodeString(item.GetName(), CP_OEMCP)); + UString res; + #ifndef _WIN32 + if (!UnixConvertLegacyToUnicode(item.GetName(), res, true, false, 0)) + #endif + MultiByteToUnicodeString2(res, item.GetName(), CP_OEMCP); + UString s = NItemName::WinPathToOsPath(res); if (!s.IsEmpty()) { if (s.Back() == WCHAR_PATH_SEPARATOR) diff --git a/CPP/7zip/Archive/Zip/ZipItem.cpp b/CPP/7zip/Archive/Zip/ZipItem.cpp index 327fcb4..0ff6b6a 100644 --- a/CPP/7zip/Archive/Zip/ZipItem.cpp +++ b/CPP/7zip/Archive/Zip/ZipItem.cpp @@ -1,12 +1,5 @@ // Archive/ZipItem.cpp -#ifndef _WIN32 -#include -#include -#include -#include -#endif - #include "StdAfx.h" #include "../../../../C/CpuArch.h" @@ -440,6 +433,15 @@ void CItem::GetUnicodeString(UString &res, const AString &s, bool isComment, boo } } + #ifndef _WIN32 + bool isOem = false; + bool isAnsi = false; + if (MadeByVersion.HostOS == NFileHeader::NHostOS::kNTFS && MadeByVersion.Version >= 20) isAnsi = true; + else if (MadeByVersion.HostOS == NFileHeader::NHostOS::kNTFS || MadeByVersion.HostOS == NFileHeader::NHostOS::kFAT) isOem = true; + if (isOem || isAnsi || (useSpecifiedCodePage && codePage != 65001)) + if (UnixConvertLegacyToUnicode(s, res, isOem, useSpecifiedCodePage, codePage)) return; + #endif + if (useSpecifiedCodePage) isUtf8 = (codePage == CP_UTF8); #ifdef _WIN32 @@ -458,184 +460,6 @@ void CItem::GetUnicodeString(UString &res, const AString &s, bool isComment, boo #endif } - #ifndef _WIN32 - - // Convert OEM char set to UTF-8 if needed - // Use system locale to select code page - - // locale -> code page translation tables generated from Wine source code - - const char *lcToOemTable[] = { - "af_ZA", "CP850", "ar_SA", "CP720", "ar_LB", "CP720", "ar_EG", "CP720", - "ar_DZ", "CP720", "ar_BH", "CP720", "ar_IQ", "CP720", "ar_JO", "CP720", - "ar_KW", "CP720", "ar_LY", "CP720", "ar_MA", "CP720", "ar_OM", "CP720", - "ar_QA", "CP720", "ar_SY", "CP720", "ar_TN", "CP720", "ar_AE", "CP720", - "ar_YE", "CP720", "ast_ES", "CP850", "az_AZ@cyrillic", "CP866", "az_AZ", "CP857", - "be_BY", "CP866", "bg_BG", "CP866", "br_FR", "CP850", "ca_ES", "CP850", - "zh_CN", "CP936", "zh_TW", "CP950", "kw_GB", "CP850", "cs_CZ", "CP852", - "cy_GB", "CP850", "da_DK", "CP850", "de_AT", "CP850", "de_LI", "CP850", - "de_LU", "CP850", "de_CH", "CP850", "de_DE", "CP850", "el_GR", "CP737", - "en_AU", "CP850", "en_CA", "CP850", "en_GB", "CP850", "en_IE", "CP850", - "en_JM", "CP850", "en_BZ", "CP850", "en_PH", "CP437", "en_ZA", "CP437", - "en_TT", "CP850", "en_US", "CP437", "en_ZW", "CP437", "en_NZ", "CP850", - "es_PA", "CP850", "es_BO", "CP850", "es_CR", "CP850", "es_DO", "CP850", - "es_SV", "CP850", "es_EC", "CP850", "es_GT", "CP850", "es_HN", "CP850", - "es_NI", "CP850", "es_CL", "CP850", "es_MX", "CP850", "es_ES", "CP850", - "es_CO", "CP850", "es_ES", "CP850", "es_PE", "CP850", "es_AR", "CP850", - "es_PR", "CP850", "es_VE", "CP850", "es_UY", "CP850", "es_PY", "CP850", - "et_EE", "CP775", "eu_ES", "CP850", "fa_IR", "CP720", "fi_FI", "CP850", - "fo_FO", "CP850", "fr_FR", "CP850", "fr_BE", "CP850", "fr_CA", "CP850", - "fr_LU", "CP850", "fr_MC", "CP850", "fr_CH", "CP850", "ga_IE", "CP437", - "gd_GB", "CP850", "gv_IM", "CP850", "gl_ES", "CP850", "he_IL", "CP862", - "hr_HR", "CP852", "hu_HU", "CP852", "id_ID", "CP850", "is_IS", "CP850", - "it_IT", "CP850", "it_CH", "CP850", "iv_IV", "CP437", "ja_JP", "CP932", - "kk_KZ", "CP866", "ko_KR", "CP949", "ky_KG", "CP866", "lt_LT", "CP775", - "lv_LV", "CP775", "mk_MK", "CP866", "mn_MN", "CP866", "ms_BN", "CP850", - "ms_MY", "CP850", "nl_BE", "CP850", "nl_NL", "CP850", "nl_SR", "CP850", - "nn_NO", "CP850", "nb_NO", "CP850", "pl_PL", "CP852", "pt_BR", "CP850", - "pt_PT", "CP850", "rm_CH", "CP850", "ro_RO", "CP852", "ru_RU", "CP866", - "sk_SK", "CP852", "sl_SI", "CP852", "sq_AL", "CP852", "sr_RS@latin", "CP852", - "sr_RS", "CP855", "sv_SE", "CP850", "sv_FI", "CP850", "sw_KE", "CP437", - "th_TH", "CP874", "tr_TR", "CP857", "tt_RU", "CP866", "uk_UA", "CP866", - "ur_PK", "CP720", "uz_UZ@cyrillic", "CP866", "uz_UZ", "CP857", "vi_VN", "CP1258", - "wa_BE", "CP850", "zh_HK", "CP950", "zh_SG", "CP936"}; - - const char *lcToAnsiTable[] = { - "af_ZA", "CP1252", "ar_SA", "CP1256", "ar_LB", "CP1256", "ar_EG", "CP1256", - "ar_DZ", "CP1256", "ar_BH", "CP1256", "ar_IQ", "CP1256", "ar_JO", "CP1256", - "ar_KW", "CP1256", "ar_LY", "CP1256", "ar_MA", "CP1256", "ar_OM", "CP1256", - "ar_QA", "CP1256", "ar_SY", "CP1256", "ar_TN", "CP1256", "ar_AE", "CP1256", - "ar_YE", "CP1256","ast_ES", "CP1252", "az_AZ@cyrillic", "CP1251", "az_AZ", "CP1254", - "be_BY", "CP1251", "bg_BG", "CP1251", "br_FR", "CP1252", "ca_ES", "CP1252", - "zh_CN", "CP936", "zh_TW", "CP950", "kw_GB", "CP1252", "cs_CZ", "CP1250", - "cy_GB", "CP1252", "da_DK", "CP1252", "de_AT", "CP1252", "de_LI", "CP1252", - "de_LU", "CP1252", "de_CH", "CP1252", "de_DE", "CP1252", "el_GR", "CP1253", - "en_AU", "CP1252", "en_CA", "CP1252", "en_GB", "CP1252", "en_IE", "CP1252", - "en_JM", "CP1252", "en_BZ", "CP1252", "en_PH", "CP1252", "en_ZA", "CP1252", - "en_TT", "CP1252", "en_US", "CP1252", "en_ZW", "CP1252", "en_NZ", "CP1252", - "es_PA", "CP1252", "es_BO", "CP1252", "es_CR", "CP1252", "es_DO", "CP1252", - "es_SV", "CP1252", "es_EC", "CP1252", "es_GT", "CP1252", "es_HN", "CP1252", - "es_NI", "CP1252", "es_CL", "CP1252", "es_MX", "CP1252", "es_ES", "CP1252", - "es_CO", "CP1252", "es_ES", "CP1252", "es_PE", "CP1252", "es_AR", "CP1252", - "es_PR", "CP1252", "es_VE", "CP1252", "es_UY", "CP1252", "es_PY", "CP1252", - "et_EE", "CP1257", "eu_ES", "CP1252", "fa_IR", "CP1256", "fi_FI", "CP1252", - "fo_FO", "CP1252", "fr_FR", "CP1252", "fr_BE", "CP1252", "fr_CA", "CP1252", - "fr_LU", "CP1252", "fr_MC", "CP1252", "fr_CH", "CP1252", "ga_IE", "CP1252", - "gd_GB", "CP1252", "gv_IM", "CP1252", "gl_ES", "CP1252", "he_IL", "CP1255", - "hr_HR", "CP1250", "hu_HU", "CP1250", "id_ID", "CP1252", "is_IS", "CP1252", - "it_IT", "CP1252", "it_CH", "CP1252", "iv_IV", "CP1252", "ja_JP", "CP932", - "kk_KZ", "CP1251", "ko_KR", "CP949", "ky_KG", "CP1251", "lt_LT", "CP1257", - "lv_LV", "CP1257", "mk_MK", "CP1251", "mn_MN", "CP1251", "ms_BN", "CP1252", - "ms_MY", "CP1252", "nl_BE", "CP1252", "nl_NL", "CP1252", "nl_SR", "CP1252", - "nn_NO", "CP1252", "nb_NO", "CP1252", "pl_PL", "CP1250", "pt_BR", "CP1252", - "pt_PT", "CP1252", "rm_CH", "CP1252", "ro_RO", "CP1250", "ru_RU", "CP1251", - "sk_SK", "CP1250", "sl_SI", "CP1250", "sq_AL", "CP1250", "sr_RS@latin", "CP1250", - "sr_RS", "CP1251", "sv_SE", "CP1252", "sv_FI", "CP1252", "sw_KE", "CP1252", - "th_TH", "CP874", "tr_TR", "CP1254", "tt_RU", "CP1251", "uk_UA", "CP1251", - "ur_PK", "CP1256", "uz_UZ@cyrillic", "CP1251", "uz_UZ", "CP1254", "vi_VN", "CP1258", - "wa_BE", "CP1252", "zh_HK", "CP950", "zh_SG", "CP936"}; - - bool isOem = false; - bool isAnsi = false; - - if (!isUtf8 && - MadeByVersion.HostOS == NFileHeader::NHostOS::kNTFS && - MadeByVersion.Version >= 20) { - isAnsi = true; - } else - if (!isUtf8 && - (MadeByVersion.HostOS == NFileHeader::NHostOS::kNTFS || - MadeByVersion.HostOS == NFileHeader::NHostOS::kFAT)) { - isOem = true; - } - - const char *legacyCp = nullptr; - const char *legacyCpAnsi = nullptr; - - if (isOem || isAnsi || (useSpecifiedCodePage && (codePage != 65001))) { - - int tableLen = sizeof(lcToOemTable) / sizeof(lcToOemTable[0]); - int lcLen = 0, i; - - // Detect required code page name from current locale - char *lc = getenv("LC_ALL"); - if (!lc || !lc[0]) { - lc = getenv("LC_CTYPE"); - } - if (!lc || !lc[0]) { - lc = getenv("LANG"); - } - - if (lc && lc[0]) { - // Compare up to the dot, if it exists, e.g. en_US.UTF-8 - for (lcLen = 0; lc[lcLen] != '.' && lc[lcLen] != ':' && lc[lcLen] != '\0'; ++lcLen); - - for (i = 0; i < tableLen; i += 2) - if (strncmp(lc, (lcToOemTable[i]), lcLen) == 0) { - legacyCp = lcToOemTable[i + 1]; - legacyCpAnsi = lcToAnsiTable[i + 1]; - break; // Stop searching once a match is found - } - - if (!legacyCp) { - legacyCp = "CP437"; - legacyCpAnsi = "CP1252"; - } - - char specCP[20]; - if (useSpecifiedCodePage) { - if (codePage == 0) { - strncpy(specCP, legacyCpAnsi, sizeof(legacyCpAnsi) - 1); - specCP[sizeof(legacyCpAnsi) - 1] = '\0'; - } - else if (codePage == 1) { - strncpy(specCP, legacyCp, sizeof(legacyCp) - 1); - specCP[sizeof(legacyCp) - 1] = '\0'; } - else { - snprintf(specCP, sizeof(specCP), "CP%d", codePage); - } - } - - iconv_t cd; - if ((cd = iconv_open("UTF-8", useSpecifiedCodePage ? specCP : (isOem ? legacyCp : legacyCpAnsi))) != (iconv_t)-1) { - - AString sUtf8; - - unsigned slen = s.Len(); - char* src = s.Ptr_non_const(); - - unsigned dlen = slen * 4 + 1; // (source length * 4) + null termination - char* dst = sUtf8.GetBuf_SetEnd(dlen); - const char* dstStart = dst; - - memset(dst, 0, dlen); - - size_t slen_size_t = static_cast(slen); - size_t dlen_size_t = static_cast(dlen); - size_t done = iconv(cd, &src, &slen_size_t, &dst, &dlen_size_t); - - if (done == (size_t)-1) { - iconv_close(cd); - - // iconv failed. Falling back to default behavior - MultiByteToUnicodeString2(res, s, useSpecifiedCodePage ? codePage : GetCodePage()); - return; - } - - // Null-terminate the result - *dst = '\0'; - - iconv_close(cd); - - size_t dstCorrectLength = dst - dstStart; - sUtf8.ReleaseBuf_SetEnd(static_cast(dstCorrectLength)); - - if (ConvertUTF8ToUnicode(sUtf8, res) /*|| ignore_Utf8_Errors*/) - return; - } - } - } - #endif if (isUtf8) { diff --git a/CPP/7zip/TEST_LegacyMapping.cpp b/CPP/7zip/TEST_LegacyMapping.cpp new file mode 100644 index 0000000..d851f88 --- /dev/null +++ b/CPP/7zip/TEST_LegacyMapping.cpp @@ -0,0 +1,58 @@ +#include "StdAfx.h" +#include "../Common/StringConvert.h" +#include +#include +#include + +#ifndef _WIN32 +// Linker dummy required for MyString.o dependencies +extern "C" UINT WINAPI SysStringLen(BSTR) { return 0; } + +void RunTest(const char* lc_all, bool isOem) +{ + if (lc_all) setenv("LC_ALL", lc_all, 1); + else unsetenv("LC_ALL"); + + UString result; + AString src("A"); // Standard ASCII "A" (0x41) + + printf("Testing Locale: %-15s (isOem=%d)... ", lc_all ? lc_all : "DEFAULT", isOem); + + // Test the internal mapping logic via iconv_open verification + bool ok = UnixConvertLegacyToUnicode(src, result, isOem, false, 0); + + if (ok) { + printf("[OK] (Mapping accepted by iconv)\n"); + } else { + printf("[FAIL] (Mapping rejected or conversion failed)\n"); + exit(1); + } +} + +int main() +{ + printf("Starting Legacy Codepage Mapping Tests...\n\n"); + + // 1. Japanese (Shift-JIS) + RunTest("ja_JP.UTF-8", false); + + // 2. Russian (Cyrillic OEM/ANSI) + RunTest("ru_RU.UTF-8", true); // Should map to IBM866 + RunTest("ru_RU.UTF-8", false); // Should map to WINDOWS-1251 + + // 3. German (Western Europe) + RunTest("de_DE.UTF-8", false); // Should map to WINDOWS-1252 / IBM850 + + // 4. Fallback/Standard + RunTest("C", true); // Should map to IBM437 + RunTest("POSIX", false); // Should map to WINDOWS-1252 + + printf("\nAll codepage mapping tests PASSED.\n"); + return 0; +} +#else +int main() { + printf("This test is for UNIX platforms only.\n"); + return 0; +} +#endif \ No newline at end of file diff --git a/CPP/Common/StringConvert.cpp b/CPP/Common/StringConvert.cpp index 79ff9e0..efcf02c 100644 --- a/CPP/Common/StringConvert.cpp +++ b/CPP/Common/StringConvert.cpp @@ -1,5 +1,7 @@ // Common/StringConvert.cpp +#include + #include "StdAfx.h" #include "StringConvert.h" @@ -17,6 +19,138 @@ #include #endif +#ifndef _WIN32 +#include + +static const char *lcToOemTable[] = { + "af_ZA", "IBM850", "ar_SA", "IBM720", "ar_LB", "IBM720", "ar_EG", "IBM720", + "ar_DZ", "IBM720", "ar_BH", "IBM720", "ar_IQ", "IBM720", "ar_JO", "IBM720", + "ar_KW", "IBM720", "ar_LY", "IBM720", "ar_MA", "IBM720", "ar_OM", "IBM720", + "ar_QA", "IBM720", "ar_SY", "IBM720", "ar_TN", "IBM720", "ar_AE", "IBM720", + "ar_YE", "IBM720", "ast_ES", "IBM850", "az_AZ@cyrillic", "IBM866", "az_AZ", "IBM857", + "be_BY", "IBM866", "bg_BG", "IBM866", "br_FR", "IBM850", "ca_ES", "IBM850", + "zh_CN", "GBK", "zh_TW", "BIG5", "kw_GB", "IBM850", "cs_CZ", "IBM852", + "cy_GB", "IBM850", "da_DK", "IBM850", "de_AT", "IBM850", "de_LI", "IBM850", + "de_LU", "IBM850", "de_CH", "IBM850", "de_DE", "IBM850", "el_GR", "IBM737", + "en_AU", "IBM850", "en_CA", "IBM850", "en_GB", "IBM850", "en_IE", "IBM850", + "en_JM", "IBM850", "en_BZ", "IBM850", "en_PH", "IBM437", "en_ZA", "IBM437", + "en_TT", "IBM850", "en_US", "IBM437", "en_ZW", "IBM437", "en_NZ", "IBM850", + "es_PA", "IBM850", "es_BO", "IBM850", "es_CR", "IBM850", "es_DO", "IBM850", + "es_SV", "IBM850", "es_EC", "IBM850", "es_GT", "IBM850", "es_HN", "IBM850", + "es_NI", "IBM850", "es_CL", "IBM850", "es_MX", "IBM850", "es_ES", "IBM850", + "es_CO", "IBM850", "es_PE", "IBM850", "es_AR", "IBM850", + "es_PR", "IBM850", "es_VE", "IBM850", "es_UY", "IBM850", "es_PY", "IBM850", + "et_EE", "IBM775", "eu_ES", "IBM850", "fa_IR", "IBM720", "fi_FI", "IBM850", + "fo_FO", "IBM850", "fr_FR", "IBM850", "fr_BE", "IBM850", "fr_CA", "IBM850", + "fr_LU", "IBM850", "fr_MC", "IBM850", "fr_CH", "IBM850", "ga_IE", "IBM437", + "gd_GB", "IBM850", "gv_IM", "IBM850", "gl_ES", "IBM850", "he_IL", "IBM862", + "hr_HR", "IBM852", "hu_HU", "IBM852", "id_ID", "IBM850", "is_IS", "IBM850", + "it_IT", "IBM850", "it_CH", "IBM850", "iv_IV", "IBM437", "ja_JP", "CP932", + "kk_KZ", "IBM866", "ko_KR", "CP949", "ky_KG", "IBM866", "lt_LT", "IBM775", + "lv_LV", "IBM775", "mk_MK", "IBM866", "mn_MN", "IBM866", "ms_BN", "IBM850", + "ms_MY", "IBM850", "nl_BE", "IBM850", "nl_NL", "IBM850", "nl_SR", "IBM850", + "nn_NO", "IBM850", "nb_NO", "IBM850", "pl_PL", "IBM852", "pt_BR", "IBM850", + "pt_PT", "IBM850", "rm_CH", "IBM850", "ro_RO", "IBM852", "ru_RU", "IBM866", + "sk_SK", "IBM852", "sl_SI", "IBM852", "sq_AL", "IBM852", "sr_RS@latin", "IBM852", + "sr_RS", "IBM855", "sv_SE", "IBM850", "sv_FI", "IBM850", "sw_KE", "IBM437", + "th_TH", "TIS-620", "tr_TR", "IBM857", "tt_RU", "IBM866", "uk_UA", "IBM866", + "ur_PK", "IBM720", "uz_UZ@cyrillic", "IBM866", "uz_UZ", "IBM857", "vi_VN", "WINDOWS-1258", + "wa_BE", "IBM850", "zh_HK", "BIG5-HKSCS", "zh_SG", "GBK" +}; + +static const char *lcToAnsiTable[] = { + "af_ZA", "WINDOWS-1252", "ar_SA", "WINDOWS-1256", "ar_LB", "WINDOWS-1256", "ar_EG", "WINDOWS-1256", + "ar_DZ", "WINDOWS-1256", "ar_BH", "WINDOWS-1256", "ar_IQ", "WINDOWS-1256", "ar_JO", "WINDOWS-1256", + "ar_KW", "WINDOWS-1256", "ar_LY", "WINDOWS-1256", "ar_MA", "WINDOWS-1256", "ar_OM", "WINDOWS-1256", + "ar_QA", "WINDOWS-1256", "ar_SY", "WINDOWS-1256", "ar_TN", "WINDOWS-1256", "ar_AE", "WINDOWS-1256", + "ar_YE", "WINDOWS-1256", "ast_ES", "WINDOWS-1252", "az_AZ@cyrillic", "WINDOWS-1251", "az_AZ", "WINDOWS-1254", + "be_BY", "WINDOWS-1251", "bg_BG", "WINDOWS-1251", "br_FR", "WINDOWS-1252", "ca_ES", "WINDOWS-1252", + "zh_CN", "GBK", "zh_TW", "BIG5", "kw_GB", "WINDOWS-1252", "cs_CZ", "WINDOWS-1250", + "cy_GB", "WINDOWS-1252", "da_DK", "WINDOWS-1252", "de_AT", "WINDOWS-1252", "de_LI", "WINDOWS-1252", + "de_LU", "WINDOWS-1252", "de_CH", "WINDOWS-1252", "de_DE", "WINDOWS-1252", "el_GR", "WINDOWS-1253", + "en_AU", "WINDOWS-1252", "en_CA", "WINDOWS-1252", "en_GB", "WINDOWS-1252", "en_IE", "WINDOWS-1252", + "en_JM", "WINDOWS-1252", "en_BZ", "WINDOWS-1252", "en_PH", "WINDOWS-1252", "en_ZA", "WINDOWS-1252", + "en_TT", "WINDOWS-1252", "en_US", "WINDOWS-1252", "en_ZW", "WINDOWS-1252", "en_NZ", "WINDOWS-1252", + "es_PA", "WINDOWS-1252", "es_BO", "WINDOWS-1252", "es_CR", "WINDOWS-1252", "es_DO", "WINDOWS-1252", + "es_SV", "WINDOWS-1252", "es_EC", "WINDOWS-1252", "es_GT", "WINDOWS-1252", "es_HN", "WINDOWS-1252", + "es_NI", "WINDOWS-1252", "es_CL", "WINDOWS-1252", "es_MX", "WINDOWS-1252", "es_ES", "WINDOWS-1252", + "es_CO", "WINDOWS-1252", "es_PE", "WINDOWS-1252", "es_AR", "WINDOWS-1252", + "es_PR", "WINDOWS-1252", "es_VE", "WINDOWS-1252", "es_UY", "WINDOWS-1252", "es_PY", "WINDOWS-1252", + "et_EE", "WINDOWS-1257", "eu_ES", "WINDOWS-1252", "fa_IR", "WINDOWS-1256", "fi_FI", "WINDOWS-1252", + "fo_FO", "WINDOWS-1252", "fr_FR", "WINDOWS-1252", "fr_BE", "WINDOWS-1252", "fr_CA", "WINDOWS-1252", + "fr_LU", "WINDOWS-1252", "fr_MC", "WINDOWS-1252", "fr_CH", "WINDOWS-1252", "ga_IE", "WINDOWS-1252", + "gd_GB", "WINDOWS-1252", "gv_IM", "WINDOWS-1252", "gl_ES", "WINDOWS-1252", "he_IL", "WINDOWS-1255", + "hr_HR", "WINDOWS-1250", "hu_HU", "WINDOWS-1250", "id_ID", "WINDOWS-1252", "is_IS", "WINDOWS-1252", + "it_IT", "WINDOWS-1252", "it_CH", "WINDOWS-1252", "iv_IV", "WINDOWS-1252", "ja_JP", "CP932", + "kk_KZ", "WINDOWS-1251", "ko_KR", "CP949", "ky_KG", "WINDOWS-1251", "lt_LT", "WINDOWS-1257", + "lv_LV", "WINDOWS-1257", "mk_MK", "WINDOWS-1251", "mn_MN", "WINDOWS-1251", "ms_BN", "WINDOWS-1252", + "ms_MY", "WINDOWS-1252", "nl_BE", "WINDOWS-1252", "nl_NL", "WINDOWS-1252", "nl_SR", "WINDOWS-1252", + "nn_NO", "WINDOWS-1252", "nb_NO", "WINDOWS-1252", "pl_PL", "WINDOWS-1250", "pt_BR", "WINDOWS-1252", + "pt_PT", "WINDOWS-1252", "rm_CH", "WINDOWS-1252", "ro_RO", "WINDOWS-1250", "ru_RU", "WINDOWS-1251", + "sk_SK", "WINDOWS-1250", "sl_SI", "WINDOWS-1250", "sq_AL", "WINDOWS-1250", "sr_RS@latin", "WINDOWS-1250", + "sr_RS", "WINDOWS-1251", "sv_SE", "WINDOWS-1252", "sv_FI", "WINDOWS-1252", "sw_KE", "WINDOWS-1252", + "th_TH", "WINDOWS-874", "tr_TR", "WINDOWS-1254", "tt_RU", "WINDOWS-1251", "uk_UA", "WINDOWS-1251", + "ur_PK", "WINDOWS-1256", "uz_UZ@cyrillic", "WINDOWS-1251", "uz_UZ", "WINDOWS-1254", "vi_VN", "WINDOWS-1258", + "wa_BE", "WINDOWS-1252", "zh_HK", "BIG5-HKSCS", "zh_SG", "GBK" +}; + +bool UnixConvertLegacyToUnicode(const AString &src, UString &dest, bool isOem, bool useSpecifiedCodePage, UINT codePage) +{ + const char *targetCp = NULL; + char specCP[32]; + + if (useSpecifiedCodePage && codePage != 65001) { + if (codePage == 0) isOem = false; + else if (codePage == 1) isOem = true; + else { + snprintf(specCP, sizeof(specCP), "CP%u", (unsigned)codePage); + targetCp = specCP; + } + } + + if (!targetCp) { + const char *lc = getenv("LC_ALL"); + if (!lc || !lc[0]) lc = getenv("LC_CTYPE"); + if (!lc || !lc[0]) lc = getenv("LANG"); + + if (!lc || !lc[0] || strcmp(lc, "C") == 0 || strcmp(lc, "POSIX") == 0) { + targetCp = isOem ? "IBM437" : "WINDOWS-1252"; + } else { + int lcLen; + for (lcLen = 0; lc[lcLen] != '.' && lc[lcLen] != '@' && lc[lcLen] != '\0'; ++lcLen); + const char **table = isOem ? lcToOemTable : lcToAnsiTable; + int tableLen = isOem ? (sizeof(lcToOemTable) / sizeof(lcToOemTable[0])) : (sizeof(lcToAnsiTable) / sizeof(lcToAnsiTable[0])); + for (int i = 0; i < tableLen; i += 2) { + if (strncmp(lc, table[i], lcLen) == 0 && table[i][lcLen] == '\0') { + targetCp = table[i + 1]; + break; + } + } + if (!targetCp) targetCp = isOem ? "IBM437" : "WINDOWS-1252"; + } + } + + iconv_t cd = iconv_open("UTF-8", targetCp); + if (cd == (iconv_t)-1) return false; + + AString sUtf8; + unsigned slen = src.Len(); + char* srcPtr = const_cast(src.Ptr()); + unsigned dlen = slen * 4 + 1; + char* dstPtr = sUtf8.GetBuf_SetEnd(dlen); + const char* dstStart = dstPtr; + memset(dstPtr, 0, dlen); + size_t slen_st = slen; + size_t dlen_st = dlen; + size_t done = iconv(cd, &srcPtr, &slen_st, &dstPtr, &dlen_st); + iconv_close(cd); + + if (done == (size_t)-1) return false; + *dstPtr = '\0'; + sUtf8.ReleaseBuf_SetEnd((unsigned)(dstPtr - dstStart)); + return ConvertUTF8ToUnicode(sUtf8, dest); +} +#endif static const char k_DefultChar = '_'; #ifdef _WIN32 diff --git a/CPP/Common/StringConvert.h b/CPP/Common/StringConvert.h index 2092a2d..5d5a2c4 100644 --- a/CPP/Common/StringConvert.h +++ b/CPP/Common/StringConvert.h @@ -16,6 +16,9 @@ void UnicodeStringToMultiByte2(AString &dest, const UString &src, UINT codePage) AString UnicodeStringToMultiByte(const UString &src, UINT codePage, char defaultChar, bool &defaultCharWasUsed); AString UnicodeStringToMultiByte(const UString &src, UINT codePage = CP_ACP); +#ifndef _WIN32 +bool UnixConvertLegacyToUnicode(const AString &src, UString &dest, bool isOem, bool useSpecifiedCodePage, UINT codePage); +#endif inline const wchar_t* GetUnicodeString(const wchar_t *u) { return u; } inline const UString& GetUnicodeString(const UString &u) { return u; }