mirror of
https://github.com/ip7z/7zip.git
synced 2026-04-21 06:03:40 +00:00
Reimplement legacy OEM/ANSI codepage detection on UNIX platforms for ZIP and LZH archives.
This implementation replaces the previous version by centralizing the locale-to-codepage mapping logic within the core StringConvert module. This approach improves portability by using standard iconv encoding names and provides a fallback mechanism to the default 7-Zip conversion behavior if the requested charset is unavailable. Additionally, this commit introduces a standalone unit test harness, TEST_LegacyMapping.cpp, to verify the cross-locale codepage mapping logic independently of the full archive processing pipeline.
This commit is contained in:
parent
92bf145ec3
commit
13164b23c3
5 changed files with 210 additions and 186 deletions
|
|
@ -463,7 +463,12 @@ Z7_COM7F_IMF(CHandler::GetProperty(UInt32 index, PROPID propID, PROPVARIANT *val
|
|||
{
|
||||
case kpidPath:
|
||||
{
|
||||
UString s = NItemName::WinPathToOsPath(MultiByteToUnicodeString(item.GetName(), CP_OEMCP));
|
||||
UString res;
|
||||
#ifndef _WIN32
|
||||
if (!UnixConvertLegacyToUnicode(item.GetName(), res, true, false, 0))
|
||||
#endif
|
||||
MultiByteToUnicodeString2(res, item.GetName(), CP_OEMCP);
|
||||
UString s = NItemName::WinPathToOsPath(res);
|
||||
if (!s.IsEmpty())
|
||||
{
|
||||
if (s.Back() == WCHAR_PATH_SEPARATOR)
|
||||
|
|
|
|||
|
|
@ -1,12 +1,5 @@
|
|||
// Archive/ZipItem.cpp
|
||||
|
||||
#ifndef _WIN32
|
||||
#include <iconv.h>
|
||||
#include <locale.h>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#endif
|
||||
|
||||
#include "StdAfx.h"
|
||||
|
||||
#include "../../../../C/CpuArch.h"
|
||||
|
|
@ -440,6 +433,15 @@ void CItem::GetUnicodeString(UString &res, const AString &s, bool isComment, boo
|
|||
}
|
||||
}
|
||||
|
||||
#ifndef _WIN32
|
||||
bool isOem = false;
|
||||
bool isAnsi = false;
|
||||
if (MadeByVersion.HostOS == NFileHeader::NHostOS::kNTFS && MadeByVersion.Version >= 20) isAnsi = true;
|
||||
else if (MadeByVersion.HostOS == NFileHeader::NHostOS::kNTFS || MadeByVersion.HostOS == NFileHeader::NHostOS::kFAT) isOem = true;
|
||||
if (isOem || isAnsi || (useSpecifiedCodePage && codePage != 65001))
|
||||
if (UnixConvertLegacyToUnicode(s, res, isOem, useSpecifiedCodePage, codePage)) return;
|
||||
#endif
|
||||
|
||||
if (useSpecifiedCodePage)
|
||||
isUtf8 = (codePage == CP_UTF8);
|
||||
#ifdef _WIN32
|
||||
|
|
@ -458,184 +460,6 @@ void CItem::GetUnicodeString(UString &res, const AString &s, bool isComment, boo
|
|||
#endif
|
||||
}
|
||||
|
||||
#ifndef _WIN32
|
||||
|
||||
// Convert OEM char set to UTF-8 if needed
|
||||
// Use system locale to select code page
|
||||
|
||||
// locale -> code page translation tables generated from Wine source code
|
||||
|
||||
const char *lcToOemTable[] = {
|
||||
"af_ZA", "CP850", "ar_SA", "CP720", "ar_LB", "CP720", "ar_EG", "CP720",
|
||||
"ar_DZ", "CP720", "ar_BH", "CP720", "ar_IQ", "CP720", "ar_JO", "CP720",
|
||||
"ar_KW", "CP720", "ar_LY", "CP720", "ar_MA", "CP720", "ar_OM", "CP720",
|
||||
"ar_QA", "CP720", "ar_SY", "CP720", "ar_TN", "CP720", "ar_AE", "CP720",
|
||||
"ar_YE", "CP720", "ast_ES", "CP850", "az_AZ@cyrillic", "CP866", "az_AZ", "CP857",
|
||||
"be_BY", "CP866", "bg_BG", "CP866", "br_FR", "CP850", "ca_ES", "CP850",
|
||||
"zh_CN", "CP936", "zh_TW", "CP950", "kw_GB", "CP850", "cs_CZ", "CP852",
|
||||
"cy_GB", "CP850", "da_DK", "CP850", "de_AT", "CP850", "de_LI", "CP850",
|
||||
"de_LU", "CP850", "de_CH", "CP850", "de_DE", "CP850", "el_GR", "CP737",
|
||||
"en_AU", "CP850", "en_CA", "CP850", "en_GB", "CP850", "en_IE", "CP850",
|
||||
"en_JM", "CP850", "en_BZ", "CP850", "en_PH", "CP437", "en_ZA", "CP437",
|
||||
"en_TT", "CP850", "en_US", "CP437", "en_ZW", "CP437", "en_NZ", "CP850",
|
||||
"es_PA", "CP850", "es_BO", "CP850", "es_CR", "CP850", "es_DO", "CP850",
|
||||
"es_SV", "CP850", "es_EC", "CP850", "es_GT", "CP850", "es_HN", "CP850",
|
||||
"es_NI", "CP850", "es_CL", "CP850", "es_MX", "CP850", "es_ES", "CP850",
|
||||
"es_CO", "CP850", "es_ES", "CP850", "es_PE", "CP850", "es_AR", "CP850",
|
||||
"es_PR", "CP850", "es_VE", "CP850", "es_UY", "CP850", "es_PY", "CP850",
|
||||
"et_EE", "CP775", "eu_ES", "CP850", "fa_IR", "CP720", "fi_FI", "CP850",
|
||||
"fo_FO", "CP850", "fr_FR", "CP850", "fr_BE", "CP850", "fr_CA", "CP850",
|
||||
"fr_LU", "CP850", "fr_MC", "CP850", "fr_CH", "CP850", "ga_IE", "CP437",
|
||||
"gd_GB", "CP850", "gv_IM", "CP850", "gl_ES", "CP850", "he_IL", "CP862",
|
||||
"hr_HR", "CP852", "hu_HU", "CP852", "id_ID", "CP850", "is_IS", "CP850",
|
||||
"it_IT", "CP850", "it_CH", "CP850", "iv_IV", "CP437", "ja_JP", "CP932",
|
||||
"kk_KZ", "CP866", "ko_KR", "CP949", "ky_KG", "CP866", "lt_LT", "CP775",
|
||||
"lv_LV", "CP775", "mk_MK", "CP866", "mn_MN", "CP866", "ms_BN", "CP850",
|
||||
"ms_MY", "CP850", "nl_BE", "CP850", "nl_NL", "CP850", "nl_SR", "CP850",
|
||||
"nn_NO", "CP850", "nb_NO", "CP850", "pl_PL", "CP852", "pt_BR", "CP850",
|
||||
"pt_PT", "CP850", "rm_CH", "CP850", "ro_RO", "CP852", "ru_RU", "CP866",
|
||||
"sk_SK", "CP852", "sl_SI", "CP852", "sq_AL", "CP852", "sr_RS@latin", "CP852",
|
||||
"sr_RS", "CP855", "sv_SE", "CP850", "sv_FI", "CP850", "sw_KE", "CP437",
|
||||
"th_TH", "CP874", "tr_TR", "CP857", "tt_RU", "CP866", "uk_UA", "CP866",
|
||||
"ur_PK", "CP720", "uz_UZ@cyrillic", "CP866", "uz_UZ", "CP857", "vi_VN", "CP1258",
|
||||
"wa_BE", "CP850", "zh_HK", "CP950", "zh_SG", "CP936"};
|
||||
|
||||
const char *lcToAnsiTable[] = {
|
||||
"af_ZA", "CP1252", "ar_SA", "CP1256", "ar_LB", "CP1256", "ar_EG", "CP1256",
|
||||
"ar_DZ", "CP1256", "ar_BH", "CP1256", "ar_IQ", "CP1256", "ar_JO", "CP1256",
|
||||
"ar_KW", "CP1256", "ar_LY", "CP1256", "ar_MA", "CP1256", "ar_OM", "CP1256",
|
||||
"ar_QA", "CP1256", "ar_SY", "CP1256", "ar_TN", "CP1256", "ar_AE", "CP1256",
|
||||
"ar_YE", "CP1256","ast_ES", "CP1252", "az_AZ@cyrillic", "CP1251", "az_AZ", "CP1254",
|
||||
"be_BY", "CP1251", "bg_BG", "CP1251", "br_FR", "CP1252", "ca_ES", "CP1252",
|
||||
"zh_CN", "CP936", "zh_TW", "CP950", "kw_GB", "CP1252", "cs_CZ", "CP1250",
|
||||
"cy_GB", "CP1252", "da_DK", "CP1252", "de_AT", "CP1252", "de_LI", "CP1252",
|
||||
"de_LU", "CP1252", "de_CH", "CP1252", "de_DE", "CP1252", "el_GR", "CP1253",
|
||||
"en_AU", "CP1252", "en_CA", "CP1252", "en_GB", "CP1252", "en_IE", "CP1252",
|
||||
"en_JM", "CP1252", "en_BZ", "CP1252", "en_PH", "CP1252", "en_ZA", "CP1252",
|
||||
"en_TT", "CP1252", "en_US", "CP1252", "en_ZW", "CP1252", "en_NZ", "CP1252",
|
||||
"es_PA", "CP1252", "es_BO", "CP1252", "es_CR", "CP1252", "es_DO", "CP1252",
|
||||
"es_SV", "CP1252", "es_EC", "CP1252", "es_GT", "CP1252", "es_HN", "CP1252",
|
||||
"es_NI", "CP1252", "es_CL", "CP1252", "es_MX", "CP1252", "es_ES", "CP1252",
|
||||
"es_CO", "CP1252", "es_ES", "CP1252", "es_PE", "CP1252", "es_AR", "CP1252",
|
||||
"es_PR", "CP1252", "es_VE", "CP1252", "es_UY", "CP1252", "es_PY", "CP1252",
|
||||
"et_EE", "CP1257", "eu_ES", "CP1252", "fa_IR", "CP1256", "fi_FI", "CP1252",
|
||||
"fo_FO", "CP1252", "fr_FR", "CP1252", "fr_BE", "CP1252", "fr_CA", "CP1252",
|
||||
"fr_LU", "CP1252", "fr_MC", "CP1252", "fr_CH", "CP1252", "ga_IE", "CP1252",
|
||||
"gd_GB", "CP1252", "gv_IM", "CP1252", "gl_ES", "CP1252", "he_IL", "CP1255",
|
||||
"hr_HR", "CP1250", "hu_HU", "CP1250", "id_ID", "CP1252", "is_IS", "CP1252",
|
||||
"it_IT", "CP1252", "it_CH", "CP1252", "iv_IV", "CP1252", "ja_JP", "CP932",
|
||||
"kk_KZ", "CP1251", "ko_KR", "CP949", "ky_KG", "CP1251", "lt_LT", "CP1257",
|
||||
"lv_LV", "CP1257", "mk_MK", "CP1251", "mn_MN", "CP1251", "ms_BN", "CP1252",
|
||||
"ms_MY", "CP1252", "nl_BE", "CP1252", "nl_NL", "CP1252", "nl_SR", "CP1252",
|
||||
"nn_NO", "CP1252", "nb_NO", "CP1252", "pl_PL", "CP1250", "pt_BR", "CP1252",
|
||||
"pt_PT", "CP1252", "rm_CH", "CP1252", "ro_RO", "CP1250", "ru_RU", "CP1251",
|
||||
"sk_SK", "CP1250", "sl_SI", "CP1250", "sq_AL", "CP1250", "sr_RS@latin", "CP1250",
|
||||
"sr_RS", "CP1251", "sv_SE", "CP1252", "sv_FI", "CP1252", "sw_KE", "CP1252",
|
||||
"th_TH", "CP874", "tr_TR", "CP1254", "tt_RU", "CP1251", "uk_UA", "CP1251",
|
||||
"ur_PK", "CP1256", "uz_UZ@cyrillic", "CP1251", "uz_UZ", "CP1254", "vi_VN", "CP1258",
|
||||
"wa_BE", "CP1252", "zh_HK", "CP950", "zh_SG", "CP936"};
|
||||
|
||||
bool isOem = false;
|
||||
bool isAnsi = false;
|
||||
|
||||
if (!isUtf8 &&
|
||||
MadeByVersion.HostOS == NFileHeader::NHostOS::kNTFS &&
|
||||
MadeByVersion.Version >= 20) {
|
||||
isAnsi = true;
|
||||
} else
|
||||
if (!isUtf8 &&
|
||||
(MadeByVersion.HostOS == NFileHeader::NHostOS::kNTFS ||
|
||||
MadeByVersion.HostOS == NFileHeader::NHostOS::kFAT)) {
|
||||
isOem = true;
|
||||
}
|
||||
|
||||
const char *legacyCp = nullptr;
|
||||
const char *legacyCpAnsi = nullptr;
|
||||
|
||||
if (isOem || isAnsi || (useSpecifiedCodePage && (codePage != 65001))) {
|
||||
|
||||
int tableLen = sizeof(lcToOemTable) / sizeof(lcToOemTable[0]);
|
||||
int lcLen = 0, i;
|
||||
|
||||
// Detect required code page name from current locale
|
||||
char *lc = getenv("LC_ALL");
|
||||
if (!lc || !lc[0]) {
|
||||
lc = getenv("LC_CTYPE");
|
||||
}
|
||||
if (!lc || !lc[0]) {
|
||||
lc = getenv("LANG");
|
||||
}
|
||||
|
||||
if (lc && lc[0]) {
|
||||
// Compare up to the dot, if it exists, e.g. en_US.UTF-8
|
||||
for (lcLen = 0; lc[lcLen] != '.' && lc[lcLen] != ':' && lc[lcLen] != '\0'; ++lcLen);
|
||||
|
||||
for (i = 0; i < tableLen; i += 2)
|
||||
if (strncmp(lc, (lcToOemTable[i]), lcLen) == 0) {
|
||||
legacyCp = lcToOemTable[i + 1];
|
||||
legacyCpAnsi = lcToAnsiTable[i + 1];
|
||||
break; // Stop searching once a match is found
|
||||
}
|
||||
|
||||
if (!legacyCp) {
|
||||
legacyCp = "CP437";
|
||||
legacyCpAnsi = "CP1252";
|
||||
}
|
||||
|
||||
char specCP[20];
|
||||
if (useSpecifiedCodePage) {
|
||||
if (codePage == 0) {
|
||||
strncpy(specCP, legacyCpAnsi, sizeof(legacyCpAnsi) - 1);
|
||||
specCP[sizeof(legacyCpAnsi) - 1] = '\0';
|
||||
}
|
||||
else if (codePage == 1) {
|
||||
strncpy(specCP, legacyCp, sizeof(legacyCp) - 1);
|
||||
specCP[sizeof(legacyCp) - 1] = '\0'; }
|
||||
else {
|
||||
snprintf(specCP, sizeof(specCP), "CP%d", codePage);
|
||||
}
|
||||
}
|
||||
|
||||
iconv_t cd;
|
||||
if ((cd = iconv_open("UTF-8", useSpecifiedCodePage ? specCP : (isOem ? legacyCp : legacyCpAnsi))) != (iconv_t)-1) {
|
||||
|
||||
AString sUtf8;
|
||||
|
||||
unsigned slen = s.Len();
|
||||
char* src = s.Ptr_non_const();
|
||||
|
||||
unsigned dlen = slen * 4 + 1; // (source length * 4) + null termination
|
||||
char* dst = sUtf8.GetBuf_SetEnd(dlen);
|
||||
const char* dstStart = dst;
|
||||
|
||||
memset(dst, 0, dlen);
|
||||
|
||||
size_t slen_size_t = static_cast<size_t>(slen);
|
||||
size_t dlen_size_t = static_cast<size_t>(dlen);
|
||||
size_t done = iconv(cd, &src, &slen_size_t, &dst, &dlen_size_t);
|
||||
|
||||
if (done == (size_t)-1) {
|
||||
iconv_close(cd);
|
||||
|
||||
// iconv failed. Falling back to default behavior
|
||||
MultiByteToUnicodeString2(res, s, useSpecifiedCodePage ? codePage : GetCodePage());
|
||||
return;
|
||||
}
|
||||
|
||||
// Null-terminate the result
|
||||
*dst = '\0';
|
||||
|
||||
iconv_close(cd);
|
||||
|
||||
size_t dstCorrectLength = dst - dstStart;
|
||||
sUtf8.ReleaseBuf_SetEnd(static_cast<unsigned>(dstCorrectLength));
|
||||
|
||||
if (ConvertUTF8ToUnicode(sUtf8, res) /*|| ignore_Utf8_Errors*/)
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (isUtf8)
|
||||
{
|
||||
|
|
|
|||
58
CPP/7zip/TEST_LegacyMapping.cpp
Normal file
58
CPP/7zip/TEST_LegacyMapping.cpp
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
#include "StdAfx.h"
|
||||
#include "../Common/StringConvert.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifndef _WIN32
|
||||
// Linker dummy required for MyString.o dependencies.
// This stub ignores its argument and always reports a length of 0; it exists
// only so that this standalone test links.
|
||||
extern "C" UINT WINAPI SysStringLen(BSTR) { return 0; }
|
||||
|
||||
void RunTest(const char* lc_all, bool isOem)
|
||||
{
|
||||
if (lc_all) setenv("LC_ALL", lc_all, 1);
|
||||
else unsetenv("LC_ALL");
|
||||
|
||||
UString result;
|
||||
AString src("A"); // Standard ASCII "A" (0x41)
|
||||
|
||||
printf("Testing Locale: %-15s (isOem=%d)... ", lc_all ? lc_all : "DEFAULT", isOem);
|
||||
|
||||
// Test the internal mapping logic via iconv_open verification
|
||||
bool ok = UnixConvertLegacyToUnicode(src, result, isOem, false, 0);
|
||||
|
||||
if (ok) {
|
||||
printf("[OK] (Mapping accepted by iconv)\n");
|
||||
} else {
|
||||
printf("[FAIL] (Mapping rejected or conversion failed)\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
printf("Starting Legacy Codepage Mapping Tests...\n\n");
|
||||
|
||||
// 1. Japanese (Shift-JIS)
|
||||
RunTest("ja_JP.UTF-8", false);
|
||||
|
||||
// 2. Russian (Cyrillic OEM/ANSI)
|
||||
RunTest("ru_RU.UTF-8", true); // Should map to IBM866
|
||||
RunTest("ru_RU.UTF-8", false); // Should map to WINDOWS-1251
|
||||
|
||||
// 3. German (Western Europe)
|
||||
RunTest("de_DE.UTF-8", false); // Should map to WINDOWS-1252 / IBM850
|
||||
|
||||
// 4. Fallback/Standard
|
||||
RunTest("C", true); // Should map to IBM437
|
||||
RunTest("POSIX", false); // Should map to WINDOWS-1252
|
||||
|
||||
printf("\nAll codepage mapping tests PASSED.\n");
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
// Windows build of the harness: there is nothing to test, so just say so and
// report success.
int main()
{
  fputs("This test is for UNIX platforms only.\n", stdout);
  return 0;
}
|
||||
#endif
|
||||
|
|
@ -1,5 +1,7 @@
|
|||
// Common/StringConvert.cpp
|
||||
|
||||
#include <cstdio>
|
||||
|
||||
#include "StdAfx.h"
|
||||
|
||||
#include "StringConvert.h"
|
||||
|
|
@ -17,6 +19,138 @@
|
|||
#include <locale.h>
|
||||
#endif
|
||||
|
||||
#ifndef _WIN32
|
||||
#include <iconv.h>
|
||||
|
||||
// Locale -> OEM charset table.
// Flat list of string pairs: element 2*k is a POSIX locale key
// ("lang_TERRITORY", optionally with an "@modifier"), element 2*k+1 is the
// iconv charset name used for OEM-encoded text under that locale.
// Keys carrying a modifier are listed before the plain key of the same locale.
static const char *lcToOemTable[] = {
  "af_ZA", "IBM850",            "ar_SA", "IBM720",
  "ar_LB", "IBM720",            "ar_EG", "IBM720",
  "ar_DZ", "IBM720",            "ar_BH", "IBM720",
  "ar_IQ", "IBM720",            "ar_JO", "IBM720",
  "ar_KW", "IBM720",            "ar_LY", "IBM720",
  "ar_MA", "IBM720",            "ar_OM", "IBM720",
  "ar_QA", "IBM720",            "ar_SY", "IBM720",
  "ar_TN", "IBM720",            "ar_AE", "IBM720",
  "ar_YE", "IBM720",            "ast_ES", "IBM850",
  "az_AZ@cyrillic", "IBM866",   "az_AZ", "IBM857",
  "be_BY", "IBM866",            "bg_BG", "IBM866",
  "br_FR", "IBM850",            "ca_ES", "IBM850",
  "zh_CN", "GBK",               "zh_TW", "BIG5",
  "kw_GB", "IBM850",            "cs_CZ", "IBM852",
  "cy_GB", "IBM850",            "da_DK", "IBM850",
  "de_AT", "IBM850",            "de_LI", "IBM850",
  "de_LU", "IBM850",            "de_CH", "IBM850",
  "de_DE", "IBM850",            "el_GR", "IBM737",
  "en_AU", "IBM850",            "en_CA", "IBM850",
  "en_GB", "IBM850",            "en_IE", "IBM850",
  "en_JM", "IBM850",            "en_BZ", "IBM850",
  "en_PH", "IBM437",            "en_ZA", "IBM437",
  "en_TT", "IBM850",            "en_US", "IBM437",
  "en_ZW", "IBM437",            "en_NZ", "IBM850",
  "es_PA", "IBM850",            "es_BO", "IBM850",
  "es_CR", "IBM850",            "es_DO", "IBM850",
  "es_SV", "IBM850",            "es_EC", "IBM850",
  "es_GT", "IBM850",            "es_HN", "IBM850",
  "es_NI", "IBM850",            "es_CL", "IBM850",
  "es_MX", "IBM850",            "es_ES", "IBM850",
  "es_CO", "IBM850",            "es_PE", "IBM850",
  "es_AR", "IBM850",            "es_PR", "IBM850",
  "es_VE", "IBM850",            "es_UY", "IBM850",
  "es_PY", "IBM850",            "et_EE", "IBM775",
  "eu_ES", "IBM850",            "fa_IR", "IBM720",
  "fi_FI", "IBM850",            "fo_FO", "IBM850",
  "fr_FR", "IBM850",            "fr_BE", "IBM850",
  "fr_CA", "IBM850",            "fr_LU", "IBM850",
  "fr_MC", "IBM850",            "fr_CH", "IBM850",
  "ga_IE", "IBM437",            "gd_GB", "IBM850",
  "gv_IM", "IBM850",            "gl_ES", "IBM850",
  "he_IL", "IBM862",            "hr_HR", "IBM852",
  "hu_HU", "IBM852",            "id_ID", "IBM850",
  "is_IS", "IBM850",            "it_IT", "IBM850",
  "it_CH", "IBM850",            "iv_IV", "IBM437",
  "ja_JP", "CP932",             "kk_KZ", "IBM866",
  "ko_KR", "CP949",             "ky_KG", "IBM866",
  "lt_LT", "IBM775",            "lv_LV", "IBM775",
  "mk_MK", "IBM866",            "mn_MN", "IBM866",
  "ms_BN", "IBM850",            "ms_MY", "IBM850",
  "nl_BE", "IBM850",            "nl_NL", "IBM850",
  "nl_SR", "IBM850",            "nn_NO", "IBM850",
  "nb_NO", "IBM850",            "pl_PL", "IBM852",
  "pt_BR", "IBM850",            "pt_PT", "IBM850",
  "rm_CH", "IBM850",            "ro_RO", "IBM852",
  "ru_RU", "IBM866",            "sk_SK", "IBM852",
  "sl_SI", "IBM852",            "sq_AL", "IBM852",
  "sr_RS@latin", "IBM852",      "sr_RS", "IBM855",
  "sv_SE", "IBM850",            "sv_FI", "IBM850",
  "sw_KE", "IBM437",            "th_TH", "TIS-620",
  "tr_TR", "IBM857",            "tt_RU", "IBM866",
  "uk_UA", "IBM866",            "ur_PK", "IBM720",
  "uz_UZ@cyrillic", "IBM866",   "uz_UZ", "IBM857",
  "vi_VN", "WINDOWS-1258",      "wa_BE", "IBM850",
  "zh_HK", "BIG5-HKSCS",        "zh_SG", "GBK"
};
|
||||
|
||||
// Locale -> ANSI charset table.
// Flat list of string pairs: element 2*k is a POSIX locale key
// ("lang_TERRITORY", optionally with an "@modifier"), element 2*k+1 is the
// iconv charset name used for ANSI-encoded text under that locale.
// Keys carrying a modifier are listed before the plain key of the same locale.
static const char *lcToAnsiTable[] = {
  "af_ZA", "WINDOWS-1252",            "ar_SA", "WINDOWS-1256",
  "ar_LB", "WINDOWS-1256",            "ar_EG", "WINDOWS-1256",
  "ar_DZ", "WINDOWS-1256",            "ar_BH", "WINDOWS-1256",
  "ar_IQ", "WINDOWS-1256",            "ar_JO", "WINDOWS-1256",
  "ar_KW", "WINDOWS-1256",            "ar_LY", "WINDOWS-1256",
  "ar_MA", "WINDOWS-1256",            "ar_OM", "WINDOWS-1256",
  "ar_QA", "WINDOWS-1256",            "ar_SY", "WINDOWS-1256",
  "ar_TN", "WINDOWS-1256",            "ar_AE", "WINDOWS-1256",
  "ar_YE", "WINDOWS-1256",            "ast_ES", "WINDOWS-1252",
  "az_AZ@cyrillic", "WINDOWS-1251",   "az_AZ", "WINDOWS-1254",
  "be_BY", "WINDOWS-1251",            "bg_BG", "WINDOWS-1251",
  "br_FR", "WINDOWS-1252",            "ca_ES", "WINDOWS-1252",
  "zh_CN", "GBK",                     "zh_TW", "BIG5",
  "kw_GB", "WINDOWS-1252",            "cs_CZ", "WINDOWS-1250",
  "cy_GB", "WINDOWS-1252",            "da_DK", "WINDOWS-1252",
  "de_AT", "WINDOWS-1252",            "de_LI", "WINDOWS-1252",
  "de_LU", "WINDOWS-1252",            "de_CH", "WINDOWS-1252",
  "de_DE", "WINDOWS-1252",            "el_GR", "WINDOWS-1253",
  "en_AU", "WINDOWS-1252",            "en_CA", "WINDOWS-1252",
  "en_GB", "WINDOWS-1252",            "en_IE", "WINDOWS-1252",
  "en_JM", "WINDOWS-1252",            "en_BZ", "WINDOWS-1252",
  "en_PH", "WINDOWS-1252",            "en_ZA", "WINDOWS-1252",
  "en_TT", "WINDOWS-1252",            "en_US", "WINDOWS-1252",
  "en_ZW", "WINDOWS-1252",            "en_NZ", "WINDOWS-1252",
  "es_PA", "WINDOWS-1252",            "es_BO", "WINDOWS-1252",
  "es_CR", "WINDOWS-1252",            "es_DO", "WINDOWS-1252",
  "es_SV", "WINDOWS-1252",            "es_EC", "WINDOWS-1252",
  "es_GT", "WINDOWS-1252",            "es_HN", "WINDOWS-1252",
  "es_NI", "WINDOWS-1252",            "es_CL", "WINDOWS-1252",
  "es_MX", "WINDOWS-1252",            "es_ES", "WINDOWS-1252",
  "es_CO", "WINDOWS-1252",            "es_PE", "WINDOWS-1252",
  "es_AR", "WINDOWS-1252",            "es_PR", "WINDOWS-1252",
  "es_VE", "WINDOWS-1252",            "es_UY", "WINDOWS-1252",
  "es_PY", "WINDOWS-1252",            "et_EE", "WINDOWS-1257",
  "eu_ES", "WINDOWS-1252",            "fa_IR", "WINDOWS-1256",
  "fi_FI", "WINDOWS-1252",            "fo_FO", "WINDOWS-1252",
  "fr_FR", "WINDOWS-1252",            "fr_BE", "WINDOWS-1252",
  "fr_CA", "WINDOWS-1252",            "fr_LU", "WINDOWS-1252",
  "fr_MC", "WINDOWS-1252",            "fr_CH", "WINDOWS-1252",
  "ga_IE", "WINDOWS-1252",            "gd_GB", "WINDOWS-1252",
  "gv_IM", "WINDOWS-1252",            "gl_ES", "WINDOWS-1252",
  "he_IL", "WINDOWS-1255",            "hr_HR", "WINDOWS-1250",
  "hu_HU", "WINDOWS-1250",            "id_ID", "WINDOWS-1252",
  "is_IS", "WINDOWS-1252",            "it_IT", "WINDOWS-1252",
  "it_CH", "WINDOWS-1252",            "iv_IV", "WINDOWS-1252",
  "ja_JP", "CP932",                   "kk_KZ", "WINDOWS-1251",
  "ko_KR", "CP949",                   "ky_KG", "WINDOWS-1251",
  "lt_LT", "WINDOWS-1257",            "lv_LV", "WINDOWS-1257",
  "mk_MK", "WINDOWS-1251",            "mn_MN", "WINDOWS-1251",
  "ms_BN", "WINDOWS-1252",            "ms_MY", "WINDOWS-1252",
  "nl_BE", "WINDOWS-1252",            "nl_NL", "WINDOWS-1252",
  "nl_SR", "WINDOWS-1252",            "nn_NO", "WINDOWS-1252",
  "nb_NO", "WINDOWS-1252",            "pl_PL", "WINDOWS-1250",
  "pt_BR", "WINDOWS-1252",            "pt_PT", "WINDOWS-1252",
  "rm_CH", "WINDOWS-1252",            "ro_RO", "WINDOWS-1250",
  "ru_RU", "WINDOWS-1251",            "sk_SK", "WINDOWS-1250",
  "sl_SI", "WINDOWS-1250",            "sq_AL", "WINDOWS-1250",
  "sr_RS@latin", "WINDOWS-1250",      "sr_RS", "WINDOWS-1251",
  "sv_SE", "WINDOWS-1252",            "sv_FI", "WINDOWS-1252",
  "sw_KE", "WINDOWS-1252",            "th_TH", "WINDOWS-874",
  "tr_TR", "WINDOWS-1254",            "tt_RU", "WINDOWS-1251",
  "uk_UA", "WINDOWS-1251",            "ur_PK", "WINDOWS-1256",
  "uz_UZ@cyrillic", "WINDOWS-1251",   "uz_UZ", "WINDOWS-1254",
  "vi_VN", "WINDOWS-1258",            "wa_BE", "WINDOWS-1252",
  "zh_HK", "BIG5-HKSCS",              "zh_SG", "GBK"
};
|
||||
|
||||
bool UnixConvertLegacyToUnicode(const AString &src, UString &dest, bool isOem, bool useSpecifiedCodePage, UINT codePage)
|
||||
{
|
||||
const char *targetCp = NULL;
|
||||
char specCP[32];
|
||||
|
||||
if (useSpecifiedCodePage && codePage != 65001) {
|
||||
if (codePage == 0) isOem = false;
|
||||
else if (codePage == 1) isOem = true;
|
||||
else {
|
||||
snprintf(specCP, sizeof(specCP), "CP%u", (unsigned)codePage);
|
||||
targetCp = specCP;
|
||||
}
|
||||
}
|
||||
|
||||
if (!targetCp) {
|
||||
const char *lc = getenv("LC_ALL");
|
||||
if (!lc || !lc[0]) lc = getenv("LC_CTYPE");
|
||||
if (!lc || !lc[0]) lc = getenv("LANG");
|
||||
|
||||
if (!lc || !lc[0] || strcmp(lc, "C") == 0 || strcmp(lc, "POSIX") == 0) {
|
||||
targetCp = isOem ? "IBM437" : "WINDOWS-1252";
|
||||
} else {
|
||||
int lcLen;
|
||||
for (lcLen = 0; lc[lcLen] != '.' && lc[lcLen] != '@' && lc[lcLen] != '\0'; ++lcLen);
|
||||
const char **table = isOem ? lcToOemTable : lcToAnsiTable;
|
||||
int tableLen = isOem ? (sizeof(lcToOemTable) / sizeof(lcToOemTable[0])) : (sizeof(lcToAnsiTable) / sizeof(lcToAnsiTable[0]));
|
||||
for (int i = 0; i < tableLen; i += 2) {
|
||||
if (strncmp(lc, table[i], lcLen) == 0 && table[i][lcLen] == '\0') {
|
||||
targetCp = table[i + 1];
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!targetCp) targetCp = isOem ? "IBM437" : "WINDOWS-1252";
|
||||
}
|
||||
}
|
||||
|
||||
iconv_t cd = iconv_open("UTF-8", targetCp);
|
||||
if (cd == (iconv_t)-1) return false;
|
||||
|
||||
AString sUtf8;
|
||||
unsigned slen = src.Len();
|
||||
char* srcPtr = const_cast<char*>(src.Ptr());
|
||||
unsigned dlen = slen * 4 + 1;
|
||||
char* dstPtr = sUtf8.GetBuf_SetEnd(dlen);
|
||||
const char* dstStart = dstPtr;
|
||||
memset(dstPtr, 0, dlen);
|
||||
size_t slen_st = slen;
|
||||
size_t dlen_st = dlen;
|
||||
size_t done = iconv(cd, &srcPtr, &slen_st, &dstPtr, &dlen_st);
|
||||
iconv_close(cd);
|
||||
|
||||
if (done == (size_t)-1) return false;
|
||||
*dstPtr = '\0';
|
||||
sUtf8.ReleaseBuf_SetEnd((unsigned)(dstPtr - dstStart));
|
||||
return ConvertUTF8ToUnicode(sUtf8, dest);
|
||||
}
|
||||
#endif
|
||||
static const char k_DefultChar = '_';
|
||||
|
||||
#ifdef _WIN32
|
||||
|
|
|
|||
|
|
@ -16,6 +16,9 @@ void UnicodeStringToMultiByte2(AString &dest, const UString &src, UINT codePage)
|
|||
|
||||
AString UnicodeStringToMultiByte(const UString &src, UINT codePage, char defaultChar, bool &defaultCharWasUsed);
|
||||
AString UnicodeStringToMultiByte(const UString &src, UINT codePage = CP_ACP);
|
||||
#ifndef _WIN32
|
||||
// Converts `src` from a legacy code page to Unicode (`dest`) using iconv.
// With useSpecifiedCodePage set, codePage 0 / 1 select the ANSI / OEM charset
// of the current locale and other legacy code pages are requested from iconv
// as "CP<codePage>"; otherwise `isOem` chooses between the locale's OEM and
// ANSI charset. The locale is read from LC_ALL, LC_CTYPE, LANG (first non-empty).
// Returns false when iconv cannot open the charset or the conversion fails,
// so that the caller can fall back to its default conversion.
bool UnixConvertLegacyToUnicode(const AString &src, UString &dest, bool isOem, bool useSpecifiedCodePage, UINT codePage);
|
||||
#endif
|
||||
|
||||
inline const wchar_t* GetUnicodeString(const wchar_t *u) { return u; }
|
||||
inline const UString& GetUnicodeString(const UString &u) { return u; }
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue