/* This file is part of the Amalthea library.
 *
 * Copyright (C) 2020-2021 Eugene 'Vindex' Stulin
 *
 * Distributed under the Boost Software License 1.0 or (at your option)
 * the GNU Lesser General Public License 3.0 or later.
 */

module amalthea.encoding;

public import amalthea.libcore;
import std.algorithm, std.encoding, std.range, std.string;

// Registers the KOI8 schemes of this module in the EncodingScheme registry.
shared static this() {
    EncodingScheme.register!EncodingSchemeKOI8R;
    EncodingScheme.register!EncodingSchemeKOI8U;
}

/// Defines a KOI8-R character and string.
enum KOI8RChar : ubyte { init }
alias KOI8RString = immutable(KOI8RChar)[];

/// Defines a KOI8-U character and string.
enum KOI8UChar : ubyte { init }
alias KOI8UString = immutable(KOI8UChar)[];

/** American Standard Code for Information Interchange: code points 0x00 .. 0x7F. */
immutable dstring asciiTable = iota!dchar('\x00', '\x80').array;


/*******************************************************************************
 * Abstract base class for all ASCII based encoding schemes of Amalthea.
 *
 * Every character occupies exactly one byte. The EncodingScheme methods
 * implemented here do not touch conversion tables directly: they go through
 * the overridable hooks lookupFromUTF32/lookupToUTF32, so that a descendant
 * using ConversionInjection is served by its own tables.
 */
abstract class AsciiBasedEncoding : std.encoding.EncodingScheme {
    // Tables of the base class itself; they are consulted only by the default
    // hook implementations below, i.e. by descendants without the mixin.
    immutable static dchar[ubyte] conversionTableToUTF32;
    immutable static ubyte[dchar] conversionTableFromUTF32;

    const override {
        /// Returns true if the character has a code unit in this encoding.
        bool canEncode(dchar c) @safe pure nothrow @nogc {
            ubyte ignored;
            return lookupFromUTF32(c, ignored);
        }

        /// Returns the number of bytes needed to encode the character (always 1).
        size_t encodedLength(dchar c)
        in {
            assert(canEncode(c));
        } do {
            return 1;
        }

        /// Writes the code unit of `c` to buffer[0] and returns 1.
        size_t encode(dchar c, ubyte[] buffer)
        in {
            assert(canEncode(c));
            assert(buffer.length > 0);
        } do {
            ubyte codeUnit;
            const known = lookupFromUTF32(c, codeUnit);
            assert(known, "The character is not representable in this encoding");
            buffer[0] = codeUnit;
            return 1;
        }

        /// Decodes the first byte of `s` and removes it from the slice.
        dchar decode(ref const(ubyte)[] s)
        in {
            assert(s.length > 0);
        } do {
            dchar decodedSymbol;
            const known = lookupToUTF32(s[0], decodedSymbol);
            assert(known, "The byte has no mapping in this encoding");
            s = s[1 .. $];
            return decodedSymbol;
        }

        /**
         * Like decode, but returns INVALID_SEQUENCE for a byte without
         * mapping. The byte is removed from the slice in both cases.
         */
        dchar safeDecode(ref const(ubyte)[] s)
        in {
            assert(s.length > 0);
        } do {
            dchar decodedSymbol;
            if (!lookupToUTF32(s[0], decodedSymbol)) {
                decodedSymbol = INVALID_SEQUENCE;
            }
            s = s[1 .. $];
            return decodedSymbol;
        }

        /// Byte sequence used in place of characters that cannot be encoded.
        @property immutable(ubyte)[] replacementSequence() {
            return cast(immutable(ubyte)[])"?";
        }

    }

    /// Returns true if the byte has a UTF-32 mapping in this encoding.
    const bool canDecode(ubyte c) {
        dchar ignored;
        return lookupToUTF32(c, ignored);
    }

    /**
     * Looks up the code unit that encodes `c`.
     * Returns false (leaving `codeUnit` zero) if there is no mapping.
     * ConversionInjection overrides this hook with the tables of the
     * concrete scheme.
     */
    protected bool lookupFromUTF32(dchar c, out ubyte codeUnit)
    const @safe pure nothrow @nogc {
        if (auto found = c in conversionTableFromUTF32) {
            codeUnit = *found;
            return true;
        }
        return false;
    }

    /**
     * Looks up the UTF-32 character for the byte `codeUnit`.
     * Returns false if there is no mapping.
     * ConversionInjection overrides this hook with the tables of the
     * concrete scheme.
     */
    protected bool lookupToUTF32(ubyte codeUnit, out dchar c)
    const @safe pure nothrow @nogc {
        if (auto found = codeUnit in conversionTableToUTF32) {
            c = *found;
            return true;
        }
        return false;
    }

    // must be implemented in descendants
    protected immutable dstring specTable;
}


/*******************************************************************************
 * Template for use by all descendants of AsciiBasedEncoding.
 * This template forms conversion tables for current encoding and UTF-32
 * (ASCII for 0x00 .. 0x7F followed by the specTable of the class that mixes
 * it in) and overrides the lookup hooks of AsciiBasedEncoding to use them.
 */
mixin template ConversionInjection(CodeUnitType) {
    immutable static dstring table = asciiTable ~ specTable;
    immutable static dchar[CodeUnitType] conversionTableToUTF32;
    immutable static CodeUnitType[dchar] conversionTableFromUTF32;

    shared static this() {
        for (size_t i = 0; i <= 0xFF; i++) {
            // Use the table of the scheme this template is mixed into
            // (previously the KOI8-R table was used for every scheme).
            dchar ch = table[i];
            this.conversionTableToUTF32[cast(CodeUnitType)i] = ch;
            this.conversionTableFromUTF32[ch] = cast(CodeUnitType)i;
        }
    }

    protected override bool lookupFromUTF32(dchar c, out ubyte codeUnit)
    const @safe pure nothrow @nogc {
        if (auto found = c in conversionTableFromUTF32) {
            codeUnit = *found;
            return true;
        }
        return false;
    }

    protected override bool lookupToUTF32(ubyte codeUnit, out dchar c)
    const @safe pure nothrow @nogc {
        if (auto found = cast(CodeUnitType)codeUnit in conversionTableToUTF32) {
            c = *found;
            return true;
        }
        return false;
    }
}


/*******************************************************************************
 * EncodingScheme to handle KOI8-R.
*/
class EncodingSchemeKOI8R : AsciiBasedEncoding {

    // Code points for bytes 0x80 .. 0xFF, eight per row.
    private immutable dstring specTable =
        "\u2500\u2502\u250c\u2510\u2514\u2518\u251c\u2524" ~  // 0x80 .. 0x87
        "\u252c\u2534\u253c\u2580\u2584\u2588\u258c\u2590" ~  // 0x88 .. 0x8F
        "\u2591\u2592\u2593\u2320\u25a0\u2219\u221a\u2248" ~  // 0x90 .. 0x97
        "\u2264\u2265\u00a0\u2321\u00b0\u00b2\u00b7\u00f7" ~  // 0x98 .. 0x9F
        "\u2550\u2551\u2552\u0451\u2553\u2554\u2555\u2556" ~  // 0xA0 .. 0xA7
        "\u2557\u2558\u2559\u255a\u255b\u255c\u255d\u255e" ~  // 0xA8 .. 0xAF
        "\u255f\u2560\u2561\u0401\u2562\u2563\u2564\u2565" ~  // 0xB0 .. 0xB7
        "\u2566\u2567\u2568\u2569\u256a\u256b\u256c\u00a9" ~  // 0xB8 .. 0xBF
        "\u044e\u0430\u0431\u0446\u0434\u0435\u0444\u0433" ~  // 0xC0 .. 0xC7
        "\u0445\u0438\u0439\u043a\u043b\u043c\u043d\u043e" ~  // 0xC8 .. 0xCF
        "\u043f\u044f\u0440\u0441\u0442\u0443\u0436\u0432" ~  // 0xD0 .. 0xD7
        "\u044c\u044b\u0437\u0448\u044d\u0449\u0447\u044a" ~  // 0xD8 .. 0xDF
        "\u042e\u0410\u0411\u0426\u0414\u0415\u0424\u0413" ~  // 0xE0 .. 0xE7
        "\u0425\u0418\u0419\u041a\u041b\u041c\u041d\u041e" ~  // 0xE8 .. 0xEF
        "\u041f\u042f\u0420\u0421\u0422\u0423\u0416\u0412" ~  // 0xF0 .. 0xF7
        "\u042c\u042b\u0417\u0428\u042d\u0429\u0427\u042a";   // 0xF8 .. 0xFF

    // Builds the conversion tables from asciiTable and specTable.
    mixin ConversionInjection!KOI8RChar;

    /// Names under which this scheme is registered.
    override string[] names() const @safe pure nothrow {
        string[] aliases = ["cskoi8r", "koi", "koi8", "koi8-r", "koi8_r"];
        return aliases;
    }

    /// Canonical name of the encoding.
    override string toString() const {
        return "KOI8-R";
    }
}


/*******************************************************************************
 * EncodingScheme to handle KOI8-U.
*/
class EncodingSchemeKOI8U : AsciiBasedEncoding {

    // Code points for bytes 0x80 .. 0xFF, eight per row.
    // KOI8-U differs from KOI8-R in eight positions: 0xA4, 0xA6, 0xA7, 0xAD
    // (lower-case Ukrainian letters) and 0xB4, 0xB6, 0xB7, 0xBD (upper-case).
    private immutable dstring specTable =
        "\u2500\u2502\u250c\u2510\u2514\u2518\u251c\u2524" ~  // 0x80 .. 0x87
        "\u252c\u2534\u253c\u2580\u2584\u2588\u258c\u2590" ~  // 0x88 .. 0x8F
        "\u2591\u2592\u2593\u2320\u25a0\u2219\u221a\u2248" ~  // 0x90 .. 0x97
        "\u2264\u2265\u00a0\u2321\u00b0\u00b2\u00b7\u00f7" ~  // 0x98 .. 0x9F

        "\u2550\u2551\u2552\u0451\u0454\u2554\u0456\u0457" ~  // 0xA0 .. 0xA7
        "\u2557\u2558\u2559\u255a\u255b\u0491\u255d\u255e" ~  // 0xA8 .. 0xAF
        "\u255f\u2560\u2561\u0401\u0404\u2563\u0406\u0407" ~  // 0xB0 .. 0xB7
        // 0xBD is U+0490 and 0xBE is U+256C (these two were swapped before).
        "\u2566\u2567\u2568\u2569\u256a\u0490\u256c\u00a9" ~  // 0xB8 .. 0xBF

        "\u044e\u0430\u0431\u0446\u0434\u0435\u0444\u0433" ~  // 0xC0 .. 0xC7
        "\u0445\u0438\u0439\u043a\u043b\u043c\u043d\u043e" ~  // 0xC8 .. 0xCF
        "\u043f\u044f\u0440\u0441\u0442\u0443\u0436\u0432" ~  // 0xD0 .. 0xD7
        "\u044c\u044b\u0437\u0448\u044d\u0449\u0447\u044a" ~  // 0xD8 .. 0xDF
        "\u042e\u0410\u0411\u0426\u0414\u0415\u0424\u0413" ~  // 0xE0 .. 0xE7
        "\u0425\u0418\u0419\u041a\u041b\u041c\u041d\u041e" ~  // 0xE8 .. 0xEF
        "\u041f\u042f\u0420\u0421\u0422\u0423\u0416\u0412" ~  // 0xF0 .. 0xF7
        "\u042c\u042b\u0417\u0428\u042d\u0429\u0427\u042a";   // 0xF8 .. 0xFF

    // Builds the conversion tables from asciiTable and specTable.
    mixin ConversionInjection!KOI8UChar;

    /// Names under which this scheme is registered.
    override const string[] names() @safe pure nothrow {
        return ["koi8-u", "koi8_u"];
    }

    /// Canonical name of the encoding.
    override const string toString() {
        return "KOI8-U";
    }
}


/*******************************************************************************
 * Gets encoded Amalthea ASCII based string from dstring (UTF-32).
 *
 * Params:
 *     s = The UTF32-string for transcoding.
 *     safe = If false, the input has to be valid to avoid mistakes,
 *            if true, inappropriate characters will be replaced with '?'.
*/
AsciiBasedString encodeFromUTF32(AsciiBasedString)(dstring s, bool safe = false)
if (
    is(AsciiBasedString == KOI8RString) || is(AsciiBasedString == KOI8UString)
) {
    // Select the code unit type and lookup table of the requested encoding.
    static if (is(AsciiBasedString == KOI8RString)) {
        alias Unit = KOI8RChar;
        alias Codec = EncodingSchemeKOI8R;
    } else {
        alias Unit = KOI8UChar;
        alias Codec = EncodingSchemeKOI8U;
    }

    auto encoded = new Unit[](s.length);
    foreach (index, symbol; s) {
        if (safe) {
            // Unknown characters become '?'.
            auto known = symbol in Codec.conversionTableFromUTF32;
            encoded[index] = known ? *known : cast(Unit)'?';
        } else {
            // Plain indexing: an unknown character is a range violation.
            encoded[index] = Codec.conversionTableFromUTF32[symbol];
        }
    }
    return encoded.idup;
}
///
unittest {
    dstring greeting = "Привет, мир!"d;
    KOI8RString encoded = encodeFromUTF32!KOI8RString(greeting);
    ubyte[] koi8rBytes = [
        0xf0, 0xd2, 0xc9, 0xd7, 0xc5, 0xd4, 0x2c, 0x20, 0xcd, 0xc9, 0xd2, 0x21
    ];
    assert(cast(ubyte[])encoded == koi8rBytes);
}


/*******************************************************************************
 * Encodes dstring to KOI8RString or KOI8UString, replacing every character
 * that has no code unit in the target encoding with '?'.
 *
 * Params:
 *     s = The UTF32-string for transcoding. It does not have to be valid.
 */
AsciiBasedString safeEncodeFromUTF32(AsciiBasedString)(dstring s) {
    enum replaceUnknown = true;
    return encodeFromUTF32!AsciiBasedString(s, replaceUnknown);
}
///
unittest {
    dstring unencodable = "你好,世界!"d;
    KOI8RString encoded = safeEncodeFromUTF32!KOI8RString(unencodable);
    assert(cast(ubyte[])encoded == ['?', '?', '?', '?', '?', '?']);
}


/*******************************************************************************
 * Convert a string from Amalthea ASCII based encoding to UTF-32.
 *
 * Params:
 *     source = Source string. It must be validly encoded.
 *     dest = Destination string.
*/
void transcode(AsciiBasedString)(AsciiBasedString source, out dstring dest)
if (
    is(AsciiBasedString == KOI8RString) || is(AsciiBasedString == KOI8UString)
) {
    // The constraint above rejects unsupported string types at compile time;
    // without it such calls compiled and produced only dchar.init values.
    static if (is(AsciiBasedString == KOI8RString)) {
        alias Scheme = EncodingSchemeKOI8R;
    } else {
        alias Scheme = EncodingSchemeKOI8U;
    }

    auto result = new dchar[](source.length);
    foreach (i, ch; source) {
        result[i] = Scheme.conversionTableToUTF32[ch];
    }
    dest = result.idup;
}


/*******************************************************************************
 * Convert a string from Amalthea ASCII based encoding to UTF-8.
 *
 * Params:
 *     source = Source string. It must be validly encoded.
 *     dest = Destination string.
 */
void transcode(AsciiBasedString)(AsciiBasedString source, out string dest)
if (
    is(AsciiBasedString == KOI8RString) || is(AsciiBasedString == KOI8UString)
) {
    // Go through UTF-32 first, then let std.conv re-encode to UTF-8.
    dstring result;
    transcode!AsciiBasedString(source, result);
    dest = result.to!string;
}


/*******************************************************************************
 * Convert a string from UTF-32 to Amalthea ASCII based encoding.
 *
 * Params:
 *     source = Source string. Every character must be representable
 *              in the destination encoding.
 *     dest = Destination string.
 */
void transcode(AsciiBasedString)
              (dstring source, out AsciiBasedString dest)
if (
    is(AsciiBasedString == KOI8RString) || is(AsciiBasedString == KOI8UString)
) {
    // Same strict (non-replacing) conversion as encodeFromUTF32 with safe=false.
    dest = encodeFromUTF32!AsciiBasedString(source);
}


/*******************************************************************************
 * Convert a string from UTF-8 to Amalthea ASCII based encoding.
 *
 * Params:
 *     source = Source string. It must be validly encoded.
*     dest = Destination string.
*/
void transcode(AsciiBasedString)
              (string source, out AsciiBasedString dest)
if (
    is(AsciiBasedString == KOI8RString) || is(AsciiBasedString == KOI8UString)
) {
    transcode!AsciiBasedString(source.to!dstring, dest);
}


/*******************************************************************************
 * A universal string containing string data and information about its encoding.
 */
struct UniString {
    /// Byte string representation.
    protected ubyte[] str;
    /// Name of string encoding.
    protected string encodingName;

    /// Returns a current string as a byte array (the internal buffer, not a copy).
    ubyte[] getRawData() {
        return str;
    }

    /// Returns name of current encoding.
    string getEncodingName() {
        return encodingName;
    }

    /// Creates an instance from an encoded string; the data is copied.
    this(T)(T s) {
        encodingName = getEncodingNameByType!T;
        str = cast(ubyte[])s.dup;
    }

    /// Creates an instance by encoding name and byte representation (copied).
    this(string encoding, ubyte[] s) {
        encodingName = encoding;
        str = s.dup;
    }

    /// Returns UTF-8 string representation.
    string toString() {
        return decodeByEncodingName(str, encodingName);
    }

    /// Returns string of specified encoding by data type in compile time.
    T toEncodedString(T)() {
        string temp = this.toString();
        T result;
        // T is a string type, so it has to be compared with the KOI8 string
        // aliases (the former comparison with the scheme classes never held
        // and sent KOI8 targets to std.encoding.transcode).
        static if (is(T == KOI8RString) || is(T == KOI8UString)) {
            amalthea.encoding.transcode(temp, result);
        } else {
            std.encoding.transcode(temp, result);
        }
        return result;
    }

    /// Returns string as byte array of specified encoding by encoding name.
    ubyte[] toEncodedString(string encodingName) {
        return this.toString.encodeByEncodingName(encodingName);
    }

    /***************************************************************************
     * Recodes current data of this UniString object into a new encoding
     * by the string type passed as template parameter.
     */
    void recode(T)() {
        this = UniString(this.toEncodedString!T);
    }

    /***************************************************************************
     * Recodes current data of this UniString object to new encoding in runtime.
     *
     * Params:
     *     encodingName = Name of new encoding of this string.
     */
    void recode(string encodingName) {
        str = this.toEncodedString(encodingName);
        this.encodingName = encodingName;
    }

}


/*******************************************************************************
 * Decodes to UTF-8 string from byte representation by encoding name.
 *
 * Names listed in the switch are handled by std.encoding or by the KOI8
 * converters of this module; any other name is passed to encodeText.
 *
 * Params:
 *     s = Bytes of the encoded text.
 *     encodingName = Name of the encoding of `s`.
 */
string decodeByEncodingName(ubyte[] s, string encodingName) {
    alias phTranscode = std.encoding.transcode;
    alias amTranscode = amalthea.encoding.transcode;
    string dest;
    switch(encodingName) {
        case "ASCII":        phTranscode(cast(AsciiString)s, dest);       break;
        case "ISO-8859-1":   phTranscode(cast(Latin1String)s, dest);      break;
        case "ISO-8859-2":   phTranscode(cast(Latin2String)s, dest);      break;
        case "windows-1250": phTranscode(cast(Windows1250String)s, dest); break;
        static if (__traits(compiles, Windows1251String)) {
        case "windows-1251": phTranscode(cast(Windows1251String)s, dest); break;
        }
        case "windows-1252": phTranscode(cast(Windows1252String)s, dest); break;
        // Copy instead of casting: the result is immutable and must not
        // alias the caller's mutable buffer.
        case "UTF-8":        dest = (cast(const(char)[])s).idup;          break;
        case "UTF-16":       phTranscode(cast(wstring)s, dest);           break;
        case "UTF-32":       phTranscode(cast(dstring)s, dest);           break;

        case "KOI8-R":       amTranscode(cast(KOI8RString)s, dest);       break;
        case "KOI8-U":       amTranscode(cast(KOI8UString)s, dest);       break;

        default: dest = cast(string)encodeText(s, encodingName, "UTF-8");
    }
    return dest;
}


/// Encodes a UTF-8 string into the byte representation of the encoding with the given name.
///
/// Names listed in the switch are handled by std.encoding, std.conv or the
/// KOI8 converters of this module; any other name is passed to encodeText.
///
/// Params:
///     s = UTF-8 text.
///     encodingName = Name of the destination encoding.
///
/// Returns: A byte array holding `s` in the destination encoding.
ubyte[] encodeByEncodingName(string s, string encodingName) {
    alias phTranscode = std.encoding.transcode;
    alias amTranscode = amalthea.encoding.transcode;
    ubyte[] dest;
    // Transcodes `s` into the string type T and stores the raw bytes in dest.
    void transformData(T)() {
        T temp;
        static if (is(T == KOI8RString) || is(T == KOI8UString)) {
            amTranscode(s, temp);
        } else {
            phTranscode(s, temp);
        }
        dest = cast(ubyte[])temp;
    }
    switch(encodingName) {
        case "ASCII":        transformData!AsciiString();       break;
        case "ISO-8859-1":   transformData!Latin1String();      break;
        case "ISO-8859-2":   transformData!Latin2String();      break;
        case "windows-1250": transformData!Windows1250String(); break;
        static if (__traits(compiles, Windows1251String)) {
        case "windows-1251": transformData!Windows1251String(); break;
        }
        case "windows-1252": transformData!Windows1252String(); break;
        // Copy: the result is mutable and must not alias the immutable input.
        case "UTF-8":        dest = cast(ubyte[])s.dup;         break;
        case "UTF-16":       dest = cast(ubyte[])(s.to!wstring); break;
        case "UTF-32":       dest = cast(ubyte[])(s.to!dstring); break;

        case "KOI8-R":       transformData!KOI8RString();       break;
        case "KOI8-U":       transformData!KOI8UString();       break;

        default: dest = encodeText(s, "UTF-8", encodingName);
    }
    return dest;
}


/// Gets encoding name by string type (an empty string for an unknown type).
string getEncodingNameByType(T)() {
    // Types known to std.encoding and the built-in UTF string types.
    static if (is(T == AsciiString)) {
        return "ASCII";
    } else static if (is(T == Latin1String)) {
        return "ISO-8859-1";
    } else static if (is(T == Latin2String)) {
        return "ISO-8859-2";
    } else static if (is(T == Windows1250String)) {
        return "windows-1250";
    } else static if (is(T == Windows1251String)) {
        return "windows-1251";
    } else static if (is(T == Windows1252String)) {
        return "windows-1252";
    } else static if (is(T == string)) {
        return "UTF-8";
    } else static if (is(T == wstring)) {
        return "UTF-16";
    } else static if (is(T == dstring)) {
        return "UTF-32";
    }
    // String types declared in this module.
    else static if (is(T == KOI8RString)) {
        return "KOI8-R";
    } else static if (is(T == KOI8UString)) {
        return "KOI8-U";
    }
    // Anything else has no known name.
    else {
        return "";
    }
}


/*
    LIBICONV
*/

import core.stdc.errno;
import amalthea.dataprocessing : makeFilledArray;

// Minimal C bindings to the iconv API.
private {
    alias iconv_t = void*;

    extern(C) iconv_t iconv_open(const ubyte* tocode, const ubyte* fromcode);

    extern(C) size_t iconv(
        iconv_t cd,
        const ubyte** inbuf,
        size_t* inbytesleft,
        ubyte** outbuf,
        size_t* outbytesleft
    );

    extern(C) int iconv_close(iconv_t cd);
}


/*******************************************************************************
 * The function tries to encode text sequence to new encoding.
 * The conversion is based on libiconv.
 * The list of encodings is available with 'iconv --list'.
 *
 * Homepage of libiconv: $(EXT_LINK https://www.gnu.org/software/libiconv/)
 *
 * Params:
 *     seq = Array of characters (string, dstring, KOI8RString, ubyte[], etc.).
 *     fromEncoding = The start encoding of the transmitted sequence.
 *     toEncoding = The destination encoding for the returned value.
*/
ubyte[] encodeText(T)(const T[] seq, string fromEncoding, string toEncoding) {
    // Returns exactly the bytes produced by iconv.
    // Throws EncodingException if the conversion is not available, if the
    // input contains an invalid or incomplete sequence, or if iconv reports
    // any other error.
    const ubyte[] text = cast(const ubyte[])seq;

    // iconv_open expects NUL-terminated C strings, which a D string slice
    // does not guarantee; toStringz provides the terminator.
    auto conversionDescriptor = iconv_open(
        cast(const ubyte*)toEncoding.toStringz,
        cast(const ubyte*)fromEncoding.toStringz
    );
    if (conversionDescriptor == cast(iconv_t)(-1)) {
        throw new EncodingException("Unavailable conversion");
    }
    scope(exit) iconv_close(conversionDescriptor);

    // Initial guess for the output size; the buffer grows on E2BIG
    // (a byte order mark plus 4-byte code units can exceed length*4).
    auto encodedText = new ubyte[](text.length * 4 + 16);
    size_t written = 0;
    const(ubyte)* inbufPtr = text.ptr;
    size_t inbytesLeft = text.length;

    // Runs iconv until it succeeds. With flush == true no input is passed,
    // which makes iconv emit any pending shift sequence of a stateful
    // encoding and return to the initial state.
    void runConversion(bool flush) {
        for (;;) {
            ubyte* outbufPtr = encodedText.ptr + written;
            size_t outbytesLeft = encodedText.length - written;

            errno = 0;
            const status = flush
                ? iconv(conversionDescriptor, null, null,
                        &outbufPtr, &outbytesLeft)
                : iconv(conversionDescriptor, &inbufPtr, &inbytesLeft,
                        &outbufPtr, &outbytesLeft);
            const err = errno;

            // iconv keeps outbytesLeft up to date even when it fails.
            written = encodedText.length - outbytesLeft;

            if (status != size_t.max) {
                return;
            }
            if (err == E2BIG) {
                // Output buffer exhausted: enlarge it and continue from the
                // position where iconv stopped.
                encodedText.length = encodedText.length * 2;
                continue;
            }
            if (err == EILSEQ || err == EINVAL) {
                throw new EncodingException("Invalid sequence");
            }
            throw new EncodingException("Conversion failed");
        }
    }

    if (text.length > 0) {
        runConversion(false);
    }
    runConversion(true);

    // Cut by the number of bytes actually written. Stripping trailing zero
    // bytes instead would damage output that legitimately ends with 0x00.
    return encodedText[0 .. written];
}
///
unittest {
    string text = "Привет, мир!";
    ubyte[] koi8rText = encodeText(text, "utf-8", "koi8-r");
    ubyte[] expected = [
        0xf0, 0xd2, 0xc9, 0xd7, 0xc5, 0xd4, 0x2c, 0x20, 0xcd, 0xc9, 0xd2, 0x21
    ];
    assert(koi8rText == expected);

    // Trailing zero bytes that belong to the encoded text are preserved.
    ubyte[] utf16Text = encodeText("A", "utf-8", "UTF-16LE");
    assert(utf16Text == [0x41, 0x00]);

    text = "你好,世界!";
    bool error = false;
    try {
        koi8rText = encodeText(text, "utf-8", "koi8-r");
    } catch (EncodingException e) {
        error = true;
    }
    assert(error);
}