amalthea.encoding source code

1 /* This file is part of the Amalthea library.
2  *
3  * Copyright (C) 2020-2021 Eugene 'Vindex' Stulin
4  *
5  * Distributed under the Boost Software License 1.0 or (at your option)
6  * the GNU Lesser General Public License 3.0 or later.
7  */
8 
9 module amalthea.encoding;
10 
11 public import amalthea.libcore;
12 import std.algorithm, std.encoding, std.range, std.string;
13 
// Module constructor: registers both KOI8 scheme classes of this module with
// the std.encoding.EncodingScheme registry.
shared static this() {
    EncodingScheme.register!(EncodingSchemeKOI8R)();
    EncodingScheme.register!(EncodingSchemeKOI8U)();
}
18 
/// Code unit of KOI8-R text (one byte wide).
enum KOI8RChar : ubyte { init }
/// Immutable string of KOI8-R code units.
alias KOI8RString = immutable(KOI8RChar)[];

/// Code unit of KOI8-U text (one byte wide).
enum KOI8UChar : ubyte { init }
/// Immutable string of KOI8-U code units.
alias KOI8UString = immutable(KOI8UChar)[];

/**
 * American Standard Code for Information Interchange: a 7-bit encoding.
 * The table holds its 128 code points, U+0000 through U+007F, in code order.
 */
immutable dstring asciiTable = iota!dchar('\x00', '\x80').array;
29 
30 
/*******************************************************************************
 * Abstract base class for all ASCII based encoding schemes of Amalthea.
 * Every character occupies exactly one code unit (one byte).
 *
 * NOTE(review): the methods below read the two static tables declared in this
 * class. Nothing in this class fills them, and the tables of the same name
 * that ConversionInjection adds to a descendant are separate variables -
 * confirm that the methods defined here see the data they are expected to see.
 */
abstract class AsciiBasedEncoding : std.encoding.EncodingScheme {
    /// Code unit -> UTF-32 code point.
    immutable static dchar[ubyte] conversionTableToUTF32;
    /// UTF-32 code point -> code unit.
    immutable static ubyte[dchar] conversionTableFromUTF32;

    /// Tells whether `c` has an entry in `conversionTableFromUTF32`.
    override bool canEncode(dchar c) const @safe pure nothrow @nogc {
        return (c in conversionTableFromUTF32) !is null;
    }

    /// Number of code units needed for `c`: always one.
    override size_t encodedLength(dchar c) const
    in {
        assert(canEncode(c));
    } do {
        return 1;
    }

    /// Writes the code unit of `c` to `buffer[0]` and returns the number of
    /// code units written (one).
    override size_t encode(dchar c, ubyte[] buffer) const
    in {
        assert(canEncode(c));
        assert(buffer.length > 0);
    } do {
        buffer[0] = conversionTableFromUTF32[c];
        return 1;
    }

    /// Decodes the first code unit of `s` and removes it from `s`.
    override dchar decode(ref const(ubyte)[] s) const
    in {
        assert(s.length > 0);
    } do {
        const dchar codePoint = conversionTableToUTF32[s[0]];
        // The last code unit leaves an empty (null) slice behind.
        if (s.length == 1) {
            s = [];
        } else {
            s = s[1 .. $];
        }
        return codePoint;
    }

    /// Like `decode`, but yields INVALID_SEQUENCE for a code unit that is not
    /// in `conversionTableToUTF32`. The code unit is consumed in both cases.
    override dchar safeDecode(ref const(ubyte)[] s) const
    in {
        assert(s.length > 0);
    } do {
        const ubyte head = s[0];
        const dchar codePoint = canDecode(head)
            ? conversionTableToUTF32[head]
            : INVALID_SEQUENCE;
        if (s.length == 1) {
            s = [];
        } else {
            s = s[1 .. $];
        }
        return codePoint;
    }

    /// Sequence substituted for characters that cannot be encoded: "?".
    override @property immutable(ubyte)[] replacementSequence() const {
        return cast(immutable(ubyte)[])"?";
    }

    /// Tells whether the code unit `c` has an entry in
    /// `conversionTableToUTF32`.
    bool canDecode(ubyte c) const {
        return (c in conversionTableToUTF32) !is null;
    }

    // Per the original author: must be implemented in descendants.
    // The descendants visible in this module declare their own `specTable`.
    protected immutable dstring specTable;
}
97 
98 
/*******************************************************************************
 * Template for use by all descendants of AsciiBasedEncoding.
 * It expects the class it is mixed into to provide `specTable` (the code
 * points of the code units 0x80 .. 0xFF) and builds the conversion tables
 * between that encoding and UTF-32.
 *
 * It also overrides the table-driven operations of AsciiBasedEncoding, because
 * the implementations inherited from the base class read the base class's own
 * static tables rather than the ones declared here.
 *
 * Params:
 *     CodeUnitType = Code unit type of the encoding (e.g. KOI8RChar).
 */
mixin template ConversionInjection(CodeUnitType) {
    /// Complete code table: ASCII followed by the scheme specific upper half.
    immutable static dstring table = asciiTable ~ specTable;
    /// Code unit -> UTF-32 code point.
    immutable static dchar[CodeUnitType] conversionTableToUTF32;
    /// UTF-32 code point -> code unit.
    immutable static CodeUnitType[dchar] conversionTableFromUTF32;

    shared static this() {
        foreach (i; 0 .. 0x100) {
            // Use the table of the class this template is mixed into,
            // not the table of one particular scheme.
            const dchar ch = table[i];
            const unit = cast(CodeUnitType)i;
            conversionTableToUTF32[unit] = ch;
            conversionTableFromUTF32[ch] = unit;
        }
    }

    const override {
        /// Tells whether `c` can be represented in this encoding.
        bool canEncode(dchar c) @safe pure nothrow @nogc {
            return (c in conversionTableFromUTF32) !is null;
        }

        /// Writes the code unit of `c` to `buffer[0]`; returns 1.
        size_t encode(dchar c, ubyte[] buffer) {
            buffer[0] = conversionTableFromUTF32[c];
            return 1;
        }

        /// Decodes the first code unit of `s` and removes it from `s`.
        dchar decode(ref const(ubyte)[] s) {
            const dchar decoded = conversionTableToUTF32[cast(CodeUnitType)s[0]];
            s = s[1 .. $];
            return decoded;
        }

        /// Like `decode`, but yields INVALID_SEQUENCE for a code unit without
        /// a table entry. The code unit is consumed in both cases.
        dchar safeDecode(ref const(ubyte)[] s) {
            auto found = cast(CodeUnitType)s[0] in conversionTableToUTF32;
            s = s[1 .. $];
            return found is null ? INVALID_SEQUENCE : *found;
        }

        /// Tells whether the code unit `c` has a table entry.
        bool canDecode(ubyte c) {
            return (cast(CodeUnitType)c in conversionTableToUTF32) !is null;
        }
    }
}
116 
117 
/*******************************************************************************
 * EncodingScheme to handle KOI8-R.
 */
class EncodingSchemeKOI8R : AsciiBasedEncoding {

    // Code points of the code units 0x80 .. 0xFF, eight per row.
    private immutable dstring specTable =
        "\u2500\u2502\u250c\u2510\u2514\u2518\u251c\u2524" ~ // 0x80 .. 0x87
        "\u252c\u2534\u253c\u2580\u2584\u2588\u258c\u2590" ~ // 0x88 .. 0x8F
        "\u2591\u2592\u2593\u2320\u25a0\u2219\u221a\u2248" ~ // 0x90 .. 0x97
        "\u2264\u2265\u00a0\u2321\u00b0\u00b2\u00b7\u00f7" ~ // 0x98 .. 0x9F
        "\u2550\u2551\u2552\u0451\u2553\u2554\u2555\u2556" ~ // 0xA0 .. 0xA7
        "\u2557\u2558\u2559\u255a\u255b\u255c\u255d\u255e" ~ // 0xA8 .. 0xAF
        "\u255f\u2560\u2561\u0401\u2562\u2563\u2564\u2565" ~ // 0xB0 .. 0xB7
        "\u2566\u2567\u2568\u2569\u256a\u256b\u256c\u00a9" ~ // 0xB8 .. 0xBF
        "\u044e\u0430\u0431\u0446\u0434\u0435\u0444\u0433" ~ // 0xC0 .. 0xC7
        "\u0445\u0438\u0439\u043a\u043b\u043c\u043d\u043e" ~ // 0xC8 .. 0xCF
        "\u043f\u044f\u0440\u0441\u0442\u0443\u0436\u0432" ~ // 0xD0 .. 0xD7
        "\u044c\u044b\u0437\u0448\u044d\u0449\u0447\u044a" ~ // 0xD8 .. 0xDF
        "\u042e\u0410\u0411\u0426\u0414\u0415\u0424\u0413" ~ // 0xE0 .. 0xE7
        "\u0425\u0418\u0419\u041a\u041b\u041c\u041d\u041e" ~ // 0xE8 .. 0xEF
        "\u041f\u042f\u0420\u0421\u0422\u0423\u0416\u0412" ~ // 0xF0 .. 0xF7
        "\u042c\u042b\u0417\u0428\u042d\u0429\u0427\u042a";  // 0xF8 .. 0xFF

    mixin ConversionInjection!KOI8RChar;

    /// Names of this encoding scheme.
    override string[] names() const @safe pure nothrow {
        return ["cskoi8r", "koi", "koi8", "koi8-r", "koi8_r"];
    }

    /// Display name of the encoding.
    override string toString() const {
        return "KOI8-R";
    }
}
151 
152 
/*******************************************************************************
 * EncodingScheme to handle KOI8-U.
 */
class EncodingSchemeKOI8U : AsciiBasedEncoding {

    // Code points of the code units 0x80 .. 0xFF, eight per row.
    // KOI8-U differs from KOI8-R in eight positions: 0xA4, 0xA6, 0xA7, 0xAD
    // (lower case) and 0xB4, 0xB6, 0xB7, 0xBD (upper case).
    private immutable dstring specTable =
        "\u2500\u2502\u250c\u2510\u2514\u2518\u251c\u2524" ~ // 0x80 .. 0x87
        "\u252c\u2534\u253c\u2580\u2584\u2588\u258c\u2590" ~ // 0x88 .. 0x8F
        "\u2591\u2592\u2593\u2320\u25a0\u2219\u221a\u2248" ~ // 0x90 .. 0x97
        "\u2264\u2265\u00a0\u2321\u00b0\u00b2\u00b7\u00f7" ~ // 0x98 .. 0x9F

        "\u2550\u2551\u2552\u0451\u0454\u2554\u0456\u0457" ~ // 0xA0 .. 0xA7
        "\u2557\u2558\u2559\u255a\u255b\u0491\u255d\u255e" ~ // 0xA8 .. 0xAF
        "\u255f\u2560\u2561\u0401\u0404\u2563\u0406\u0407" ~ // 0xB0 .. 0xB7
        // 0xBD is U+0490 and 0xBE is U+256C.
        "\u2566\u2567\u2568\u2569\u256a\u0490\u256c\u00a9" ~ // 0xB8 .. 0xBF

        "\u044e\u0430\u0431\u0446\u0434\u0435\u0444\u0433" ~ // 0xC0 .. 0xC7
        "\u0445\u0438\u0439\u043a\u043b\u043c\u043d\u043e" ~ // 0xC8 .. 0xCF
        "\u043f\u044f\u0440\u0441\u0442\u0443\u0436\u0432" ~ // 0xD0 .. 0xD7
        "\u044c\u044b\u0437\u0448\u044d\u0449\u0447\u044a" ~ // 0xD8 .. 0xDF
        "\u042e\u0410\u0411\u0426\u0414\u0415\u0424\u0413" ~ // 0xE0 .. 0xE7
        "\u0425\u0418\u0419\u041a\u041b\u041c\u041d\u041e" ~ // 0xE8 .. 0xEF
        "\u041f\u042f\u0420\u0421\u0422\u0423\u0416\u0412" ~ // 0xF0 .. 0xF7
        "\u042c\u042b\u0417\u0428\u042d\u0429\u0427\u042a";  // 0xF8 .. 0xFF

    mixin ConversionInjection!KOI8UChar;

    /// Names of this encoding scheme.
    override const string[] names() @safe pure nothrow {
        return ["koi8-u", "koi8_u"];
    }

    /// Display name of the encoding.
    override const string toString() {
        return "KOI8-U";
    }
}
188 
189 
/*******************************************************************************
 * Encodes a UTF-32 string as a KOI8-R or KOI8-U string, one code unit per
 * character.
 *
 * Params:
 *     s = The UTF-32 string to transcode.
 *     safe = If false, every character of the input must exist in the target
 *            encoding; if true, characters that do not exist there are
 *            replaced with '?'.
 *
 * Returns: The encoded string; it has as many code units as `s` has characters.
 */
AsciiBasedString encodeFromUTF32(AsciiBasedString)(dstring s, bool safe = false)
if (
    is(AsciiBasedString == KOI8RString) || is(AsciiBasedString == KOI8UString)
) {
    // Select the scheme and code unit type that belong to the requested string.
    static if (is(AsciiBasedString == KOI8RString)) {
        alias Scheme = EncodingSchemeKOI8R;
        alias CodeUnitType = KOI8RChar;
    } else static if (is(AsciiBasedString == KOI8UString)) {
        alias Scheme = EncodingSchemeKOI8U;
        alias CodeUnitType = KOI8UChar;
    }

    immutable static fallback = cast(CodeUnitType)'?';

    auto units = new CodeUnitType[](s.length);
    foreach (index, codePoint; s) {
        units[index] = safe
            ? Scheme.conversionTableFromUTF32.get(codePoint, fallback)
            : Scheme.conversionTableFromUTF32[codePoint];
    }
    return units.idup;
}
///
unittest {
    dstring russianText = "Привет, мир!"d;
    KOI8RString encoded = encodeFromUTF32!KOI8RString(russianText);
    ubyte[] expected = [
        0xf0, 0xd2, 0xc9, 0xd7, 0xc5, 0xd4, 0x2c, 0x20, 0xcd, 0xc9, 0xd2, 0x21
    ];
    assert(cast(ubyte[])encoded == expected);
}
233 
234 
/*******************************************************************************
 * Encodes dstring to KOI8RString or KOI8UString.
 * The input does not have to be valid: characters that do not exist in the
 * target encoding are replaced with '?'.
 *
 * Params:
 *     s = The UTF-32 string to transcode.
 */
AsciiBasedString safeEncodeFromUTF32(AsciiBasedString)(dstring s)
if (
    is(AsciiBasedString == KOI8RString) || is(AsciiBasedString == KOI8UString)
) {
    return encodeFromUTF32!AsciiBasedString(s, true);
}
///
unittest {
    dstring invalidText = "你好，世界！"d;
    KOI8RString koi8rText = safeEncodeFromUTF32!KOI8RString(invalidText);
    assert(cast(ubyte[])koi8rText == ['?', '?', '?', '?', '?', '?']);
}
251 
252 
/*******************************************************************************
 * Convert a string from Amalthea ASCII based encoding to UTF-32.
 * Only KOI8RString and KOI8UString are accepted as source type.
 *
 * Params:
 *     source = Source string. It must be validly encoded.
 *     dest = Destination string.
 */
void transcode(AsciiBasedString)(AsciiBasedString source, out dstring dest)
if (
    is(AsciiBasedString == KOI8RString) || is(AsciiBasedString == KOI8UString)
) {
    static if (is(AsciiBasedString == KOI8RString)) {
        alias Scheme = EncodingSchemeKOI8R;
    } else {
        alias Scheme = EncodingSchemeKOI8U;
    }

    // One code unit corresponds to exactly one code point.
    auto result = new dchar[](source.length);
    foreach (i, unit; source) {
        result[i] = Scheme.conversionTableToUTF32[unit];
    }
    dest = result.idup;
}
272 
273 
/*******************************************************************************
 * Convert a string from Amalthea ASCII based encoding to UTF-8.
 * Only KOI8RString and KOI8UString are accepted as source type.
 * The text is converted to UTF-32 first and re-encoded as UTF-8 afterwards.
 *
 * Params:
 *     source = Source string. It must be validly encoded.
 *     dest = Destination string.
 */
void transcode(AsciiBasedString)(AsciiBasedString source, out string dest)
if (
    is(AsciiBasedString == KOI8RString) || is(AsciiBasedString == KOI8UString)
) {
    dstring result;
    transcode!AsciiBasedString(source, result);
    dest = result.to!string;
}
286 
287 
/*******************************************************************************
 * Convert a string from UTF-32 to Amalthea ASCII based encoding.
 * Only KOI8RString and KOI8UString are accepted as destination type.
 *
 * Params:
 *     source = Source string. Every character of it must exist in the
 *              destination encoding.
 *     dest = Destination string.
 */
void transcode(AsciiBasedString)
              (dstring source, out AsciiBasedString dest)
if (
    is(AsciiBasedString == KOI8RString) || is(AsciiBasedString == KOI8UString)
) {
    static if (is(AsciiBasedString == KOI8RString)) {
        alias Scheme = EncodingSchemeKOI8R;
        alias CodeUnitType = KOI8RChar;
    } else {
        alias Scheme = EncodingSchemeKOI8U;
        alias CodeUnitType = KOI8UChar;
    }

    // One code point corresponds to exactly one code unit.
    auto line = new CodeUnitType[](source.length);
    foreach (i, ch; source) {
        line[i] = Scheme.conversionTableFromUTF32[ch];
    }
    // `line` is not referenced anywhere else, so it may become immutable.
    dest = cast(AsciiBasedString)line;
}
312 
313 
/*******************************************************************************
 * Convert a string from UTF-8 to Amalthea ASCII based encoding.
 * Only KOI8RString and KOI8UString are accepted as destination type.
 * The text is converted to UTF-32 first and encoded from there.
 *
 * Params:
 *     source = Source string. It must be validly encoded.
 *     dest = Destination string.
 */
void transcode(AsciiBasedString)
              (string source, out AsciiBasedString dest)
if (
    is(AsciiBasedString == KOI8RString) || is(AsciiBasedString == KOI8UString)
) {
    transcode!AsciiBasedString(source.to!dstring, dest);
}
325 
326 
/*******************************************************************************
 * A universal string containing string data and information about its encoding.
 */
struct UniString {
    /// Byte string representation.
    protected ubyte[] str;
    /// Name of string encoding.
    protected string encodingName;

    /// Returns the current string as a byte array
    /// (the stored array itself, not a copy).
    ubyte[] getRawData() {
        return str;
    }

    /// Returns name of current encoding.
    string getEncodingName() {
        return encodingName;
    }

    /// Creates an instance from an encoded string.
    /// The encoding name is derived from the string type; the data is copied.
    this(T)(T s) {
        encodingName = getEncodingNameByType!T;
        str = cast(ubyte[])s.dup;
    }

    /// Creates an instance by encoding name and byte representation.
    /// The data is copied.
    this(string encoding, ubyte[] s) {
        encodingName = encoding;
        str = s.dup;
    }

    /// Returns UTF-8 string representation.
    string toString() {
        return decodeByEncodingName(str, encodingName);
    }

    /// Returns string of specified encoding by data type in compile time.
    T toEncodedString(T)() {
        string temp = this.toString();
        T result;
        // T is a string type, so it has to be compared with the KOI8 string
        // types (not with the scheme classes) to pick Amalthea's transcoder.
        static if (is(T == KOI8RString) || is(T == KOI8UString)) {
            amalthea.encoding.transcode(temp, result);
        } else {
            std.encoding.transcode(temp, result);
        }
        return result;
    }

    /// Returns string as byte array of specified encoding by encoding name.
    ubyte[] toEncodedString(string encodingName) {
        return this.toString.encodeByEncodingName(encodingName);
    }

    /***************************************************************************
     * Recodes current data of this UniString object into a new encoding
     * by the string type passed as template parameter.
     */
    void recode(T)() {
        this = UniString(this.toEncodedString!T);
    }

    /***************************************************************************
     * Recodes current data of this UniString object to new encoding in runtime.
     *
     * Params:
     *     encodingName = Name of new encoding of this string.
     */
    void recode(string encodingName) {
        str = this.toEncodedString(encodingName);
        this.encodingName = encodingName;
    }

}
400 
401 
/**
 * Decodes to UTF-8 string from byte representation by encoding name.
 *
 * The names listed in the switch are handled by std.encoding or by the KOI8
 * transcoders of this module; any other name is passed to encodeText.
 *
 * Params:
 *     s = Bytes of the encoded text.
 *     encodingName = Name of the encoding of `s`.
 */
string decodeByEncodingName(ubyte[] s, string encodingName) {
    alias phTranscode = std.encoding.transcode;
    alias amTranscode = amalthea.encoding.transcode;
    string dest;
    switch(encodingName) {
        case "ASCII":        phTranscode(cast(AsciiString)s, dest);       break;
        case "ISO-8859-1":   phTranscode(cast(Latin1String)s, dest);      break;
        case "ISO-8859-2":   phTranscode(cast(Latin2String)s, dest);      break;
        case "windows-1250": phTranscode(cast(Windows1250String)s, dest); break;
static if (__traits(compiles, Windows1251String)) {
        case "windows-1251": phTranscode(cast(Windows1251String)s, dest); break;
}
        case "windows-1252": phTranscode(cast(Windows1252String)s, dest); break;
        // Copy: the result is immutable and must not alias the caller's
        // mutable buffer.
        case "UTF-8":        dest = cast(string)s.idup;                   break;
        case "UTF-16":       phTranscode(cast(wstring)s, dest);           break;
        case "UTF-32":       phTranscode(cast(dstring)s, dest);           break;

        case "KOI8-R":       amTranscode(cast(KOI8RString)s, dest);       break;
        case "KOI8-U":       amTranscode(cast(KOI8UString)s, dest);       break;

        default: dest = cast(string)encodeText(s, encodingName, "UTF-8");
    }
    return dest;
}
427 
428 
/**
 * Encodes a UTF-8 string to the byte representation of the encoding with the
 * given name.
 *
 * The names listed in the switch are handled by std.encoding, std.conv or by
 * the KOI8 transcoders of this module; any other name is passed to encodeText.
 *
 * Params:
 *     s = UTF-8 text.
 *     encodingName = Name of the destination encoding.
 */
ubyte[] encodeByEncodingName(string s, string encodingName) {
    alias phTranscode = std.encoding.transcode;
    alias amTranscode = amalthea.encoding.transcode;
    ubyte[] dest;
    // Transcodes `s` into the string type T and stores the bytes in `dest`.
    void transformData(T)() {
        T temp;
        static if (is(T == KOI8RString) || is(T == KOI8UString)) {
            amTranscode(s, temp);
        } else {
            phTranscode(s, temp);
        }
        dest = cast(ubyte[])temp;
    }
    switch(encodingName) {
        case "ASCII":        transformData!AsciiString();        break;
        case "ISO-8859-1":   transformData!Latin1String();       break;
        case "ISO-8859-2":   transformData!Latin2String();       break;
        case "windows-1250": transformData!Windows1250String();  break;
static if (__traits(compiles, Windows1251String)) {
        case "windows-1251": transformData!Windows1251String();  break;
}
        case "windows-1252": transformData!Windows1252String();  break;
        // Copy: the result is mutable and must not alias the immutable input.
        case "UTF-8":        dest = cast(ubyte[])s.dup;          break;
        case "UTF-16":       dest = cast(ubyte[])(s.to!wstring); break;
        case "UTF-32":       dest = cast(ubyte[])(s.to!dstring); break;

        case "KOI8-R":       transformData!KOI8RString();        break;
        case "KOI8-U":       transformData!KOI8UString();        break;

        default: dest = encodeText(s, "UTF-8", encodingName);
    }
    return dest;
}
463 
464 
/**
 * Gets encoding name by string type.
 *
 * Returns: The encoding name that belongs to the string type T,
 *          or an empty string if T is not one of the known string types.
 */
string getEncodingNameByType(T)() {
    // The type is known at compile time, so exactly one branch is compiled
    // into each instance.
    static if (is(T == AsciiString)) {
        return "ASCII";
    } else static if (is(T == Latin1String)) {
        return "ISO-8859-1";
    } else static if (is(T == Latin2String)) {
        return "ISO-8859-2";
    } else static if (is(T == Windows1250String)) {
        return "windows-1250";
    } else static if (is(T == Windows1251String)) {
        return "windows-1251";
    } else static if (is(T == Windows1252String)) {
        return "windows-1252";
    } else static if (is(T == string)) {
        return "UTF-8";
    } else static if (is(T == wstring)) {
        return "UTF-16";
    } else static if (is(T == dstring)) {
        return "UTF-32";
    } else static if (is(T == KOI8RString)) {
        return "KOI8-R";
    } else static if (is(T == KOI8UString)) {
        return "KOI8-U";
    } else {
        return "";
    }
}
495 
496 
497 /*
498    LIBICONV
499 */
500 
501 import core.stdc.errno;
502 import amalthea.dataprocessing : makeFilledArray;
503 
// Hand-written declarations of the C iconv functions called by encodeText.
private extern(C) {
    alias iconv_t = void*;

    iconv_t iconv_open(const ubyte* tocode, const ubyte* fromcode);

    size_t iconv(
        iconv_t cd,
        const ubyte** inbuf, size_t* inbytesleft,
        ubyte** outbuf, size_t* outbytesleft
    );

    int iconv_close(iconv_t cd);
}
514 
515 
/*******************************************************************************
 * The function tries to encode text sequence to new encoding.
 * The conversion is based on libiconv.
 * The list of encodings is available with 'iconv --list'.
 *
 * Homepage of libiconv: $(EXT_LINK https://www.gnu.org/software/libiconv/)
 *
 * Params:
 *     seq = Array of characters (string, dstring, KOI8RString, ubyte[], etc.).
 *     fromEncoding = The start encoding of the transmitted sequence.
 *     toEncoding = The destination encoding for the returned value.
 *
 * Returns: Exactly the bytes produced by the conversion.
 *
 * Throws: EncodingException if the conversion is not available or
 *         if the input cannot be converted.
 */
ubyte[] encodeText(T)(const T[] seq, string fromEncoding, string toEncoding) {
    const(ubyte)[] text = cast(const(ubyte)[])seq;
    // iconv_open expects C strings; D strings are not necessarily
    // NUL-terminated, hence toStringz.
    auto conversionDescriptor = iconv_open(
        cast(const ubyte*)toEncoding.toStringz,
        cast(const ubyte*)fromEncoding.toStringz
    );
    if (conversionDescriptor == cast(iconv_t)(-1)) {
        throw new EncodingException("Unavailable conversion");
    }
    scope(exit) iconv_close(conversionDescriptor);

    // Generous first guess; the buffer grows if iconv reports E2BIG
    // (e.g. because of a byte order mark or escape sequences).
    ubyte[] encodedText = new ubyte[](text.length * 4 + 16);
    const(ubyte)* inPtr = text.ptr;
    size_t inLeft = text.length;
    size_t written = 0;
    // After the input has been consumed, one more call with a null input
    // makes iconv write whatever is needed to finish the output.
    bool inputConsumed = false;

    while (true) {
        ubyte* outPtr = encodedText.ptr + written;
        size_t outLeft = encodedText.length - written;

        errno = 0;
        const status = inputConsumed
            ? iconv(conversionDescriptor, null, null, &outPtr, &outLeft)
            : iconv(conversionDescriptor, &inPtr, &inLeft, &outPtr, &outLeft);
        const err = errno;
        // iconv decreases outLeft by the number of bytes it has written.
        written = encodedText.length - outLeft;

        if (status != size_t.max) {
            if (inputConsumed) {
                break;
            }
            inputConsumed = true;
            continue;
        }
        if (err == E2BIG) {
            encodedText.length = encodedText.length * 2 + 16;
            continue;
        }
        if (err == EILSEQ || err == EINVAL) {
            throw new EncodingException("Invalid sequence");
        }
        throw new EncodingException("Conversion failed");
    }
    // Return what was written; zero bytes may be part of the encoded text.
    return encodedText[0 .. written];
}
///
unittest {
    string text = "Привет, мир!";
    ubyte[] koi8rText = encodeText(text, "utf-8", "koi8-r");
    ubyte[] expected = [
        0xf0, 0xd2, 0xc9, 0xd7, 0xc5, 0xd4, 0x2c, 0x20, 0xcd, 0xc9, 0xd2, 0x21
    ];
    assert(koi8rText == expected);

    // Zero bytes that belong to the encoded text are kept.
    ubyte[] utf16Text = encodeText("a!", "utf-8", "utf-16le");
    ubyte[] expectedUtf16 = [0x61, 0x00, 0x21, 0x00];
    assert(utf16Text == expectedUtf16);

    text = "你好，世界！";
    bool error = false;
    try {
        koi8rText = encodeText(text, "utf-8", "koi8-r");
    } catch (EncodingException e) {
        error = true;
    }
    assert(error);
}