cfad47cfa3/tads3/charmap.cpp

4b825dc642cb6eb9a060e54bf8d69288fbee4904cfad47cfa334b206c65f22086bcc5d63e6f70944
1
#ifdef RCSID
2
static char RCSid[] =
3
"$Header: d:/cvsroot/tads/tads3/charmap.cpp,v 1.3 1999/07/11 00:46:58 MJRoberts Exp $";
4
#endif
5
6
/* 
7
 *   Copyright (c) 1998, 2002 Michael J. Roberts.  All Rights Reserved.
8
 *   
9
 *   Please see the accompanying license file, LICENSE.TXT, for information
10
 *   on using and copying this software.  
11
 */
12
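/*
Name
  charmap.cpp - character mapper
Function
  Implements the character-set mapping classes: opening and classifying
  external mapping (.tcm) files, the built-in 7-bit ASCII and ISO-8859-1
  fallback mappers, the generic Unicode-to-local mapper (table loading,
  string mapping, and file output), and the trivial UTF-8 mapper.
Notes
  
Modified
  10/17/98 MJRoberts  - Creation
*/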
22
23
#include <stdlib.h>
24
#include <string.h>
25
26
#include "t3std.h"
27
#include "os.h"
28
#include "utf8.h"
29
#include "resload.h"
30
#include "charmap.h"
31
32
33
/* ------------------------------------------------------------------------ */
34
/*
35
 *   Basic Mapper Class 
36
 */
37
38
/*
39
 *   Open and characterize a mapping file 
40
 */
41
osfildef *CCharmap::open_map_file(class CResLoader *res_loader,
42
                                  const char *table_name,
43
                                  charmap_type_t *map_type)
44
{
45
    osfildef *fp;
46
    char respath[100];
47
    ulong startpos;
48
    uchar buf[256];
49
    uint entry_cnt;
50
    int found_single;
51
    int found_double;
52
53
    /*
54
     *   Generate the full resource path - character mapping resource paths
55
     *   always start with "charmap/" followed by the table name, plus the
56
     *   ".tcm" extension.  
57
     */
58
    strcpy(respath, "charmap/");
59
    strcat(respath, table_name);
60
    strcat(respath, ".tcm");
61
62
    /* open the file for the character set */
63
    fp = res_loader->open_res_file(respath, "charmap/cmaplib", "CLIB");
64
65
    /* if we couldn't open the mapping file, return failure */
66
    if (fp == 0)
67
        return 0;
68
69
    /* note the initial seek position */
70
    startpos = osfpos(fp);
71
72
    /* read the header and the local-to-unicode header */
73
    if (osfrb(fp, buf, 6))
74
        goto fail;
75
76
    /* get the number of entries from the local-to-unicode header */
77
    entry_cnt = osrp2(buf + 4);
78
79
    /* 
80
     *   Scan the entries to determine if we have single-byte,
81
     *   double-byte, or both.  
82
     */
83
    found_single = found_double = FALSE;
84
    while (entry_cnt > 0)
85
    {
86
        size_t cur;
87
        const uchar *p;
88
89
        /* read up to a buffer-full or the remaining size */
90
        cur = sizeof(buf)/4;
91
        if (cur > entry_cnt)
92
            cur = entry_cnt;
93
94
        /* read it */
95
        if (osfrb(fp, buf, cur*4))
96
            goto fail;
97
98
        /* deduct the amount we just read from the amount remaining */
99
        entry_cnt -= cur;
100
101
        /* scan the entries */
102
        for (p = buf ; cur > 0 ; --cur, p += 4)
103
        {
104
            /* 
105
             *   Note whether this is a single-byte or double-byte entry.
106
             *   If the high-order byte is non-zero, it's a double-byte
107
             *   entry; otherwise, it's a single-byte entry.
108
             *   
109
             *   Note that we read the UINT2 at (p+2), because that's the
110
             *   local character-set code point in this tuple.  
111
             */
112
            if (((uint)osrp2(p + 2)) > 0xFF)
113
                found_double = TRUE;
114
            else
115
                found_single = TRUE;
116
        }
117
118
        /* 
119
         *   if we've found both single- and double-byte characters so
120
         *   far, there's no need to look any further, since we know
121
         *   everything about the file now 
122
         */
123
        if (found_single && found_double)
124
            break;
125
    }
126
127
    /* 
128
     *   create the appropriate mapper, depending on whether we found
129
     *   single, double, or mixed characters 
130
     */
131
    if (found_single && found_double)
132
    {
133
        /* it's mixed */
134
        *map_type = CHARMAP_TYPE_MB;
135
    }
136
    else if (found_double)
137
    {
138
        /* it's all double-byte */
139
        *map_type = CHARMAP_TYPE_DB;
140
    }
141
    else if (found_single)
142
    {
143
        /* it's all single-byte */
144
        *map_type = CHARMAP_TYPE_SB;
145
    }
146
    else
147
    {
148
        /* no mappings found at all - presume it's a single-byte mapper */
149
        *map_type = CHARMAP_TYPE_SB;
150
    }
151
152
    /* seek back to the start of the table */
153
    osfseek(fp, startpos, OSFSK_SET);
154
155
    /* return the file pointer */
156
    return fp;
157
158
fail:
159
    /* close the file and return failure */
160
    osfcls(fp);
161
    return 0;
162
}
163
164
/* ------------------------------------------------------------------------ */
165
/*
166
 *   Special built-in mapper to 7-bit ASCII.  This is available as a last
167
 *   resort when no external mapping file can be found.  
168
 */
169
170
/*
171
 *   create a plain ascii translator 
172
 */
173
CCharmapToLocalASCII::CCharmapToLocalASCII()
174
{
175
    unsigned char *dst;
176
    wchar_t *exp_dst;
177
    size_t siz;
178
    size_t exp_siz;
179
    struct ascii_map_t
180
    {
181
        wchar_t uni;
182
        char asc[5];
183
    };
184
    ascii_map_t *p;
185
    static ascii_map_t ascii_mapping[] =
186
    {
187
        /* regular ASCII characters */
188
        { 1, { 1 } },
189
        { 2, { 2 } },
190
        { 3, { 3 } },
191
        { 4, { 4 } },
192
        { 5, { 5 } },
193
        { 6, { 6 } },
194
        { 7, { 7 } },
195
        { 8, { 8 } },
196
        { 9, { 9 } },
197
        { 10, { 10 } },
198
        { 11, { 11 } },
199
        { 12, { 12 } },
200
        { 13, { 13 } },
201
        { 14, { 14 } },
202
        { 15, { 15 } },
203
        { 16, { 16 } },
204
        { 17, { 17 } },
205
        { 18, { 18 } },
206
        { 19, { 19 } },
207
        { 20, { 20 } },
208
        { 21, { 21 } },
209
        { 22, { 22 } },
210
        { 23, { 23 } },
211
        { 24, { 24 } },
212
        { 25, { 25 } },
213
        { 26, { 26 } },
214
        { 27, { 27 } },
215
        { 28, { 28 } },
216
        { 29, { 29 } },
217
        { 30, { 30 } },
218
        { 31, { 31 } },
219
        { 32, { 32 } },
220
        { 33, { 33 } },
221
        { 34, { 34 } },
222
        { 35, { 35 } },
223
        { 36, { 36 } },
224
        { 37, { 37 } },
225
        { 38, { 38 } },
226
        { 39, { 39 } },
227
        { 40, { 40 } },
228
        { 41, { 41 } },
229
        { 42, { 42 } },
230
        { 43, { 43 } },
231
        { 44, { 44 } },
232
        { 45, { 45 } },
233
        { 46, { 46 } },
234
        { 47, { 47 } },
235
        { 48, { 48 } },
236
        { 49, { 49 } },
237
        { 50, { 50 } },
238
        { 51, { 51 } },
239
        { 52, { 52 } },
240
        { 53, { 53 } },
241
        { 54, { 54 } },
242
        { 55, { 55 } },
243
        { 56, { 56 } },
244
        { 57, { 57 } },
245
        { 58, { 58 } },
246
        { 59, { 59 } },
247
        { 60, { 60 } },
248
        { 61, { 61 } },
249
        { 62, { 62 } },
250
        { 63, { 63 } },
251
        { 64, { 64 } },
252
        { 65, { 65 } },
253
        { 66, { 66 } },
254
        { 67, { 67 } },
255
        { 68, { 68 } },
256
        { 69, { 69 } },
257
        { 70, { 70 } },
258
        { 71, { 71 } },
259
        { 72, { 72 } },
260
        { 73, { 73 } },
261
        { 74, { 74 } },
262
        { 75, { 75 } },
263
        { 76, { 76 } },
264
        { 77, { 77 } },
265
        { 78, { 78 } },
266
        { 79, { 79 } },
267
        { 80, { 80 } },
268
        { 81, { 81 } },
269
        { 82, { 82 } },
270
        { 83, { 83 } },
271
        { 84, { 84 } },
272
        { 85, { 85 } },
273
        { 86, { 86 } },
274
        { 87, { 87 } },
275
        { 88, { 88 } },
276
        { 89, { 89 } },
277
        { 90, { 90 } },
278
        { 91, { 91 } },
279
        { 92, { 92 } },
280
        { 93, { 93 } },
281
        { 94, { 94 } },
282
        { 95, { 95 } },
283
        { 96, { 96 } },
284
        { 97, { 97 } },
285
        { 98, { 98 } },
286
        { 99, { 99 } },
287
        { 100, { 100 } },
288
        { 101, { 101 } },
289
        { 102, { 102 } },
290
        { 103, { 103 } },
291
        { 104, { 104 } },
292
        { 105, { 105 } },
293
        { 106, { 106 } },
294
        { 107, { 107 } },
295
        { 108, { 108 } },
296
        { 109, { 109 } },
297
        { 110, { 110 } },
298
        { 111, { 111 } },
299
        { 112, { 112 } },
300
        { 113, { 113 } },
301
        { 114, { 114 } },
302
        { 115, { 115 } },
303
        { 116, { 116 } },
304
        { 117, { 117 } },
305
        { 118, { 118 } },
306
        { 119, { 119 } },
307
        { 120, { 120 } },
308
        { 121, { 121 } },
309
        { 122, { 122 } },
310
        { 123, { 123 } },
311
        { 124, { 124 } },
312
        { 125, { 125 } },
313
        { 126, { 126 } },
314
        { 127, { 127 } },
315
316
        /* Latin-1 accented characters and symbols */
317
        { 353, "s" },
318
        { 352, "S" },
319
        { 8218, "\'" },
320
        { 8222, "\"" },
321
        { 8249, "<" },
322
        { 338, "OE" },
323
        { 8216, "\'" },
324
        { 8217, "\'" },
325
        { 8220, "\"" },
326
        { 8221, "\"" },
327
        { 8211, "-" },
328
        { 8212, "--" },
329
        { 8482, "(tm)" },
330
        { 8250, ">" },
331
        { 339, "oe" },
332
        { 376, "Y" },
333
        { 162, "c" },
334
        { 163, "L" },
335
        { 165, "Y" },
336
        { 166, "|" },
337
        { 169, "(c)" },
338
        { 170, "a" },
339
        { 173, " " },
340
        { 174, "(R)" },
341
        { 175, "-" },
342
        { 177, "+/-" },
343
        { 178, "2" },
344
        { 179, "3" },
345
        { 180, "\'" },
346
        { 181, "u" },
347
        { 182, "P" },
348
        { 183, "*" },
349
        { 184, "," },
350
        { 185, "1" },
351
        { 186, "o" },
352
        { 171, "<<" },
353
        { 187, ">>" },
354
        { 188, "1/4" },
355
        { 189, "1/2" },
356
        { 190, "3/4" },
357
        { 192, "A" },
358
        { 193, "A" },
359
        { 194, "A" },
360
        { 195, "A" },
361
        { 196, "A" },
362
        { 197, "A" },
363
        { 198, "AE" },
364
        { 199, "C" },
365
        { 200, "E" },
366
        { 201, "E" },
367
        { 202, "E" },
368
        { 203, "E" },
369
        { 204, "I" },
370
        { 205, "I" },
371
        { 206, "I" },
372
        { 207, "I" },
373
        { 209, "N" },
374
        { 210, "O" },
375
        { 211, "O" },
376
        { 212, "O" },
377
        { 213, "O" },
378
        { 214, "O" },
379
        { 215, "x" },
380
        { 216, "O" },
381
        { 217, "U" },
382
        { 218, "U" },
383
        { 219, "U" },
384
        { 220, "U" },
385
        { 221, "Y" },
386
        { 223, "ss" },
387
        { 224, "a" },
388
        { 225, "a" },
389
        { 226, "a" },
390
        { 227, "a" },
391
        { 228, "a" },
392
        { 229, "a" },
393
        { 230, "ae" },
394
        { 231, "c" },
395
        { 232, "e" },
396
        { 233, "e" },
397
        { 234, "e" },
398
        { 235, "e" },
399
        { 236, "i" },
400
        { 237, "i" },
401
        { 238, "i" },
402
        { 239, "i" },
403
        { 241, "n" },
404
        { 242, "o" },
405
        { 243, "o" },
406
        { 244, "o" },
407
        { 245, "o" },
408
        { 246, "o" },
409
        { 247, "/" },
410
        { 248, "o" },
411
        { 249, "u" },
412
        { 250, "u" },
413
        { 251, "u" },
414
        { 252, "u" },
415
        { 253, "y" },
416
        { 255, "y" },
417
        { 710, "^" },
418
        { 732, "~" },
419
420
        /* math symbols */
421
        { 402, "f" },
422
423
        /* other symbols */
424
        { 8226, "*" },
425
426
        /* arrows */
427
        { 8592, "<-" },
428
        { 8594, "->" },
429
430
        /* several capital Greek letters look a lot like Roman letters */
431
        { 913, "A" },
432
        { 914, "B" },
433
        { 918, "Z" },
434
        { 919, "H" },
435
        { 921, "I" },
436
        { 922, "K" },
437
        { 924, "M" },
438
        { 925, "N" },
439
        { 927, "O" },
440
        { 929, "P" },
441
        { 932, "T" },
442
        { 933, "Y" },
443
        { 935, "X" },
444
445
        /* Latin-2 accented characters */
446
        { 260, "A" },
447
        { 321, "L" },
448
        { 317, "L" },
449
        { 346, "S" },
450
        { 350, "S" },
451
        { 356, "T" },
452
        { 377, "Z" },
453
        { 381, "Z" },
454
        { 379, "Z" },
455
        { 261, "a" },
456
        { 731, "o" },
457
        { 322, "l" },
458
        { 318, "l" },
459
        { 347, "s" },
460
        { 351, "s" },
461
        { 357, "t" },
462
        { 378, "z" },
463
        { 733, "\"" },
464
        { 382, "z" },
465
        { 380, "z" },
466
        { 340, "R" },
467
        { 258, "A" },
468
        { 313, "L" },
469
        { 262, "C" },
470
        { 268, "C" },
471
        { 280, "E" },
472
        { 282, "E" },
473
        { 270, "D" },
474
        { 272, "D" },
475
        { 323, "N" },
476
        { 327, "N" },
477
        { 336, "O" },
478
        { 344, "R" },
479
        { 366, "U" },
480
        { 368, "U" },
481
        { 354, "T" },
482
        { 341, "r" },
483
        { 259, "a" },
484
        { 314, "l" },
485
        { 263, "c" },
486
        { 269, "c" },
487
        { 281, "e" },
488
        { 283, "e" },
489
        { 271, "d" },
490
        { 273, "d" },
491
        { 324, "n" },
492
        { 328, "n" },
493
        { 337, "o" },
494
        { 345, "r" },
495
        { 367, "u" },
496
        { 369, "u" },
497
        { 355, "t" },
498
        { 0, { 0 } }
499
    };
500
501
    /* determine how much space we'll need in the translation array */
502
    for (p = ascii_mapping, siz = 0, exp_siz = 0 ; p->uni != 0 ; ++p)
503
    {
504
        /* we need space for this mapping string, plus a length prefix byte */
505
        siz += strlen(p->asc) + 1;
506
507
        /* 
508
         *   if this is a multi-character expansion, count it in the
509
         *   expansion array size 
510
         */
511
        if (strlen(p->asc) > 1)
512
            exp_siz += strlen(p->asc) + 1;
513
    }
514
515
    /* add in space for the default entry */
516
    siz += 2;
517
518
    /* allocate the translation array */
519
    xlat_array_ = (unsigned char *)t3malloc(siz);
520
521
    /* 
522
     *   allocate the expansion array; allocate one extra entry for the null
523
     *   mapping at index zero 
524
     */
525
    exp_array_ = (wchar_t *)t3malloc((exp_siz + 1) * sizeof(wchar_t));
526
527
    /* 
528
     *   start at element 1 of the expansion array (element zero is reserved
529
     *   to indicate the null mapping) 
530
     */
531
    dst = xlat_array_;
532
    exp_dst = exp_array_ + 1;
533
534
    /* 
535
     *   Add the zeroeth entry, which serves as the default mapping for
536
     *   characters that aren't otherwise mappable.  
537
     */
538
    set_mapping(0, 0);
539
    *dst++ = 1;
540
    *dst++ = '?';
541
542
    /* set up the arrays */
543
    for (p = ascii_mapping ; p->uni != 0 ; ++p)
544
    {
545
        size_t len;
546
547
        /* set the mapping's offset in the translation array */
548
        set_mapping(p->uni, dst - xlat_array_);
549
550
        /* get the length of this mapping */
551
        len = strlen(p->asc);
552
553
        /* set this mapping's length */
554
        *dst++ = (unsigned char)len;
555
556
        /* copy the mapping */
557
        memcpy(dst, p->asc, len);
558
559
        /* move past the mapping in the translation array */
560
        dst += len;
561
562
        /* add the expansion mapping if necessary */
563
        if (len > 1)
564
        {
565
            size_t i;
566
567
            /* add an expansion mapping */
568
            set_exp_mapping(p->uni, exp_dst - exp_array_);
569
570
            /* set the length prefix */
571
            *exp_dst++ = (wchar_t)len;
572
573
            /* add the mapping */
574
            for (i = 0 ; i < len ; ++i)
575
                *exp_dst++ = (wchar_t)p->asc[i];
576
        }
577
    }
578
}
579
580
/* ------------------------------------------------------------------------ */
581
/*
582
 *   Special built-in mapper to ISO-8859-1.  Because of the widespread use
583
 *   of this character set, we make this mapping available even when no
584
 *   external mapping file is available.  
585
 */
586
587
/*
588
 *   create an 8859-1 mapper
589
 */
590
CCharmapToLocal8859_1::CCharmapToLocal8859_1()
591
{
592
    unsigned char *dst;
593
    size_t siz;
594
    wchar_t c;
595
596
    /* 
597
     *   Determine how much space we'll need in the translation array - we
598
     *   need one byte for each character, plus one byte for the length of
599
     *   each character.  We also need two bytes for the default entry.  
600
     */
601
    siz = 256 + 256 + 2;
602
603
    /* allocate the mapping */
604
    xlat_array_ = (unsigned char *)t3malloc(siz);
605
606
    /* start at the start of the array */
607
    dst = xlat_array_;
608
609
    /* 
610
     *   Add the zeroeth entry, which serves as the default mapping for
611
     *   characters that aren't otherwise mappable.  
612
     */
613
    set_mapping(0, 0);
614
    *dst++ = 1;
615
    *dst++ = '?';
616
617
    /* 
618
     *   Set up the mappings - this is easy because each Unicode code point
619
     *   from 0 to 255 maps to the same ISO 8859-1 code point.  
620
     */
621
    for (c = 0 ; c < 256 ; ++c)
622
    {
623
        /* set the mapping's offset in the translation array */
624
        set_mapping(c, dst - xlat_array_);
625
626
        /* store the length (always 1) and translated character */
627
        *dst++ = 1;
628
        *dst++ = (unsigned char)c;
629
    }
630
}
631
632
633
/* ------------------------------------------------------------------------ */
634
/*
635
 *   Character mapping for Unicode to Local 
636
 */
637
638
/*
639
 *   create the translator 
640
 */
641
CCharmapToLocal::CCharmapToLocal()
642
{
643
    /* no mapping sub-tables yet */
644
    memset(map_, 0, sizeof(map_));
645
    memset(exp_map_, 0, sizeof(exp_map_));
646
647
    /* no translation or expansion arrays yet */
648
    xlat_array_ = 0;
649
    exp_array_ = 0;
650
}
651
652
/*
653
 *   delete the translator 
654
 */
655
CCharmapToLocal::~CCharmapToLocal()
656
{
657
    size_t i;
658
659
    /* delete the translation array */
660
    if (xlat_array_ != 0)
661
        t3free(xlat_array_);
662
663
    /* delete the expansion array */
664
    if (exp_array_ != 0)
665
        t3free(exp_array_);
666
667
    /* delete any mapping tables we've allocated */
668
    for (i = 0 ; i < sizeof(map_)/sizeof(map_[0]) ; ++i)
669
    {
670
        /* delete this mapping if allocated */
671
        if (map_[i] != 0)
672
            t3free(map_[i]);
673
    }
674
675
    /* delete any expansion mapping tables */
676
    for (i = 0 ; i < sizeof(exp_map_)/sizeof(exp_map_[0]) ; ++i)
677
    {
678
        /* delete this expansion mapping if allocated */
679
        if (exp_map_[i] != 0)
680
            t3free(exp_map_[i]);
681
    }
682
}
683
684
/*
685
 *   Set a mapping 
686
 */
687
void CCharmapToLocal::set_mapping(wchar_t unicode_char,
688
                                  unsigned int xlat_offset)
689
{
690
    int master_idx;
691
    
692
    /* get the master table index for this unicode character */
693
    master_idx = (int)((unicode_char >> 8) & 0xff);
694
    
695
    /* if there's no sub-table here yet, create one */
696
    if (map_[master_idx] == 0)
697
    {
698
        int i;
699
        
700
        /* allocate it */
701
        map_[master_idx] =
702
            (unsigned int *)t3malloc(256 * sizeof(unsigned int));
703
        
704
        /* 
705
         *   Set each entry to the default character, so that it will
706
         *   produce valid results if no mapping is ever specified for the
707
         *   character.  The default character is always at offset zero in
708
         *   the translation array.  
709
         */
710
        for (i = 0 ; i < 256 ; ++i)
711
            map_[master_idx][i] = 0;
712
    }
713
    
714
    /* set the mapping for the character's entry in the sub-table */
715
    map_[master_idx][unicode_char & 0xff] = xlat_offset;
716
}
717
718
/*
719
 *   Set an expansion mapping 
720
 */
721
void CCharmapToLocal::set_exp_mapping(wchar_t unicode_char,
722
                                      unsigned int exp_offset)
723
{
724
    int master_idx;
725
726
    /* get the master table index for this unicode character */
727
    master_idx = (int)((unicode_char >> 8) & 0xff);
728
729
    /* if there's no sub-table here yet, create one */
730
    if (exp_map_[master_idx] == 0)
731
    {
732
        int i;
733
734
        /* allocate it */
735
        exp_map_[master_idx] =
736
            (unsigned int *)t3malloc(256 * sizeof(unsigned int));
737
738
        /* 
739
         *   Set each entry to the default character, so that it will produce
740
         *   valid results if no mapping is ever specified for the character.
741
         *   The default character is always at offset zero in the expansion
742
         *   array.  
743
         */
744
        for (i = 0 ; i < 256 ; ++i)
745
            exp_map_[master_idx][i] = 0;
746
    }
747
748
    /* set the mapping for the character's entry in the sub-table */
749
    exp_map_[master_idx][unicode_char & 0xff] = exp_offset;
750
}
751
752
/*
753
 *   Map a UTF-8 string of known byte length to the local character set
754
 */
755
size_t CCharmapToLocal::map_utf8(char *dest, size_t dest_len,
756
                                 utf8_ptr src, size_t src_byte_len,
757
                                 size_t *src_bytes_used) const
758
{
759
    utf8_ptr src_start;
760
    size_t cur_total;
761
    char *srcend;
762
        
763
    /* remember where we started */
764
    src_start = src;
765
766
    /* compute where the source buffer ends */
767
    srcend = src.getptr() + src_byte_len;
768
    
769
    /* copy characters until we reach the end of the source string */
770
    for (cur_total = 0 ; src.getptr() < srcend ; src.inc())
771
    {
772
        char mapbuf[10];
773
        size_t maplen = sizeof(mapbuf);
774
        char *mapp = mapbuf;
775
776
        /* map this character */
777
        maplen = map(src.getch(), &mapp, &maplen);
778
779
        /* determine how to store the character */
780
        if (dest == 0)
781
        {
782
            /* we're just counting */
783
        }
784
        else if (dest_len >= maplen)
785
        {
786
            /* we have room for it - add it in */
787
            memcpy(dest, mapbuf, maplen);
788
789
            /* advance past it */
790
            dest += maplen;
791
            dest_len -= maplen;
792
        }
793
        else
794
        {
795
            /* there's no more room - stop now */
796
            break;
797
        }
798
799
        /* add this into the total */
800
        cur_total += maplen;
801
    }
802
803
    /* if the caller wants to know how much space we used, tell them */
804
    if (src_bytes_used != 0)
805
        *src_bytes_used = src.getptr() - src_start.getptr();
806
807
    /* return the total length of the result */
808
    return cur_total;
809
}
810
811
/*
812
 *   Map a null-terminated UTF-8 string to the local character set
813
 */
814
size_t CCharmapToLocal::map_utf8z(char *dest, size_t dest_len,
815
                                  utf8_ptr src) const
816
{
817
    size_t cur_total;
818
    
819
    /* copy characters until we find the terminating null */
820
    for (cur_total = 0 ; src.getch() != 0 ; src.inc())
821
    {
822
        /* 
823
         *   map this character into the output, if it will fit, but in
824
         *   any case count the space it needs in the output 
825
         */
826
        cur_total += map(src.getch(), &dest, &dest_len);
827
    }
828
829
    /* 
830
     *   add a null terminator if there's room, but don't count it in the
831
     *   result length 
832
     */
833
    map(0, &dest, &dest_len);
834
    
835
    /* return the total length of the result */
836
    return cur_total;
837
}
838
839
/*
840
 *   Map a null-terminated UTF-8 string to the local character set, escaping
841
 *   characters that aren't part of the local character set.  
842
 */
843
size_t CCharmapToLocal::map_utf8z_esc(
844
    char *dest, size_t dest_len, utf8_ptr src,
845
    size_t (*esc_fn)(wchar_t, char **, size_t *)) const
846
{
847
    size_t cur_total;
848
849
    /* copy characters until we find the terminating null */
850
    for (cur_total = 0 ; src.getch() != 0 ; src.inc())
851
    {
852
        wchar_t ch = src.getch();
853
        
854
        /* if this character is mappable, map it; otherwise, escape it */
855
        if (is_mappable(src.getch()))
856
        {
857
            /* map the character */
858
            cur_total += map(ch, &dest, &dest_len);
859
        }
860
        else
861
        {
862
            /* we can't map it, so let the escape callback handle it */
863
            cur_total += (*esc_fn)(ch, &dest, &dest_len);
864
        }
865
    }
866
867
    /* 
868
     *   add a null terminator if there's room, but don't count it in the
869
     *   result length 
870
     */
871
    map(0, &dest, &dest_len);
872
873
    /* return the total length of the result */
874
    return cur_total;
875
}
876
877
/*
878
 *   Escape callback for map_utf8z_esc() - prepares source-code-style
879
 *   'backslash' escape sequences for unmappable characters.  
880
 */
881
size_t CCharmapToLocal::source_esc_cb(wchar_t ch, char **dest, size_t *len)
882
{
883
    char buf[7];
884
    size_t copylen;
885
    
886
    /* prepare our own representation */
887
    sprintf(buf, "\\u%04x", (unsigned int)ch);
888
889
    /* copy the whole thing if possible, but limit to the available space */
890
    copylen = 6;
891
    if (copylen > *len)
892
        copylen = *len;
893
894
    /* copy the bytes */
895
    memcpy(*dest, buf, copylen);
896
897
    /* advance the buffer pointers */
898
    *dest += copylen;
899
    *len -= copylen;
900
901
    /* return the full space needed */
902
    return 6;
903
}
904
905
/*
906
 *   Map to UTF8 
907
 */
908
size_t CCharmapToLocal::map_utf8(char *dest, size_t dest_len,
909
                                 const char *src, size_t src_byte_len,
910
                                 size_t *src_bytes_used) const
911
{
912
    utf8_ptr src_ptr;
913
914
    /* set up the source UTF-8 pointer */
915
    src_ptr.set((char *)src);
916
917
    /* map it and return the result */
918
    return map_utf8(dest, dest_len, src_ptr, src_byte_len, src_bytes_used);
919
}
920
921
/*
922
 *   Create a mapper and load a mapping file 
923
 */
924
CCharmapToLocal *CCharmapToLocal::load(CResLoader *res_loader,
925
                                       const char *table_name)
926
{
927
    osfildef *fp;
928
    CCharmapToLocal *mapper;
929
    charmap_type_t map_type;
930
931
    /* if they want a trivial UTF-8 translator, return one */
932
    if (stricmp(table_name, "utf-8") == 0
933
        || stricmp(table_name, "utf8") == 0)
934
        return new CCharmapToLocalUTF8();
935
936
    /* if they want a Unicode 16-bit encoding, return one */
937
    if (stricmp(table_name, "utf-16le") == 0
938
        || stricmp(table_name, "unicodel") == 0)
939
        return new CCharmapToLocalUcs2Little();
940
    if (stricmp(table_name, "utf-16be") == 0
941
        || stricmp(table_name, "unicodeb") == 0)
942
        return new CCharmapToLocalUcs2Big();
943
944
    /* presume failure */
945
    mapper = 0;
946
947
    /* open and characterize the mapping file */
948
    fp = open_map_file(res_loader, table_name, &map_type);
949
950
    /* check to make sure we opened the file */
951
    if (fp == 0)
952
    {
953
        /* if they want a plain ASCII translator, return a default one */
954
        if (name_is_ascii_synonym(table_name))
955
            return new CCharmapToLocalASCII();
956
957
        /* if they want a plain ISO-8859-1 translator, return a default one */
958
        if (name_is_8859_1_synonym(table_name))
959
            return new CCharmapToLocal8859_1();
960
        
961
        /* return failure */
962
        return 0;
963
    }
964
965
    /* create an appropriate mapper */
966
    switch(map_type)
967
    {
968
    case CHARMAP_TYPE_SB:
969
        /* create a single-byte mapper */
970
        mapper = new CCharmapToLocalSB();
971
        break;
972
973
    case CHARMAP_TYPE_DB:
974
        /* create a double-byte mapper */
975
        mapper = new CCharmapToLocalDB();
976
        break;
977
978
    case CHARMAP_TYPE_MB:
979
        /* create a mixed multi-byte mapper */
980
        mapper = new CCharmapToLocalMB();
981
        break;
982
983
    default:
984
        /* other mapper types are currently unknown */
985
        break;
986
    }
987
988
    /* if we successfully created a mapper, tell it to load the table */
989
    if (mapper != 0)
990
    {
991
        /* load the table */
992
        mapper->load_table(fp);
993
    }
994
995
    /* close the file */
996
    osfcls(fp);
997
998
    /* return the mapper, if any */
999
    return mapper;
1000
}
1001
1002
/*
1003
 *   Load the character set translation table 
1004
 */
1005
void CCharmapToLocal::load_table(osfildef *fp)
1006
{
1007
    ulong startpos;
1008
    ulong ofs;
1009
    uchar buf[256];
1010
    uint cnt;
1011
    ulong xbytes;
1012
    ulong xchars;
1013
    uint next_ofs;
1014
    
1015
    /* note the initial seek position */
1016
    startpos = osfpos(fp);
1017
1018
    /* read the first entry, which gives the offset of the to-local table */
1019
    if (osfrb(fp, buf, 4))
1020
        return;
1021
    ofs = t3rp4u(buf);
1022
1023
    /* seek to the to-local table */
1024
    osfseek(fp, startpos + ofs, OSFSK_SET);
1025
1026
    /* read the number of entries and number of bytes needed */
1027
    if (osfrb(fp, buf, 6))
1028
        return;
1029
    cnt = osrp2(buf);
1030
    xbytes = t3rp4u(buf + 2);
1031
1032
    /* 
1033
     *   Allocate space for the translation table.  Note that we cannot
1034
     *   handle translation tables bigger than the maximum allowed in a
1035
     *   single allocation unit on the operating system. 
1036
     */
1037
    if (xbytes > OSMALMAX)
1038
        return;
1039
    xlat_array_ = (unsigned char *)t3malloc(xbytes);
1040
    if (xlat_array_ == 0)
1041
        return;
1042
1043
    /*
1044
     *   Read each mapping 
1045
     */
1046
    for (next_ofs = 0 ; cnt > 0 ; --cnt)
1047
    {
1048
        wchar_t codept;
1049
        uint xlen;
1050
        
1051
        /* read the code point and translation length */
1052
        if (osfrb(fp, buf, 3))
1053
            return;
1054
1055
        /* decode the code point and translation length */
1056
        codept = osrp2(buf);
1057
        xlen = (unsigned int)buf[2];
1058
1059
        /* assign the mapping */
1060
        set_mapping(codept, next_ofs);
1061
1062
        /* store the translation length */
1063
        xlat_array_[next_ofs++] = buf[2];
1064
1065
        /* read the translation bytes */
1066
        if (osfrb(fp, xlat_array_ + next_ofs, xlen))
1067
            return;
1068
1069
        /* skip past the translation bytes we've read */
1070
        next_ofs += xlen;
1071
    }
1072
1073
    /*
1074
     *   Next, read the expansions, if present.
1075
     *   
1076
     *   If we find the $EOF marker, it means it's an old-format file without
1077
     *   the separate expansion definitions.  Otherwise, we'll have the
1078
     *   expansion entry count and the aggregate number of unicode characters
1079
     *   in all of the expansions.  
1080
     */
1081
    if (osfrb(fp, buf, 6) || memcmp(buf, "$EOF", 4) == 0)
1082
        return;
1083
1084
    /* decode the expansion entry count and aggregate length */
1085
    cnt = osrp2(buf);
1086
    xchars = t3rp4u(buf + 2);
1087
1088
    /* 
1089
     *   add one entry so that we can leave index zero unused, to indicate
1090
     *   unmapped characters 
1091
     */
1092
    ++xchars;
1093
1094
    /* add one array slot per entry, for the length prefix slots */
1095
    xchars += cnt;
1096
1097
    /* allocate space for the expansions */
1098
    exp_array_ = (wchar_t *)t3malloc(xchars * sizeof(wchar_t));
1099
    if (exp_array_ == 0)
1100
        return;
1101
1102
    /* 
1103
     *   read the mappings; start loading them at index 1, since we want to
1104
     *   leave index 0 unused so that it can indicate unused mappings 
1105
     */
1106
    for (next_ofs = 1 ; cnt > 0 ; --cnt)
1107
    {
1108
        wchar_t codept;
1109
        uint xlen;
1110
        size_t i;
1111
1112
        /* read this entry's unicode value and expansion character length */
1113
        if (osfrb(fp, buf, 3))
1114
            return;
1115
1116
        /* decode the code point and expansion length */
1117
        codept = osrp2(buf);
1118
        xlen = (uint)buf[2];
1119
1120
        /* assign the expansion mapping */
1121
        set_exp_mapping(codept, next_ofs);
1122
1123
        /* set the length prefix */
1124
        exp_array_[next_ofs++] = (wchar_t)xlen;
1125
1126
        /* read and store the expansion characters */
1127
        for (i = 0 ; i < xlen ; ++i)
1128
        {
1129
            /* read this translation */
1130
            if (osfrb(fp, buf, 2))
1131
                return;
1132
1133
            /* decode and store this translation */
1134
            exp_array_[next_ofs++] = osrp2(buf);
1135
        }
1136
    }
1137
}
1138
1139
/*
1140
 *   Write to a file 
1141
 */
1142
int CCharmapToLocal::write_file(osfildef *fp, const char *buf, size_t bufl)
1143
{
1144
    utf8_ptr p;
1145
1146
    /* set up to read from the buffer */
1147
    p.set((char *)buf);
1148
    
1149
    /* map and write one buffer-full at a time */
1150
    while (bufl > 0)
1151
    {
1152
        char conv_buf[256];
1153
        size_t conv_len;
1154
        size_t used_src_len;
1155
1156
        /* map as much as we can fit into our buffer */
1157
        conv_len = map_utf8(conv_buf, sizeof(conv_buf), p, bufl,
1158
                            &used_src_len);
1159
1160
        /* write out this chunk */
1161
        if (osfwb(fp, conv_buf, conv_len))
1162
            return 1;
1163
1164
        /* advance past this chunk in the input */
1165
        p.set(p.getptr() + used_src_len);
1166
        bufl -= used_src_len;
1167
    }
1168
1169
    /* no errors */
1170
    return 0;
1171
}
1172
1173
1174
/* ------------------------------------------------------------------------ */
1175
/*
1176
 *   Character mapper - trivial UTF8-to-UTF8 conversion
1177
 */
1178
1179
/*
1180
 *   map a character 
1181
 */
1182
size_t CCharmapToLocalUTF8::map(wchar_t unicode_char, char **output_ptr,
1183
                                size_t *output_len) const
1184
{
1185
    size_t map_len;
1186
    
1187
    /* get the character size */
1188
    map_len = utf8_ptr::s_wchar_size(unicode_char);
1189
    
1190
    /* if we don't have room for one more character, abort */
1191
    if (*output_len < map_len)
1192
    {
1193
        *output_len = 0;
1194
        return map_len;
1195
    }
1196
1197
    /* store the mapping */
1198
    utf8_ptr::s_putch(*output_ptr, unicode_char);
1199
1200
    /* increment the pointer by the number of characters we copied */
1201
    *output_ptr += map_len;
1202
1203
    /* adjust the remaining output length */
1204
    *output_len -= map_len;
1205
1206
    /* return the size of the result */
1207
    return map_len;
1208
}
1209
1210
/*
1211
 *   Map a UTF-8 string of known byte length
1212
 */
1213
size_t CCharmapToLocalUTF8::map_utf8(char *dest, size_t dest_len,
1214
                                     utf8_ptr src, size_t src_byte_len,
1215
                                     size_t *src_bytes_used) const
1216
{
1217
    size_t copy_len;
1218
1219
    /* 
1220
     *   if they didn't give us a destination buffer, tell them how much
1221
     *   space is needed for the copy - this is identical to the length of
1222
     *   the source string since we make no changes to it 
1223
     */
1224
    if (dest == 0)
1225
    {
1226
        *src_bytes_used = 0;
1227
        return src_byte_len;
1228
    }
1229
1230
    /* copy as much as we can, up to the output buffer length */
1231
    copy_len = src_byte_len;
1232
    if (copy_len > dest_len)
1233
        copy_len = dest_len;
1234
1235
    /* 
1236
     *   if the last byte we'd copy is a continuation byte, don't copy it
1237
     *   so that we keep whole characters intact 
1238
     */
1239
    if (copy_len > 0
1240
        && utf8_ptr::s_is_continuation(src.getptr() + copy_len - 1))
1241
    {
1242
        /* don't copy this byte */
1243
        --copy_len;
1244
1245
        /* 
1246
         *   check the previous byte as well, since a given character can
1247
         *   be up to three bytes long (hence we might have two
1248
         *   continuation bytes) 
1249
         */
1250
        if (copy_len > 0
1251
            && utf8_ptr::s_is_continuation(src.getptr() + copy_len - 1))
1252
            --copy_len;
1253
    }
1254
1255
    /* if we have an output buffer, copy the data */
1256
    if (dest != 0)
1257
        memcpy(dest, src.getptr(), copy_len);
1258
1259
    /* set the amount we copied, if the caller is interested */
1260
    if (src_bytes_used != 0)
1261
        *src_bytes_used = copy_len;
1262
1263
    /* return the number of bytes we put in the destination buffer */
1264
    return copy_len;
1265
}
1266
1267
/*
1268
 *   Map a null-terminated UTF-8 string
1269
 */
1270
size_t CCharmapToLocalUTF8::map_utf8z(char *dest, size_t dest_len,
1271
                                      utf8_ptr src) const
1272
{
1273
    size_t src_len;
1274
1275
    /* get the source length */
1276
    src_len = strlen(src.getptr());
1277
1278
    /* copy the bytes */
1279
    map_utf8(dest, dest_len, src, src_len, 0);
1280
1281
    /* 
1282
     *   if there's room for the null terminator (which takes up just one
1283
     *   byte in UTF-8), add it 
1284
     */
1285
    if (dest_len > src_len)
1286
        *(dest + src_len) = '\0';
1287
1288
    /* 
1289
     *   return the amount of space needed to copy the whole string --
1290
     *   this is identical to the source length, since we don't make any
1291
     *   changes to it 
1292
     */
1293
    return src_len;
1294
}
1295
1296
1297
/* ------------------------------------------------------------------------ */
1298
/*
1299
 *   Character mapper - Unicode to Single-byte 
1300
 */
1301
1302
/*
1303
 *   map a character 
1304
 */
1305
size_t CCharmapToLocalSB::map(wchar_t unicode_char, char **output_ptr,
1306
                              size_t *output_len) const
1307
{
1308
    const unsigned char *mapping;
1309
    size_t map_len;
1310
    
1311
    /* get the mapping */
1312
    mapping = get_xlation(unicode_char, &map_len);
1313
    
1314
    /* if we don't have room for one more character, abort */
1315
    if (*output_len < map_len)
1316
    {
1317
        *output_len = 0;
1318
        return map_len;
1319
    }
1320
    
1321
    /* copy the mapping */
1322
    memcpy(*output_ptr, mapping, map_len);
1323
    
1324
    /* increment the pointer by the number of characters we copied */
1325
    *output_ptr += map_len;
1326
1327
    /* adjust the remaining output length */
1328
    *output_len -= map_len;
1329
    
1330
    /* return the size of the result */
1331
    return map_len;
1332
}
1333
1334
/*
1335
 *   Map a UTF-8 string of known byte length to the local character set 
1336
 */
1337
size_t CCharmapToLocalSB::map_utf8(char *dest, size_t dest_len,
1338
                                   utf8_ptr src, size_t src_byte_len,
1339
                                   size_t *src_bytes_used) const
1340
{
1341
    utf8_ptr src_start;
1342
    size_t cur_total;
1343
    char *srcend;
1344
1345
    /* remember where we started */
1346
    src_start = src;
1347
1348
    /* compute where the source buffer ends */
1349
    srcend = src.getptr() + src_byte_len;
1350
1351
    /* copy characters until we reach the end of the source string */
1352
    for (cur_total = 0 ; src.getptr() < srcend ; src.inc())
1353
    {
1354
        const unsigned char *mapping;
1355
        size_t map_len;
1356
        
1357
        /* get the mapping for this character */
1358
        mapping = get_xlation(src.getch(), &map_len);
1359
1360
        /* 
1361
         *   if we have room, add it; otherwise, zero the output length
1362
         *   remaining so we don't try to add anything more 
1363
         */
1364
        if (dest == 0)
1365
        {
1366
            /* we're just counting */
1367
        }
1368
        else if (map_len <= dest_len)
1369
        {
1370
            /* add the sequence */
1371
            memcpy(dest, mapping, map_len);
1372
1373
            /* adjust the output pointer and length remaining */
1374
            dest += map_len;
1375
            dest_len -= map_len;
1376
        }
1377
        else
1378
        {
1379
            /* it doesn't fit - stop now */
1380
            break;
1381
        }
1382
1383
        /* count the length in the total */
1384
        cur_total += map_len;
1385
    }
1386
1387
    /* if the caller wants to know how much space we used, tell them */
1388
    if (src_bytes_used != 0)
1389
        *src_bytes_used = src.getptr() - src_start.getptr();
1390
1391
    /* return the total length of the result */
1392
    return cur_total;
1393
}
1394
1395
/*
1396
 *   Map a null-terminated UTF-8 string to the local character set 
1397
 */
1398
size_t CCharmapToLocalSB::map_utf8z(char *dest, size_t dest_len,
1399
                                    utf8_ptr src) const
1400
{
1401
    size_t cur_total;
1402
1403
    /* copy characters until we find the terminating null */
1404
    for (cur_total = 0 ; src.getch() != 0 ; src.inc())
1405
    {
1406
        const unsigned char *mapping;
1407
        size_t map_len;
1408
1409
        /* get the mapping for this character */
1410
        mapping = get_xlation(src.getch(), &map_len);
1411
1412
        /* 
1413
         *   if we have room, add it; otherwise, zero the output length
1414
         *   remaining so we don't try to add anything more 
1415
         */
1416
        if (map_len <= dest_len)
1417
        {
1418
            /* add the sequence */
1419
            memcpy(dest, mapping, map_len);
1420
1421
            /* adjust the output pointer and length remaining */
1422
            dest += map_len;
1423
            dest_len -= map_len;
1424
        }
1425
        else
1426
        {
1427
            /* it doesn't fit - zero the output length remaining */
1428
            dest_len = 0;
1429
        }
1430
1431
        /* count the length in the total */
1432
        cur_total += map_len;
1433
    }
1434
1435
    /* 
1436
     *   add a null terminator, if there's room, but don't count it in the
1437
     *   output length 
1438
     */
1439
    if (dest_len > 0)
1440
        *dest = '\0';
1441
    
1442
    /* return the total length of the result */
1443
    return cur_total;
1444
}
1445
1446
1447
/* ------------------------------------------------------------------------ */
1448
/*
1449
 *   Character mapper - Unicode to 16-bit Wide Unicode local character set 
1450
 */
1451
1452
/*
1453
 *   map a character 
1454
 */
1455
size_t CCharmapToLocalWideUnicode::map(wchar_t unicode_char,
1456
                                       char **output_ptr,
1457
                                       size_t *output_len) const
1458
{
1459
    /* if we don't have room for another wchar_t, abort */
1460
    if (*output_len < sizeof(wchar_t))
1461
    {
1462
        *output_len = 0;
1463
        return sizeof(wchar_t);
1464
    }
1465
    
1466
    /* 
1467
     *   Set the wide character to the unicode value, with no translation
1468
     *   - unicode is the same everywhere.
1469
     *   
1470
     *   Note that the need to perform this trivial translation for this
1471
     *   character set is a secondary reason that this routine is virtual
1472
     *   (the primary reason is to handle the default ASCII translation).  
1473
     */
1474
    **(wchar_t **)output_ptr = unicode_char;
1475
    
1476
    /* increment the pointer by the size of a wide character */
1477
    ++(*(wchar_t **)output_ptr);
1478
    
1479
    /* return the size of the result */
1480
    return sizeof(wchar_t);
1481
}
1482
1483
/*
1484
 *   Map a UTF-8 string of known byte length to the local character set 
1485
 */
1486
size_t CCharmapToLocalWideUnicode::
1487
   map_utf8(char *dest, size_t dest_len,
1488
            utf8_ptr src, size_t src_byte_len,
1489
            size_t *src_bytes_used) const
1490
{
1491
    utf8_ptr src_start;
1492
    size_t cur_total;
1493
    char *srcend;
1494
    wchar_t *destw;
1495
1496
    /* remember where we started */
1497
    src_start = src;
1498
1499
    /* compute where the source buffer ends */
1500
    srcend = src.getptr() + src_byte_len;
1501
1502
    /* set up a wchar_t output pointer for convenience */
1503
    destw = (wchar_t *)dest;
1504
1505
    /* copy characters until we reach the end of the source string */
1506
    for (cur_total = 0 ; src.getptr() < srcend ; src.inc())
1507
    {
1508
        /* 
1509
         *   if we have room, add it; otherwise, zero the output length
1510
         *   remaining so we don't try to add anything more 
1511
         */
1512
        if (dest == 0)
1513
        {
1514
            /* we're just counting - don't store anything */
1515
        }
1516
        else if (dest_len >= sizeof(wchar_t))
1517
        {
1518
            /* add the sequence */
1519
            *destw++ = src.getch();
1520
1521
            /* adjust the length remaining */
1522
            dest_len -= sizeof(wchar_t);
1523
        }
1524
        else
1525
        {
1526
            /* it doesn't fit - stop now */
1527
            break;
1528
        }
1529
1530
        /* count the length in the total */
1531
        cur_total += sizeof(wchar_t);
1532
    }
1533
1534
    /* if the caller wants to know how much space we used, tell them */
1535
    if (src_bytes_used != 0)
1536
        *src_bytes_used = src.getptr() - src_start.getptr();
1537
1538
    /* return the total length of the result */
1539
    return cur_total;
1540
}
1541
1542
/*
1543
 *   Map a null-terminated UTF-8 string to the local character set 
1544
 */
1545
size_t CCharmapToLocalWideUnicode::
1546
   map_utf8z(char *dest, size_t dest_len, utf8_ptr src) const
1547
{
1548
    size_t cur_total;
1549
    wchar_t *destw;
1550
1551
    /* set up a wchar_t output pointer for convenience */
1552
    destw = (wchar_t *)dest;
1553
1554
    /* copy characters until we find the terminating null */
1555
    for (cur_total = 0 ; src.getch() != 0 ; src.inc())
1556
    {
1557
        /* 
1558
         *   if we have room, add it; otherwise, zero the output length
1559
         *   remaining so we don't try to add anything more 
1560
         */
1561
        if (dest_len >= sizeof(wchar_t))
1562
        {
1563
            /* add the sequence */
1564
            *destw++ = src.getch();
1565
1566
            /* adjust the length remaining */
1567
            dest_len -= sizeof(wchar_t);
1568
        }
1569
        else
1570
        {
1571
            /* it doesn't fit - zero the output length remaining */
1572
            dest_len = 0;
1573
        }
1574
1575
        /* count the length in the total */
1576
        cur_total += sizeof(wchar_t);
1577
    }
1578
1579
    /* 
1580
     *   if there's room for a null terminator character (not byte - we need
1581
     *   to add an entire wide character), add it, but don't count it in the
1582
     *   return length 
1583
     */
1584
    if (dest_len >= sizeof(wchar_t))
1585
        *destw = '\0';
1586
    
1587
    /* return the total length of the result */
1588
    return cur_total;
1589
}
1590
1591
1592
/* ------------------------------------------------------------------------ */
1593
/*
1594
 *   Character mapper for 16-bit Wide Unicode, big-endian.  Stores the
1595
 *   characters in big-endian UCS-2 representation.  
1596
 */
1597
size_t CCharmapToLocalUcs2Big::map(wchar_t unicode_char, char **output_ptr,
1598
                                   size_t *output_len) const
1599
{
1600
    /* 
1601
     *   If we don't have room for another byte pair, abort.  Note that we
1602
     *   really do want to store exactly two bytes, not sizeof(anything),
1603
     *   since we're storing to the UCS-2 file format, which encodes each
1604
     *   character in two bytes.  
1605
     */
1606
    if (*output_len < 2)
1607
    {
1608
        *output_len = 0;
1609
        return 2;
1610
    }
1611
1612
    /*
1613
     *   Store the big-endian 16-bit value with no translation - unicode
1614
     *   is the same everywhere.
1615
     *   
1616
     *   Note that the need to perform this trivial translation for this
1617
     *   character set is a secondary reason that this routine is virtual
1618
     *   (the primary reason is to handle the default ASCII translation).
1619
     *   
1620
     *   Store the high-order 8 bits in the first byte, and the low-order
1621
     *   8 bits in the second byte.  
1622
     */
1623
    **output_ptr = ((unicode_char >> 8) & 0xff);
1624
    *(*output_ptr + 1) = (unicode_char & 0xff);
1625
1626
    /* skip two bytes in the output */
1627
    *output_ptr += 2;
1628
    *output_len -= 2;
1629
1630
    /* return the size of the result */
1631
    return 2;
1632
}
1633
1634
/*
1635
 *   Map a UTF-8 string of known byte length to the local character set 
1636
 */
1637
size_t CCharmapToLocalUcs2Big::
1638
   map_utf8(char *dest, size_t dest_len,
1639
            utf8_ptr src, size_t src_byte_len,
1640
            size_t *src_bytes_used) const
1641
{
1642
    utf8_ptr src_start;
1643
    size_t cur_total;
1644
    char *srcend;
1645
1646
    /* remember where we started */
1647
    src_start = src;
1648
1649
    /* compute where the source buffer ends */
1650
    srcend = src.getptr() + src_byte_len;
1651
1652
    /* copy characters until we reach the end of the source string */
1653
    for (cur_total = 0 ; src.getptr() < srcend ; src.inc())
1654
    {
1655
        /* 
1656
         *   if we have room, add it; otherwise, zero the output length
1657
         *   remaining so we don't try to add anything more 
1658
         */
1659
        if (dest == 0)
1660
        {
1661
            /* we're not storing anything */
1662
        }
1663
        else if (dest_len >= 2)
1664
        {
1665
            wchar_t unicode_char;
1666
1667
            /* get the current character */
1668
            unicode_char = src.getch();
1669
1670
            /* add the sequence */
1671
            *dest++ = ((unicode_char >> 8) & 0xff);
1672
            *dest++ = (unicode_char & 0xff);
1673
1674
            /* adjust the length remaining */
1675
            dest_len -= 2;
1676
        }
1677
        else
1678
        {
1679
            /* it doesn't fit - stop now */
1680
            break;
1681
        }
1682
1683
        /* count the length in the total */
1684
        cur_total += 2;
1685
    }
1686
1687
    /* if the caller wants to know how much space we used, tell them */
1688
    if (src_bytes_used != 0)
1689
        *src_bytes_used = src.getptr() - src_start.getptr();
1690
1691
    /* return the total length of the result */
1692
    return cur_total;
1693
}
1694
1695
/*
1696
 *   Map a null-terminated UTF-8 string to the local character set 
1697
 */
1698
size_t CCharmapToLocalUcs2Big::
1699
   map_utf8z(char *dest, size_t dest_len, utf8_ptr src) const
1700
{
1701
    size_t cur_total;
1702
1703
    /* copy characters until we find the terminating null */
1704
    for (cur_total = 0 ; src.getch() != 0 ; src.inc())
1705
    {
1706
        /* 
1707
         *   if we have room, add it; otherwise, zero the output length
1708
         *   remaining so we don't try to add anything more 
1709
         */
1710
        if (dest_len >= 2)
1711
        {
1712
            wchar_t unicode_char;
1713
1714
            /* get the current character */
1715
            unicode_char = src.getch();
1716
1717
            /* add the sequence */
1718
            *dest++ = ((unicode_char >> 8) & 0xff);
1719
            *dest++ = (unicode_char & 0xff);
1720
1721
            /* adjust the length remaining */
1722
            dest_len -= 2;
1723
        }
1724
        else
1725
        {
1726
            /* it doesn't fit - zero the output length remaining */
1727
            dest_len = 0;
1728
        }
1729
1730
        /* count the length in the total */
1731
        cur_total += 2;
1732
    }
1733
    
1734
    /* return the total length of the result */
1735
    return cur_total;
1736
}
1737
1738
1739
/* ------------------------------------------------------------------------ */
1740
/*
1741
 *   Character mapper for 16-bit Wide Unicode, little-endian.  Stores the
1742
 *   characters in little-endian UCS-2 representation.  
1743
 */
1744
size_t CCharmapToLocalUcs2Little::map(wchar_t unicode_char,
1745
                                      char **output_ptr,
1746
                                      size_t *output_len) const
1747
{
1748
    /* 
1749
     *   If we don't have room for another byte pair, abort.  Note that we
1750
     *   really do want to store exactly two bytes, not sizeof(anything),
1751
     *   since we're storing to the UCS-2 file format, which encodes each
1752
     *   character in two bytes.  
1753
     */
1754
    if (*output_len < 2)
1755
    {
1756
        *output_len = 0;
1757
        return 2;
1758
    }
1759
1760
    /*
1761
     *   Store the little-endian 16-bit value with no translation -
1762
     *   unicode is the same everywhere.
1763
     *   
1764
     *   Note that the need to perform this trivial translation for this
1765
     *   character set is a secondary reason that this routine is virtual
1766
     *   (the primary reason is to handle the default ASCII translation).
1767
     *   
1768
     *   Store the low-order 8 bits in the first byte, and the high-order
1769
     *   8 bits in the second byte.  
1770
     */
1771
    **output_ptr = (unicode_char & 0xff);
1772
    *(*output_ptr + 1) = ((unicode_char >> 8) & 0xff);
1773
1774
    /* skip two bytes in the output */
1775
    *output_ptr += 2;
1776
    *output_len -= 2;
1777
1778
    /* return the size of the result */
1779
    return 2;
1780
}
1781
1782
/*
1783
 *   Map a UTF-8 string of known byte length to the local character set 
1784
 */
1785
size_t CCharmapToLocalUcs2Little::
1786
   map_utf8(char *dest, size_t dest_len,
1787
            utf8_ptr src, size_t src_byte_len,
1788
            size_t *src_bytes_used) const
1789
{
1790
    utf8_ptr src_start;
1791
    size_t cur_total;
1792
    char *srcend;
1793
1794
    /* remember where we started */
1795
    src_start = src;
1796
1797
    /* compute where the source buffer ends */
1798
    srcend = src.getptr() + src_byte_len;
1799
1800
    /* copy characters until we reach the end of the source string */
1801
    for (cur_total = 0 ; src.getptr() < srcend ; src.inc())
1802
    {
1803
        /* 
1804
         *   if we have room, add it; otherwise, zero the output length
1805
         *   remaining so we don't try to add anything more 
1806
         */
1807
        if (dest == 0)
1808
        {
1809
            /* we're just counting - don't store anything */
1810
        }
1811
        else if (dest_len >= 2)
1812
        {
1813
            wchar_t unicode_char;
1814
1815
            /* get the current character */
1816
            unicode_char = src.getch();
1817
1818
            /* add the sequence */
1819
            *dest++ = (unicode_char & 0xff);
1820
            *dest++ = ((unicode_char >> 8) & 0xff);
1821
1822
            /* adjust the length remaining */
1823
            dest_len -= 2;
1824
        }
1825
        else
1826
        {
1827
            /* it doesn't fit - stop now */
1828
            break;
1829
        }
1830
1831
        /* count the length in the total */
1832
        cur_total += 2;
1833
    }
1834
1835
    /* if the caller wants to know how much space we used, tell them */
1836
    if (src_bytes_used != 0)
1837
        *src_bytes_used = src.getptr() - src_start.getptr();
1838
1839
    /* return the total length of the result */
1840
    return cur_total;
1841
}
1842
1843
/*
1844
 *   Map a null-terminated UTF-8 string to the local character set 
1845
 */
1846
size_t CCharmapToLocalUcs2Little::
1847
   map_utf8z(char *dest, size_t dest_len, utf8_ptr src) const
1848
{
1849
    size_t cur_total;
1850
1851
    /* copy characters until we find the terminating null */
1852
    for (cur_total = 0 ; src.getch() != 0 ; src.inc())
1853
    {
1854
        /* 
1855
         *   if we have room, add it; otherwise, zero the output length
1856
         *   remaining so we don't try to add anything more 
1857
         */
1858
        if (dest_len >= 2)
1859
        {
1860
            wchar_t unicode_char;
1861
1862
            /* get the current character */
1863
            unicode_char = src.getch();
1864
1865
            /* add the sequence */
1866
            *dest++ = (unicode_char & 0xff);
1867
            *dest++ = ((unicode_char >> 8) & 0xff);
1868
1869
            /* adjust the length remaining */
1870
            dest_len -= 2;
1871
        }
1872
        else
1873
        {
1874
            /* it doesn't fit - zero the output length remaining */
1875
            dest_len = 0;
1876
        }
1877
1878
        /* count the length in the total */
1879
        cur_total += 2;
1880
    }
1881
    
1882
    /* 
1883
     *   if there's room for a null terminator character (which takes two
1884
     *   bytes in UCS-2), add it, but don't count it in the return length 
1885
     */
1886
    if (dest_len >= 2)
1887
    {
1888
        *dest++ = '\0';
1889
        *dest++ = '\0';
1890
    }
1891
    
1892
    /* return the total length of the result */
1893
    return cur_total;
1894
}
1895
1896
1897
/* ------------------------------------------------------------------------ */
1898
/*
1899
 *   Character mapper - local to UTF-8 
1900
 */
1901
1902
/*
1903
 *   create an appropriate mapping object for the given mapping file 
1904
 */
1905
CCharmapToUni *CCharmapToUni::load(class CResLoader *res_loader,
1906
                                   const char *table_name)
1907
{
1908
    osfildef *fp;
1909
    CCharmapToUni *mapper;
1910
    charmap_type_t map_type;
1911
1912
    /* if they want a trivial UTF-8 translator, return one */
1913
    if (stricmp(table_name, "utf-8") == 0
1914
        || stricmp(table_name, "utf8") == 0)
1915
        return new CCharmapToUniUTF8();
1916
1917
    /* if they want a 16-bit Unicode mapping, return one */
1918
    if (stricmp(table_name, "utf-16le") == 0
1919
        || stricmp(table_name, "unicodel") == 0)
1920
        return new CCharmapToUniUcs2Little();
1921
    if (stricmp(table_name, "utf-16be") == 0
1922
        || stricmp(table_name, "unicodeb") == 0)
1923
        return new CCharmapToUniUcs2Big();
1924
1925
    /* presume failure */
1926
    mapper = 0;
1927
1928
    /* open and characterize the mapping file */
1929
    fp = open_map_file(res_loader, table_name, &map_type);
1930
1931
    /* check to make sure we opened a file */
1932
    if (fp == 0)
1933
    {
1934
        /* 
1935
         *   if there was no file, and they want a plain ASCII translator,
1936
         *   return a default ASCII translator 
1937
         */
1938
        if (name_is_ascii_synonym(table_name))
1939
            return new CCharmapToUniASCII();
1940
1941
        /* if they want an ISO-8859-1 translator, return a default one */
1942
        if (name_is_8859_1_synonym(table_name))
1943
            return new CCharmapToUni8859_1();
1944
1945
        /* return failure */
1946
        return 0;
1947
    }
1948
1949
    /* create an appropriate mapper */
1950
    switch(map_type)
1951
    {
1952
    case CHARMAP_TYPE_SB:
1953
        /* create a single-byte mapper */
1954
        mapper = new CCharmapToUniSB();
1955
        break;
1956
1957
    case CHARMAP_TYPE_DB:
1958
        /* create a double-byte mapper */
1959
        mapper = new CCharmapToUniDB();
1960
        break;
1961
1962
    case CHARMAP_TYPE_MB:
1963
        /* create a mixed multi-byte mapper */
1964
        mapper = new CCharmapToUniMB();
1965
        break;
1966
1967
    default:
1968
        /* other mapper types are currently unknown */
1969
        break;
1970
    }
1971
1972
    /* if we successfully created a mapper, tell it to load the table */
1973
    if (mapper != 0)
1974
    {
1975
        /* load the table */
1976
        mapper->load_table(fp);
1977
    }
1978
1979
    /* close the file */
1980
    osfcls(fp);
1981
1982
    /* return the mapper, if any */
1983
    return mapper;
1984
}
1985
1986
1987
/*
1988
 *   load a mapping table 
1989
 */
1990
void CCharmapToUni::load_table(osfildef *fp)
1991
{
1992
    uchar buf[256];
1993
    uint entry_cnt;
1994
1995
    /* read the header and the local table header */
1996
    if (osfrb(fp, buf, 6))
1997
        return;
1998
1999
    /* get the local table size from the local table header */
2000
    entry_cnt = osrp2(buf + 4);
2001
2002
    /* read the mappings */
2003
    while (entry_cnt > 0)
2004
    {
2005
        size_t cur;
2006
        const uchar *p;
2007
2008
        /* figure out how many entries we can read this time */
2009
        cur = sizeof(buf)/4;
2010
        if (cur > entry_cnt)
2011
            cur = entry_cnt;
2012
2013
        /* read the entries */
2014
        if (osfrb(fp, buf, cur*4))
2015
            return;
2016
2017
        /* deduct this number from the remaining count */
2018
        entry_cnt -= cur;
2019
2020
        /* scan the entries */
2021
        for (p = buf ; cur > 0 ; p += 4, --cur)
2022
        {
2023
            /* map this entry */
2024
            set_mapping(osrp2(p), osrp2(p+2));
2025
        }
2026
    }
2027
}
2028
2029
/*
2030
 *   Map a null-terminated string into a buffer 
2031
 */
2032
size_t CCharmapToUni::map_str(char *outbuf, size_t outbuflen,
2033
                              const char *input_str)
2034
{
2035
    size_t input_len;
2036
    size_t output_len;
2037
    
2038
    /* get the length of the input string */
2039
    input_len = strlen(input_str);
2040
2041
    /* map the string to the output buffer */
2042
    output_len = map(&outbuf, &outbuflen, input_str, input_len);
2043
2044
    /* if there's space remaining in the output buffer, add the null byte */
2045
    if (outbuflen != 0)
2046
        *outbuf = '\0';
2047
2048
    /* return the number of bytes needed for the conversion */
2049
    return output_len;
2050
}
2051
2052
/* ------------------------------------------------------------------------ */
2053
/*
2054
 *   Basic single-byte character set to UTF-8 mapper 
2055
 */
2056
2057
/*
2058
 *   read from a single-byte file and translate to UTF-8
2059
 */
2060
size_t CCharmapToUniSB_basic::read_file(osfildef *fp,
2061
                                        char *buf, size_t bufl,
2062
                                        unsigned long read_limit)
2063
{
2064
    size_t inlen;
2065
2066
    /* 
2067
     *   Compute how much to read from the file.  The input file is
2068
     *   composed of single-byte characters, so only read up to one third
2069
     *   of the buffer length; this will ensure that we can always fit
2070
     *   what we read into the caller's buffer.  
2071
     */
2072
    inlen = bufl / 3;
2073
2074
    /* in any case, we can't read more than our own buffer size */
2075
    if (inlen > sizeof(inbuf_))
2076
        inlen = sizeof(inbuf_);
2077
2078
    /* limit the read length to the caller's read limit, if appropriate */
2079
    if (read_limit != 0 && inlen > read_limit)
2080
        inlen = (size_t)read_limit;
2081
2082
    /* read from the file */
2083
    inlen = osfrbc(fp, inbuf_, inlen);
2084
2085
    /* 
2086
     *   Map data to the caller's buffer, and return the result.  We're
2087
     *   certain that the data will fit in the caller's buffer: we're
2088
     *   mapping only a third as many characters as we have bytes
2089
     *   available, and each character can take up at most three bytes,
2090
     *   hence the worst case is that we fill the buffer completely.
2091
     *   
2092
     *   On the other hand, we may only fill the buffer to a third of its
2093
     *   capacity, but this is okay too, since we're not required to give
2094
     *   the caller everything they asked for.  
2095
     */
2096
    return map(&buf, &bufl, inbuf_, inlen);
2097
}
2098
2099
2100
/* ------------------------------------------------------------------------ */
2101
/*
2102
 *   Plain ASCII local to UTF-8 mapper 
2103
 */
2104
2105
/*
2106
 *   map a string from the single-byte local character set to UTF-8 
2107
 */
2108
size_t CCharmapToUniASCII::map(char **outp, size_t *outlen,
2109
                               const char *inp, size_t inlen) const
2110
{
2111
    size_t tot_outlen;
2112
2113
    /* we haven't written any characters to the output buffer yet */
2114
    tot_outlen = 0;
2115
2116
    /* scan each character (character == byte) in the input string */
2117
    for ( ; inlen > 0 ; --inlen, ++inp)
2118
    {
2119
        wchar_t uni;
2120
        size_t csiz;
2121
2122
        /* 
2123
         *   map any character outside of the 7-bit range to U+FFFD, the
2124
         *   Unicode REPLACEMENT CHARACTER, which is the standard way to
2125
         *   represent characters that can't be mapped from an incoming
2126
         *   character set 
2127
         */
2128
        if (((unsigned char)*inp) > 127)
2129
            uni = 0xfffd;
2130
        else
2131
            uni = ((wchar_t)(unsigned char)*inp);
2132
2133
        /* get the size of this character */
2134
        csiz = utf8_ptr::s_wchar_size(uni);
2135
2136
        /* add it to the total output lenght */
2137
        tot_outlen += csiz;
2138
2139
        /* if there's room, add it to our output buffer */
2140
        if (*outlen >= csiz)
2141
        {
2142
            /* write it out */
2143
            *outp += utf8_ptr::s_putch(*outp, uni);
2144
2145
            /* deduct it from the remaining output length */
2146
            *outlen -= csiz;
2147
        }
2148
        else
2149
        {
2150
            /* there's no room - set the remaining output length to zero */
2151
            *outlen = 0;
2152
        }
2153
    }
2154
2155
    /* return the total output length */
2156
    return tot_outlen;
2157
}
2158
2159
2160
/* ------------------------------------------------------------------------ */
2161
/*
2162
 *   Single-byte mapped local to UTF-8 mapper 
2163
 */
2164
2165
/*
2166
 *   map a string from the single-byte local character set to UTF-8 
2167
 */
2168
size_t CCharmapToUniSB::map(char **outp, size_t *outlen,
2169
                            const char *inp, size_t inlen) const
2170
{
2171
    size_t tot_outlen;
2172
2173
    /* we haven't written any characters to the output buffer yet */
2174
    tot_outlen = 0;
2175
2176
    /* scan each character (character == byte) in the input string */
2177
    for ( ; inlen > 0 ; --inlen, ++inp)
2178
    {
2179
        wchar_t uni;
2180
        size_t csiz;
2181
        
2182
        /* get the unicode mapping for this character */
2183
        uni = map_[(unsigned char)*inp];
2184
2185
        /* get the size of this character */
2186
        csiz = utf8_ptr::s_wchar_size(uni);
2187
2188
        /* add it to the total output lenght */
2189
        tot_outlen += csiz;
2190
2191
        /* if there's room, add it to our output buffer */
2192
        if (*outlen >= csiz)
2193
        {
2194
            /* write it out */
2195
            *outp += utf8_ptr::s_putch(*outp, uni);
2196
            
2197
            /* deduct it from the remaining output length */
2198
            *outlen -= csiz;
2199
        }
2200
        else
2201
        {
2202
            /* there's no room - set the remaining output length to zero */
2203
            *outlen = 0;
2204
        }
2205
    }
2206
2207
    /* return the total output length */
2208
    return tot_outlen;
2209
}
2210
2211
/* ------------------------------------------------------------------------ */
2212
/*
2213
 *   Trivial UTF8-to-UTF8 input mapper 
2214
 */
2215
2216
/*
2217
 *   map a string 
2218
 */
2219
size_t CCharmapToUniUTF8::map2(char **outp, size_t *outlen,
2220
                               const char *inp, size_t inlen,
2221
                               size_t *partial_len) const
2222
{
2223
    size_t copy_len;
2224
2225
    /* 
2226
     *   Make sure we copy only whole characters, by truncating the string
2227
     *   to a length that includes only whole characters.  
2228
     */
2229
    copy_len = utf8_ptr::s_trunc(inp, inlen);
2230
2231
    /* 
2232
     *   note the length of any partial characters at the end of the buffer
2233
     *   for the caller - this is simply the difference between the original
2234
     *   length and the truncated copy length, since the truncation length
2235
     *   is simply the length excluding the partial last character bytes 
2236
     */
2237
    *partial_len = inlen - copy_len;
2238
2239
    /* limit the copying to what will fit in the output buffer */
2240
    if (copy_len > *outlen)
2241
    {
2242
        /* don't copy more than will fit, and don't copy partial characters */
2243
        copy_len = utf8_ptr::s_trunc(inp, *outlen);
2244
2245
        /* we don't have enough room, so set the output size to zero */
2246
        *outlen = 0;
2247
    }
2248
    else
2249
    {
2250
        /* we have room, so decrement the output size by the copy size */
2251
        *outlen -= copy_len;
2252
    }
2253
2254
    /* copy the data */
2255
    memcpy(*outp, inp, copy_len);
2256
2257
    /* advance the output pointer past the copied data */
2258
    *outp += copy_len;
2259
2260
    /* 
2261
     *   return the total input length -- the total output length is
2262
     *   always identical to the input length, because we don't change
2263
     *   anything 
2264
     */
2265
    return inlen;
2266
}
2267
2268
/*
2269
 *   read a file 
2270
 */
2271
size_t CCharmapToUniUTF8::read_file(osfildef *fp,
2272
                                    char *buf, size_t bufl,
2273
                                    unsigned long read_limit)
2274
{
2275
    size_t read_len;
2276
    char *last_start;
2277
    size_t last_got_len;
2278
    size_t last_need_len;
2279
    
2280
    /* make sure we don't read past the read limit, if applicable */
2281
    if (read_limit != 0 && bufl > read_limit)
2282
        bufl = (size_t)read_limit;
2283
2284
    /* 
2285
     *   Read directly from the file, up the buffer size minus two bytes.
2286
     *   We want to leave two extra bytes so that we can read any extra
2287
     *   continuation bytes for the last character, in order to ensure
2288
     *   that we always read whole characters; in the worst case, the last
2289
     *   character could be three bytes long, in which case we'd need to
2290
     *   read two extra bytes.
2291
     *   
2292
     *   If the available buffer size is less than three bytes, just read
2293
     *   the number of bytes they asked for and don't bother trying to
2294
     *   keep continuation sequences intact.  
2295
     */
2296
    if (bufl < 3)
2297
        return osfrbc(fp, buf, bufl);
2298
2299
    /* 
2300
     *   read up to the buffer size, less two bytes for possible
2301
     *   continuation bytes 
2302
     */
2303
    read_len = osfrbc(fp, buf, bufl - 2);
2304
2305
    /* 
2306
     *   if we didn't satisfy the entire request, we're at the end of the
2307
     *   file, so there's no point in trying to finish off any
2308
     *   continuation sequences - in this case, just return what we have 
2309
     */
2310
    if (read_len < bufl - 2)
2311
        return read_len;
2312
2313
    /* 
2314
     *   Check the last byte we read to see if there's another byte or two
2315
     *   following. 
2316
     *   
2317
     *   If the last byte is a continuation byte, this is a bit trickier.
2318
     *   We must back up to the preceding lead byte to figure out what we
2319
     *   have in this case.  
2320
     */
2321
    last_start = &buf[read_len - 1];
2322
    last_got_len = 1;
2323
    if (utf8_ptr::s_is_continuation(last_start))
2324
    {
2325
        /* 
2326
         *   if we only read one byte, simply return the one byte - we
2327
         *   started in the middle of a sequence, so there's no way we can
2328
         *   read a complete sequence 
2329
         */
2330
        if (read_len == 1)
2331
            return read_len;
2332
2333
        /* back up to the byte we're continuing from */
2334
        --last_start;
2335
        ++last_got_len;
2336
2337
        /* 
2338
         *   if this is another continuation byte, we've reached the maximum
2339
         *   byte length of three for a single character, so there's no way
2340
         *   we could need to read anything more 
2341
         */
2342
        if (utf8_ptr::s_is_continuation(last_start))
2343
            return read_len;
2344
    }
2345
2346
    /* 
2347
     *   Okay: we have last_start pointing to the start of the last
2348
     *   character, and last_got_len the number of bytes we actually have for
2349
     *   that last character.  If the needed length differs from the length
2350
     *   we actually have, we need to read more.  
2351
     */
2352
    last_need_len = utf8_ptr::s_charsize(*last_start);
2353
    if (last_need_len > last_got_len)
2354
    {
2355
        /* 
2356
         *   we need more than we actually read, so read the remaining
2357
         *   characters 
2358
         */
2359
        read_len += osfrbc(fp, buf + read_len, last_need_len - last_got_len);
2360
    }
2361
2362
    /* return the length we read */
2363
    return read_len;
2364
}
2365
2366
/* ------------------------------------------------------------------------ */
2367
/*
2368
 *   Basic UCS-2 to UTF-8 mapper 
2369
 */
2370
2371
/*
2372
 *   Read from a file, translating to UTF-8 encoding 
2373
 */
2374
size_t CCharmapToUniUcs2::read_file(osfildef *fp,
2375
                                    char *buf, size_t bufl,
2376
                                    unsigned long read_limit)
2377
{
2378
    size_t inlen;
2379
2380
    /* 
2381
     *   Compute how much to read from the file.  The input file is composed
2382
     *   of two-byte characters, so only read up to two thirds of the buffer
2383
     *   length; this will ensure that we can always fit what we read into
2384
     *   the caller's buffer.
2385
     *   
2386
     *   Note that we divide by three first, then double the result, to
2387
     *   ensure that we read an even number of bytes.  Each UCS-2 character
2388
     *   is represented in exactly two bytes, so we must always read pairs of
2389
     *   bytes to be sure we're reading whole characters.  
2390
     */
2391
    inlen = bufl / 3;
2392
    inlen *= 2;
2393
2394
    /* in any case, we can't read more than our own buffer size */
2395
    if (inlen > sizeof(inbuf_))
2396
        inlen = sizeof(inbuf_);
2397
2398
    /* don't read past the read limit, if applicable */
2399
    if (read_limit != 0 && inlen > read_limit)
2400
        inlen = (size_t)read_limit;
2401
2402
    /* read from the file */
2403
    inlen = osfrbc(fp, inbuf_, inlen);
2404
2405
    /* 
2406
     *   Map data to the caller's buffer, and return the result.  We're
2407
     *   certain that the data will fit in the caller's buffer: we're
2408
     *   mapping only a third as many characters as we have bytes
2409
     *   available, and each character can take up at most three bytes,
2410
     *   hence the worst case is that we fill the buffer completely.
2411
     *   
2412
     *   On the other hand, we may only fill the buffer to a third of its
2413
     *   capacity, but this is okay too, since we're not required to give
2414
     *   the caller everything they asked for.  
2415
     */
2416
    return map(&buf, &bufl, inbuf_, inlen);
2417
}
2418
2419
/* ------------------------------------------------------------------------ */
2420
/*
2421
 *   UCS-2 little-endian to UTF-8 mapper 
2422
 */
2423
2424
/*
2425
 *   map a string 
2426
 */
2427
size_t CCharmapToUniUcs2Little::map(char **outp, size_t *outlen,
2428
                                    const char *inp, size_t inlen) const
2429
{
2430
    size_t tot_outlen;
2431
2432
    /* we haven't written any characters to the output buffer yet */
2433
    tot_outlen = 0;
2434
2435
    /* scan each character (character == byte pair) in the input string */
2436
    for ( ; inlen > 1 ; inlen -= 2, inp += 2)
2437
    {
2438
        wchar_t uni;
2439
        size_t csiz;
2440
2441
        /* 
2442
         *   read the little-endian two-byte value - no mapping is
2443
         *   required, since UCS-2 uses the same code point assignments as
2444
         *   UTF-8 
2445
         */
2446
        uni = ((wchar_t)(unsigned char)*inp)
2447
              + (((wchar_t)(unsigned char)*(inp + 1)) << 8);
2448
2449
        /* get the size of this character */
2450
        csiz = utf8_ptr::s_wchar_size(uni);
2451
2452
        /* add it to the total output lenght */
2453
        tot_outlen += csiz;
2454
2455
        /* if there's room, add it to our output buffer */
2456
        if (*outlen >= csiz)
2457
        {
2458
            /* write it out */
2459
            *outp += utf8_ptr::s_putch(*outp, uni);
2460
2461
            /* deduct it from the remaining output length */
2462
            *outlen -= csiz;
2463
        }
2464
        else
2465
        {
2466
            /* there's no room - set the remaining output length to zero */
2467
            *outlen = 0;
2468
        }
2469
    }
2470
2471
    /* return the total output length */
2472
    return tot_outlen;
2473
}
2474
2475
/* ------------------------------------------------------------------------ */
2476
/*
2477
 *   UCS-2 big-endian to UTF-8 mapper 
2478
 */
2479
2480
/*
2481
 *   map a string 
2482
 */
2483
size_t CCharmapToUniUcs2Big::map(char **outp, size_t *outlen,
2484
                                 const char *inp, size_t inlen) const
2485
{
2486
    size_t tot_outlen;
2487
2488
    /* we haven't written any characters to the output buffer yet */
2489
    tot_outlen = 0;
2490
2491
    /* scan each character (character == byte pair) in the input string */
2492
    for ( ; inlen > 1 ; inlen -= 2, inp += 2)
2493
    {
2494
        wchar_t uni;
2495
        size_t csiz;
2496
2497
        /* 
2498
         *   read the big-endian two-byte value - no mapping is required,
2499
         *   since UCS-2 uses the same code point assignments as UTF-8 
2500
         */
2501
        uni = (((wchar_t)(unsigned char)*inp) << 8)
2502
              + ((wchar_t)(unsigned char)*(inp + 1));
2503
2504
        /* get the size of this character */
2505
        csiz = utf8_ptr::s_wchar_size(uni);
2506
2507
        /* add it to the total output lenght */
2508
        tot_outlen += csiz;
2509
2510
        /* if there's room, add it to our output buffer */
2511
        if (*outlen >= csiz)
2512
        {
2513
            /* write it out */
2514
            *outp += utf8_ptr::s_putch(*outp, uni);
2515
2516
            /* deduct it from the remaining output length */
2517
            *outlen -= csiz;
2518
        }
2519
        else
2520
        {
2521
            /* there's no room - set the remaining output length to zero */
2522
            *outlen = 0;
2523
        }
2524
    }
2525
2526
    /* return the total output length */
2527
    return tot_outlen;
2528
}
2529
2530
/* ------------------------------------------------------------------------ */
2531
/*
2532
 *   Multi-byte character set translation to Unicode 
2533
 */
2534
2535
/*
2536
 *   construct the mapper 
2537
 */
2538
CCharmapToUniMB::CCharmapToUniMB()
2539
{
2540
    int i;
2541
    cmap_mb_entry *p;
2542
2543
    /* clear out the mapping table */
2544
    for (i = 0, p = map_ ; i < 256 ; ++i, ++p)
2545
    {
2546
        /* assume this lead byte won't have a sub-table */
2547
        p->sub = 0;
2548
2549
        /* 
2550
         *   we don't have a mapping for this lead byte yet, so use U+FFFD
2551
         *   (the Unicode REPLACEMENT CHARACTER) as the default mapping in
2552
         *   case we never assign it any other mapping 
2553
         */
2554
        p->ch = 0xFFFD;
2555
    }
2556
}
2557
2558
/* 
2559
 *   delete the mapper 
2560
 */
2561
CCharmapToUniMB::~CCharmapToUniMB()
2562
{
2563
    int i;
2564
    cmap_mb_entry *p;
2565
2566
    /* delete all of our sub-tables */
2567
    for (i = 0, p = map_ ; i < 256 ; ++i, ++p)
2568
    {
2569
        /* if this sub-table was allocated, delete it */
2570
        if (p->sub != 0)
2571
            t3free(p->sub);
2572
    }
2573
}
2574
2575
/*
2576
 *   Set a mapping 
2577
 */
2578
void CCharmapToUniMB::set_mapping(wchar_t uni_code_pt, wchar_t local_code_pt)
2579
{
2580
    /* 
2581
     *   Check to see if it's a one-byte or two-byte mapping.  If the local
2582
     *   code point is in the range 0-255, it's a one-byte character;
2583
     *   otherwise, it's a two-byte character.  
2584
     */
2585
    if (local_code_pt <= 255)
2586
    {
2587
        /* it's a single-byte character, so simply set the mapping */
2588
        map_[(unsigned char)local_code_pt].ch = uni_code_pt;
2589
    }
2590
    else
2591
    {
2592
        cmap_mb_entry *entp;
2593
        wchar_t *subp;
2594
2595
        /* 
2596
         *   Get the mapping table entry for the lead byte.  The lead byte of
2597
         *   the local code point is given by the high-order byte of the
2598
         *   local code point.  (Note that this doesn't have anything to do
2599
         *   with the endian-ness of the local platform.  The generic Unicode
2600
         *   mapping tables are specifically designed this way, independently
2601
         *   of endian-ness.)  
2602
         */
2603
        entp = &map_[(unsigned char)((local_code_pt >> 8) & 0xff)];
2604
2605
        /* 
2606
         *   It's a two-byte character.  The high-order byte is the lead
2607
         *   byte, and the low-order byte is the trailing byte of the
2608
         *   two-byte sequence.  
2609
         *   
2610
         *   If we haven't previously set up a sub-table for the lead byte,
2611
         *   do so now.  
2612
         */
2613
        if ((subp = entp->sub) == 0)
2614
        {
2615
            size_t i;
2616
            wchar_t *p;
2617
2618
            /* allocate a new sub-mapping table for the lead byte */
2619
            subp = entp->sub = (wchar_t *)t3malloc(256 * sizeof(wchar_t));
2620
2621
            /* initialize each entry to U+FFFD, in case we never map them */
2622
            for (i = 256, p = subp ; i != 0 ; --i, *p++ = 0xFFFD) ;
2623
        }
2624
2625
        /* set the mapping in the sub-table for the second byte */
2626
        subp[(unsigned char)(local_code_pt & 0xff)] = uni_code_pt;
2627
    }
2628
}
2629
2630
/* 
2631
 *   map a string, providing partial character info 
2632
 */
2633
size_t CCharmapToUniMB::map2(char **output_ptr, size_t *output_buf_len,
2634
                             const char *input_ptr, size_t input_len,
2635
                             size_t *partial_len) const
2636
{
2637
    size_t needed_out_len;
2638
2639
    /* presume we won't have a partial last character */
2640
    *partial_len = 0;
2641
2642
    /* we haven't found anything to store in the output yet */
2643
    needed_out_len = 0;
2644
2645
    /* keep going until we've mapped each character */
2646
    while (input_len != 0)
2647
    {
2648
        unsigned char c;
2649
        const cmap_mb_entry *entp;
2650
        wchar_t wc;
2651
        size_t wlen;
2652
2653
        /* get the lead byte of the next input character */
2654
        c = *input_ptr;
2655
2656
        /* get the primary mapping table entry for the lead byte */
2657
        entp = &map_[c];
2658
2659
        /* check for a one-byte or two-byte mapping */
2660
        if (entp->sub == 0)
2661
        {
2662
            /* it's a one-byte character - get the mapping */
2663
            wc = entp->ch;
2664
2665
            /* skip the single byte of input */
2666
            ++input_ptr;
2667
            --input_len;
2668
        }
2669
        else
2670
        {
2671
            /* 
2672
             *   it's a two-byte character lead byte - make sure we have a
2673
             *   complete input character 
2674
             */
2675
            if (input_len < 2)
2676
            {
2677
                /* we have an incomplete last character - tell the caller */
2678
                *partial_len = 1;
2679
2680
                /* we're done mapping it */
2681
                break;
2682
            }
2683
2684
            /* get the second byte of the sequence */
2685
            c = input_ptr[1];
2686
2687
            /* get the translation from the sub-table */
2688
            wc = entp->sub[c];
2689
2690
            /* skip the two-byte sequence */
2691
            input_ptr += 2;
2692
            input_len -= 2;
2693
        }
2694
2695
        /* we have the translation - note its stored UTF-8 byte size */
2696
        wlen = utf8_ptr::s_wchar_size(wc);
2697
2698
        /* check for room to store the output character */
2699
        if (wlen > *output_buf_len)
2700
        {
2701
            /* 
2702
             *   there's no room to store this character - zero out the
2703
             *   output buffer length so that we know not to try storing
2704
             *   anything else in the buffer 
2705
             */
2706
            *output_buf_len = 0;
2707
        }
2708
        else
2709
        {
2710
            /* there's room - store it */
2711
            wlen = utf8_ptr::s_putch(*output_ptr, wc);
2712
2713
            /* consume output buffer space */
2714
            *output_ptr += wlen;
2715
            *output_buf_len -= wlen;
2716
        }
2717
2718
        /* count the needed length, whether we stored it or not */
2719
        needed_out_len += wlen;
2720
    }
2721
2722
    /* return the required output length */
2723
    return needed_out_len;
2724
}
2725
2726
/* 
2727
 *   read from a multi-byte input file, translating to UTF-8 
2728
 */
2729
size_t CCharmapToUniMB::read_file(osfildef *fp, char *buf, size_t bufl,
2730
                                  unsigned long read_limit)
2731
{
2732
    size_t inlen;
2733
    size_t outlen;
2734
    size_t partial;
2735
2736
    /*
2737
     *   Compute how much to read from the file.  The input file is composed
2738
     *   of one-byte or two-byte characters, so only read up to one-third of
2739
     *   the caller's buffer length; this will ensure that in the worst case
2740
     *   we can always fit what we read into the caller's buffer.  (The worst
2741
     *   case is that the input is entirely single-byte local characters that
2742
     *   translate into three-byte UTF-8 characters.)  
2743
     */
2744
    inlen = bufl / 3;
2745
2746
    /* in any case, we can't read more than our own buffer size */
2747
    if (inlen >= sizeof(inbuf_))
2748
        inlen = sizeof(inbuf_);
2749
2750
    /* limit the read length to the caller's read limit, if appropriate */
2751
    if (read_limit != 0 && inlen > read_limit)
2752
        inlen = (size_t)read_limit;
2753
2754
    /* read raw bytes from the file */
2755
    inlen = osfrbc(fp, inbuf_, inlen);
2756
2757
    /* 
2758
     *   Map data to the caller's buffer.  Note if we have a partial
2759
     *   character at the end of the buffer (i.e., the last byte of the
2760
     *   buffer is a lead byte that requires a second byte to make up a
2761
     *   complete two-byte local character), so that we can read an
2762
     *   additional byte to complete the two-byte final character if
2763
     *   necessary.  
2764
     */
2765
    outlen = map2(&buf, &bufl, inbuf_, inlen, &partial);
2766
2767
    /* 
2768
     *   if we have a partial trailing character, read the other half of the
2769
     *   final character 
2770
     */
2771
    if (partial != 0)
2772
    {
2773
        /* move the lead byte to the start of our buffer */
2774
        inbuf_[0] = inbuf_[inlen - 1];
2775
2776
        /* read the extra byte to form a complete character */
2777
        inlen = 1 + osfrbc(fp, inbuf_ + 1, 1);
2778
2779
        /* if we got the second byte, map the complete final character */
2780
        if (inlen == 2)
2781
            outlen += map(&buf, &bufl, inbuf_, inlen);
2782
    }
2783
2784
    /* return the result length */
2785
    return outlen;
2786
}
2787