cfad47cfa3/tads3/charmap.h

4b825dc642cb6eb9a060e54bf8d69288fbee4904cfad47cfa334b206c65f22086bcc5d63e6f70944
1
/* $Header: d:/cvsroot/tads/tads3/charmap.h,v 1.2 1999/05/17 02:52:29 MJRoberts Exp $ */
2
3
/* 
4
 *   Copyright (c) 1998, 2002 Michael J. Roberts.  All Rights Reserved.
5
 *   
6
 *   Please see the accompanying license file, LICENSE.TXT, for information
7
 *   on using and copying this software.  
8
 */
9
/*
10
Name
11
  charmap.h - character-set mapper
12
Function
13
  Provides mappings between 16-bit Unicode and single-byte, multi-byte,
14
  and double-byte character sets.
15
Notes
16
  
17
Modified
18
  10/17/98 MJRoberts  - Creation
19
*/
20
21
#ifndef CHARMAP_H
22
#define CHARMAP_H
23
24
#include <stdlib.h>
25
#include <memory.h>
26
#include <string.h>
27
28
#include "utf8.h"
29
#include "os.h"
30
#include "t3std.h"
31
32
33
/* ------------------------------------------------------------------------ */
34
/*
35
 *   Mapping Types.  This enum provides a characterization of a local
36
 *   character set (as defined in a mapping file).  
37
 */
38
enum charmap_type_t
{
    /* 
     *   Single-byte character set - every character occupies exactly one
     *   8-bit byte.  
     */
    CHARMAP_TYPE_SB,

    /*
     *   Double-byte character set - every character occupies exactly two
     *   8-bit bytes.  The first byte of each pair is the high-order byte,
     *   so the input bytes 0x12, 0x34, 0x56, 0x78 are read as the two
     *   16-bit code points 0x1234, 0x5678.  
     */
    CHARMAP_TYPE_DB,

    /*
     *   Mixed multi-byte - every character occupies either one or two
     *   8-bit bytes.  A byte that starts a two-byte character is never
     *   used as a one-byte character, and vice versa, so the first byte
     *   alone determines the length of the character.  In a two-byte
     *   character, the first byte is the high-order byte.
     *   
     *   For example, if 0x00-0x7F are defined as single-byte characters
     *   and 0x8000-0xFFFF as double-byte characters, the byte sequence
     *   0x12, 0x81, 0xAB, 0x82, 0xCD, 0x34 is read as the character
     *   sequence 0x12, 0x81AB, 0x82CD, 0x34.  
     */
    CHARMAP_TYPE_MB
};
71
72
/* ------------------------------------------------------------------------ */
73
/*
74
 *   Basic character mapper class. 
75
 */
76
class CCharmap
77
{
78
public:
79
    /* add a reference */
80
    void add_ref() { ++ref_cnt_; }
81
82
    /* release a reference; delete on removing the last reference */
83
    void release_ref()
84
    {
85
        /* count the unreference */
86
        --ref_cnt_;
87
88
        /* if that leaves no references, delete me */
89
        if (ref_cnt_ == 0)
90
            delete this;
91
    }
92
93
protected:
94
    CCharmap()
95
    {
96
        /* start out with one reference, for the initial creator */
97
        ref_cnt_ = 1;
98
    }
99
    
100
    virtual ~CCharmap() { }
101
    
102
    /*
103
     *   Open and characterize a mapping file.  Returns the osfildef
104
     *   pointer if the file was successfully opened and parsed, or null
105
     *   if not.  Sets *map_type to indicate the type of mapping contained
106
     *   in the file.  
107
     */
108
    static osfildef *open_map_file(class CResLoader *res_loader,
109
                                   const char *table_name,
110
                                   charmap_type_t *map_type);
111
112
    /* check a name to see if it matches one of the names for ASCII */
113
    static int name_is_ascii_synonym(const char *table_name)
114
    {
115
        /* accept any of the various synonyms for ASCII */
116
        return (stricmp(table_name, "us-ascii") == 0
117
                || stricmp(table_name, "asc7dflt") == 0
118
                || stricmp(table_name, "ascii") == 0
119
                || stricmp(table_name, "iso646-us") == 0
120
                || stricmp(table_name, "iso-ir-6") == 0
121
                || stricmp(table_name, "cp367") == 0
122
                || stricmp(table_name, "us") == 0);
123
    }
124
125
    /* check a name to see if it matches one of the names for ISO 8859-1 */
126
    static int name_is_8859_1_synonym(const char *table_name)
127
    {
128
        /* accept any of the various names for ISO 8859-1 */
129
        return (stricmp(table_name, "iso-8859-1") == 0
130
                || stricmp(table_name, "iso_8859-1") == 0
131
                || stricmp(table_name, "iso-ir-100") == 0
132
                || stricmp(table_name, "latin1") == 0
133
                || stricmp(table_name, "l1") == 0
134
                || stricmp(table_name, "cp819") == 0);
135
    }
136
137
    /* reference count */
138
    unsigned int ref_cnt_;
139
};
140
141
/* ------------------------------------------------------------------------ */
142
/*
143
 *   Base character mapper class for mapping from a local character set to
144
 *   UTF-8.  This is an abstract interface that must be implemented for
145
 *   different classes of character sets.  
146
 */
147
class CCharmapToUni: public CCharmap
148
{
149
public:
150
    /* initialize */
151
    CCharmapToUni() { }
152
153
    /*
154
     *   Create a mapping object for a given character table.  We'll read
155
     *   enough of the character table to determine the appropriate
156
     *   concrete subclass to instantiate, then create an object, load the
157
     *   table into the object, and return the object.  The caller is
158
     *   responsible for deleting the object when finished with it.
159
     *   
160
     *   Returns null if the mapping file cannot be loaded.
161
     */
162
    static CCharmapToUni *load(class CResLoader *res_loader,
163
                               const char *table_name);
164
165
    /*
166
     *   Determine if the given byte sequence forms a complete character in
167
     *   the local character set.  Returns true if so, false if not.  'len'
168
     *   must be at least 1.  
169
     */
170
    virtual int is_complete_char(const char *p, size_t len) const = 0;
171
172
    /*
173
     *   Convert a string from the local character set to Unicode.
174
     *   Returns the byte length of the output.  If the output buffer is
175
     *   too small to store the result, we will return the size of the
176
     *   full result, but we won't write past the end of the buffer.
177
     *   
178
     *   We'll advance *output_ptr by the number of bytes we write.
179
     *   
180
     *   If we store anything, we'll decrement *output_buf_len by the
181
     *   number of bytes we store; if we don't have enough room, we'll set
182
     *   *output_buf_len to zero.
183
     *   
184
     *   input_ptr is a pointer to the input string; input_len is the
185
     *   length in bytes of the input string.  
186
     */
187
    virtual size_t map(char **output_ptr, size_t *output_buf_len,
188
                       const char *input_ptr, size_t input_len) const = 0;
189
190
    /*
191
     *   Convert a string from the local character set to Unicode.
192
     *   
193
     *   This works the same way as map(), but additionally provides
194
     *   information on the consumption of source bytes by filling in
195
     *   partial_len with the number of bytes at the end of the source
196
     *   buffer that are not mappable because they do not form complete
197
     *   characters in the source character set.  Since we scan all input
198
     *   regardless of whether there's space to store the resulting output,
199
     *   this will reflect the same number of bytes no matter what the
200
     *   output buffer length.  
201
     */
202
    virtual size_t map2(char **output_ptr, size_t *output_buf_len,
203
                        const char *input_ptr, size_t input_len,
204
                        size_t *partial_len) const = 0;
205
206
    /* 
207
     *   Map a null-terminated string into a buffer; returns the number of
208
     *   bytes of the buffer actually needed to store the string.  If the
209
     *   entire string couldn't be mapped, this will return a number
210
     *   greater than or equal to the output buffer size, but we will not
211
     *   write beyond the end of the buffer.
212
     *   
213
     *   If there's space, the result will be null-terminated; however,
214
     *   the null terminator byte will not be included in the result
215
     *   length.  If the return value exactly equals outbuflen, it means
216
     *   that the string exactly fills the buffer, hence there isn't space
217
     *   for a null terminator.  
218
     */
219
    size_t map_str(char *outbuf, size_t outbuflen, const char *input_str);
220
221
    /*
222
     *   Read characters from a file into a buffer, translating the
223
     *   characters to UTF-8.  Returns the number of bytes copied into the
224
     *   buffer; returns zero on end of file.  The buffer must be at least
225
     *   three bytes long to ensure that at least one character can be read
226
     *   from the file (the longest UTF-8 character takes up three bytes),
227
     *   since it would otherwise not be possible to distinguish reaching
228
     *   the end of the file from simply being unable to fit even one
229
     *   character into the buffer.
230
     *   
231
     *   The file can be opened in text or binary mode; we don't pay any
232
     *   attention to newline sequences, so the mode is not relevant to us.
233
     *   
234
     *   This routine may read fewer than the desired number of bytes.  Upon
235
     *   return, the file's seek position should be set to the next byte of
236
     *   the file after the last character copied into the output buffer.
237
     *   
238
     *   'read_limit' is the maximum number of bytes we're allowed to read
239
     *   from the underlying file.  If this is zero, then the read size is
240
     *   unlimited.  
241
     */
242
    virtual size_t read_file(osfildef *fp, char *buf, size_t bufl,
243
                             unsigned long read_limit) = 0;
244
245
protected:
246
    /* delete the mapping */
247
    virtual ~CCharmapToUni() { }
248
249
    /* load the mapping table from the file */
250
    void load_table(osfildef *fp);
251
252
    /*
253
     *   Set a mapping.  uni_code_pt is the unicode code point, and
254
     *   local_code_pt is the code point in the local character set.  
255
     */
256
    virtual void set_mapping(wchar_t uni_code_pt, wchar_t local_code_pt) = 0;
257
};
258
259
/* ------------------------------------------------------------------------ */
260
/*
261
 *   Base character mapper class for mapping from Unicode UTF-8 to a local
262
 *   character set.  This is an abstract interface that must be separately
263
 *   implemented for different classes of character sets.
264
 *   
265
 *   Each mapping object maintains a table of mapping tables.  The master
266
 *   table contains an array of up to 256 sub-tables.  The top 8 bits of
267
 *   the unicode character value give the index in the master table.  Each
268
 *   entry in the master table is a pointer to a sub-table, or a null
269
 *   pointer if there are no mappings for characters in the range for that
270
 *   sub-table.
271
 *   
272
 *   For example, unicode characters 0x0000 through 0x00ff are mapped
273
 *   through the table obtained by getting the pointer at index 0 from the
274
 *   master table.  Unicode characters 0x0200 through 0x02ff are in the
275
 *   table at master table index 2.
276
 *   
277
 *   If a master table index entry is empty (i.e., the pointer in the
278
 *   master table at that index is null), it means that all of the
279
 *   characters in the range for that master index map to the default
280
 *   character.  Otherwise, we index into the sub-table using the
281
 *   low-order 8 bits of the Unicode character code to find the character
282
 *   mapping giving the local character set code for the Unicode value.
283
 *   
284
 *   Each entry in the mapping table is the offset of the translation of
285
 *   the character within the translation array.  The translation array is
286
 *   an array of bytes.  The first byte of each entry is the length in
287
 *   bytes of the entry (not including the length byte), followed by the
288
 *   bytes of the entry.
289
 *   
290
 *   The first entry in the translation array is always the default
291
 *   character, which is the mapping we use for characters with no other
292
 *   valid mapping.  
293
 */
294
class CCharmapToLocal: public CCharmap
295
{
296
public:
297
    /* initialize */
298
    CCharmapToLocal();
299
300
    /* create a mapper and load the mapping from a file */
301
    static CCharmapToLocal *load(class CResLoader *res_loader,
302
                                 const char *table_name);
303
    
304
    /* 
305
     *   Convert a character from Unicode to the local character set.
306
     *   Stores the character's byte or bytes at the given pointer, and
307
     *   increments the pointer to point to the next byte after the
308
     *   character.
309
     *   
310
     *   Returns the byte length of the output.  If the output buffer is
311
     *   not long enough to store the result, we simply return the size of
312
     *   the result without storing anything.
313
     *   
314
     *   If we actually store anything, we'll decrement *output_buf_len by
315
     *   the number of bytes we stored; if we don't have room to store
316
     *   anything, we'll set *output_buf_len to zero.  
317
     */
318
    virtual size_t map(wchar_t unicode_char, char **output_ptr,
319
                       size_t *output_buf_len) const = 0;
320
321
    /*
322
     *   Simple single-character mapper - returns the byte length of the
323
     *   local character equivalent of the unicode character, which is
324
     *   written into the buffer.  If the buffer isn't big enough, we'll
325
     *   still return the length, but won't write anything to the buffer.  
326
     */
327
    size_t map_char(wchar_t unicode_char, char *buf, size_t buflen)
328
    {
329
        /* map the character */
330
        return map(unicode_char, &buf, &buflen);
331
    }
332
333
334
    /*
335
     *   Convert a UTF-8 string with a given byte length to the local
336
     *   character set.
337
     *   
338
     *   Returns the byte length of the result.  If the result is too long
339
     *   to fit in the output buffer, we'll return the number of bytes we
340
     *   actually were able to store (we'll store as much as we can, and
341
     *   stop when we run out of space).  We'll indicate in
342
     *   *src_bytes_used how many bytes of the source we were able to map.
343
     *   
344
     *   If the output buffer is null, we will store nothing, but simply
345
     *   determine how much space it would take to store the entire string.
346
     *   
347
     *   This base class provides an implementation of this method that is
348
     *   suitable for all subclasses, but the method is defined as virtual
349
     *   so that subclasses can override it with a more tailored (and thus
350
     *   more efficient) implementation.  The general-purpose base-class
351
     *   implementation must call the virtual function map() for each
352
     *   character mapped.  
353
     */
354
    virtual size_t map_utf8(char *dest, size_t dest_len,
355
                            utf8_ptr src, size_t src_byte_len,
356
                            size_t *src_bytes_used) const;
357
358
    /* 
359
     *   map to utf8 - alternative interface using character buffers
360
     *   (rather than UTF8 pointers) 
361
     */
362
    size_t map_utf8(char *dest, size_t dest_len,
363
                    const char *src, size_t src_byte_len,
364
                    size_t *src_bytes_used) const;
365
366
    /*
367
     *   Convert a null-terminated UTF-8 string to the local character set.
368
     *   
369
     *   Returns the byte length of the result.  If the result is too long
370
     *   to fit in the output buffer, we'll return the size without storing
371
     *   the entire string (we'll store as much as we can, and stop when we
372
     *   run out of space, but continue counting the length needed; call
373
     *   with a destination buffer length of zero to simply determine how
374
     *   much space is needed for the result).
375
     *   
376
     *   The length returned does NOT include the null terminator.  However,
377
     *   if there's room, we will null-terminate the result string.  So, if
378
     *   the caller wants the result to be null terminated, it should make
379
     *   sure that the buffer contains one byte more than the space reported
380
     *   as necessary to store the result.  
381
     */
382
    virtual size_t map_utf8z(char *dest, size_t dest_len, utf8_ptr src)
383
        const;
384
385
    /*
386
     *   Convert a null-terminated UTF-8 string to the local character set,
387
     *   filling in an 'escape' sequence for unknown characters.  For each
388
     *   unknown character, we'll invoke the given callback to get the
389
     *   'escaped' representation.  Use &CCharmapToLocal::source_esc_cb, for
390
     *   example, to map using source-code-style escape sequences.
391
     *   
392
     *   The callback takes the unmappable character, a pointer to the output
393
     *   buffer, and a pointer to the length remaining.  It should fill in
394
     *   the buffer with the escaped sequence (up to the remaining length
395
     *   limit), and adjust the buffer pointer and length for the space
396
     *   consumed.  The return value is the full length required for the
397
     *   complete escape sequence, even if there's not enough space in the
398
     *   buffer to hold that many characters.  
399
     */
400
    virtual size_t map_utf8z_esc(char *dest, size_t dest_len, utf8_ptr src,
401
                                 size_t (*esc_fn)(wchar_t, char **, size_t *))
402
        const;
403
404
    /* 
405
     *   ready-made callback for map_utf8z_esc() - map to unicode 'backslash'
406
     *   escape sequences ('\u1234'), as we'd use in tads source code 
407
     */
408
    static size_t source_esc_cb(wchar_t ch, char **dest, size_t *len);
409
410
    /* 
411
     *   Write data to a file, converting from UTF-8 to the local character
412
     *   set.  Returns zero on success, non-zero if an error occurs writing
413
     *   the data.  
414
     */
415
    int write_file(osfildef *fp, const char *buf, size_t bufl);
416
417
    /* 
418
     *   determine if the given Unicode character has a mapping to the local
419
     *   character set 
420
     */
421
    virtual int is_mappable(wchar_t unicode_char) const
422
    {
423
        /* 
424
         *   By default, it's mappable if it has a non-default mapping in
425
         *   the translation table.  The default mapping is always at offset
426
         *   zero in the translation table.  
427
         */
428
        return (get_mapping(unicode_char) != 0);
429
    }
430
431
    /*
432
     *   Get the display expansion for a unicode character.  This returns a
433
     *   pointer to an array of wchar_t characters, and fills in the length
434
     *   variable.  Returns null if there's no expansion.
435
     *   
436
     *   An "expansion" is a list of two or more unicode characters that
437
     *   should be substituted for the given unicode character when the
438
     *   character is displayed.  Display expansions are normally used for
439
     *   visual approximations when the local character set doesn't contain
440
     *   an exact match for the unicode character; for example, an ASCII
441
     *   mapping might use the expansion "(c)" to represent the copyright
442
     *   circled-C symbol, or the two-character sequence "AE" to represent
443
     *   the AE ligature.  
444
     */
445
    const wchar_t *get_expansion(wchar_t unicode_char, size_t *len)
446
    {
447
        size_t ofs;
448
        const wchar_t *map;
449
450
        /* get the mapping offset in the expansion array */
451
        ofs = get_exp_mapping(unicode_char);
452
453
        /* if the mapping offset is zero, it means there's no mapping */
454
        if (ofs == 0)
455
        {
456
            /* indicate that there's no mapping by returning null */
457
            *len = 0;
458
            return 0;
459
        }
460
461
        /* get the mapping pointer */
462
        map = get_exp_ptr(ofs);
463
464
        /* read the length and skip it */
465
        *len = (size_t)*map++;
466
467
        /* return the pointer to the first character of the expansion */
468
        return map;
469
    }
470
471
protected:
472
    /* delete the mapping */
473
    virtual ~CCharmapToLocal();
474
475
    /* given a Unicode character, get the mapping for the character */
476
    unsigned int get_mapping(wchar_t unicode_char) const
477
    {
478
        unsigned int *subtable;
479
480
        /* get the mapping table */
481
        subtable = get_sub_table(unicode_char);
482
483
        /* 
484
         *   If there is no subtable, return the default character, which is
485
         *   always at offset zero in the translation array; otherwise, use
486
         *   the low-order 8 bits of the character code as the index into
487
         *   the subtable and return the value we find there 
488
         */
489
        if (subtable == 0)
490
            return 0;
491
        else
492
            return subtable[unicode_char & 0xff];
493
    }
494
495
    /* given a Unicode character, get the expansion for the character */
496
    unsigned int get_exp_mapping(wchar_t unicode_char) const
497
    {
498
        unsigned int *subtable;
499
500
        /* get the mapping table */
501
        subtable = get_exp_sub_table(unicode_char);
502
503
        /* 
504
         *   if there's no subtable, return zero to indicate there's no
505
         *   expansion; otherwise, return the entry from the subtable 
506
         */
507
        return (subtable == 0 ? 0 : subtable[unicode_char & 0xff]);
508
    }
509
510
    /*
511
     *   Get a pointer to the sequence of bytes in the translation array at
512
     *   a given offset 
513
     */
514
    const unsigned char *get_xlat_ptr(unsigned int ofs) const
515
    {
516
        return &xlat_array_[ofs];
517
    }
518
519
    /*
520
     *   Get a pointer to the translation of a character and the length in
521
     *   bytes of the translation 
522
     */
523
    const unsigned char *get_xlation(wchar_t unicode_char, size_t *map_len)
524
        const
525
    {
526
        const unsigned char *map;
527
528
        /* get the translation offset */
529
        map = get_xlat_ptr(get_mapping(unicode_char));
530
531
        /* read the length and skip it in the table */
532
        *map_len = (size_t)*map++;
533
534
        /* return the mapped byte sequence */
535
        return map;
536
    }
537
538
    /* 
539
     *   get a pointer to the sequence of wchar_t values in the expansion
540
     *   array at a given offset 
541
     */
542
    const wchar_t *get_exp_ptr(unsigned int ofs) const
543
    {
544
        return &exp_array_[ofs];
545
    }
546
547
    /* load the mapping table from a file */
548
    void load_table(osfildef *fp);
549
550
    /*
551
     *   Given a Unicode character, get the sub-table for the character,
552
     *   or null if there is no sub-table for this character.  
553
     */
554
    unsigned int *get_sub_table(wchar_t unicode_char) const
555
    {
556
        /* 
557
         *   use the high-order 8 bits of the unicode character as the
558
         *   index into the master table 
559
         */
560
        return map_[(unicode_char >> 8) & 0xff];
561
    }
562
563
    /* 
564
     *   Given a Unicode character, get the expansion sub-table for the
565
     *   character. or null if there is no sub-table for the character.  
566
     */
567
    unsigned int *get_exp_sub_table(wchar_t unicode_char) const
568
    {
569
        /* 
570
         *   use the high-order 8 bits of the unicode character as the index
571
         *   into the master table 
572
         */
573
        return exp_map_[(unicode_char >> 8) & 0xff];
574
    }
575
576
    /*
577
     *   Set a mapping.  This allocates a new sub-table if necessary, and
578
     *   stores the local character mapping in the table.  
579
     */
580
    void set_mapping(wchar_t unicode_char, unsigned int xlat_offset);
581
582
    /* set an expansion mapping */
583
    void set_exp_mapping(wchar_t unicode_char, unsigned int exp_offset);
584
585
    /*
586
     *   The master mapping table list.  Each entry points to the
587
     *   sub-array that contains the mapping for the 256 characters whose
588
     *   high-order 8 bits give the index into this table.  Each entry of
589
     *   the subarray is the offset within the xlat_array_ byte array of
590
     *   the first byte of the translation for the unicode character.  
591
     */
592
    unsigned int *map_[256];
593
594
    /* 
595
     *   The master expansion mapping list.  This works just like map_, but
596
     *   points to exp_array_ entries for unicode display expansions.  
597
     */
598
    unsigned int *exp_map_[256];
599
600
    /*
601
     *   The translation array.  This is an array of bytes containing the
602
     *   translations.  map_[high_8_bits][low_8_bits] contains the offset
603
     *   within this array of the translation of the character with the
604
     *   given code ((high_8_bits << 8) + low_8_bits).  The first byte at
605
     *   this offset is the length in bytes of the translation, not
606
     *   counting the length byte.  The remaining bytes are the bytes of
607
     *   the translation for the character. 
608
     */
609
    unsigned char *xlat_array_;
610
611
    /* size of the translation array */
612
    size_t xlat_array_size_;
613
614
    /*
615
     *   The expansion array.  This is an array of unicode characters
616
     *   containing the expansions for displaying unicode characters.  This
617
     *   works just like xlat_array_: each entry in expmap_ is an index into
618
     *   this array, which gives the starting point in the array of the run
619
     *   of entries for the expansion of that character.  The first character
620
     *   of a run is a length prefix giving the number of characters in the
621
     *   expansion.  
622
     */
623
    wchar_t *exp_array_;
624
};
625
626
627
/* ======================================================================== */
628
/*
629
 *   Local character set - to - Unicode UTF-8 mappers 
630
 */
631
632
/* ------------------------------------------------------------------------ */
633
/*
634
 *   Trivial UTF8-to-UTF8 mapper - performs no conversions.  This can be
635
 *   used when reading from an external data source that is itself in
636
 *   UTF-8 format; since this is identical to the format we use
637
 *   internally, no mapping is required.  
638
 */
639
class CCharmapToUniUTF8: public CCharmapToUni
640
{
641
public:
642
    /* read from a file */
643
    virtual size_t read_file(osfildef *fp, char *buf, size_t bufl,
644
                             unsigned long read_limit);
645
646
    /* determine if a byte sequence forms a complete character */
647
    virtual int is_complete_char(const char *p, size_t len) const
648
    {
649
        /* 
650
         *   For UTF-8, we can infer the byte length of a character from the
651
         *   first byte of the sequence.  If the given length is at least the
652
         *   inferred byte length, we have a complete character.  
653
         */
654
        return (len >= utf8_ptr::s_charsize(*p));
655
    }
656
657
    /* map a string */
658
    size_t map(char **output_ptr, size_t *output_buf_len,
659
               const char *input_ptr, size_t input_len) const
660
    {
661
        size_t partial_len;
662
663
        /* 
664
         *   do the full mapping, discarding the partial last character byte
665
         *   length information 
666
         */
667
        return map2(output_ptr, output_buf_len, input_ptr, input_len,
668
                    &partial_len);
669
    }
670
671
    /* map a string, providing partial character info */
672
    virtual size_t map2(char **output_ptr, size_t *output_buf_len,
673
                        const char *input_ptr, size_t input_len,
674
                        size_t *partial_len) const;
675
    
676
protected:
677
    /* we don't need a mapping table - ignore any that is set */
678
    virtual void set_mapping(wchar_t, wchar_t) { }
679
};
680
681
/* ------------------------------------------------------------------------ */
682
/*
683
 *   Character mapper base class for UCS-2 to UTF-8.  We will subclass
684
 *   this mapper for big-endian and little-endian UCS-2 representations,
685
 *   but both mappers are essentially the same in that only format
686
 *   translation is required, since UCS-2 and UTF-8 use the same code
687
 *   point mapping (i.e., Unicode).  
688
 */
689
class CCharmapToUniUcs2: public CCharmapToUni
690
{
691
public:
692
    /* read from a file */
693
    virtual size_t read_file(osfildef *fp, char *buf, size_t bufl,
694
                             unsigned long read_limit);
695
696
    /* determine if a byte sequence forms a complete character */
697
    virtual int is_complete_char(const char *, size_t len) const
698
    {
699
        /* every character in UCS-2 requires two bytes */
700
        return (len >= 2);
701
    }
702
703
    /* map a string, providing partial character info */
704
    virtual size_t map2(char **output_ptr, size_t *output_buf_len,
705
                        const char *input_ptr, size_t input_len,
706
                        size_t *partial_len) const
707
    {
708
        /* 
709
         *   if the input length is odd, there's one byte of partial
710
         *   character information at the end of the buffer; otherwise
711
         *   everything is valid 
712
         */
713
        *partial_len = (input_len & 1);
714
715
        /* perform the usual mapping */
716
        return map(output_ptr, output_buf_len, input_ptr, input_len);
717
    }
718
719
protected:
720
    /* 
721
     *   there's no mapping table for UCS-2 translations, so we don't need
722
     *   to do anything with mappings 
723
     */
724
    virtual void set_mapping(wchar_t, wchar_t) { }
725
726
    /* temporary buffer for reading files */
727
    char inbuf_[512];
728
};
729
730
/* ------------------------------------------------------------------------ */
731
/*
732
 *   Character mapper for UCS-2 little-endian to UTF-8 
733
 */
734
class CCharmapToUniUcs2Little: public CCharmapToUniUcs2
735
{
736
public:
737
    /* map a string */
738
    size_t map(char **output_ptr, size_t *output_buf_len,
739
               const char *input_ptr, size_t input_len) const;
740
};
741
742
/* ------------------------------------------------------------------------ */
743
/*
744
 *   Character mapper for UCS-2 big-endian to UTF-8 
745
 */
746
class CCharmapToUniUcs2Big: public CCharmapToUniUcs2
747
{
748
public:
749
    /* map a string */
750
    size_t map(char **output_ptr, size_t *output_buf_len,
751
               const char *input_ptr, size_t input_len) const;
752
};
753
754
/* ------------------------------------------------------------------------ */
755
/*
756
 *   Basic character mapper for single-byte character sets to UTF-8 
757
 */
758
class CCharmapToUniSB_basic: public CCharmapToUni
759
{
760
public:
761
    /* read from a single-byte input file, translating to UTF-8 */
762
    virtual size_t read_file(osfildef *fp, char *buf, size_t bufl,
763
                             unsigned long read_limit);
764
765
    /* determine if a byte sequence forms a complete character */
766
    virtual int is_complete_char(const char *, size_t) const
767
    {
768
        /* 
769
         *   every character in a single-byte set requires just one byte;
770
         *   since 'len' is required to be at least one, there's no way we
771
         *   can't have a complete character 
772
         */
773
        return TRUE;
774
    }
775
776
    /* map a string, providing partial character info */
777
    virtual size_t map2(char **output_ptr, size_t *output_buf_len,
778
                        const char *input_ptr, size_t input_len,
779
                        size_t *partial_len) const
780
    {
781
        /* 
782
         *   for all single-byte character sets, one byte == one character,
783
         *   so it's impossible to have partial characters 
784
         */
785
        *partial_len = 0;
786
787
        /* perform the normal mapping */
788
        return map(output_ptr, output_buf_len, input_ptr, input_len);
789
    }
790
791
protected:
792
    /* temporary buffer for reading files */
793
    char inbuf_[512];
794
};
795
796
/* ------------------------------------------------------------------------ */
797
/*
798
 *   Character mapper for plain ASCII to UTF-8
799
 */
800
class CCharmapToUniASCII: public CCharmapToUniSB_basic
801
{
802
public:
803
    /* map a string */
804
    size_t map(char **output_ptr, size_t *output_buf_len,
805
               const char *input_ptr, size_t input_len) const;
806
807
protected:
808
    /* 
809
     *   there's no map for the ASCII translation, so we can ignore
810
     *   mapping calls 
811
     */
812
    void set_mapping(wchar_t, wchar_t) { }
813
};
814
815
/* ------------------------------------------------------------------------ */
816
/*
817
 *   Character mapper for single-byte character sets to UTF-8.
818
 */
819
class CCharmapToUniSB: public CCharmapToUniSB_basic
820
{
821
public:
822
    CCharmapToUniSB()
823
    {
824
        int i;
825
        
826
        /* initialize the mapping table to all U+FFFD */
827
        for (i = 0 ; i < 256 ; ++i)
828
            map_[i] = 0xFFFD;
829
    }
830
    
831
    /* map a string */
832
    size_t map(char **output_ptr, size_t *output_buf_len,
833
               const char *input_ptr, size_t input_len) const;
834
835
protected:
836
    /* set a mapping */
837
    void set_mapping(wchar_t uni_code_pt, wchar_t local_code_pt)
838
    {
839
        /* 
840
         *   set the mapping, ignoring characters outside of our 8-bit
841
         *   range 
842
         */
843
        if (((unsigned int)local_code_pt) < 256)
844
            map_[local_code_pt] = uni_code_pt;
845
    }
846
847
private:
848
    /* 
849
     *   our mapping table - since the source character set is
850
     *   single-byte, we need only store a wchar_t for each of the
851
     *   possible 256 source characters 
852
     */
853
    wchar_t map_[256];
854
};
855
856
/* ------------------------------------------------------------------------ */
857
/*
858
 *   Character mapper for mixed multi-byte character sets to UTF-8.  This
859
 *   maps from local character sets that use a mixture of one-byte and
860
 *   two-byte sequences to represent characters.  
861
 */
862
863
/*
864
 *   Primary-byte mapping table entry.  This gives us mapping instructions
865
 *   for each leading byte of a character sequence.
866
 *   
867
 *   Each character is represented by a one-byte or two-byte sequence.  This
868
 *   mapper assumes a context-free mapping, hence for each character
869
 *   represented by a single byte, that single byte unambiguously indicates
870
 *   that character, and hence is never the first byte of a two-byte
871
 *   sequence.  For each character represented by a two-byte sequence, the
872
 *   first byte of the sequence can only be part of two-byte sequences, hence
873
 *   whenever we see that first byte we'll know for sure we have a two-byte
874
 *   character.
875
 *   
876
 *   Each mapping here is for a first byte.  If the byte is a single-byte
877
 *   character, then the 'sub' pointer is null and the 'ch' entry gives the
878
 *   Unicode code point for the character.  If the byte is the lead byte of
879
 *   one or more two-byte characters, then the 'sub' pointer is non-null and
880
 *   'ch' is ignored.  
881
 */
882
struct cmap_mb_entry
{
    /* 
     *   The sub-mapping table.  This is a pointer to a table of the Unicode
     *   code points of the two-byte sequences that start with this byte.
     *   Each entry in the array is a Unicode code point, and the array is
     *   indexed by the second byte of the two-byte sequence.  If this
     *   pointer is null, then this lead byte is a single-byte character.
     *   
     *   Note that this pointer, if non-null, always points to a 256-element
     *   array.  This array can thus be indexed directly with any unsigned
     *   8-bit byte value without any range checking.  
     */
    wchar_t *sub;

    /* 
     *   The Unicode code point of this character, if this primary byte is a
     *   one-byte character.  This field is ignored when 'sub' is non-null.  
     */
    wchar_t ch;
};
903
904
905
/*
906
 *   The multi-byte-to-UTF8 mapper 
907
 */
908
class CCharmapToUniMB: public CCharmapToUni
909
{
910
public:
911
    CCharmapToUniMB();
912
913
    /* delete the table */
914
    virtual ~CCharmapToUniMB();
915
916
    /* determine if a byte sequence forms a complete character */
917
    virtual int is_complete_char(const char *p, size_t len) const
918
    {
919
        /* 
920
         *   Check the first byte to see if this is a leading byte or a
921
         *   stand-alone single byte.  
922
         */
923
        if (map_[(unsigned char)*p].sub == 0)
924
        {
925
            /* 
926
             *   it's a stand-alone byte, so the character length is one;
927
             *   'len' is required to be at least 1, so we definitely have a
928
             *   complete character 
929
             */
930
            return TRUE;
931
        }
932
        else
933
        {
934
            /* it's a lead byte, so the character length is two */
935
            return (len >= 2);
936
        }
937
    }
938
939
    /* read from a multi-byte input file, translating to UTF-8 */
940
    virtual size_t read_file(osfildef *fp, char *buf, size_t bufl,
941
                             unsigned long read_limit);
942
943
    /* map a string */
944
    size_t map(char **output_ptr, size_t *output_buf_len,
945
               const char *input_ptr, size_t input_len) const
946
    {
947
        size_t partial_len;
948
949
        /* 
950
         *   do the full mapping, discarding the partial last character byte
951
         *   length information 
952
         */
953
        return map2(output_ptr, output_buf_len, input_ptr, input_len,
954
                    &partial_len);
955
    }
956
957
    /* map a string, providing partial character info */
958
    virtual size_t map2(char **output_ptr, size_t *output_buf_len,
959
                        const char *input_ptr, size_t input_len,
960
                        size_t *partial_len) const;
961
962
protected:
963
    /* set a mapping */
964
    void set_mapping(wchar_t uni_code_pt, wchar_t local_code_pt);
965
966
private:
967
    /* the primary-byte mapping table */
968
    cmap_mb_entry map_[256];
969
970
    /* temporary buffer for reading files */
971
    char inbuf_[512];
972
};
973
974
/* ------------------------------------------------------------------------ */
975
/*
 *   Character mapper for double-byte character sets to UTF-8.  This maps
 *   from local character sets that use a two-byte sequence to represent
 *   each local character.
 *   
 *   For now, this is a trivial subclass of the multi-byte mapper; that
 *   mapper handles the more general case of varying-length local
 *   characters, so it can easily handle the case where every character is
 *   represented by two bytes.  If there is sufficient demand for it, a
 *   special-case subclass to handle double-byte character sets specifically
 *   could provide efficiency gains, since it wouldn't have to check each
 *   lead byte to determine the character sequence length.  
 */
988
class CCharmapToUniDB: public CCharmapToUniMB
989
{
990
public:
991
};
992
993
/* ------------------------------------------------------------------------ */
994
/*
995
 *   Character mapper for plain ISO-8859-1 to UTF-8 
996
 */
997
class CCharmapToUni8859_1: public CCharmapToUniSB
998
{
999
public:
1000
    /* creation */
1001
    CCharmapToUni8859_1()
1002
    {
1003
        wchar_t i;
1004
        
1005
        /* 
1006
         *   Initialize our mapping table.  Each 8859-1 code point maps to
1007
         *   the same code point in Unicode, so this is a trivial
1008
         *   translation.  
1009
         */
1010
        for (i = 0 ; i < 256 ; ++i)
1011
            set_mapping(i, i);
1012
    }
1013
};
1014
1015
/* ======================================================================== */
1016
/*
1017
 *   Unicode UTF-8 - to - local character set mappers 
1018
 */
1019
1020
/* ------------------------------------------------------------------------ */
1021
/*
1022
 *   Trivial character mapper for UTF8-to-UTF8 conversions.  This can be
1023
 *   used when writing external data in UTF8 format; since this is the
1024
 *   same format we use internally, no conversion is required.  
1025
 */
1026
class CCharmapToLocalUTF8: public CCharmapToLocal
1027
{
1028
public:
1029
    /* map a string */
1030
    virtual size_t map_utf8(char *dest, size_t dest_len,
1031
                            utf8_ptr src, size_t src_byte_len,
1032
                            size_t *src_bytes_used) const;
1033
1034
    /* map a null-terminated string */
1035
    virtual size_t map_utf8z(char *dest, size_t dest_len,
1036
                             utf8_ptr src) const;
1037
1038
    /* map a character */
1039
    size_t map(wchar_t unicode_char, char **output_ptr,
1040
               size_t *output_len) const;
1041
1042
    /* 
1043
     *   determine if the given Unicode character has a mapping to the local
1044
     *   character set 
1045
     */
1046
    virtual int is_mappable(wchar_t unicode_char) const
1047
    {
1048
        /* every character can be mapped UTF8-to-UTF8, obviously */
1049
        return TRUE;
1050
    }
1051
};
1052
1053
/* ------------------------------------------------------------------------ */
1054
/*
1055
 *   Character mapper for single-byte character sets.  Each character in
1056
 *   the local (output) character set is represented by a single byte.
1057
 */
1058
class CCharmapToLocalSB: public CCharmapToLocal
1059
{
1060
public:
1061
    /* map a string */
1062
    virtual size_t map_utf8(char *dest, size_t dest_len,
1063
                            utf8_ptr src, size_t src_byte_len,
1064
                            size_t *src_bytes_used) const;
1065
1066
    /* map a null-terminated string */
1067
    virtual size_t map_utf8z(char *dest, size_t dest_len,
1068
                             utf8_ptr src) const;
1069
1070
    /* map a character */
1071
    size_t map(wchar_t unicode_char, char **output_ptr,
1072
               size_t *output_len) const;
1073
};
1074
1075
1076
/* ------------------------------------------------------------------------ */
1077
/*
1078
 *   Mixed multi-byte mapper.  Each local character is represented by a
1079
 *   sequence of one or more bytes.
1080
 *   
1081
 *   This class is a trivial subclass of CCharmapToLocalSB.  The single-byte
1082
 *   base class already does everything we need to do, because it is designed
1083
 *   to cope with mappings that involve expansions that represent a single
1084
 *   Unicode character with a sequence of local characters (for example,
1085
 *   "(c)" for the copyright symbol).  
1086
 */
1087
class CCharmapToLocalMB: public CCharmapToLocalSB
1088
{
1089
public:
1090
};
1091
1092
/*
1093
 *   Double-byte mapper.  Each local character is represented by exactly two
1094
 *   bytes.  This class is a trivial subclass of CCharmapToLocalMB, because
1095
 *   the multi-byte mapper already handles the more general case of local
1096
 *   character representations that use varying byte lengths; there is no
1097
 *   particular efficiency gain to be had by creating a separate special-case
1098
 *   class for double-byte character sets.  
1099
 */
1100
class CCharmapToLocalDB: public CCharmapToLocalMB
1101
{
1102
public:
1103
};
1104
1105
1106
/* ------------------------------------------------------------------------ */
1107
/*
 *   Character mapper for mapping to local default 7-bit ASCII.  This
 *   mapper has a built-in character set translation so that we can
 *   always create one without having to find an external mapping file.  
 */
1112
class CCharmapToLocalASCII: public CCharmapToLocalSB
1113
{
1114
public:
1115
    CCharmapToLocalASCII();
1116
};
1117
1118
1119
/*
1120
 *   Character mapper for mapping to local ISO-8859-1.  This mapper has a
1121
 *   built-in character set translation so that we can always create one
1122
 *   even without an external mapping file.  
1123
 */
1124
class CCharmapToLocal8859_1: public CCharmapToLocalSB
1125
{
1126
public:
1127
    CCharmapToLocal8859_1();
1128
};
1129
1130
/* ------------------------------------------------------------------------ */
1131
/*
 *   Character mapper for 16-bit Wide Unicode local character set.  Stores
 *   characters in the correct local wchar_t representation.  Assumes that
 *   the pointer is wchar_t-aligned.
 *   
 *   This is a trivial translation.  Because we're mapping from Unicode to
 *   Unicode, the only thing we're changing is the encoding format - the
 *   character code is simply copied without any translation, since
 *   Unicode is the same everywhere.  
 */
1141
class CCharmapToLocalWideUnicode: public CCharmapToLocal
1142
{
1143
public:
1144
    /* map a string */
1145
    virtual size_t map_utf8(char *dest, size_t dest_len,
1146
                            utf8_ptr src, size_t src_byte_len,
1147
                            size_t *src_bytes_used) const;
1148
1149
    /* map a null-terminated string */
1150
    virtual size_t map_utf8z(char *dest, size_t dest_len,
1151
                             utf8_ptr src) const;
1152
1153
    /* map a character */
1154
    size_t map(wchar_t unicode_char, char **output_ptr,
1155
               size_t *output_len) const;
1156
1157
    /* 
1158
     *   determine if the given Unicode character has a mapping to the local
1159
     *   character set 
1160
     */
1161
    virtual int is_mappable(wchar_t unicode_char) const
1162
    {
1163
        /* every character can be mapped UTF8-to-UCS2 */
1164
        return TRUE;
1165
    }
1166
};
1167
1168
/* ------------------------------------------------------------------------ */
1169
/*
1170
 *   Character mapper for 16-bit Wide Unicode, big-endian.  Stores the
1171
 *   characters in big-endian UCS-2 representation. 
1172
 */
1173
class CCharmapToLocalUcs2Big: public CCharmapToLocal
1174
{
1175
public:
1176
    /* map a string */
1177
    virtual size_t map_utf8(char *dest, size_t dest_len,
1178
                            utf8_ptr src, size_t src_byte_len,
1179
                            size_t *src_bytes_used) const;
1180
1181
    /* map a null-terminated string */
1182
    virtual size_t map_utf8z(char *dest, size_t dest_len,
1183
                             utf8_ptr src) const;
1184
1185
    /* map a character */
1186
    size_t map(wchar_t unicode_char, char **output_ptr,
1187
               size_t *output_len) const;
1188
};
1189
1190
/* ------------------------------------------------------------------------ */
1191
/*
1192
 *   Character mapper for 16-bit Wide Unicode, little-endian.  Stores the
1193
 *   characters in little-endian UCS-2 representation.  
1194
 */
1195
class CCharmapToLocalUcs2Little: public CCharmapToLocal
1196
{
1197
public:
1198
    /* map a string */
1199
    virtual size_t map_utf8(char *dest, size_t dest_len,
1200
                            utf8_ptr src, size_t src_byte_len,
1201
                            size_t *src_bytes_used) const;
1202
1203
    /* map a null-terminated string */
1204
    virtual size_t map_utf8z(char *dest, size_t dest_len,
1205
                             utf8_ptr src) const;
1206
1207
    /* map a character */
1208
    size_t map(wchar_t unicode_char, char **output_ptr,
1209
               size_t *output_len) const;
1210
};
1211
1212
1213
#endif /* CHARMAP_H */