cfad47cfa3/tads3/utf8.h

4b825dc642cb6eb9a060e54bf8d69288fbee4904cfad47cfa334b206c65f22086bcc5d63e6f70944
1
/* $Header: d:/cvsroot/tads/tads3/utf8.h,v 1.2 1999/05/17 02:52:29 MJRoberts Exp $ */
2
3
/* 
4
 *   Copyright (c) 1998, 2002 Michael J. Roberts.  All Rights Reserved.
5
 *   
6
 *   Please see the accompanying license file, LICENSE.TXT, for information
7
 *   on using and copying this software.  
8
 */
9
/*
10
Name
11
  utf8.h - UTF-8 character string manipulation
12
Function
13
  
14
Notes
15
  
16
Modified
17
  10/16/98 MJRoberts  - Creation
18
*/
19
20
#ifndef UTF8_H
21
#define UTF8_H
22
23
#include <stdlib.h>
24
25
/* ------------------------------------------------------------------------ */
26
/*
27
 *   UTF-8 character string pointer.
28
 *   
29
 *   Note that this class deviates from the normal naming convention where
30
 *   each class begins with a capital 'C'.  Since this class is so
31
 *   low-level, and is used so much like the (char *) type, it seems more
32
 *   proper to give it a name as though it were a typedef for a native type.
33
 *   
34
 *   If ever there was a time when operator overloading is indicated, this
35
 *   would be it.  We could overload increment and decrement operators, for
36
 *   example, to step through the string.  However, I just plain don't like
37
 *   operator overloading, so I do not use it here.  Instead, we use
38
 *   explicit method names to avoid obfuscating the code as overloaded
39
 *   operators would.  It's a trade-off: it's less concise this way, but
40
 *   less obscure.
41
 *   
42
 *   Note the important distinction between "byte" and "character": a byte
43
 *   is the basic multi-bit unit of native storage, and a character
44
 *   represents the basic lexical unit; a character may be composed of more
45
 *   than one byte.
46
 */
47
48
class utf8_ptr
49
{
50
public:
51
    /* create a UTF-8 string pointer, with no initial underlying string */
52
    utf8_ptr() { p_ = 0; }
53
    
54
    /* 
55
     *   Create a UTF-8 string pointer with an underlying string.  The
56
     *   pointer must point to the first byte of a valid character.  
57
     */
58
    utf8_ptr(char *str) { set(str); }
59
60
    /* 
61
     *   Set the pointer to a new underlying buffer.  The pointer must
62
     *   point to the first byte of a valid character if there are already
63
     *   characters in the buffer.  
64
     */
65
    void set(char *str) { p_ = str; }
66
67
    /*
68
     *   Get the character at the current position 
69
     */
70
    wchar_t getch() const { return s_getch(p_); }
71
72
    /*
73
     *   Get the character at a given character offset from the current
74
     *   position.  The offset must be positive.  
75
     */
76
    wchar_t getch_at(size_t ofs) const { return s_getch_at(p_, ofs); }
77
78
    /*
79
     *   Get the character preceding the current character by the given
80
     *   amount.  The offset must be positive.  getch_before(1) returns
81
     *   the character preceding the current character, (2) returns the
82
     *   character two positions before the current character, and so on. 
83
     */
84
    wchar_t getch_before(size_t ofs) const { return s_getch_before(p_,ofs); }
85
86
    /*
87
     *   Encode a character into the buffer at the current position, and
88
     *   increment the pointer past the character.
89
     */
90
    void setch(wchar_t ch)
91
    {
92
        /* store the character and advance the buffer pointer */
93
        p_ += s_putch(p_, ch);
94
    }
95
96
    /* call setch() for each character in a null-terminated string */
97
    void setch_str(const char *str)
98
    {
99
        for ( ; *str != '\0' ; ++str)
100
            p_ += s_putch(p_, *str);
101
    }
102
103
    /*
104
     *   Encode a string of wide characters into the buffer.  We'll
105
     *   increment our pointer so that it points to the next available
106
     *   character when we're done.  Returns the number of bytes used for
107
     *   the encoding.
108
     *   
109
     *   'src_count' is the number of wide characters in the source string.
110
     *   
111
     *   'bufsiz' gives the size remaining in the underlying buffer.  If
112
     *   we run out of space, we won't encode any more characters, but we
113
     *   will still return the total number of bytes required to encode
114
     *   the string.  
115
     */
116
    size_t setwchars(const wchar_t *src, size_t src_count, size_t bufsiz);
117
118
    /*
119
     *   Encode a null-terminated string of wide-characters into our
120
     *   buffer.  Works like setwchars(), but stops at the null terminator
121
     *   in the source rather than taking a character count.
122
     *   
123
     *   This routine includes the null terminator in the resulting UTF-8
124
     *   string, and includes the space it takes in the result length, BUT
125
     *   we leave our pointer pointing to the null terminator.  
126
     */
127
    size_t setwcharsz(const wchar_t *src, size_t bufsiz);
128
    
129
    /* increment the pointer by one character */
130
    void inc() { p_ = s_inc(p_); }
131
132
    /* 
133
     *   increment the pointer by one character, and decrement a remaining
134
     *   length counter accordingly 
135
     */
136
    void inc(size_t *rem)
137
    {
138
        char *p;
139
        
140
        /* calculate the increment amount */
141
        p = s_inc(p_);
142
143
        /* decrement the length counter by the change */
144
        *rem -= (p - p_);
145
146
        /* save the new pointer value */
147
        p_ = p;
148
    }
149
150
    /* decrement the pointer by one character */
151
    void dec() { p_ = s_dec(p_); }
152
153
    /* decrement poniter and increment the remaining size to compensate */
154
    void dec(size_t *rem)
155
    {
156
        char *p;
157
158
        /* calculate the decrement amount */
159
        p = s_dec(p_);
160
161
        /* decrement the length counter by the change */
162
        *rem += (p_ - p);
163
        
164
        /* save the new pointer value */
165
        p_ = p;
166
    }
167
168
    /* 
169
     *   Determine if the current character is a continuation character.
170
     *   Returns 1 if so, 0 if not. 
171
     */
172
    int is_continuation() const { return s_is_continuation(p_); }
173
174
    /* 
175
     *   count the number of characters in the given number of bytes,
176
     *   starting at the current byte 
177
     */
178
    size_t len(size_t bytecnt) const
179
    {
180
        char *end;
181
        char *p;
182
        size_t cnt;
183
184
        /* calculate the ending point */
185
        p = p_;
186
        end = p + bytecnt;
187
188
        /* increment until we run out of bytes */ 
189
        for (cnt = 0 ; p < end ; p = s_inc(p), ++cnt) ;
190
191
        /* return the result */
192
        return cnt;
193
    }
194
195
    /* get the byte size of the current character */
196
    size_t charsize() const { return s_charsize(*p_); }
197
    
198
    /*
199
     *   Get the number of bytes in the given number of characters
200
     *   starting at the current position.  
201
     */
202
    size_t bytelen(size_t charcnt) const
203
    {
204
        char *p;
205
206
        /* skip the given number of characters */
207
        for (p = p_ ; charcnt != 0 ; p = s_inc(p), --charcnt) ;
208
209
        /* return the number of bytes we skipped */
210
        return (p - p_);
211
    }
212
213
    /*
214
     *   count the number of characters to the null terminator
215
     */
216
    size_t lenz() const
217
    {
218
        char *p;
219
        size_t cnt;
220
221
        /* increment until we find a null byte */
222
        for (cnt = 0, p = p_ ; *p != 0 ; p = s_inc(p), ++cnt) ;
223
224
        /* return the result */
225
        return cnt;
226
    }
227
228
    /* get the current pointer position */
229
    char *getptr() const { return p_; }
230
231
    /* -------------------------------------------------------------------- */
232
    /*
233
     *   Static methods 
234
     */
235
236
    /*
237
     *   Compare two UTF-8 strings.  Returns a value less than zero if the
238
     *   first string is lexically less than the second string (i.e., the
239
     *   first string sorts ahead of the second string), zero if the two
240
     *   strings are identical, or a value greater than zero if the first
241
     *   string is lexically greater than the second string.  
242
     */
243
    static int s_compare_to(const char *p1, size_t bytelen1,
244
                            const char *p2, size_t bytelen2);
245
246
    /* get the character at the given byte pointer */
247
    static wchar_t s_getch(const char *p)
248
    {
249
        /*
250
         *   If the high bit is 0, it's a one-byte sequence encoding the
251
         *   value in the low seven bits.  
252
         */
253
        if ((*p & 0x80) == 0)
254
            return (((unsigned char)*p) & 0x7f);
255
256
        /*
257
         *   If the high two bytes are 110, it's a two-byte sequence, with
258
         *   the high-order 5 bits in the low 5 bits of the first byte, and
259
         *   the low-order six bits in the low 6 bits of the second byte.  
260
         */
261
        if ((*p & 0xE0) == 0xC0)
262
            return (((((unsigned char)*p) & 0x1F) << 6)
263
                    + (((unsigned char)*(p + 1)) & 0x3F));
264
265
        /*
266
         *   Otherwise, we have a three-byte sequence: the high-order 4 bits
267
         *   are in the low-order 5 bits of the first byte, the next 6 bits
268
         *   are in the low-order 6 bits of the second byte, and the
269
         *   low-order 6 bits are in the low-order 6 bits of the third byte.
270
         */
271
        return (((((unsigned char)*p) & 0x0F) << 12)
272
                + ((((unsigned char)*(p + 1)) & 0x3F) << 6)
273
                + (((unsigned char)*(p + 2)) & 0x3F));
274
    }
275
276
    /* 
277
     *   get the character at a given positive character offset from a
278
     *   byte pointer 
279
     */
280
    static wchar_t s_getch_at(const char *p, size_t ofs)
281
    {
282
        /* skip the given number of characters */
283
        for ( ; ofs != 0 ; --ofs, p += s_charsize(*p)) ;
284
285
        /* return the character at the current position */
286
        return s_getch(p);
287
    }
288
289
    /*
290
     *   get the character preceding the current character by the given
291
     *   number of positions; the offset value must be positive 
292
     */
293
    static wchar_t s_getch_before(const char *p, size_t ofs)
294
    {
295
        /* skip backwards the given number of characters */
296
        for ( ; ofs != 0 ; --ofs)
297
        {
298
            /* 
299
             *   back up by one to three bytes, until we find no more
300
             *   continuation flags 
301
             */
302
            --p;
303
            p -= s_is_continuation(p);
304
            p -= s_is_continuation(p);
305
        }
306
        
307
        /* return the character at the current position */
308
        return s_getch(p);
309
    }
310
311
    /* 
312
     *   Write a given wchar_t value to the given byte pointer.  The
313
     *   caller must already have checked (via s_wchar_size) that there's
314
     *   enough room in the buffer for this character's UTF-8
315
     *   representation.
316
     *   
317
     *   Returns the number of bytes stored.  
318
     */
319
    static size_t s_putch(char *p, wchar_t ch)
320
    {
321
        /* check the range to determine how to encode it */
322
        if (ch <= 0x7f)
323
        {
324
            /* 
325
             *   it's in the range 0x0000 to 0x007f - encode the low-order
326
             *   7 bits in one byte 
327
             */
328
            *p = (char)(ch & 0x7f);
329
            return 1;
330
        }
331
        else if (ch <= 0x07ff)
332
        {
333
            /* 
334
             *   It's in the range 0x0080 to 0x07ff - encode it in two
335
             *   bytes.  The high-order 5 bits go in the first byte after
336
             *   the two-byte prefix of 110, and the low-order 6 bits go in
337
             *   the second byte after the continuation prefix of 10.  
338
             */
339
            *p++ = (char)(0xC0 | ((ch >> 6) & 0x1F));
340
            *p = (char)(0x80 | (ch & 0x3F));
341
            return 2;
342
        }
343
        else
344
        {
345
            /*
346
             *   It's in the range 0x0800 to 0xffff - encode it in three
347
             *   bytes.  The high-order 4 bits go in the first byte after
348
             *   the 1110 prefix, the next 6 bits go in the second byte
349
             *   after the 10 continuation prefix, and the low-order 6 bits
350
             *   go in the third byte after another 10 continuation prefix.  
351
             */
352
            *p++ = (char)(0xE0 | ((ch >> 12) & 0x0F));
353
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3F));
354
            *p = (char)(0x80 | (ch & 0x3F));
355
            return 3;
356
        }
357
    }
358
359
    /* increment a pointer to a buffer, returning the result */
360
    static char *s_inc(char *p)
361
    {
362
        /* 
363
         *   increment the pointer by the size of the current character
364
         *   and return the result 
365
         */
366
        return p + s_charsize(*p);
367
    }
368
369
    /* get the size of the character at the given byte pointer */
370
    static size_t s_charsize(char c)
371
    {
372
        unsigned int ch;
373
374
        /*
375
         *   Check the top three bits.  If the pattern is 111xxxxx, we're
376
         *   pointing to a three-byte sequence.  If the pattern is
377
         *   110xxxxx, we're pointing to a two-byte sequence.  If it's
378
         *   0xxxxxxx, it's a one-byte sequence.
379
         *   
380
         *   We're being somewhat clever (tricky, anyway) here at the
381
         *   expense of clarity.  To avoid conditionals, we're doing some
382
         *   tricky bit masking and shifting, since these operations are
383
         *   extremely fast on most machines.  We figure out our increment
384
         *   using the bit patterns above to generate masks, then shift
385
         *   these around to produce 1's or 0's, then add up all of the
386
         *   mask calculations to get our final increment.
387
         *   
388
         *   The size is always at least 1 byte, so we start out with an
389
         *   increment of 1.
390
         *   
391
         *   Next, we note that character sizes other than 1 always
392
         *   require the high bit to be set.  So, the rest is all ANDed
393
         *   with (byte & 80) shifted right by seven OR'ed to the same
394
         *   thing shifted right by six, which will give us a bit mask of
395
         *   0 when the high bit is clear and 3 when it's set.
396
         *   
397
         *   Next, we'll pick out that third bit (xx1xxxxx or xx0xxxxx) by
398
         *   AND'ing with 0x20.  We'll shift this right by 5, to give us 1
399
         *   if we have a three-byte sequence.
400
         *   
401
         *   We'll then add 1 to this, so we'll have a result of 1 for a
402
         *   two-byte sequence, 2 for a three-byte sequence.  
403
         */
404
        ch = (unsigned int)(unsigned char)c;
405
        return (1 +
406
                ((((ch & 0x80) >> 7) | ((ch & 0x80) >> 6))
407
                 & (1 + ((ch & 0x20) >> 5))));
408
    }
409
410
    /* 
411
     *   get the number of bytes required to encode a given wchar_t in
412
     *   UTF-8 format 
413
     */
414
    static size_t s_wchar_size(wchar_t ch)
415
    {
416
        /* 
417
         *   characters 0-0x7f take up one byte; characters 0x80-0x7ff
418
         *   take up two bytes; all others take up three bytes 
419
         */
420
        return (ch < 0x80 ? 1 : (ch < 0x800 ? 2 : 3));
421
    }
422
423
    /* decrement a pointer by one character, returning the result */
424
    static char *s_dec(char *p)
425
    {
426
        /*
427
         *   Going backwards, we can't tell that we're on a start byte
428
         *   until we get there - there's no context to tell us which byte
429
         *   of a multi-byte sequence we're on, except that we can tell
430
         *   whether or not we're on the first byte or an extra byte.  So,
431
         *   decrement the pointer by a byte; if we're not on a start
432
         *   byte, decrement by another byte; if we're still not on a
433
         *   start byte, decrement it again.
434
         *   
435
         *   Since the longest possible sequence is three bytes, we'll
436
         *   unroll the loop and simply check twice to see if we're done
437
         *   yet.  
438
         */
439
        --p;
440
        p -= s_is_continuation(p);
441
        p -= s_is_continuation(p);
442
443
        /* return the result */
444
        return p;
445
    }
446
447
    /*
448
     *   Determine if the current byte is a continuation byte.  Returns 1
449
     *   if this is a continuation byte, 0 if not.  
450
     */
451
    static int s_is_continuation(const char *p)
452
    {
453
        unsigned int ch;
454
455
        /*   
456
         *   Continuation bytes have the pattern 10xxxxxx.  Initial bytes
457
         *   never have this pattern.  So, if a byte ANDed with 0xC0 is
458
         *   0x80 (i.e., the high two bits have the exact patern '10'),
459
         *   we're on a continuation byte.
460
         *   
461
         *   To avoid conditionals, which can be expensive because they
462
         *   require branching, we'll play more bit mask tricks: we'll
463
         *   compute a value that's 1 when the high two bits are '10', and
464
         *   is zero otherwise, and then subtract that from the current
465
         *   pointer.  To figure this value, we'll mask the byte with 0x80
466
         *   to pick out the high bit, and shift this right seven bits.
467
         *   This will give us 1 for 1xxxxxxx.  Then, we'll mask the byte
468
         *   with 0x40, which will pick out the second bit, invert the
469
         *   resulting bit pattern, AND it again with 0x40, and shift it
470
         *   right six bits.  This will give us 1 for x0xxxxxx.  We'll AND
471
         *   this with the previous calculation, which will give us 1 for
472
         *   10xxxxxx and 0 for anything else.  
473
         */
474
        ch = (unsigned int)(unsigned char)*p;
475
        return (((ch & 0x80) >> 7)
476
                & (((~(ch & 0x40)) & 0x40) >> 6));
477
    }
478
479
    /*
480
     *   Truncate a string to the given byte length, ensuring that only
481
     *   whole characters are included in the result.  Takes the proposed
482
     *   truncated length, and returns the actual length to use.  The
483
     *   returned length will be less than or equal to the proposed
484
     *   length; if the returned length is less than the proposed length,
485
     *   it means that the proposed length would have cut off a multi-byte
486
     *   character, so the actual length had to be shorter to ensure that
487
     *   no bytes of the final character were included. 
488
     */
489
    static size_t s_trunc(const char *p, size_t len)
490
    {
491
        const char *last_ch;
492
        size_t last_ch_len;
493
494
        /* 
495
         *   if the length is zero, no adjustment is needed - you
496
         *   obviously can't divide zero bytes 
497
         */
498
        if (len == 0)
499
            return 0;
500
501
        /* 
502
         *   Get a pointer to the start of the last byte within the
503
         *   proposed truncated byte region.  Note that the last byte in
504
         *   the buffer is at index (len-1), since the byte at index (len)
505
         *   is the next byte after the truncated region.  
506
         */
507
        last_ch = p + len - 1;
508
509
        /* 
510
         *   Decrement this byte pointer until we get to the start of the
511
         *   character that contains the final byte.  Since a character
512
         *   can never be more than three bytes long, we need decrement
513
         *   our pointer a maximum of two times.  
514
         */
515
        last_ch -= s_is_continuation(last_ch);
516
        last_ch -= s_is_continuation(last_ch);
517
518
        /* 
519
         *   figure the number of bytes of the last character that are
520
         *   actually in the truncated region - this is simply the number
521
         *   of bytes from where we are now to the end of the region 
522
         */
523
        last_ch_len = len - (last_ch - p);
524
525
        /*
526
         *   Now compute the actual size of this last character.  If the
527
         *   last character's actual size is the same as the truncated
528
         *   size, then the last character fits exactly and we can return
529
         *   the proposed length unchanged.  If the last character's
530
         *   required length is more than the truncated length, it means
531
         *   that the truncation has cut off the last character so that
532
         *   not all of its bytes fit, and hence we cannot include ANY of
533
         *   the last character's bytes in the result. 
534
         */
535
        if (last_ch_len >= s_charsize(*last_ch))
536
        {
537
            /* the last character fits in the truncation - we're fine */
538
            return len;
539
        }
540
        else
541
        {
542
            /* 
543
             *   the last character doesn't fit - truncate so that none of
544
             *   the last character's bytes are included 
545
             */
546
            return (last_ch - p);
547
        }
548
    }
549
550
private:
551
    /* the buffer pointer */
552
    char *p_;
553
};
554
555
#endif /* UTF8_H */
556
557