| | 1 | /* $Header: d:/cvsroot/tads/tads3/charmap.h,v 1.2 1999/05/17 02:52:29 MJRoberts Exp $ */ |
| | 2 | |
| | 3 | /* |
| | 4 | * Copyright (c) 1998, 2002 Michael J. Roberts. All Rights Reserved. |
| | 5 | * |
| | 6 | * Please see the accompanying license file, LICENSE.TXT, for information |
| | 7 | * on using and copying this software. |
| | 8 | */ |
| | 9 | /* |
| | 10 | Name |
| | 11 | charmap.h - character-set mapper |
| | 12 | Function |
| | 13 | Provides mappings between 16-bit Unicode and single-byte, multi-byte, |
| | 14 | and double-byte character sets. |
| | 15 | Notes |
| | 16 | |
| | 17 | Modified |
| | 18 | 10/17/98 MJRoberts - Creation |
| | 19 | */ |
| | 20 | |
| | 21 | #ifndef CHARMAP_H |
| | 22 | #define CHARMAP_H |
| | 23 | |
| | 24 | #include <stdlib.h> |
| | 25 | #include <memory.h> |
| | 26 | #include <string.h> |
| | 27 | |
| | 28 | #include "utf8.h" |
| | 29 | #include "os.h" |
| | 30 | #include "t3std.h" |
| | 31 | |
| | 32 | |
| | 33 | /* ------------------------------------------------------------------------ */ |
| | 34 | /* |
| | 35 | * Mapping Types. This enum provides a characterization of a local |
| | 36 | * character set (as defined in a mapping file). |
| | 37 | */ |
enum charmap_type_t
{
    /*
     *   SB: single-byte character set.  One 8-bit byte encodes one
     *   character.
     */
    CHARMAP_TYPE_SB = 0,

    /*
     *   DB: double-byte character set.  Every character occupies exactly
     *   two 8-bit bytes, high-order byte first.  For instance, the input
     *   bytes 0x12, 0x34, 0x56, 0x78 are read as the two 16-bit code points
     *   0x1234 and 0x5678.
     */
    CHARMAP_TYPE_DB = 1,

    /*
     *   MB: mixed multi-byte character set.  A character takes either one
     *   byte or two.  The lead byte of a two-byte character is never used
     *   as a one-byte character, and vice versa, so the lead byte alone
     *   tells us the character's length.  Two-byte characters are stored
     *   high-order byte first.
     *
     *   Example: if 0x00-0x7F are one-byte characters and 0x8000-0xFFFF are
     *   two-byte characters, then the bytes 0x12, 0x81, 0xAB, 0x82, 0xCD,
     *   0x34 decode as the characters 0x12, 0x81AB, 0x82CD, 0x34.
     */
    CHARMAP_TYPE_MB = 2
};
| | 71 | |
| | 72 | /* ------------------------------------------------------------------------ */ |
| | 73 | /* |
| | 74 | * Basic character mapper class. |
| | 75 | */ |
| | 76 | class CCharmap |
| | 77 | { |
| | 78 | public: |
| | 79 | /* add a reference */ |
| | 80 | void add_ref() { ++ref_cnt_; } |
| | 81 | |
| | 82 | /* release a reference; delete on removing the last reference */ |
| | 83 | void release_ref() |
| | 84 | { |
| | 85 | /* count the unreference */ |
| | 86 | --ref_cnt_; |
| | 87 | |
| | 88 | /* if that leaves no references, delete me */ |
| | 89 | if (ref_cnt_ == 0) |
| | 90 | delete this; |
| | 91 | } |
| | 92 | |
| | 93 | protected: |
| | 94 | CCharmap() |
| | 95 | { |
| | 96 | /* start out with one reference, for the initial creator */ |
| | 97 | ref_cnt_ = 1; |
| | 98 | } |
| | 99 | |
| | 100 | virtual ~CCharmap() { } |
| | 101 | |
| | 102 | /* |
| | 103 | * Open and characterize a mapping file. Returns the osfildef |
| | 104 | * pointer if the file was successfully opened and parsed, or null |
| | 105 | * if not. Sets *map_type to indicate the type of mapping contained |
| | 106 | * in the file. |
| | 107 | */ |
| | 108 | static osfildef *open_map_file(class CResLoader *res_loader, |
| | 109 | const char *table_name, |
| | 110 | charmap_type_t *map_type); |
| | 111 | |
| | 112 | /* check a name to see if it matches one of the names for ASCII */ |
| | 113 | static int name_is_ascii_synonym(const char *table_name) |
| | 114 | { |
| | 115 | /* accept any of the various synonyms for ASCII */ |
| | 116 | return (stricmp(table_name, "us-ascii") == 0 |
| | 117 | || stricmp(table_name, "asc7dflt") == 0 |
| | 118 | || stricmp(table_name, "ascii") == 0 |
| | 119 | || stricmp(table_name, "iso646-us") == 0 |
| | 120 | || stricmp(table_name, "iso-ir-6") == 0 |
| | 121 | || stricmp(table_name, "cp367") == 0 |
| | 122 | || stricmp(table_name, "us") == 0); |
| | 123 | } |
| | 124 | |
| | 125 | /* check a name to see if it matches one of the names for ISO 8859-1 */ |
| | 126 | static int name_is_8859_1_synonym(const char *table_name) |
| | 127 | { |
| | 128 | /* accept any of the various names for ISO 8859-1 */ |
| | 129 | return (stricmp(table_name, "iso-8859-1") == 0 |
| | 130 | || stricmp(table_name, "iso_8859-1") == 0 |
| | 131 | || stricmp(table_name, "iso-ir-100") == 0 |
| | 132 | || stricmp(table_name, "latin1") == 0 |
| | 133 | || stricmp(table_name, "l1") == 0 |
| | 134 | || stricmp(table_name, "cp819") == 0); |
| | 135 | } |
| | 136 | |
| | 137 | /* reference count */ |
| | 138 | unsigned int ref_cnt_; |
| | 139 | }; |
| | 140 | |
| | 141 | /* ------------------------------------------------------------------------ */ |
| | 142 | /* |
| | 143 | * Base character mapper class for mapping from a local character set to |
| | 144 | * UTF-8. This is an abstract interface that must be implemented for |
| | 145 | * different classes of character sets. |
| | 146 | */ |
class CCharmapToUni: public CCharmap
{
public:
    /* initialize - the base class constructor sets up the reference count */
    CCharmapToUni() { }

    /*
     *   Create a mapping object for a given character table.  We'll read
     *   enough of the character table to determine the appropriate
     *   concrete subclass to instantiate, then create an object, load the
     *   table into the object, and return the object.
     *
     *   The destructor of this class is protected, so the caller cannot
     *   'delete' the returned object directly.  The CCharmap base class
     *   constructor starts the object with one reference on behalf of its
     *   creator; the caller disposes of the object by calling
     *   release_ref() when finished with it.
     *
     *   Returns null if the mapping file cannot be loaded.
     */
    static CCharmapToUni *load(class CResLoader *res_loader,
                               const char *table_name);

    /*
     *   Determine if the given byte sequence forms a complete character in
     *   the local character set.  Returns true if so, false if not.  'len'
     *   must be at least 1.
     */
    virtual int is_complete_char(const char *p, size_t len) const = 0;

    /*
     *   Convert a string from the local character set to Unicode.
     *   Returns the byte length of the output.  If the output buffer is
     *   too small to store the result, we will return the size of the
     *   full result, but we won't write past the end of the buffer.
     *
     *   We'll advance *output_ptr by the number of bytes we write.
     *
     *   If we store anything, we'll decrement *output_buf_len by the
     *   number of bytes we store; if we don't have enough room, we'll set
     *   *output_buf_len to zero.
     *
     *   input_ptr is a pointer to the input string; input_len is the
     *   length in bytes of the input string.
     */
    virtual size_t map(char **output_ptr, size_t *output_buf_len,
                       const char *input_ptr, size_t input_len) const = 0;

    /*
     *   Convert a string from the local character set to Unicode.
     *
     *   This works the same way as map(), but additionally provides
     *   information on the consumption of source bytes by filling in
     *   *partial_len with the number of bytes at the end of the source
     *   buffer that are not mappable because they do not form complete
     *   characters in the source character set.  Since we scan all input
     *   regardless of whether there's space to store the resulting output,
     *   this will reflect the same number of bytes no matter what the
     *   output buffer length.
     */
    virtual size_t map2(char **output_ptr, size_t *output_buf_len,
                        const char *input_ptr, size_t input_len,
                        size_t *partial_len) const = 0;

    /*
     *   Map a null-terminated string into a buffer; returns the number of
     *   bytes of the buffer actually needed to store the string.  If the
     *   entire string couldn't be mapped, this will return a number
     *   greater than or equal to the output buffer size, but we will not
     *   write beyond the end of the buffer.
     *
     *   If there's space, the result will be null-terminated; however,
     *   the null terminator byte will not be included in the result
     *   length.  If the return value exactly equals outbuflen, it means
     *   that the string exactly fills the buffer, hence there isn't space
     *   for a null terminator.
     */
    size_t map_str(char *outbuf, size_t outbuflen, const char *input_str);

    /*
     *   Read characters from a file into a buffer, translating the
     *   characters to UTF-8.  Returns the number of bytes copied into the
     *   buffer; returns zero on end of file.  The buffer must be at least
     *   three bytes long to ensure that at least one character can be read
     *   from the file (the longest UTF-8 encoding of a 16-bit Unicode
     *   character takes up three bytes), since it would otherwise not be
     *   possible to distinguish reaching the end of the file from simply
     *   being unable to fit even one character into the buffer.
     *
     *   The file can be opened in text or binary mode; we don't pay any
     *   attention to newline sequences, so the mode is not relevant to us.
     *
     *   This routine may read fewer than the desired number of bytes.  Upon
     *   return, the file's seek position should be set to the next byte of
     *   the file after the last character copied into the output buffer.
     *
     *   'read_limit' is the maximum number of bytes we're allowed to read
     *   from the underlying file.  If this is zero, then the read size is
     *   unlimited.
     */
    virtual size_t read_file(osfildef *fp, char *buf, size_t bufl,
                             unsigned long read_limit) = 0;

protected:
    /*
     *   Delete the mapping.  This is protected so that outside code cannot
     *   destroy a mapper directly.
     */
    virtual ~CCharmapToUni() { }

    /* load the mapping table from the file */
    void load_table(osfildef *fp);

    /*
     *   Set a mapping.  uni_code_pt is the unicode code point, and
     *   local_code_pt is the code point in the local character set.  Each
     *   concrete subclass decides how (or whether) to record the pair.
     */
    virtual void set_mapping(wchar_t uni_code_pt, wchar_t local_code_pt) = 0;
};
| | 258 | |
| | 259 | /* ------------------------------------------------------------------------ */ |
| | 260 | /* |
| | 261 | * Base character mapper class for mapping from Unicode UTF-8 to a local |
| | 262 | * character set. This is an abstract interface that must be separately |
| | 263 | * implemented for different classes of character sets. |
| | 264 | * |
| | 265 | * Each mapping object maintains a table of mapping tables. The master |
| | 266 | * table contains an array of up to 256 sub-tables. The top 8 bits of |
| | 267 | * the unicode character value give the index in the master table. Each |
| | 268 | * entry in the master table is a pointer to a sub-table, or a null |
| | 269 | * pointer if there are no mappings for characters in the range for that |
| | 270 | * sub-table. |
| | 271 | * |
| | 272 | * For example, unicode characters 0x0000 through 0x00ff are mapped |
| | 273 | * through the table obtained by getting the pointer at index 0 from the |
| | 274 | * master table. Unicode characters 0x0200 through 0x02ff are in the |
| | 275 | * table at master table index 2. |
| | 276 | * |
| | 277 | * If a master table index entry is empty (i.e., the pointer in the |
| | 278 | * master table at that index is null), it means that all of the |
| | 279 | * characters in the range for that master index map to the default |
| | 280 | * character. Otherwise, we index into the sub-table using the |
| | 281 | * low-order 8 bits of the Unicode character code to find the character |
| | 282 | * mapping giving the local character set code for the Unicode value. |
| | 283 | * |
| | 284 | * Each entry in the mapping table is the offset of the translation of |
| | 285 | * the character within the translation array. The translation array is |
| | 286 | * an array of bytes. The first byte of each entry is the length in |
| | 287 | * bytes of the entry (not including the length byte), followed by the |
| | 288 | * bytes of the entry. |
| | 289 | * |
| | 290 | * The first entry in the translation array is always the default |
| | 291 | * character, which is the mapping we use for characters with no other |
| | 292 | * valid mapping. |
| | 293 | */ |
| | 294 | class CCharmapToLocal: public CCharmap |
| | 295 | { |
| | 296 | public: |
| | 297 | /* initialize */ |
| | 298 | CCharmapToLocal(); |
| | 299 | |
| | 300 | /* create a mapper and load the mapping from a file */ |
| | 301 | static CCharmapToLocal *load(class CResLoader *res_loader, |
| | 302 | const char *table_name); |
| | 303 | |
| | 304 | /* |
| | 305 | * Convert a character from Unicode to the local character set. |
| | 306 | * Stores the character's byte or bytes at the given pointer, and |
| | 307 | * increments the pointer to point to the next byte after the |
| | 308 | * character. |
| | 309 | * |
| | 310 | * Returns the byte length of the output. If the output buffer is |
| | 311 | * not long enough to store the result, we simply return the size of |
| | 312 | * the result without storing anything. |
| | 313 | * |
| | 314 | * If we actually store anything, we'll decrement *output_buf_len by |
| | 315 | * the number of bytes we stored; if we don't have room to store |
| | 316 | * anything, we'll set *output_buf_len to zero. |
| | 317 | */ |
| | 318 | virtual size_t map(wchar_t unicode_char, char **output_ptr, |
| | 319 | size_t *output_buf_len) const = 0; |
| | 320 | |
| | 321 | /* |
| | 322 | * Simple single-character mapper - returns the byte length of the |
| | 323 | * local character equivalent of the unicode character, which is |
| | 324 | * written into the buffer. If the buffer isn't big enough, we'll |
| | 325 | * still return the length, but won't write anything to the buffer. |
| | 326 | */ |
| | 327 | size_t map_char(wchar_t unicode_char, char *buf, size_t buflen) |
| | 328 | { |
| | 329 | /* map the character */ |
| | 330 | return map(unicode_char, &buf, &buflen); |
| | 331 | } |
| | 332 | |
| | 333 | |
| | 334 | /* |
| | 335 | * Convert a UTF-8 string with a given byte length to the local |
| | 336 | * character set. |
| | 337 | * |
| | 338 | * Returns the byte length of the result. If the result is too long |
| | 339 | * to fit in the output buffer, we'll return the number of bytes we |
| | 340 | * actually were able to store (we'll store as much as we can, and |
| | 341 | * stop when we run out of space). We'll indicate in |
| | 342 | * *src_bytes_used how many bytes of the source we were able to map. |
| | 343 | * |
| | 344 | * If the output buffer is null, we will store nothing, but simply |
| | 345 | * determine how much space it would take to store the entire string. |
| | 346 | * |
| | 347 | * This base class provides an implementation of this method that is |
| | 348 | * suitable for all subclasses, but the method is defined as virtual |
| | 349 | * so that subclasses can override it with a more tailored (and thus |
| | 350 | * more efficient) implementation. The general-purpose base-class |
| | 351 | * implementation must call the virtual function map() for each |
| | 352 | * character mapped. |
| | 353 | */ |
| | 354 | virtual size_t map_utf8(char *dest, size_t dest_len, |
| | 355 | utf8_ptr src, size_t src_byte_len, |
| | 356 | size_t *src_bytes_used) const; |
| | 357 | |
| | 358 | /* |
| | 359 | * map to utf8 - alternative interface using character buffers |
| | 360 | * (rather than UTF8 pointers) |
| | 361 | */ |
| | 362 | size_t map_utf8(char *dest, size_t dest_len, |
| | 363 | const char *src, size_t src_byte_len, |
| | 364 | size_t *src_bytes_used) const; |
| | 365 | |
| | 366 | /* |
| | 367 | * Convert a null-terminated UTF-8 string to the local character set. |
| | 368 | * |
| | 369 | * Returns the byte length of the result. If the result is too long |
| | 370 | * to fit in the output buffer, we'll return the size without storing |
| | 371 | * the entire string (we'll store as much as we can, and stop when we |
| | 372 | * run out of space, but continue counting the length needed; call |
| | 373 | * with a destination buffer length of zero to simply determine how |
| | 374 | * much space is needed for the result). |
| | 375 | * |
| | 376 | * The length returned does NOT include the null terminator. However, |
| | 377 | * if there's room, we will null-terminate the result string. So, if |
| | 378 | * the caller wants the result to be null terminated, it should make |
| | 379 | * sure that the buffer contains one byte more than the space reported |
| | 380 | * as necessary to store the result. |
| | 381 | */ |
| | 382 | virtual size_t map_utf8z(char *dest, size_t dest_len, utf8_ptr src) |
| | 383 | const; |
| | 384 | |
| | 385 | /* |
| | 386 | * Convert a null-terminated UTF-8 string to the local character set, |
| | 387 | * filling in an 'escape' sequence for unknown characters. For each |
| | 388 | * unknown character, we'll invoke the given callback to get the |
| | 389 | * 'escaped' representation. Use &CCharmapToLocal::source_esc_cb, for |
| | 390 | * example, to map using source-code-style escape sequences. |
| | 391 | * |
| | 392 | * The callback takes the unmappable character, a pointer to the output |
| | 393 | * buffer, and a pointer to the length remaining. It should fill in |
| | 394 | * the buffer with the escaped sequence (up to the remaining length |
| | 395 | * limit), and adjust the buffer pointer and length for the space |
| | 396 | * consumed. The return value is the full length required for the |
| | 397 | * complete escape sequence, even if there's not enough space in the |
| | 398 | * buffer to hold that many characters. |
| | 399 | */ |
| | 400 | virtual size_t map_utf8z_esc(char *dest, size_t dest_len, utf8_ptr src, |
| | 401 | size_t (*esc_fn)(wchar_t, char **, size_t *)) |
| | 402 | const; |
| | 403 | |
| | 404 | /* |
| | 405 | * ready-made callback for map_utf8z_esc() - map to unicode 'backslash' |
| | 406 | * escape sequences ('\u1234'), as we'd use in tads source code |
| | 407 | */ |
| | 408 | static size_t source_esc_cb(wchar_t ch, char **dest, size_t *len); |
| | 409 | |
| | 410 | /* |
| | 411 | * Write data to a file, converting from UTF-8 to the local character |
| | 412 | * set. Returns zero on success, non-zero if an error occurs writing |
| | 413 | * the data. |
| | 414 | */ |
| | 415 | int write_file(osfildef *fp, const char *buf, size_t bufl); |
| | 416 | |
| | 417 | /* |
| | 418 | * determine if the given Unicode character has a mapping to the local |
| | 419 | * character set |
| | 420 | */ |
| | 421 | virtual int is_mappable(wchar_t unicode_char) const |
| | 422 | { |
| | 423 | /* |
| | 424 | * By default, it's mappable if it has a non-default mapping in |
| | 425 | * the translation table. The default mapping is always at offset |
| | 426 | * zero in the translation table. |
| | 427 | */ |
| | 428 | return (get_mapping(unicode_char) != 0); |
| | 429 | } |
| | 430 | |
| | 431 | /* |
| | 432 | * Get the display expansion for a unicode character. This returns a |
| | 433 | * pointer to an array of wchar_t characters, and fills in the length |
| | 434 | * variable. Returns null if there's no expansion. |
| | 435 | * |
| | 436 | * An "expansion" is a list of two or more unicode characters that |
| | 437 | * should be substituted for the given unicode character when the |
| | 438 | * character is displayed. Display expansions are normally used for |
| | 439 | * visual approximations when the local character set doesn't contain |
| | 440 | * an exact match for the unicode character; for example, an ASCII |
| | 441 | * mapping might use the expansion "(c)" to represent the copyright |
| | 442 | * circled-C symbol, or the two-character sequence "AE" to represent |
| | 443 | * the AE ligature. |
| | 444 | */ |
| | 445 | const wchar_t *get_expansion(wchar_t unicode_char, size_t *len) |
| | 446 | { |
| | 447 | size_t ofs; |
| | 448 | const wchar_t *map; |
| | 449 | |
| | 450 | /* get the mapping offset in the expansion array */ |
| | 451 | ofs = get_exp_mapping(unicode_char); |
| | 452 | |
| | 453 | /* if the mapping offset is zero, it means there's no mapping */ |
| | 454 | if (ofs == 0) |
| | 455 | { |
| | 456 | /* indicate that there's no mapping by returning null */ |
| | 457 | *len = 0; |
| | 458 | return 0; |
| | 459 | } |
| | 460 | |
| | 461 | /* get the mapping pointer */ |
| | 462 | map = get_exp_ptr(ofs); |
| | 463 | |
| | 464 | /* read the length and skip it */ |
| | 465 | *len = (size_t)*map++; |
| | 466 | |
| | 467 | /* return the pointer to the first character of the expansion */ |
| | 468 | return map; |
| | 469 | } |
| | 470 | |
| | 471 | protected: |
| | 472 | /* delete the mapping */ |
| | 473 | virtual ~CCharmapToLocal(); |
| | 474 | |
| | 475 | /* given a Unicode character, get the mapping for the character */ |
| | 476 | unsigned int get_mapping(wchar_t unicode_char) const |
| | 477 | { |
| | 478 | unsigned int *subtable; |
| | 479 | |
| | 480 | /* get the mapping table */ |
| | 481 | subtable = get_sub_table(unicode_char); |
| | 482 | |
| | 483 | /* |
| | 484 | * If there is no subtable, return the default character, which is |
| | 485 | * always at offset zero in the translation array; otherwise, use |
| | 486 | * the low-order 8 bits of the character code as the index into |
| | 487 | * the subtable and return the value we find there |
| | 488 | */ |
| | 489 | if (subtable == 0) |
| | 490 | return 0; |
| | 491 | else |
| | 492 | return subtable[unicode_char & 0xff]; |
| | 493 | } |
| | 494 | |
| | 495 | /* given a Unicode character, get the expansion for the character */ |
| | 496 | unsigned int get_exp_mapping(wchar_t unicode_char) const |
| | 497 | { |
| | 498 | unsigned int *subtable; |
| | 499 | |
| | 500 | /* get the mapping table */ |
| | 501 | subtable = get_exp_sub_table(unicode_char); |
| | 502 | |
| | 503 | /* |
| | 504 | * if there's no subtable, return zero to indicate there's no |
| | 505 | * expansion; otherwise, return the entry from the subtable |
| | 506 | */ |
| | 507 | return (subtable == 0 ? 0 : subtable[unicode_char & 0xff]); |
| | 508 | } |
| | 509 | |
| | 510 | /* |
| | 511 | * Get a pointer to the sequence of bytes in the translation array at |
| | 512 | * a given offset |
| | 513 | */ |
| | 514 | const unsigned char *get_xlat_ptr(unsigned int ofs) const |
| | 515 | { |
| | 516 | return &xlat_array_[ofs]; |
| | 517 | } |
| | 518 | |
| | 519 | /* |
| | 520 | * Get a pointer to the translation of a character and the length in |
| | 521 | * bytes of the translation |
| | 522 | */ |
| | 523 | const unsigned char *get_xlation(wchar_t unicode_char, size_t *map_len) |
| | 524 | const |
| | 525 | { |
| | 526 | const unsigned char *map; |
| | 527 | |
| | 528 | /* get the translation offset */ |
| | 529 | map = get_xlat_ptr(get_mapping(unicode_char)); |
| | 530 | |
| | 531 | /* read the length and skip it in the table */ |
| | 532 | *map_len = (size_t)*map++; |
| | 533 | |
| | 534 | /* return the mapped byte sequence */ |
| | 535 | return map; |
| | 536 | } |
| | 537 | |
| | 538 | /* |
| | 539 | * get a pointer to the sequence of wchar_t values in the expansion |
| | 540 | * array at a given offset |
| | 541 | */ |
| | 542 | const wchar_t *get_exp_ptr(unsigned int ofs) const |
| | 543 | { |
| | 544 | return &exp_array_[ofs]; |
| | 545 | } |
| | 546 | |
| | 547 | /* load the mapping table from a file */ |
| | 548 | void load_table(osfildef *fp); |
| | 549 | |
| | 550 | /* |
| | 551 | * Given a Unicode character, get the sub-table for the character, |
| | 552 | * or null if there is no sub-table for this character. |
| | 553 | */ |
| | 554 | unsigned int *get_sub_table(wchar_t unicode_char) const |
| | 555 | { |
| | 556 | /* |
| | 557 | * use the high-order 8 bits of the unicode character as the |
| | 558 | * index into the master table |
| | 559 | */ |
| | 560 | return map_[(unicode_char >> 8) & 0xff]; |
| | 561 | } |
| | 562 | |
| | 563 | /* |
| | 564 | * Given a Unicode character, get the expansion sub-table for the |
| | 565 | * character. or null if there is no sub-table for the character. |
| | 566 | */ |
| | 567 | unsigned int *get_exp_sub_table(wchar_t unicode_char) const |
| | 568 | { |
| | 569 | /* |
| | 570 | * use the high-order 8 bits of the unicode character as the index |
| | 571 | * into the master table |
| | 572 | */ |
| | 573 | return exp_map_[(unicode_char >> 8) & 0xff]; |
| | 574 | } |
| | 575 | |
| | 576 | /* |
| | 577 | * Set a mapping. This allocates a new sub-table if necessary, and |
| | 578 | * stores the local character mapping in the table. |
| | 579 | */ |
| | 580 | void set_mapping(wchar_t unicode_char, unsigned int xlat_offset); |
| | 581 | |
| | 582 | /* set an expansion mapping */ |
| | 583 | void set_exp_mapping(wchar_t unicode_char, unsigned int exp_offset); |
| | 584 | |
| | 585 | /* |
| | 586 | * The master mapping table list. Each entry points to the |
| | 587 | * sub-array that contains the mapping for the 256 characters whose |
| | 588 | * high-order 8 bits give the index into this table. Each entry of |
| | 589 | * the subarray is the offset within the xlat_array_ byte array of |
| | 590 | * the first byte of the translation for the unicode character. |
| | 591 | */ |
| | 592 | unsigned int *map_[256]; |
| | 593 | |
| | 594 | /* |
| | 595 | * The master expansion mapping list. This works just like map_, but |
| | 596 | * points to exp_array_ entries for unicode display expansions. |
| | 597 | */ |
| | 598 | unsigned int *exp_map_[256]; |
| | 599 | |
| | 600 | /* |
| | 601 | * The translation array. This is an array of bytes containing the |
| | 602 | * translations. map_[high_8_bits][low_8_bits] contains the offset |
| | 603 | * within this array of the translation of the character with the |
| | 604 | * given code ((high_8_bits << 8) + low_8_bits). The first byte at |
| | 605 | * this offset is the length in bytes of the translation, not |
| | 606 | * counting the length byte. The remaining bytes are the bytes of |
| | 607 | * the translation for the character. |
| | 608 | */ |
| | 609 | unsigned char *xlat_array_; |
| | 610 | |
| | 611 | /* size of the translation array */ |
| | 612 | size_t xlat_array_size_; |
| | 613 | |
| | 614 | /* |
| | 615 | * The expansion array. This is an array of unicode characters |
| | 616 | * containing the expansions for displaying unicode characters. This |
| | 617 | * works just like xlat_array_: each entry in expmap_ is an index into |
| | 618 | * this array, which gives the starting point in the array of the run |
| | 619 | * of entries for the expansion of that character. The first character |
| | 620 | * of a run is a length prefix giving the number of characters in the |
| | 621 | * expansion. |
| | 622 | */ |
| | 623 | wchar_t *exp_array_; |
| | 624 | }; |
| | 625 | |
| | 626 | |
| | 627 | /* ======================================================================== */ |
| | 628 | /* |
| | 629 | * Local character set - to - Unicode UTF-8 mappers |
| | 630 | */ |
| | 631 | |
| | 632 | /* ------------------------------------------------------------------------ */ |
| | 633 | /* |
| | 634 | * Trivial UTF8-to-UTF8 mapper - performs no conversions. This can be |
| | 635 | * used when reading from an external data source that is itself in |
| | 636 | * UTF-8 format; since this is identical to the format we use |
| | 637 | * internally, no mapping is required. |
| | 638 | */ |
| | 639 | class CCharmapToUniUTF8: public CCharmapToUni |
| | 640 | { |
| | 641 | public: |
| | 642 | /* read from a file */ |
| | 643 | virtual size_t read_file(osfildef *fp, char *buf, size_t bufl, |
| | 644 | unsigned long read_limit); |
| | 645 | |
| | 646 | /* determine if a byte sequence forms a complete character */ |
| | 647 | virtual int is_complete_char(const char *p, size_t len) const |
| | 648 | { |
| | 649 | /* |
| | 650 | * For UTF-8, we can infer the byte length of a character from the |
| | 651 | * first byte of the sequence. If the given length is at least the |
| | 652 | * inferred byte length, we have a complete character. |
| | 653 | */ |
| | 654 | return (len >= utf8_ptr::s_charsize(*p)); |
| | 655 | } |
| | 656 | |
| | 657 | /* map a string */ |
| | 658 | size_t map(char **output_ptr, size_t *output_buf_len, |
| | 659 | const char *input_ptr, size_t input_len) const |
| | 660 | { |
| | 661 | size_t partial_len; |
| | 662 | |
| | 663 | /* |
| | 664 | * do the full mapping, discarding the partial last character byte |
| | 665 | * length information |
| | 666 | */ |
| | 667 | return map2(output_ptr, output_buf_len, input_ptr, input_len, |
| | 668 | &partial_len); |
| | 669 | } |
| | 670 | |
| | 671 | /* map a string, providing partial character info */ |
| | 672 | virtual size_t map2(char **output_ptr, size_t *output_buf_len, |
| | 673 | const char *input_ptr, size_t input_len, |
| | 674 | size_t *partial_len) const; |
| | 675 | |
| | 676 | protected: |
| | 677 | /* we don't need a mapping table - ignore any that is set */ |
| | 678 | virtual void set_mapping(wchar_t, wchar_t) { } |
| | 679 | }; |
| | 680 | |
| | 681 | /* ------------------------------------------------------------------------ */ |
| | 682 | /* |
| | 683 | * Character mapper base class for UCS-2 to UTF-8. We will subclass |
| | 684 | * this mapper for big-endian and little-endian UCS-2 representations, |
| | 685 | * but both mappers are essentially the same in that only format |
| | 686 | * translation is required, since UCS-2 and UTF-8 use the same code |
| | 687 | * point mapping (i.e., Unicode). |
| | 688 | */ |
| | 689 | class CCharmapToUniUcs2: public CCharmapToUni |
| | 690 | { |
| | 691 | public: |
| | 692 | /* read from a file */ |
| | 693 | virtual size_t read_file(osfildef *fp, char *buf, size_t bufl, |
| | 694 | unsigned long read_limit); |
| | 695 | |
| | 696 | /* determine if a byte sequence forms a complete character */ |
| | 697 | virtual int is_complete_char(const char *, size_t len) const |
| | 698 | { |
| | 699 | /* every character in UCS-2 requires two bytes */ |
| | 700 | return (len >= 2); |
| | 701 | } |
| | 702 | |
| | 703 | /* map a string, providing partial character info */ |
| | 704 | virtual size_t map2(char **output_ptr, size_t *output_buf_len, |
| | 705 | const char *input_ptr, size_t input_len, |
| | 706 | size_t *partial_len) const |
| | 707 | { |
| | 708 | /* |
| | 709 | * if the input length is odd, there's one byte of partial |
| | 710 | * character information at the end of the buffer; otherwise |
| | 711 | * everything is valid |
| | 712 | */ |
| | 713 | *partial_len = (input_len & 1); |
| | 714 | |
| | 715 | /* perform the usual mapping */ |
| | 716 | return map(output_ptr, output_buf_len, input_ptr, input_len); |
| | 717 | } |
| | 718 | |
| | 719 | protected: |
| | 720 | /* |
| | 721 | * there's no mapping table for UCS-2 translations, so we don't need |
| | 722 | * to do anything with mappings |
| | 723 | */ |
| | 724 | virtual void set_mapping(wchar_t, wchar_t) { } |
| | 725 | |
| | 726 | /* temporary buffer for reading files */ |
| | 727 | char inbuf_[512]; |
| | 728 | }; |
| | 729 | |
| | 730 | /* ------------------------------------------------------------------------ */ |
| | 731 | /* |
| | 732 | * Character mapper for UCS-2 little-endian to UTF-8 |
| | 733 | */ |
| | 734 | class CCharmapToUniUcs2Little: public CCharmapToUniUcs2 |
| | 735 | { |
| | 736 | public: |
| | 737 | /* map a string */ |
| | 738 | size_t map(char **output_ptr, size_t *output_buf_len, |
| | 739 | const char *input_ptr, size_t input_len) const; |
| | 740 | }; |
| | 741 | |
| | 742 | /* ------------------------------------------------------------------------ */ |
| | 743 | /* |
| | 744 | * Character mapper for UCS-2 big-endian to UTF-8 |
| | 745 | */ |
| | 746 | class CCharmapToUniUcs2Big: public CCharmapToUniUcs2 |
| | 747 | { |
| | 748 | public: |
| | 749 | /* map a string */ |
| | 750 | size_t map(char **output_ptr, size_t *output_buf_len, |
| | 751 | const char *input_ptr, size_t input_len) const; |
| | 752 | }; |
| | 753 | |
| | 754 | /* ------------------------------------------------------------------------ */ |
| | 755 | /* |
| | 756 | * Basic character mapper for single-byte character sets to UTF-8 |
| | 757 | */ |
| | 758 | class CCharmapToUniSB_basic: public CCharmapToUni |
| | 759 | { |
| | 760 | public: |
| | 761 | /* read from a single-byte input file, translating to UTF-8 */ |
| | 762 | virtual size_t read_file(osfildef *fp, char *buf, size_t bufl, |
| | 763 | unsigned long read_limit); |
| | 764 | |
| | 765 | /* determine if a byte sequence forms a complete character */ |
| | 766 | virtual int is_complete_char(const char *, size_t) const |
| | 767 | { |
| | 768 | /* |
| | 769 | * every character in a single-byte set requires just one byte; |
| | 770 | * since 'len' is required to be at least one, there's no way we |
| | 771 | * can't have a complete character |
| | 772 | */ |
| | 773 | return TRUE; |
| | 774 | } |
| | 775 | |
| | 776 | /* map a string, providing partial character info */ |
| | 777 | virtual size_t map2(char **output_ptr, size_t *output_buf_len, |
| | 778 | const char *input_ptr, size_t input_len, |
| | 779 | size_t *partial_len) const |
| | 780 | { |
| | 781 | /* |
| | 782 | * for all single-byte character sets, one byte == one character, |
| | 783 | * so it's impossible to have partial characters |
| | 784 | */ |
| | 785 | *partial_len = 0; |
| | 786 | |
| | 787 | /* perform the normal mapping */ |
| | 788 | return map(output_ptr, output_buf_len, input_ptr, input_len); |
| | 789 | } |
| | 790 | |
| | 791 | protected: |
| | 792 | /* temporary buffer for reading files */ |
| | 793 | char inbuf_[512]; |
| | 794 | }; |
| | 795 | |
| | 796 | /* ------------------------------------------------------------------------ */ |
| | 797 | /* |
| | 798 | * Character mapper for plain ASCII to UTF-8 |
| | 799 | */ |
| | 800 | class CCharmapToUniASCII: public CCharmapToUniSB_basic |
| | 801 | { |
| | 802 | public: |
| | 803 | /* map a string */ |
| | 804 | size_t map(char **output_ptr, size_t *output_buf_len, |
| | 805 | const char *input_ptr, size_t input_len) const; |
| | 806 | |
| | 807 | protected: |
| | 808 | /* |
| | 809 | * there's no map for the ASCII translation, so we can ignore |
| | 810 | * mapping calls |
| | 811 | */ |
| | 812 | void set_mapping(wchar_t, wchar_t) { } |
| | 813 | }; |
| | 814 | |
| | 815 | /* ------------------------------------------------------------------------ */ |
| | 816 | /* |
| | 817 | * Character mapper for single-byte character sets to UTF-8. |
| | 818 | */ |
| | 819 | class CCharmapToUniSB: public CCharmapToUniSB_basic |
| | 820 | { |
| | 821 | public: |
| | 822 | CCharmapToUniSB() |
| | 823 | { |
| | 824 | int i; |
| | 825 | |
| | 826 | /* initialize the mapping table to all U+FFFD */ |
| | 827 | for (i = 0 ; i < 256 ; ++i) |
| | 828 | map_[i] = 0xFFFD; |
| | 829 | } |
| | 830 | |
| | 831 | /* map a string */ |
| | 832 | size_t map(char **output_ptr, size_t *output_buf_len, |
| | 833 | const char *input_ptr, size_t input_len) const; |
| | 834 | |
| | 835 | protected: |
| | 836 | /* set a mapping */ |
| | 837 | void set_mapping(wchar_t uni_code_pt, wchar_t local_code_pt) |
| | 838 | { |
| | 839 | /* |
| | 840 | * set the mapping, ignoring characters outside of our 8-bit |
| | 841 | * range |
| | 842 | */ |
| | 843 | if (((unsigned int)local_code_pt) < 256) |
| | 844 | map_[local_code_pt] = uni_code_pt; |
| | 845 | } |
| | 846 | |
| | 847 | private: |
| | 848 | /* |
| | 849 | * our mapping table - since the source character set is |
| | 850 | * single-byte, we need only store a wchar_t for each of the |
| | 851 | * possible 256 source characters |
| | 852 | */ |
| | 853 | wchar_t map_[256]; |
| | 854 | }; |
| | 855 | |
| | 856 | /* ------------------------------------------------------------------------ */ |
| | 857 | /* |
| | 858 | * Character mapper for mixed multi-byte character sets to UTF-8. This |
| | 859 | * maps from local character sets that use a mixture of one-byte and |
| | 860 | * two-byte sequences to represent characters. |
| | 861 | */ |
| | 862 | |
| | 863 | /* |
| | 864 | * Primary-byte mapping table entry. This gives us mapping instructions |
| | 865 | * for each leading byte of a character sequence. |
| | 866 | * |
| | 867 | * Each character is represented by a one-byte or two-byte sequence. This |
| | 868 | * mapper assumes a context-free mapping, hence for each character |
| | 869 | * represented by a single byte, that single byte unambiguously indicates |
| | 870 | * that character, and hence is never the first byte of a two-byte |
| | 871 | * sequence. For each character represented by a two-byte sequence, the |
| | 872 | * first byte of the sequence can only be part of two-byte sequences, hence |
| | 873 | * whenever we see that first byte we'll know for sure we have a two-byte |
| | 874 | * character. |
| | 875 | * |
| | 876 | * Each mapping here is for a first byte. If the byte is a single-byte |
| | 877 | * character, then the 'sub' pointer is null and the 'ch' entry gives the |
| | 878 | * Unicode code point for the character. If the byte is the lead byte of |
| | 879 | * one or more two-byte characters, then the 'sub' pointer is non-null and |
| | 880 | * 'ch' is ignored. |
| | 881 | */ |
struct cmap_mb_entry
{
    /*
     *   Sub-mapping table for two-byte sequences that begin with this
     *   byte.  Null means this lead byte is a complete single-byte
     *   character.  When non-null, the pointer always refers to a
     *   256-element array of Unicode code points indexed by the second
     *   byte of the sequence, so any unsigned 8-bit value can be used
     *   as an index without range checking.
     */
    wchar_t *sub;

    /*
     *   Unicode code point for this byte when it is a one-byte
     *   character.
     */
    wchar_t ch;
};
| | 903 | |
| | 904 | |
| | 905 | /* |
| | 906 | * The multi-byte-to-UTF8 mapper |
| | 907 | */ |
| | 908 | class CCharmapToUniMB: public CCharmapToUni |
| | 909 | { |
| | 910 | public: |
| | 911 | CCharmapToUniMB(); |
| | 912 | |
| | 913 | /* delete the table */ |
| | 914 | virtual ~CCharmapToUniMB(); |
| | 915 | |
| | 916 | /* determine if a byte sequence forms a complete character */ |
| | 917 | virtual int is_complete_char(const char *p, size_t len) const |
| | 918 | { |
| | 919 | /* |
| | 920 | * Check the first byte to see if this is a leading byte or a |
| | 921 | * stand-alone single byte. |
| | 922 | */ |
| | 923 | if (map_[(unsigned char)*p].sub == 0) |
| | 924 | { |
| | 925 | /* |
| | 926 | * it's a stand-alone byte, so the character length is one; |
| | 927 | * 'len' is required to be at least 1, so we definitely have a |
| | 928 | * complete character |
| | 929 | */ |
| | 930 | return TRUE; |
| | 931 | } |
| | 932 | else |
| | 933 | { |
| | 934 | /* it's a lead byte, so the character length is two */ |
| | 935 | return (len >= 2); |
| | 936 | } |
| | 937 | } |
| | 938 | |
| | 939 | /* read from a multi-byte input file, translating to UTF-8 */ |
| | 940 | virtual size_t read_file(osfildef *fp, char *buf, size_t bufl, |
| | 941 | unsigned long read_limit); |
| | 942 | |
| | 943 | /* map a string */ |
| | 944 | size_t map(char **output_ptr, size_t *output_buf_len, |
| | 945 | const char *input_ptr, size_t input_len) const |
| | 946 | { |
| | 947 | size_t partial_len; |
| | 948 | |
| | 949 | /* |
| | 950 | * do the full mapping, discarding the partial last character byte |
| | 951 | * length information |
| | 952 | */ |
| | 953 | return map2(output_ptr, output_buf_len, input_ptr, input_len, |
| | 954 | &partial_len); |
| | 955 | } |
| | 956 | |
| | 957 | /* map a string, providing partial character info */ |
| | 958 | virtual size_t map2(char **output_ptr, size_t *output_buf_len, |
| | 959 | const char *input_ptr, size_t input_len, |
| | 960 | size_t *partial_len) const; |
| | 961 | |
| | 962 | protected: |
| | 963 | /* set a mapping */ |
| | 964 | void set_mapping(wchar_t uni_code_pt, wchar_t local_code_pt); |
| | 965 | |
| | 966 | private: |
| | 967 | /* the primary-byte mapping table */ |
| | 968 | cmap_mb_entry map_[256]; |
| | 969 | |
| | 970 | /* temporary buffer for reading files */ |
| | 971 | char inbuf_[512]; |
| | 972 | }; |
| | 973 | |
| | 974 | /* ------------------------------------------------------------------------ */ |
/*
 *   Character mapper for double-byte character sets to UTF-8.  This maps
 *   from local character sets that use a two-byte sequence to represent
 *   each local character.
 *
 *   For now, this is a trivial subclass of the multi-byte mapper; that
 *   mapper handles the more general case of varying-length local
 *   characters, so it can easily handle the case where every character is
 *   represented by two bytes.  If there is sufficient demand for it, a
 *   special-case subclass to handle double-byte character sets specifically
 *   could provide efficiency gains, since it wouldn't have to check each
 *   lead byte to determine the character sequence length.
 */
| | 988 | class CCharmapToUniDB: public CCharmapToUniMB |
| | 989 | { |
| | 990 | public: |
| | 991 | }; |
| | 992 | |
| | 993 | /* ------------------------------------------------------------------------ */ |
| | 994 | /* |
| | 995 | * Character mapper for plain ISO-8859-1 to UTF-8 |
| | 996 | */ |
| | 997 | class CCharmapToUni8859_1: public CCharmapToUniSB |
| | 998 | { |
| | 999 | public: |
| | 1000 | /* creation */ |
| | 1001 | CCharmapToUni8859_1() |
| | 1002 | { |
| | 1003 | wchar_t i; |
| | 1004 | |
| | 1005 | /* |
| | 1006 | * Initialize our mapping table. Each 8859-1 code point maps to |
| | 1007 | * the same code point in Unicode, so this is a trivial |
| | 1008 | * translation. |
| | 1009 | */ |
| | 1010 | for (i = 0 ; i < 256 ; ++i) |
| | 1011 | set_mapping(i, i); |
| | 1012 | } |
| | 1013 | }; |
| | 1014 | |
| | 1015 | /* ======================================================================== */ |
| | 1016 | /* |
| | 1017 | * Unicode UTF-8 - to - local character set mappers |
| | 1018 | */ |
| | 1019 | |
| | 1020 | /* ------------------------------------------------------------------------ */ |
| | 1021 | /* |
| | 1022 | * Trivial character mapper for UTF8-to-UTF8 conversions. This can be |
| | 1023 | * used when writing external data in UTF8 format; since this is the |
| | 1024 | * same format we use internally, no conversion is required. |
| | 1025 | */ |
| | 1026 | class CCharmapToLocalUTF8: public CCharmapToLocal |
| | 1027 | { |
| | 1028 | public: |
| | 1029 | /* map a string */ |
| | 1030 | virtual size_t map_utf8(char *dest, size_t dest_len, |
| | 1031 | utf8_ptr src, size_t src_byte_len, |
| | 1032 | size_t *src_bytes_used) const; |
| | 1033 | |
| | 1034 | /* map a null-terminated string */ |
| | 1035 | virtual size_t map_utf8z(char *dest, size_t dest_len, |
| | 1036 | utf8_ptr src) const; |
| | 1037 | |
| | 1038 | /* map a character */ |
| | 1039 | size_t map(wchar_t unicode_char, char **output_ptr, |
| | 1040 | size_t *output_len) const; |
| | 1041 | |
| | 1042 | /* |
| | 1043 | * determine if the given Unicode character has a mapping to the local |
| | 1044 | * character set |
| | 1045 | */ |
| | 1046 | virtual int is_mappable(wchar_t unicode_char) const |
| | 1047 | { |
| | 1048 | /* every character can be mapped UTF8-to-UTF8, obviously */ |
| | 1049 | return TRUE; |
| | 1050 | } |
| | 1051 | }; |
| | 1052 | |
| | 1053 | /* ------------------------------------------------------------------------ */ |
| | 1054 | /* |
| | 1055 | * Character mapper for single-byte character sets. Each character in |
| | 1056 | * the local (output) character set is represented by a single byte. |
| | 1057 | */ |
| | 1058 | class CCharmapToLocalSB: public CCharmapToLocal |
| | 1059 | { |
| | 1060 | public: |
| | 1061 | /* map a string */ |
| | 1062 | virtual size_t map_utf8(char *dest, size_t dest_len, |
| | 1063 | utf8_ptr src, size_t src_byte_len, |
| | 1064 | size_t *src_bytes_used) const; |
| | 1065 | |
| | 1066 | /* map a null-terminated string */ |
| | 1067 | virtual size_t map_utf8z(char *dest, size_t dest_len, |
| | 1068 | utf8_ptr src) const; |
| | 1069 | |
| | 1070 | /* map a character */ |
| | 1071 | size_t map(wchar_t unicode_char, char **output_ptr, |
| | 1072 | size_t *output_len) const; |
| | 1073 | }; |
| | 1074 | |
| | 1075 | |
| | 1076 | /* ------------------------------------------------------------------------ */ |
| | 1077 | /* |
| | 1078 | * Mixed multi-byte mapper. Each local character is represented by a |
| | 1079 | * sequence of one or more bytes. |
| | 1080 | * |
| | 1081 | * This class is a trivial subclass of CCharmapToLocalSB. The single-byte |
| | 1082 | * base class already does everything we need to do, because it is designed |
| | 1083 | * to cope with mappings that involve expansions that represent a single |
| | 1084 | * Unicode character with a sequence of local characters (for example, |
| | 1085 | * "(c)" for the copyright symbol). |
| | 1086 | */ |
| | 1087 | class CCharmapToLocalMB: public CCharmapToLocalSB |
| | 1088 | { |
| | 1089 | public: |
| | 1090 | }; |
| | 1091 | |
| | 1092 | /* |
| | 1093 | * Double-byte mapper. Each local character is represented by exactly two |
| | 1094 | * bytes. This class is a trivial subclass of CCharmapToLocalMB, because |
| | 1095 | * the multi-byte mapper already handles the more general case of local |
| | 1096 | * character representations that use varying byte lengths; there is no |
| | 1097 | * particular efficiency gain to be had by creating a separate special-case |
| | 1098 | * class for double-byte character sets. |
| | 1099 | */ |
| | 1100 | class CCharmapToLocalDB: public CCharmapToLocalMB |
| | 1101 | { |
| | 1102 | public: |
| | 1103 | }; |
| | 1104 | |
| | 1105 | |
| | 1106 | /* ------------------------------------------------------------------------ */ |
/*
 *   Character mapper for mapping to local default 7-bit ASCII.  This
 *   mapper has a built-in character set translation so that we can
 *   always create one without having to find an external mapping file.
 */
| | 1112 | class CCharmapToLocalASCII: public CCharmapToLocalSB |
| | 1113 | { |
| | 1114 | public: |
| | 1115 | CCharmapToLocalASCII(); |
| | 1116 | }; |
| | 1117 | |
| | 1118 | |
| | 1119 | /* |
| | 1120 | * Character mapper for mapping to local ISO-8859-1. This mapper has a |
| | 1121 | * built-in character set translation so that we can always create one |
| | 1122 | * even without an external mapping file. |
| | 1123 | */ |
| | 1124 | class CCharmapToLocal8859_1: public CCharmapToLocalSB |
| | 1125 | { |
| | 1126 | public: |
| | 1127 | CCharmapToLocal8859_1(); |
| | 1128 | }; |
| | 1129 | |
| | 1130 | /* ------------------------------------------------------------------------ */ |
/*
 *   Character mapper for 16-bit Wide Unicode local character set.  Stores
 *   characters in the correct local wchar_t representation.  Assumes that
 *   the pointer is wchar_t-aligned.
 *
 *   This is a trivial translation.  Because we're mapping from Unicode to
 *   Unicode, the only thing we're changing is the encoding format - the
 *   character code is simply copied without any translation, since
 *   Unicode is the same everywhere.
 */
| | 1141 | class CCharmapToLocalWideUnicode: public CCharmapToLocal |
| | 1142 | { |
| | 1143 | public: |
| | 1144 | /* map a string */ |
| | 1145 | virtual size_t map_utf8(char *dest, size_t dest_len, |
| | 1146 | utf8_ptr src, size_t src_byte_len, |
| | 1147 | size_t *src_bytes_used) const; |
| | 1148 | |
| | 1149 | /* map a null-terminated string */ |
| | 1150 | virtual size_t map_utf8z(char *dest, size_t dest_len, |
| | 1151 | utf8_ptr src) const; |
| | 1152 | |
| | 1153 | /* map a character */ |
| | 1154 | size_t map(wchar_t unicode_char, char **output_ptr, |
| | 1155 | size_t *output_len) const; |
| | 1156 | |
| | 1157 | /* |
| | 1158 | * determine if the given Unicode character has a mapping to the local |
| | 1159 | * character set |
| | 1160 | */ |
| | 1161 | virtual int is_mappable(wchar_t unicode_char) const |
| | 1162 | { |
| | 1163 | /* every character can be mapped UTF8-to-UCS2 */ |
| | 1164 | return TRUE; |
| | 1165 | } |
| | 1166 | }; |
| | 1167 | |
| | 1168 | /* ------------------------------------------------------------------------ */ |
| | 1169 | /* |
| | 1170 | * Character mapper for 16-bit Wide Unicode, big-endian. Stores the |
| | 1171 | * characters in big-endian UCS-2 representation. |
| | 1172 | */ |
| | 1173 | class CCharmapToLocalUcs2Big: public CCharmapToLocal |
| | 1174 | { |
| | 1175 | public: |
| | 1176 | /* map a string */ |
| | 1177 | virtual size_t map_utf8(char *dest, size_t dest_len, |
| | 1178 | utf8_ptr src, size_t src_byte_len, |
| | 1179 | size_t *src_bytes_used) const; |
| | 1180 | |
| | 1181 | /* map a null-terminated string */ |
| | 1182 | virtual size_t map_utf8z(char *dest, size_t dest_len, |
| | 1183 | utf8_ptr src) const; |
| | 1184 | |
| | 1185 | /* map a character */ |
| | 1186 | size_t map(wchar_t unicode_char, char **output_ptr, |
| | 1187 | size_t *output_len) const; |
| | 1188 | }; |
| | 1189 | |
| | 1190 | /* ------------------------------------------------------------------------ */ |
| | 1191 | /* |
| | 1192 | * Character mapper for 16-bit Wide Unicode, little-endian. Stores the |
| | 1193 | * characters in little-endian UCS-2 representation. |
| | 1194 | */ |
| | 1195 | class CCharmapToLocalUcs2Little: public CCharmapToLocal |
| | 1196 | { |
| | 1197 | public: |
| | 1198 | /* map a string */ |
| | 1199 | virtual size_t map_utf8(char *dest, size_t dest_len, |
| | 1200 | utf8_ptr src, size_t src_byte_len, |
| | 1201 | size_t *src_bytes_used) const; |
| | 1202 | |
| | 1203 | /* map a null-terminated string */ |
| | 1204 | virtual size_t map_utf8z(char *dest, size_t dest_len, |
| | 1205 | utf8_ptr src) const; |
| | 1206 | |
| | 1207 | /* map a character */ |
| | 1208 | size_t map(wchar_t unicode_char, char **output_ptr, |
| | 1209 | size_t *output_len) const; |
| | 1210 | }; |
| | 1211 | |
| | 1212 | |
| | 1213 | #endif /* CHARMAP_H */ |