| | 1 | #ifdef RCSID |
| | 2 | static char RCSid[] = |
| | 3 | "$Header: d:/cvsroot/tads/tads3/charmap.cpp,v 1.3 1999/07/11 00:46:58 MJRoberts Exp $"; |
| | 4 | #endif |
| | 5 | |
| | 6 | /* |
| | 7 | * Copyright (c) 1998, 2002 Michael J. Roberts. All Rights Reserved. |
| | 8 | * |
| | 9 | * Please see the accompanying license file, LICENSE.TXT, for information |
| | 10 | * on using and copying this software. |
| | 11 | */ |
| | 12 | /* |
| | 13 | Name |
| | 14 | charmap.cpp - character mapper |
| | 15 | Function |
| | 16 | |
| | 17 | Notes |
| | 18 | |
| | 19 | Modified |
| | 20 | 10/17/98 MJRoberts - Creation |
| | 21 | */ |
| | 22 | |
| | 23 | #include <stdlib.h> |
| | 24 | #include <string.h> |
| | 25 | |
| | 26 | #include "t3std.h" |
| | 27 | #include "os.h" |
| | 28 | #include "utf8.h" |
| | 29 | #include "resload.h" |
| | 30 | #include "charmap.h" |
| | 31 | |
| | 32 | |
| | 33 | /* ------------------------------------------------------------------------ */ |
| | 34 | /* |
| | 35 | * Basic Mapper Class |
| | 36 | */ |
| | 37 | |
| | 38 | /* |
| | 39 | * Open and characterize a mapping file |
| | 40 | */ |
| | 41 | osfildef *CCharmap::open_map_file(class CResLoader *res_loader, |
| | 42 | const char *table_name, |
| | 43 | charmap_type_t *map_type) |
| | 44 | { |
| | 45 | osfildef *fp; |
| | 46 | char respath[100]; |
| | 47 | ulong startpos; |
| | 48 | uchar buf[256]; |
| | 49 | uint entry_cnt; |
| | 50 | int found_single; |
| | 51 | int found_double; |
| | 52 | |
| | 53 | /* |
| | 54 | * Generate the full resource path - character mapping resource paths |
| | 55 | * always start with "charmap/" followed by the table name, plus the |
| | 56 | * ".tcm" extension. |
| | 57 | */ |
| | 58 | strcpy(respath, "charmap/"); |
| | 59 | strcat(respath, table_name); |
| | 60 | strcat(respath, ".tcm"); |
| | 61 | |
| | 62 | /* open the file for the character set */ |
| | 63 | fp = res_loader->open_res_file(respath, "charmap/cmaplib", "CLIB"); |
| | 64 | |
| | 65 | /* if we couldn't open the mapping file, return failure */ |
| | 66 | if (fp == 0) |
| | 67 | return 0; |
| | 68 | |
| | 69 | /* note the initial seek position */ |
| | 70 | startpos = osfpos(fp); |
| | 71 | |
| | 72 | /* read the header and the local-to-unicode header */ |
| | 73 | if (osfrb(fp, buf, 6)) |
| | 74 | goto fail; |
| | 75 | |
| | 76 | /* get the number of entries from the local-to-unicode header */ |
| | 77 | entry_cnt = osrp2(buf + 4); |
| | 78 | |
| | 79 | /* |
| | 80 | * Scan the entries to determine if we have single-byte, |
| | 81 | * double-byte, or both. |
| | 82 | */ |
| | 83 | found_single = found_double = FALSE; |
| | 84 | while (entry_cnt > 0) |
| | 85 | { |
| | 86 | size_t cur; |
| | 87 | const uchar *p; |
| | 88 | |
| | 89 | /* read up to a buffer-full or the remaining size */ |
| | 90 | cur = sizeof(buf)/4; |
| | 91 | if (cur > entry_cnt) |
| | 92 | cur = entry_cnt; |
| | 93 | |
| | 94 | /* read it */ |
| | 95 | if (osfrb(fp, buf, cur*4)) |
| | 96 | goto fail; |
| | 97 | |
| | 98 | /* deduct the amount we just read from the amount remaining */ |
| | 99 | entry_cnt -= cur; |
| | 100 | |
| | 101 | /* scan the entries */ |
| | 102 | for (p = buf ; cur > 0 ; --cur, p += 4) |
| | 103 | { |
| | 104 | /* |
| | 105 | * Note whether this is a single-byte or double-byte entry. |
| | 106 | * If the high-order byte is non-zero, it's a double-byte |
| | 107 | * entry; otherwise, it's a single-byte entry. |
| | 108 | * |
| | 109 | * Note that we read the UINT2 at (p+2), because that's the |
| | 110 | * local character-set code point in this tuple. |
| | 111 | */ |
| | 112 | if (((uint)osrp2(p + 2)) > 0xFF) |
| | 113 | found_double = TRUE; |
| | 114 | else |
| | 115 | found_single = TRUE; |
| | 116 | } |
| | 117 | |
| | 118 | /* |
| | 119 | * if we've found both single- and double-byte characters so |
| | 120 | * far, there's no need to look any further, since we know |
| | 121 | * everything about the file now |
| | 122 | */ |
| | 123 | if (found_single && found_double) |
| | 124 | break; |
| | 125 | } |
| | 126 | |
| | 127 | /* |
| | 128 | * create the appropriate mapper, depending on whether we found |
| | 129 | * single, double, or mixed characters |
| | 130 | */ |
| | 131 | if (found_single && found_double) |
| | 132 | { |
| | 133 | /* it's mixed */ |
| | 134 | *map_type = CHARMAP_TYPE_MB; |
| | 135 | } |
| | 136 | else if (found_double) |
| | 137 | { |
| | 138 | /* it's all double-byte */ |
| | 139 | *map_type = CHARMAP_TYPE_DB; |
| | 140 | } |
| | 141 | else if (found_single) |
| | 142 | { |
| | 143 | /* it's all single-byte */ |
| | 144 | *map_type = CHARMAP_TYPE_SB; |
| | 145 | } |
| | 146 | else |
| | 147 | { |
| | 148 | /* no mappings found at all - presume it's a single-byte mapper */ |
| | 149 | *map_type = CHARMAP_TYPE_SB; |
| | 150 | } |
| | 151 | |
| | 152 | /* seek back to the start of the table */ |
| | 153 | osfseek(fp, startpos, OSFSK_SET); |
| | 154 | |
| | 155 | /* return the file pointer */ |
| | 156 | return fp; |
| | 157 | |
| | 158 | fail: |
| | 159 | /* close the file and return failure */ |
| | 160 | osfcls(fp); |
| | 161 | return 0; |
| | 162 | } |
| | 163 | |
| | 164 | /* ------------------------------------------------------------------------ */ |
| | 165 | /* |
| | 166 | * Special built-in mapper to 7-bit ASCII. This is available as a last |
| | 167 | * resort when no external mapping file can be found. |
| | 168 | */ |
| | 169 | |
| | 170 | /* |
| | 171 | * create a plain ascii translator |
| | 172 | */ |
| | 173 | CCharmapToLocalASCII::CCharmapToLocalASCII() |
| | 174 | { |
| | 175 | unsigned char *dst; |
| | 176 | wchar_t *exp_dst; |
| | 177 | size_t siz; |
| | 178 | size_t exp_siz; |
| | 179 | struct ascii_map_t |
| | 180 | { |
| | 181 | wchar_t uni; |
| | 182 | char asc[5]; |
| | 183 | }; |
| | 184 | ascii_map_t *p; |
| | 185 | static ascii_map_t ascii_mapping[] = |
| | 186 | { |
| | 187 | /* regular ASCII characters */ |
| | 188 | { 1, { 1 } }, |
| | 189 | { 2, { 2 } }, |
| | 190 | { 3, { 3 } }, |
| | 191 | { 4, { 4 } }, |
| | 192 | { 5, { 5 } }, |
| | 193 | { 6, { 6 } }, |
| | 194 | { 7, { 7 } }, |
| | 195 | { 8, { 8 } }, |
| | 196 | { 9, { 9 } }, |
| | 197 | { 10, { 10 } }, |
| | 198 | { 11, { 11 } }, |
| | 199 | { 12, { 12 } }, |
| | 200 | { 13, { 13 } }, |
| | 201 | { 14, { 14 } }, |
| | 202 | { 15, { 15 } }, |
| | 203 | { 16, { 16 } }, |
| | 204 | { 17, { 17 } }, |
| | 205 | { 18, { 18 } }, |
| | 206 | { 19, { 19 } }, |
| | 207 | { 20, { 20 } }, |
| | 208 | { 21, { 21 } }, |
| | 209 | { 22, { 22 } }, |
| | 210 | { 23, { 23 } }, |
| | 211 | { 24, { 24 } }, |
| | 212 | { 25, { 25 } }, |
| | 213 | { 26, { 26 } }, |
| | 214 | { 27, { 27 } }, |
| | 215 | { 28, { 28 } }, |
| | 216 | { 29, { 29 } }, |
| | 217 | { 30, { 30 } }, |
| | 218 | { 31, { 31 } }, |
| | 219 | { 32, { 32 } }, |
| | 220 | { 33, { 33 } }, |
| | 221 | { 34, { 34 } }, |
| | 222 | { 35, { 35 } }, |
| | 223 | { 36, { 36 } }, |
| | 224 | { 37, { 37 } }, |
| | 225 | { 38, { 38 } }, |
| | 226 | { 39, { 39 } }, |
| | 227 | { 40, { 40 } }, |
| | 228 | { 41, { 41 } }, |
| | 229 | { 42, { 42 } }, |
| | 230 | { 43, { 43 } }, |
| | 231 | { 44, { 44 } }, |
| | 232 | { 45, { 45 } }, |
| | 233 | { 46, { 46 } }, |
| | 234 | { 47, { 47 } }, |
| | 235 | { 48, { 48 } }, |
| | 236 | { 49, { 49 } }, |
| | 237 | { 50, { 50 } }, |
| | 238 | { 51, { 51 } }, |
| | 239 | { 52, { 52 } }, |
| | 240 | { 53, { 53 } }, |
| | 241 | { 54, { 54 } }, |
| | 242 | { 55, { 55 } }, |
| | 243 | { 56, { 56 } }, |
| | 244 | { 57, { 57 } }, |
| | 245 | { 58, { 58 } }, |
| | 246 | { 59, { 59 } }, |
| | 247 | { 60, { 60 } }, |
| | 248 | { 61, { 61 } }, |
| | 249 | { 62, { 62 } }, |
| | 250 | { 63, { 63 } }, |
| | 251 | { 64, { 64 } }, |
| | 252 | { 65, { 65 } }, |
| | 253 | { 66, { 66 } }, |
| | 254 | { 67, { 67 } }, |
| | 255 | { 68, { 68 } }, |
| | 256 | { 69, { 69 } }, |
| | 257 | { 70, { 70 } }, |
| | 258 | { 71, { 71 } }, |
| | 259 | { 72, { 72 } }, |
| | 260 | { 73, { 73 } }, |
| | 261 | { 74, { 74 } }, |
| | 262 | { 75, { 75 } }, |
| | 263 | { 76, { 76 } }, |
| | 264 | { 77, { 77 } }, |
| | 265 | { 78, { 78 } }, |
| | 266 | { 79, { 79 } }, |
| | 267 | { 80, { 80 } }, |
| | 268 | { 81, { 81 } }, |
| | 269 | { 82, { 82 } }, |
| | 270 | { 83, { 83 } }, |
| | 271 | { 84, { 84 } }, |
| | 272 | { 85, { 85 } }, |
| | 273 | { 86, { 86 } }, |
| | 274 | { 87, { 87 } }, |
| | 275 | { 88, { 88 } }, |
| | 276 | { 89, { 89 } }, |
| | 277 | { 90, { 90 } }, |
| | 278 | { 91, { 91 } }, |
| | 279 | { 92, { 92 } }, |
| | 280 | { 93, { 93 } }, |
| | 281 | { 94, { 94 } }, |
| | 282 | { 95, { 95 } }, |
| | 283 | { 96, { 96 } }, |
| | 284 | { 97, { 97 } }, |
| | 285 | { 98, { 98 } }, |
| | 286 | { 99, { 99 } }, |
| | 287 | { 100, { 100 } }, |
| | 288 | { 101, { 101 } }, |
| | 289 | { 102, { 102 } }, |
| | 290 | { 103, { 103 } }, |
| | 291 | { 104, { 104 } }, |
| | 292 | { 105, { 105 } }, |
| | 293 | { 106, { 106 } }, |
| | 294 | { 107, { 107 } }, |
| | 295 | { 108, { 108 } }, |
| | 296 | { 109, { 109 } }, |
| | 297 | { 110, { 110 } }, |
| | 298 | { 111, { 111 } }, |
| | 299 | { 112, { 112 } }, |
| | 300 | { 113, { 113 } }, |
| | 301 | { 114, { 114 } }, |
| | 302 | { 115, { 115 } }, |
| | 303 | { 116, { 116 } }, |
| | 304 | { 117, { 117 } }, |
| | 305 | { 118, { 118 } }, |
| | 306 | { 119, { 119 } }, |
| | 307 | { 120, { 120 } }, |
| | 308 | { 121, { 121 } }, |
| | 309 | { 122, { 122 } }, |
| | 310 | { 123, { 123 } }, |
| | 311 | { 124, { 124 } }, |
| | 312 | { 125, { 125 } }, |
| | 313 | { 126, { 126 } }, |
| | 314 | { 127, { 127 } }, |
| | 315 | |
| | 316 | /* Latin-1 accented characters and symbols */ |
| | 317 | { 353, "s" }, |
| | 318 | { 352, "S" }, |
| | 319 | { 8218, "\'" }, |
| | 320 | { 8222, "\"" }, |
| | 321 | { 8249, "<" }, |
| | 322 | { 338, "OE" }, |
| | 323 | { 8216, "\'" }, |
| | 324 | { 8217, "\'" }, |
| | 325 | { 8220, "\"" }, |
| | 326 | { 8221, "\"" }, |
| | 327 | { 8211, "-" }, |
| | 328 | { 8212, "--" }, |
| | 329 | { 8482, "(tm)" }, |
| | 330 | { 8250, ">" }, |
| | 331 | { 339, "oe" }, |
| | 332 | { 376, "Y" }, |
| | 333 | { 162, "c" }, |
| | 334 | { 163, "L" }, |
| | 335 | { 165, "Y" }, |
| | 336 | { 166, "|" }, |
| | 337 | { 169, "(c)" }, |
| | 338 | { 170, "a" }, |
| | 339 | { 173, " " }, |
| | 340 | { 174, "(R)" }, |
| | 341 | { 175, "-" }, |
| | 342 | { 177, "+/-" }, |
| | 343 | { 178, "2" }, |
| | 344 | { 179, "3" }, |
| | 345 | { 180, "\'" }, |
| | 346 | { 181, "u" }, |
| | 347 | { 182, "P" }, |
| | 348 | { 183, "*" }, |
| | 349 | { 184, "," }, |
| | 350 | { 185, "1" }, |
| | 351 | { 186, "o" }, |
| | 352 | { 171, "<<" }, |
| | 353 | { 187, ">>" }, |
| | 354 | { 188, "1/4" }, |
| | 355 | { 189, "1/2" }, |
| | 356 | { 190, "3/4" }, |
| | 357 | { 192, "A" }, |
| | 358 | { 193, "A" }, |
| | 359 | { 194, "A" }, |
| | 360 | { 195, "A" }, |
| | 361 | { 196, "A" }, |
| | 362 | { 197, "A" }, |
| | 363 | { 198, "AE" }, |
| | 364 | { 199, "C" }, |
| | 365 | { 200, "E" }, |
| | 366 | { 201, "E" }, |
| | 367 | { 202, "E" }, |
| | 368 | { 203, "E" }, |
| | 369 | { 204, "I" }, |
| | 370 | { 205, "I" }, |
| | 371 | { 206, "I" }, |
| | 372 | { 207, "I" }, |
| | 373 | { 209, "N" }, |
| | 374 | { 210, "O" }, |
| | 375 | { 211, "O" }, |
| | 376 | { 212, "O" }, |
| | 377 | { 213, "O" }, |
| | 378 | { 214, "O" }, |
| | 379 | { 215, "x" }, |
| | 380 | { 216, "O" }, |
| | 381 | { 217, "U" }, |
| | 382 | { 218, "U" }, |
| | 383 | { 219, "U" }, |
| | 384 | { 220, "U" }, |
| | 385 | { 221, "Y" }, |
| | 386 | { 223, "ss" }, |
| | 387 | { 224, "a" }, |
| | 388 | { 225, "a" }, |
| | 389 | { 226, "a" }, |
| | 390 | { 227, "a" }, |
| | 391 | { 228, "a" }, |
| | 392 | { 229, "a" }, |
| | 393 | { 230, "ae" }, |
| | 394 | { 231, "c" }, |
| | 395 | { 232, "e" }, |
| | 396 | { 233, "e" }, |
| | 397 | { 234, "e" }, |
| | 398 | { 235, "e" }, |
| | 399 | { 236, "i" }, |
| | 400 | { 237, "i" }, |
| | 401 | { 238, "i" }, |
| | 402 | { 239, "i" }, |
| | 403 | { 241, "n" }, |
| | 404 | { 242, "o" }, |
| | 405 | { 243, "o" }, |
| | 406 | { 244, "o" }, |
| | 407 | { 245, "o" }, |
| | 408 | { 246, "o" }, |
| | 409 | { 247, "/" }, |
| | 410 | { 248, "o" }, |
| | 411 | { 249, "u" }, |
| | 412 | { 250, "u" }, |
| | 413 | { 251, "u" }, |
| | 414 | { 252, "u" }, |
| | 415 | { 253, "y" }, |
| | 416 | { 255, "y" }, |
| | 417 | { 710, "^" }, |
| | 418 | { 732, "~" }, |
| | 419 | |
| | 420 | /* math symbols */ |
| | 421 | { 402, "f" }, |
| | 422 | |
| | 423 | /* other symbols */ |
| | 424 | { 8226, "*" }, |
| | 425 | |
| | 426 | /* arrows */ |
| | 427 | { 8592, "<-" }, |
| | 428 | { 8594, "->" }, |
| | 429 | |
| | 430 | /* several capital Greek letters look a lot like Roman letters */ |
| | 431 | { 913, "A" }, |
| | 432 | { 914, "B" }, |
| | 433 | { 918, "Z" }, |
| | 434 | { 919, "H" }, |
| | 435 | { 921, "I" }, |
| | 436 | { 922, "K" }, |
| | 437 | { 924, "M" }, |
| | 438 | { 925, "N" }, |
| | 439 | { 927, "O" }, |
| | 440 | { 929, "P" }, |
| | 441 | { 932, "T" }, |
| | 442 | { 933, "Y" }, |
| | 443 | { 935, "X" }, |
| | 444 | |
| | 445 | /* Latin-2 accented characters */ |
| | 446 | { 260, "A" }, |
| | 447 | { 321, "L" }, |
| | 448 | { 317, "L" }, |
| | 449 | { 346, "S" }, |
| | 450 | { 350, "S" }, |
| | 451 | { 356, "T" }, |
| | 452 | { 377, "Z" }, |
| | 453 | { 381, "Z" }, |
| | 454 | { 379, "Z" }, |
| | 455 | { 261, "a" }, |
| | 456 | { 731, "o" }, |
| | 457 | { 322, "l" }, |
| | 458 | { 318, "l" }, |
| | 459 | { 347, "s" }, |
| | 460 | { 351, "s" }, |
| | 461 | { 357, "t" }, |
| | 462 | { 378, "z" }, |
| | 463 | { 733, "\"" }, |
| | 464 | { 382, "z" }, |
| | 465 | { 380, "z" }, |
| | 466 | { 340, "R" }, |
| | 467 | { 258, "A" }, |
| | 468 | { 313, "L" }, |
| | 469 | { 262, "C" }, |
| | 470 | { 268, "C" }, |
| | 471 | { 280, "E" }, |
| | 472 | { 282, "E" }, |
| | 473 | { 270, "D" }, |
| | 474 | { 272, "D" }, |
| | 475 | { 323, "N" }, |
| | 476 | { 327, "N" }, |
| | 477 | { 336, "O" }, |
| | 478 | { 344, "R" }, |
| | 479 | { 366, "U" }, |
| | 480 | { 368, "U" }, |
| | 481 | { 354, "T" }, |
| | 482 | { 341, "r" }, |
| | 483 | { 259, "a" }, |
| | 484 | { 314, "l" }, |
| | 485 | { 263, "c" }, |
| | 486 | { 269, "c" }, |
| | 487 | { 281, "e" }, |
| | 488 | { 283, "e" }, |
| | 489 | { 271, "d" }, |
| | 490 | { 273, "d" }, |
| | 491 | { 324, "n" }, |
| | 492 | { 328, "n" }, |
| | 493 | { 337, "o" }, |
| | 494 | { 345, "r" }, |
| | 495 | { 367, "u" }, |
| | 496 | { 369, "u" }, |
| | 497 | { 355, "t" }, |
| | 498 | { 0, { 0 } } |
| | 499 | }; |
| | 500 | |
| | 501 | /* determine how much space we'll need in the translation array */ |
| | 502 | for (p = ascii_mapping, siz = 0, exp_siz = 0 ; p->uni != 0 ; ++p) |
| | 503 | { |
| | 504 | /* we need space for this mapping string, plus a length prefix byte */ |
| | 505 | siz += strlen(p->asc) + 1; |
| | 506 | |
| | 507 | /* |
| | 508 | * if this is a multi-character expansion, count it in the |
| | 509 | * expansion array size |
| | 510 | */ |
| | 511 | if (strlen(p->asc) > 1) |
| | 512 | exp_siz += strlen(p->asc) + 1; |
| | 513 | } |
| | 514 | |
| | 515 | /* add in space for the default entry */ |
| | 516 | siz += 2; |
| | 517 | |
| | 518 | /* allocate the translation array */ |
| | 519 | xlat_array_ = (unsigned char *)t3malloc(siz); |
| | 520 | |
| | 521 | /* |
| | 522 | * allocate the expansion array; allocate one extra entry for the null |
| | 523 | * mapping at index zero |
| | 524 | */ |
| | 525 | exp_array_ = (wchar_t *)t3malloc((exp_siz + 1) * sizeof(wchar_t)); |
| | 526 | |
| | 527 | /* |
| | 528 | * start at element 1 of the expansion array (element zero is reserved |
| | 529 | * to indicate the null mapping) |
| | 530 | */ |
| | 531 | dst = xlat_array_; |
| | 532 | exp_dst = exp_array_ + 1; |
| | 533 | |
| | 534 | /* |
| | 535 | * Add the zeroeth entry, which serves as the default mapping for |
| | 536 | * characters that aren't otherwise mappable. |
| | 537 | */ |
| | 538 | set_mapping(0, 0); |
| | 539 | *dst++ = 1; |
| | 540 | *dst++ = '?'; |
| | 541 | |
| | 542 | /* set up the arrays */ |
| | 543 | for (p = ascii_mapping ; p->uni != 0 ; ++p) |
| | 544 | { |
| | 545 | size_t len; |
| | 546 | |
| | 547 | /* set the mapping's offset in the translation array */ |
| | 548 | set_mapping(p->uni, dst - xlat_array_); |
| | 549 | |
| | 550 | /* get the length of this mapping */ |
| | 551 | len = strlen(p->asc); |
| | 552 | |
| | 553 | /* set this mapping's length */ |
| | 554 | *dst++ = (unsigned char)len; |
| | 555 | |
| | 556 | /* copy the mapping */ |
| | 557 | memcpy(dst, p->asc, len); |
| | 558 | |
| | 559 | /* move past the mapping in the translation array */ |
| | 560 | dst += len; |
| | 561 | |
| | 562 | /* add the expansion mapping if necessary */ |
| | 563 | if (len > 1) |
| | 564 | { |
| | 565 | size_t i; |
| | 566 | |
| | 567 | /* add an expansion mapping */ |
| | 568 | set_exp_mapping(p->uni, exp_dst - exp_array_); |
| | 569 | |
| | 570 | /* set the length prefix */ |
| | 571 | *exp_dst++ = (wchar_t)len; |
| | 572 | |
| | 573 | /* add the mapping */ |
| | 574 | for (i = 0 ; i < len ; ++i) |
| | 575 | *exp_dst++ = (wchar_t)p->asc[i]; |
| | 576 | } |
| | 577 | } |
| | 578 | } |
| | 579 | |
| | 580 | /* ------------------------------------------------------------------------ */ |
| | 581 | /* |
| | 582 | * Special built-in mapper to ISO-8859-1. Because of the widespread use |
| | 583 | * of this character set, we make this mapping available even when no |
| | 584 | * external mapping file is available. |
| | 585 | */ |
| | 586 | |
| | 587 | /* |
| | 588 | * create an 8859-1 mapper |
| | 589 | */ |
| | 590 | CCharmapToLocal8859_1::CCharmapToLocal8859_1() |
| | 591 | { |
| | 592 | unsigned char *dst; |
| | 593 | size_t siz; |
| | 594 | wchar_t c; |
| | 595 | |
| | 596 | /* |
| | 597 | * Determine how much space we'll need in the translation array - we |
| | 598 | * need one byte for each character, plus one byte for the length of |
| | 599 | * each character. We also need two bytes for the default entry. |
| | 600 | */ |
| | 601 | siz = 256 + 256 + 2; |
| | 602 | |
| | 603 | /* allocate the mapping */ |
| | 604 | xlat_array_ = (unsigned char *)t3malloc(siz); |
| | 605 | |
| | 606 | /* start at the start of the array */ |
| | 607 | dst = xlat_array_; |
| | 608 | |
| | 609 | /* |
| | 610 | * Add the zeroeth entry, which serves as the default mapping for |
| | 611 | * characters that aren't otherwise mappable. |
| | 612 | */ |
| | 613 | set_mapping(0, 0); |
| | 614 | *dst++ = 1; |
| | 615 | *dst++ = '?'; |
| | 616 | |
| | 617 | /* |
| | 618 | * Set up the mappings - this is easy because each Unicode code point |
| | 619 | * from 0 to 255 maps to the same ISO 8859-1 code point. |
| | 620 | */ |
| | 621 | for (c = 0 ; c < 256 ; ++c) |
| | 622 | { |
| | 623 | /* set the mapping's offset in the translation array */ |
| | 624 | set_mapping(c, dst - xlat_array_); |
| | 625 | |
| | 626 | /* store the length (always 1) and translated character */ |
| | 627 | *dst++ = 1; |
| | 628 | *dst++ = (unsigned char)c; |
| | 629 | } |
| | 630 | } |
| | 631 | |
| | 632 | |
| | 633 | /* ------------------------------------------------------------------------ */ |
| | 634 | /* |
| | 635 | * Character mapping for Unicode to Local |
| | 636 | */ |
| | 637 | |
| | 638 | /* |
| | 639 | * create the translator |
| | 640 | */ |
| | 641 | CCharmapToLocal::CCharmapToLocal() |
| | 642 | { |
| | 643 | /* no mapping sub-tables yet */ |
| | 644 | memset(map_, 0, sizeof(map_)); |
| | 645 | memset(exp_map_, 0, sizeof(exp_map_)); |
| | 646 | |
| | 647 | /* no translation or expansion arrays yet */ |
| | 648 | xlat_array_ = 0; |
| | 649 | exp_array_ = 0; |
| | 650 | } |
| | 651 | |
| | 652 | /* |
| | 653 | * delete the translator |
| | 654 | */ |
| | 655 | CCharmapToLocal::~CCharmapToLocal() |
| | 656 | { |
| | 657 | size_t i; |
| | 658 | |
| | 659 | /* delete the translation array */ |
| | 660 | if (xlat_array_ != 0) |
| | 661 | t3free(xlat_array_); |
| | 662 | |
| | 663 | /* delete the expansion array */ |
| | 664 | if (exp_array_ != 0) |
| | 665 | t3free(exp_array_); |
| | 666 | |
| | 667 | /* delete any mapping tables we've allocated */ |
| | 668 | for (i = 0 ; i < sizeof(map_)/sizeof(map_[0]) ; ++i) |
| | 669 | { |
| | 670 | /* delete this mapping if allocated */ |
| | 671 | if (map_[i] != 0) |
| | 672 | t3free(map_[i]); |
| | 673 | } |
| | 674 | |
| | 675 | /* delete any expansion mapping tables */ |
| | 676 | for (i = 0 ; i < sizeof(exp_map_)/sizeof(exp_map_[0]) ; ++i) |
| | 677 | { |
| | 678 | /* delete this expansion mapping if allocated */ |
| | 679 | if (exp_map_[i] != 0) |
| | 680 | t3free(exp_map_[i]); |
| | 681 | } |
| | 682 | } |
| | 683 | |
| | 684 | /* |
| | 685 | * Set a mapping |
| | 686 | */ |
| | 687 | void CCharmapToLocal::set_mapping(wchar_t unicode_char, |
| | 688 | unsigned int xlat_offset) |
| | 689 | { |
| | 690 | int master_idx; |
| | 691 | |
| | 692 | /* get the master table index for this unicode character */ |
| | 693 | master_idx = (int)((unicode_char >> 8) & 0xff); |
| | 694 | |
| | 695 | /* if there's no sub-table here yet, create one */ |
| | 696 | if (map_[master_idx] == 0) |
| | 697 | { |
| | 698 | int i; |
| | 699 | |
| | 700 | /* allocate it */ |
| | 701 | map_[master_idx] = |
| | 702 | (unsigned int *)t3malloc(256 * sizeof(unsigned int)); |
| | 703 | |
| | 704 | /* |
| | 705 | * Set each entry to the default character, so that it will |
| | 706 | * produce valid results if no mapping is ever specified for the |
| | 707 | * character. The default character is always at offset zero in |
| | 708 | * the translation array. |
| | 709 | */ |
| | 710 | for (i = 0 ; i < 256 ; ++i) |
| | 711 | map_[master_idx][i] = 0; |
| | 712 | } |
| | 713 | |
| | 714 | /* set the mapping for the character's entry in the sub-table */ |
| | 715 | map_[master_idx][unicode_char & 0xff] = xlat_offset; |
| | 716 | } |
| | 717 | |
| | 718 | /* |
| | 719 | * Set an expansion mapping |
| | 720 | */ |
| | 721 | void CCharmapToLocal::set_exp_mapping(wchar_t unicode_char, |
| | 722 | unsigned int exp_offset) |
| | 723 | { |
| | 724 | int master_idx; |
| | 725 | |
| | 726 | /* get the master table index for this unicode character */ |
| | 727 | master_idx = (int)((unicode_char >> 8) & 0xff); |
| | 728 | |
| | 729 | /* if there's no sub-table here yet, create one */ |
| | 730 | if (exp_map_[master_idx] == 0) |
| | 731 | { |
| | 732 | int i; |
| | 733 | |
| | 734 | /* allocate it */ |
| | 735 | exp_map_[master_idx] = |
| | 736 | (unsigned int *)t3malloc(256 * sizeof(unsigned int)); |
| | 737 | |
| | 738 | /* |
| | 739 | * Set each entry to the default character, so that it will produce |
| | 740 | * valid results if no mapping is ever specified for the character. |
| | 741 | * The default character is always at offset zero in the expansion |
| | 742 | * array. |
| | 743 | */ |
| | 744 | for (i = 0 ; i < 256 ; ++i) |
| | 745 | exp_map_[master_idx][i] = 0; |
| | 746 | } |
| | 747 | |
| | 748 | /* set the mapping for the character's entry in the sub-table */ |
| | 749 | exp_map_[master_idx][unicode_char & 0xff] = exp_offset; |
| | 750 | } |
| | 751 | |
| | 752 | /* |
| | 753 | * Map a UTF-8 string of known byte length to the local character set |
| | 754 | */ |
| | 755 | size_t CCharmapToLocal::map_utf8(char *dest, size_t dest_len, |
| | 756 | utf8_ptr src, size_t src_byte_len, |
| | 757 | size_t *src_bytes_used) const |
| | 758 | { |
| | 759 | utf8_ptr src_start; |
| | 760 | size_t cur_total; |
| | 761 | char *srcend; |
| | 762 | |
| | 763 | /* remember where we started */ |
| | 764 | src_start = src; |
| | 765 | |
| | 766 | /* compute where the source buffer ends */ |
| | 767 | srcend = src.getptr() + src_byte_len; |
| | 768 | |
| | 769 | /* copy characters until we reach the end of the source string */ |
| | 770 | for (cur_total = 0 ; src.getptr() < srcend ; src.inc()) |
| | 771 | { |
| | 772 | char mapbuf[10]; |
| | 773 | size_t maplen = sizeof(mapbuf); |
| | 774 | char *mapp = mapbuf; |
| | 775 | |
| | 776 | /* map this character */ |
| | 777 | maplen = map(src.getch(), &mapp, &maplen); |
| | 778 | |
| | 779 | /* determine how to store the character */ |
| | 780 | if (dest == 0) |
| | 781 | { |
| | 782 | /* we're just counting */ |
| | 783 | } |
| | 784 | else if (dest_len >= maplen) |
| | 785 | { |
| | 786 | /* we have room for it - add it in */ |
| | 787 | memcpy(dest, mapbuf, maplen); |
| | 788 | |
| | 789 | /* advance past it */ |
| | 790 | dest += maplen; |
| | 791 | dest_len -= maplen; |
| | 792 | } |
| | 793 | else |
| | 794 | { |
| | 795 | /* there's no more room - stop now */ |
| | 796 | break; |
| | 797 | } |
| | 798 | |
| | 799 | /* add this into the total */ |
| | 800 | cur_total += maplen; |
| | 801 | } |
| | 802 | |
| | 803 | /* if the caller wants to know how much space we used, tell them */ |
| | 804 | if (src_bytes_used != 0) |
| | 805 | *src_bytes_used = src.getptr() - src_start.getptr(); |
| | 806 | |
| | 807 | /* return the total length of the result */ |
| | 808 | return cur_total; |
| | 809 | } |
| | 810 | |
| | 811 | /* |
| | 812 | * Map a null-terminated UTF-8 string to the local character set |
| | 813 | */ |
| | 814 | size_t CCharmapToLocal::map_utf8z(char *dest, size_t dest_len, |
| | 815 | utf8_ptr src) const |
| | 816 | { |
| | 817 | size_t cur_total; |
| | 818 | |
| | 819 | /* copy characters until we find the terminating null */ |
| | 820 | for (cur_total = 0 ; src.getch() != 0 ; src.inc()) |
| | 821 | { |
| | 822 | /* |
| | 823 | * map this character into the output, if it will fit, but in |
| | 824 | * any case count the space it needs in the output |
| | 825 | */ |
| | 826 | cur_total += map(src.getch(), &dest, &dest_len); |
| | 827 | } |
| | 828 | |
| | 829 | /* |
| | 830 | * add a null terminator if there's room, but don't count it in the |
| | 831 | * result length |
| | 832 | */ |
| | 833 | map(0, &dest, &dest_len); |
| | 834 | |
| | 835 | /* return the total length of the result */ |
| | 836 | return cur_total; |
| | 837 | } |
| | 838 | |
| | 839 | /* |
| | 840 | * Map a null-terminated UTF-8 string to the local character set, escaping |
| | 841 | * characters that aren't part of the local character set. |
| | 842 | */ |
| | 843 | size_t CCharmapToLocal::map_utf8z_esc( |
| | 844 | char *dest, size_t dest_len, utf8_ptr src, |
| | 845 | size_t (*esc_fn)(wchar_t, char **, size_t *)) const |
| | 846 | { |
| | 847 | size_t cur_total; |
| | 848 | |
| | 849 | /* copy characters until we find the terminating null */ |
| | 850 | for (cur_total = 0 ; src.getch() != 0 ; src.inc()) |
| | 851 | { |
| | 852 | wchar_t ch = src.getch(); |
| | 853 | |
| | 854 | /* if this character is mappable, map it; otherwise, escape it */ |
| | 855 | if (is_mappable(src.getch())) |
| | 856 | { |
| | 857 | /* map the character */ |
| | 858 | cur_total += map(ch, &dest, &dest_len); |
| | 859 | } |
| | 860 | else |
| | 861 | { |
| | 862 | /* we can't map it, so let the escape callback handle it */ |
| | 863 | cur_total += (*esc_fn)(ch, &dest, &dest_len); |
| | 864 | } |
| | 865 | } |
| | 866 | |
| | 867 | /* |
| | 868 | * add a null terminator if there's room, but don't count it in the |
| | 869 | * result length |
| | 870 | */ |
| | 871 | map(0, &dest, &dest_len); |
| | 872 | |
| | 873 | /* return the total length of the result */ |
| | 874 | return cur_total; |
| | 875 | } |
| | 876 | |
| | 877 | /* |
| | 878 | * Escape callback for map_utf8z_esc() - prepares source-code-style |
| | 879 | * 'backslash' escape sequences for unmappable characters. |
| | 880 | */ |
| | 881 | size_t CCharmapToLocal::source_esc_cb(wchar_t ch, char **dest, size_t *len) |
| | 882 | { |
| | 883 | char buf[7]; |
| | 884 | size_t copylen; |
| | 885 | |
| | 886 | /* prepare our own representation */ |
| | 887 | sprintf(buf, "\\u%04x", (unsigned int)ch); |
| | 888 | |
| | 889 | /* copy the whole thing if possible, but limit to the available space */ |
| | 890 | copylen = 6; |
| | 891 | if (copylen > *len) |
| | 892 | copylen = *len; |
| | 893 | |
| | 894 | /* copy the bytes */ |
| | 895 | memcpy(*dest, buf, copylen); |
| | 896 | |
| | 897 | /* advance the buffer pointers */ |
| | 898 | *dest += copylen; |
| | 899 | *len -= copylen; |
| | 900 | |
| | 901 | /* return the full space needed */ |
| | 902 | return 6; |
| | 903 | } |
| | 904 | |
| | 905 | /* |
| | 906 | * Map to UTF8 |
| | 907 | */ |
| | 908 | size_t CCharmapToLocal::map_utf8(char *dest, size_t dest_len, |
| | 909 | const char *src, size_t src_byte_len, |
| | 910 | size_t *src_bytes_used) const |
| | 911 | { |
| | 912 | utf8_ptr src_ptr; |
| | 913 | |
| | 914 | /* set up the source UTF-8 pointer */ |
| | 915 | src_ptr.set((char *)src); |
| | 916 | |
| | 917 | /* map it and return the result */ |
| | 918 | return map_utf8(dest, dest_len, src_ptr, src_byte_len, src_bytes_used); |
| | 919 | } |
| | 920 | |
| | 921 | /* |
| | 922 | * Create a mapper and load a mapping file |
| | 923 | */ |
| | 924 | CCharmapToLocal *CCharmapToLocal::load(CResLoader *res_loader, |
| | 925 | const char *table_name) |
| | 926 | { |
| | 927 | osfildef *fp; |
| | 928 | CCharmapToLocal *mapper; |
| | 929 | charmap_type_t map_type; |
| | 930 | |
| | 931 | /* if they want a trivial UTF-8 translator, return one */ |
| | 932 | if (stricmp(table_name, "utf-8") == 0 |
| | 933 | || stricmp(table_name, "utf8") == 0) |
| | 934 | return new CCharmapToLocalUTF8(); |
| | 935 | |
| | 936 | /* if they want a Unicode 16-bit encoding, return one */ |
| | 937 | if (stricmp(table_name, "utf-16le") == 0 |
| | 938 | || stricmp(table_name, "unicodel") == 0) |
| | 939 | return new CCharmapToLocalUcs2Little(); |
| | 940 | if (stricmp(table_name, "utf-16be") == 0 |
| | 941 | || stricmp(table_name, "unicodeb") == 0) |
| | 942 | return new CCharmapToLocalUcs2Big(); |
| | 943 | |
| | 944 | /* presume failure */ |
| | 945 | mapper = 0; |
| | 946 | |
| | 947 | /* open and characterize the mapping file */ |
| | 948 | fp = open_map_file(res_loader, table_name, &map_type); |
| | 949 | |
| | 950 | /* check to make sure we opened the file */ |
| | 951 | if (fp == 0) |
| | 952 | { |
| | 953 | /* if they want a plain ASCII translator, return a default one */ |
| | 954 | if (name_is_ascii_synonym(table_name)) |
| | 955 | return new CCharmapToLocalASCII(); |
| | 956 | |
| | 957 | /* if they want a plain ISO-8859-1 translator, return a default one */ |
| | 958 | if (name_is_8859_1_synonym(table_name)) |
| | 959 | return new CCharmapToLocal8859_1(); |
| | 960 | |
| | 961 | /* return failure */ |
| | 962 | return 0; |
| | 963 | } |
| | 964 | |
| | 965 | /* create an appropriate mapper */ |
| | 966 | switch(map_type) |
| | 967 | { |
| | 968 | case CHARMAP_TYPE_SB: |
| | 969 | /* create a single-byte mapper */ |
| | 970 | mapper = new CCharmapToLocalSB(); |
| | 971 | break; |
| | 972 | |
| | 973 | case CHARMAP_TYPE_DB: |
| | 974 | /* create a double-byte mapper */ |
| | 975 | mapper = new CCharmapToLocalDB(); |
| | 976 | break; |
| | 977 | |
| | 978 | case CHARMAP_TYPE_MB: |
| | 979 | /* create a mixed multi-byte mapper */ |
| | 980 | mapper = new CCharmapToLocalMB(); |
| | 981 | break; |
| | 982 | |
| | 983 | default: |
| | 984 | /* other mapper types are currently unknown */ |
| | 985 | break; |
| | 986 | } |
| | 987 | |
| | 988 | /* if we successfully created a mapper, tell it to load the table */ |
| | 989 | if (mapper != 0) |
| | 990 | { |
| | 991 | /* load the table */ |
| | 992 | mapper->load_table(fp); |
| | 993 | } |
| | 994 | |
| | 995 | /* close the file */ |
| | 996 | osfcls(fp); |
| | 997 | |
| | 998 | /* return the mapper, if any */ |
| | 999 | return mapper; |
| | 1000 | } |
| | 1001 | |
| | 1002 | /* |
| | 1003 | * Load the character set translation table |
| | 1004 | */ |
| | 1005 | void CCharmapToLocal::load_table(osfildef *fp) |
| | 1006 | { |
| | 1007 | ulong startpos; |
| | 1008 | ulong ofs; |
| | 1009 | uchar buf[256]; |
| | 1010 | uint cnt; |
| | 1011 | ulong xbytes; |
| | 1012 | ulong xchars; |
| | 1013 | uint next_ofs; |
| | 1014 | |
| | 1015 | /* note the initial seek position */ |
| | 1016 | startpos = osfpos(fp); |
| | 1017 | |
| | 1018 | /* read the first entry, which gives the offset of the to-local table */ |
| | 1019 | if (osfrb(fp, buf, 4)) |
| | 1020 | return; |
| | 1021 | ofs = t3rp4u(buf); |
| | 1022 | |
| | 1023 | /* seek to the to-local table */ |
| | 1024 | osfseek(fp, startpos + ofs, OSFSK_SET); |
| | 1025 | |
| | 1026 | /* read the number of entries and number of bytes needed */ |
| | 1027 | if (osfrb(fp, buf, 6)) |
| | 1028 | return; |
| | 1029 | cnt = osrp2(buf); |
| | 1030 | xbytes = t3rp4u(buf + 2); |
| | 1031 | |
| | 1032 | /* |
| | 1033 | * Allocate space for the translation table. Note that we cannot |
| | 1034 | * handle translation tables bigger than the maximum allowed in a |
| | 1035 | * single allocation unit on the operating system. |
| | 1036 | */ |
| | 1037 | if (xbytes > OSMALMAX) |
| | 1038 | return; |
| | 1039 | xlat_array_ = (unsigned char *)t3malloc(xbytes); |
| | 1040 | if (xlat_array_ == 0) |
| | 1041 | return; |
| | 1042 | |
| | 1043 | /* |
| | 1044 | * Read each mapping |
| | 1045 | */ |
| | 1046 | for (next_ofs = 0 ; cnt > 0 ; --cnt) |
| | 1047 | { |
| | 1048 | wchar_t codept; |
| | 1049 | uint xlen; |
| | 1050 | |
| | 1051 | /* read the code point and translation length */ |
| | 1052 | if (osfrb(fp, buf, 3)) |
| | 1053 | return; |
| | 1054 | |
| | 1055 | /* decode the code point and translation length */ |
| | 1056 | codept = osrp2(buf); |
| | 1057 | xlen = (unsigned int)buf[2]; |
| | 1058 | |
| | 1059 | /* assign the mapping */ |
| | 1060 | set_mapping(codept, next_ofs); |
| | 1061 | |
| | 1062 | /* store the translation length */ |
| | 1063 | xlat_array_[next_ofs++] = buf[2]; |
| | 1064 | |
| | 1065 | /* read the translation bytes */ |
| | 1066 | if (osfrb(fp, xlat_array_ + next_ofs, xlen)) |
| | 1067 | return; |
| | 1068 | |
| | 1069 | /* skip past the translation bytes we've read */ |
| | 1070 | next_ofs += xlen; |
| | 1071 | } |
| | 1072 | |
| | 1073 | /* |
| | 1074 | * Next, read the expansions, if present. |
| | 1075 | * |
| | 1076 | * If we find the $EOF marker, it means it's an old-format file without |
| | 1077 | * the separate expansion definitions. Otherwise, we'll have the |
| | 1078 | * expansion entry count and the aggregate number of unicode characters |
| | 1079 | * in all of the expansions. |
| | 1080 | */ |
| | 1081 | if (osfrb(fp, buf, 6) || memcmp(buf, "$EOF", 4) == 0) |
| | 1082 | return; |
| | 1083 | |
| | 1084 | /* decode the expansion entry count and aggregate length */ |
| | 1085 | cnt = osrp2(buf); |
| | 1086 | xchars = t3rp4u(buf + 2); |
| | 1087 | |
| | 1088 | /* |
| | 1089 | * add one entry so that we can leave index zero unused, to indicate |
| | 1090 | * unmapped characters |
| | 1091 | */ |
| | 1092 | ++xchars; |
| | 1093 | |
| | 1094 | /* add one array slot per entry, for the length prefix slots */ |
| | 1095 | xchars += cnt; |
| | 1096 | |
| | 1097 | /* allocate space for the expansions */ |
| | 1098 | exp_array_ = (wchar_t *)t3malloc(xchars * sizeof(wchar_t)); |
| | 1099 | if (exp_array_ == 0) |
| | 1100 | return; |
| | 1101 | |
| | 1102 | /* |
| | 1103 | * read the mappings; start loading them at index 1, since we want to |
| | 1104 | * leave index 0 unused so that it can indicate unused mappings |
| | 1105 | */ |
| | 1106 | for (next_ofs = 1 ; cnt > 0 ; --cnt) |
| | 1107 | { |
| | 1108 | wchar_t codept; |
| | 1109 | uint xlen; |
| | 1110 | size_t i; |
| | 1111 | |
| | 1112 | /* read this entry's unicode value and expansion character length */ |
| | 1113 | if (osfrb(fp, buf, 3)) |
| | 1114 | return; |
| | 1115 | |
| | 1116 | /* decode the code point and expansion length */ |
| | 1117 | codept = osrp2(buf); |
| | 1118 | xlen = (uint)buf[2]; |
| | 1119 | |
| | 1120 | /* assign the expansion mapping */ |
| | 1121 | set_exp_mapping(codept, next_ofs); |
| | 1122 | |
| | 1123 | /* set the length prefix */ |
| | 1124 | exp_array_[next_ofs++] = (wchar_t)xlen; |
| | 1125 | |
| | 1126 | /* read and store the expansion characters */ |
| | 1127 | for (i = 0 ; i < xlen ; ++i) |
| | 1128 | { |
| | 1129 | /* read this translation */ |
| | 1130 | if (osfrb(fp, buf, 2)) |
| | 1131 | return; |
| | 1132 | |
| | 1133 | /* decode and store this translation */ |
| | 1134 | exp_array_[next_ofs++] = osrp2(buf); |
| | 1135 | } |
| | 1136 | } |
| | 1137 | } |
| | 1138 | |
| | 1139 | /* |
| | 1140 | * Write to a file |
| | 1141 | */ |
| | 1142 | int CCharmapToLocal::write_file(osfildef *fp, const char *buf, size_t bufl) |
| | 1143 | { |
| | 1144 | utf8_ptr p; |
| | 1145 | |
| | 1146 | /* set up to read from the buffer */ |
| | 1147 | p.set((char *)buf); |
| | 1148 | |
| | 1149 | /* map and write one buffer-full at a time */ |
| | 1150 | while (bufl > 0) |
| | 1151 | { |
| | 1152 | char conv_buf[256]; |
| | 1153 | size_t conv_len; |
| | 1154 | size_t used_src_len; |
| | 1155 | |
| | 1156 | /* map as much as we can fit into our buffer */ |
| | 1157 | conv_len = map_utf8(conv_buf, sizeof(conv_buf), p, bufl, |
| | 1158 | &used_src_len); |
| | 1159 | |
| | 1160 | /* write out this chunk */ |
| | 1161 | if (osfwb(fp, conv_buf, conv_len)) |
| | 1162 | return 1; |
| | 1163 | |
| | 1164 | /* advance past this chunk in the input */ |
| | 1165 | p.set(p.getptr() + used_src_len); |
| | 1166 | bufl -= used_src_len; |
| | 1167 | } |
| | 1168 | |
| | 1169 | /* no errors */ |
| | 1170 | return 0; |
| | 1171 | } |
| | 1172 | |
| | 1173 | |
| | 1174 | /* ------------------------------------------------------------------------ */ |
| | 1175 | /* |
| | 1176 | * Character mapper - trivial UTF8-to-UTF8 conversion |
| | 1177 | */ |
| | 1178 | |
| | 1179 | /* |
| | 1180 | * map a character |
| | 1181 | */ |
| | 1182 | size_t CCharmapToLocalUTF8::map(wchar_t unicode_char, char **output_ptr, |
| | 1183 | size_t *output_len) const |
| | 1184 | { |
| | 1185 | size_t map_len; |
| | 1186 | |
| | 1187 | /* get the character size */ |
| | 1188 | map_len = utf8_ptr::s_wchar_size(unicode_char); |
| | 1189 | |
| | 1190 | /* if we don't have room for one more character, abort */ |
| | 1191 | if (*output_len < map_len) |
| | 1192 | { |
| | 1193 | *output_len = 0; |
| | 1194 | return map_len; |
| | 1195 | } |
| | 1196 | |
| | 1197 | /* store the mapping */ |
| | 1198 | utf8_ptr::s_putch(*output_ptr, unicode_char); |
| | 1199 | |
| | 1200 | /* increment the pointer by the number of characters we copied */ |
| | 1201 | *output_ptr += map_len; |
| | 1202 | |
| | 1203 | /* adjust the remaining output length */ |
| | 1204 | *output_len -= map_len; |
| | 1205 | |
| | 1206 | /* return the size of the result */ |
| | 1207 | return map_len; |
| | 1208 | } |
| | 1209 | |
| | 1210 | /* |
| | 1211 | * Map a UTF-8 string of known byte length |
| | 1212 | */ |
| | 1213 | size_t CCharmapToLocalUTF8::map_utf8(char *dest, size_t dest_len, |
| | 1214 | utf8_ptr src, size_t src_byte_len, |
| | 1215 | size_t *src_bytes_used) const |
| | 1216 | { |
| | 1217 | size_t copy_len; |
| | 1218 | |
| | 1219 | /* |
| | 1220 | * if they didn't give us a destination buffer, tell them how much |
| | 1221 | * space is needed for the copy - this is identical to the length of |
| | 1222 | * the source string since we make no changes to it |
| | 1223 | */ |
| | 1224 | if (dest == 0) |
| | 1225 | { |
| | 1226 | *src_bytes_used = 0; |
| | 1227 | return src_byte_len; |
| | 1228 | } |
| | 1229 | |
| | 1230 | /* copy as much as we can, up to the output buffer length */ |
| | 1231 | copy_len = src_byte_len; |
| | 1232 | if (copy_len > dest_len) |
| | 1233 | copy_len = dest_len; |
| | 1234 | |
| | 1235 | /* |
| | 1236 | * if the last byte we'd copy is a continuation byte, don't copy it |
| | 1237 | * so that we keep whole characters intact |
| | 1238 | */ |
| | 1239 | if (copy_len > 0 |
| | 1240 | && utf8_ptr::s_is_continuation(src.getptr() + copy_len - 1)) |
| | 1241 | { |
| | 1242 | /* don't copy this byte */ |
| | 1243 | --copy_len; |
| | 1244 | |
| | 1245 | /* |
| | 1246 | * check the previous byte as well, since a given character can |
| | 1247 | * be up to three bytes long (hence we might have two |
| | 1248 | * continuation bytes) |
| | 1249 | */ |
| | 1250 | if (copy_len > 0 |
| | 1251 | && utf8_ptr::s_is_continuation(src.getptr() + copy_len - 1)) |
| | 1252 | --copy_len; |
| | 1253 | } |
| | 1254 | |
| | 1255 | /* if we have an output buffer, copy the data */ |
| | 1256 | if (dest != 0) |
| | 1257 | memcpy(dest, src.getptr(), copy_len); |
| | 1258 | |
| | 1259 | /* set the amount we copied, if the caller is interested */ |
| | 1260 | if (src_bytes_used != 0) |
| | 1261 | *src_bytes_used = copy_len; |
| | 1262 | |
| | 1263 | /* return the number of bytes we put in the destination buffer */ |
| | 1264 | return copy_len; |
| | 1265 | } |
| | 1266 | |
| | 1267 | /* |
| | 1268 | * Map a null-terminated UTF-8 string |
| | 1269 | */ |
| | 1270 | size_t CCharmapToLocalUTF8::map_utf8z(char *dest, size_t dest_len, |
| | 1271 | utf8_ptr src) const |
| | 1272 | { |
| | 1273 | size_t src_len; |
| | 1274 | |
| | 1275 | /* get the source length */ |
| | 1276 | src_len = strlen(src.getptr()); |
| | 1277 | |
| | 1278 | /* copy the bytes */ |
| | 1279 | map_utf8(dest, dest_len, src, src_len, 0); |
| | 1280 | |
| | 1281 | /* |
| | 1282 | * if there's room for the null terminator (which takes up just one |
| | 1283 | * byte in UTF-8), add it |
| | 1284 | */ |
| | 1285 | if (dest_len > src_len) |
| | 1286 | *(dest + src_len) = '\0'; |
| | 1287 | |
| | 1288 | /* |
| | 1289 | * return the amount of space needed to copy the whole string -- |
| | 1290 | * this is identical to the source length, since we don't make any |
| | 1291 | * changes to it |
| | 1292 | */ |
| | 1293 | return src_len; |
| | 1294 | } |
| | 1295 | |
| | 1296 | |
| | 1297 | /* ------------------------------------------------------------------------ */ |
| | 1298 | /* |
| | 1299 | * Character mapper - Unicode to Single-byte |
| | 1300 | */ |
| | 1301 | |
| | 1302 | /* |
| | 1303 | * map a character |
| | 1304 | */ |
| | 1305 | size_t CCharmapToLocalSB::map(wchar_t unicode_char, char **output_ptr, |
| | 1306 | size_t *output_len) const |
| | 1307 | { |
| | 1308 | const unsigned char *mapping; |
| | 1309 | size_t map_len; |
| | 1310 | |
| | 1311 | /* get the mapping */ |
| | 1312 | mapping = get_xlation(unicode_char, &map_len); |
| | 1313 | |
| | 1314 | /* if we don't have room for one more character, abort */ |
| | 1315 | if (*output_len < map_len) |
| | 1316 | { |
| | 1317 | *output_len = 0; |
| | 1318 | return map_len; |
| | 1319 | } |
| | 1320 | |
| | 1321 | /* copy the mapping */ |
| | 1322 | memcpy(*output_ptr, mapping, map_len); |
| | 1323 | |
| | 1324 | /* increment the pointer by the number of characters we copied */ |
| | 1325 | *output_ptr += map_len; |
| | 1326 | |
| | 1327 | /* adjust the remaining output length */ |
| | 1328 | *output_len -= map_len; |
| | 1329 | |
| | 1330 | /* return the size of the result */ |
| | 1331 | return map_len; |
| | 1332 | } |
| | 1333 | |
| | 1334 | /* |
| | 1335 | * Map a UTF-8 string of known byte length to the local character set |
| | 1336 | */ |
| | 1337 | size_t CCharmapToLocalSB::map_utf8(char *dest, size_t dest_len, |
| | 1338 | utf8_ptr src, size_t src_byte_len, |
| | 1339 | size_t *src_bytes_used) const |
| | 1340 | { |
| | 1341 | utf8_ptr src_start; |
| | 1342 | size_t cur_total; |
| | 1343 | char *srcend; |
| | 1344 | |
| | 1345 | /* remember where we started */ |
| | 1346 | src_start = src; |
| | 1347 | |
| | 1348 | /* compute where the source buffer ends */ |
| | 1349 | srcend = src.getptr() + src_byte_len; |
| | 1350 | |
| | 1351 | /* copy characters until we reach the end of the source string */ |
| | 1352 | for (cur_total = 0 ; src.getptr() < srcend ; src.inc()) |
| | 1353 | { |
| | 1354 | const unsigned char *mapping; |
| | 1355 | size_t map_len; |
| | 1356 | |
| | 1357 | /* get the mapping for this character */ |
| | 1358 | mapping = get_xlation(src.getch(), &map_len); |
| | 1359 | |
| | 1360 | /* |
| | 1361 | * if we have room, add it; otherwise, zero the output length |
| | 1362 | * remaining so we don't try to add anything more |
| | 1363 | */ |
| | 1364 | if (dest == 0) |
| | 1365 | { |
| | 1366 | /* we're just counting */ |
| | 1367 | } |
| | 1368 | else if (map_len <= dest_len) |
| | 1369 | { |
| | 1370 | /* add the sequence */ |
| | 1371 | memcpy(dest, mapping, map_len); |
| | 1372 | |
| | 1373 | /* adjust the output pointer and length remaining */ |
| | 1374 | dest += map_len; |
| | 1375 | dest_len -= map_len; |
| | 1376 | } |
| | 1377 | else |
| | 1378 | { |
| | 1379 | /* it doesn't fit - stop now */ |
| | 1380 | break; |
| | 1381 | } |
| | 1382 | |
| | 1383 | /* count the length in the total */ |
| | 1384 | cur_total += map_len; |
| | 1385 | } |
| | 1386 | |
| | 1387 | /* if the caller wants to know how much space we used, tell them */ |
| | 1388 | if (src_bytes_used != 0) |
| | 1389 | *src_bytes_used = src.getptr() - src_start.getptr(); |
| | 1390 | |
| | 1391 | /* return the total length of the result */ |
| | 1392 | return cur_total; |
| | 1393 | } |
| | 1394 | |
| | 1395 | /* |
| | 1396 | * Map a null-terminated UTF-8 string to the local character set |
| | 1397 | */ |
| | 1398 | size_t CCharmapToLocalSB::map_utf8z(char *dest, size_t dest_len, |
| | 1399 | utf8_ptr src) const |
| | 1400 | { |
| | 1401 | size_t cur_total; |
| | 1402 | |
| | 1403 | /* copy characters until we find the terminating null */ |
| | 1404 | for (cur_total = 0 ; src.getch() != 0 ; src.inc()) |
| | 1405 | { |
| | 1406 | const unsigned char *mapping; |
| | 1407 | size_t map_len; |
| | 1408 | |
| | 1409 | /* get the mapping for this character */ |
| | 1410 | mapping = get_xlation(src.getch(), &map_len); |
| | 1411 | |
| | 1412 | /* |
| | 1413 | * if we have room, add it; otherwise, zero the output length |
| | 1414 | * remaining so we don't try to add anything more |
| | 1415 | */ |
| | 1416 | if (map_len <= dest_len) |
| | 1417 | { |
| | 1418 | /* add the sequence */ |
| | 1419 | memcpy(dest, mapping, map_len); |
| | 1420 | |
| | 1421 | /* adjust the output pointer and length remaining */ |
| | 1422 | dest += map_len; |
| | 1423 | dest_len -= map_len; |
| | 1424 | } |
| | 1425 | else |
| | 1426 | { |
| | 1427 | /* it doesn't fit - zero the output length remaining */ |
| | 1428 | dest_len = 0; |
| | 1429 | } |
| | 1430 | |
| | 1431 | /* count the length in the total */ |
| | 1432 | cur_total += map_len; |
| | 1433 | } |
| | 1434 | |
| | 1435 | /* |
| | 1436 | * add a null terminator, if there's room, but don't count it in the |
| | 1437 | * output length |
| | 1438 | */ |
| | 1439 | if (dest_len > 0) |
| | 1440 | *dest = '\0'; |
| | 1441 | |
| | 1442 | /* return the total length of the result */ |
| | 1443 | return cur_total; |
| | 1444 | } |
| | 1445 | |
| | 1446 | |
| | 1447 | /* ------------------------------------------------------------------------ */ |
| | 1448 | /* |
| | 1449 | * Character mapper - Unicode to 16-bit Wide Unicode local character set |
| | 1450 | */ |
| | 1451 | |
| | 1452 | /* |
| | 1453 | * map a character |
| | 1454 | */ |
| | 1455 | size_t CCharmapToLocalWideUnicode::map(wchar_t unicode_char, |
| | 1456 | char **output_ptr, |
| | 1457 | size_t *output_len) const |
| | 1458 | { |
| | 1459 | /* if we don't have room for another wchar_t, abort */ |
| | 1460 | if (*output_len < sizeof(wchar_t)) |
| | 1461 | { |
| | 1462 | *output_len = 0; |
| | 1463 | return sizeof(wchar_t); |
| | 1464 | } |
| | 1465 | |
| | 1466 | /* |
| | 1467 | * Set the wide character to the unicode value, with no translation |
| | 1468 | * - unicode is the same everywhere. |
| | 1469 | * |
| | 1470 | * Note that the need to perform this trivial translation for this |
| | 1471 | * character set is a secondary reason that this routine is virtual |
| | 1472 | * (the primary reason is to handle the default ASCII translation). |
| | 1473 | */ |
| | 1474 | **(wchar_t **)output_ptr = unicode_char; |
| | 1475 | |
| | 1476 | /* increment the pointer by the size of a wide character */ |
| | 1477 | ++(*(wchar_t **)output_ptr); |
| | 1478 | |
| | 1479 | /* return the size of the result */ |
| | 1480 | return sizeof(wchar_t); |
| | 1481 | } |
| | 1482 | |
| | 1483 | /* |
| | 1484 | * Map a UTF-8 string of known byte length to the local character set |
| | 1485 | */ |
| | 1486 | size_t CCharmapToLocalWideUnicode:: |
| | 1487 | map_utf8(char *dest, size_t dest_len, |
| | 1488 | utf8_ptr src, size_t src_byte_len, |
| | 1489 | size_t *src_bytes_used) const |
| | 1490 | { |
| | 1491 | utf8_ptr src_start; |
| | 1492 | size_t cur_total; |
| | 1493 | char *srcend; |
| | 1494 | wchar_t *destw; |
| | 1495 | |
| | 1496 | /* remember where we started */ |
| | 1497 | src_start = src; |
| | 1498 | |
| | 1499 | /* compute where the source buffer ends */ |
| | 1500 | srcend = src.getptr() + src_byte_len; |
| | 1501 | |
| | 1502 | /* set up a wchar_t output pointer for convenience */ |
| | 1503 | destw = (wchar_t *)dest; |
| | 1504 | |
| | 1505 | /* copy characters until we reach the end of the source string */ |
| | 1506 | for (cur_total = 0 ; src.getptr() < srcend ; src.inc()) |
| | 1507 | { |
| | 1508 | /* |
| | 1509 | * if we have room, add it; otherwise, zero the output length |
| | 1510 | * remaining so we don't try to add anything more |
| | 1511 | */ |
| | 1512 | if (dest == 0) |
| | 1513 | { |
| | 1514 | /* we're just counting - don't store anything */ |
| | 1515 | } |
| | 1516 | else if (dest_len >= sizeof(wchar_t)) |
| | 1517 | { |
| | 1518 | /* add the sequence */ |
| | 1519 | *destw++ = src.getch(); |
| | 1520 | |
| | 1521 | /* adjust the length remaining */ |
| | 1522 | dest_len -= sizeof(wchar_t); |
| | 1523 | } |
| | 1524 | else |
| | 1525 | { |
| | 1526 | /* it doesn't fit - stop now */ |
| | 1527 | break; |
| | 1528 | } |
| | 1529 | |
| | 1530 | /* count the length in the total */ |
| | 1531 | cur_total += sizeof(wchar_t); |
| | 1532 | } |
| | 1533 | |
| | 1534 | /* if the caller wants to know how much space we used, tell them */ |
| | 1535 | if (src_bytes_used != 0) |
| | 1536 | *src_bytes_used = src.getptr() - src_start.getptr(); |
| | 1537 | |
| | 1538 | /* return the total length of the result */ |
| | 1539 | return cur_total; |
| | 1540 | } |
| | 1541 | |
| | 1542 | /* |
| | 1543 | * Map a null-terminated UTF-8 string to the local character set |
| | 1544 | */ |
| | 1545 | size_t CCharmapToLocalWideUnicode:: |
| | 1546 | map_utf8z(char *dest, size_t dest_len, utf8_ptr src) const |
| | 1547 | { |
| | 1548 | size_t cur_total; |
| | 1549 | wchar_t *destw; |
| | 1550 | |
| | 1551 | /* set up a wchar_t output pointer for convenience */ |
| | 1552 | destw = (wchar_t *)dest; |
| | 1553 | |
| | 1554 | /* copy characters until we find the terminating null */ |
| | 1555 | for (cur_total = 0 ; src.getch() != 0 ; src.inc()) |
| | 1556 | { |
| | 1557 | /* |
| | 1558 | * if we have room, add it; otherwise, zero the output length |
| | 1559 | * remaining so we don't try to add anything more |
| | 1560 | */ |
| | 1561 | if (dest_len >= sizeof(wchar_t)) |
| | 1562 | { |
| | 1563 | /* add the sequence */ |
| | 1564 | *destw++ = src.getch(); |
| | 1565 | |
| | 1566 | /* adjust the length remaining */ |
| | 1567 | dest_len -= sizeof(wchar_t); |
| | 1568 | } |
| | 1569 | else |
| | 1570 | { |
| | 1571 | /* it doesn't fit - zero the output length remaining */ |
| | 1572 | dest_len = 0; |
| | 1573 | } |
| | 1574 | |
| | 1575 | /* count the length in the total */ |
| | 1576 | cur_total += sizeof(wchar_t); |
| | 1577 | } |
| | 1578 | |
| | 1579 | /* |
| | 1580 | * if there's room for a null terminator character (not byte - we need |
| | 1581 | * to add an entire wide character), add it, but don't count it in the |
| | 1582 | * return length |
| | 1583 | */ |
| | 1584 | if (dest_len >= sizeof(wchar_t)) |
| | 1585 | *destw = '\0'; |
| | 1586 | |
| | 1587 | /* return the total length of the result */ |
| | 1588 | return cur_total; |
| | 1589 | } |
| | 1590 | |
| | 1591 | |
| | 1592 | /* ------------------------------------------------------------------------ */ |
| | 1593 | /* |
| | 1594 | * Character mapper for 16-bit Wide Unicode, big-endian. Stores the |
| | 1595 | * characters in big-endian UCS-2 representation. |
| | 1596 | */ |
| | 1597 | size_t CCharmapToLocalUcs2Big::map(wchar_t unicode_char, char **output_ptr, |
| | 1598 | size_t *output_len) const |
| | 1599 | { |
| | 1600 | /* |
| | 1601 | * If we don't have room for another byte pair, abort. Note that we |
| | 1602 | * really do want to store exactly two bytes, not sizeof(anything), |
| | 1603 | * since we're storing to the UCS-2 file format, which encodes each |
| | 1604 | * character in two bytes. |
| | 1605 | */ |
| | 1606 | if (*output_len < 2) |
| | 1607 | { |
| | 1608 | *output_len = 0; |
| | 1609 | return 2; |
| | 1610 | } |
| | 1611 | |
| | 1612 | /* |
| | 1613 | * Store the big-endian 16-bit value with no translation - unicode |
| | 1614 | * is the same everywhere. |
| | 1615 | * |
| | 1616 | * Note that the need to perform this trivial translation for this |
| | 1617 | * character set is a secondary reason that this routine is virtual |
| | 1618 | * (the primary reason is to handle the default ASCII translation). |
| | 1619 | * |
| | 1620 | * Store the high-order 8 bits in the first byte, and the low-order |
| | 1621 | * 8 bits in the second byte. |
| | 1622 | */ |
| | 1623 | **output_ptr = ((unicode_char >> 8) & 0xff); |
| | 1624 | *(*output_ptr + 1) = (unicode_char & 0xff); |
| | 1625 | |
| | 1626 | /* skip two bytes in the output */ |
| | 1627 | *output_ptr += 2; |
| | 1628 | *output_len -= 2; |
| | 1629 | |
| | 1630 | /* return the size of the result */ |
| | 1631 | return 2; |
| | 1632 | } |
| | 1633 | |
| | 1634 | /* |
| | 1635 | * Map a UTF-8 string of known byte length to the local character set |
| | 1636 | */ |
| | 1637 | size_t CCharmapToLocalUcs2Big:: |
| | 1638 | map_utf8(char *dest, size_t dest_len, |
| | 1639 | utf8_ptr src, size_t src_byte_len, |
| | 1640 | size_t *src_bytes_used) const |
| | 1641 | { |
| | 1642 | utf8_ptr src_start; |
| | 1643 | size_t cur_total; |
| | 1644 | char *srcend; |
| | 1645 | |
| | 1646 | /* remember where we started */ |
| | 1647 | src_start = src; |
| | 1648 | |
| | 1649 | /* compute where the source buffer ends */ |
| | 1650 | srcend = src.getptr() + src_byte_len; |
| | 1651 | |
| | 1652 | /* copy characters until we reach the end of the source string */ |
| | 1653 | for (cur_total = 0 ; src.getptr() < srcend ; src.inc()) |
| | 1654 | { |
| | 1655 | /* |
| | 1656 | * if we have room, add it; otherwise, zero the output length |
| | 1657 | * remaining so we don't try to add anything more |
| | 1658 | */ |
| | 1659 | if (dest == 0) |
| | 1660 | { |
| | 1661 | /* we're not storing anything */ |
| | 1662 | } |
| | 1663 | else if (dest_len >= 2) |
| | 1664 | { |
| | 1665 | wchar_t unicode_char; |
| | 1666 | |
| | 1667 | /* get the current character */ |
| | 1668 | unicode_char = src.getch(); |
| | 1669 | |
| | 1670 | /* add the sequence */ |
| | 1671 | *dest++ = ((unicode_char >> 8) & 0xff); |
| | 1672 | *dest++ = (unicode_char & 0xff); |
| | 1673 | |
| | 1674 | /* adjust the length remaining */ |
| | 1675 | dest_len -= 2; |
| | 1676 | } |
| | 1677 | else |
| | 1678 | { |
| | 1679 | /* it doesn't fit - stop now */ |
| | 1680 | break; |
| | 1681 | } |
| | 1682 | |
| | 1683 | /* count the length in the total */ |
| | 1684 | cur_total += 2; |
| | 1685 | } |
| | 1686 | |
| | 1687 | /* if the caller wants to know how much space we used, tell them */ |
| | 1688 | if (src_bytes_used != 0) |
| | 1689 | *src_bytes_used = src.getptr() - src_start.getptr(); |
| | 1690 | |
| | 1691 | /* return the total length of the result */ |
| | 1692 | return cur_total; |
| | 1693 | } |
| | 1694 | |
| | 1695 | /* |
| | 1696 | * Map a null-terminated UTF-8 string to the local character set |
| | 1697 | */ |
| | 1698 | size_t CCharmapToLocalUcs2Big:: |
| | 1699 | map_utf8z(char *dest, size_t dest_len, utf8_ptr src) const |
| | 1700 | { |
| | 1701 | size_t cur_total; |
| | 1702 | |
| | 1703 | /* copy characters until we find the terminating null */ |
| | 1704 | for (cur_total = 0 ; src.getch() != 0 ; src.inc()) |
| | 1705 | { |
| | 1706 | /* |
| | 1707 | * if we have room, add it; otherwise, zero the output length |
| | 1708 | * remaining so we don't try to add anything more |
| | 1709 | */ |
| | 1710 | if (dest_len >= 2) |
| | 1711 | { |
| | 1712 | wchar_t unicode_char; |
| | 1713 | |
| | 1714 | /* get the current character */ |
| | 1715 | unicode_char = src.getch(); |
| | 1716 | |
| | 1717 | /* add the sequence */ |
| | 1718 | *dest++ = ((unicode_char >> 8) & 0xff); |
| | 1719 | *dest++ = (unicode_char & 0xff); |
| | 1720 | |
| | 1721 | /* adjust the length remaining */ |
| | 1722 | dest_len -= 2; |
| | 1723 | } |
| | 1724 | else |
| | 1725 | { |
| | 1726 | /* it doesn't fit - zero the output length remaining */ |
| | 1727 | dest_len = 0; |
| | 1728 | } |
| | 1729 | |
| | 1730 | /* count the length in the total */ |
| | 1731 | cur_total += 2; |
| | 1732 | } |
| | 1733 | |
| | 1734 | /* return the total length of the result */ |
| | 1735 | return cur_total; |
| | 1736 | } |
| | 1737 | |
| | 1738 | |
| | 1739 | /* ------------------------------------------------------------------------ */ |
| | 1740 | /* |
| | 1741 | * Character mapper for 16-bit Wide Unicode, little-endian. Stores the |
| | 1742 | * characters in little-endian UCS-2 representation. |
| | 1743 | */ |
| | 1744 | size_t CCharmapToLocalUcs2Little::map(wchar_t unicode_char, |
| | 1745 | char **output_ptr, |
| | 1746 | size_t *output_len) const |
| | 1747 | { |
| | 1748 | /* |
| | 1749 | * If we don't have room for another byte pair, abort. Note that we |
| | 1750 | * really do want to store exactly two bytes, not sizeof(anything), |
| | 1751 | * since we're storing to the UCS-2 file format, which encodes each |
| | 1752 | * character in two bytes. |
| | 1753 | */ |
| | 1754 | if (*output_len < 2) |
| | 1755 | { |
| | 1756 | *output_len = 0; |
| | 1757 | return 2; |
| | 1758 | } |
| | 1759 | |
| | 1760 | /* |
| | 1761 | * Store the little-endian 16-bit value with no translation - |
| | 1762 | * unicode is the same everywhere. |
| | 1763 | * |
| | 1764 | * Note that the need to perform this trivial translation for this |
| | 1765 | * character set is a secondary reason that this routine is virtual |
| | 1766 | * (the primary reason is to handle the default ASCII translation). |
| | 1767 | * |
| | 1768 | * Store the low-order 8 bits in the first byte, and the high-order |
| | 1769 | * 8 bits in the second byte. |
| | 1770 | */ |
| | 1771 | **output_ptr = (unicode_char & 0xff); |
| | 1772 | *(*output_ptr + 1) = ((unicode_char >> 8) & 0xff); |
| | 1773 | |
| | 1774 | /* skip two bytes in the output */ |
| | 1775 | *output_ptr += 2; |
| | 1776 | *output_len -= 2; |
| | 1777 | |
| | 1778 | /* return the size of the result */ |
| | 1779 | return 2; |
| | 1780 | } |
| | 1781 | |
| | 1782 | /* |
| | 1783 | * Map a UTF-8 string of known byte length to the local character set |
| | 1784 | */ |
| | 1785 | size_t CCharmapToLocalUcs2Little:: |
| | 1786 | map_utf8(char *dest, size_t dest_len, |
| | 1787 | utf8_ptr src, size_t src_byte_len, |
| | 1788 | size_t *src_bytes_used) const |
| | 1789 | { |
| | 1790 | utf8_ptr src_start; |
| | 1791 | size_t cur_total; |
| | 1792 | char *srcend; |
| | 1793 | |
| | 1794 | /* remember where we started */ |
| | 1795 | src_start = src; |
| | 1796 | |
| | 1797 | /* compute where the source buffer ends */ |
| | 1798 | srcend = src.getptr() + src_byte_len; |
| | 1799 | |
| | 1800 | /* copy characters until we reach the end of the source string */ |
| | 1801 | for (cur_total = 0 ; src.getptr() < srcend ; src.inc()) |
| | 1802 | { |
| | 1803 | /* |
| | 1804 | * if we have room, add it; otherwise, zero the output length |
| | 1805 | * remaining so we don't try to add anything more |
| | 1806 | */ |
| | 1807 | if (dest == 0) |
| | 1808 | { |
| | 1809 | /* we're just counting - don't store anything */ |
| | 1810 | } |
| | 1811 | else if (dest_len >= 2) |
| | 1812 | { |
| | 1813 | wchar_t unicode_char; |
| | 1814 | |
| | 1815 | /* get the current character */ |
| | 1816 | unicode_char = src.getch(); |
| | 1817 | |
| | 1818 | /* add the sequence */ |
| | 1819 | *dest++ = (unicode_char & 0xff); |
| | 1820 | *dest++ = ((unicode_char >> 8) & 0xff); |
| | 1821 | |
| | 1822 | /* adjust the length remaining */ |
| | 1823 | dest_len -= 2; |
| | 1824 | } |
| | 1825 | else |
| | 1826 | { |
| | 1827 | /* it doesn't fit - stop now */ |
| | 1828 | break; |
| | 1829 | } |
| | 1830 | |
| | 1831 | /* count the length in the total */ |
| | 1832 | cur_total += 2; |
| | 1833 | } |
| | 1834 | |
| | 1835 | /* if the caller wants to know how much space we used, tell them */ |
| | 1836 | if (src_bytes_used != 0) |
| | 1837 | *src_bytes_used = src.getptr() - src_start.getptr(); |
| | 1838 | |
| | 1839 | /* return the total length of the result */ |
| | 1840 | return cur_total; |
| | 1841 | } |
| | 1842 | |
| | 1843 | /* |
| | 1844 | * Map a null-terminated UTF-8 string to the local character set |
| | 1845 | */ |
| | 1846 | size_t CCharmapToLocalUcs2Little:: |
| | 1847 | map_utf8z(char *dest, size_t dest_len, utf8_ptr src) const |
| | 1848 | { |
| | 1849 | size_t cur_total; |
| | 1850 | |
| | 1851 | /* copy characters until we find the terminating null */ |
| | 1852 | for (cur_total = 0 ; src.getch() != 0 ; src.inc()) |
| | 1853 | { |
| | 1854 | /* |
| | 1855 | * if we have room, add it; otherwise, zero the output length |
| | 1856 | * remaining so we don't try to add anything more |
| | 1857 | */ |
| | 1858 | if (dest_len >= 2) |
| | 1859 | { |
| | 1860 | wchar_t unicode_char; |
| | 1861 | |
| | 1862 | /* get the current character */ |
| | 1863 | unicode_char = src.getch(); |
| | 1864 | |
| | 1865 | /* add the sequence */ |
| | 1866 | *dest++ = (unicode_char & 0xff); |
| | 1867 | *dest++ = ((unicode_char >> 8) & 0xff); |
| | 1868 | |
| | 1869 | /* adjust the length remaining */ |
| | 1870 | dest_len -= 2; |
| | 1871 | } |
| | 1872 | else |
| | 1873 | { |
| | 1874 | /* it doesn't fit - zero the output length remaining */ |
| | 1875 | dest_len = 0; |
| | 1876 | } |
| | 1877 | |
| | 1878 | /* count the length in the total */ |
| | 1879 | cur_total += 2; |
| | 1880 | } |
| | 1881 | |
| | 1882 | /* |
| | 1883 | * if there's room for a null terminator character (which takes two |
| | 1884 | * bytes in UCS-2), add it, but don't count it in the return length |
| | 1885 | */ |
| | 1886 | if (dest_len >= 2) |
| | 1887 | { |
| | 1888 | *dest++ = '\0'; |
| | 1889 | *dest++ = '\0'; |
| | 1890 | } |
| | 1891 | |
| | 1892 | /* return the total length of the result */ |
| | 1893 | return cur_total; |
| | 1894 | } |
| | 1895 | |
| | 1896 | |
| | 1897 | /* ------------------------------------------------------------------------ */ |
| | 1898 | /* |
| | 1899 | * Character mapper - local to UTF-8 |
| | 1900 | */ |
| | 1901 | |
| | 1902 | /* |
| | 1903 | * create an appropriate mapping object for the given mapping file |
| | 1904 | */ |
| | 1905 | CCharmapToUni *CCharmapToUni::load(class CResLoader *res_loader, |
| | 1906 | const char *table_name) |
| | 1907 | { |
| | 1908 | osfildef *fp; |
| | 1909 | CCharmapToUni *mapper; |
| | 1910 | charmap_type_t map_type; |
| | 1911 | |
| | 1912 | /* if they want a trivial UTF-8 translator, return one */ |
| | 1913 | if (stricmp(table_name, "utf-8") == 0 |
| | 1914 | || stricmp(table_name, "utf8") == 0) |
| | 1915 | return new CCharmapToUniUTF8(); |
| | 1916 | |
| | 1917 | /* if they want a 16-bit Unicode mapping, return one */ |
| | 1918 | if (stricmp(table_name, "utf-16le") == 0 |
| | 1919 | || stricmp(table_name, "unicodel") == 0) |
| | 1920 | return new CCharmapToUniUcs2Little(); |
| | 1921 | if (stricmp(table_name, "utf-16be") == 0 |
| | 1922 | || stricmp(table_name, "unicodeb") == 0) |
| | 1923 | return new CCharmapToUniUcs2Big(); |
| | 1924 | |
| | 1925 | /* presume failure */ |
| | 1926 | mapper = 0; |
| | 1927 | |
| | 1928 | /* open and characterize the mapping file */ |
| | 1929 | fp = open_map_file(res_loader, table_name, &map_type); |
| | 1930 | |
| | 1931 | /* check to make sure we opened a file */ |
| | 1932 | if (fp == 0) |
| | 1933 | { |
| | 1934 | /* |
| | 1935 | * if there was no file, and they want a plain ASCII translator, |
| | 1936 | * return a default ASCII translator |
| | 1937 | */ |
| | 1938 | if (name_is_ascii_synonym(table_name)) |
| | 1939 | return new CCharmapToUniASCII(); |
| | 1940 | |
| | 1941 | /* if they want an ISO-8859-1 translator, return a default one */ |
| | 1942 | if (name_is_8859_1_synonym(table_name)) |
| | 1943 | return new CCharmapToUni8859_1(); |
| | 1944 | |
| | 1945 | /* return failure */ |
| | 1946 | return 0; |
| | 1947 | } |
| | 1948 | |
| | 1949 | /* create an appropriate mapper */ |
| | 1950 | switch(map_type) |
| | 1951 | { |
| | 1952 | case CHARMAP_TYPE_SB: |
| | 1953 | /* create a single-byte mapper */ |
| | 1954 | mapper = new CCharmapToUniSB(); |
| | 1955 | break; |
| | 1956 | |
| | 1957 | case CHARMAP_TYPE_DB: |
| | 1958 | /* create a double-byte mapper */ |
| | 1959 | mapper = new CCharmapToUniDB(); |
| | 1960 | break; |
| | 1961 | |
| | 1962 | case CHARMAP_TYPE_MB: |
| | 1963 | /* create a mixed multi-byte mapper */ |
| | 1964 | mapper = new CCharmapToUniMB(); |
| | 1965 | break; |
| | 1966 | |
| | 1967 | default: |
| | 1968 | /* other mapper types are currently unknown */ |
| | 1969 | break; |
| | 1970 | } |
| | 1971 | |
| | 1972 | /* if we successfully created a mapper, tell it to load the table */ |
| | 1973 | if (mapper != 0) |
| | 1974 | { |
| | 1975 | /* load the table */ |
| | 1976 | mapper->load_table(fp); |
| | 1977 | } |
| | 1978 | |
| | 1979 | /* close the file */ |
| | 1980 | osfcls(fp); |
| | 1981 | |
| | 1982 | /* return the mapper, if any */ |
| | 1983 | return mapper; |
| | 1984 | } |
| | 1985 | |
| | 1986 | |
| | 1987 | /* |
| | 1988 | * load a mapping table |
| | 1989 | */ |
| | 1990 | void CCharmapToUni::load_table(osfildef *fp) |
| | 1991 | { |
| | 1992 | uchar buf[256]; |
| | 1993 | uint entry_cnt; |
| | 1994 | |
| | 1995 | /* read the header and the local table header */ |
| | 1996 | if (osfrb(fp, buf, 6)) |
| | 1997 | return; |
| | 1998 | |
| | 1999 | /* get the local table size from the local table header */ |
| | 2000 | entry_cnt = osrp2(buf + 4); |
| | 2001 | |
| | 2002 | /* read the mappings */ |
| | 2003 | while (entry_cnt > 0) |
| | 2004 | { |
| | 2005 | size_t cur; |
| | 2006 | const uchar *p; |
| | 2007 | |
| | 2008 | /* figure out how many entries we can read this time */ |
| | 2009 | cur = sizeof(buf)/4; |
| | 2010 | if (cur > entry_cnt) |
| | 2011 | cur = entry_cnt; |
| | 2012 | |
| | 2013 | /* read the entries */ |
| | 2014 | if (osfrb(fp, buf, cur*4)) |
| | 2015 | return; |
| | 2016 | |
| | 2017 | /* deduct this number from the remaining count */ |
| | 2018 | entry_cnt -= cur; |
| | 2019 | |
| | 2020 | /* scan the entries */ |
| | 2021 | for (p = buf ; cur > 0 ; p += 4, --cur) |
| | 2022 | { |
| | 2023 | /* map this entry */ |
| | 2024 | set_mapping(osrp2(p), osrp2(p+2)); |
| | 2025 | } |
| | 2026 | } |
| | 2027 | } |
| | 2028 | |
| | 2029 | /* |
| | 2030 | * Map a null-terminated string into a buffer |
| | 2031 | */ |
| | 2032 | size_t CCharmapToUni::map_str(char *outbuf, size_t outbuflen, |
| | 2033 | const char *input_str) |
| | 2034 | { |
| | 2035 | size_t input_len; |
| | 2036 | size_t output_len; |
| | 2037 | |
| | 2038 | /* get the length of the input string */ |
| | 2039 | input_len = strlen(input_str); |
| | 2040 | |
| | 2041 | /* map the string to the output buffer */ |
| | 2042 | output_len = map(&outbuf, &outbuflen, input_str, input_len); |
| | 2043 | |
| | 2044 | /* if there's space remaining in the output buffer, add the null byte */ |
| | 2045 | if (outbuflen != 0) |
| | 2046 | *outbuf = '\0'; |
| | 2047 | |
| | 2048 | /* return the number of bytes needed for the conversion */ |
| | 2049 | return output_len; |
| | 2050 | } |
| | 2051 | |
| | 2052 | /* ------------------------------------------------------------------------ */ |
| | 2053 | /* |
| | 2054 | * Basic single-byte character set to UTF-8 mapper |
| | 2055 | */ |
| | 2056 | |
| | 2057 | /* |
| | 2058 | * read from a single-byte file and translate to UTF-8 |
| | 2059 | */ |
| | 2060 | size_t CCharmapToUniSB_basic::read_file(osfildef *fp, |
| | 2061 | char *buf, size_t bufl, |
| | 2062 | unsigned long read_limit) |
| | 2063 | { |
| | 2064 | size_t inlen; |
| | 2065 | |
| | 2066 | /* |
| | 2067 | * Compute how much to read from the file. The input file is |
| | 2068 | * composed of single-byte characters, so only read up to one third |
| | 2069 | * of the buffer length; this will ensure that we can always fit |
| | 2070 | * what we read into the caller's buffer. |
| | 2071 | */ |
| | 2072 | inlen = bufl / 3; |
| | 2073 | |
| | 2074 | /* in any case, we can't read more than our own buffer size */ |
| | 2075 | if (inlen > sizeof(inbuf_)) |
| | 2076 | inlen = sizeof(inbuf_); |
| | 2077 | |
| | 2078 | /* limit the read length to the caller's read limit, if appropriate */ |
| | 2079 | if (read_limit != 0 && inlen > read_limit) |
| | 2080 | inlen = (size_t)read_limit; |
| | 2081 | |
| | 2082 | /* read from the file */ |
| | 2083 | inlen = osfrbc(fp, inbuf_, inlen); |
| | 2084 | |
| | 2085 | /* |
| | 2086 | * Map data to the caller's buffer, and return the result. We're |
| | 2087 | * certain that the data will fit in the caller's buffer: we're |
| | 2088 | * mapping only a third as many characters as we have bytes |
| | 2089 | * available, and each character can take up at most three bytes, |
| | 2090 | * hence the worst case is that we fill the buffer completely. |
| | 2091 | * |
| | 2092 | * On the other hand, we may only fill the buffer to a third of its |
| | 2093 | * capacity, but this is okay too, since we're not required to give |
| | 2094 | * the caller everything they asked for. |
| | 2095 | */ |
| | 2096 | return map(&buf, &bufl, inbuf_, inlen); |
| | 2097 | } |
| | 2098 | |
| | 2099 | |
| | 2100 | /* ------------------------------------------------------------------------ */ |
| | 2101 | /* |
| | 2102 | * Plain ASCII local to UTF-8 mapper |
| | 2103 | */ |
| | 2104 | |
| | 2105 | /* |
| | 2106 | * map a string from the single-byte local character set to UTF-8 |
| | 2107 | */ |
| | 2108 | size_t CCharmapToUniASCII::map(char **outp, size_t *outlen, |
| | 2109 | const char *inp, size_t inlen) const |
| | 2110 | { |
| | 2111 | size_t tot_outlen; |
| | 2112 | |
| | 2113 | /* we haven't written any characters to the output buffer yet */ |
| | 2114 | tot_outlen = 0; |
| | 2115 | |
| | 2116 | /* scan each character (character == byte) in the input string */ |
| | 2117 | for ( ; inlen > 0 ; --inlen, ++inp) |
| | 2118 | { |
| | 2119 | wchar_t uni; |
| | 2120 | size_t csiz; |
| | 2121 | |
| | 2122 | /* |
| | 2123 | * map any character outside of the 7-bit range to U+FFFD, the |
| | 2124 | * Unicode REPLACEMENT CHARACTER, which is the standard way to |
| | 2125 | * represent characters that can't be mapped from an incoming |
| | 2126 | * character set |
| | 2127 | */ |
| | 2128 | if (((unsigned char)*inp) > 127) |
| | 2129 | uni = 0xfffd; |
| | 2130 | else |
| | 2131 | uni = ((wchar_t)(unsigned char)*inp); |
| | 2132 | |
| | 2133 | /* get the size of this character */ |
| | 2134 | csiz = utf8_ptr::s_wchar_size(uni); |
| | 2135 | |
| | 2136 | /* add it to the total output lenght */ |
| | 2137 | tot_outlen += csiz; |
| | 2138 | |
| | 2139 | /* if there's room, add it to our output buffer */ |
| | 2140 | if (*outlen >= csiz) |
| | 2141 | { |
| | 2142 | /* write it out */ |
| | 2143 | *outp += utf8_ptr::s_putch(*outp, uni); |
| | 2144 | |
| | 2145 | /* deduct it from the remaining output length */ |
| | 2146 | *outlen -= csiz; |
| | 2147 | } |
| | 2148 | else |
| | 2149 | { |
| | 2150 | /* there's no room - set the remaining output length to zero */ |
| | 2151 | *outlen = 0; |
| | 2152 | } |
| | 2153 | } |
| | 2154 | |
| | 2155 | /* return the total output length */ |
| | 2156 | return tot_outlen; |
| | 2157 | } |
| | 2158 | |
| | 2159 | |
| | 2160 | /* ------------------------------------------------------------------------ */ |
| | 2161 | /* |
| | 2162 | * Single-byte mapped local to UTF-8 mapper |
| | 2163 | */ |
| | 2164 | |
| | 2165 | /* |
| | 2166 | * map a string from the single-byte local character set to UTF-8 |
| | 2167 | */ |
| | 2168 | size_t CCharmapToUniSB::map(char **outp, size_t *outlen, |
| | 2169 | const char *inp, size_t inlen) const |
| | 2170 | { |
| | 2171 | size_t tot_outlen; |
| | 2172 | |
| | 2173 | /* we haven't written any characters to the output buffer yet */ |
| | 2174 | tot_outlen = 0; |
| | 2175 | |
| | 2176 | /* scan each character (character == byte) in the input string */ |
| | 2177 | for ( ; inlen > 0 ; --inlen, ++inp) |
| | 2178 | { |
| | 2179 | wchar_t uni; |
| | 2180 | size_t csiz; |
| | 2181 | |
| | 2182 | /* get the unicode mapping for this character */ |
| | 2183 | uni = map_[(unsigned char)*inp]; |
| | 2184 | |
| | 2185 | /* get the size of this character */ |
| | 2186 | csiz = utf8_ptr::s_wchar_size(uni); |
| | 2187 | |
| | 2188 | /* add it to the total output lenght */ |
| | 2189 | tot_outlen += csiz; |
| | 2190 | |
| | 2191 | /* if there's room, add it to our output buffer */ |
| | 2192 | if (*outlen >= csiz) |
| | 2193 | { |
| | 2194 | /* write it out */ |
| | 2195 | *outp += utf8_ptr::s_putch(*outp, uni); |
| | 2196 | |
| | 2197 | /* deduct it from the remaining output length */ |
| | 2198 | *outlen -= csiz; |
| | 2199 | } |
| | 2200 | else |
| | 2201 | { |
| | 2202 | /* there's no room - set the remaining output length to zero */ |
| | 2203 | *outlen = 0; |
| | 2204 | } |
| | 2205 | } |
| | 2206 | |
| | 2207 | /* return the total output length */ |
| | 2208 | return tot_outlen; |
| | 2209 | } |
| | 2210 | |
| | 2211 | /* ------------------------------------------------------------------------ */ |
| | 2212 | /* |
| | 2213 | * Trivial UTF8-to-UTF8 input mapper |
| | 2214 | */ |
| | 2215 | |
| | 2216 | /* |
| | 2217 | * map a string |
| | 2218 | */ |
| | 2219 | size_t CCharmapToUniUTF8::map2(char **outp, size_t *outlen, |
| | 2220 | const char *inp, size_t inlen, |
| | 2221 | size_t *partial_len) const |
| | 2222 | { |
| | 2223 | size_t copy_len; |
| | 2224 | |
| | 2225 | /* |
| | 2226 | * Make sure we copy only whole characters, by truncating the string |
| | 2227 | * to a length that includes only whole characters. |
| | 2228 | */ |
| | 2229 | copy_len = utf8_ptr::s_trunc(inp, inlen); |
| | 2230 | |
| | 2231 | /* |
| | 2232 | * note the length of any partial characters at the end of the buffer |
| | 2233 | * for the caller - this is simply the difference between the original |
| | 2234 | * length and the truncated copy length, since the truncation length |
| | 2235 | * is simply the length excluding the partial last character bytes |
| | 2236 | */ |
| | 2237 | *partial_len = inlen - copy_len; |
| | 2238 | |
| | 2239 | /* limit the copying to what will fit in the output buffer */ |
| | 2240 | if (copy_len > *outlen) |
| | 2241 | { |
| | 2242 | /* don't copy more than will fit, and don't copy partial characters */ |
| | 2243 | copy_len = utf8_ptr::s_trunc(inp, *outlen); |
| | 2244 | |
| | 2245 | /* we don't have enough room, so set the output size to zero */ |
| | 2246 | *outlen = 0; |
| | 2247 | } |
| | 2248 | else |
| | 2249 | { |
| | 2250 | /* we have room, so decrement the output size by the copy size */ |
| | 2251 | *outlen -= copy_len; |
| | 2252 | } |
| | 2253 | |
| | 2254 | /* copy the data */ |
| | 2255 | memcpy(*outp, inp, copy_len); |
| | 2256 | |
| | 2257 | /* advance the output pointer past the copied data */ |
| | 2258 | *outp += copy_len; |
| | 2259 | |
| | 2260 | /* |
| | 2261 | * return the total input length -- the total output length is |
| | 2262 | * always identical to the input length, because we don't change |
| | 2263 | * anything |
| | 2264 | */ |
| | 2265 | return inlen; |
| | 2266 | } |
| | 2267 | |
| | 2268 | /* |
| | 2269 | * read a file |
| | 2270 | */ |
| | 2271 | size_t CCharmapToUniUTF8::read_file(osfildef *fp, |
| | 2272 | char *buf, size_t bufl, |
| | 2273 | unsigned long read_limit) |
| | 2274 | { |
| | 2275 | size_t read_len; |
| | 2276 | char *last_start; |
| | 2277 | size_t last_got_len; |
| | 2278 | size_t last_need_len; |
| | 2279 | |
| | 2280 | /* make sure we don't read past the read limit, if applicable */ |
| | 2281 | if (read_limit != 0 && bufl > read_limit) |
| | 2282 | bufl = (size_t)read_limit; |
| | 2283 | |
| | 2284 | /* |
| | 2285 | * Read directly from the file, up the buffer size minus two bytes. |
| | 2286 | * We want to leave two extra bytes so that we can read any extra |
| | 2287 | * continuation bytes for the last character, in order to ensure |
| | 2288 | * that we always read whole characters; in the worst case, the last |
| | 2289 | * character could be three bytes long, in which case we'd need to |
| | 2290 | * read two extra bytes. |
| | 2291 | * |
| | 2292 | * If the available buffer size is less than three bytes, just read |
| | 2293 | * the number of bytes they asked for and don't bother trying to |
| | 2294 | * keep continuation sequences intact. |
| | 2295 | */ |
| | 2296 | if (bufl < 3) |
| | 2297 | return osfrbc(fp, buf, bufl); |
| | 2298 | |
| | 2299 | /* |
| | 2300 | * read up to the buffer size, less two bytes for possible |
| | 2301 | * continuation bytes |
| | 2302 | */ |
| | 2303 | read_len = osfrbc(fp, buf, bufl - 2); |
| | 2304 | |
| | 2305 | /* |
| | 2306 | * if we didn't satisfy the entire request, we're at the end of the |
| | 2307 | * file, so there's no point in trying to finish off any |
| | 2308 | * continuation sequences - in this case, just return what we have |
| | 2309 | */ |
| | 2310 | if (read_len < bufl - 2) |
| | 2311 | return read_len; |
| | 2312 | |
| | 2313 | /* |
| | 2314 | * Check the last byte we read to see if there's another byte or two |
| | 2315 | * following. |
| | 2316 | * |
| | 2317 | * If the last byte is a continuation byte, this is a bit trickier. |
| | 2318 | * We must back up to the preceding lead byte to figure out what we |
| | 2319 | * have in this case. |
| | 2320 | */ |
| | 2321 | last_start = &buf[read_len - 1]; |
| | 2322 | last_got_len = 1; |
| | 2323 | if (utf8_ptr::s_is_continuation(last_start)) |
| | 2324 | { |
| | 2325 | /* |
| | 2326 | * if we only read one byte, simply return the one byte - we |
| | 2327 | * started in the middle of a sequence, so there's no way we can |
| | 2328 | * read a complete sequence |
| | 2329 | */ |
| | 2330 | if (read_len == 1) |
| | 2331 | return read_len; |
| | 2332 | |
| | 2333 | /* back up to the byte we're continuing from */ |
| | 2334 | --last_start; |
| | 2335 | ++last_got_len; |
| | 2336 | |
| | 2337 | /* |
| | 2338 | * if this is another continuation byte, we've reached the maximum |
| | 2339 | * byte length of three for a single character, so there's no way |
| | 2340 | * we could need to read anything more |
| | 2341 | */ |
| | 2342 | if (utf8_ptr::s_is_continuation(last_start)) |
| | 2343 | return read_len; |
| | 2344 | } |
| | 2345 | |
| | 2346 | /* |
| | 2347 | * Okay: we have last_start pointing to the start of the last |
| | 2348 | * character, and last_got_len the number of bytes we actually have for |
| | 2349 | * that last character. If the needed length differs from the length |
| | 2350 | * we actually have, we need to read more. |
| | 2351 | */ |
| | 2352 | last_need_len = utf8_ptr::s_charsize(*last_start); |
| | 2353 | if (last_need_len > last_got_len) |
| | 2354 | { |
| | 2355 | /* |
| | 2356 | * we need more than we actually read, so read the remaining |
| | 2357 | * characters |
| | 2358 | */ |
| | 2359 | read_len += osfrbc(fp, buf + read_len, last_need_len - last_got_len); |
| | 2360 | } |
| | 2361 | |
| | 2362 | /* return the length we read */ |
| | 2363 | return read_len; |
| | 2364 | } |
| | 2365 | |
| | 2366 | /* ------------------------------------------------------------------------ */ |
| | 2367 | /* |
| | 2368 | * Basic UCS-2 to UTF-8 mapper |
| | 2369 | */ |
| | 2370 | |
| | 2371 | /* |
| | 2372 | * Read from a file, translating to UTF-8 encoding |
| | 2373 | */ |
| | 2374 | size_t CCharmapToUniUcs2::read_file(osfildef *fp, |
| | 2375 | char *buf, size_t bufl, |
| | 2376 | unsigned long read_limit) |
| | 2377 | { |
| | 2378 | size_t inlen; |
| | 2379 | |
| | 2380 | /* |
| | 2381 | * Compute how much to read from the file. The input file is composed |
| | 2382 | * of two-byte characters, so only read up to two thirds of the buffer |
| | 2383 | * length; this will ensure that we can always fit what we read into |
| | 2384 | * the caller's buffer. |
| | 2385 | * |
| | 2386 | * Note that we divide by three first, then double the result, to |
| | 2387 | * ensure that we read an even number of bytes. Each UCS-2 character |
| | 2388 | * is represented in exactly two bytes, so we must always read pairs of |
| | 2389 | * bytes to be sure we're reading whole characters. |
| | 2390 | */ |
| | 2391 | inlen = bufl / 3; |
| | 2392 | inlen *= 2; |
| | 2393 | |
| | 2394 | /* in any case, we can't read more than our own buffer size */ |
| | 2395 | if (inlen > sizeof(inbuf_)) |
| | 2396 | inlen = sizeof(inbuf_); |
| | 2397 | |
| | 2398 | /* don't read past the read limit, if applicable */ |
| | 2399 | if (read_limit != 0 && inlen > read_limit) |
| | 2400 | inlen = (size_t)read_limit; |
| | 2401 | |
| | 2402 | /* read from the file */ |
| | 2403 | inlen = osfrbc(fp, inbuf_, inlen); |
| | 2404 | |
| | 2405 | /* |
| | 2406 | * Map data to the caller's buffer, and return the result. We're |
| | 2407 | * certain that the data will fit in the caller's buffer: we're |
| | 2408 | * mapping only a third as many characters as we have bytes |
| | 2409 | * available, and each character can take up at most three bytes, |
| | 2410 | * hence the worst case is that we fill the buffer completely. |
| | 2411 | * |
| | 2412 | * On the other hand, we may only fill the buffer to a third of its |
| | 2413 | * capacity, but this is okay too, since we're not required to give |
| | 2414 | * the caller everything they asked for. |
| | 2415 | */ |
| | 2416 | return map(&buf, &bufl, inbuf_, inlen); |
| | 2417 | } |
| | 2418 | |
| | 2419 | /* ------------------------------------------------------------------------ */ |
| | 2420 | /* |
| | 2421 | * UCS-2 little-endian to UTF-8 mapper |
| | 2422 | */ |
| | 2423 | |
| | 2424 | /* |
| | 2425 | * map a string |
| | 2426 | */ |
| | 2427 | size_t CCharmapToUniUcs2Little::map(char **outp, size_t *outlen, |
| | 2428 | const char *inp, size_t inlen) const |
| | 2429 | { |
| | 2430 | size_t tot_outlen; |
| | 2431 | |
| | 2432 | /* we haven't written any characters to the output buffer yet */ |
| | 2433 | tot_outlen = 0; |
| | 2434 | |
| | 2435 | /* scan each character (character == byte pair) in the input string */ |
| | 2436 | for ( ; inlen > 1 ; inlen -= 2, inp += 2) |
| | 2437 | { |
| | 2438 | wchar_t uni; |
| | 2439 | size_t csiz; |
| | 2440 | |
| | 2441 | /* |
| | 2442 | * read the little-endian two-byte value - no mapping is |
| | 2443 | * required, since UCS-2 uses the same code point assignments as |
| | 2444 | * UTF-8 |
| | 2445 | */ |
| | 2446 | uni = ((wchar_t)(unsigned char)*inp) |
| | 2447 | + (((wchar_t)(unsigned char)*(inp + 1)) << 8); |
| | 2448 | |
| | 2449 | /* get the size of this character */ |
| | 2450 | csiz = utf8_ptr::s_wchar_size(uni); |
| | 2451 | |
| | 2452 | /* add it to the total output lenght */ |
| | 2453 | tot_outlen += csiz; |
| | 2454 | |
| | 2455 | /* if there's room, add it to our output buffer */ |
| | 2456 | if (*outlen >= csiz) |
| | 2457 | { |
| | 2458 | /* write it out */ |
| | 2459 | *outp += utf8_ptr::s_putch(*outp, uni); |
| | 2460 | |
| | 2461 | /* deduct it from the remaining output length */ |
| | 2462 | *outlen -= csiz; |
| | 2463 | } |
| | 2464 | else |
| | 2465 | { |
| | 2466 | /* there's no room - set the remaining output length to zero */ |
| | 2467 | *outlen = 0; |
| | 2468 | } |
| | 2469 | } |
| | 2470 | |
| | 2471 | /* return the total output length */ |
| | 2472 | return tot_outlen; |
| | 2473 | } |
| | 2474 | |
| | 2475 | /* ------------------------------------------------------------------------ */ |
| | 2476 | /* |
| | 2477 | * UCS-2 big-endian to UTF-8 mapper |
| | 2478 | */ |
| | 2479 | |
| | 2480 | /* |
| | 2481 | * map a string |
| | 2482 | */ |
| | 2483 | size_t CCharmapToUniUcs2Big::map(char **outp, size_t *outlen, |
| | 2484 | const char *inp, size_t inlen) const |
| | 2485 | { |
| | 2486 | size_t tot_outlen; |
| | 2487 | |
| | 2488 | /* we haven't written any characters to the output buffer yet */ |
| | 2489 | tot_outlen = 0; |
| | 2490 | |
| | 2491 | /* scan each character (character == byte pair) in the input string */ |
| | 2492 | for ( ; inlen > 1 ; inlen -= 2, inp += 2) |
| | 2493 | { |
| | 2494 | wchar_t uni; |
| | 2495 | size_t csiz; |
| | 2496 | |
| | 2497 | /* |
| | 2498 | * read the big-endian two-byte value - no mapping is required, |
| | 2499 | * since UCS-2 uses the same code point assignments as UTF-8 |
| | 2500 | */ |
| | 2501 | uni = (((wchar_t)(unsigned char)*inp) << 8) |
| | 2502 | + ((wchar_t)(unsigned char)*(inp + 1)); |
| | 2503 | |
| | 2504 | /* get the size of this character */ |
| | 2505 | csiz = utf8_ptr::s_wchar_size(uni); |
| | 2506 | |
| | 2507 | /* add it to the total output lenght */ |
| | 2508 | tot_outlen += csiz; |
| | 2509 | |
| | 2510 | /* if there's room, add it to our output buffer */ |
| | 2511 | if (*outlen >= csiz) |
| | 2512 | { |
| | 2513 | /* write it out */ |
| | 2514 | *outp += utf8_ptr::s_putch(*outp, uni); |
| | 2515 | |
| | 2516 | /* deduct it from the remaining output length */ |
| | 2517 | *outlen -= csiz; |
| | 2518 | } |
| | 2519 | else |
| | 2520 | { |
| | 2521 | /* there's no room - set the remaining output length to zero */ |
| | 2522 | *outlen = 0; |
| | 2523 | } |
| | 2524 | } |
| | 2525 | |
| | 2526 | /* return the total output length */ |
| | 2527 | return tot_outlen; |
| | 2528 | } |
| | 2529 | |
| | 2530 | /* ------------------------------------------------------------------------ */ |
| | 2531 | /* |
| | 2532 | * Multi-byte character set translation to Unicode |
| | 2533 | */ |
| | 2534 | |
| | 2535 | /* |
| | 2536 | * construct the mapper |
| | 2537 | */ |
| | 2538 | CCharmapToUniMB::CCharmapToUniMB() |
| | 2539 | { |
| | 2540 | int i; |
| | 2541 | cmap_mb_entry *p; |
| | 2542 | |
| | 2543 | /* clear out the mapping table */ |
| | 2544 | for (i = 0, p = map_ ; i < 256 ; ++i, ++p) |
| | 2545 | { |
| | 2546 | /* assume this lead byte won't have a sub-table */ |
| | 2547 | p->sub = 0; |
| | 2548 | |
| | 2549 | /* |
| | 2550 | * we don't have a mapping for this lead byte yet, so use U+FFFD |
| | 2551 | * (the Unicode REPLACEMENT CHARACTER) as the default mapping in |
| | 2552 | * case we never assign it any other mapping |
| | 2553 | */ |
| | 2554 | p->ch = 0xFFFD; |
| | 2555 | } |
| | 2556 | } |
| | 2557 | |
| | 2558 | /* |
| | 2559 | * delete the mapper |
| | 2560 | */ |
| | 2561 | CCharmapToUniMB::~CCharmapToUniMB() |
| | 2562 | { |
| | 2563 | int i; |
| | 2564 | cmap_mb_entry *p; |
| | 2565 | |
| | 2566 | /* delete all of our sub-tables */ |
| | 2567 | for (i = 0, p = map_ ; i < 256 ; ++i, ++p) |
| | 2568 | { |
| | 2569 | /* if this sub-table was allocated, delete it */ |
| | 2570 | if (p->sub != 0) |
| | 2571 | t3free(p->sub); |
| | 2572 | } |
| | 2573 | } |
| | 2574 | |
| | 2575 | /* |
| | 2576 | * Set a mapping |
| | 2577 | */ |
| | 2578 | void CCharmapToUniMB::set_mapping(wchar_t uni_code_pt, wchar_t local_code_pt) |
| | 2579 | { |
| | 2580 | /* |
| | 2581 | * Check to see if it's a one-byte or two-byte mapping. If the local |
| | 2582 | * code point is in the range 0-255, it's a one-byte character; |
| | 2583 | * otherwise, it's a two-byte character. |
| | 2584 | */ |
| | 2585 | if (local_code_pt <= 255) |
| | 2586 | { |
| | 2587 | /* it's a single-byte character, so simply set the mapping */ |
| | 2588 | map_[(unsigned char)local_code_pt].ch = uni_code_pt; |
| | 2589 | } |
| | 2590 | else |
| | 2591 | { |
| | 2592 | cmap_mb_entry *entp; |
| | 2593 | wchar_t *subp; |
| | 2594 | |
| | 2595 | /* |
| | 2596 | * Get the mapping table entry for the lead byte. The lead byte of |
| | 2597 | * the local code point is given by the high-order byte of the |
| | 2598 | * local code point. (Note that this doesn't have anything to do |
| | 2599 | * with the endian-ness of the local platform. The generic Unicode |
| | 2600 | * mapping tables are specifically designed this way, independently |
| | 2601 | * of endian-ness.) |
| | 2602 | */ |
| | 2603 | entp = &map_[(unsigned char)((local_code_pt >> 8) & 0xff)]; |
| | 2604 | |
| | 2605 | /* |
| | 2606 | * It's a two-byte character. The high-order byte is the lead |
| | 2607 | * byte, and the low-order byte is the trailing byte of the |
| | 2608 | * two-byte sequence. |
| | 2609 | * |
| | 2610 | * If we haven't previously set up a sub-table for the lead byte, |
| | 2611 | * do so now. |
| | 2612 | */ |
| | 2613 | if ((subp = entp->sub) == 0) |
| | 2614 | { |
| | 2615 | size_t i; |
| | 2616 | wchar_t *p; |
| | 2617 | |
| | 2618 | /* allocate a new sub-mapping table for the lead byte */ |
| | 2619 | subp = entp->sub = (wchar_t *)t3malloc(256 * sizeof(wchar_t)); |
| | 2620 | |
| | 2621 | /* initialize each entry to U+FFFD, in case we never map them */ |
| | 2622 | for (i = 256, p = subp ; i != 0 ; --i, *p++ = 0xFFFD) ; |
| | 2623 | } |
| | 2624 | |
| | 2625 | /* set the mapping in the sub-table for the second byte */ |
| | 2626 | subp[(unsigned char)(local_code_pt & 0xff)] = uni_code_pt; |
| | 2627 | } |
| | 2628 | } |
| | 2629 | |
| | 2630 | /* |
| | 2631 | * map a string, providing partial character info |
| | 2632 | */ |
| | 2633 | size_t CCharmapToUniMB::map2(char **output_ptr, size_t *output_buf_len, |
| | 2634 | const char *input_ptr, size_t input_len, |
| | 2635 | size_t *partial_len) const |
| | 2636 | { |
| | 2637 | size_t needed_out_len; |
| | 2638 | |
| | 2639 | /* presume we won't have a partial last character */ |
| | 2640 | *partial_len = 0; |
| | 2641 | |
| | 2642 | /* we haven't found anything to store in the output yet */ |
| | 2643 | needed_out_len = 0; |
| | 2644 | |
| | 2645 | /* keep going until we've mapped each character */ |
| | 2646 | while (input_len != 0) |
| | 2647 | { |
| | 2648 | unsigned char c; |
| | 2649 | const cmap_mb_entry *entp; |
| | 2650 | wchar_t wc; |
| | 2651 | size_t wlen; |
| | 2652 | |
| | 2653 | /* get the lead byte of the next input character */ |
| | 2654 | c = *input_ptr; |
| | 2655 | |
| | 2656 | /* get the primary mapping table entry for the lead byte */ |
| | 2657 | entp = &map_[c]; |
| | 2658 | |
| | 2659 | /* check for a one-byte or two-byte mapping */ |
| | 2660 | if (entp->sub == 0) |
| | 2661 | { |
| | 2662 | /* it's a one-byte character - get the mapping */ |
| | 2663 | wc = entp->ch; |
| | 2664 | |
| | 2665 | /* skip the single byte of input */ |
| | 2666 | ++input_ptr; |
| | 2667 | --input_len; |
| | 2668 | } |
| | 2669 | else |
| | 2670 | { |
| | 2671 | /* |
| | 2672 | * it's a two-byte character lead byte - make sure we have a |
| | 2673 | * complete input character |
| | 2674 | */ |
| | 2675 | if (input_len < 2) |
| | 2676 | { |
| | 2677 | /* we have an incomplete last character - tell the caller */ |
| | 2678 | *partial_len = 1; |
| | 2679 | |
| | 2680 | /* we're done mapping it */ |
| | 2681 | break; |
| | 2682 | } |
| | 2683 | |
| | 2684 | /* get the second byte of the sequence */ |
| | 2685 | c = input_ptr[1]; |
| | 2686 | |
| | 2687 | /* get the translation from the sub-table */ |
| | 2688 | wc = entp->sub[c]; |
| | 2689 | |
| | 2690 | /* skip the two-byte sequence */ |
| | 2691 | input_ptr += 2; |
| | 2692 | input_len -= 2; |
| | 2693 | } |
| | 2694 | |
| | 2695 | /* we have the translation - note its stored UTF-8 byte size */ |
| | 2696 | wlen = utf8_ptr::s_wchar_size(wc); |
| | 2697 | |
| | 2698 | /* check for room to store the output character */ |
| | 2699 | if (wlen > *output_buf_len) |
| | 2700 | { |
| | 2701 | /* |
| | 2702 | * there's no room to store this character - zero out the |
| | 2703 | * output buffer length so that we know not to try storing |
| | 2704 | * anything else in the buffer |
| | 2705 | */ |
| | 2706 | *output_buf_len = 0; |
| | 2707 | } |
| | 2708 | else |
| | 2709 | { |
| | 2710 | /* there's room - store it */ |
| | 2711 | wlen = utf8_ptr::s_putch(*output_ptr, wc); |
| | 2712 | |
| | 2713 | /* consume output buffer space */ |
| | 2714 | *output_ptr += wlen; |
| | 2715 | *output_buf_len -= wlen; |
| | 2716 | } |
| | 2717 | |
| | 2718 | /* count the needed length, whether we stored it or not */ |
| | 2719 | needed_out_len += wlen; |
| | 2720 | } |
| | 2721 | |
| | 2722 | /* return the required output length */ |
| | 2723 | return needed_out_len; |
| | 2724 | } |
| | 2725 | |
| | 2726 | /* |
| | 2727 | * read from a multi-byte input file, translating to UTF-8 |
| | 2728 | */ |
| | 2729 | size_t CCharmapToUniMB::read_file(osfildef *fp, char *buf, size_t bufl, |
| | 2730 | unsigned long read_limit) |
| | 2731 | { |
| | 2732 | size_t inlen; |
| | 2733 | size_t outlen; |
| | 2734 | size_t partial; |
| | 2735 | |
| | 2736 | /* |
| | 2737 | * Compute how much to read from the file. The input file is composed |
| | 2738 | * of one-byte or two-byte characters, so only read up to one-third of |
| | 2739 | * the caller's buffer length; this will ensure that in the worst case |
| | 2740 | * we can always fit what we read into the caller's buffer. (The worst |
| | 2741 | * case is that the input is entirely single-byte local characters that |
| | 2742 | * translate into three-byte UTF-8 characters.) |
| | 2743 | */ |
| | 2744 | inlen = bufl / 3; |
| | 2745 | |
| | 2746 | /* in any case, we can't read more than our own buffer size */ |
| | 2747 | if (inlen >= sizeof(inbuf_)) |
| | 2748 | inlen = sizeof(inbuf_); |
| | 2749 | |
| | 2750 | /* limit the read length to the caller's read limit, if appropriate */ |
| | 2751 | if (read_limit != 0 && inlen > read_limit) |
| | 2752 | inlen = (size_t)read_limit; |
| | 2753 | |
| | 2754 | /* read raw bytes from the file */ |
| | 2755 | inlen = osfrbc(fp, inbuf_, inlen); |
| | 2756 | |
| | 2757 | /* |
| | 2758 | * Map data to the caller's buffer. Note if we have a partial |
| | 2759 | * character at the end of the buffer (i.e., the last byte of the |
| | 2760 | * buffer is a lead byte that requires a second byte to make up a |
| | 2761 | * complete two-byte local character), so that we can read an |
| | 2762 | * additional byte to complete the two-byte final character if |
| | 2763 | * necessary. |
| | 2764 | */ |
| | 2765 | outlen = map2(&buf, &bufl, inbuf_, inlen, &partial); |
| | 2766 | |
| | 2767 | /* |
| | 2768 | * if we have a partial trailing character, read the other half of the |
| | 2769 | * final character |
| | 2770 | */ |
| | 2771 | if (partial != 0) |
| | 2772 | { |
| | 2773 | /* move the lead byte to the start of our buffer */ |
| | 2774 | inbuf_[0] = inbuf_[inlen - 1]; |
| | 2775 | |
| | 2776 | /* read the extra byte to form a complete character */ |
| | 2777 | inlen = 1 + osfrbc(fp, inbuf_ + 1, 1); |
| | 2778 | |
| | 2779 | /* if we got the second byte, map the complete final character */ |
| | 2780 | if (inlen == 2) |
| | 2781 | outlen += map(&buf, &bufl, inbuf_, inlen); |
| | 2782 | } |
| | 2783 | |
| | 2784 | /* return the result length */ |
| | 2785 | return outlen; |
| | 2786 | } |
| | 2787 | |