| | 1 | #ifdef RCSID |
| | 2 | static char RCSid[] = |
| | 3 | "$Header: d:/cvsroot/tads/tads3/TCSRC.CPP,v 1.3 1999/07/11 00:46:55 MJRoberts Exp $"; |
| | 4 | #endif |
| | 5 | |
| | 6 | /* |
| | 7 | * Copyright (c) 1999, 2002 Michael J. Roberts. All Rights Reserved. |
| | 8 | * |
| | 9 | * Please see the accompanying license file, LICENSE.TXT, for information |
| | 10 | * on using and copying this software. |
| | 11 | */ |
| | 12 | /* |
| | 13 | Name |
| | 14 | tcsrc.cpp - source file reader |
| | 15 | Function |
| | 16 | |
| | 17 | Notes |
| | 18 | |
| | 19 | Modified |
| | 20 | 04/13/99 MJRoberts - Creation |
| | 21 | */ |
| | 22 | |
| | 23 | #include <string.h> |
| | 24 | #include <stdlib.h> |
| | 25 | |
| | 26 | #include "os.h" |
| | 27 | #include "t3std.h" |
| | 28 | #include "tcsrc.h" |
| | 29 | #include "tcglob.h" |
| | 30 | #include "charmap.h" |
| | 31 | |
| | 32 | |
| | 33 | /* ------------------------------------------------------------------------ */ |
| | 34 | /* |
| | 35 | * Deletion |
| | 36 | */ |
| | 37 | CTcSrcFile::~CTcSrcFile() |
| | 38 | { |
| | 39 | /* close my source file */ |
| | 40 | if (fp_ != 0) |
| | 41 | osfcls(fp_); |
| | 42 | |
| | 43 | /* release my character mapper */ |
| | 44 | if (mapper_ != 0) |
| | 45 | mapper_->release_ref(); |
| | 46 | } |
| | 47 | |
| | 48 | |
#if 0
// we don't currently need this, but keep the source in case it
// becomes interesting later
//
/* ------------------------------------------------------------------------ */
/*
 *   Open a file that carries no #charset marker, choosing the decoder from
 *   a leading Unicode byte-order marker when one is present:
 *
 *     EF BB BF  - UTF-8
 *     FF FE     - UCS-2, low-order byte first (little-endian)
 *     FE FF     - UCS-2, high-order byte first (big-endian)
 *
 *   The marker bytes are skipped so that the reader never returns them.
 *   If there is no marker, the file is read as plain ASCII starting from
 *   its first byte.
 *
 *   filename - name of the file to open
 *
 *   Returns a new source file reader, or null if the file can't be opened.
 */
CTcSrcFile *CTcSrcFile::open_plain(const char *filename)
{
    osfildef *fp;
    char buf[5];
    size_t siz;
    long startofs;

    /*
     *   open the file in binary mode, since we do all of the newline
     *   interpretation explicitly
     */
    if ((fp = osfoprb(filename, OSFTTEXT)) == 0)
        return 0;

    /*
     *   remember where the file data starts, so that we can position the
     *   reader relative to it after sniffing the first few bytes
     */
    startofs = osfpos(fp);

    /* read the first few bytes of the file */
    siz = osfrbc(fp, buf, sizeof(buf));

    /* check for a 3-byte UTF-8 marker */
    if (siz >= 3
        && (uchar)buf[0] == 0xEF
        && (uchar)buf[1] == 0xBB
        && (uchar)buf[2] == 0xBF)
    {
        /*
         *   seek to the byte after the marker, so that our caller won't
         *   see the marker
         */
        osfseek(fp, startofs + 3, OSFSK_SET);

        /* return a source file reader with a utf-8 mapper */
        return new CTcSrcFile(fp, new CCharmapToUniUTF8());
    }

    /* if we read at least two bytes, try auto-detecting UCS-2 */
    if (siz >= 2)
    {
        /* if the first bytes are 0xFF 0xFE, it's UCS-2 low-byte first */
        if ((unsigned char)buf[0] == 0xFF && (unsigned char)buf[1] == 0xFE)
        {
            /* seek to the byte after the marker */
            osfseek(fp, startofs + 2, OSFSK_SET);

            /* return a reader with a little-endian mapper */
            return new CTcSrcFile(fp, new CCharmapToUniUcs2Little());
        }

        /* if the first bytes are 0xFE 0xFF, it's UCS-2 high-byte first */
        if ((unsigned char)buf[0] == 0xFE && (unsigned char)buf[1] == 0xFF)
        {
            /* seek to the byte after the marker */
            osfseek(fp, startofs + 2, OSFSK_SET);

            /* return a reader with a big-endian mapper */
            return new CTcSrcFile(fp, new CCharmapToUniUcs2Big());
        }
    }

    /*
     *   There are no Unicode markers, so our only remaining option is
     *   plain ASCII.  The sniffing read above consumed the first few bytes
     *   of the file, so seek back to the start; otherwise the reader would
     *   silently skip the beginning of the source text.
     */
    osfseek(fp, startofs, OSFSK_SET);

    /* return a source file object with a plain ASCII mapper */
    return new CTcSrcFile(fp, new CCharmapToUniASCII());
}
#endif
| | 120 | |
| | 121 | /* ------------------------------------------------------------------------ */ |
| | 122 | /* |
| | 123 | * Open a plain ASCII source file. |
| | 124 | */ |
| | 125 | CTcSrcFile *CTcSrcFile::open_ascii(const char *filename) |
| | 126 | { |
| | 127 | osfildef *fp; |
| | 128 | |
| | 129 | /* |
| | 130 | * open the file in binary mode, since we do all of the newline |
| | 131 | * interpretation explicitly |
| | 132 | */ |
| | 133 | if ((fp = osfoprb(filename, OSFTTEXT)) == 0) |
| | 134 | return 0; |
| | 135 | |
| | 136 | /* return a source reader with a plain ASCII mapper */ |
| | 137 | return new CTcSrcFile(fp, new CCharmapToUniASCII()); |
| | 138 | } |
| | 139 | |
| | 140 | |
| | 141 | /* ------------------------------------------------------------------------ */ |
| | 142 | /* |
| | 143 | * Open a source file |
| | 144 | */ |
| | 145 | CTcSrcFile *CTcSrcFile::open_source(const char *filename, |
| | 146 | class CResLoader *res_loader, |
| | 147 | const char *default_charset, |
| | 148 | int *charset_error, |
| | 149 | int *default_charset_error) |
| | 150 | { |
| | 151 | char buf[275]; |
| | 152 | size_t siz; |
| | 153 | osfildef *fp; |
| | 154 | long startofs; |
| | 155 | CCharmapToUni *mapper; |
| | 156 | |
| | 157 | /* presume we won't find an invalid #charset directive */ |
| | 158 | *charset_error = FALSE; |
| | 159 | |
| | 160 | /* presume we'll have no problem with the default character set */ |
| | 161 | *default_charset_error = FALSE; |
| | 162 | |
| | 163 | /* |
| | 164 | * open the file in binary mode, so that we can scan the first few |
| | 165 | * bytes to see if we can detect the character set from information |
| | 166 | * at the beginning of the file |
| | 167 | */ |
| | 168 | fp = osfoprb(filename, OSFTTEXT); |
| | 169 | |
| | 170 | /* if we couldn't open the file, return failure */ |
| | 171 | if (fp == 0) |
| | 172 | return 0; |
| | 173 | |
| | 174 | /* note the starting offset in the file */ |
| | 175 | startofs = osfpos(fp); |
| | 176 | |
| | 177 | /* read the first few bytes of the file */ |
| | 178 | siz = osfrbc(fp, buf, sizeof(buf)); |
| | 179 | |
| | 180 | /* check for a 3-byte UTF-8 byte-order marker */ |
| | 181 | if (siz >= 3 && (uchar)buf[0] == 0xEF && (uchar)buf[1] == 0xBB |
| | 182 | && (uchar)buf[2] == 0xBF) |
| | 183 | { |
| | 184 | char *p; |
| | 185 | size_t rem; |
| | 186 | uint skip; |
| | 187 | |
| | 188 | /* skip at least the three-byte marker sequence */ |
| | 189 | skip = 3; |
| | 190 | |
| | 191 | /* |
| | 192 | * check for a #charset marker for utf-8 - this would be redundant, |
| | 193 | * but we'll allow it |
| | 194 | */ |
| | 195 | p = buf + 3; |
| | 196 | rem = siz - 3; |
| | 197 | if (rem > 9 && memcmp(p, "#charset ", 9) == 0) |
| | 198 | { |
| | 199 | /* skip spaces */ |
| | 200 | for (p += 9, rem -= 9 ; rem != 0 && (*p == ' ' || *p == '\t') ; |
| | 201 | ++p, --rem); |
| | 202 | |
| | 203 | /* check for valid character set markers */ |
| | 204 | if (rem >= 7 && memicmp(p, "\"utf-8\"", 7) == 0) |
| | 205 | { |
| | 206 | /* skip the whole sequence */ |
| | 207 | skip = (p + 7) - buf; |
| | 208 | } |
| | 209 | else if (rem >= 6 && memicmp(p, "\"utf8\"", 6) == 0) |
| | 210 | { |
| | 211 | /* skip the whole sequence */ |
| | 212 | skip = (p + 6) - buf; |
| | 213 | } |
| | 214 | } |
| | 215 | |
| | 216 | /* seek past the character set markers */ |
| | 217 | osfseek(fp, startofs + skip, OSFSK_SET); |
| | 218 | |
| | 219 | /* return a new utf-8 decoder */ |
| | 220 | return new CTcSrcFile(fp, new CCharmapToUniUTF8()); |
| | 221 | } |
| | 222 | |
| | 223 | /* if we read at least two bytes, try auto-detecting unicode */ |
| | 224 | if (siz >= 2) |
| | 225 | { |
| | 226 | CTcSrcFile *srcf; |
| | 227 | const char *const *cs_names; |
| | 228 | int bige; |
| | 229 | |
| | 230 | /* presume we won't find a byte-order marker */ |
| | 231 | srcf = 0; |
| | 232 | |
| | 233 | /* if the first bytes are 0xFF 0xFE, it's UCS-2 low-byte first */ |
| | 234 | if ((unsigned char)buf[0] == 0xFF && (unsigned char)buf[1] == 0xFE) |
| | 235 | { |
| | 236 | static const char *names[] = { "unicodel", "utf-16le", 0 }; |
| | 237 | |
| | 238 | /* create a UCS-2 little-endian reader */ |
| | 239 | srcf = new CTcSrcFile(fp, new CCharmapToUniUcs2Little()); |
| | 240 | bige = FALSE; |
| | 241 | cs_names = names; |
| | 242 | } |
| | 243 | |
| | 244 | /* if the first bytes are 0xFE 0xFF, it's UCS-2 high-byte first */ |
| | 245 | if ((unsigned char)buf[0] == 0xFE && (unsigned char)buf[1] == 0xFF) |
| | 246 | { |
| | 247 | static const char *names[] = { "unicodeb", "utf-16be", 0 }; |
| | 248 | |
| | 249 | /* create a UCS-2 little-endian reader */ |
| | 250 | srcf = new CTcSrcFile(fp, new CCharmapToUniUcs2Big()); |
| | 251 | bige = TRUE; |
| | 252 | cs_names = names; |
| | 253 | } |
| | 254 | |
| | 255 | /* if we found the byte-order marker, we know the character set */ |
| | 256 | if (srcf != 0) |
| | 257 | { |
| | 258 | uint skip; |
| | 259 | |
| | 260 | /* we at least want to skip the byte-order marker */ |
| | 261 | skip = 2; |
| | 262 | |
| | 263 | /* check to see if we have a '#charset' directive */ |
| | 264 | if (ucs_str_starts_with(buf + 2, siz - 2, "#charset ", |
| | 265 | bige, FALSE)) |
| | 266 | { |
| | 267 | char *p; |
| | 268 | size_t rem; |
| | 269 | |
| | 270 | /* scan past following spaces */ |
| | 271 | for (p = buf + 2 + 18, rem = siz - 2 - 18 ; |
| | 272 | rem >= 2 && (ucs_char_eq(p, ' ', bige, FALSE) |
| | 273 | || ucs_char_eq(p, '\t', bige, FALSE)) ; |
| | 274 | p += 2, rem -= 2) ; |
| | 275 | |
| | 276 | /* check for a '"' */ |
| | 277 | if (rem >= 2 && ucs_char_eq(p, '"', bige, FALSE)) |
| | 278 | { |
| | 279 | const char *const *n; |
| | 280 | |
| | 281 | /* skip the '"' */ |
| | 282 | p += 2; |
| | 283 | rem -= 2; |
| | 284 | |
| | 285 | /* |
| | 286 | * check for a match to any of the valid names for this |
| | 287 | * character set |
| | 288 | */ |
| | 289 | for (n = cs_names ; *n != 0 ; ++n) |
| | 290 | { |
| | 291 | /* if it's a match, stop scanning */ |
| | 292 | if (ucs_str_starts_with(p, rem, *n, bige, TRUE)) |
| | 293 | { |
| | 294 | size_t l; |
| | 295 | |
| | 296 | /* get the length of the name */ |
| | 297 | l = strlen(*n) * 2; |
| | 298 | |
| | 299 | /* check for a close quote */ |
| | 300 | if (rem >= l + 2 |
| | 301 | && ucs_char_eq(p + l, '"', bige, FALSE)) |
| | 302 | { |
| | 303 | /* skip the name and the quote */ |
| | 304 | p += l + 2; |
| | 305 | rem -= l + 2; |
| | 306 | |
| | 307 | /* skip the source text to this point */ |
| | 308 | skip = p - buf; |
| | 309 | |
| | 310 | /* stop scanning */ |
| | 311 | break; |
| | 312 | } |
| | 313 | } |
| | 314 | } |
| | 315 | } |
| | 316 | } |
| | 317 | |
| | 318 | /* seek just past the character set indicators */ |
| | 319 | osfseek(fp, startofs + skip, OSFSK_SET); |
| | 320 | |
| | 321 | /* return the file */ |
| | 322 | return srcf; |
| | 323 | } |
| | 324 | } |
| | 325 | |
| | 326 | /* |
| | 327 | * It doesn't appear to use UCS-2 encoding (at least, the file |
| | 328 | * doesn't start with a byte-order sensing sequence). Check to see |
| | 329 | * if the file starts with "#charset " in ASCII single-byte |
| | 330 | * characters. |
| | 331 | */ |
| | 332 | if (siz >= 9 && memcmp(buf, "#charset ", 9) == 0) |
| | 333 | { |
| | 334 | char *p; |
| | 335 | size_t rem; |
| | 336 | |
| | 337 | /* skip the #charset string and any following spaces */ |
| | 338 | for (p = buf + 9, rem = siz - 9 ; |
| | 339 | rem > 0 && (*p == ' ' || *p == '\t') ; ++p, --rem) ; |
| | 340 | |
| | 341 | /* make sure we're looking at a '"' */ |
| | 342 | if (rem != 0 && *p == '"') |
| | 343 | { |
| | 344 | char *charset_name; |
| | 345 | |
| | 346 | /* skip the open quote */ |
| | 347 | ++p; |
| | 348 | --rem; |
| | 349 | |
| | 350 | /* remember where the character set name starts */ |
| | 351 | charset_name = p; |
| | 352 | |
| | 353 | /* |
| | 354 | * find the closing quote, which must occur before a CR or |
| | 355 | * LF character |
| | 356 | */ |
| | 357 | for ( ; rem > 0 && *p != '"' && *p != 10 && *p != 13 ; |
| | 358 | ++p, --rem) ; |
| | 359 | |
| | 360 | /* make sure we found a matching quote */ |
| | 361 | if (rem != 0 && *p == '"') |
| | 362 | { |
| | 363 | /* seek just past the #charset string */ |
| | 364 | osfseek(fp, startofs + (p - buf) + 1, OSFSK_SET); |
| | 365 | |
| | 366 | /* |
| | 367 | * put a null terminator at the end of the character set |
| | 368 | * name |
| | 369 | */ |
| | 370 | *p = '\0'; |
| | 371 | |
| | 372 | /* create a mapper */ |
| | 373 | mapper = CCharmapToUni::load(res_loader, charset_name); |
| | 374 | |
| | 375 | /* |
| | 376 | * if that succeeded, return a reader for the mapper; |
| | 377 | * otherwise, simply proceed as though no #charset had |
| | 378 | * been present, so that we create a default mapper |
| | 379 | */ |
| | 380 | if (mapper != 0) |
| | 381 | { |
| | 382 | /* success - return a reader */ |
| | 383 | return new CTcSrcFile(fp, mapper); |
| | 384 | } |
| | 385 | else |
| | 386 | { |
| | 387 | /* tell the caller the #charset was invalid */ |
| | 388 | *charset_error = TRUE; |
| | 389 | } |
| | 390 | } |
| | 391 | } |
| | 392 | } |
| | 393 | |
| | 394 | /* |
| | 395 | * we didn't find any sensing codes, so seek back to the start of |
| | 396 | * the file |
| | 397 | */ |
| | 398 | osfseek(fp, startofs, OSFSK_SET); |
| | 399 | |
| | 400 | /* |
| | 401 | * We couldn't identify the file's character set based on anything |
| | 402 | * in the file, so create a mapper for the given default character |
| | 403 | * set. If there's not even a default character set defined, create |
| | 404 | * a plain ASCII mapper. |
| | 405 | */ |
| | 406 | if (default_charset != 0) |
| | 407 | mapper = CCharmapToUni::load(res_loader, default_charset); |
| | 408 | else |
| | 409 | mapper = new CCharmapToUniASCII(); |
| | 410 | |
| | 411 | /* check to see if we created a mapper */ |
| | 412 | if (mapper != 0) |
| | 413 | { |
| | 414 | /* return a source file reader based on the mapper */ |
| | 415 | return new CTcSrcFile(fp, mapper); |
| | 416 | } |
| | 417 | else |
| | 418 | { |
| | 419 | /* |
| | 420 | * we failed to create a mapper for the default character set - |
| | 421 | * flag the problem |
| | 422 | */ |
| | 423 | *default_charset_error = TRUE; |
| | 424 | |
| | 425 | /* close the input file */ |
| | 426 | osfcls(fp); |
| | 427 | |
| | 428 | /* return failure */ |
| | 429 | return 0; |
| | 430 | } |
| | 431 | } |
| | 432 | |
| | 433 | /* ------------------------------------------------------------------------ */ |
| | 434 | /* |
| | 435 | * Read a line of text from the file. |
| | 436 | */ |
| | 437 | size_t CTcSrcFile::read_line(char *buf, size_t bufl) |
| | 438 | { |
| | 439 | char *dst; |
| | 440 | |
| | 441 | /* start out writing to the start of the caller's buffer */ |
| | 442 | dst = buf; |
| | 443 | |
| | 444 | /* |
| | 445 | * Keep going until we run out of input file, fill up the buffer, or |
| | 446 | * reach the end of a line |
| | 447 | */ |
| | 448 | for (;;) |
| | 449 | { |
| | 450 | char *src; |
| | 451 | |
| | 452 | /* read some more data if our buffer is empty */ |
| | 453 | if (rem_ == 0) |
| | 454 | { |
| | 455 | /* load another buffer-full */ |
| | 456 | rem_ = mapper_->read_file(fp_, buf_, sizeof(buf_), 0); |
| | 457 | |
| | 458 | /* |
| | 459 | * If we didn't read anything, we've reached the end of the |
| | 460 | * file. If we've already copied anything into the caller's |
| | 461 | * buffer, null-terminate their buffer and return success; |
| | 462 | * otherwise, return failure, since the caller has already |
| | 463 | * read everything available from the file. |
| | 464 | */ |
| | 465 | if (rem_ == 0) |
| | 466 | { |
| | 467 | /* |
| | 468 | * Remember that we've reached the end of the file. |
| | 469 | * We're about to return the last of the data, so the |
| | 470 | * caller will not need to call us again (although it's |
| | 471 | * legal if they do - we'll just return a zero length on |
| | 472 | * the next call). |
| | 473 | */ |
| | 474 | at_eof_ = TRUE; |
| | 475 | |
| | 476 | /* check if we've copied anything to the caller's buffer */ |
| | 477 | if (buf == dst) |
| | 478 | { |
| | 479 | /* the caller's buffer is empty - return end of file */ |
| | 480 | return 0; |
| | 481 | } |
| | 482 | else |
| | 483 | { |
| | 484 | /* null-terminate the caller's buffer */ |
| | 485 | *dst++ = '\0'; |
| | 486 | |
| | 487 | /* |
| | 488 | * return the number of bytes copied, including the null |
| | 489 | * terminator |
| | 490 | */ |
| | 491 | return (dst - buf); |
| | 492 | } |
| | 493 | } |
| | 494 | |
| | 495 | /* start over at the beginning of the buffer */ |
| | 496 | p_ = buf_; |
| | 497 | } |
| | 498 | |
| | 499 | /* |
| | 500 | * Scan the input buffer one character (not byte) at a time. |
| | 501 | * Keep track of how much many bytes we've skipped. Stop when |
| | 502 | * we reach a CR or LF character, or when skipping another |
| | 503 | * character would exceed the remaining capacity of the caller's |
| | 504 | * buffer, or when we run out of data in our input buffer. |
| | 505 | */ |
| | 506 | for (src = p_ ; rem_ > 0 ; ) |
| | 507 | { |
| | 508 | size_t csiz; |
| | 509 | |
| | 510 | /* get the length of the current character */ |
| | 511 | csiz = utf8_ptr::s_charsize(*src); |
| | 512 | |
| | 513 | /* |
| | 514 | * if this character plus a null terminator wouldn't fit in |
| | 515 | * the output buffer, stop scanning |
| | 516 | */ |
| | 517 | if (csiz >= bufl) |
| | 518 | { |
| | 519 | /* |
| | 520 | * There's no more room in the caller's buffer. Copy |
| | 521 | * what we've scanned so far to the output buffer and |
| | 522 | * null-terminate the buffer. |
| | 523 | */ |
| | 524 | memcpy(dst, p_, src - p_); |
| | 525 | |
| | 526 | /* advance past the copied bytes and write the null byte */ |
| | 527 | dst += (src - p_); |
| | 528 | *dst++ = '\0'; |
| | 529 | |
| | 530 | /* advance the buffer read pointer over the copied bytes */ |
| | 531 | p_ = src; |
| | 532 | |
| | 533 | /* return success - indicate the number of bytes copied */ |
| | 534 | return (dst - buf); |
| | 535 | } |
| | 536 | |
| | 537 | /* |
| | 538 | * If it's a newline character of some kind, we're done with |
| | 539 | * this line. Note that we can just check the byte directly, |
| | 540 | * since if it's a multi-byte character, we'll never mistake |
| | 541 | * the first byte for a single-byte newline or carriage return |
| | 542 | * character, since a UTF-8 lead byte always has the high bit |
| | 543 | * set. |
| | 544 | * |
| | 545 | * Also treat the Unicode character 0x2028 (line separator) as |
| | 546 | * a newline. |
| | 547 | */ |
| | 548 | if (*src == '\n' || *src == '\r' |
| | 549 | || utf8_ptr::s_getch(src) == 0x2028) |
| | 550 | { |
| | 551 | char nl; |
| | 552 | |
| | 553 | /* copy what we've scanned so far to the caller's buffer */ |
| | 554 | memcpy(dst, p_, src - p_); |
| | 555 | |
| | 556 | /* advance past the copied bytes */ |
| | 557 | dst += src - p_; |
| | 558 | |
| | 559 | /* |
| | 560 | * add a newline to the caller's buffer -- always add a |
| | 561 | * '\n' newline, regardless of what kind of newline |
| | 562 | * sequence we found in the input; also add a null |
| | 563 | * terminator |
| | 564 | */ |
| | 565 | *dst++ = '\n'; |
| | 566 | *dst++ = '\0'; |
| | 567 | |
| | 568 | /* remember which type of newline we found */ |
| | 569 | nl = *src; |
| | 570 | |
| | 571 | /* advance past the newline */ |
| | 572 | p_ = src + csiz; |
| | 573 | rem_ -= csiz; |
| | 574 | |
| | 575 | /* |
| | 576 | * If the input buffer is empty, read more, so that we |
| | 577 | * can check the next character after the newline |
| | 578 | * character. |
| | 579 | */ |
| | 580 | if (rem_ == 0) |
| | 581 | { |
| | 582 | /* read more data */ |
| | 583 | rem_ = mapper_->read_file(fp_, buf_, sizeof(buf_), 0); |
| | 584 | |
| | 585 | /* start over at the start of the buffer */ |
| | 586 | p_ = buf_; |
| | 587 | } |
| | 588 | |
| | 589 | /* |
| | 590 | * Check for a paired newline character. If we found a |
| | 591 | * CR, check for an LF; if we found an LF, check for a |
| | 592 | * CR. This will ensure that we will recognize |
| | 593 | * essentially any newline character sequence for any |
| | 594 | * platform - this will accept CR, LF, CR-LF, or LF-CR |
| | 595 | * sequences. |
| | 596 | */ |
| | 597 | if (rem_ != 0 |
| | 598 | && ((nl == '\n' && *p_ == '\r') |
| | 599 | || (nl == '\r' && *p_ == '\n'))) |
| | 600 | { |
| | 601 | /* it's a paired newline - skip the second character */ |
| | 602 | ++p_; |
| | 603 | --rem_; |
| | 604 | } |
| | 605 | |
| | 606 | /* we've finished this line - return success */ |
| | 607 | return dst - buf; |
| | 608 | } |
| | 609 | |
| | 610 | /* skip this character in the input and proceed */ |
| | 611 | src += csiz; |
| | 612 | rem_ -= csiz; |
| | 613 | |
| | 614 | /* consider this character consumed in the caller's buffer */ |
| | 615 | bufl -= csiz; |
| | 616 | } |
| | 617 | |
| | 618 | /* |
| | 619 | * We've exhausted the current input buffer, without filling the |
| | 620 | * caller's buffer. Copy what we've skipped so far into the |
| | 621 | * caller's buffer. |
| | 622 | */ |
| | 623 | memcpy(dst, p_, src - p_); |
| | 624 | |
| | 625 | /* |
| | 626 | * Advance the output pointer past the data we just copied, then |
| | 627 | * continue looping to read more data from the input file. |
| | 628 | */ |
| | 629 | dst += src - p_; |
| | 630 | } |
| | 631 | } |
| | 632 | |
| | 633 | /* ------------------------------------------------------------------------ */ |
| | 634 | /* |
| | 635 | * Buffer reader source object |
| | 636 | */ |
| | 637 | |
| | 638 | /* |
| | 639 | * allocate |
| | 640 | */ |
| | 641 | CTcSrcMemory::CTcSrcMemory(const char *buf, CCharmapToUni *mapper) |
| | 642 | { |
| | 643 | size_t len; |
| | 644 | size_t alo_len; |
| | 645 | char *p; |
| | 646 | |
| | 647 | /* get the length of the null-terminated source string */ |
| | 648 | len = strlen(buf); |
| | 649 | |
| | 650 | /* |
| | 651 | * Allocate a buffer for a UTF8-encoded copy of the buffer - |
| | 652 | * allocate three bytes per byte of the original, since this is the |
| | 653 | * worst case for expansion of the encoding. Allocate one extra |
| | 654 | * byte to ensure we have space for a null terminator. |
| | 655 | */ |
| | 656 | alo_len = len*3; |
| | 657 | buf_alo_ = (char *)t3malloc(alo_len + 1); |
| | 658 | |
| | 659 | /* map the buffer */ |
| | 660 | p = buf_alo_; |
| | 661 | mapper->map(&p, &alo_len, buf, len); |
| | 662 | |
| | 663 | /* null-terminate the translated buffer */ |
| | 664 | *p = '\0'; |
| | 665 | |
| | 666 | /* start reading at the start of the translated buffer */ |
| | 667 | buf_ = buf_alo_; |
| | 668 | } |
| | 669 | |
| | 670 | /* |
| | 671 | * delete |
| | 672 | */ |
| | 673 | CTcSrcMemory::~CTcSrcMemory() |
| | 674 | { |
| | 675 | /* free our buffer */ |
| | 676 | t3free(buf_alo_); |
| | 677 | } |
| | 678 | |
| | 679 | /* |
| | 680 | * read next line |
| | 681 | */ |
| | 682 | size_t CTcSrcMemory::read_line(char *buf, size_t bufl) |
| | 683 | { |
| | 684 | char *dst; |
| | 685 | const char *src; |
| | 686 | |
| | 687 | /* if there's nothing left in our buffer, return EOF */ |
| | 688 | if (*buf_ == '\0') |
| | 689 | return 0; |
| | 690 | |
| | 691 | /* start out writing to the start of the caller's buffer */ |
| | 692 | dst = buf; |
| | 693 | |
| | 694 | /* |
| | 695 | * Scan the input buffer one character (not byte) at a time. Keep |
| | 696 | * track of how much many bytes we've skipped. Stop when we reach a |
| | 697 | * CR or LF character, or when skipping another character would |
| | 698 | * exceed the remaining capacity of the caller's buffer, or when we |
| | 699 | * run out of data in our input buffer. |
| | 700 | */ |
| | 701 | for (src = buf_ ; *src != '\0' ; ) |
| | 702 | { |
| | 703 | size_t csiz; |
| | 704 | |
| | 705 | /* get the length of the current character */ |
| | 706 | csiz = utf8_ptr::s_charsize(*src); |
| | 707 | |
| | 708 | /* |
| | 709 | * if this character plus a null terminator wouldn't fit in the |
| | 710 | * output buffer, stop scanning |
| | 711 | */ |
| | 712 | if (csiz >= bufl) |
| | 713 | { |
| | 714 | /* |
| | 715 | * There's no more room in the caller's buffer. Copy what |
| | 716 | * we've scanned so far to the output buffer and |
| | 717 | * null-terminate the buffer. |
| | 718 | */ |
| | 719 | memcpy(dst, buf_, src - buf_); |
| | 720 | |
| | 721 | /* advance past the copied bytes and write the null byte */ |
| | 722 | dst += (src - buf_); |
| | 723 | *dst++ = '\0'; |
| | 724 | |
| | 725 | /* advance the buffer read pointer over the copied bytes */ |
| | 726 | buf_ = src; |
| | 727 | |
| | 728 | /* return success - indicate the number of bytes copied */ |
| | 729 | return (dst - buf); |
| | 730 | } |
| | 731 | |
| | 732 | /* |
| | 733 | * If it's a newline character of some kind, we're done with this |
| | 734 | * line. Note that we can just check the byte directly, since if |
| | 735 | * it's a multi-byte character, we'll never mistake the first byte |
| | 736 | * for a single-byte newline or carriage return character, since a |
| | 737 | * UTF-8 lead byte always has the high bit set. Allow Unicode |
| | 738 | * character 0x2028 (line separator) as a newline as well. |
| | 739 | */ |
| | 740 | if (*src == '\n' || *src == '\r' || utf8_ptr::s_getch(src) == 0x2028) |
| | 741 | { |
| | 742 | char nl; |
| | 743 | |
| | 744 | /* copy what we've scanned so far to the caller's buffer */ |
| | 745 | memcpy(dst, buf_, src - buf_); |
| | 746 | |
| | 747 | /* advance past the copied bytes */ |
| | 748 | dst += src - buf_; |
| | 749 | |
| | 750 | /* |
| | 751 | * add a newline to the caller's buffer -- always add a '\n' |
| | 752 | * newline, regardless of what kind of newline sequence we |
| | 753 | * found in the input; also add a null terminator |
| | 754 | */ |
| | 755 | *dst++ = '\n'; |
| | 756 | *dst++ = '\0'; |
| | 757 | |
| | 758 | /* remember which type of newline we found */ |
| | 759 | nl = *src; |
| | 760 | |
| | 761 | /* advance past the newline */ |
| | 762 | buf_ = src + csiz; |
| | 763 | |
| | 764 | /* |
| | 765 | * Check for a paired newline character. If we found a CR, |
| | 766 | * check for an LF; if we found an LF, check for a CR. This |
| | 767 | * will ensure that we will recognize essentially any |
| | 768 | * newline character sequence for any platform - this will |
| | 769 | * accept CR, LF, CR-LF, or LF-CR sequences. |
| | 770 | */ |
| | 771 | if ((nl == '\n' && *buf_ == '\r') |
| | 772 | || (nl == '\r' && *buf_ == '\n')) |
| | 773 | { |
| | 774 | /* it's a paired newline - skip the second character */ |
| | 775 | ++buf_; |
| | 776 | } |
| | 777 | |
| | 778 | /* we've finished this line - return its length */ |
| | 779 | return dst - buf; |
| | 780 | } |
| | 781 | |
| | 782 | /* skip this character in the input and proceed */ |
| | 783 | src += csiz; |
| | 784 | |
| | 785 | /* consider this space consumed in the caller's buffer */ |
| | 786 | bufl -= csiz; |
| | 787 | } |
| | 788 | |
| | 789 | /* |
| | 790 | * We've exhausted the input buffer, without filling the caller's |
| | 791 | * buffer. Copy what we've skipped so far into the caller's buffer. |
| | 792 | */ |
| | 793 | memcpy(dst, buf_, src - buf_); |
| | 794 | dst += src - buf_; |
| | 795 | |
| | 796 | /* null-terminate the result buffer */ |
| | 797 | *dst++ = '\0'; |
| | 798 | |
| | 799 | /* advance our input pointer to the new (EOF) position */ |
| | 800 | buf_ = src; |
| | 801 | |
| | 802 | /* return the buffer length */ |
| | 803 | return dst - buf; |
| | 804 | } |
| | 805 | |