| | 1 | /* $Header: d:/cvsroot/tads/tads3/TCSRC.H,v 1.3 1999/07/11 00:46:55 MJRoberts Exp $ */ |
| | 2 | |
| | 3 | /* |
| | 4 | * Copyright (c) 1999, 2002 Michael J. Roberts. All Rights Reserved. |
| | 5 | * |
| | 6 | * Please see the accompanying license file, LICENSE.TXT, for information |
| | 7 | * on using and copying this software. |
| | 8 | */ |
| | 9 | /* |
| | 10 | Name |
| | 11 | tcsrc.h - TADS3 compiler source file reader |
| | 12 | Function |
| | 13 | Provides I/O on source files, translating character sets between |
| | 14 | the file's character set and the internal UTF8 representation. |
| | 15 | |
| | 16 | A source file's character set is determined as follows: |
| | 17 | |
| | 18 | - If the first two bytes of the file are 0xFF, 0xFE, the file is taken |
| | 19 | as Unicode UCS-2 encoding, 16 bits per character, with low-order bytes |
| | 20 | stored first in each byte pair making up a character. (The special |
| | 21 | Unicode marker character is 0xFEFF, and the character 0xFFFE is |
| | 22 | specifically illegal. So, if we see FF FE in that order, we know that |
| | 23 | it must be taken to mean 0xFEFF, hence high byte second, because the |
| | 24 | other possibility - high byte first - would yield an invalid character.) |
| | 25 | |
| | 26 | - If the first two bytes of the file are 0xFE, 0xFF, the file is taken |
| | 27 | as Unicode UCS-2 encoding, 16 bits per character, with high-order bytes |
| | 28 | stored first. |
| | 29 | |
| | 30 | - If the first nine bytes of the file contain the ASCII string "#charset" |
| | 31 | followed by an ASCII 0x20 byte, we obtain the name of the character |
| | 32 | set stored in the file by skipping past any following 0x20 or 0x09 |
| | 33 | bytes. The next character must be a double quote (0x22) character. |
| | 34 | We will scan characters up to the next double quote (0x22) character, |
| | 35 | which must occur within 256 bytes or before any CR or LF (0x0D, 0x0A) |
| | 36 | characters. The characters between the quotes will be taken as the |
| | 37 | character set name. |
| | 38 | |
| | 39 | - If none of the above information is found at the beginning of the |
| | 40 | file, we take the file to be in the current global default character |
| | 41 | set. |
| | 42 | Notes |
| | 43 | |
| | 44 | Modified |
| | 45 | 04/12/99 MJRoberts - Creation |
| | 46 | */ |
| | 47 | |
| | 48 | #ifndef TCSRC_H |
| | 49 | #define TCSRC_H |
| | 50 | |
| | 51 | #include <stdlib.h> |
| | 52 | |
| | 53 | |
| | 54 | /* ------------------------------------------------------------------------ */ |
/*
 *   Generic source object.  This is the abstract interface for anything
 *   that can hand source text to its caller one line at a time.
 */
class CTcSrcObject
{
public:
    CTcSrcObject() { }
    virtual ~CTcSrcObject() { }

    /*
     *   Fetch the next line into 'buf', which has room for 'buflen' bytes.
     *   The result is always a null-terminated string.  If the whole line
     *   fits in the buffer, the string ends with a newline; if it doesn't
     *   fit, the string ends without one.  A return value of zero
     *   indicates end of file.
     */
    virtual size_t read_line(char *buf, size_t buflen) = 0;

    /* is the source at end of file? */
    virtual int at_eof() const = 0;
};
| | 74 | |
| | 75 | /* ------------------------------------------------------------------------ */ |
| | 76 | /* |
| | 77 | * Source File base class |
| | 78 | */ |
| | 79 | class CTcSrcFile: public CTcSrcObject |
| | 80 | { |
| | 81 | public: |
| | 82 | /* |
| | 83 | * Create the source file reader. We take ownership of the mapper |
| | 84 | * object, so we'll delete it when we're deleted. |
| | 85 | */ |
| | 86 | CTcSrcFile(osfildef *fp, class CCharmapToUni *mapper) |
| | 87 | { |
| | 88 | /* remember my source file */ |
| | 89 | fp_ = fp; |
| | 90 | |
| | 91 | /* net yet at end of file */ |
| | 92 | at_eof_ = FALSE; |
| | 93 | |
| | 94 | /* there's no data in the buffer yet */ |
| | 95 | rem_ = 0; |
| | 96 | |
| | 97 | /* remember my character mapper */ |
| | 98 | mapper_ = mapper; |
| | 99 | } |
| | 100 | |
| | 101 | virtual ~CTcSrcFile(); |
| | 102 | |
| | 103 | /* |
| | 104 | * Read a line of text from the file. On success, returns the |
| | 105 | * length of the line read, including the null terminator character |
| | 106 | * and any newline character. At end of file or other error, |
| | 107 | * returns zero. Because the result is always null-terminated, a |
| | 108 | * return value of zero will never occur except on error. |
| | 109 | * |
| | 110 | * The result will be a UTF-8 string, and will always be |
| | 111 | * null-terminated. If the source line fits into the buffer, a |
| | 112 | * newline character ('\n') will be the last character of the |
| | 113 | * string. If the line is too long for the buffer, the result will |
| | 114 | * end with something other than a newline character, and the next |
| | 115 | * read_line() call will retrieve the remainder of the line (or as |
| | 116 | * much of the remainder as will fit into the buffer for that call). |
| | 117 | * The one exception is that, if the file does not end in a newline, |
| | 118 | * the last line will be returned without a newline; this condition |
| | 119 | * can be distinguished by the non-zero return value of the |
| | 120 | * subsequent call to read_line(). |
| | 121 | * |
| | 122 | * This routine translates from the source character set to UTF-8, |
| | 123 | * and automatically translates newline conventions. We handle CR, |
| | 124 | * LF, CR-LF, or LF-CR newlines (CR is ASCII 13, LF is ASCII 10). |
| | 125 | */ |
| | 126 | size_t read_line(char *buf, size_t bufl); |
| | 127 | |
| | 128 | /* determine if we've reached end of file */ |
| | 129 | int at_eof() const { return at_eof_; } |
| | 130 | |
| | 131 | /* |
| | 132 | * Open a source file. We'll scan the beginning of the file to |
| | 133 | * determine what type of source file reader to use, then create an |
| | 134 | * appropriate source file reader subclass to read the file. We |
| | 135 | * expect the filename to be limited to ASCII characters. |
| | 136 | * |
| | 137 | * If we can't identify the character set that the file uses, we'll |
| | 138 | * use the given default character set. If no default character set |
| | 139 | * is given, we'll create a plain ASCII reader. |
| | 140 | * |
| | 141 | * If we encounter a #charset directive, and we can't load the |
| | 142 | * desired character set map, we'll set *charset_error to true; |
| | 143 | * otherwise, we'll set *charset_error to false. Note that |
| | 144 | * *charset_error will be set to false if there's simply no #charset |
| | 145 | * directive. |
| | 146 | * |
| | 147 | * If we fail to open the default character set, we'll return null |
| | 148 | * and set *default_charset_error to true. |
| | 149 | */ |
| | 150 | static CTcSrcFile *open_source(const char *filename, |
| | 151 | class CResLoader *res_loader, |
| | 152 | const char *default_charset, |
| | 153 | int *charset_error, |
| | 154 | int *default_charset_error); |
| | 155 | |
| | 156 | /* |
| | 157 | * Open a plain ASCII source file or a Unicode file. This doesn't look |
| | 158 | * for a #charset marker, but it does check for Unicode byte-order |
| | 159 | * markers. If we find a Unicode byte-order marker, we'll read the |
| | 160 | * file using the suitable Unicode mapper; otherwise we'll read it |
| | 161 | * using a plain ASCII mapper. |
| | 162 | */ |
| | 163 | static CTcSrcFile *open_plain(const char *filename); |
| | 164 | |
| | 165 | /* |
| | 166 | * Open a plain ASCII source file. |
| | 167 | */ |
| | 168 | static CTcSrcFile *open_ascii(const char *filename); |
| | 169 | |
| | 170 | protected: |
| | 171 | /* |
| | 172 | * match the leading substring of a unicode utf-16 string to a given |
| | 173 | * ascii string |
| | 174 | */ |
| | 175 | static int ucs_str_starts_with(const char *ustr, size_t ulen, |
| | 176 | const char *astr, |
| | 177 | int big_endian, int case_fold) |
| | 178 | { |
| | 179 | /* compare each character of the unicode string to the ascii string */ |
| | 180 | for ( ; ulen >= 2 && *astr != '\0' ; ustr += 2, ulen -= 2, ++astr) |
| | 181 | { |
| | 182 | /* if the characters don't match, we don't have a match */ |
| | 183 | if (!ucs_char_eq(ustr, *astr, big_endian, case_fold)) |
| | 184 | return FALSE; |
| | 185 | } |
| | 186 | |
| | 187 | /* |
| | 188 | * if we reached the end of the ASCII string, we have a match; |
| | 189 | * otherwise, we ran out of the Unicode string before we ran out of |
| | 190 | * the ASCII string, so we don't have a match |
| | 191 | */ |
| | 192 | return (*astr == 0); |
| | 193 | } |
| | 194 | |
| | 195 | /* does a Unicode character match an ASCII character? */ |
| | 196 | static int ucs_char_eq(const char *ustr, char ac, int big_endian, |
| | 197 | int case_fold) |
| | 198 | { |
| | 199 | uchar lo, hi; |
| | 200 | uint uc; |
| | 201 | |
| | 202 | /* get this unicode character, translating its endianness */ |
| | 203 | if (big_endian) |
| | 204 | hi = (uchar)*ustr, lo = (uchar)*(ustr + 1); |
| | 205 | else |
| | 206 | lo = (uchar)*ustr, hi = (uchar)*(ustr + 1); |
| | 207 | uc = (hi << 8) + lo; |
| | 208 | |
| | 209 | /* if it's outside of ASCII range, we obviously can't match */ |
| | 210 | if (uc > 127) |
| | 211 | return FALSE; |
| | 212 | |
| | 213 | /* if we're folding case, convert both to lower case */ |
| | 214 | if (case_fold) |
| | 215 | ac = (char)tolower(ac), uc = tolower((char)uc); |
| | 216 | |
| | 217 | /* compare the characters */ |
| | 218 | return (ac == (char)uc); |
| | 219 | } |
| | 220 | |
| | 221 | /* end-of-file flag */ |
| | 222 | unsigned int at_eof_ : 1; |
| | 223 | |
| | 224 | /* my source file */ |
| | 225 | osfildef *fp_; |
| | 226 | |
| | 227 | /* read buffer */ |
| | 228 | char buf_[1024]; |
| | 229 | |
| | 230 | /* amount of data in the buffer */ |
| | 231 | size_t rem_; |
| | 232 | |
| | 233 | /* current position in buffer */ |
| | 234 | char *p_; |
| | 235 | |
| | 236 | /* my character mapper */ |
| | 237 | class CCharmapToUni *mapper_; |
| | 238 | }; |
| | 239 | |
| | 240 | /* ------------------------------------------------------------------------ */ |
| | 241 | /* |
| | 242 | * Memory buffer-based source reader |
| | 243 | */ |
| | 244 | class CTcSrcMemory: public CTcSrcObject |
| | 245 | { |
| | 246 | public: |
| | 247 | CTcSrcMemory(const char *buf, class CCharmapToUni *mapper); |
| | 248 | ~CTcSrcMemory(); |
| | 249 | |
| | 250 | /* read the next line */ |
| | 251 | size_t read_line(char *buf, size_t bufl); |
| | 252 | |
| | 253 | /* determine if we've reached end of file */ |
| | 254 | int at_eof() const { return (*buf_ == '\0'); } |
| | 255 | |
| | 256 | private: |
| | 257 | /* allocated buffer */ |
| | 258 | char *buf_alo_; |
| | 259 | |
| | 260 | /* current buffer pointer */ |
| | 261 | const char *buf_; |
| | 262 | }; |
| | 263 | |
| | 264 | #endif /* TCSRC_H */ |
| | 265 | |