cfad47cfa3/t3compiler/tads3/tcsrc.h

4b825dc642cb6eb9a060e54bf8d69288fbee4904cfad47cfa334b206c65f22086bcc5d63e6f70944
1
/* $Header: d:/cvsroot/tads/tads3/TCSRC.H,v 1.3 1999/07/11 00:46:55 MJRoberts Exp $ */
2
3
/* 
4
 *   Copyright (c) 1999, 2002 Michael J. Roberts.  All Rights Reserved.
5
 *   
6
 *   Please see the accompanying license file, LICENSE.TXT, for information
7
 *   on using and copying this software.  
8
 */
9
/*
10
Name
11
  tcsrc.h - TADS3 compiler source file reader
12
Function
13
  Provides I/O on source files, translating character sets between
14
  the file's character set and the internal UTF8 representation.
15
16
  A source file's character set is determined as follows:
17
18
  - If the first two bytes of the file are 0xFF, 0xFE, the file is taken
19
  as Unicode UCS-2 encoding, 16 bits per character, with low-order bytes
20
  stored first in each byte pair making up a character.  (The special
21
  Unicode marker character is 0xFEFF, and the character 0xFFFE is
22
  specifically illegal.  So, if we see FF FE in that order, we know that
23
  it must be taken to mean 0xFEFF, hence high byte second, because the
24
  other possibility - high byte first - would yield an invalid character.)
25
26
  - If the first two bytes of the file are 0xFE, 0xFF, the file is taken
27
  as Unicode UCS-2 encoding, 16 bits per character, with high-order bytes
28
  stored first.
29
30
  - If the first nine bytes of the file contain the ASCII string "#charset"
31
  followed by an ASCII 0x20 byte, we obtain the name of the character
32
  set stored in the file by skipping past any following 0x20 or 0x09
33
  bytes.  The next character must be a double quote (0x22) character.
34
  We will scan characters up to the next double quote (0x22) character,
35
  which must occur within 256 bytes or before any CR or LF (0x0D, 0x0A)
36
  characters.  The characters between the quotes will be taken as the
37
  character set name.
38
39
  - If none of the above information is found at the beginning of the
40
  file, we take the file to be in the current global default character
41
  set.
42
Notes
43
  
44
Modified
45
  04/12/99 MJRoberts  - Creation
46
*/
47
48
#ifndef TCSRC_H
49
#define TCSRC_H
50
51
#include <stdlib.h>
52
53
54
/* ------------------------------------------------------------------------ */
55
/*
56
 *   Generic source object 
57
 */
58
class CTcSrcObject
{
public:
    /* nothing to set up or tear down at this level */
    CTcSrcObject() {}
    virtual ~CTcSrcObject() {}

    /*
     *   Fetch the next line of source text into 'buf', which has room for
     *   'buflen' bytes.  The result is always null-terminated.  When the
     *   whole line fits, the text ends with a newline; when it doesn't
     *   fit, the text ends without one.  A return value of zero means
     *   that we've reached the end of the source.
     */
    virtual size_t read_line(char *buf, size_t buflen) = 0;

    /* is the source exhausted?  (non-zero means end of file) */
    virtual int at_eof() const = 0;
};
74
75
/* ------------------------------------------------------------------------ */
76
/*
77
 *   Source File base class
78
 */
79
class CTcSrcFile: public CTcSrcObject
80
{
81
public:
82
    /*
83
     *   Create the source file reader.  We take ownership of the mapper
84
     *   object, so we'll delete it when we're deleted. 
85
     */
86
    CTcSrcFile(osfildef *fp, class CCharmapToUni *mapper)
87
    {
88
        /* remember my source file */
89
        fp_ = fp;
90
91
        /* net yet at end of file */
92
        at_eof_ = FALSE;
93
94
        /* there's no data in the buffer yet */
95
        rem_ = 0;
96
97
        /* remember my character mapper */
98
        mapper_ = mapper;
99
    }
100
    
101
    virtual ~CTcSrcFile();
102
103
    /* 
104
     *   Read a line of text from the file.  On success, returns the
105
     *   length of the line read, including the null terminator character
106
     *   and any newline character.  At end of file or other error,
107
     *   returns zero.  Because the result is always null-terminated, a
108
     *   return value of zero will never occur except on error.
109
     *   
110
     *   The result will be a UTF-8 string, and will always be
111
     *   null-terminated.  If the source line fits into the buffer, a
112
     *   newline character ('\n') will be the last character of the
113
     *   string.  If the line is too long for the buffer, the result will
114
     *   end with something other than a newline character, and the next
115
     *   read_line() call will retrieve the remainder of the line (or as
116
     *   much of the remainder as will fit into the buffer for that call).
117
     *   The one exception is that, if the file does not end in a newline,
118
     *   the last line will be returned without a newline; this condition
119
     *   can be distinguished by the non-zero return value of the
120
     *   subsequent call to read_line().
121
     *   
122
     *   This routine translates from the source character set to UTF-8,
123
     *   and automatically translates newline conventions.  We handle CR,
124
     *   LF, CR-LF, or LF-CR newlines (CR is ASCII 13, LF is ASCII 10).  
125
     */
126
    size_t read_line(char *buf, size_t bufl);
127
128
    /* determine if we've reached end of file */
129
    int at_eof() const { return at_eof_; }
130
131
    /*
132
     *   Open a source file.  We'll scan the beginning of the file to
133
     *   determine what type of source file reader to use, then create an
134
     *   appropriate source file reader subclass to read the file.  We
135
     *   expect the filename to be limited to ASCII characters.
136
     *   
137
     *   If we can't identify the character set that the file uses, we'll
138
     *   use the given default character set.  If no default character set
139
     *   is given, we'll create a plain ASCII reader.
140
     *   
141
     *   If we encounter a #charset directive, and we can't load the
142
     *   desired character set map, we'll set *charset_error to true;
143
     *   otherwise, we'll set *charset_error to false.  Note that
144
     *   *charset_error will be set to false if there's simply no #charset
145
     *   directive.
146
     *   
147
     *   If we fail to open the default character set, we'll return null
148
     *   and set *default_charset_error to true.  
149
     */
150
    static CTcSrcFile *open_source(const char *filename,
151
                                   class CResLoader *res_loader,
152
                                   const char *default_charset,
153
                                   int *charset_error,
154
                                   int *default_charset_error);
155
156
    /*
157
     *   Open a plain ASCII source file or a Unicode file.  This doesn't look
158
     *   for a #charset marker, but it does check for Unicode byte-order
159
     *   markers.  If we find a Unicode byte-order marker, we'll read the
160
     *   file using the suitable Unicode mapper; otherwise we'll read it
161
     *   using a plain ASCII mapper.  
162
     */
163
    static CTcSrcFile *open_plain(const char *filename);
164
165
    /*
166
     *   Open a plain ASCII source file. 
167
     */
168
    static CTcSrcFile *open_ascii(const char *filename);
169
170
protected:
171
    /* 
172
     *   match the leading substring of a unicode utf-16 string to a given
173
     *   ascii string 
174
     */
175
    static int ucs_str_starts_with(const char *ustr, size_t ulen,
176
                                   const char *astr,
177
                                   int big_endian, int case_fold)
178
    {
179
        /* compare each character of the unicode string to the ascii string */
180
        for ( ; ulen >= 2 && *astr != '\0' ; ustr += 2, ulen -= 2, ++astr)
181
        {
182
            /* if the characters don't match, we don't have a match */
183
            if (!ucs_char_eq(ustr, *astr, big_endian, case_fold))
184
                return FALSE;
185
        }
186
187
        /* 
188
         *   if we reached the end of the ASCII string, we have a match;
189
         *   otherwise, we ran out of the Unicode string before we ran out of
190
         *   the ASCII string, so we don't have a match 
191
         */
192
        return (*astr == 0);
193
    }
194
195
    /* does a Unicode character match an ASCII character? */
196
    static int ucs_char_eq(const char *ustr, char ac, int big_endian,
197
                           int case_fold)
198
    {
199
        uchar lo, hi;
200
        uint uc;
201
202
        /* get this unicode character, translating its endianness */
203
        if (big_endian)
204
            hi = (uchar)*ustr, lo = (uchar)*(ustr + 1);
205
        else
206
            lo = (uchar)*ustr, hi = (uchar)*(ustr + 1);
207
        uc = (hi << 8) + lo;
208
209
        /* if it's outside of ASCII range, we obviously can't match */
210
        if (uc > 127)
211
            return FALSE;
212
213
        /* if we're folding case, convert both to lower case */
214
        if (case_fold)
215
            ac = (char)tolower(ac), uc = tolower((char)uc);
216
217
        /* compare the characters */
218
        return (ac == (char)uc);
219
    }
220
221
    /* end-of-file flag */
222
    unsigned int at_eof_ : 1;
223
    
224
    /* my source file */
225
    osfildef *fp_;
226
227
    /* read buffer */
228
    char buf_[1024];
229
230
    /* amount of data in the buffer */
231
    size_t rem_;
232
233
    /* current position in buffer */
234
    char *p_;
235
236
    /* my character mapper */
237
    class CCharmapToUni *mapper_;
238
};
239
240
/* ------------------------------------------------------------------------ */
241
/*
242
 *   Memory buffer-based source reader 
243
 */
244
class CTcSrcMemory: public CTcSrcObject
245
{
246
public:
247
    CTcSrcMemory(const char *buf, class CCharmapToUni *mapper);
248
    ~CTcSrcMemory();
249
250
    /* read the next line */
251
    size_t read_line(char *buf, size_t bufl);
252
253
    /* determine if we've reached end of file */
254
    int at_eof() const { return (*buf_ == '\0'); }
255
256
private:
257
    /* allocated buffer */
258
    char *buf_alo_;
259
    
260
    /* current buffer pointer */
261
    const char *buf_;
262
};
263
264
#endif /* TCSRC_H */
265