cfad47cfa3/t3compiler/tads3/tcsrc.cpp
Commiter: Nikos Chantziaras
Author: Nikos Chantziaras
Revision: cfad47cfa3
File Size: 24.8 KB
(June 01, 2009 20:54 UTC) Almost 3 years ago
Initial commit.
Showing without highlighting since it looks like a big file and may slow your browser - show with highlighting
Show/hide line numbers
#ifdef RCSID
static char RCSid[] =
"$Header: d:/cvsroot/tads/tads3/TCSRC.CPP,v 1.3 1999/07/11 00:46:55 MJRoberts Exp $";
#endif
/*
* Copyright (c) 1999, 2002 Michael J. Roberts. All Rights Reserved.
*
* Please see the accompanying license file, LICENSE.TXT, for information
* on using and copying this software.
*/
/*
Name
tcsrc.cpp - source file reader
Function
Notes
Modified
04/13/99 MJRoberts - Creation
*/
#include <string.h>
#include <stdlib.h>
#include "os.h"
#include "t3std.h"
#include "tcsrc.h"
#include "tcglob.h"
#include "charmap.h"
/* ------------------------------------------------------------------------ */
/*
 *   Destructor.  Lets go of the two things the reader holds on to: the
 *   open source file handle and a reference on the character mapper.
 *   Either one is simply skipped if it is null.
 */
CTcSrcFile::~CTcSrcFile()
{
    /* shut the source file, if we have one */
    if (fp_)
    {
        osfcls(fp_);
    }

    /* give up our reference on the character mapper, if we have one */
    if (mapper_)
    {
        mapper_->release_ref();
    }
}
#if 0
// we don't currently need this, but keep the source in case it
// becomes interesting later
//
/* ------------------------------------------------------------------------ */
/*
* Open a plain ASCII file, with no #charset marker.
*
* Despite the name, the start of the file is still checked for a
* UTF-8 or UCS-2 byte-order marker; if one is found, the marker is
* skipped and the reader is created with the matching Unicode mapper.
* Only when no marker is present is the plain ASCII mapper used.
*
* Returns the new reader, or 0 if the file couldn't be opened.
*/
CTcSrcFile *CTcSrcFile::open_plain(const char *filename)
{
osfildef *fp;
char buf[5];
size_t siz;
/*
* open the file in binary mode, since we do all of the newline
* interpretation explicitly
*/
if ((fp = osfoprb(filename, OSFTTEXT)) == 0)
return 0;
/*
* read the first few bytes of the file - 'siz' is the number of bytes
* actually read, which can be less than the buffer size for a very
* short file
*/
siz = osfrbc(fp, buf, sizeof(buf));
/* check for a 3-byte UTF-8 marker */
if (siz >= 3
&& (uchar)buf[0] == 0xEF
&& (uchar)buf[1] == 0xBB
&& (uchar)buf[2] == 0xBF)
{
/*
* seek to the byte after the marker, so that our caller won't see
* the marker
*/
osfseek(fp, 3, OSFSK_SET);
/* return a source file reader with a utf-8 mapper */
return new CTcSrcFile(fp, new CCharmapToUniUTF8());
}
/* if we read at least two bytes, try auto-detecting UCS-2 */
if (siz >= 2)
{
/* if the first bytes are 0xFF 0xFE, it's UCS-2 low-byte first */
if ((unsigned char)buf[0] == 0xFF && (unsigned char)buf[1] == 0xFE)
{
/* seek to the byte after the marker */
osfseek(fp, 2, OSFSK_SET);
/* return a reader with a little-endian mapper */
return new CTcSrcFile(fp, new CCharmapToUniUcs2Little());
}
/* if the first bytes are 0xFE 0xFF, it's UCS-2 high-byte first */
if ((unsigned char)buf[0] == 0xFE && (unsigned char)buf[1] == 0xFF)
{
/* seek to the byte after the marker */
osfseek(fp, 2, OSFSK_SET);
/* return a reader with a big-endian mapper */
return new CTcSrcFile(fp, new CCharmapToUniUcs2Big());
}
}
/*
* there are no Unicode markers, so our only remaining option is plain
* ASCII - return a source file object with a plain ASCII mapper
*
* NOTE(review): the read position is not moved back to the start of
* the file on this path, even though up to sizeof(buf) bytes have been
* read above - confirm before re-enabling this code
*/
return new CTcSrcFile(fp, new CCharmapToUniASCII());
}
#endif
/* ------------------------------------------------------------------------ */
/*
 *   Open a source file that is to be read as plain ASCII.  No attempt is
 *   made to detect a character set from the file's contents.
 *
 *   Returns the new reader, or 0 if the file could not be opened.
 */
CTcSrcFile *CTcSrcFile::open_ascii(const char *filename)
{
    /*
     *   Binary mode is used because the reader performs all newline
     *   interpretation itself.
     */
    osfildef *const handle = osfoprb(filename, OSFTTEXT);

    /* without a file there is nothing to build a reader on */
    if (handle == 0)
        return 0;

    /* wrap the file in a reader that uses the plain ASCII mapper */
    return new CTcSrcFile(handle, new CCharmapToUniASCII());
}
/* ------------------------------------------------------------------------ */
/*
 *   Open a source file, working out its character set from the file
 *   itself where possible.
 *
 *   The first bytes of the file are examined, in this order, for:
 *
 *   - a UTF-8 byte-order marker (EF BB BF), optionally followed by a
 *   redundant '#charset "utf-8"' or '#charset "utf8"' directive;
 *
 *   - a UCS-2 byte-order marker (FF FE or FE FF), optionally followed by
 *   a '#charset' directive, itself encoded in UCS-2, that names the
 *   matching encoding;
 *
 *   - a single-byte '#charset "name"' directive, whose name is handed to
 *   CCharmapToUni::load().
 *
 *   If none of these produces a mapper, the file is read from its
 *   starting position with the mapper for 'default_charset', or with the
 *   plain ASCII mapper when 'default_charset' is null.
 *
 *   filename - name of the file to open
 *   res_loader - passed through to CCharmapToUni::load()
 *   default_charset - name of the fallback character set; may be null
 *   charset_error - receives TRUE if a single-byte #charset directive
 *   named a character set that could not be loaded, FALSE otherwise
 *   default_charset_error - receives TRUE if the default character set
 *   could not be loaded, FALSE otherwise
 *
 *   Returns the new reader, or 0 if the file could not be opened or no
 *   mapper could be created for the default character set.
 */
CTcSrcFile *CTcSrcFile::open_source(const char *filename,
                                    class CResLoader *res_loader,
                                    const char *default_charset,
                                    int *charset_error,
                                    int *default_charset_error)
{
    /* start with both error indications cleared */
    *charset_error = FALSE;
    *default_charset_error = FALSE;

    /*
     *   Open in binary mode, so that the leading bytes can be inspected
     *   exactly as they appear in the file.
     */
    osfildef *const fp = osfoprb(filename, OSFTTEXT);
    if (fp == 0)
        return 0;

    /* remember where the file starts, then sample its first bytes */
    const long startofs = osfpos(fp);
    char head[275];
    const size_t head_len = osfrbc(fp, head, sizeof(head));

    /*
     *   Case 1: UTF-8 byte-order marker.
     */
    if (head_len >= 3
        && (uchar)head[0] == 0xEF
        && (uchar)head[1] == 0xBB
        && (uchar)head[2] == 0xBF)
    {
        /* the marker itself is always skipped */
        uint skip = 3;

        /*
         *   A '#charset' directive naming UTF-8 is redundant here, but it
         *   is tolerated and skipped along with the marker.
         */
        char *p = head + 3;
        size_t rem = head_len - 3;
        if (rem > 9 && memcmp(p, "#charset ", 9) == 0)
        {
            /* step over the directive keyword and any further blanks */
            p += 9;
            rem -= 9;
            while (rem != 0 && (*p == ' ' || *p == '\t'))
            {
                ++p;
                --rem;
            }

            /* accept either spelling of the character set name */
            if (rem >= 7 && memicmp(p, "\"utf-8\"", 7) == 0)
                skip = (p + 7) - head;
            else if (rem >= 6 && memicmp(p, "\"utf8\"", 6) == 0)
                skip = (p + 6) - head;
        }

        /* position the file just after everything we recognized */
        osfseek(fp, startofs + skip, OSFSK_SET);

        /* read the rest of the file as UTF-8 */
        return new CTcSrcFile(fp, new CCharmapToUniUTF8());
    }

    /*
     *   Case 2: UCS-2 byte-order marker, in either byte order.
     */
    if (head_len >= 2)
    {
        /* directive names accepted for each byte order */
        static const char *const le_names[] = { "unicodel", "utf-16le", 0 };
        static const char *const be_names[] = { "unicodeb", "utf-16be", 0 };

        CTcSrcFile *srcf = 0;
        const char *const *cs_names = 0;
        int bige = FALSE;

        if ((unsigned char)head[0] == 0xFF && (unsigned char)head[1] == 0xFE)
        {
            /* FF FE: low-order byte first, so use the little-endian reader */
            srcf = new CTcSrcFile(fp, new CCharmapToUniUcs2Little());
            cs_names = le_names;
            bige = FALSE;
        }
        else if ((unsigned char)head[0] == 0xFE
                 && (unsigned char)head[1] == 0xFF)
        {
            /* FE FF: high-order byte first, so use the big-endian reader */
            srcf = new CTcSrcFile(fp, new CCharmapToUniUcs2Big());
            cs_names = be_names;
            bige = TRUE;
        }

        if (srcf != 0)
        {
            /* the two-byte marker is always skipped */
            uint skip = 2;

            /* look for a '#charset' directive encoded in UCS-2 */
            if (ucs_str_starts_with(head + 2, head_len - 2, "#charset ",
                                    bige, FALSE))
            {
                /* "#charset " is nine characters, hence 18 bytes */
                char *p = head + 2 + 18;
                size_t rem = head_len - 2 - 18;

                /* step over any further blanks, two bytes at a time */
                while (rem >= 2
                       && (ucs_char_eq(p, ' ', bige, FALSE)
                           || ucs_char_eq(p, '\t', bige, FALSE)))
                {
                    p += 2;
                    rem -= 2;
                }

                /* the name has to be enclosed in double quotes */
                if (rem >= 2 && ucs_char_eq(p, '"', bige, FALSE))
                {
                    /* step over the open quote */
                    p += 2;
                    rem -= 2;

                    /* try each name that is valid for this byte order */
                    for (const char *const *n = cs_names ; *n != 0 ; ++n)
                    {
                        /* not this name - try the next one */
                        if (!ucs_str_starts_with(p, rem, *n, bige, TRUE))
                            continue;

                        /* size of the name in its two-byte encoding */
                        const size_t l = strlen(*n) * 2;

                        /*
                         *   The directive only counts if the close quote
                         *   follows the name directly; if so, skip the
                         *   name and the quote as well.
                         */
                        if (rem >= l + 2
                            && ucs_char_eq(p + l, '"', bige, FALSE))
                        {
                            skip = (p + l + 2) - head;
                            break;
                        }
                    }
                }
            }

            /* position the file just after everything we recognized */
            osfseek(fp, startofs + skip, OSFSK_SET);

            /* hand back the reader created above */
            return srcf;
        }
    }

    /*
     *   Case 3: no byte-order marker.  Look for a '#charset "name"'
     *   directive written in single-byte characters.
     */
    if (head_len >= 9 && memcmp(head, "#charset ", 9) == 0)
    {
        /* step over the directive keyword and any further blanks */
        char *p = head + 9;
        size_t rem = head_len - 9;
        while (rem > 0 && (*p == ' ' || *p == '\t'))
        {
            ++p;
            --rem;
        }

        /* the name has to start with a double quote */
        if (rem != 0 && *p == '"')
        {
            /* the name begins just after the open quote */
            ++p;
            --rem;
            char *const charset_name = p;

            /* the close quote must come before any CR or LF */
            while (rem > 0 && *p != '"' && *p != 10 && *p != 13)
            {
                ++p;
                --rem;
            }

            if (rem != 0 && *p == '"')
            {
                /* position the file just after the close quote */
                osfseek(fp, startofs + (p - head) + 1, OSFSK_SET);

                /* turn the name into a null-terminated string in place */
                *p = '\0';

                /* try to load the mapper the directive asks for */
                CCharmapToUni *const named =
                    CCharmapToUni::load(res_loader, charset_name);
                if (named != 0)
                    return new CTcSrcFile(fp, named);

                /*
                 *   The named character set isn't available.  Report
                 *   that, and carry on as though the directive had not
                 *   been there, so that the default mapper is used.
                 */
                *charset_error = TRUE;
            }
        }
    }

    /*
     *   Nothing in the file settled the character set, so go back to
     *   where the file started and fall back on the default.
     */
    osfseek(fp, startofs, OSFSK_SET);

    /* with no default character set either, plain ASCII is used */
    CCharmapToUni *mapper;
    if (default_charset != 0)
        mapper = CCharmapToUni::load(res_loader, default_charset);
    else
        mapper = new CCharmapToUniASCII();

    /* if no mapper could be created, report it and give up on the file */
    if (mapper == 0)
    {
        *default_charset_error = TRUE;
        osfcls(fp);
        return 0;
    }

    /* build the reader on the fallback mapper */
    return new CTcSrcFile(fp, mapper);
}
/* ------------------------------------------------------------------------ */
/*
* Read a line of text from the file.
*
* Copies the next line of the mapped source text into 'buf', which has
* room for 'bufl' bytes. A line ends at a CR, an LF, a CR-LF or LF-CR
* pair, or the Unicode line separator (U+2028); whichever of these was
* found is stored in 'buf' as a single '\n'. If the line doesn't fit,
* as many whole characters as fit are returned, without a trailing
* '\n', and the rest of the line is left for the next call.
*
* Returns the number of bytes stored in 'buf', including the null
* terminator, or 0 if there's nothing more to read.
*
* NOTE(review): if bufl is 0 while input is still available, the
* capacity check below stores a null terminator in buf[0] - presumably
* callers always pass a non-empty buffer; confirm.
*/
size_t CTcSrcFile::read_line(char *buf, size_t bufl)
{
char *dst;
/* start out writing to the start of the caller's buffer */
dst = buf;
/*
* Keep going until we run out of input file, fill up the buffer, or
* reach the end of a line
*/
for (;;)
{
char *src;
/* read some more data if our buffer is empty */
if (rem_ == 0)
{
/* load another buffer-full */
rem_ = mapper_->read_file(fp_, buf_, sizeof(buf_), 0);
/*
* If we didn't read anything, we've reached the end of the
* file. If we've already copied anything into the caller's
* buffer, null-terminate their buffer and return success;
* otherwise, return failure, since the caller has already
* read everything available from the file.
*/
if (rem_ == 0)
{
/*
* Remember that we've reached the end of the file.
* We're about to return the last of the data, so the
* caller will not need to call us again (although it's
* legal if they do - we'll just return a zero length on
* the next call).
*/
at_eof_ = TRUE;
/* check if we've copied anything to the caller's buffer */
if (buf == dst)
{
/* the caller's buffer is empty - return end of file */
return 0;
}
else
{
/*
* null-terminate the caller's buffer - there's room for
* this because the scan below never consumes the last
* byte of the caller's buffer
*/
*dst++ = '\0';
/*
* return the number of bytes copied, including the null
* terminator
*/
return (dst - buf);
}
}
/* start over at the beginning of the buffer */
p_ = buf_;
}
/*
* Scan the input buffer one character (not byte) at a time.
* Keep track of how many bytes we've skipped. Stop when
* we reach a CR or LF character, or when skipping another
* character would exceed the remaining capacity of the caller's
* buffer, or when we run out of data in our input buffer.
*
* During the scan, 'src' runs ahead of p_; the bytes between p_
* and 'src' have been counted against rem_ and bufl but haven't
* been copied to the caller's buffer yet.
*
* NOTE(review): rem_ is reduced by a whole character at a time,
* so this assumes the mapper never leaves a partial character at
* the end of buf_ - TODO confirm against the mapper's read_file.
*/
for (src = p_ ; rem_ > 0 ; )
{
size_t csiz;
/* get the length of the current character */
csiz = utf8_ptr::s_charsize(*src);
/*
* if this character plus a null terminator wouldn't fit in
* the output buffer, stop scanning
*/
if (csiz >= bufl)
{
/*
* There's no more room in the caller's buffer. Copy
* what we've scanned so far to the output buffer and
* null-terminate the buffer.
*/
memcpy(dst, p_, src - p_);
/* advance past the copied bytes and write the null byte */
dst += (src - p_);
*dst++ = '\0';
/*
* advance the buffer read pointer over the copied bytes, so
* that the next call resumes with the character that
* didn't fit
*/
p_ = src;
/* return success - indicate the number of bytes copied */
return (dst - buf);
}
/*
* If it's a newline character of some kind, we're done with
* this line. Note that we can just check the byte directly,
* since if it's a multi-byte character, we'll never mistake
* the first byte for a single-byte newline or carriage return
* character, since a UTF-8 lead byte always has the high bit
* set.
*
* Also treat the Unicode character 0x2028 (line separator) as
* a newline.
*/
if (*src == '\n' || *src == '\r'
|| utf8_ptr::s_getch(src) == 0x2028)
{
char nl;
/* copy what we've scanned so far to the caller's buffer */
memcpy(dst, p_, src - p_);
/* advance past the copied bytes */
dst += src - p_;
/*
* add a newline to the caller's buffer -- always add a
* '\n' newline, regardless of what kind of newline
* sequence we found in the input; also add a null
* terminator
*/
*dst++ = '\n';
*dst++ = '\0';
/*
* remember which type of newline we found - for the
* multi-byte line separator this is just its first byte,
* which matches neither CR nor LF in the pairing test below
*/
nl = *src;
/* advance past the newline */
p_ = src + csiz;
rem_ -= csiz;
/*
* If the input buffer is empty, read more, so that we
* can check the next character after the newline
* character.
*/
if (rem_ == 0)
{
/* read more data */
rem_ = mapper_->read_file(fp_, buf_, sizeof(buf_), 0);
/* start over at the start of the buffer */
p_ = buf_;
}
/*
* Check for a paired newline character. If we found a
* CR, check for an LF; if we found an LF, check for a
* CR. This will ensure that we will recognize
* essentially any newline character sequence for any
* platform - this will accept CR, LF, CR-LF, or LF-CR
* sequences.
*/
if (rem_ != 0
&& ((nl == '\n' && *p_ == '\r')
|| (nl == '\r' && *p_ == '\n')))
{
/* it's a paired newline - skip the second character */
++p_;
--rem_;
}
/* we've finished this line - return success */
return dst - buf;
}
/* skip this character in the input and proceed */
src += csiz;
rem_ -= csiz;
/* consider this character consumed in the caller's buffer */
bufl -= csiz;
}
/*
* We've exhausted the current input buffer, without filling the
* caller's buffer. Copy what we've skipped so far into the
* caller's buffer.
*/
memcpy(dst, p_, src - p_);
/*
* Advance the output pointer past the data we just copied, then
* continue looping to read more data from the input file. (p_
* isn't updated here; rem_ is zero at this point, so the top of
* the loop resets p_ when it refills the buffer.)
*/
dst += src - p_;
}
}
/* ------------------------------------------------------------------------ */
/*
* Buffer reader source object
*/
/*
 *   Construct a reader over an in-memory, null-terminated source string.
 *   The text in 'buf' is run through 'mapper' into a private buffer that
 *   the reader owns; lines are then read from that translated copy.
 */
CTcSrcMemory::CTcSrcMemory(const char *buf, CCharmapToUni *mapper)
{
    /* size of the caller's text, not counting its null terminator */
    const size_t src_len = strlen(buf);

    /*
     *   Size the private buffer for the worst case, in which every source
     *   byte expands to three bytes in the translated encoding.  One more
     *   byte is allocated on top of that for the null terminator.
     */
    size_t out_cap = src_len * 3;
    buf_alo_ = (char *)t3malloc(out_cap + 1);

    /* translate the text; the mapper advances 'out' as it writes */
    char *out = buf_alo_;
    mapper->map(&out, &out_cap, buf, src_len);

    /* terminate the translated text where the mapper stopped */
    *out = '\0';

    /* reading begins at the start of the translated text */
    buf_ = buf_alo_;
}
/*
* delete - releases the translated copy of the source text that the
* constructor allocated
*/
CTcSrcMemory::~CTcSrcMemory()
{
/*
* free our buffer - buf_alo_ is the start of the allocation; buf_ is
* only the read position within it, so it isn't freed separately
*/
t3free(buf_alo_);
}
/*
 *   Read the next line from the in-memory source text.
 *
 *   Copies the next line into 'buf', which has room for 'bufl' bytes.  A
 *   line ends at a CR, an LF, a CR-LF or LF-CR pair, or the Unicode line
 *   separator (U+2028); whichever was found is stored in 'buf' as a
 *   single '\n'.  If the line doesn't fit, as many whole characters as
 *   fit are returned, without a trailing '\n', and the remainder is left
 *   for the next call.  The last line of the text is returned without a
 *   '\n' if the text doesn't end in a newline.
 *
 *   Returns the number of bytes stored in 'buf', including the null
 *   terminator, or 0 once the text is used up.
 */
size_t CTcSrcMemory::read_line(char *buf, size_t bufl)
{
    /* once the read position sits on the terminator, we're at EOF */
    if (*buf_ == '\0')
        return 0;

    /* where the next output byte goes */
    char *out = buf;

    /*
     *   Walk the text a whole character at a time.  'scan' runs ahead of
     *   buf_; the bytes between the two have been counted against 'bufl'
     *   but are only copied out once we know where the line stops.
     */
    const char *scan = buf_;
    while (*scan != '\0')
    {
        /* byte length of the character we're looking at */
        const size_t clen = utf8_ptr::s_charsize(*scan);

        /*
         *   Stop if this character and a null terminator can't both fit
         *   in what remains of the caller's buffer.
         */
        if (clen >= bufl)
        {
            /* hand over what has been scanned, then terminate it */
            const size_t pending = scan - buf_;
            memcpy(out, buf_, pending);
            out += pending;
            *out++ = '\0';

            /* the next call resumes with the character that didn't fit */
            buf_ = scan;
            return out - buf;
        }

        /*
         *   CR and LF can be tested as plain bytes: the lead byte of a
         *   multi-byte UTF-8 character always has its high bit set, so it
         *   can't be mistaken for either.  The line separator U+2028 is
         *   accepted as a newline too.
         */
        if (*scan == '\n' || *scan == '\r'
            || utf8_ptr::s_getch(scan) == 0x2028)
        {
            /* hand over the text of the line */
            const size_t pending = scan - buf_;
            memcpy(out, buf_, pending);
            out += pending;

            /* every kind of newline is delivered as '\n' */
            *out++ = '\n';
            *out++ = '\0';

            /* note what ended the line, and move the read position on */
            const char first = *scan;
            buf_ = scan + clen;

            /*
             *   A CR directly followed by LF, or an LF directly followed
             *   by CR, is one line ending; swallow the second character.
             */
            const int cr_lf = (first == '\r' && *buf_ == '\n');
            const int lf_cr = (first == '\n' && *buf_ == '\r');
            if (cr_lf || lf_cr)
                ++buf_;

            return out - buf;
        }

        /* an ordinary character: step over it and charge it to the buffer */
        scan += clen;
        bufl -= clen;
    }

    /*
     *   The text ran out before a newline turned up and before the
     *   caller's buffer filled.  Return what is left as the final line.
     */
    const size_t tail = scan - buf_;
    memcpy(out, buf_, tail);
    out += tail;
    *out++ = '\0';

    /* leave the read position on the terminator, i.e. at EOF */
    buf_ = scan;
    return out - buf;
}
|