/* FILE: lex.cpp
* PURPOSE: implement lexer for X
* COPYRIGHT: W. M. McKeeman 2003. You may do anything you like with
* this file except alter or remove this notice.
* MODIFIED: McKeeman at WangInst -- 86.10.08 -- original release
* McKeeman at WangInst -- 87.06.11 -- from C to D
* McKeeman at WangInst -- 87.07.07 -- single scan loop
* McKeeman at Harvard -- 88.03.01 -- went hyper
* McKeeman at Harvard -- 88.04.12 -- from D to X
* McKeeman at Dartmouth -- 00.06.09 -- upgrade to ISO C
* McKeeman at Dartmouth -- 03.05.02 -- OO version
* McKeeman at Dartmouth -- 05.03.23 -- remove perfect hash
* who @ where -- when -- what
* METHOD: Switch on leading character; report token category.
*/
# include <cstring>
# include <cctype>
# include <cstdio>
# include "lex.macros"
# include "lex.h"
// ----------------- static tables (see lex.macros) ------------------
// SIZE(a): element count of array a. Only valid for true arrays, not pointers.
# define SIZE(a) (sizeof(a)/sizeof(*a))
// FIRST and SECOND each pick one column out of the two-argument entries
// that the RESERVED_* list macros apply them to. Any definition still in
// effect from an earlier header is discarded first.
# undef FIRST
# undef SECOND
# define FIRST(a,b) a,
# define SECOND(a,b) b,
// Parallel tables: entry i of a text table pairs with entry i of the
// matching code table, because both are expanded from the same list.
// The text tables are pointers to const: string literals are not writable,
// and ISO C++11 removed the implicit conversion from a literal to char*.
static const char *rw[] = { RESERVED_WORDS(FIRST) };    // reserved word text
static SymbolCode rwcode[] = { RESERVED_WORDS(SECOND) }; // reserved word codes
static const char *op[] = { RESERVED_OPS(FIRST) };      // operator text
static SymbolCode opcode[] = { RESERVED_OPS(SECOND) };   // operator codes
static const char *sep[] = { RESERVED_SEPS(FIRST) };    // separator text
static SymbolCode sepcode[] = { RESERVED_SEPS(SECOND) }; // separator codes
// The selectors are only needed while the tables are being built.
# undef FIRST
# undef SECOND
// ------------ table lookup and one-time initialization ------------
// Classify the len characters at start as a reserved word or an identifier.
//   start - first character of the candidate (need not be null terminated)
//   len   - number of characters in the candidate
// Returns the rwcode[] entry paired with the first rw[] entry whose text
// matches exactly, or idSYM when nothing matches.
static SymbolCode
rwlookup(char *start, int len) {
  // Candidates shorter than two characters are never looked up.
  if (len < 2) {
    return idSYM;
  }
  const size_t want = len;
  const size_t count = SIZE(rw);
  for (size_t k = 0; k < count; ++k) {
    const char *word = rw[k];
    if (strlen(word) != want) continue;            // lengths must agree
    if (strncmp(start, word, want) != 0) continue; // and so must the text
    return rwcode[k];
  }
  return idSYM;
}
// Classify the len characters at start as a reserved operator.
//   start - first character of the operator run (need not be null terminated)
//   len   - number of characters in the run
// Returns the opcode[] entry paired with the first op[] entry whose text
// matches exactly, or opSYM when the run is not in the table.
static SymbolCode
oplookup(char *start, int len) {
  const size_t want = len;
  const size_t count = SIZE(op);
  for (size_t k = 0; k < count; ++k) {
    const char *text = op[k];
    if (strlen(text) != want) continue;            // lengths must agree
    if (strncmp(start, text, want) != 0) continue; // and so must the text
    return opcode[k];
  }
  return opSYM;
}
// Classify the single character at start as a reserved separator.
// Only the first character of each sep[] entry is compared.
// Returns the paired sepcode[] entry, or sepSYM when no entry matches.
static SymbolCode
seplookup(char *start) {
  const char lead = *start;
  size_t k = 0;
  while (k < SIZE(sep)) {
    if (sep[k][0] == lead) {
      return sepcode[k];
    }
    ++k;
  }
  return sepSYM;
}
// isop[c] is nonzero when the character with unsigned value c occurs in OPS;
// the scanner uses it to extend an operator token.
static int isop[256];                              // classes
// Set once the table above has been filled in.
static int initialized = 0;                        // one time only
// Build the operator character-class table. Every call after the first
// returns immediately. The flag is a plain int, so no synchronization is
// provided here.
static void
lexInit(void) {
  if (initialized) return;
  memset(isop, 0, sizeof(isop));                   // op class
  // Walk OPS once up to its terminating null. The index goes through
  // unsigned char so a character with the high bit set cannot become a
  // negative subscript where plain char is signed.
  for (const char *p = OPS; *p != 0; ++p) {
    isop[static_cast<unsigned char>(*p)] = 1;
  }
  initialized = 1;
}
// ------------------------- Lex methods -------------------------
// Constructor: makes sure the shared operator-class table has been built.
Lex::Lex() {
  lexInit();
}
// Destructor: nothing to release.
Lex::~Lex() {
}
// Tokenize a null-terminated string: measure it, then hand it to the
// two-argument overload.
void Lex::
lex(char *src) {
  const size_t length = strlen(src);               // excludes the null
  lex(src, length);
}
// Scan the len characters starting at src and report every token found.
//   src - first character of the input (need not be null terminated)
//   len - number of characters available at src
// Each token is passed to report() as (category, first char, one past last).
// Scanning stops at src+len or at the first null character that starts a
// token, whichever comes first.
//
// Characters are converted to unsigned char before they reach the <cctype>
// classifiers or index isop[]. Those interfaces are undefined (or index out
// of bounds) for negative values, which plain char produces for bytes >= 0x80
// on platforms where char is signed.
void Lex::
lex(char *src, int len) {                          // find tokens
  char *beyond = src;                              // one past the current token
  char *lim = src+len;                             // one past the last input char
  for (;;) {                                       // until input is exhausted
    char *begin = beyond;                          // remember for report
    if (beyond >= lim) return;                     // no more chrs
    switch (*begin) {
    // NOTE(review): only ' ', '\t' and '\n' can START a white token; a
    // leading '\r', '\v' or '\f' falls to the default case even though
    // isspace() absorbs it inside a run. Left as is; confirm intent.
    case ' ': case '\t': case '\n':
      while (++beyond<lim && isspace(static_cast<unsigned char>(*beyond))); // white
      report(whiteSYM, begin, beyond);
      break;
    case '`':                                      // comment
      // runs up to, but not including, the next newline or null
      while (++beyond<lim && *beyond != '\n' && *beyond != 0);
      report(commentSYM, begin, beyond);
      break;
    case 0:                                        // null
      return;                                      // end of src
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    case 's': case 't': case 'u': case 'v': case 'w': case 'x':
    case 'y': case 'z':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
    case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
    case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
    case 'Y': case 'Z':
      while (++beyond<lim && isalnum(static_cast<unsigned char>(*beyond))); // L(L|D)*
      report(rwlookup(begin, beyond-begin), begin, beyond);
      break;
    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      while (++beyond<lim && isdigit(static_cast<unsigned char>(*beyond))); // D+
      // Check the limit before looking for '.': when the digits run to the
      // end of the input, *beyond is outside the buffer and must not be read.
      if (beyond<lim && *beyond == '.') {
        while (++beyond<lim && isdigit(static_cast<unsigned char>(*beyond))); // D+.D*
        report(realSYM, begin, beyond);
      } else {
        report(integerSYM, begin, beyond);
      }
      break;
    case '+': case '-': case '*': case '/': case '|': case '&':
    case '<': case '=': case '>': case '~': case ':': case '?':
      while (++beyond<lim && isop[static_cast<unsigned char>(*beyond)]); // O+
      report(oplookup(begin, beyond-begin), begin, beyond);
      break;
    case ',': case ';': case '(': case ')':
      beyond++;
      report(seplookup(begin), begin, beyond);     // separator
      break;
    default:
      ++beyond;                                    // bad input char
      report(charERRSYM, begin, beyond);
      break;
    }
  }
}