00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include "compiler.h"
00025 #include <cstdlib>
00026 #include <sstream>
00027 #include <ostream>
00028 #include <cctype>
00029 #include <cstdio>
00030
00031 namespace Aseba
00032 {
00034 Compiler::Token::Token(Type type, SourcePos pos, const std::string& value) :
00035 type(type),
00036 sValue(value),
00037 pos(pos)
00038 {
00039 if (type == TOKEN_INT_LITERAL)
00040 {
00041 bool wasSigned = false;
00042 if ((value.length() > 1) && (value[1] == 'x'))
00043 iValue = strtol(value.c_str() + 2, NULL, 16);
00044 else if ((value.length() > 1) && (value[1] == 'b'))
00045 iValue = strtol(value.c_str() + 2, NULL, 2);
00046 else
00047 {
00048 iValue = atoi(value.c_str());
00049 wasSigned = true;
00050 }
00051 if ((wasSigned == false) && (iValue > 32767))
00052 iValue -= 65536;
00053 }
00054 else
00055 iValue = 0;
00056 pos.column--;
00057 pos.character--;
00058 }
00059
00061 const char* Compiler::Token::typeName() const
00062 {
00063 switch (type)
00064 {
00065 case TOKEN_END_OF_STREAM: return "end of stream";
00066 case TOKEN_STR_when: return "when keyword";
00067 case TOKEN_STR_emit: return "emit keyword";
00068 case TOKEN_STR_for: return "for keyword";
00069 case TOKEN_STR_in: return "in keyword";
00070 case TOKEN_STR_step: return "step keyword";
00071 case TOKEN_STR_while: return "while keyword";
00072 case TOKEN_STR_do: return "do keyword";
00073 case TOKEN_STR_if: return "if keyword";
00074 case TOKEN_STR_then: return "then keyword";
00075 case TOKEN_STR_else: return "else keyword";
00076 case TOKEN_STR_elseif: return "elseif keyword";
00077 case TOKEN_STR_end: return "end keyword";
00078 case TOKEN_STR_var: return "var keyword";
00079 case TOKEN_STR_call: return "call keyword";
00080 case TOKEN_STR_sub: return "sub keyword";
00081 case TOKEN_STR_callsub: return "callsub keyword";
00082 case TOKEN_STR_onevent: return "onevent keyword";
00083 case TOKEN_STR_abs: return "abs keyword";
00084 case TOKEN_STRING_LITERAL: return "string";
00085 case TOKEN_INT_LITERAL: return "integer";
00086 case TOKEN_PAR_OPEN: return "( (open parenthesis)";
00087 case TOKEN_PAR_CLOSE: return ") (close parenthesis)";
00088 case TOKEN_BRACKET_OPEN: return "[ (open bracket)";
00089 case TOKEN_BRACKET_CLOSE: return "] (close bracket)";
00090 case TOKEN_COLON: return ": (colon)";
00091 case TOKEN_COMMA: return ", (comma)";
00092 case TOKEN_ASSIGN: return "= (assignation)";
00093 case TOKEN_OP_OR: return "or";
00094 case TOKEN_OP_AND: return "and";
00095 case TOKEN_OP_NOT: return "not";
00096 case TOKEN_OP_BIT_OR: return "binary or";
00097 case TOKEN_OP_BIT_XOR: return "binary xor";
00098 case TOKEN_OP_BIT_AND: return "binary and";
00099 case TOKEN_OP_BIT_NOT: return "binary not";
00100 case TOKEN_OP_EQUAL: return "== (equal to)";
00101 case TOKEN_OP_NOT_EQUAL: return "!= (not equal to)";
00102 case TOKEN_OP_BIGGER: return "> (bigger than)";
00103 case TOKEN_OP_BIGGER_EQUAL: return ">= (bigger or equal than)";
00104 case TOKEN_OP_SMALLER: return "< (smaller than)";
00105 case TOKEN_OP_SMALLER_EQUAL: return "<= (smaller or equal than)";
00106 case TOKEN_OP_SHIFT_LEFT: return "<< (shift left)";
00107 case TOKEN_OP_SHIFT_RIGHT: return ">> (shift right)";
00108 case TOKEN_OP_ADD: return "+ (plus)";
00109 case TOKEN_OP_NEG: return "- (minus)";
00110 case TOKEN_OP_MULT: return "* (time)";
00111 case TOKEN_OP_DIV: return "/ (divide)";
00112 case TOKEN_OP_MOD: return "modulo";
00113 default: return "unknown";
00114 }
00115 }
00116
00118 std::string Compiler::Token::toString() const
00119 {
00120 std::ostringstream oss;
00121 oss << "Line: " << pos.row + 1 << " Col: " << pos.column + 1 << " : ";
00122 oss << typeName();
00123 if (type == TOKEN_INT_LITERAL)
00124 oss << " : " << iValue;
00125 if (type == TOKEN_STRING_LITERAL)
00126 oss << " : " << sValue;
00127 return oss.str();
00128 }
00129
00130
00133 void Compiler::tokenize(std::istream& source)
00134 {
00135 tokens.clear();
00136 SourcePos pos(0, 0, 0);
00137 const unsigned tabSize = 4;
00138
00139
00140 while (source.good())
00141 {
00142 int c = source.get();
00143
00144 if (c == EOF)
00145 break;
00146
00147 pos.column++;
00148 pos.character++;
00149
00150 switch (c)
00151 {
00152
00153 case ' ': break;
00154
00155 case '\t': break;
00156 case '\n': pos.row++; pos.column = -1; break;
00157 case '\r': pos.column = -1; break;
00158 case '(': tokens.push_back(Token(Token::TOKEN_PAR_OPEN, pos)); break;
00159 case ')': tokens.push_back(Token(Token::TOKEN_PAR_CLOSE, pos)); break;
00160 case '[': tokens.push_back(Token(Token::TOKEN_BRACKET_OPEN, pos)); break;
00161 case ']': tokens.push_back(Token(Token::TOKEN_BRACKET_CLOSE, pos)); break;
00162 case ':': tokens.push_back(Token(Token::TOKEN_COLON, pos)); break;
00163 case ',': tokens.push_back(Token(Token::TOKEN_COMMA, pos)); break;
00164 case '+': tokens.push_back(Token(Token::TOKEN_OP_ADD, pos)); break;
00165 case '-': tokens.push_back(Token(Token::TOKEN_OP_NEG, pos)); break;
00166 case '*': tokens.push_back(Token(Token::TOKEN_OP_MULT, pos)); break;
00167 case '/': tokens.push_back(Token(Token::TOKEN_OP_DIV, pos)); break;
00168 case '%': tokens.push_back(Token(Token::TOKEN_OP_MOD, pos)); break;
00169 case '|': tokens.push_back(Token(Token::TOKEN_OP_BIT_OR, pos)); break;
00170 case '^': tokens.push_back(Token(Token::TOKEN_OP_BIT_XOR, pos)); break;
00171 case '&': tokens.push_back(Token(Token::TOKEN_OP_BIT_AND, pos)); break;
00172 case '~': tokens.push_back(Token(Token::TOKEN_OP_BIT_NOT, pos)); break;
00173
00174
00175 case '#':
00176 {
00177 while ((c != '\n') && (c != '\r') && (c != EOF))
00178 {
00179 if (c == '\t')
00180 pos.column += tabSize;
00181 else
00182 pos.column++;
00183 c = source.get();
00184 pos.character++;
00185 }
00186 if (c == '\n')
00187 {
00188 pos.row++;
00189 pos.column = 0;
00190 }
00191 else if (c == '\r')
00192 pos.column = 0;
00193 }
00194 break;
00195
00196
00197 case '!':
00198 if (source.peek() == '=')
00199 {
00200 tokens.push_back(Token(Token::TOKEN_OP_NOT_EQUAL, pos));
00201 source.get();
00202 pos.column++;
00203 pos.character++;
00204 }
00205 else
00206 throw Error(pos, "syntax error");
00207 break;
00208
00209 case '=':
00210 if (source.peek() == '=')
00211 {
00212 tokens.push_back(Token(Token::TOKEN_OP_EQUAL, pos));
00213 source.get();
00214 pos.column++;
00215 pos.character++;
00216 }
00217 else
00218 tokens.push_back(Token(Token::TOKEN_ASSIGN, pos));
00219 break;
00220
00221 case '<':
00222 if (source.peek() == '<')
00223 {
00224 tokens.push_back(Token(Token::TOKEN_OP_SHIFT_LEFT, pos));
00225 source.get();
00226 pos.column++;
00227 pos.character++;
00228 }
00229 else if (source.peek() == '=')
00230 {
00231 tokens.push_back(Token(Token::TOKEN_OP_SMALLER_EQUAL, pos));
00232 source.get();
00233 pos.column++;
00234 pos.character++;
00235 }
00236 else
00237 tokens.push_back(Token(Token::TOKEN_OP_SMALLER, pos));
00238 break;
00239
00240 case '>':
00241 if (source.peek() == '>')
00242 {
00243 tokens.push_back(Token(Token::TOKEN_OP_SHIFT_RIGHT, pos));
00244 source.get();
00245 pos.column++;
00246 pos.character++;
00247 }
00248 else if (source.peek() == '=')
00249 {
00250 tokens.push_back(Token(Token::TOKEN_OP_BIGGER_EQUAL, pos));
00251 source.get();
00252 pos.column++;
00253 pos.character++;
00254 }
00255 else
00256 tokens.push_back(Token(Token::TOKEN_OP_BIGGER, pos));
00257 break;
00258
00259
00260 default:
00261 {
00262
00263 if (!std::isalnum(c) && (c != '_'))
00264 throw Error(pos, "identifiers must begin with _ and an alphanumeric character");
00265
00266
00267 std::string s;
00268 s += c;
00269 int nextC = source.peek();
00270 int posIncrement = 0;
00271 while ((nextC != EOF) && (std::isalnum(nextC) || (nextC == '_') || (nextC == '.')))
00272 {
00273 s += nextC;
00274 source.get();
00275 posIncrement++;
00276 nextC = source.peek();
00277 }
00278
00279
00280 if (std::isdigit(s[0]))
00281 {
00282
00283 if ((s.length() > 1) && (s[0] == '0') && (!std::isdigit(s[1])))
00284 {
00285
00286 if (s[1] == 'x')
00287 {
00288 for (unsigned i = 2; i < s.size(); i++)
00289 if (!std::isxdigit(s[i]))
00290 throw Error(pos, "error in hexadecimal number");
00291 }
00292 else if (s[1] == 'b')
00293 {
00294 for (unsigned i = 2; i < s.size(); i++)
00295 if ((s[i] != '0') && (s[i] != '1'))
00296 throw Error(pos, "error in binary number");
00297 }
00298 else
00299 throw Error(pos, "error in number, invalid base");
00300
00301 }
00302 else
00303 {
00304
00305 for (unsigned i = 1; i < s.size(); i++)
00306 if (!std::isdigit(s[i]))
00307 throw Error(pos, "error in number");
00308 }
00309 tokens.push_back(Token(Token::TOKEN_INT_LITERAL, pos, s));
00310 }
00311 else
00312 {
00313
00314 if (s == "when")
00315 tokens.push_back(Token(Token::TOKEN_STR_when, pos));
00316 else if (s == "emit")
00317 tokens.push_back(Token(Token::TOKEN_STR_emit, pos));
00318 else if (s == "for")
00319 tokens.push_back(Token(Token::TOKEN_STR_for, pos));
00320 else if (s == "in")
00321 tokens.push_back(Token(Token::TOKEN_STR_in, pos));
00322 else if (s == "step")
00323 tokens.push_back(Token(Token::TOKEN_STR_step, pos));
00324 else if (s == "while")
00325 tokens.push_back(Token(Token::TOKEN_STR_while, pos));
00326 else if (s == "do")
00327 tokens.push_back(Token(Token::TOKEN_STR_do, pos));
00328 else if (s == "if")
00329 tokens.push_back(Token(Token::TOKEN_STR_if, pos));
00330 else if (s == "then")
00331 tokens.push_back(Token(Token::TOKEN_STR_then, pos));
00332 else if (s == "else")
00333 tokens.push_back(Token(Token::TOKEN_STR_else, pos));
00334 else if (s == "elseif")
00335 tokens.push_back(Token(Token::TOKEN_STR_elseif, pos));
00336 else if (s == "end")
00337 tokens.push_back(Token(Token::TOKEN_STR_end, pos));
00338 else if (s == "var")
00339 tokens.push_back(Token(Token::TOKEN_STR_var, pos));
00340 else if (s == "call")
00341 tokens.push_back(Token(Token::TOKEN_STR_call, pos));
00342 else if (s == "sub")
00343 tokens.push_back(Token(Token::TOKEN_STR_sub, pos));
00344 else if (s == "callsub")
00345 tokens.push_back(Token(Token::TOKEN_STR_callsub, pos));
00346 else if (s == "onevent")
00347 tokens.push_back(Token(Token::TOKEN_STR_onevent, pos));
00348 else if (s == "abs")
00349 tokens.push_back(Token(Token::TOKEN_STR_abs, pos));
00350 else if (s == "or")
00351 tokens.push_back(Token(Token::TOKEN_OP_OR, pos));
00352 else if (s == "and")
00353 tokens.push_back(Token(Token::TOKEN_OP_AND, pos));
00354 else if (s == "not")
00355 tokens.push_back(Token(Token::TOKEN_OP_NOT, pos));
00356 else
00357 tokens.push_back(Token(Token::TOKEN_STRING_LITERAL, pos, s));
00358 }
00359
00360 pos.column += posIncrement;
00361 pos.character += posIncrement;
00362 }
00363 break;
00364 }
00365 }
00366
00367 tokens.push_back(Token(Token::TOKEN_END_OF_STREAM, pos));
00368 }
00369
00371 void Compiler::dumpTokens(std::ostream &dest) const
00372 {
00373 for (unsigned i = 0; i < tokens.size(); i++)
00374 dest << tokens[i].toString() << std::endl;
00375 }
00376 };