asebaros: /opt/ros/diamondback/stacks/ethzasl_aseba/asebaros/aseba/svn/compiler/lexer.cpp Source File

00001 /*
00002         Aseba - an event-based framework for distributed robot control
00003         Copyright (C) 2007--2009:
00004                 Stephane Magnenat <stephane at magnenat dot net>
00005                 (http://stephane.magnenat.net)
00006                 and other contributors, see authors.txt for details
00007                 Mobots group, Laboratory of Robotics Systems, EPFL, Lausanne
00008         
00009         This program is free software: you can redistribute it and/or modify
00010         it under the terms of the GNU General Public License as published by
00011         the Free Software Foundation, either version 3 of the License, or
00012         any other version as decided by the two original authors
00013         Stephane Magnenat and Valentin Longchamp.
00014         
00015         This program is distributed in the hope that it will be useful,
00016         but WITHOUT ANY WARRANTY; without even the implied warranty of
00017         MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00018         GNU General Public License for more details.
00019         
00020         You should have received a copy of the GNU General Public License
00021         along with this program.  If not, see <http://www.gnu.org/licenses/>.
00022 */
00023 
00024 #include "compiler.h"
00025 #include <cstdlib>
00026 #include <sstream>
00027 #include <ostream>
00028 #include <cctype>
00029 #include <cstdio>
00030 
00031 namespace Aseba
00032 {
00034         Compiler::Token::Token(Type type, SourcePos pos, const std::string& value) :
00035                 type(type),
00036                 sValue(value),
00037                 pos(pos)
00038         {
00039                 if (type == TOKEN_INT_LITERAL)
00040                 {
00041                         bool wasSigned = false;
00042                         if ((value.length() > 1) && (value[1] == 'x'))
00043                                 iValue = strtol(value.c_str() + 2, NULL, 16);
00044                         else if ((value.length() > 1) && (value[1] == 'b'))
00045                                 iValue = strtol(value.c_str() + 2, NULL, 2);
00046                         else
00047                         {
00048                                 iValue = atoi(value.c_str());
00049                                 wasSigned = true;
00050                         }
00051                         if ((wasSigned == false) && (iValue > 32767))
00052                                 iValue -= 65536;
00053                 }
00054                 else
00055                         iValue = 0;
00056                 pos.column--; // column has already been incremented when token is created, so we remove one
00057                 pos.character--; // character has already been incremented when token is created, so we remove one
00058         }
00059         
00061         const char* Compiler::Token::typeName() const
00062         {
00063                 switch (type)
00064                 {
00065                         case TOKEN_END_OF_STREAM: return "end of stream";
00066                         case TOKEN_STR_when: return "when keyword";
00067                         case TOKEN_STR_emit: return "emit keyword";
00068                         case TOKEN_STR_for: return "for keyword";
00069                         case TOKEN_STR_in: return "in keyword";
00070                         case TOKEN_STR_step: return "step keyword";
00071                         case TOKEN_STR_while: return "while keyword";
00072                         case TOKEN_STR_do: return "do keyword";
00073                         case TOKEN_STR_if: return "if keyword";
00074                         case TOKEN_STR_then: return "then keyword";
00075                         case TOKEN_STR_else: return "else keyword";
00076                         case TOKEN_STR_elseif: return "elseif keyword";
00077                         case TOKEN_STR_end: return "end keyword";
00078                         case TOKEN_STR_var: return "var keyword";
00079                         case TOKEN_STR_call: return "call keyword";
00080                         case TOKEN_STR_sub: return "sub keyword";
00081                         case TOKEN_STR_callsub: return "callsub keyword";
00082                         case TOKEN_STR_onevent: return "onevent keyword";
00083                         case TOKEN_STR_abs: return "abs keyword";
00084                         case TOKEN_STRING_LITERAL: return "string";
00085                         case TOKEN_INT_LITERAL: return "integer";
00086                         case TOKEN_PAR_OPEN: return "( (open parenthesis)";
00087                         case TOKEN_PAR_CLOSE: return ") (close parenthesis)";
00088                         case TOKEN_BRACKET_OPEN: return "[ (open bracket)";
00089                         case TOKEN_BRACKET_CLOSE: return "] (close bracket)";
00090                         case TOKEN_COLON: return ": (colon)";
00091                         case TOKEN_COMMA: return ", (comma)";
00092                         case TOKEN_ASSIGN: return "= (assignation)";
00093                         case TOKEN_OP_OR: return "or";
00094                         case TOKEN_OP_AND: return "and";
00095                         case TOKEN_OP_NOT: return "not";
00096                         case TOKEN_OP_BIT_OR: return "binary or";
00097                         case TOKEN_OP_BIT_XOR: return "binary xor";
00098                         case TOKEN_OP_BIT_AND: return "binary and";
00099                         case TOKEN_OP_BIT_NOT: return "binary not";
00100                         case TOKEN_OP_EQUAL: return "== (equal to)";
00101                         case TOKEN_OP_NOT_EQUAL: return "!= (not equal to)";
00102                         case TOKEN_OP_BIGGER: return "> (bigger than)";
00103                         case TOKEN_OP_BIGGER_EQUAL: return ">= (bigger or equal than)";
00104                         case TOKEN_OP_SMALLER: return "< (smaller than)";
00105                         case TOKEN_OP_SMALLER_EQUAL: return "<= (smaller or equal than)";
00106                         case TOKEN_OP_SHIFT_LEFT: return "<< (shift left)";
00107                         case TOKEN_OP_SHIFT_RIGHT: return ">> (shift right)";
00108                         case TOKEN_OP_ADD: return "+ (plus)";
00109                         case TOKEN_OP_NEG: return "- (minus)";
00110                         case TOKEN_OP_MULT: return "* (time)";
00111                         case TOKEN_OP_DIV: return "/ (divide)";
00112                         case TOKEN_OP_MOD: return "modulo";
00113                         default: return "unknown";
00114                 }
00115         }
00116         
00118         std::string Compiler::Token::toString() const
00119         {
00120                 std::ostringstream oss;
00121                 oss << "Line: " << pos.row + 1 << " Col: " << pos.column + 1 << " : ";
00122                 oss << typeName();
00123                 if (type == TOKEN_INT_LITERAL)
00124                         oss << " : " << iValue;
00125                 if (type == TOKEN_STRING_LITERAL)
00126                         oss << " : " << sValue;
00127                 return oss.str();
00128         }
00129         
00130         
00133         void Compiler::tokenize(std::istream& source)
00134         {
00135                 tokens.clear();
00136                 SourcePos pos(0, 0, 0);
00137                 const unsigned tabSize = 4;
00138                 
00139                 // tokenize text source
00140                 while (source.good())
00141                 {
00142                         int c = source.get();
00143                         
00144                         if (c == EOF)
00145                                 break;
00146                         
00147                         pos.column++;
00148                         pos.character++;
00149                         
00150                         switch (c)
00151                         {
00152                                 // simple cases of one character
00153                                 case ' ': break;
00154                                 //case '\t': pos.column += tabSize - 1; break;
00155                                 case '\t': break;
00156                                 case '\n': pos.row++; pos.column = -1; break; // -1 so next call to pos.column++ result set 0
00157                                 case '\r': pos.column = -1; break; // -1 so next call to pos.column++ result set 0
00158                                 case '(': tokens.push_back(Token(Token::TOKEN_PAR_OPEN, pos)); break;
00159                                 case ')': tokens.push_back(Token(Token::TOKEN_PAR_CLOSE, pos)); break;
00160                                 case '[': tokens.push_back(Token(Token::TOKEN_BRACKET_OPEN, pos)); break;
00161                                 case ']': tokens.push_back(Token(Token::TOKEN_BRACKET_CLOSE, pos)); break;
00162                                 case ':': tokens.push_back(Token(Token::TOKEN_COLON, pos)); break;
00163                                 case ',': tokens.push_back(Token(Token::TOKEN_COMMA, pos)); break;
00164                                 case '+': tokens.push_back(Token(Token::TOKEN_OP_ADD, pos)); break;
00165                                 case '-': tokens.push_back(Token(Token::TOKEN_OP_NEG, pos)); break;
00166                                 case '*': tokens.push_back(Token(Token::TOKEN_OP_MULT, pos)); break;
00167                                 case '/': tokens.push_back(Token(Token::TOKEN_OP_DIV, pos)); break;
00168                                 case '%': tokens.push_back(Token(Token::TOKEN_OP_MOD, pos)); break;
00169                                 case '|': tokens.push_back(Token(Token::TOKEN_OP_BIT_OR, pos)); break;
00170                                 case '^': tokens.push_back(Token(Token::TOKEN_OP_BIT_XOR, pos)); break;
00171                                 case '&': tokens.push_back(Token(Token::TOKEN_OP_BIT_AND, pos)); break;
00172                                 case '~': tokens.push_back(Token(Token::TOKEN_OP_BIT_NOT, pos)); break;
00173                                 
00174                                 // special case for comment
00175                                 case '#':
00176                                 {
00177                                         while ((c != '\n') && (c != '\r') && (c != EOF))
00178                                         {
00179                                                 if (c == '\t')
00180                                                         pos.column += tabSize;
00181                                                 else
00182                                                         pos.column++;
00183                                                 c = source.get();
00184                                                 pos.character++;
00185                                         }
00186                                         if (c == '\n')
00187                                         {
00188                                                 pos.row++;
00189                                                 pos.column = 0;
00190                                         }
00191                                         else if (c == '\r')
00192                                                 pos.column = 0;
00193                                 }
00194                                 break;
00195                                 
00196                                 // cases that require one character look-ahead
00197                                 case '!':
00198                                         if (source.peek() == '=')
00199                                         {
00200                                                 tokens.push_back(Token(Token::TOKEN_OP_NOT_EQUAL, pos));
00201                                                 source.get();
00202                                                 pos.column++;
00203                                                 pos.character++;
00204                                         }
00205                                         else
00206                                                 throw Error(pos, "syntax error");
00207                                 break;
00208                                 
00209                                 case '=':
00210                                         if (source.peek() == '=')
00211                                         {
00212                                                 tokens.push_back(Token(Token::TOKEN_OP_EQUAL, pos));
00213                                                 source.get();
00214                                                 pos.column++;
00215                                                 pos.character++;
00216                                         }
00217                                         else
00218                                                 tokens.push_back(Token(Token::TOKEN_ASSIGN, pos));
00219                                 break;
00220                                 
00221                                 case '<':
00222                                         if (source.peek() == '<')
00223                                         {
00224                                                 tokens.push_back(Token(Token::TOKEN_OP_SHIFT_LEFT, pos));
00225                                                 source.get();
00226                                                 pos.column++;
00227                                                 pos.character++;
00228                                         }
00229                                         else if (source.peek() == '=')
00230                                         {
00231                                                 tokens.push_back(Token(Token::TOKEN_OP_SMALLER_EQUAL, pos));
00232                                                 source.get();
00233                                                 pos.column++;
00234                                                 pos.character++;
00235                                         }
00236                                         else
00237                                                 tokens.push_back(Token(Token::TOKEN_OP_SMALLER, pos));
00238                                 break;
00239                                 
00240                                 case '>':
00241                                         if (source.peek() == '>')
00242                                         {
00243                                                 tokens.push_back(Token(Token::TOKEN_OP_SHIFT_RIGHT, pos));
00244                                                 source.get();
00245                                                 pos.column++;
00246                                                 pos.character++;
00247                                         }
00248                                         else if (source.peek() == '=')
00249                                         {
00250                                                 tokens.push_back(Token(Token::TOKEN_OP_BIGGER_EQUAL, pos));
00251                                                 source.get();
00252                                                 pos.column++;
00253                                                 pos.character++;
00254                                         }
00255                                         else
00256                                                 tokens.push_back(Token(Token::TOKEN_OP_BIGGER, pos));
00257                                 break;
00258                                 
00259                                 // cases that require to look for a while
00260                                 default:
00261                                 {
00262                                         // check first character
00263                                         if (!std::isalnum(c) && (c != '_'))
00264                                                 throw Error(pos, "identifiers must begin with _ and an alphanumeric character");
00265                                         
00266                                         // get a string
00267                                         std::string s;
00268                                         s += c;
00269                                         int nextC = source.peek();
00270                                         int posIncrement = 0;
00271                                         while ((nextC != EOF) && (std::isalnum(nextC) || (nextC == '_') || (nextC == '.')))
00272                                         {
00273                                                 s += nextC;
00274                                                 source.get();
00275                                                 posIncrement++;
00276                                                 nextC = source.peek();
00277                                         }
00278                                         
00279                                         // we now have a string, let's check what it is
00280                                         if (std::isdigit(s[0]))
00281                                         {
00282                                                 // check if hex or binary
00283                                                 if ((s.length() > 1) && (s[0] == '0') && (!std::isdigit(s[1])))
00284                                                 {
00285                                                         // check if we have a valid number
00286                                                         if (s[1] == 'x')
00287                                                         {
00288                                                                 for (unsigned i = 2; i < s.size(); i++)
00289                                                                         if (!std::isxdigit(s[i]))
00290                                                                                 throw Error(pos, "error in hexadecimal number");
00291                                                         }
00292                                                         else if (s[1] == 'b')
00293                                                         {
00294                                                                 for (unsigned i = 2; i < s.size(); i++)
00295                                                                         if ((s[i] != '0') && (s[i] != '1'))
00296                                                                                 throw Error(pos, "error in binary number");
00297                                                         }
00298                                                         else
00299                                                                 throw Error(pos, "error in number, invalid base");
00300                                                         
00301                                                 }
00302                                                 else
00303                                                 {
00304                                                         // check if we have a valid number
00305                                                         for (unsigned i = 1; i < s.size(); i++)
00306                                                                 if (!std::isdigit(s[i]))
00307                                                                         throw Error(pos, "error in number");
00308                                                 }
00309                                                 tokens.push_back(Token(Token::TOKEN_INT_LITERAL, pos, s));
00310                                         }
00311                                         else
00312                                         {
00313                                                 // check if it is a known keyword
00314                                                 if (s == "when")
00315                                                         tokens.push_back(Token(Token::TOKEN_STR_when, pos));
00316                                                 else if (s == "emit")
00317                                                         tokens.push_back(Token(Token::TOKEN_STR_emit, pos));
00318                                                 else if (s == "for")
00319                                                         tokens.push_back(Token(Token::TOKEN_STR_for, pos));
00320                                                 else if (s == "in")
00321                                                         tokens.push_back(Token(Token::TOKEN_STR_in, pos));
00322                                                 else if (s == "step")
00323                                                         tokens.push_back(Token(Token::TOKEN_STR_step, pos));
00324                                                 else if (s == "while")
00325                                                         tokens.push_back(Token(Token::TOKEN_STR_while, pos));
00326                                                 else if (s == "do")
00327                                                         tokens.push_back(Token(Token::TOKEN_STR_do, pos));
00328                                                 else if (s == "if")
00329                                                         tokens.push_back(Token(Token::TOKEN_STR_if, pos));
00330                                                 else if (s == "then")
00331                                                         tokens.push_back(Token(Token::TOKEN_STR_then, pos));
00332                                                 else if (s == "else")
00333                                                         tokens.push_back(Token(Token::TOKEN_STR_else, pos));
00334                                                 else if (s == "elseif")
00335                                                         tokens.push_back(Token(Token::TOKEN_STR_elseif, pos));
00336                                                 else if (s == "end")
00337                                                         tokens.push_back(Token(Token::TOKEN_STR_end, pos));
00338                                                 else if (s == "var")
00339                                                         tokens.push_back(Token(Token::TOKEN_STR_var, pos));
00340                                                 else if (s == "call")
00341                                                         tokens.push_back(Token(Token::TOKEN_STR_call, pos));
00342                                                 else if (s == "sub")
00343                                                         tokens.push_back(Token(Token::TOKEN_STR_sub, pos));
00344                                                 else if (s == "callsub")
00345                                                         tokens.push_back(Token(Token::TOKEN_STR_callsub, pos));
00346                                                 else if (s == "onevent")
00347                                                         tokens.push_back(Token(Token::TOKEN_STR_onevent, pos));
00348                                                 else if (s == "abs")
00349                                                         tokens.push_back(Token(Token::TOKEN_STR_abs, pos));
00350                                                 else if (s == "or")
00351                                                         tokens.push_back(Token(Token::TOKEN_OP_OR, pos));
00352                                                 else if (s == "and")
00353                                                         tokens.push_back(Token(Token::TOKEN_OP_AND, pos));
00354                                                 else if (s == "not")
00355                                                         tokens.push_back(Token(Token::TOKEN_OP_NOT, pos));
00356                                                 else
00357                                                         tokens.push_back(Token(Token::TOKEN_STRING_LITERAL, pos, s));
00358                                         }
00359                                         
00360                                         pos.column += posIncrement;
00361                                         pos.character += posIncrement;
00362                                 }
00363                                 break;
00364                         } // switch (c)
00365                 } // while (source.good())
00366                 
00367                 tokens.push_back(Token(Token::TOKEN_END_OF_STREAM, pos));
00368         }
00369         
00371         void Compiler::dumpTokens(std::ostream &dest) const
00372         {
00373                 for (unsigned i = 0; i < tokens.size(); i++)
00374                         dest << tokens[i].toString() << std::endl;
00375         }
00376 }; // Aseba