lexer.cpp
Go to the documentation of this file.
00001 /*
00002         Aseba - an event-based framework for distributed robot control
00003         Copyright (C) 2007--2012:
00004                 Stephane Magnenat <stephane at magnenat dot net>
00005                 (http://stephane.magnenat.net)
00006                 and other contributors, see authors.txt for details
00007         
00008         This program is free software: you can redistribute it and/or modify
00009         it under the terms of the GNU Lesser General Public License as published
00010         by the Free Software Foundation, version 3 of the License.
00011         
00012         This program is distributed in the hope that it will be useful,
00013         but WITHOUT ANY WARRANTY; without even the implied warranty of
00014         MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015         GNU Lesser General Public License for more details.
00016         
00017         You should have received a copy of the GNU Lesser General Public License
00018         along with this program. If not, see <http://www.gnu.org/licenses/>.
00019 */
00020 
00021 #include "compiler.h"
00022 #include "../utils/FormatableString.h"
00023 #include <cstdlib>
00024 #include <sstream>
00025 #include <ostream>
00026 #include <cctype>
00027 #include <cstdio>
00028 
00029 namespace Aseba
00030 {
00032         Compiler::Token::Token(Type type, SourcePos pos, const std::wstring& value) :
00033                 type(type),
00034                 sValue(value),
00035                 pos(pos)
00036         {
00037                 if (type == TOKEN_INT_LITERAL)
00038                 {
00039                         bool wasSigned = false;
00040                         if ((value.length() > 1) && (value[1] == 'x'))
00041                                 iValue = wcstol(value.c_str() + 2, NULL, 16);
00042                         else if ((value.length() > 1) && (value[1] == 'b'))
00043                                 iValue = wcstol(value.c_str() + 2, NULL, 2);
00044                         else
00045                         {
00046                                 iValue = wcstol(value.c_str(), NULL, 10);
00047                                 wasSigned = true;
00048                         }
00049                         if ((wasSigned == false) && (iValue > 32767))
00050                                 iValue -= 65536;
00051                 }
00052                 else
00053                         iValue = 0;
00054                 pos.column--; // column has already been incremented when token is created, so we remove one
00055                 pos.character--; // character has already been incremented when token is created, so we remove one
00056         }
00057         
00059         const std::wstring Compiler::Token::typeName() const
00060         {
00061                 switch (type)
00062                 {
00063                         case TOKEN_END_OF_STREAM: return translate(ERROR_TOKEN_END_OF_STREAM);
00064                         case TOKEN_STR_when: return translate(ERROR_TOKEN_STR_when);
00065                         case TOKEN_STR_emit: return translate(ERROR_TOKEN_STR_emit);
00066                         case TOKEN_STR_for: return translate(ERROR_TOKEN_STR_for);
00067                         case TOKEN_STR_in: return translate(ERROR_TOKEN_STR_in);
00068                         case TOKEN_STR_step: return translate(ERROR_TOKEN_STR_step);
00069                         case TOKEN_STR_while: return translate(ERROR_TOKEN_STR_while);
00070                         case TOKEN_STR_do: return translate(ERROR_TOKEN_STR_do);
00071                         case TOKEN_STR_if: return translate(ERROR_TOKEN_STR_if);
00072                         case TOKEN_STR_then: return translate(ERROR_TOKEN_STR_then);
00073                         case TOKEN_STR_else: return translate(ERROR_TOKEN_STR_else);
00074                         case TOKEN_STR_elseif: return translate(ERROR_TOKEN_STR_elseif);
00075                         case TOKEN_STR_end: return translate(ERROR_TOKEN_STR_end);
00076                         case TOKEN_STR_var: return translate(ERROR_TOKEN_STR_var);
00077                         case TOKEN_STR_call: return translate(ERROR_TOKEN_STR_call);
00078                         case TOKEN_STR_sub: return translate(ERROR_TOKEN_STR_sub);
00079                         case TOKEN_STR_callsub: return translate(ERROR_TOKEN_STR_callsub);
00080                         case TOKEN_STR_onevent: return translate(ERROR_TOKEN_STR_onevent);
00081                         case TOKEN_STR_abs: return translate(ERROR_TOKEN_STR_abs);
00082                         case TOKEN_STR_return: return translate(ERROR_TOKEN_STR_return);
00083                         case TOKEN_STRING_LITERAL: return translate(ERROR_TOKEN_STRING_LITERAL);
00084                         case TOKEN_INT_LITERAL: return translate(ERROR_TOKEN_INT_LITERAL);
00085                         case TOKEN_PAR_OPEN: return translate(ERROR_TOKEN_PAR_OPEN);
00086                         case TOKEN_PAR_CLOSE: return translate(ERROR_TOKEN_PAR_CLOSE);
00087                         case TOKEN_BRACKET_OPEN: return translate(ERROR_TOKEN_BRACKET_OPEN);
00088                         case TOKEN_BRACKET_CLOSE: return translate(ERROR_TOKEN_BRACKET_CLOSE);
00089                         case TOKEN_COLON: return translate(ERROR_TOKEN_COLON);
00090                         case TOKEN_COMMA: return translate(ERROR_TOKEN_COMMA);
00091                         case TOKEN_ASSIGN: return translate(ERROR_TOKEN_ASSIGN);
00092                         case TOKEN_OP_OR: return translate(ERROR_TOKEN_OP_OR);
00093                         case TOKEN_OP_AND: return translate(ERROR_TOKEN_OP_AND);
00094                         case TOKEN_OP_NOT: return translate(ERROR_TOKEN_OP_NOT);
00095                         case TOKEN_OP_BIT_OR: return translate(ERROR_TOKEN_OP_BIT_OR);
00096                         case TOKEN_OP_BIT_XOR: return translate(ERROR_TOKEN_OP_BIT_XOR);
00097                         case TOKEN_OP_BIT_AND: return translate(ERROR_TOKEN_OP_BIT_AND);
00098                         case TOKEN_OP_BIT_NOT: return translate(ERROR_TOKEN_OP_BIT_NOT);
00099                         case TOKEN_OP_BIT_OR_EQUAL: return translate(ERROR_TOKEN_OP_BIT_OR_EQUAL);
00100                         case TOKEN_OP_BIT_XOR_EQUAL: return translate(ERROR_TOKEN_OP_BIT_XOR_EQUAL);
00101                         case TOKEN_OP_BIT_AND_EQUAL: return translate(ERROR_TOKEN_OP_BIT_AND_EQUAL);
00102                         case TOKEN_OP_EQUAL: return translate(ERROR_TOKEN_OP_EQUAL);
00103                         case TOKEN_OP_NOT_EQUAL: return translate(ERROR_TOKEN_OP_NOT_EQUAL);
00104                         case TOKEN_OP_BIGGER: return translate(ERROR_TOKEN_OP_BIGGER);
00105                         case TOKEN_OP_BIGGER_EQUAL: return translate(ERROR_TOKEN_OP_BIGGER_EQUAL);
00106                         case TOKEN_OP_SMALLER: return translate(ERROR_TOKEN_OP_SMALLER);
00107                         case TOKEN_OP_SMALLER_EQUAL: return translate(ERROR_TOKEN_OP_SMALLER_EQUAL);
00108                         case TOKEN_OP_SHIFT_LEFT: return translate(ERROR_TOKEN_OP_SHIFT_LEFT);
00109                         case TOKEN_OP_SHIFT_RIGHT: return translate(ERROR_TOKEN_OP_SHIFT_RIGHT);
00110                         case TOKEN_OP_SHIFT_LEFT_EQUAL: return translate(ERROR_TOKEN_OP_SHIFT_LEFT_EQUAL);
00111                         case TOKEN_OP_SHIFT_RIGHT_EQUAL: return translate(ERROR_TOKEN_OP_SHIFT_RIGHT_EQUAL);
00112                         case TOKEN_OP_ADD: return translate(ERROR_TOKEN_OP_ADD);
00113                         case TOKEN_OP_NEG: return translate(ERROR_TOKEN_OP_NEG);
00114                         case TOKEN_OP_ADD_EQUAL: return translate(ERROR_TOKEN_OP_ADD_EQUAL);
00115                         case TOKEN_OP_NEG_EQUAL: return translate(ERROR_TOKEN_OP_NEG_EQUAL);
00116                         case TOKEN_OP_PLUS_PLUS: return translate(ERROR_TOKEN_OP_PLUS_PLUS);
00117                         case TOKEN_OP_MINUS_MINUS: return translate(ERROR_TOKEN_OP_MINUS_MINUS);
00118                         case TOKEN_OP_MULT: return translate(ERROR_TOKEN_OP_MULT);
00119                         case TOKEN_OP_DIV: return translate(ERROR_TOKEN_OP_DIV);
00120                         case TOKEN_OP_MOD: return translate(ERROR_TOKEN_OP_MOD);
00121                         case TOKEN_OP_MULT_EQUAL: return translate(ERROR_TOKEN_OP_MULT_EQUAL);
00122                         case TOKEN_OP_DIV_EQUAL: return translate(ERROR_TOKEN_OP_DIV_EQUAL);
00123                         case TOKEN_OP_MOD_EQUAL: return translate(ERROR_TOKEN_OP_MOD_EQUAL);
00124                         default: return translate(ERROR_TOKEN_UNKNOWN);
00125                 }
00126         }
00127         
00129         std::wstring Compiler::Token::toWString() const
00130         {
00131                 std::wostringstream oss;
00132                 oss << translate(ERROR_LINE) << pos.row + 1 << translate(ERROR_COL) << pos.column + 1 << L" : ";
00133                 oss << typeName();
00134                 if (type == TOKEN_INT_LITERAL)
00135                         oss << L" : " << iValue;
00136                 if (type == TOKEN_STRING_LITERAL)
00137                         oss << L" : " << sValue;
00138                 return oss.str();
00139         }
00140         
00141         
00144         void Compiler::tokenize(std::wistream& source)
00145         {
00146                 tokens.clear();
00147                 SourcePos pos(0, 0, 0);
00148                 const unsigned tabSize = 4;
00149                 
00150                 // tokenize text source
00151                 while (source.good())
00152                 {
00153                         wchar_t c = source.get();
00154                         
00155                         if (source.eof())
00156                                 break;
00157                         
00158                         pos.column++;
00159                         pos.character++;
00160                         
00161                         switch (c)
00162                         {
00163                                 // simple cases of one character
00164                                 case ' ': break;
00165                                 //case '\t': pos.column += tabSize - 1; break;
00166                                 case '\t': break;
00167                                 case '\n': pos.row++; pos.column = -1; break; // -1 so next call to pos.column++ result set 0
00168                                 case '\r': pos.column = -1; break; // -1 so next call to pos.column++ result set 0
00169                                 case '(': tokens.push_back(Token(Token::TOKEN_PAR_OPEN, pos)); break;
00170                                 case ')': tokens.push_back(Token(Token::TOKEN_PAR_CLOSE, pos)); break;
00171                                 case '[': tokens.push_back(Token(Token::TOKEN_BRACKET_OPEN, pos)); break;
00172                                 case ']': tokens.push_back(Token(Token::TOKEN_BRACKET_CLOSE, pos)); break;
00173                                 case ':': tokens.push_back(Token(Token::TOKEN_COLON, pos)); break;
00174                                 case ',': tokens.push_back(Token(Token::TOKEN_COMMA, pos)); break;
00175                                 
00176                                 // special case for comment
00177                                 case '#':
00178                                 {
00179                                         // check if it's a comment block #* ... *#
00180                                         if (source.peek() == '*')
00181                                         {
00182                                                 // comment block
00183                                                 // record position of the begining
00184                                                 SourcePos begin(pos);
00185                                                 // move forward by 2 characters then search for the end
00186                                                 int step = 2;
00187                                                 while ((step > 0) || (c != '*') || (source.peek() != '#'))
00188                                                 {
00189                                                         if (step)
00190                                                                 step--;
00191 
00192                                                         if (c == '\t')
00193                                                                 pos.column += tabSize;
00194                                                         else if (c == '\n')
00195                                                         {
00196                                                                 pos.row++;
00197                                                                 pos.column = 0;
00198                                                         }
00199                                                         else
00200                                                                 pos.column++;
00201                                                         c = source.get();
00202                                                         pos.character++;
00203                                                         if (source.eof())
00204                                                         {
00205                                                                 // EOF -> unbalanced block
00206                                                                 throw TranslatableError(begin, ERROR_UNBALANCED_COMMENT_BLOCK);
00207                                                         }
00208                                                 }
00209                                                 // fetch the #
00210                                                 getNextCharacter(source, pos);
00211                                         }
00212                                         else
00213                                         {
00214                                                 // simple comment
00215                                                 while ((c != '\n') && (c != '\r') && (!source.eof()))
00216                                                 {
00217                                                         if (c == '\t')
00218                                                                 pos.column += tabSize;
00219                                                         else
00220                                                                 pos.column++;
00221                                                         c = source.get();
00222                                                         pos.character++;
00223                                                 }
00224                                                 if (c == '\n')
00225                                                 {
00226                                                         pos.row++;
00227                                                         pos.column = 0;
00228                                                 }
00229                                                 else if (c == '\r')
00230                                                         pos.column = 0;
00231                                         }
00232                                 }
00233                                 break;
00234                                 
00235                                 // cases that require one character look-ahead
00236                                 case '+':
00237                                         if (testNextCharacter(source, pos, '=', Token::TOKEN_OP_ADD_EQUAL))
00238                                                 break;
00239                                         if (testNextCharacter(source, pos, '+', Token::TOKEN_OP_PLUS_PLUS))
00240                                                 break;
00241                                         tokens.push_back(Token(Token::TOKEN_OP_ADD, pos));
00242                                         break;
00243 
00244                                 case '-':
00245                                         if (testNextCharacter(source, pos, '=', Token::TOKEN_OP_NEG_EQUAL))
00246                                                 break;
00247                                         if (testNextCharacter(source, pos, '-', Token::TOKEN_OP_MINUS_MINUS))
00248                                                 break;
00249                                         tokens.push_back(Token(Token::TOKEN_OP_NEG, pos));
00250                                         break;
00251 
00252                                 case '*':
00253                                         if (testNextCharacter(source, pos, '=', Token::TOKEN_OP_MULT_EQUAL))
00254                                                 break;
00255                                         tokens.push_back(Token(Token::TOKEN_OP_MULT, pos));
00256                                         break;
00257 
00258                                 case '/':
00259                                         if (testNextCharacter(source, pos, '=', Token::TOKEN_OP_DIV_EQUAL))
00260                                                 break;
00261                                         tokens.push_back(Token(Token::TOKEN_OP_DIV, pos));
00262                                         break;
00263 
00264                                 case '%':
00265                                         if (testNextCharacter(source, pos, '=', Token::TOKEN_OP_MOD_EQUAL))
00266                                                 break;
00267                                         tokens.push_back(Token(Token::TOKEN_OP_MOD, pos));
00268                                         break;
00269 
00270                                 case '|':
00271                                         if (testNextCharacter(source, pos, '=', Token::TOKEN_OP_BIT_OR_EQUAL))
00272                                                 break;
00273                                         tokens.push_back(Token(Token::TOKEN_OP_BIT_OR, pos));
00274                                         break;
00275 
00276                                 case '^':
00277                                         if (testNextCharacter(source, pos, '=', Token::TOKEN_OP_BIT_XOR_EQUAL))
00278                                                 break;
00279                                         tokens.push_back(Token(Token::TOKEN_OP_BIT_XOR, pos));
00280                                         break;
00281 
00282                                 case '&':
00283                                         if (testNextCharacter(source, pos, '=', Token::TOKEN_OP_BIT_AND_EQUAL))
00284                                                 break;
00285                                         tokens.push_back(Token(Token::TOKEN_OP_BIT_AND, pos));
00286                                         break;
00287 
00288                                 case '~':
00289                                         tokens.push_back(Token(Token::TOKEN_OP_BIT_NOT, pos));
00290                                         break;
00291 
00292                                 case '!':
00293                                         if (testNextCharacter(source, pos, '=', Token::TOKEN_OP_NOT_EQUAL))
00294                                                 break;
00295                                         throw TranslatableError(pos, ERROR_SYNTAX);
00296                                         break;
00297                                 
00298                                 case '=':
00299                                         if (testNextCharacter(source, pos, '=', Token::TOKEN_OP_EQUAL))
00300                                                 break;
00301                                         tokens.push_back(Token(Token::TOKEN_ASSIGN, pos));
00302                                         break;
00303                                 
00304                                 // cases that require two characters look-ahead
00305                                 case '<':
00306                                         if (source.peek() == '<')
00307                                         {
00308                                                 // <<
00309                                                 getNextCharacter(source, pos);
00310                                                 if (testNextCharacter(source, pos, '=', Token::TOKEN_OP_SHIFT_LEFT_EQUAL))
00311                                                         break;
00312                                                 tokens.push_back(Token(Token::TOKEN_OP_SHIFT_LEFT, pos));
00313                                                 break;
00314                                         }
00315                                         // <
00316                                         if (testNextCharacter(source, pos, '=', Token::TOKEN_OP_SMALLER_EQUAL))
00317                                                 break;
00318                                         tokens.push_back(Token(Token::TOKEN_OP_SMALLER, pos));
00319                                         break;
00320                                 
00321                                 case '>':
00322                                         if (source.peek() == '>')
00323                                         {
00324                                                 // >>
00325                                                 getNextCharacter(source, pos);
00326                                                 if (testNextCharacter(source, pos, '=', Token::TOKEN_OP_SHIFT_RIGHT_EQUAL))
00327                                                         break;
00328                                                 tokens.push_back(Token(Token::TOKEN_OP_SHIFT_RIGHT, pos));
00329                                                 break;
00330                                         }
00331                                         // >
00332                                         if (testNextCharacter(source, pos, '=', Token::TOKEN_OP_BIGGER_EQUAL))
00333                                                 break;
00334                                         tokens.push_back(Token(Token::TOKEN_OP_BIGGER, pos));
00335                                         break;
00336                                 
00337                                 // cases that require to look for a while
00338                                 default:
00339                                 {
00340                                         // check first character
00341                                         if (!iswalnum(c) && (c != '_'))
00342                                                 throw TranslatableError(pos, ERROR_INVALID_IDENTIFIER).arg((unsigned)c, 0, 16);
00343                                         
00344                                         // get a string
00345                                         std::wstring s;
00346                                         s += c;
00347                                         wchar_t nextC = source.peek();
00348                                         int posIncrement = 0;
00349                                         while ((source.good()) && (iswalnum(nextC) || (nextC == '_') || (nextC == '.')))
00350                                         {
00351                                                 s += nextC;
00352                                                 source.get();
00353                                                 posIncrement++;
00354                                                 nextC = source.peek();
00355                                         }
00356                                         
00357                                         // we now have a string, let's check what it is
00358                                         if (std::isdigit(s[0]))
00359                                         {
00360                                                 // check if hex or binary
00361                                                 if ((s.length() > 1) && (s[0] == '0') && (!std::isdigit(s[1])))
00362                                                 {
00363                                                         // check if we have a valid number
00364                                                         if (s[1] == 'x')
00365                                                         {
00366                                                                 for (unsigned i = 2; i < s.size(); i++)
00367                                                                         if (!std::isxdigit(s[i]))
00368                                                                                 throw TranslatableError(pos, ERROR_INVALID_HEXA_NUMBER);
00369                                                         }
00370                                                         else if (s[1] == 'b')
00371                                                         {
00372                                                                 for (unsigned i = 2; i < s.size(); i++)
00373                                                                         if ((s[i] != '0') && (s[i] != '1'))
00374                                                                                 throw TranslatableError(pos, ERROR_INVALID_BINARY_NUMBER);
00375                                                         }
00376                                                         else
00377                                                                 throw TranslatableError(pos, ERROR_NUMBER_INVALID_BASE);
00378                                                         
00379                                                 }
00380                                                 else
00381                                                 {
00382                                                         // check if we have a valid number
00383                                                         for (unsigned i = 1; i < s.size(); i++)
00384                                                                 if (!std::isdigit(s[i]))
00385                                                                         throw TranslatableError(pos, ERROR_IN_NUMBER);
00386                                                 }
00387                                                 tokens.push_back(Token(Token::TOKEN_INT_LITERAL, pos, s));
00388                                         }
00389                                         else
00390                                         {
00391                                                 // check if it is a known keyword
00392                                                 if (s == L"when")
00393                                                         tokens.push_back(Token(Token::TOKEN_STR_when, pos));
00394                                                 else if (s == L"emit")
00395                                                         tokens.push_back(Token(Token::TOKEN_STR_emit, pos));
00396                                                 else if (s == L"for")
00397                                                         tokens.push_back(Token(Token::TOKEN_STR_for, pos));
00398                                                 else if (s == L"in")
00399                                                         tokens.push_back(Token(Token::TOKEN_STR_in, pos));
00400                                                 else if (s == L"step")
00401                                                         tokens.push_back(Token(Token::TOKEN_STR_step, pos));
00402                                                 else if (s == L"while")
00403                                                         tokens.push_back(Token(Token::TOKEN_STR_while, pos));
00404                                                 else if (s == L"do")
00405                                                         tokens.push_back(Token(Token::TOKEN_STR_do, pos));
00406                                                 else if (s == L"if")
00407                                                         tokens.push_back(Token(Token::TOKEN_STR_if, pos));
00408                                                 else if (s == L"then")
00409                                                         tokens.push_back(Token(Token::TOKEN_STR_then, pos));
00410                                                 else if (s == L"else")
00411                                                         tokens.push_back(Token(Token::TOKEN_STR_else, pos));
00412                                                 else if (s == L"elseif")
00413                                                         tokens.push_back(Token(Token::TOKEN_STR_elseif, pos));
00414                                                 else if (s == L"end")
00415                                                         tokens.push_back(Token(Token::TOKEN_STR_end, pos));
00416                                                 else if (s == L"var")
00417                                                         tokens.push_back(Token(Token::TOKEN_STR_var, pos));
00418                                                 else if (s == L"call")
00419                                                         tokens.push_back(Token(Token::TOKEN_STR_call, pos));
00420                                                 else if (s == L"sub")
00421                                                         tokens.push_back(Token(Token::TOKEN_STR_sub, pos));
00422                                                 else if (s == L"callsub")
00423                                                         tokens.push_back(Token(Token::TOKEN_STR_callsub, pos));
00424                                                 else if (s == L"onevent")
00425                                                         tokens.push_back(Token(Token::TOKEN_STR_onevent, pos));
00426                                                 else if (s == L"abs")
00427                                                         tokens.push_back(Token(Token::TOKEN_STR_abs, pos));
00428                                                 else if (s == L"return")
00429                                                         tokens.push_back(Token(Token::TOKEN_STR_return, pos));
00430                                                 else if (s == L"or")
00431                                                         tokens.push_back(Token(Token::TOKEN_OP_OR, pos));
00432                                                 else if (s == L"and")
00433                                                         tokens.push_back(Token(Token::TOKEN_OP_AND, pos));
00434                                                 else if (s == L"not")
00435                                                         tokens.push_back(Token(Token::TOKEN_OP_NOT, pos));
00436                                                 else
00437                                                         tokens.push_back(Token(Token::TOKEN_STRING_LITERAL, pos, s));
00438                                         }
00439                                         
00440                                         pos.column += posIncrement;
00441                                         pos.character += posIncrement;
00442                                 }
00443                                 break;
00444                         } // switch (c)
00445                 } // while (source.good())
00446                 
00447                 tokens.push_back(Token(Token::TOKEN_END_OF_STREAM, pos));
00448         }
00449 
00450         wchar_t Compiler::getNextCharacter(std::wistream &source, SourcePos &pos)
00451         {
00452                 pos.column++;
00453                 pos.character++;
00454                 return source.get();
00455         }
00456 
00457         bool Compiler::testNextCharacter(std::wistream &source, SourcePos &pos, wchar_t test, Token::Type tokenIfTrue)
00458         {
00459                 if (source.peek() == int(test))
00460                 {
00461                         tokens.push_back(Token(tokenIfTrue, pos));
00462                         getNextCharacter(source, pos);
00463                         return true;
00464                 }
00465                 return false;
00466         }
00467         
00469         void Compiler::dumpTokens(std::wostream &dest) const
00470         {
00471                 for (unsigned i = 0; i < tokens.size(); i++)
00472                         dest << tokens[i].toWString() << std::endl;
00473         }
00474 }; // Aseba


aseba
Author(s): Stéphane Magnenat
autogenerated on Sun Oct 5 2014 23:46:38