tokenize.py
#!/usr/bin/env python
#
# Copyright 2007 Neal Norwitz
# Portions Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

00018 """Tokenize C++ source code."""
00019 
00020 __author__ = 'nnorwitz@google.com (Neal Norwitz)'
00021 
00022 
00023 try:
00024     # Python 3.x
00025     import builtins
00026 except ImportError:
00027     # Python 2.x
00028     import __builtin__ as builtins
00029 
00030 
00031 import sys
00032 
00033 from cpp import utils
00034 
00035 
00036 if not hasattr(builtins, 'set'):
00037     # Nominal support for Python 2.3.
00038     from sets import Set as set
00039 
00040 
# Add $ as a valid identifier char since so much code uses it.
_letters = 'abcdefghijklmnopqrstuvwxyz'
VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')
HEX_DIGITS = set('0123456789abcdefABCDEF')
INT_OR_FLOAT_DIGITS = set('0123456789eE-+')


# C++0x string prefixes.
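# For example: L"wide", u8"utf-8", R"(raw)", or combined forms like uR"(raw)".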
_STR_PREFIXES = set(('R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'))


# Token types.
UNKNOWN = 'UNKNOWN'
SYNTAX = 'SYNTAX'
CONSTANT = 'CONSTANT'
NAME = 'NAME'
PREPROCESSOR = 'PREPROCESSOR'

# Where the token originated from.  This can be used for backtracking.
# It is always set to WHENCE_STREAM in this code.
WHENCE_STREAM, WHENCE_QUEUE = range(2)


class Token(object):
    """Data container to represent a C++ token.

    Tokens can be identifiers, syntax char(s), constants, or
    pre-processor directives.

    start contains the index of the first char of the token in the source.
    end contains the index one past the last char of the token in the source.
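
    For example, the name 'int' at the start of a source string becomes
    Token(NAME, 'int', 0, 3); that is, source[0:3] == 'int'.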
    """

    def __init__(self, token_type, name, start, end):
        self.token_type = token_type
        self.name = name
        self.start = start
        self.end = end
        self.whence = WHENCE_STREAM

    def __str__(self):
        if not utils.DEBUG:
            return 'Token(%r)' % self.name
        return 'Token(%r, %s, %s)' % (self.name, self.start, self.end)

    __repr__ = __str__


def _GetString(source, start, i):
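    """Returns the index just past the closing quote of the string at i.

    Candidate closing quotes preceded by an odd number of backslashes
    are escaped and therefore skipped; an even count means the
    backslashes only escape each other and the quote ends the string.
    """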
    i = source.find('"', i+1)
    while source[i-1] == '\\':
        # Count the trailing backslashes.
        backslash_count = 1
        j = i - 2
        while source[j] == '\\':
            backslash_count += 1
            j -= 1
        # When trailing backslashes are even, they escape each other.
        if (backslash_count % 2) == 0:
            break
        i = source.find('"', i+1)
    return i + 1


def _GetChar(source, start, i):
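    """Returns the index just past the closing quote of the char literal at i.

    Escape handling here is intentionally minimal (see the NOTE below);
    it special-cases a trailing backslash pair but may not cover every
    escape sequence exactly.
    """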
    # NOTE(nnorwitz): may not be quite correct, should be good enough.
    i = source.find("'", i+1)
    while source[i-1] == '\\':
        # Need to special case '\\'.
        if (i - 2) > start and source[i-2] == '\\':
            break
        i = source.find("'", i+1)
    # Try to handle unterminated single quotes (in a #if 0 block).
    if i < 0:
        i = start
    return i + 1


def GetTokens(source):
    """Returns a sequence of Tokens.

    Args:
      source: string of C++ source code.

    Yields:
      Token that represents the next token in the source.
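
    Example (illustrative only):
      >>> [t.name for t in GetTokens('int x = 42;')]
      ['int', 'x', '=', '42', ';']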
    """
    # Cache various valid character sets for speed.
    valid_identifier_chars = VALID_IDENTIFIER_CHARS
    hex_digits = HEX_DIGITS
    int_or_float_digits = INT_OR_FLOAT_DIGITS
    int_or_float_digits2 = int_or_float_digits | set('.')

    # Only ignore errors while in a #if 0 block.
    ignore_errors = False
    count_ifs = 0

    i = 0
    end = len(source)
    while i < end:
        # Skip whitespace.
        while i < end and source[i].isspace():
            i += 1
        if i >= end:
            return

        token_type = UNKNOWN
        start = i
        c = source[i]
        if c.isalpha() or c == '_':              # Find a name token.
            token_type = NAME
            while source[i] in valid_identifier_chars:
                i += 1
            # String and character constants can look like a name if
            # they are something like L"".
            if (source[i] == "'" and (i - start) == 1 and
                source[start:i] in 'uUL'):
                # u, U, and L are valid C++0x character prefixes.
                token_type = CONSTANT
                i = _GetChar(source, start, i)
            elif source[i] == '"' and source[start:i] in _STR_PREFIXES:
                token_type = CONSTANT
                i = _GetString(source, start, i)
        elif c == '/' and source[i+1] == '/':    # Find // comments.
            i = source.find('\n', i)
            if i == -1:  # Handle EOF.
                i = end
            continue
        elif c == '/' and source[i+1] == '*':    # Find /* comments. */
            i = source.find('*/', i)
            if i == -1:  # Handle an unterminated comment at EOF.
                i = end
            else:
                i += 2  # Skip past the closing */.
            continue
        elif c in ':+-<>&|*=':                   # : or :: (plus other chars).
            token_type = SYNTAX
            i += 1
            new_ch = source[i]
            if new_ch == c:
                i += 1
            elif c == '-' and new_ch == '>':
                i += 1
            elif new_ch == '=':
                i += 1
        elif c in '()[]{}~!?^%;/.,':             # Handle single char tokens.
            token_type = SYNTAX
            i += 1
            if c == '.' and source[i].isdigit():
                token_type = CONSTANT
                i += 1
                while source[i] in int_or_float_digits:
                    i += 1
                # Handle float suffixes.
                for suffix in ('l', 'f'):
                    if suffix == source[i:i+1].lower():
                        i += 1
                        break
        elif c.isdigit():                        # Find integer.
            token_type = CONSTANT
            if c == '0' and source[i+1] in 'xX':
                # Handle hex digits.
                i += 2
                while source[i] in hex_digits:
                    i += 1
            else:
                while source[i] in int_or_float_digits2:
                    i += 1
            # Handle integer (and float) suffixes.
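            # Longest suffixes are listed first so that, e.g., 'ull' is
            # matched before the shorter 'u' can cut the token short.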
            for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
                size = len(suffix)
                if suffix == source[i:i+size].lower():
                    i += size
                    break
        elif c == '"':                           # Find string.
            token_type = CONSTANT
            i = _GetString(source, start, i)
        elif c == "'":                           # Find char.
            token_type = CONSTANT
            i = _GetChar(source, start, i)
        elif c == '#':                           # Find pre-processor command.
            token_type = PREPROCESSOR
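            # Only a bare '#if' followed by whitespace (e.g. '#if 0') counts
            # here; '#ifdef' and '#ifndef' fail the isspace() check below.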
            got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
            if got_if:
                count_ifs += 1
            elif source[i:i+6] == '#endif':
                count_ifs -= 1
                if count_ifs == 0:
                    ignore_errors = False

            # TODO(nnorwitz): handle preprocessor statements (\ continuations).
            while 1:
                i1 = source.find('\n', i)
                i2 = source.find('//', i)
                i3 = source.find('/*', i)
                i4 = source.find('"', i)
                # NOTE(nnorwitz): doesn't handle comments in #define macros.
                # Get the first important symbol (newline, comment, EOF/end).
                i = min([x for x in (i1, i2, i3, i4, end) if x != -1])

                # Handle #include "dir//foo.h" properly.
                if i < end and source[i] == '"':
                    i = source.find('"', i+1) + 1
                    assert i > 0
                    continue
                # Keep going if we hit the end of the line and the line
                # ends with a continuation backslash.
                if not (i == i1 and source[i-1] == '\\'):
                    if got_if:
                        condition = source[start+4:i].lstrip()
                        if (condition.startswith('0') or
                            condition.startswith('(0)')):
                            ignore_errors = True
                    break
                i += 1
        elif c == '\\':                          # Handle \ in code.
            # This is different from the pre-processor \ handling.
            i += 1
            continue
        elif ignore_errors:
            # The tokenizer seems to be in pretty good shape.  This
            # raise is conditionally disabled so that bogus code in
            # an #if 0 block can be handled.  Since we will ignore it
            # anyway, this is probably fine.  So disable the exception
            # and return the bogus char.
            i += 1
        else:
            sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
                             ('?', i, c, source[i-10:i+10]))
            raise RuntimeError('unexpected token')

        if i <= 0:
            print('Invalid index, exiting now.')
            return
        yield Token(token_type, source[start:i], start, i)


if __name__ == '__main__':
    def main(argv):
        """Driver mostly for testing purposes."""
        for filename in argv[1:]:
            source = utils.ReadFile(filename)
            if source is None:
                continue

            for token in GetTokens(source):
                print('%-12s: %s' % (token.token_type, token.name))
                # print('\r%6.2f%%' % (100.0 * index / token.end),)
            sys.stdout.write('\n')


    main(sys.argv)

