00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 """Tokenize C++ source code."""
00019
00020 __author__ = 'nnorwitz@google.com (Neal Norwitz)'
00021
00022
00023 try:
00024
00025 import builtins
00026 except ImportError:
00027
00028 import __builtin__ as builtins
00029
00030
00031 import sys
00032
00033 from cpp import utils
00034
00035
00036 if not hasattr(builtins, 'set'):
00037
00038 from sets import Set as set
00039
00040
00041
# Characters accepted inside an identifier ('$' is tolerated as well).
_letters = 'abcdefghijklmnopqrstuvwxyz'
VALID_IDENTIFIER_CHARS = (
    set(_letters) | set(_letters.upper()) | set('_0123456789$'))
HEX_DIGITS = set('0123456789' + 'abcdef' + 'ABCDEF')
# Characters consumed while scanning a decimal or floating-point literal.
INT_OR_FLOAT_DIGITS = set('0123456789' + 'eE' + '-+')


# Prefixes that may directly precede the opening quote of a string literal.
_STR_PREFIXES = set(['R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'])


# Values stored in Token.token_type.
UNKNOWN = 'UNKNOWN'
SYNTAX = 'SYNTAX'
CONSTANT = 'CONSTANT'
NAME = 'NAME'
PREPROCESSOR = 'PREPROCESSOR'


# Values for Token.whence; Token.__init__ always assigns WHENCE_STREAM.
WHENCE_STREAM = 0
WHENCE_QUEUE = 1
00062
00063
class Token(object):
    """Plain record describing one C++ token.

    A token is an identifier, one or more syntax characters, a constant,
    or a pre-processor directive.

    Attributes:
      token_type: kind of token; GetTokens passes one of the module-level
        type constants (UNKNOWN, SYNTAX, CONSTANT, NAME, PREPROCESSOR).
      name: text of the token.
      start: index of the token's first char in the source.
      end: index that closes the token in the source.  GetTokens passes
        the index just past the last char, so name == source[start:end].
      whence: origin marker, always initialised to WHENCE_STREAM.
    """

    def __init__(self, token_type, name, start, end):
        self.token_type, self.name = token_type, name
        self.start, self.end = start, end
        self.whence = WHENCE_STREAM

    def __str__(self):
        # The source offsets are only shown when utils.DEBUG is set.
        if utils.DEBUG:
            return 'Token(%r, %s, %s)' % (self.name, self.start, self.end)
        return 'Token(%r)' % self.name

    __repr__ = __str__
00087
00088
def _GetString(source, start, i):
    """Finds the end of the double-quoted string literal opening at i.

    A quote preceded by an odd number of backslashes is escaped and does
    not close the literal; an even number means the backslashes escape
    each other and the quote is the real terminator.

    Args:
      source: string of C++ source code.
      start: index where the token (including any prefix) begins.  Unused;
        kept so the signature parallels _GetChar.
      i: index of the opening double quote.

    Returns:
      Index just past the closing quote, or 0 when the literal is never
      closed (GetTokens treats a non-positive index as invalid).
    """
    opening = i
    closing = source.find('"', opening + 1)
    # Test the find() result explicitly: indexing with -1 would wrap around
    # to the end of the source and could restart the search forever.
    while closing != -1:
        # Count the backslashes immediately before the candidate quote,
        # never scanning back past the opening quote.
        backslash_count = 0
        j = closing - 1
        while j > opening and source[j] == '\\':
            backslash_count += 1
            j -= 1
        if backslash_count % 2 == 0:
            return closing + 1
        closing = source.find('"', closing + 1)
    return 0
00103
00104
def _GetChar(source, start, i):
    """Finds the end of the single-quoted character literal opening at i.

    A quote preceded by an odd number of backslashes is escaped (as in
    '\\''); an even number means the backslashes escape each other (as in
    '\\\\') and the quote terminates the literal.

    Args:
      source: string of C++ source code.
      start: index where the token (including any prefix) begins.
      i: index of the opening single quote.

    Returns:
      Index just past the closing quote.  When there is no closing quote
      the result is start + 1, so an unterminated quote never yields a
      non-positive index.
    """
    opening = i
    closing = source.find("'", opening + 1)
    # Test the find() result explicitly: indexing with -1 would wrap around
    # to the end of the source and could restart the search forever.
    while closing != -1:
        # Count the backslashes immediately before the candidate quote,
        # never scanning back past the opening quote.
        backslash_count = 0
        j = closing - 1
        while j > opening and source[j] == '\\':
            backslash_count += 1
            j -= 1
        if backslash_count % 2 == 0:
            return closing + 1
        closing = source.find("'", closing + 1)
    # Unterminated single quote.
    return start + 1
00117
00118
def _SkipChars(source, i, end, chars):
    """Returns the first index in [i, end] whose char is not in chars."""
    while i < end and source[i] in chars:
        i += 1
    return i


def GetTokens(source):
    """Returns a sequence of Tokens.

    Whitespace, // and /* */ comments and stray backslashes are skipped.
    Everything else is yielded as a Token whose name is source[start:end].

    Args:
      source: string of C++ source code.

    Yields:
      Token that represents the next token in the source.

    Raises:
      RuntimeError: while iterating, when an unrecognized character is
        found outside of an "#if 0" block.
    """
    # Bind the character sets to locals; they are consulted per character.
    valid_identifier_chars = VALID_IDENTIFIER_CHARS
    hex_digits = HEX_DIGITS
    int_or_float_digits = INT_OR_FLOAT_DIGITS
    int_or_float_digits2 = int_or_float_digits | set('.')

    # Unrecognized characters are tolerated only inside an "#if 0" block.
    # count_ifs is the current conditional nesting depth; ignore_depth is
    # the depth of the "#if 0" that switched ignore_errors on.
    ignore_errors = False
    ignore_depth = 0
    count_ifs = 0

    i = 0
    end = len(source)
    while i < end:
        # Skip whitespace.
        while i < end and source[i].isspace():
            i += 1
        if i >= end:
            return

        token_type = UNKNOWN
        start = i
        c = source[i]
        # A slice is '' past the end of the source, so every lookahead
        # below is safe even when c is the last character.
        next_c = source[i+1:i+2]
        if c.isalpha() or c == '_':
            token_type = NAME
            # Always consume c itself: isalpha() also accepts non-ASCII
            # letters that are not in valid_identifier_chars, and failing
            # to advance would yield empty tokens forever.
            i = _SkipChars(source, i + 1, end, valid_identifier_chars)

            # String and character constants look like a name when they
            # carry an encoding prefix, e.g. L"" or u8'x'.
            prefix = source[start:i]
            quote = source[i:i+1]
            if quote == "'" and prefix in ('u', 'U', 'L', 'u8'):
                token_type = CONSTANT
                i = _GetChar(source, start, i)
            elif quote == '"' and prefix in _STR_PREFIXES:
                # NOTE: raw strings (R"delim(...)delim") are scanned like
                # ordinary strings; embedded quotes are not understood.
                token_type = CONSTANT
                i = _GetString(source, start, i)
        elif c == '/' and next_c == '/':
            # Line comment: skip to the newline (or to EOF).
            i = source.find('\n', i)
            if i == -1:
                i = end
            continue
        elif c == '/' and next_c == '*':
            # Block comment.  Search after the opener so that "/*/" is not
            # mistaken for a complete comment; unterminated runs to EOF.
            close = source.find('*/', i + 2)
            if close == -1:
                i = end
            else:
                i = close + 2
            continue
        elif c in ':+-<>&|*=':
            # One- or two-character operators: doubled (::, ++, <<, ...),
            # followed by '=' (+=, <=, ==, ...), or the arrow ->.
            token_type = SYNTAX
            i += 1
            new_ch = source[i:i+1]
            if new_ch == c or new_ch == '=' or (c == '-' and new_ch == '>'):
                i += 1
        elif c in '()[]{}~!?^%;/.,':
            # Single-character tokens; ".5" style floats start here too.
            token_type = SYNTAX
            i += 1
            if c == '.' and source[i:i+1].isdigit():
                token_type = CONSTANT
                i = _SkipChars(source, i + 1, end, int_or_float_digits)
                # Float suffixes.
                if source[i:i+1].lower() in ('l', 'f'):
                    i += 1
        elif c.isdigit():
            token_type = CONSTANT
            if c == '0' and next_c in ('x', 'X'):
                i = _SkipChars(source, i + 2, end, hex_digits)
            else:
                # Start after c so that a non-ASCII digit still advances.
                i = _SkipChars(source, i + 1, end, int_or_float_digits2)

            # Integer (and float) suffixes, longest first.
            for suffix in ('ull', 'llu', 'll', 'ul', 'lu', 'l', 'f', 'u'):
                size = len(suffix)
                if suffix == source[i:i+size].lower():
                    i += size
                    break
        elif c == '"':
            token_type = CONSTANT
            i = _GetString(source, start, i)
        elif c == "'":
            token_type = CONSTANT
            i = _GetChar(source, start, i)
        elif c == '#':
            token_type = PREPROCESSOR
            got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
            if source[i:i+3] == '#if':
                # #if, #ifdef and #ifndef are all closed by an #endif, so
                # all of them contribute to the nesting depth.
                count_ifs += 1
            elif source[i:i+6] == '#endif':
                # Leaving the "#if 0" block that enabled error suppression.
                if ignore_errors and count_ifs <= ignore_depth:
                    ignore_errors = False
                count_ifs -= 1

            # Find the end of the directive, honouring \ continuations.
            while True:
                i1 = source.find('\n', i)
                i2 = source.find('//', i)
                i3 = source.find('/*', i)
                i4 = source.find('"', i)
                # First interesting position: newline, comment, quote, EOF.
                i = min([x for x in (i1, i2, i3, i4, end) if x != -1])

                # Step over quoted text so that #include "dir//foo.h" is
                # not cut at the "//".
                if i < end and source[i] == '"':
                    closing = source.find('"', i + 1)
                    if closing == -1 or (i1 != -1 and closing > i1):
                        # No closing quote on this line: skip just the
                        # stray quote instead of swallowing later lines.
                        i += 1
                    else:
                        i = closing + 1
                    continue

                # Keep going only when the line ends with a backslash.
                if not (i == i1 and source[i-1] == '\\'):
                    if got_if:
                        condition = source[start+4:i].lstrip()
                        if (not ignore_errors and
                            (condition.startswith('0') or
                             condition.startswith('(0)'))):
                            ignore_errors = True
                            ignore_depth = count_ifs
                    break
                i += 1
        elif c == '\\':
            # A backslash in ordinary code (not a directive continuation).
            i += 1
            continue
        elif ignore_errors:
            # Inside "#if 0": hand back the bogus character as an UNKNOWN
            # token instead of raising.
            i += 1
        else:
            sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
                             ('?', i, c, source[max(0, i-10):i+10]))
            raise RuntimeError('unexpected token')

        # A literal scanner that found no terminator may report an index at
        # or before the token start; stop rather than loop without progress.
        if i <= start:
            print('Invalid index, exiting now.')
            return
        yield Token(token_type, source[start:i], start, i)
00271
00272
if __name__ == '__main__':
    def main(argv):
        """Print the type and text of every token in each file of argv[1:].

        Files for which utils.ReadFile returns None are skipped.
        """
        for path in argv[1:]:
            text = utils.ReadFile(path)
            if text is not None:
                for tok in GetTokens(text):
                    print('%-12s: %s' % (tok.token_type, tok.name))
                sys.stdout.write('\n')

    main(sys.argv)