00001
00002
00003
00004
00005
00006 """ simple parser using rules defined in EBNF format
00007
00008 This module allows you to parse a string according to rules
00009 that you define in a (light) EBNF format
00010
00011 You use it as the re module:
00012
00013 rp.match(rule,'string to be parsed')
00014
00015 Result: RP Object when parsing is ok,
00016 else None
00017
00018 where:
00019 rule is a list of rules definitions
00020
00021 ex: rule= ['main ::= SELECT field FROM table' ,
00022 'field ::= alphanum* ',
00023 'alphanum ::= r"[A-Za-z0-9]" ',
00024 'table ::= alphanum* ' ]
00025
00026 For more details read the rp.doc
00027 """
00028
import ast
import copy
import re
import string
00030
00031 __version__='0.91'
00032
00033
00034
00035
00036
00037
def compile(rule):
    """Build an RP parser from a list of rule definitions.

    rule -- list of rule definition strings, as described in the
            module docstring.

    Returns the RP object, on which match() can then be called.
    Named after re.compile(); inside this module it shadows the
    builtin compile().
    """
    parser = RP()
    parser.compile(rule)
    return parser
00042
def match(rule, thestr):
    """Compile `rule` and match `thestr` against it in one call.

    Returns whatever RP.match() returns for the compiled rule.
    """
    parser = compile(rule)
    return parser.match(thestr)
00046
00047
# Python reserved words, stored as the keys of a dict (every value is '').
# RP.compile() refuses rule names found here.
RESERVED = dict.fromkeys((
    'and', 'as', 'assert', 'break', 'class', 'continue', 'def', 'del',
    'elif', 'else', 'except', 'exec', 'finally', 'for', 'from', 'global',
    'if', 'import', 'in', 'is', 'lambda', 'not', 'or', 'pass', 'print',
    'raise', 'return', 'try', 'while', 'with', 'yield',
), '')
00080
00081
00082 class _Tokenizer:
00083 """ class representing the string to parse """
00084 def __init__(self, string,rp):
00085 """ set default values for string """
00086 self.string = string
00087 self.index = 0
00088 self.depth=0
00089 self.maxscan=0
00090 self.rp=rp
00091 def peek(self):
00092 """ peek current character in string, without consume it """
00093 try:
00094 this = self.string[self.index]
00095 except Exception:
00096 this=''
00097
00098 return this
00099 def read(self):
00100 """ read current character, and increment cursor """
00101 this = self.peek()
00102 self.index+=1
00103 self.maxscan=max(self.maxscan,self.index)
00104 return this
00105 def getString(self,ptr):
00106 """ return the parsed substring (ptr is the start point) """
00107 return self.string[ptr:self.index]
00108 def getRemaining(self):
00109 """ returns the remaining part of string not parsed """
00110 return self.string[self.index:]
00111 def removeBlanks(self):
00112 """ removed blanks of string before parsing rule, terminal or regular character
00113 this function could be desactivated using rp.IGNORE_BLANKS
00114 """
00115 if self.rp.ignore_blanks:
00116 while self.peek()==' ':
00117 self.read()
00118 return self.index
00119 def reset(self,ptr=0):
00120 """ reset pointer when rule parsing is ko """
00121 self.index=ptr
00122 def getIndent(self,_down,_str='.'):
00123 """ returns string of ... for debugging purposing """
00124 if _down:
00125 _str=_str*self.depth
00126 self.depth+=1
00127 else:
00128 self.depth-=1
00129 _str=_str*self.depth
00130 return _str
00131
00132 class _RpRule:
00133 """ Class that defines rule """
00134 def __init__(self,name,_rp):
00135 self.name=name
00136 self.definitions=[]
00137 self.rp=_rp
00138 def addDefinition(self,aDef):
00139 """ add a definition to the rule """
00140 self.definitions.append(aDef)
00141 def match(self,_tk):
00142 """ match the string against the rule
00143 Usually, this method is only used for the main rule
00144 (the first rule defined)
00145 """
00146 for init in self.rp.code_init:
00147 if self.rp.execute_code:
00148 exec(init,self.rp.vals)
00149 self.rp.code_array.append(init)
00150 _tk=_Tokenizer(_tk,self.rp)
00151 self.scanMax=0
00152 self._parse(_tk)
00153 len1=_tk.index
00154 if self.rp.ignore_blanks:
00155 len2=len(_tk.string.rstrip())
00156 else:
00157 len2=len(_tk.string)
00158 if len1<len2:
00159 self.rp.stringError=('-'*(_tk.maxscan)+'^')
00160 if self.rp.verbose:
00161 print _tk.string
00162 print self.rp.stringError
00163 else:
00164 for post in self.rp.code_post:
00165 if self.rp.execute_code:
00166 exec(post,self.rp.vals)
00167 self.rp.code_array.append(post)
00168 codestr=''
00169 for x in self.rp.code_array:
00170 codestr+=x+'\n'
00171 self.rp.code=codestr
00172 return len1>=len2
00173 def _parse(self,_tk,mult='',n_val=None):
00174 """ parse the string against the rule """
00175 if self.rp.verbose: print _tk.getIndent(True)+'Parsing rule: "'+self.name+'" for string:"'+_tk.getRemaining()+'"'
00176 ret=-1
00177 _top=_tk.index
00178 _top2=_tk.removeBlanks()
00179 if mult=='' or mult=='?':
00180 for d in self.definitions:
00181 ret=d._parse(_tk)
00182 if ret>=0: break
00183 if ret<0 and mult=='?':
00184 ret=0
00185 else:
00186 fullret=-1
00187
00188
00189
00190 for d in self.definitions:
00191 ret=d._parse(_tk)
00192 if ret>0:
00193 fullret = ret
00194 break
00195
00196
00197
00198 while ret>0:
00199 for d in self.definitions:
00200 ret=d._parse(_tk)
00201 if ret>0:
00202 break
00203
00204
00205
00206 ret=fullret
00207 if ret<0 and mult=='*':
00208 ret=0
00209 _retstr=''
00210 _retindent='.'
00211 if ret>=0:
00212 self.rp.vals[self.name]=_tk.getString(_top2)
00213 _retstr=_tk.getString(_top2)
00214 if ret>0: _retindent='<'
00215 elif ret<0:
00216 _tk.reset(_top)
00217 if self.rp.verbose: print _tk.getIndent(False,_retindent)+'Parsed rule :"'+self.name+'" - value="'+_retstr+'"'
00218 return ret
00219
00220 class _RpDefinition:
00221 """ Class containing definitions (rule, terminal,...) """
00222 def __init__(self,aDef,_rp,code=''):
00223 """ in init, we make:
00224 - scan the definitions to create relative objects
00225 such as Rule, Terminal, Regular.
00226 - acquire the future to be executed
00227 """
00228 self.rp=_rp
00229 self.definition=self._scan(aDef)
00230 self.code=code
00231 def _addcode(self,code):
00232 """ add code statement to the current definition """
00233 self.code+='\n'+code
00234 def _parse(self,_tk,mult=''):
00235 """ parse the definition:
00236 - loop on all items of definition
00237 - and call appropriate objet._parse
00238 (Rule,Terminal,Regular)
00239 """
00240 ret=-1
00241 _top=_tk.index
00242 fullret=-1
00243 for _i,_def in enumerate(self.definition):
00244
00245 d,mult,notrule=_def
00246 if notrule: break
00247 n_d,n_mult,n_notrule,n_val=None,None,None,None
00248 if _i<(len(self.definition)-1):
00249 n_d,n_mult,n_notrule=self.definition[_i+1]
00250 if n_notrule:
00251 if isinstance(n_d,_RpRule) and self.rp.vals.has_key(n_d.name):
00252 n_val=self.rp.vals[n_d.name]
00253 try:
00254 ret=d._parse(_tk,mult,n_val)
00255 if ret>0:
00256 fullret=max(fullret,ret)
00257 except IndexError,err:
00258 print '+++',err
00259 ret=-1
00260 _tk.getIndent(False)
00261 break
00262 if ret<0:
00263 fullret=-1
00264 break
00265 ret=fullret
00266 if ret<0:
00267 _tk.reset(_top)
00268 elif ret>0:
00269
00270
00271
00272
00273 self._interpretCode()
00274 return ret
00275
00276 def _scan(self,adef):
00277 """ scan definition to create objects """
00278 definition=[]
00279 for a in adef.split():
00280 mult=''
00281 m=re.match(r'([^\+\?\*]*)(.?)$',a)
00282 if m==None:
00283 if len(a)>2 and (a[0:3]=='"+"' or a[0:3]=='"?"' or a[0:3]=='"*"'):
00284 key=a[0:3]
00285 mult=a[3:]
00286 elif len(a)>2 and ( a[0:4]=='"**"' or a[0:4]=='"+="' or a[0:4]=='"*="'):
00287 key=a[0:4]
00288 mult=a[4:]
00289 elif len(a)>2 and ( a[0:5]=='"**="'):
00290 key=a[0:5]
00291 mult=a[5:]
00292 else:
00293 raise Exception('Invalid definition: %s in %s' % (a,adef))
00294 else:
00295 key=m.group(1)
00296 mult=m.group(2)
00297 notrule=False
00298
00299
00300
00301 if key.find('"..."')>=2:
00302 m2=re.match('"(.)"\.{3}"(.)"',key)
00303 if m2==None:
00304 raise Exception , 'Invalid expression:'+key
00305 a1=_RpRegular('['+m2.group(1)+'-'+m2.group(2)+']',self.rp)
00306
00307
00308
00309 elif key[0]=='"' and key[-1]=='"':
00310 a1=_RpTerminal(key[1:-1],self.rp)
00311
00312
00313
00314 elif key[0]=='"' and key[-1]!='"':
00315 raise Exception('End " is missing: %s in %s' % (a,adef))
00316
00317
00318
00319 elif key[0]=='r' and key[1]=='"' and key[-1]=='"':
00320 a1=_RpRegular(key[2:-1],self.rp)
00321
00322
00323
00324 elif key.islower():
00325 if key[0]=='^':
00326 notrule=True
00327 key=key[1:]
00328 if self.rp.rules.has_key(key):
00329 a1=self.rp.rules[key]
00330 else:
00331 a1=_RpRule(key,self.rp)
00332 self.rp.rules[key]=a1
00333
00334
00335
00336 else:
00337 a1=_RpTerminal(key,self.rp)
00338 definition.append([a1,mult,notrule])
00339 return definition
00340 def _interpretCode(self):
00341 """ interpret the code defined for rule
00342 - variables defined in code must begin with '$'
00343 followed by the rule name
00344 """
00345 if len(self.code)==0: return
00346 sepcode=self.code[0]
00347 codes=self.code.split(sepcode)[1:]
00348 for cc in codes:
00349
00350
00351
00352
00353
00354 m=re.match(r'^([^\$]*)\$([A-Za-z0-9_]*)(.*)$',cc,re.DOTALL)
00355 while m!=None:
00356 _deb=m.group(1)
00357 _var=m.group(2)
00358 _fin=m.group(3)
00359 if self.rp.vals.has_key(_var):
00360 cc=_deb+self.rp.vals.get(_var)+_fin
00361 else:
00362 raise Exception,_var+' not set'
00363 m=re.match(r'^([^\$]*)\$([A-Za-z0-9]*)(.*)$',cc,re.DOTALL)
00364 if self.rp.execute_code:
00365 try:
00366 exec(cc,self.rp.vals)
00367 except Exception,error:
00368 print '+++',error
00369 print '+++Code=',cc
00370 self.rp.code_array.append(cc)
00371
00372 class _RpTerminal:
00373 """ class to handle terminals """
00374 def __init__(self,term,rp):
00375 """ init terminal
00376 - check abbreviation of terminal
00377 ex: SEParator
00378 minimum to check = 3
00379 """
00380 self.rp=rp
00381 term=term.strip()
00382 min=len(term)
00383
00384
00385
00386 m=re.match(r'([A-Za-z0-9_]*)',term)
00387 if m!=None:
00388 term=term.upper()
00389 min=len(m.group(1))
00390
00391 self.terminal=term
00392 self.min=min
00393 def _parse(self,_tk,mult='',n_val=None):
00394 """ parse the terminal """
00395 if self.rp.verbose: print _tk.getIndent(True)+'Parsing terminal:"'+self.terminal+'" for string:"'+_tk.getRemaining()+'"'
00396 ret=-1
00397 _top=_tk.index
00398 _top2=_tk.removeBlanks()
00399 min=0
00400 while (min<self.min or min<len(self.terminal)) and _tk.peek().upper()==self.terminal[min]:
00401 _tk.read()
00402 min+=1
00403 ret=min
00404 if _tk.index - _top2 < self.min:
00405 ret=-1
00406 _tk.reset(_top)
00407 if min>0 and min<len(self.terminal):
00408 ntok=_tk.peek().strip()
00409 nterm=self.terminal[min].strip()
00410
00411
00412 if ntok!='' and nterm!='' and string.ascii_letters.find(ntok)>-1 and ntok!=nterm:
00413 ret=-1
00414 if (mult=='?' or mult=='*'):
00415 ret=max(0,ret)
00416 _retstr=''
00417 _retindent='.'
00418 if ret>0:
00419 _retstr=_tk.getString(_top2)
00420 _retindent='<'
00421 if self.rp.verbose: print _tk.getIndent(False,_retindent)+'Parsed terminal :"'+self.terminal+'" - value="'+_retstr+'"'
00422 return ret
00423
00424 class _RpRegular:
00425 """ class to handle regular expressions """
00426 def __init__(self,term,rp):
00427 """ init class, compile reg """
00428 self.source=term
00429 self.rp=rp
00430 self.regular=re.compile(term)
00431 def _parse(self,_tk,mult='',n_val=None):
00432 """ parsing string against reg expression """
00433 if self.rp.verbose: print _tk.getIndent(True)+'Parsing regular:"'+self.source+'" for string:"'+_tk.getRemaining()+'"'
00434 if n_val!=None and n_val==_tk.peek():
00435 if self.rp.verbose: print _tk.getIndent(False)+'Parsed regular :"'+self.source+'" - value=""'
00436 return -1
00437 ret=-1
00438 min=0
00439 _top=_tk.index
00440 if mult=='' or mult=='+':
00441 _passed=False
00442 try:
00443 _passed=self.regular.match(_tk.peek())
00444 except TypeError:
00445 pass
00446 if _passed:
00447 _tk.read()
00448 min+=1
00449 ret=min
00450 if mult=='+':
00451 while 1:
00452 try:
00453 if self.regular.match(_tk.peek()):
00454 min+=1
00455 _tk.read()
00456 ret=min
00457 else:
00458 break
00459 except IndexError:
00460 break
00461
00462 elif mult=='?' or mult=='*':
00463 ret=0
00464 if self.regular.match(_tk.peek()):
00465 _tk.read()
00466 min+=1
00467 ret=min
00468 if mult=='*':
00469 while 1:
00470 try:
00471 if self.regular.match(_tk.peek()):
00472 min+=1
00473 _tk.read()
00474 ret=min
00475 else:
00476 break
00477 except IndexError:
00478 break
00479 else:
00480 raise Exception('Invalid multiplicator:',mult)
00481 if ret==-1:
00482 _tk.reset(_top)
00483 _retstr=''
00484 _retindent='.'
00485 if ret>0:
00486 _retstr=_tk.getString(_top)
00487 _retindent='<'
00488 if self.rp.verbose: print _tk.getIndent(False,_retindent)+'Parsed regular :"'+self.source+'" - value="'+_retstr+'"'
00489 return ret
00490
class RP:
    """A compiled set of rules, used like a compiled `re` pattern.

    compile() reads the rule definitions; match() parses a string
    against the first rule defined.  After a successful match the values
    recorded during parsing are available as attributes and through get().
    """

    # Directive lines; the keyword is case-insensitive (ASCII letters only).
    _RE_INIT = re.compile(r'^\s*[Ii][Nn][Ii][Tt]\s+(.*)$')
    _RE_IMPORT = re.compile(r'^\s*[Ii][Mm][Pp][Oo][Rr][Tt]\s+(.*)$')
    _RE_OPTION = re.compile(r'^\s*[Oo][Pp][Tt][Ii][Oo][Nn]\s+(.*)$')
    _RE_POST = re.compile(r'^\s*[Pp][Oo][Ss][Tt]\s+(.*)$')

    def __init__(self):
        self.rules = {}            # rule name -> _RpRule
        self.vals = {}             # values recorded while parsing
        self.verbose = False       # print a parsing trace
        self.debug = False
        self.trace = False         # set by "option TRACE"
        self.ignore_blanks = True  # skip spaces before each item
        self.execute_code = True   # run the code attached to the rules
        self.sepcode = '@'         # marks the start of code in a rule line
        self.code_array = []       # code statements collected by a match
        self.code_init = []        # statements of the "init" lines
        self.code_post = []        # statements of the "post" lines
        self.code = ''             # collected code of the last good match
        self.maindef = None        # first rule defined
        self.stringError = ''      # '---^' marker of the last failed match
        self.rp_locals = locals

    def _separator_patterns(self):
        """Return the three line patterns that depend on self.sepcode.

        In order: 'name ::= definition [code]', '| definition [code]'
        and a code-only line.  The separator is escaped so that any
        character can be used.
        """
        sep = re.escape(self.sepcode)
        return (re.compile(r'^\s*([A-Za-z0-9_-]*)\s*::=\s*([^' + sep + r']*)(.*)$'),
                re.compile(r'^\s*\|\s*([^' + sep + r']*)(.*)$'),
                re.compile(r'^\s*' + sep + r'(.*)$'))

    def _add_definitions(self, target, text, code, lines):
        """Add the alternatives found in `text` to the rule `target`.

        Bracketed groups are replaced by generated rules, whose
        definition lines are appended to `lines` for later processing.
        Returns the last definition created.
        """
        text, newrules = splitBrackets(text)
        for alternative in re.split(r'\|', text):
            ruleDef = _RpDefinition(_xlate(alternative, True), self, code)
            target.addDefinition(ruleDef)
        lines.extend(newrules)
        return ruleDef

    def compile(self, rule):
        """Read the rule definitions.

        rule -- sequence of lines: rule definitions ('name ::= ...'),
                alternatives ('| ...'), code lines starting with the
                separator, 'init', 'post', 'import' and 'option'
                directives; blank lines and lines starting with '#'
                are ignored.

        Raises Exception for an invalid line, rule name or option, for
        an import file that cannot be read, and for a rule that is used
        but never defined.
        """
        # Work on a copy: generated rules and imported files are appended
        # while the lines are processed, and the caller's list must not
        # be modified.
        lines = list(rule)
        cmp1, cmp2, cmp3 = self._separator_patterns()
        currentRule = None
        ruleDef = None
        pos = 0
        while pos < len(lines):
            line = lines[pos].strip()
            pos += 1
            if line == '' or line[0] == '#':
                continue

            m = cmp1.match(line)
            if m is not None:
                _rule = m.group(1)
                if _rule in RESERVED:
                    raise Exception("Invalid rule name: '%s' is a python reserved word" % (_rule))
                if _rule not in self.rules:
                    self.rules[_rule] = _RpRule(_rule, self)
                currentRule = self.rules[_rule]
                ruleDef = self._add_definitions(currentRule, m.group(2), m.group(3), lines)
                if self.maindef is None:
                    self.maindef = currentRule
                continue

            m = cmp2.match(line)
            if m is not None:
                if currentRule is None:
                    # An alternative needs a rule to belong to.
                    raise Exception('Invalid rule: ' + line)
                ruleDef = self._add_definitions(currentRule, m.group(1), m.group(2), lines)
                continue

            m = cmp3.match(line)
            if m is not None:
                if ruleDef is None:
                    # A code line needs a definition to belong to.
                    raise Exception('Invalid rule: ' + line)
                ruleDef._addcode(m.group(1))
                continue

            m = self._RE_INIT.match(line)
            if m is not None:
                self.code_init.append(m.group(1))
                continue

            m = self._RE_IMPORT.match(line)
            if m is not None:
                _infile = m.group(1)
                try:
                    with open(_infile, 'r') as handle:
                        lines.extend(handle.readlines())
                except (IOError, OSError):
                    raise Exception('File not found:' + _infile)
                continue

            m = self._RE_OPTION.match(line)
            if m is not None:
                val = m.group(1).strip()
                uval = val.upper()
                if uval == 'VERBOSE':
                    self.verbose = True
                elif uval == 'TRACE':
                    self.trace = True
                elif uval == 'BLANKS':
                    self.ignore_blanks = False
                elif uval == 'NORUN':
                    self.execute_code = False
                elif len(val) > 1:
                    parts = val.split()
                    if parts[0].upper() == 'SEPCODE' and len(parts) > 1:
                        self.sepcode = parts[1]
                        cmp1, cmp2, cmp3 = self._separator_patterns()
                    else:
                        raise Exception('Invalid option:' + val)
                continue

            m = self._RE_POST.match(line)
            if m is not None:
                self.code_post.append(m.group(1))
                continue

            raise Exception('Invalid rule: ' + line)
        self.check_rules()

    def check_rules(self):
        """Check that every rule has at least one definition.

        Raises Exception naming the first rule found without one.
        """
        for name, a_rule in self.rules.items():
            if len(a_rule.definitions) == 0:
                raise Exception('No definition for rule:' + name)

    def match(self, data):
        """Parse `data` against the main rule.

        On success every recorded value that is a non-empty string, an
        int or a list is stored as an attribute of the parser, and the
        parser itself is returned.  A string starting with a quote is
        stored as the literal it spells when it is a valid Python
        literal, and unchanged otherwise.  Lists are stored as copies.

        Returns None when the string does not match.
        """
        if not self.maindef.match(data):
            return None
        for name, value in list(self.vals.items()):
            if isinstance(value, str) and len(value) > 0:
                if value[0] == '"' or value[0] == "'":
                    # The parsed text is data, never code: accept
                    # literals only.
                    try:
                        value = ast.literal_eval(value)
                    except (ValueError, SyntaxError):
                        pass
                setattr(self, name, value)
            elif isinstance(value, (int, list)):
                setattr(self, name, copy.deepcopy(value))
        return self

    def get(self, var):
        """Return the value recorded under `var`, or None when there is none."""
        return self.vals.get(var)
00620
00621 def _xlate(str,reverse=False):
00622 xlation={'"("':'"l_parent"',
00623 '")"':'"r_parent"',
00624 '"|"':'"or_term"',
00625 '"[':'"l_bracket"',
00626 ']"':'"r_bracket"'}
00627 for k,v in xlation.iteritems():
00628 if reverse:
00629 str=str.replace(v,k)
00630 else:
00631 str=str.replace(k,v)
00632 return str
00633
def splitBrackets(inputStr):
    """Replace bracketed groups of a definition by generated rules.

    Each '( ... )' group, with its optional '+', '?' or '*' suffix, and
    each '[ ... ]' group is replaced by the name of a new rule; a
    '[ ... ]' group becomes an optional ('?') reference.  Quoted
    brackets are left alone.  Nested groups are not handled.

    Generated names are numbered from a counter, so that two groups can
    never share a name.

    Returns (text, newrules): the definition with the groups replaced,
    and the list of 'name ::= content' lines defining the new rules.
    """
    newrules = []

    def _new_rule(content, suffix):
        """Register a rule for `content`; return the text replacing the group."""
        splitBrackets._auto_rule_id += 1
        a_rule = 'rule_auto%d' % splitBrackets._auto_rule_id
        newrules.append(a_rule + ' ::= ' + _xlate(content, True))
        return ' ' + a_rule + suffix + ' '

    def _paren_group(m):
        group = m.group(0)
        if group[-1] == ')':
            return _new_rule(group[1:-1], '')
        # The last character is the multiplier following ')'.
        return _new_rule(group[1:-2], group[-1])

    def _bracket_group(m):
        return _new_rule(m.group(0)[1:-1], '?')

    text = _xlate(inputStr)
    text = re.sub(r'\([^\)]*\)[\+\?\*]?', _paren_group, text)
    text = re.sub(r'\[[^\]]*\]', _bracket_group, text)
    return text, newrules


# Number of the last rule generated by splitBrackets().
splitBrackets._auto_rule_id = 0
00671
00672
if __name__=='__main__':
    # Demonstration: parse "locate" commands made of an optional range
    # and a delimited string, collecting the ranges in loc_range.
    rule=['init loc_range=[] ',
          'locate ::= range? string ',
          'range ::= "(" group+ ")" ',
          ' | group ',
          'group ::= grp ',
          'grp ::= nn "-" mm @loc_range.append([$nn,$mm])',
          ' | nn "." mm @loc_range.append([$nn,($nn+$mm)])',
          ' | nn @loc_range.append([$nn,9999])',
          'nn ::= r"[0-9]"+ ',
          'mm ::= r"[0-9]"+ ',
          'string ::= sep char* sep @loc_string="$char" ',
          'sep ::= r"\\S" ',
          'char ::= r"." ^sep' ]
    stringsToTry=(' 1-20 /hello world/ ',
                  ' 4-10 ,12345678',
                  ' 4-10 ,aaaaaa,',
                  ' /location/',
                  ' (4.10 25-28 39.3) /location/ ',
                  ' (4.10 25-28 /location/ ',
                  ' 46- /location/ ')
    r=compile(rule)
    for st in stringsToTry:
        ok_ko=(r.match(st) is not None)
        # Each message is built as one string so that the output is the
        # same with the print statement and the print function.
        print('\n--------- %s ------------ %s' % (st, ok_ko))
        if not ok_ko:
            print('"'+st+'"')
            print(" "+r.stringError)
        else:
            print("range= %s" % (r.loc_range,))
            print("string= %s" % (r.loc_string,))
00704
00705
00706
00707