libvlfeat: formatter.py Source File

Go to the documentation of this file.
00001 #!/usr/bin/python
00002 # file:        formatter.py
00003 # author:      Andrea Vedaldi
00004 # description: Utility to format MATLAB comments.
00005 
00006 # Copyright (C) 2007-12 Andrea Vedaldi and Brian Fulkerson.
00007 # All rights reserved.
00008 #
00009 # This file is part of the VLFeat library and is made available under
00010 # the terms of the BSD license (see the COPYING file).
00011 
00012 """
00013 MDOC formats the help block of a MATLAB M-file based on a simple set
00014 of rules. Paragraphs, verbatim sections, lists and other structures
00015 are automatically instantiated by looking at blank lines, indentation
00016 and a few decoration symbols.
00017 
00018 The documentation starts at a conventional indentation level N (by
00019 default 2). A block of non-empty lines prefixed by N characters is
00020 considered a paragraph. For instance
00021 
00022  |  Bla bla bla
00023  |  bla bla bla.
00024  |
00025  |  Bla bla.
00026 
00027 generates two paragraphs. If there are more than N white spaces,
00028 then the block is taken verbatim instead (and rendered in <pre> HTML
00029 tags). For instance
00030 
00031  |  Bla bla bla
00032  |   Code Code Code
00033  |
00034  |   Code Code Code
00035 
00036 generates one paragraph followed by one verbatim section.
00037 """
00038 
00039 import xml.dom.minidom
00040 import sys
00041 import os
00042 import re
00043 
00044 __mpname__           = 'MDocFormatter'
00045 __version__          = '0.1'
00046 __date__             = '2008-01-01'
00047 __description__      = 'MDoc formatting module'
00048 __long_description__ = __doc__
00049 __license__          = 'BSD'
00050 __author__           = 'Andrea Vedaldi'
00051 
00052 # terminal
class Terminal:
    """Root of the terminal hierarchy; each input line maps to one terminal."""

    def isa(self, classinfo):
        """Return True if this terminal is an instance of CLASSINFO."""
        return isinstance(self, classinfo)


class E(Terminal):
    """Empty terminal."""


class B(Terminal):
    """Blank line."""
    content = ""


class L(Terminal):
    """Non-blank line; INDENT is the number of leading blanks."""
    indent = 0


class PL(L):
    """Regular line."""


class BL(L):
    """Line with bullet."""
    bullet = None
    inner_indent = 0


class DL(L):
    """Line with description."""
00081 
00082 # --------------------------------------------------------------------
def lex(line):
# --------------------------------------------------------------------
    """
    Convert the string LINE into the terminal symbol it stands for.
    Every line yields exactly one terminal, and the terminal types are
    the leaves of a hierarchy of types.

    The patterns are tried in this order, first hit wins: blank line
    (B), description line containing '::' (DL), bullet line (BL),
    regular line (PL). If none of them applies the function falls
    through and returns None.
    """

    # a blank line: nothing but white space
    if re.match(r"\s*\n?$", line):
        return B()

    # a line of the type '  content::inner_content'
    found = re.match(r"(\s*)(.*)::(.*)\n?$", line)
    if found:
        leading, term, rest = found.groups()
        token = DL()
        token.indent        = len(leading)
        token.content       = term
        token.inner_content = rest
        return token

    # a line of the type '  - inner_content'
    found = re.match(r"(\s*)([-\*#]\s*)(\S.*)\n?$", line)
    if found:
        leading, marker, body = found.groups()
        token = BL()
        token.indent        = len(leading)
        token.inner_content = body
        token.bullet        = marker
        # text following the bullet starts after the marker and its blanks
        token.inner_indent  = token.indent + len(marker)
        token.content       = marker + body
        return token

    # a line of the type  '   content'
    found = re.match(r"(\s*)(\S.*)\n?$", line)
    if found:
        leading, body = found.groups()
        token = PL()
        token.indent  = len(leading)
        token.content = body
        return token

    # no pattern applied
    return None
00122 
00123 # --------------------------------------------------------------------
class Lexer(object):
# --------------------------------------------------------------------
    """
    l = Lexer(LINES) converts each string of the array LINES to a
    terminal (see lex()). The lexer keeps a head that points to the
    current terminal; it starts just before the first one. The head
    is driven by the following methods:

    l.next() advances the head and returns the terminal under it
             (an E terminal once the head is past the last line).
    l.back() moves the head back by one (never before the start).
    l.getpos() returns the head position.
    l.seek(POS) sets the head position to POS.
    l.rewrite(T) replaces the terminal under the head with T.
    """
    def __init__(self, lines):
        self.tokens = [lex(line) for line in lines]
        self.pos    = -1

    def next(self):
        self.pos += 1
        if self.pos < len(self.tokens):
            return self.tokens[self.pos]
        return E()

    def seek(self, pos):
        self.pos = pos

    def back(self):
        # the head never moves before its initial position (-1)
        if self.pos < 0:
            return
        self.pos -= 1

    def rewrite(self, str):
        self.tokens[self.pos] = str

    def getpos(self):
        return self.pos

    def __str__(self):
        # one row per terminal: index, terminal class name, content
        return "".join(
            "%5d) %s %s\n" % (index, token.__class__.__name__, token.content)
            for index, token in enumerate(self.tokens))
00166 
00167 # --------------------------------------------------------------------
class Formatter(object):
# --------------------------------------------------------------------
    """
    f = Formatter(LINES) parses the array of strings LINES. The base
    indentation of the documentation is the number of leading blanks
    of LINES[0] (0 if LINES is empty).

    f = Formatter(LINES, FUNCS) takes the dictionary of functions
    FUNCS. Function names must be uppercase. The dictionary entries
    are used to cross link functions in the generated documentation.

    Formatter(LINES, FUNCS, LINKTYPE) produces links of the specified
    type.  Use 'a' for HTML anchors and 'wiki' for MediaWiki style
    links.

    f.toDOM() processes the data to construct an XML (HTML)
    representation of them.
    """

    # Matches either a function reference 'NAME(...)' (group 'function')
    # or a '<a href="matlab:vl_help('PAGE')">TEXT</a>' page reference
    # (groups 'page' and 'text').
    _LINK_RE = re.compile(
        r'(?:'
        r'(?P<function>[A-Z][A-Z0-9_]*)'
        r'\([^\)]*\)'
        r')|(?:'
        r'<a href="matlab:vl_help\(\''
        r'(?P<page>[a-zA-Z0-9_]*)'
        r'\'\)">'
        r'(?P<text>[^<]*)'
        r'</a>'
        r')')

    def __init__(self, lines, funcs=None, linktype='a'):
        # The base indentation is given by the leading blanks of the
        # first line; an empty LINES simply yields an empty document.
        self.indentinit = 0
        if len(lines) > 0:
            first = lines[0]
            self.indentinit = len(first) - len(first.lstrip(' '))

        self.tokens = Lexer(lines)
        self.xmldoc = xml.dom.minidom.Document()
        # a fresh dictionary per instance (no shared mutable default)
        self.funcs = {} if funcs is None else funcs
        self.linktype = linktype

    def toTextNode(self, s):
        "Returns a DOM text node for S. Byte strings are decoded as"
        " ISO-8859-1; text that is already unicode is used as is."
        if isinstance(s, bytes):
            s = s.decode('iso-8859-1')
        return self.xmldoc.createTextNode(s)

    def addAttr(self, tag, attr, val):
        "Sets the attribute ATTR of the element TAG to VAL."
        x = self.xmldoc.createAttribute(attr)
        x.nodeValue = val
        tag.setAttributeNode(x)

    def addText(self, tag, s):
        "Appends the string S to the element TAG as a text node."
        tag.appendChild(self.toTextNode(s))

    def _makeLink(self, href, label):
        "Returns the node that links LABEL to HREF for the current link"
        " type, or None if the link type is neither 'a' nor 'wiki'."
        if self.linktype == 'a':
            atag = self.xmldoc.createElement(u"a")
            self.addText(atag, label)
            atag.setAttribute(u"href", u"%s" % href)
            return atag
        if self.linktype == 'wiki':
            return self.toTextNode("[[%s|%s]]" % (href, label))
        return None

    def addFancyText(self, tag, s):
        "Adds text while transforming function references to links."
        nodes = []
        last = -1   # index of the last character of S already consumed

        for m in self._LINK_RE.finditer(s):
            func_name = m.group("function")
            page_name = m.group("page")

            if func_name and func_name.upper() in self.funcs:
                # known function: link only the name, the argument
                # list '(...)' is left in the running text
                href  = self.funcs[func_name.upper()]
                label = func_name
                head  = m.start() + len(func_name) - 1
            elif page_name:
                # page reference: the whole <a>...</a> is replaced
                href  = "%%dox:%s;" % page_name
                label = m.group("text")
                head  = m.end() - 1
            else:
                # unknown function: leave the text untouched
                continue

            # add text so far, then the link
            nodes.append(self.toTextNode(s[last+1:m.start()]))
            link = self._makeLink(href, label)
            if link is not None:
                nodes.append(link)

            # set head
            last = head

        nodes.append(self.toTextNode(s[last+1:]))
        for node in nodes:
            tag.appendChild(node)

    def _sameIndentLines(self, indent):
        "Consumes the consecutive non-blank lines whose indentation is"
        " exactly INDENT and returns the list of their contents."
        contents = []
        while True:
            x = self.parse_Terminal(L)
            if x is None:
                break
            if x.indent != indent:
                self.tokens.back()
                break
            contents.append(x.content)
        return contents

    # ................................................................
    # E, B, L, PL, BL, DL, ...
    def parse_Terminal(self, T):
        "If the next terminal on the stream is of type T, the terminal"
        " is extracted and returned. Otherwise the stream is left"
        " untouched and the function returns None."
        pos = self.tokens.getpos()
        t = self.tokens.next()
        if t.isa(T):
            return t
        self.tokens.seek(pos)
        return None

    # ................................................................
    # DIV(N) -> (B | P(N) | BL(N) | DL(N) | V(N))+
    def parse_DIV(self, indent):
        "Parses a DIV(N) symbol. A DIV(N) is a sequence of blank lines"
        " (B) or other blocks at indentation level N, such as"
        " paragraphs P(N), verbatim sections V(N), bullet lists UL(N)"
        " and description lists DL(N). Returns the list of generated"
        " elements, or None if no block was found."
        parsers = (self.parse_P, self.parse_V, self.parse_UL, self.parse_DL)
        xs = []
        while True:
            # blank lines only separate blocks
            if self.parse_Terminal(B):
                continue

            for parse in parsers:
                x = parse(indent)
                if x:
                    xs.append(x)
                    break
            else:
                # nothing matched at this indentation: the DIV ends here
                break
        return xs or None

    # ................................................................
    # P(N) -> PL(N) L(N)*
    def parse_P(self, indent):
        "Parses a paragraph: a regular line at indentation N followed"
        " by any non-blank lines at the same indentation. Returns a"
        " <p> element or None."
        # Introduced by PL
        x = self.parse_Terminal(PL)
        if x is None:
            return None
        if x.indent != indent:
            self.tokens.back()
            return None

        # Continued by zero or more L
        rows = [x.content] + self._sameIndentLines(indent)
        content = "\n" + "".join(row + "\n" for row in rows)

        ptag = self.xmldoc.createElement("p")
        self.addFancyText(ptag, content)
        return ptag

    # ................................................................
    # V(N) -> L(M)+, M > N
    def parse_V(self, indent):
        "Parses a verbatim section: non-blank lines indented by more"
        " than N, possibly separated by blank lines. The indentation"
        " in excess of N is preserved. Returns a <pre> element or None."
        pieces = ["\n"]
        good = False
        pos = self.tokens.getpos()
        while True:
            x = self.parse_Terminal(L)
            if x:
                if x.indent > indent:
                    pieces.append(" "*(x.indent - indent) + x.content + "\n")
                    good = True
                    continue
                else:
                    self.tokens.back()
            x = self.parse_Terminal(B)
            if x:
                pieces.append("\n")
                continue
            break
        if good:
            content = "".join(pieces)
            ptag = self.xmldoc.createElement("pre")
            # remove potential blank line at the end
            if content[-2:] == "\n\n":
                content = content[:-1]
            self.addText(ptag, content)
            return ptag
        self.tokens.seek(pos)
        return None

    # ................................................................
    # UL(N) -> ULI(N)+
    def parse_UL(self, indent):
        "Parses a bullet list: one or more list items at indentation N."
        " Returns a <ul> element or None."
        items = []
        while True:
            item = self.parse_ULI(indent)
            if not item:
                break
            items.append(item)
        if not items:
            return None
        ultag = self.xmldoc.createElement("ul")
        for item in items:
            ultag.appendChild(item)
        return ultag

    # ................................................................
    # ULI(N) -> UL(N,M) L(M)* DIV(M), M > N
    def parse_ULI(self, indent):
        "Parses a list item: a bullet line at indentation N, followed"
        " by lines and blocks aligned with the text after the bullet."
        " Returns a <li> element or None."
        # Introduced by UL
        x = self.parse_Terminal(BL)
        if x is None:
            return None
        if x.indent != indent:
            self.tokens.back()
            return None

        # the rest of the item is aligned with the text after the bullet
        inner = x.inner_indent

        # Continued by zero or more L
        rows = [x.inner_content] + self._sameIndentLines(inner)
        content = "\n" + "".join(row + "\n" for row in rows)

        litag = self.xmldoc.createElement(u"li")
        ptag  = self.xmldoc.createElement(u"p")
        self.addFancyText(ptag, content)
        litag.appendChild(ptag)

        # Continued by DIV
        xs = self.parse_DIV(inner)
        if xs:
            for child in xs:
                litag.appendChild(child)

        return litag

    # ................................................................
    # DL(N) -> DI(N)+
    def parse_DL(self, indent):
        "Parses a description list: one or more description items at"
        " indentation N. Returns a <dl> element or None."
        xs = []
        while True:
            x = self.parse_DI(indent)
            if not x:
                break
            xs += x
        if not xs:
            return None
        dltag = self.xmldoc.createElement(u"dl")
        for x in xs:
            dltag.appendChild(x)
        return dltag

    # ................................................................
    # DI(N) -> DL(N) DIV(M)?, M > N
    def parse_DI(self, indent):
        "Parses a description item: a 'term::defaults' line at"
        " indentation N optionally followed by a more indented body."
        " Returns the list [<dt>] or [<dt>, <dd>], or None."
        # Introduced by DL
        x = self.parse_Terminal(DL)
        if x is None:
            return None
        if x.indent != indent:
            self.tokens.back()
            return None

        # adds text after :: as part of the description term dt
        dttag = self.xmldoc.createElement(u"dt")
        dttag.appendChild(self.toTextNode("\n" + x.content + "\n"))
        c = x.inner_content.strip()
        if len(c) > 0:
            deftag = self.xmldoc.createElement(u"span")
            self.addAttr(deftag, "class", "defaults")
            self.addText(deftag, c)
            dttag.appendChild(deftag)
        xs = [dttag]

        # Continued by DIV: peek at the next terminal to find the
        # indentation of the body without consuming it
        t = self.tokens.next()
        self.tokens.back()
        if t.isa(L) and t.indent > indent:
            body = self.parse_DIV(t.indent)
            if body:
                ddtag = self.xmldoc.createElement(u"dd")
                for child in body:
                    ddtag.appendChild(child)
                xs.append(ddtag)

        return xs

    # ................................................................
    def toDOM(self):
        "Parses the documentation and returns the XML document, whose"
        " root is a <div class=\"documentation\"> element. The root is"
        " left empty if no block is found."
        xmf = self.xmldoc.createElement("div")
        xmf.setAttribute(u"class", u"documentation")

        self.xmldoc.appendChild(xmf)

        # parse documentation (parse_DIV returns None if there is none)
        for x in self.parse_DIV(self.indentinit) or []:
            xmf.appendChild(x)

        return self.xmldoc
00545 
00546 
if __name__ == '__main__':
    # Demo: format a sample help block that exercises paragraphs,
    # page and function links, verbatim sections, nested bullet lists
    # and description lists, then dump the resulting XML to stdout.
    text=""" Lorem Ipsum is simply dummy text of the printing and typesetting
 industry. Lorem Ipsum has been the industry's standard dummy text
 ever since the 1500s, when an unknown printer took a galley of type
 and scrambled it to make a type specimen book. It has survived not
 only five centuries, but also the leap into electronic typesetting,
 remaining essentially unchanged. It was popularised in the 1960s with
 the release of Letraset sheets containing Lorem Ipsum passages, and
 more recently with desktop publishing software like Aldus PageMaker
 including versions of Lorem Ipsum.

 Also <a href="matlab:vl_help('fisher')">Fisher vectors</a>.

 These are links BL(), BL(A,B) and BLA(A,A) (as long as the dictionary
 cites them).

 Mimamama
   verbatim1
   verbatim2
   verbatim3

   verbatim4
   verbatim5
 Lorem Ipsum is simply dummy text of the printing and typesetting
 industry. Lorem Ipsum has been the industry's standard dummy text
 ever since the 1500s, when an unknown printer took a galley of type
 and scrambled it to make a type specimen book. It has survived not
 only five centuries, but also the leap into electronic typesetting,
 remaining essentially unchanged. It was popularised in the 1960s with
 the release of Letraset sheets containing Lorem Ipsum passages, and
 more recently with desktop publishing software like Aldus PageMaker
 including versions of Lorem Ipsum.

 - outer1 /
   outer1 line 2 /
   outer1 line 3 /

   outer1 new paragarph

   - inner1
   - inner2
   - inner3
     continued on next line
       continued with verbatim

       more verbatim after blank
   - inner4
 - outer again
 - outer
 bla

 - list2
 - list4
 - BL()
 - BL(A,B)

 Test descrition::
     Lorem Ipsum is simply dummy text of the printing
     and typesetting industry. Lorem Ipsum has been the industry's
     standard dummy text ever since the 1500s, when an unknown printer
     took a galley of type and scrambled it to make a type specimen
     book. It has survived not only five centuries, but also the leap
     into electronic typesetting, remaining essentially unchanged. It
     was popularised in the 1960s with the release of Letraset sheets
     containing Lorem Ipsum passages, and more recently with desktop
     publishing software like Aldus PageMaker including versions of
     Lorem Ipsum.

 Ancora::
     Bli bli bli
     Blu blu blu

     - list
     - lust
     - last

     Bli bla

  Verbatimmo
"""
    lines = text.splitlines()
    formatter = Formatter(lines, {'BL':'http://www.google.com'}, 'a')

    # toxml("UTF-8") yields encoded bytes, so write them to the binary
    # layer of stdout where there is one (Python 3) and to stdout
    # itself otherwise (Python 2). The trailing newline matches what
    # the print statement used to emit.
    xml_bytes = formatter.toDOM().toxml("UTF-8")
    out = getattr(sys.stdout, "buffer", sys.stdout)
    out.write(xml_bytes + b"\n")