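# trans.py -- translation string extraction and merge tool.
#
# Walks the template tree, pulls user-visible strings out of ClearSilver
# (.cs/.cst), HTML, and HDF files, records them in the translation
# database (db_trans), and writes per-language HDF string maps plus
# rewritten copies of the templates under <root>/gen.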
import sys, string, os, getopt, pwd, signal, time, re
import fcntl

import tstart

import db_trans
from log import *
import neo_cgi, neo_util
import odb

eTransError = "eTransError"
eNoString = "eNoString"

DONE = 0
DEBUG = 0

TIER2_DIV = 11
TIER1_DIV = 11 * TIER2_DIV

if not DEBUG: LOGGING_STATUS[DEV_UPDATE] = 0

def handleSignal(*arg):
    global DONE
    DONE = 1

def usage(progname):
    print "usage: %s [-f <file>] [-v <level>] [--load <mapfile>] [--lang <lang>] [--help]" % progname

def exceptionString():
    import StringIO, traceback

    sfp = StringIO.StringIO()
    traceback.print_exc(file=sfp)
    exception = sfp.getvalue()
    sfp.close()

    return exception

class TransLoc:
    def __init__ (self, string_id, filename, location):
        self.string_id = string_id
        self.filename = filename
        self.location = location

class Translator:
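    """Finds translatable strings in ClearSilver templates, HTML, and HDF
    files, stores them in the translation database, and dumps per-language
    HDF maps and rewritten template files."""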
    _HTML_TAG_RE = None
    _HTML_TAG_REGEX = '<[^!][^>]*?>'
    _HTML_CMT_RE = None
    _HTML_CMT_REGEX = '<!--.*?-->'
    _CS_TAG_RE = None
    _CS_TAG_REGEX = '<\\?.+?\\?>'

    def __init__ (self):
        self.tdb = db_trans.trans_connect()

        self.root = "testroot"
        self.languages = ['es', 'en']

        self.ignore_paths = ['tmpl/m']
        self.ignore_files = ['blah_ignore.cs']

        self.ignore_patterns = ['tmpl/[^ ]*_js.cs']

        if self.root is None:
            raise eTransError, "Unable to determine installation root"

        if Translator._HTML_TAG_RE is None:
            Translator._HTML_TAG_RE = re.compile(Translator._HTML_TAG_REGEX, re.MULTILINE | re.DOTALL)
        if Translator._HTML_CMT_RE is None:
            Translator._HTML_CMT_RE = re.compile(Translator._HTML_CMT_REGEX, re.MULTILINE | re.DOTALL)
        if Translator._CS_TAG_RE is None:
            Translator._CS_TAG_RE = re.compile(Translator._CS_TAG_REGEX, re.MULTILINE | re.DOTALL)

        self._html_state = 0

    def parseHTMLTag(self, data):
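        """Scan the inside of an HTML start tag (the text between '<' and
        '>') and return a list of (string, offset, ishtml) tuples for
        translatable attribute values -- currently only the value attribute
        of <input type=submit> and <input type=button>."""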
        i = 0
        if len(data) == 0: return []
        if data[0] in '/?': return []
        while i < len(data) and data[i] not in ' \n\r\t>': i = i + 1
        if i == len(data): return []
        tag = data[:i].lower()

        results = []
        attrfind = re.compile(
            r'\s*([a-zA-Z_][-.a-zA-Z_0-9]*)(\s*=\s*'
            r'(\'[^\']*\'|"[^"]*"|[^ \t\n<>]*))?')
        k = i
        attrs = {}
        attrs_beg = {}
        while k < len(data):
            match = attrfind.match(data, k)
            if not match: break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                attrvalue = attrname
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            attrname = attrname.lower()
            if attrs.has_key(attrname):
                log("Can't handle duplicate attrs: %s" % attrname)
            attrs[attrname] = attrvalue
            attrs_beg[attrname] = match.start(3)
            k = match.end(0)

        find_l = []
        if tag == "input":
            if attrs.get('type', "").lower() in ["submit", "button"]:
                find_l.append((attrs.get('value', ''), attrs_beg.get('value', 0)))

        for s, k in find_l:
            if s:
                x = data[k:].find(s)
                if x != -1: results.append((s, x + k, 1))

        return results

    def parseHTML(self, data, reset=1):
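        """Walk through HTML data and return (string, offset, ishtml)
        tuples for the translatable text: the text between tags, plus any
        strings parseHTMLTag() finds inside tags.  Unterminated tags and
        comments are tracked in self._html_state so parsing can continue
        across chunks when reset=0."""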
        if reset: self._html_state = 0
        if DEBUG: print "- %d ---------\n%s\n- E ---------" % (self._html_state, data)

        results = []
        i = 0
        n = len(data)

        if self._html_state:
            # finish a tag or comment left open by the previous chunk
            if self._html_state == 2:
                x = string.find(data[i:], '-->')
                l = 3
            else:
                x = string.find(data[i:], '>')
                l = 1
            if x == -1: return results
            i = i + x + l
            self._html_state = 0
        while i < n:
            if DEBUG: print "MATCHING>%s<MATCHING" % data[i:]
            cmt_b = string.find(data[i:], '<!--')
            cmt_e = string.find(data[i:], '-->')
            tag_b = string.find(data[i:], '<')
            tag_e = string.find(data[i:], '>')
            if DEBUG: print "B> %d %d %d %d <B" % (cmt_b, cmt_e, tag_b, tag_e)
            if cmt_b != -1 and cmt_b <= tag_b:
                # text before a comment
                x = i
                y = i + cmt_b - 1
                while x < y and data[x] in string.whitespace: x += 1
                while y > x and data[y] in string.whitespace: y -= 1
                results.append((data[x:y+1], x, 1))
                if cmt_e == -1:
                    self._html_state = 2
                    break
                i = i + cmt_e + 3
            elif tag_b != -1:
                # text before a tag
                x = i
                y = i + tag_b - 1
                while x < y and data[x] in string.whitespace: x += 1
                while y > x and data[y] in string.whitespace: y -= 1
                results.append((data[x:y+1], x, 1))
                if tag_e == -1:
                    self._html_state = 1
                    break
                h_results = self.parseHTMLTag(data[i+tag_b+1:i+tag_e])
                h_results = map(lambda x: (x[0], x[1] + i + tag_b + 1, x[2]), h_results)
                results = results + h_results
                i = i + tag_e + 1
            else:
                # trailing text with no more tags
                x = i
                y = n - 1
                while x < y and data[x] in string.whitespace: x += 1
                while y > x and data[y] in string.whitespace: y -= 1
                results.append((data[x:y+1], x, 1))
                break
        return results

    def parseCS(self, data):
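        """Split ClearSilver template data on <?cs ... ?> tags and run
        parseHTML() over the non-ClearSilver pieces, keeping HTML state
        across the pieces.  Returns (string, offset, ishtml) tuples with
        offsets relative to the original data."""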
        results = []
        i = 0
        n = len(data)
        while i < n:
            m = Translator._CS_TAG_RE.search(data, i)
            if not m:
                # no more complete CS tags; keep everything up to a
                # trailing unterminated '<?' if there is one
                x = string.find(data[i:], '<?')
                if x == -1:
                    results.append((data[i:], i))
                else:
                    results.append((data[i:i+x], i))
                break
            (b, e) = m.span()
            if i != b: results.append((data[i:b], i))
            i = e
        t_results = []
        self._html_state = 0
        for (s, ofs) in results:
            r = self.parseHTML(s, reset=0)
            r = map(lambda x: (x[0], x[1] + ofs, x[2]), r)
            t_results = t_results + r
        return t_results

    def descendHDF(self, obj, prefix):
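        """Recursively walk an HDF tree and return (value, full.hdf.name, 0)
        tuples for every node that has a value and a Lang attribute."""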
        results = []
        while obj is not None:
            if obj.value():
                attrs = obj.attrs()
                attrs = map(lambda x: x[0], attrs)
                if "Lang" in attrs:
                    if prefix:
                        results.append((obj.value(), "%s.%s" % (prefix, obj.name()), 0))
                    else:
                        results.append((obj.value(), "%s" % (obj.name()), 0))
            if obj.child():
                if prefix:
                    results = results + self.descendHDF(obj.child(), "%s.%s" % (prefix, obj.name()))
                else:
                    results = results + self.descendHDF(obj.child(), (obj.name()))
            obj = obj.next()
        return results

    def parseHDF(self, data):
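        """Parse HDF file data and return the translatable values found by
        descendHDF()."""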
        hdf = neo_util.HDF()
        hdf.readString(data, 1)
        return self.descendHDF(hdf, "")

    def handleFile(self, file):
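        """Extract strings from a single file (path relative to self.root),
        dispatching on extension: .cs/.cst, .html/.htm, or .hdf.  Returns a
        list of (string, offset-or-hdf-name, ishtml) tuples."""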
        if file in self.ignore_files: return []
        for a_re in self.ignore_patterns:
            if re.match(a_re, file):
                return []
        fpath = self.root + '/' + file
        x = string.rfind(file, '.')
        if x == -1: return []
        data = open(fpath, 'r').read()
        ext = file[x:]
        strings = []
        if ext in ['.cst', '.cs']:
            strings = self.parseCS(data)
        elif ext in ['.html', '.htm']:
            strings = self.parseHTML(data)
        elif ext in ['.hdf']:
            strings = self.parseHDF(data)
        if len(strings):
            print "Found %d strings in %s" % (len(strings), file)
            return strings
        return []

    def walkDirectory(self, path):
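        """Recursively collect strings from every file under
        self.root/path, skipping ignored paths, dotfiles, and "release"
        directories.  Returns a list of (filename, strings) pairs."""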
        if path in self.ignore_paths: return []
        fpath = self.root + '/' + path
        files = os.listdir(fpath)
        dirs = []
        results = []
        for file in files:
            if file[0] == '.': continue
            fname = fpath + '/' + file
            if os.path.isdir(fname):
                dirs.append(file)
            else:
                strings = self.handleFile(path + '/' + file)
                if len(strings):
                    results.append((path + '/' + file, strings))
        for dir in dirs:
            if dir not in ["release"]:
                results = results + self.walkDirectory(path + '/' + dir)
        return results

    def cleanHtmlString(self, s):
        # collapse runs of whitespace and strip leading/trailing whitespace
        s = re.sub(r"\s+", " ", s)
        return string.strip(s)

    def containsWords(self, s, ishtml):
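        """Return 1 if s contains at least one ASCII letter or digit (after
        decoding a few common entities when ishtml is true), i.e. if it is
        worth translating."""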
        if ishtml:
            s = string.replace(s, '&nbsp;', ' ')
            s = string.replace(s, '&quot;', '"')
            s = string.replace(s, '&copy;', '')
            s = string.replace(s, '&lt;', '<')
            s = string.replace(s, '&gt;', '>')
            s = string.replace(s, '&amp;', '&')
        for x in range(len(s)):
            n = ord(s[x])
            if (n > 47 and n < 58) or (n > 64 and n < 91) or (n > 96 and n < 123): return 1
        return 0

    def findString(self, s):
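        """Return the string_id for s, inserting a new row in the strings
        table if it isn't there yet.  Raises eTransError if the string is
        present more than once."""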
        rows = self.tdb.strings.fetchRows( ('string', s) )
        if len(rows) == 0:
            row = self.tdb.strings.newRow()
            row.string = s
            row.save()
            return row.string_id
        elif len(rows) > 1:
            raise eTransError, "String %s exists multiple times!" % s
        else:
            return rows[0].string_id

    def loadStrings(self, one_file=None, verbose=0):
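        """Extract strings from one file (or the whole tmpl tree), write a
        gettext-style "map" file for reference, make sure each unique string
        has a row in the database, and return a list of TransLoc objects
        recording where each string was found."""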
        if one_file is not None:
            strings = self.handleFile(one_file)
            results = [(one_file, strings)]
        else:
            results = self.walkDirectory('tmpl')
        uniq = {}
        cnt = 0
        seen_hdf = {}
        for fname, strings in results:
            for (s, ofs, ishtml) in strings:
                if s and string.strip(s):
                    l = len(s)
                    if ishtml:
                        s = self.cleanHtmlString(s)
                    if self.containsWords(s, ishtml):
                        if type(ofs) == type(""):
                            # HDF strings are keyed by name, not offset
                            if seen_hdf.has_key(ofs):
                                if seen_hdf[ofs][0] != s:
                                    log("Duplicate HDF Name %s:\n\t file %s = %s\n\t file %s = %s" % (ofs, seen_hdf[ofs][1], seen_hdf[ofs][0], fname, s))
                            else:
                                seen_hdf[ofs] = (s, fname)
                        try:
                            uniq[s].append((fname, ofs, l))
                        except KeyError:
                            uniq[s] = [(fname, ofs, l)]
                        cnt = cnt + 1
        print "%d strings, %d unique" % (cnt, len(uniq.keys()))
        fp = open("map", 'w')
        for (s, locs) in uniq.items():
            locs = map(lambda x: "%s:%s:%d" % x, locs)
            fp.write('#: %s\n' % (string.join(locs, ',')))
            fp.write('msgid=%s\n\n' % repr(s))
        fp.close()

        log("Loading strings/locations into database")
        locations = []
        for (s, locs) in uniq.items():
            s_id = self.findString(s)
            for (fname, ofs, l) in locs:
                if type(ofs) == type(""):
                    location = "hdf:%s" % ofs
                else:
                    location = "ofs:%d:%d" % (ofs, l)
                loc_r = TransLoc(s_id, fname, location)
                locations.append(loc_r)
        return locations

    def stringsHDF(self, prefix, locations, lang='en', exist=0, tiered=0):
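        """Build an HDF tree of the strings referenced by locations for one
        language, keyed under prefix.  With exist=1, include only strings
        that have no translation for that language; with tiered=1, nest the
        keys as prefix.<id/TIER1_DIV>.<id/TIER2_DIV>.<id>."""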
        hdf = neo_util.HDF()
        if exist and lang == 'en': return hdf
        done = {}
        locations.sort()
        maps = self.tdb.maps.fetchRows( ('lang', lang) )
        maps_d = {}
        for m in maps:
            maps_d[int(m.string_id)] = m
        strings = self.tdb.strings.fetchRows()
        strings_d = {}
        for s in strings:
            strings_d[int(s.string_id)] = s
        count = 0
        for loc in locations:
            s_id = int(loc.string_id)
            if done.has_key(s_id): continue
            try:
                s_row = maps_d[s_id]
                if exist: continue
            except KeyError:
                try:
                    s_row = strings_d[s_id]
                except KeyError:
                    log("Missing string_id %d, skipping" % s_id)
                    continue
            count = count + 1
            if tiered:
                hdf.setValue("%s.%d.%d.%s" % (prefix, int(s_id) / TIER1_DIV, int(s_id) / TIER2_DIV, s_id), s_row.string)
            else:
                hdf.setValue("%s.%s" % (prefix, s_id), s_row.string)
            done[s_id] = 1
        if exist == 1: log("Missing %d strings for lang %s" % (count, lang))
        return hdf

    def dumpStrings(self, locations, lang=None):
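        """Write strings_<lang>.hdf for each language (or the given one),
        plus strings_missing_<lang>.hdf listing strings that still lack a
        translation."""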
        log("Dumping strings to HDF")
        if lang is None:
            langs = ['en']
            sql = "select lang from nt_trans_maps group by lang"
            cursor = self.tdb.defaultCursor()
            cursor.execute(sql)
            rows = cursor.fetchall()
            for row in rows:
                langs.append(row[0])
        else:
            langs = [lang]

        for a_lang in langs:
            hdf = self.stringsHDF('S', locations, a_lang)
            hdf.writeFile("strings_%s.hdf" % a_lang)

        for a_lang in langs:
            hdf = self.stringsHDF('S', locations, a_lang, exist=1)
            if hdf.child():
                hdf.writeFile("strings_missing_%s.hdf" % a_lang)

    def fetchString(self, s_id, lang):
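        """Return the text for string_id in the given language, falling back
        to the English source string when no translation exists.  When lang
        is "hdf", return a <?cs var: ?> reference into Lang.Extracted
        instead of literal text."""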
        if lang == "hdf":
            return "<?cs var:Lang.Extracted.%d.%d.%s ?>" % (int(s_id) / TIER1_DIV, int(s_id) / TIER2_DIV, s_id)
        rows = self.tdb.maps.fetchRows( [('string_id', s_id), ('lang', lang)] )
        if len(rows) == 0:
            try:
                row = self.tdb.strings.fetchRow( ('string_id', s_id) )
            except odb.eNoMatchingRows:
                log("Unable to find string id %s" % s_id)
                raise eNoString
            if lang != 'en':
                log("Untranslated string for id %s" % s_id)
            return row.string
        else:
            return rows[0].string

    def dumpFiles(self, locations, lang):
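        """Rewrite every file that contains extracted strings into
        <root>/gen, replacing each string with its translation (or with a
        <?cs var: ?> reference when lang is "hdf"), and write the
        lang_<lang>.hdf / lang_map.hdf files that the templates include."""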
        log("Dumping files for %s" % lang)
        files = {}
        for row in locations:
            try:
                files[row.filename].append(row)
            except KeyError:
                files[row.filename] = [row]

        hdf_map = []

        os.system("rm -rf %s/gen/tmpl" % (self.root))
        for file in files.keys():
            fname = "%s/gen/%s" % (self.root, file)
            try:
                os.makedirs(os.path.dirname(fname))
            except OSError, reason:
                if reason[0] != 17:   # ignore EEXIST
                    raise
            do_hdf = 0
            x = string.rfind(file, '.')
            if x != -1 and file[x:] == '.hdf':
                do_hdf = 1
            ofs = []
            for loc in files[file]:
                parts = string.split(loc.location, ':')
                if len(parts) == 3 and parts[0] == 'ofs' and do_hdf == 0:
                    ofs.append((int(parts[1]), int(parts[2]), loc.string_id))
                elif len(parts) == 2 and parts[0] == 'hdf' and do_hdf == 1:
                    hdf_map.append((parts[1], loc.string_id))
                else:
                    log("Invalid location for string_id %s" % loc.string_id)
                    continue
            if not do_hdf:
                ofs.sort()
                data = open(self.root + '/' + file).read()

                x = 0
                n = len(data)
                out = []

                while len(ofs):
                    if ofs[0][0] > x:
                        out.append(data[x:ofs[0][0]])
                        x = ofs[0][0]
                    elif ofs[0][0] == x:
                        out.append(self.fetchString(ofs[0][2], lang))
                        x = ofs[0][0] + ofs[0][1]
                        ofs = ofs[1:]
                    else:
                        log("How did we get here? %s x=%d ofs=%d sid=%d" % (file, x, ofs[0][0], ofs[0][2]))
                        log("Data[x:x+20]: %s" % data[x:x+20])
                        log("Data[ofs:ofs+20]: %s" % data[ofs[0][0]:ofs[0][0]+20])
                        break
                if n > x:
                    out.append(data[x:])
                odata = string.join(out, '')
                open(fname, 'w').write(odata)

        if lang == "hdf":
            langs = self.languages
        else:
            langs = [lang]

        for d_lang in langs:
            hdf = self.stringsHDF('Lang.Extracted', locations, d_lang, tiered=1)
            fname = "%s/gen/tmpl/lang_%s.hdf" % (self.root, d_lang)
            hdf.writeFile(fname)
            data = open(fname).read()
            fp = open(fname, 'w')
            fp.write('## AUTOMATICALLY GENERATED -- DO NOT EDIT\n\n')
            fp.write(data)
            fp.write('\n#include "lang_map.hdf"\n')
            fp.close()

            if d_lang == "en":
                map_file = "%s/gen/tmpl/lang_map.hdf" % (self.root)
            else:
                map_file = "%s/gen/tmpl/%s/lang_map.hdf" % (self.root, d_lang)
            try:
                os.makedirs(os.path.dirname(map_file))
            except OSError, reason:
                if reason[0] != 17: raise   # ignore EEXIST
            map_hdf = neo_util.HDF()
            for (name, s_id) in hdf_map:
                s = hdf.getValue('Lang.Extracted.%d.%d.%s' % (int(s_id) / TIER1_DIV, int(s_id) / TIER2_DIV, s_id), '')
                map_hdf.setValue(name, s)
            map_hdf.writeFile(map_file)

    def loadMap(self, file, prefix, lang):
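        """Load translated strings for one language from an HDF file (in
        the same S.<string_id> layout that dumpStrings writes) back into
        the maps table, creating or updating rows as needed."""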
        log("Loading map for language %s" % lang)
        hdf = neo_util.HDF()
        hdf.readFile(file)
        obj = hdf.getChild(prefix)
        updates = 0
        new_r = 0
        while obj is not None:
            s_id = obj.name()
            value = obj.value()

            try:
                map_r = self.tdb.maps.fetchRow( [('string_id', s_id), ('lang', lang)] )
            except odb.eNoMatchingRows:
                map_r = self.tdb.maps.newRow()
                map_r.string_id = s_id
                map_r.lang = lang
                new_r = new_r + 1

            if map_r.string != value:
                updates = updates + 1
                map_r.string = value
                map_r.save()

            obj = obj.next()
        log("New maps: %d Updates: %d" % (new_r, updates - new_r))


def main(argv):
    alist, args = getopt.getopt(argv[1:], "f:v:", ["help", "load=", "lang="])

    one_file = None
    verbose = 0
    load_file = None
    lang = 'en'
    for (field, val) in alist:
        if field == "--help":
            usage(argv[0])
            return -1
        if field == "-f":
            one_file = val
        if field == "-v":
            verbose = int(val)
        if field == "--load":
            load_file = val
        if field == "--lang":
            lang = val

    global DONE

    log("trans: start")

    start_time = time.time()

    try:
        t = Translator()
        if load_file:
            t.loadMap(load_file, 'S', lang)
        else:
            locations = t.loadStrings(one_file, verbose=verbose)
            t.dumpStrings(locations)
            t.dumpFiles(locations, 'hdf')
    except KeyboardInterrupt:
        pass
    except:
        import handle_error
        handle_error.handleException("Translation Error")

if __name__ == "__main__":
    main(sys.argv)