Package translate :: Package storage :: Module lisa
[hide private]
[frames | no frames]

Source Code for Module translate.storage.lisa

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2006-2007 Zuza Software Foundation 
  5  #  
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  #  
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21  # 
 22   
 23  """Parent class for LISA standards (TMX, TBX, XLIFF)""" 
 24   
 25  import re 
 26   
 27  from translate.storage import base 
 28  from translate.lang import data 
 29  try: 
 30      from lxml import etree 
 31  except ImportError, e: 
 32      raise ImportError("lxml is not installed. It might be possible to continue without support for XML formats.") 
 33   
def getText(node):
    """Return the text of node, including the text of all its descendants.

    node is expected to be an lxml.etree element.  When it has children the
    XPath string() function is used to join the text of the element and of
    everything below it.  When it has no children node.text is returned
    (converted with data.forceunicode), or u"" if there is no text.
    """
    # node.xpath is very slow, so we only use it if there are children
    # TODO: consider rewriting by iterating over children
    # len() is the explicit way of testing for children; truth-testing an
    # lxml element directly is deprecated and triggers a FutureWarning.
    if node is not None and len(node):
        return node.xpath("string()")  # specific to lxml.etree
    # if node.text is None, we want to return "" since the tag is there
    return data.forceunicode(node.text) or u""
def _findAllMatches(text, re_obj):
    """Generate match objects for all non-overlapping @re_obj matches in @text.

    @text is the string to search and @re_obj a compiled regular expression.
    Matches are yielded from left to right.

    The scanning is delegated to re_obj.finditer().  The earlier hand-written
    search loop restarted each search at the end of the previous match, which
    never advanced (and so never terminated) for a pattern that can match the
    empty string.
    """
    for match in re_obj.finditer(text):
        yield match
# Regular expressions (one capturing group each) for the pieces of text that
# are treated as placeholders.  Raw strings are used throughout so that the
# backslashes reach the regular expression engine unchanged.
placeholders = [
    r'(%[diouxXeEfFgGcrs])',  # printf style conversions such as %d or %s
    r'(\\+.?)',               # backslash escapes such as \n
    r'(%[0-9]\$lx)',          # positional form %1$lx ("$" must be escaped,
                              # otherwise it anchors at the end of the string
                              # and the pattern can never match)
    r'(%[0-9]\$[a-z])',       # positional conversions such as %1$s
    r'(<.+?>)',               # anything that looks like a tag
]
re_placeholders = [re.compile(ph) for ph in placeholders]


def _getPhMatches(text):
    """Return a list of regexp match objects for all placeholders in @text.

    Every pattern in re_placeholders is applied to @text.  The matches are
    returned in the order in which they occur in @text and never overlap:
    when several patterns match at the same position the longest match is
    kept, and a match starting inside an already accepted match is dropped.
    """
    matches = []
    for re_ph in re_placeholders:
        matches.extend(re_ph.finditer(text))

    # sort them so they come sequentially; on equal start the longest first
    matches.sort(key=lambda m: (m.start(), -m.end()))

    # drop overlapping matches so that no part of the text is claimed twice
    sequential = []
    covered = 0
    for m in matches:
        if m.start() >= covered:
            sequential.append(m)
            covered = m.end()
    return sequential
# The namespace URI that the reserved "xml" prefix (xml:lang, xml:space) maps to
XML_NS = 'http://www.w3.org/XML/1998/namespace'


def setXMLlang(node, lang):
    """Store lang as the value of the xml:lang attribute of node."""
    attribute = "{%s}lang" % XML_NS
    node.set(attribute, lang)
71
def setXMLspace(node, value):
    """Store value as the value of the xml:space attribute of node."""
    attribute = "{%s}space" % XML_NS
    node.set(attribute, value)
75
def namespaced(namespace, name):
    """Returns name in Clark notation within the given namespace.

    For example namespaced("urn:oasis:names:tc:xliff:document:1.1", "source")
    returns {urn:oasis:names:tc:xliff:document:1.1}source
    This is needed throughout lxml.

    If namespace is empty or None, name is returned unchanged.
    """
    if not namespace:
        return name
    return "{%s}%s" % (namespace, name)
87
class LISAunit(base.TranslationUnit):
    """A single unit in the file.

    The unit wraps an lxml element (self.xmlelement).  The direct children of
    that element named by languageNode each hold the text for one language;
    the first one is treated as the source and the second one as the target.
    Provisional work is done to make several languages possible."""

    #The name of the root element of this unit type:(termEntry, tu, trans-unit)
    rootNode = ""
    #The name of the per language element of this unit type:(termEntry, tu, trans-unit)
    languageNode = ""
    #The name of the innermost element of this unit type:(term, seg)
    textNode = ""

    #The XML namespace used to qualify element names (see namespaced())
    namespace = None

    def __init__(self, source, empty=False):
        """Constructs a unit containing the given source string.

        With empty=True nothing at all is initialised; this is used by
        createfromxmlElement(), which attaches an existing element itself."""
        if empty:
            return
        self.xmlelement = etree.Element(self.rootNode)
        #add descrip, note, etc.

        super(LISAunit, self).__init__(source)

    def __eq__(self, other):
        """Compares two units.

        Units are equal when they have the same number of language nodes and
        the text of each pair of corresponding language nodes is equal."""
        # Anything that does not look like a LISA unit (None, for example)
        # can not be compared here; let Python fall back to its default.
        if not (hasattr(other, "getlanguageNodes") and hasattr(other, "getNodeText")):
            return NotImplemented
        languageNodes = self.getlanguageNodes()
        otherlanguageNodes = other.getlanguageNodes()
        if len(languageNodes) != len(otherlanguageNodes):
            return False
        for mynode, othernode in zip(languageNodes, otherlanguageNodes):
            if self.getNodeText(mynode) != other.getNodeText(othernode):
                #TODO:^ maybe we want to take children and notes into account
                return False
        return True

    def namespaced(self, name):
        """Returns name in Clark notation.

        For example namespaced("source") in an XLIFF document might return
        {urn:oasis:names:tc:xliff:document:1.1}source
        This is needed throughout lxml.
        """
        return namespaced(self.namespace, name)

    def setsource(self, source, sourcelang='en'):
        """Sets the source string (first language), replacing an existing one."""
        source = data.forceunicode(source)
        languageNodes = self.getlanguageNodes()
        sourcelanguageNode = self.createlanguageNode(sourcelang, source, "source")
        if len(languageNodes) > 0:
            # NOTE(review): this replaces the first child of the unit, which
            # assumes that the first child is the first language node.
            self.xmlelement[0] = sourcelanguageNode
        else:
            self.xmlelement.append(sourcelanguageNode)

    def getsource(self):
        """Returns the text of the first language node (None if there is none)."""
        return self.getNodeText(self.getlanguageNode(lang=None, index=0))
    source = property(getsource, setsource)

    def settarget(self, text, lang='xx', append=False):
        """Sets the "target" string (second language), or alternatively appends to the list"""
        #XXX: we really need the language - can't really be optional
        text = data.forceunicode(text)
        #Firstly deal with reinitialising to None or setting to identical string
        if self.gettarget() == text:
            return
        languageNodes = self.getlanguageNodes()
        assert len(languageNodes) > 0
        if text is not None:
            languageNode = self.createlanguageNode(lang, text, "target")
            if append or len(languageNodes) == 1:
                self.xmlelement.append(languageNode)
            else:
                # goes in before the old target, which is removed below
                self.xmlelement.insert(1, languageNode)
        if not append and len(languageNodes) > 1:
            self.xmlelement.remove(languageNodes[1])

    def gettarget(self, lang=None):
        """retrieves the "target" text (second entry), or the entry in the
        specified language, if it exists"""
        if lang:
            node = self.getlanguageNode(lang=lang)
        else:
            node = self.getlanguageNode(lang=None, index=1)
        return self.getNodeText(node)
    target = property(gettarget, settarget)

    def createlanguageNode(self, lang, text, purpose=None):
        """Returns a xml Element setup with given parameters to represent a
        single language entry. Has to be overridden."""
        return None

    def createPHnodes(self, parent, text):
        """Create the text node in parent containing all the ph tags.

        Every placeholder found by _getPhMatches() becomes a <ph> child of
        parent with a sequential id (starting at 1) and the placeholder as
        its text.  The text in between becomes the text of parent or the
        tail of the preceding <ph>."""
        matches = _getPhMatches(text)
        if not matches:
            parent.text = text
            return

        # Now we know there will definitely be some ph tags
        start = matches[0].start()
        pretext = text[:start]
        if pretext:
            parent.text = pretext
        lasttag = parent
        phid = 0
        for m in matches:
            if m.start() < start:
                # This match overlaps the placeholder created just before it.
                # Emitting it as well would move start backwards and so
                # duplicate part of the text.
                continue
            #pretext
            pretext = text[start:m.start()]
            # this will never happen with the first ph tag
            if pretext:
                lasttag.tail = pretext
            #ph node
            phid += 1
            phnode = etree.SubElement(parent, "ph")
            phnode.set("id", str(phid))
            phnode.text = m.group()
            lasttag = phnode
            start = m.end()
        #post text
        if text[start:]:
            lasttag.tail = text[start:]

    def getlanguageNodes(self):
        """Returns a list of all nodes that contain per language information."""
        return self.xmlelement.findall(self.namespaced(self.languageNode))

    def getlanguageNode(self, lang=None, index=None):
        """Retrieves a languageNode either by language or by index.

        A non-empty lang selects the first language node whose xml:lang
        attribute equals lang; otherwise index selects by position.  Returns
        None if there is no such node.  Raises KeyError if both lang and
        index are None."""
        if lang is None and index is None:
            raise KeyError("No criterea for languageNode given")
        languageNodes = self.getlanguageNodes()
        if lang:
            for node in languageNodes:
                if node.get("{%s}lang" % XML_NS) == lang:
                    return node
        else:#have to use index
            # index is still None when lang was given as an empty string
            if index is None or index >= len(languageNodes):
                return None
            else:
                return languageNodes[index]
        return None

    def getNodeText(self, languageNode):
        """Retrieves the term from the given languageNode.

        Returns None if languageNode is None, or if textNode is set but the
        languageNode contains no such element."""
        if languageNode is None:
            return None
        if self.textNode:
            # the first matching descendant holds the text
            term = languageNode.find('.//%s' % self.namespaced(self.textNode))
            if term is None:
                return None
            return getText(term)
        else:
            return getText(languageNode)

    def __str__(self):
        """Returns the XML of this unit, pretty printed and encoded as UTF-8."""
        return etree.tostring(self.xmlelement, pretty_print=True, encoding='utf-8')

    def createfromxmlElement(cls, element):
        """Alternative constructor: wraps the existing xml element in a unit."""
        term = cls(None, empty=True)
        term.xmlelement = element
        return term
    createfromxmlElement = classmethod(createfromxmlElement)
248
class LISAfile(base.TranslationStore):
    """A class representing a file store for one of the LISA file formats."""
    UnitClass = LISAunit
    #The root node of the XML document:
    rootNode = ""
    #The root node of the content section:
    bodyNode = ""
    #The XML skeleton to use for empty construction:
    XMLskeleton = ""

    #The default XML namespace of the document; set by initbody()
    namespace = None

    def __init__(self, inputfile=None, sourcelanguage='en', targetlanguage=None, unitclass=None):
        """Constructs a store, either from inputfile or, when inputfile is
        None, from the empty XMLskeleton of the class."""
        super(LISAfile, self).__init__(unitclass=unitclass)
        self.setsourcelanguage(sourcelanguage)
        self.settargetlanguage(targetlanguage)
        if inputfile is not None:
            self.parse(inputfile)
            assert self.document.getroot().tag == self.namespaced(self.rootNode)
        else:
            # We strip out newlines to ensure that spaces in the skeleton doesn't
            # interfere with the the pretty printing of lxml
            self.parse(self.XMLskeleton.replace("\n", ""))
            self.addheader()

    def addheader(self):
        """Method to be overridden to initialise headers, etc."""
        pass

    def namespaced(self, name):
        """Returns name in Clark notation.

        For example namespaced("source") in an XLIFF document might return
        {urn:oasis:names:tc:xliff:document:1.1}source
        This is needed throughout lxml.
        """
        return namespaced(self.namespace, name)

    def initbody(self):
        """Initialises self.body so it never needs to be retrieved from the XML again."""
        self.namespace = self.document.getroot().nsmap.get(None, None)
        # './/' searches below the root element.  A path starting with '//'
        # given to an ElementTree is searched in exactly the same way, but is
        # deprecated and makes lxml emit a FutureWarning.
        self.body = self.document.find('.//%s' % self.namespaced(self.bodyNode))

    def setsourcelanguage(self, sourcelanguage):
        """Sets the source language for this store"""
        self.sourcelanguage = sourcelanguage

    def settargetlanguage(self, targetlanguage):
        """Sets the target language for this store"""
        self.targetlanguage = targetlanguage

    def addsourceunit(self, source):
        """Adds and returns a new unit with the given string as first entry."""
        #TODO: maybe this should rather be called addsourcestring or something?
        newunit = self.UnitClass(source)
        self.addunit(newunit)
        return newunit

    def addunit(self, unit):
        """Appends unit to the body of the document and to self.units.

        The unit takes over the namespace of this store."""
        unit.namespace = self.namespace
        self.body.append(unit.xmlelement)
        self.units.append(unit)

    def __str__(self):
        """Converts to a string containing the file's XML"""
        return etree.tostring(self.document, pretty_print=True, xml_declaration=True, encoding='utf-8')

    def parse(self, xml):
        """Populates this object from the given xml.

        xml is either a string containing the XML, or an object with a
        read() method (it is rewound with seek(0) and read completely).
        A unit is appended to self.units for every element below the body
        that is named by the rootNode of UnitClass."""
        if not hasattr(self, 'filename'):
            self.filename = getattr(xml, 'name', '')
        if hasattr(xml, "read"):
            xml.seek(0)
            xml = xml.read()
        self.document = etree.fromstring(xml).getroottree()
        self.encoding = self.document.docinfo.encoding
        self.initbody()
        assert self.document.getroot().tag == self.namespaced(self.rootNode)
        # findall() returns a (possibly empty) list, never None
        termEntries = self.body.findall('.//%s' % self.namespaced(self.UnitClass.rootNode))
        for entry in termEntries:
            term = self.UnitClass.createfromxmlElement(entry)
            term.namespace = self.namespace
            self.units.append(term)
335