Package translate :: Package search :: Module match
[hide private]
[frames] | [no frames]

Source Code for Module translate.search.match

  1  # -*- coding: utf-8 -*- 
  2  # 
  3  # Copyright 2006-2007 Zuza Software Foundation 
  4  #  
  5  # This file is part of translate. 
  6  # 
  7  # translate is free software; you can redistribute it and/or modify 
  8  # it under the terms of the GNU General Public License as published by 
  9  # the Free Software Foundation; either version 2 of the License, or 
 10  # (at your option) any later version. 
 11  #  
 12  # translate is distributed in the hope that it will be useful, 
 13  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 14  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 15  # GNU General Public License for more details. 
 16  # 
 17  # You should have received a copy of the GNU General Public License 
 18  # along with translate; if not, write to the Free Software 
 19  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 20  # 
 21   
 22  """Class to perform translation memory matching from a store of translation units""" 
 23   
 24  from translate.search import lshtein 
 25  from translate.search import terminology 
 26  from translate.storage import base 
 27  from translate.storage import po 
 28  from translate.misc.multistring import multistring 
 29  import heapq 
 30   
def sourcelen(unit):
    """Return the length of the given unit's source string."""
    source = unit.source
    return len(source)
34
def sourcelencmp(x, y):
    """Compares using sourcelen.

    Returns a negative, zero or positive number depending on whether the
    source of x is shorter than, equal to, or longer than the source of y
    (cmp-style comparison function for sorting).
    """
    xlen = sourcelen(x)
    ylen = sourcelen(y)
    # (a > b) - (a < b) reproduces the result of the cmp() builtin, but is
    # portable: cmp() only exists in Python 2 (it was removed in Python 3).
    return (xlen > ylen) - (xlen < ylen)
41
class matcher:
    """A class that will do matching and store configuration for the matching process.

    The memory is a flat, length-sorted list of simplified translation units
    (self.candidates.units); matches() scans a length window of that list and
    keeps the best scoring candidates in a heap.
    """

    def __init__(self, store, max_candidates=10, min_similarity=75, max_length=70, comparer=None, usefuzzy=False):
        """max_candidates is the maximum number of candidates that should be assembled,
        min_similarity is the minimum similarity that must be attained to be included in
        the result, comparer is an optional Comparer with similarity() function"""
        if comparer is None:
            # Default comparer: Levenshtein distance, capped at max_length
            comparer = lshtein.LevenshteinComparer(max_length)
        self.comparer = comparer
        self.setparameters(max_candidates, min_similarity, max_length)
        # When True, fuzzy units are also accepted into the memory
        self.usefuzzy = usefuzzy
        self.inittm(store)
        # When True, matches() adds the match quality as a percentage note
        self.addpercentage = True

    def usable(self, unit):
        """Returns whether this translation unit is usable for TM.

        Side effect: records accepted (source, target) pairs in
        self.existingunits so that exact duplicates are rejected later.
        """
        #TODO: We might want to consider more attributes, such as approved, reviewed, etc.
        source = unit.source
        target = unit.target
        if source and target and (self.usefuzzy or not unit.isfuzzy()):
            if source in self.existingunits and self.existingunits[source] == target:
                # Exact duplicate of a previously accepted unit - skip it
                return False
            else:
                # Remember this pair so future duplicates can be filtered out
                self.existingunits[source] = target
                return True
        return False

    def inittm(self, stores):
        """Initialises the memory for later use. We use simple base units for
        speedup."""
        # source -> target of accepted units; used by usable() to drop duplicates
        self.existingunits = {}
        self.candidates = base.TranslationStore()

        if not isinstance(stores, list):
            stores = [stores]
        for store in stores:
            # sort=False: defer the sort until all stores have been added
            self.extendtm(store.units, store=store, sort=False)
        # Keep the memory sorted by source length so matches() can limit its
        # scan to a [startlength, stoplength] window
        self.candidates.units.sort(sourcelencmp)
        # print "TM initialised with %d candidates (%d to %d characters long)" % \
        #        (len(self.candidates.units), len(self.candidates.units[0].source), len(self.candidates.units[-1].source))

    def extendtm(self, units, store=None, sort=True):
        """Extends the memory with extra unit(s).

        @param units: The units to add to the TM.
        @param store: Optional store from where some metadata can be retrieved
        and associated with each unit.
        @param sort: Optional parameter that can be set to False to supress
        sorting of the candidates list. This should probably only be used in
        inittm().
        """
        if not isinstance(units, list):
            units = [units]
        candidates = filter(self.usable, units)
        for candidate in candidates:
            simpleunit = base.TranslationUnit("")
            # We need to ensure that we don't pass multistrings further, since
            # some modules (like the native Levenshtein) can't use it.
            if isinstance(candidate.source, multistring):
                if len(candidate.source.strings) > 1:
                    # Keep the plural originals so buildunits() can restore
                    # them in the final suggestion
                    simpleunit.orig_source = candidate.source
                    simpleunit.orig_target = candidate.target
                simpleunit.source = unicode(candidate.source)
                simpleunit.target = unicode(candidate.target)
            else:
                simpleunit.source = candidate.source
                simpleunit.target = candidate.target
            # If we now only get translator comments, we don't get programmer
            # comments in TM suggestions (in Pootle, for example). If we get all
            # notes, pot2po adds all previous comments as translator comments
            # in the new po file
            simpleunit.addnote(candidate.getnotes(origin="translator"))
            simpleunit.fuzzy = candidate.isfuzzy()
            if store:
                # Associate store-level metadata with each simplified unit
                simpleunit.filepath = store.filepath
                simpleunit.translator = store.translator
                simpleunit.date = store.date
            self.candidates.units.append(simpleunit)
        if sort:
            # Maintain the length ordering that matches() relies on
            self.candidates.units.sort(sourcelencmp)

    def setparameters(self, max_candidates=10, min_similarity=75, max_length=70):
        """Sets the parameters without reinitialising the tm. If a parameter
        is not specified, it is set to the default, not ignored"""
        self.MAX_CANDIDATES = max_candidates
        self.MIN_SIMILARITY = min_similarity
        self.MAX_LENGTH = max_length

    def getstoplength(self, min_similarity, text):
        """Calculates a length beyond which we are not interested.
        The extra fat is because we don't use plain character distance only."""
        # Scale len(text) up by the inverse of the similarity threshold, but
        # never consider sources longer than MAX_LENGTH
        return min(len(text) / (min_similarity/100.0), self.MAX_LENGTH)

    def getstartlength(self, min_similarity, text):
        """Calculates the minimum length we are interested in.
        The extra fat is because we don't use plain character distance only."""
        # Scale len(text) down by the similarity threshold; at least 1
        return max(len(text) * (min_similarity/100.0), 1)

    def matches(self, text):
        """Returns a list of possible matches for given source text.

        @type text: String
        @param text: The text that will be search for in the translation memory
        @rtype: list
        @return: a list of units with the source and target strings from the
        translation memory. If self.addpercentage is true (default) the match
        quality is given as a percentage in the notes.
        """
        # Min-heap of (score, unit); placeholder (0.0, None) entries are
        # filtered out before building the result
        bestcandidates = [(0.0,None)]*self.MAX_CANDIDATES
        heapq.heapify(bestcandidates)
        #We use self.MIN_SIMILARITY, but if we already know we have max_candidates
        #that are better, we can adjust min_similarity upwards for speedup
        min_similarity = self.MIN_SIMILARITY

        # We want to limit our search in self.candidates, so we want to ignore
        # all units with a source string that is too short or too long

        # minimum source string length to be considered
        startlength = self.getstartlength(min_similarity, text)
        startindex = 0
        for index, candidate in enumerate(self.candidates.units):
            if len(candidate.source) >= startlength:
                startindex = index
                break

        # maximum source string length to be considered
        stoplength = self.getstoplength(min_similarity, text)

        for candidate in self.candidates.units[startindex:]:
            cmpstring = candidate.source
            if len(cmpstring) > stoplength:
                # Units are length-sorted, so everything beyond is too long
                break
            similarity = self.comparer.similarity(text, cmpstring, min_similarity)
            if similarity < min_similarity:
                continue
            # bestcandidates[0] is the heap minimum (current worst keeper)
            lowestscore = bestcandidates[0][0]
            if similarity > lowestscore:
                # NOTE(review): targetstring is assigned but never used here -
                # looks like a leftover from an earlier implementation
                targetstring = candidate.target
                heapq.heapreplace(bestcandidates, (similarity, candidate))
                if min_similarity < bestcandidates[0][0]:
                    # The heap is full of better matches; raise the bar and
                    # shrink the remaining length window for speed
                    min_similarity = bestcandidates[0][0]
                    stoplength = self.getstoplength(min_similarity, text)

        #Remove the empty ones:
        def notzero(item):
            score = item[0]
            return score != 0
        bestcandidates = filter(notzero, bestcandidates)
        #Sort for use as a general list, and reverse so the best one is at index 0
        bestcandidates.sort()
        bestcandidates.reverse()
        return self.buildunits(bestcandidates)

    def buildunits(self, candidates):
        """Builds a list of units conforming to base API, with the score in the comment"""
        units = []
        for score, candidate in candidates:
            if hasattr(candidate, "orig_source"):
                # Restore the multistring (plural) forms saved in extendtm()
                candidate.source = candidate.orig_source
                candidate.target = candidate.orig_target
            newunit = po.pounit(candidate.source)
            newunit.target = candidate.target
            newunit.markfuzzy(candidate.fuzzy)
            newunit.filepath = candidate.filepath
            newunit.translator = candidate.translator
            newunit.date = candidate.date
            candidatenotes = candidate.getnotes().strip()
            if candidatenotes:
                newunit.addnote(candidatenotes)
            if self.addpercentage:
                # Record the match quality, e.g. "75%"
                newunit.addnote("%d%%" % score)
            units.append(newunit)
        return units
215
class terminologymatcher(matcher):
    """A matcher with settings specifically for terminology matching"""

    def __init__(self, store, max_candidates=10, min_similarity=75, max_length=500, comparer=None):
        """Initialise a matcher tuned for terminology lookups."""
        if comparer is None:
            comparer = terminology.TerminologyComparer(max_length)
        # The min_similarity argument is deliberately overridden with 10:
        # terminology hits are found by substring-style comparison, not by
        # overall string similarity
        matcher.__init__(self, store, max_candidates, min_similarity=10, max_length=max_length, comparer=comparer)
        # Terminology suggestions carry no meaningful quality percentage
        self.addpercentage = False

    def inittm(self, store):
        """Normal initialisation, but convert all source strings to lower case"""
        matcher.inittm(self, store)
        # Lower-case the memory once so matches() can compare case-insensitively
        for term in self.candidates.units:
            term.source = term.source.lower()

    def getstartlength(self, min_similarity, text):
        # Reduce false matches by ignoring terms of two characters or less
        return 3

    def getstoplength(self, min_similarity, text):
        # Ignore terms longer than 30 characters - such entries are probably
        # ordinary (long) translations, not terminology
        return 30

    def matches(self, text):
        """Normal matching after converting text to lower case. Then replace
        with the original unit to retain comments, etc."""
        return matcher.matches(self, text.lower())
246