"""reads a set of .po or .pot files to produce a pootle-terminology.pot"""

import os
import re
import sys

from translate.lang import factory as lang_factory
from translate.misc import optrecurse
from translate.storage import factory
from translate.storage import po
29
"""a specialized Option Parser for the terminology tool..."""

33
34 formatpat = re.compile(r"%(?:\([^)]+\)|[0-9]+\$)?[-+#0]*[0-9.*]*(?:[hlLzjt][hl])?[EFGXc-ginoprsux]")
35
36 xmlpat = re.compile(r"<(?:![[-]|[/?]?[A-Za-z_:])[^>]*>")
37
38 sortorders = [ "frequency", "dictionary", "length" ]
39
40 files = 0
41 units = 0
42
44 """parses the command line options, handling implicit input/output args"""
45 (options, args) = optrecurse.optparse.OptionParser.parse_args(self, args, values)
46
47 if args and not options.input:
48 if not options.output and len(args) > 1:
49 options.input = args[:-1]
50 args = args[-1:]
51 else:
52 options.input = args
53 args = []
54 if args and not options.output:
55 options.output = args[-1]
56 args = args[:-1]
57 if not options.output:
58 options.output = "pootle-terminology.pot"
59 if args:
60 self.error("You have used an invalid combination of --input, --output and freestanding args")
61 if isinstance(options.input, list) and len(options.input) == 1:
62 options.input = options.input[0]
63 return (options, args)
64
66 """sets the usage string - if usage not given, uses getusagestring for each option"""
67 if usage is None:
68 self.usage = "%prog " + " ".join([self.getusagestring(option) for option in self.option_list]) + \
69 "\n input directory is searched for PO files, terminology PO file is output file"
70 else:
71 super(TerminologyOptionParser, self).set_usage(usage)
72
80
82 """recurse through directories and process files"""
83 if self.isrecursive(options.input, 'input') and getattr(options, "allowrecursiveinput", True):
84 if isinstance(options.input, list):
85 inputfiles = self.recurseinputfilelist(options)
86 else:
87 inputfiles = self.recurseinputfiles(options)
88 else:
89 if options.input:
90 inputfiles = [os.path.basename(options.input)]
91 options.input = os.path.dirname(options.input)
92 else:
93 inputfiles = [options.input]
94 if os.path.isdir(options.output):
95 options.output = os.path.join(options.output,"pootle-terminology.pot")
96 self.stopwords = {}
97 self.stoprelist = []
98 actions = { '+': frozenset(), ':': frozenset(['skip']),
99 '<': frozenset(['phrase']), '=': frozenset(['word']),
100 '>': frozenset(['word','skip']),
101 '@': frozenset(['word','phrase']) }
102 if options.stopwordfile != None:
103 stopfile = open(options.stopwordfile, "r")
104 try:
105 for stopline in stopfile:
106 stoptype = stopline[0]
107 if stoptype == '#' or stoptype == "\n":
108 continue
109 elif stoptype == '/':
110 self.stoprelist.append(re.compile(stopline[1:-1]+'$'))
111 else:
112 self.stopwords[stopline[1:-1]] = actions[stoptype]
113 except KeyError, character:
114 self.warning("Bad line in stopword list %s starts with" % (options.stopwordfile), options, sys.exc_info())
115 stopfile.close()
116 self.glossary = {}
117 self.initprogressbar(inputfiles, options)
118 for inputpath in inputfiles:
119 self.files += 1
120 fullinputpath = self.getfullinputpath(options, inputpath)
121 try:
122 success = self.processfile(None, options, fullinputpath)
123 except Exception, error:
124 if isinstance(error, KeyboardInterrupt):
125 raise
126 self.warning("Error processing: input %s" % (fullinputpath), options, sys.exc_info())
127 success = False
128 self.reportprogress(inputpath, success)
129 del self.progressbar
130 self.outputterminology(options)
131
132 - def clean(self, string, options):
133 """returns the cleaned string that contains the text to be matched"""
134 for accelerator in options.accelchars:
135 string = string.replace(accelerator, "")
136 string = self.formatpat.sub(" ", string)
137 string = self.xmlpat.sub(" ", string)
138 string = string.strip()
139 return string
140
141 - def addphrases(self, words, skips, translation, partials=True):
142 """adds (sub)phrases with non-skipwords and more than one word"""
143 if (len(words) > skips + 1 and
144 'skip' not in self.stopwords.get(words[0], frozenset()) and
145 'skip' not in self.stopwords.get(words[-1], frozenset())):
146 self.glossary.setdefault(' '.join(words), []).append(translation)
147 if partials:
148 part = list(words)
149 while len(part) > 2:
150 if 'skip' in self.stopwords.get(part.pop(), frozenset()):
151 skips -= 1
152 if (len(part) > skips + 1 and
153 'skip' not in self.stopwords.get(part[0], frozenset()) and
154 'skip' not in self.stopwords.get(part[-1], frozenset())):
155 self.glossary.setdefault(' '.join(part), []).append(translation)
156
157
158 - def processfile(self, fileprocessor, options, fullinputpath):
159 """process an individual file"""
160 inputfile = self.openinputfile(options, fullinputpath)
161 inputfile = factory.getobject(inputfile)
162 sourcelang = lang_factory.getlanguage(options.sourcelanguage)
163 rematchignore = frozenset(('word','phrase'))
164 defaultignore = frozenset()
165 for unit in inputfile.units:
166 self.units += 1
167 if unit.isheader() or not unit.istranslated():
168 continue
169 if unit.hasplural():
170 continue
171 if not options.invert:
172 source = self.clean(unit.source, options)
173 target = self.clean(unit.target, options)
174 else:
175 target = self.clean(unit.source, options)
176 source = self.clean(unit.target, options)
177 if len(source) <= 1:
178 continue
179 for sentence in sourcelang.sentences(source):
180 words = []
181 skips = 0
182 for word in sourcelang.words(sentence):
183 if options.ignorecase or (options.foldtitle and word.istitle()):
184 word = word.lower()
185 ignore = defaultignore
186 if word in self.stopwords:
187 ignore = self.stopwords[word]
188 else:
189 for stopre in self.stoprelist:
190 if stopre.match(word) != None:
191 ignore = rematchignore
192 break
193 translation = (source, target, unit, fullinputpath)
194 if 'word' not in ignore:
195
196 root = word
197 if len(word) > 3 and word[-1] == 's' and word[0:-1] in self.glossary:
198 root = word[0:-1]
199 elif len(root) > 2 and root + 's' in self.glossary:
200 self.glossary[root] = self.glossary.pop(root + 's')
201 self.glossary.setdefault(root, []).append(translation)
202 if 'phrase' in ignore:
203
204 while len(words) > 2:
205 if 'skip' in self.stopwords.get(words.pop(0),defaultignore):
206 skips -= 1
207 self.addphrases(words, skips, translation)
208 words = []
209 skips = 0
210 else:
211 words.append(word)
212 if 'skip' in ignore:
213 skips += 1
214 if len(words) > options.termlength + skips:
215 while len(words) > options.termlength + skips:
216 if 'skip' in self.stopwords.get(words.pop(0),defaultignore):
217 skips -= 1
218 self.addphrases(words, skips, translation)
219 else:
220 self.addphrases(words, skips, translation, partials=False)
221
222 while len(words) > 2:
223 if 'skip' in self.stopwords.get(words.pop(0),defaultignore):
224 skips -= 1
225 self.addphrases(words, skips, translation)
226
228 """saves the generated terminology glossary"""
229 termfile = po.pofile()
230 terms = {}
231 locre = re.compile(r":[0-9]+$")
232 print "%d terms from %d units in %d files" % (len(self.glossary), self.units, self.files)
233 for term, translations in self.glossary.iteritems():
234 if len(translations) <= 1:
235 continue
236 filecounts = {}
237 sources = {}
238 termunit = po.pounit(term)
239 locations = {}
240 sourcenotes = {}
241 transnotes = {}
242 targets = {}
243 fullmsg = False
244 for source, target, unit, filename in translations:
245 sources[source] = 1
246 filecounts[filename] = filecounts.setdefault(filename, 0) + 1
247 if term.lower() == self.clean(unit.source, options).lower():
248 fullmsg = True
249 target = self.clean(unit.target, options)
250 if options.ignorecase or (options.foldtitle and target.istitle()):
251 target = target.lower()
252 unit.settarget(target)
253 if target != "":
254 targets.setdefault(target, []).append(filename)
255 if term.lower() == unit.source.strip().lower():
256 sourcenotes[unit.getnotes("source code")] = None;
257 transnotes[unit.getnotes("translator")] = None;
258 else:
259 unit.settarget("")
260 unit.setsource(term)
261 termunit.merge(unit, overwrite=False, comments=False)
262 for loc in unit.getlocations():
263 locations.setdefault(locre.sub("", loc))
264 numsources = len(sources)
265 numfiles = len(filecounts)
266 numlocs = len(locations)
267 if numfiles < options.inputmin or numlocs < options.locmin:
268 continue
269 if fullmsg:
270 if numsources < options.fullmsgmin:
271 continue
272 elif numsources < options.substrmin:
273 continue
274 if len(targets.keys()) > 1:
275 txt = '; '.join(["%s {%s}" % (target, ', '.join(files))
276 for target, files in targets.iteritems()])
277 if termunit.gettarget().find('};') < 0:
278 termunit.settarget(txt)
279 termunit.markfuzzy()
280 else:
281
282 termunit.addnote(txt, "translator")
283 for location in locations.keys():
284 termunit.addlocation(location)
285 for sourcenote in sourcenotes.keys():
286 termunit.addnote(sourcenote, "source code")
287 for transnote in transnotes.keys():
288 termunit.addnote(transnote, "translator")
289 for file, count in filecounts.iteritems():
290 termunit.othercomments.append("# (poterminology) %s (%d)\n" % (file, count))
291 terms[term] = (((10 * numfiles) + numsources, termunit))
292
293 termlist = terms.keys()
294 print "%d terms after thresholding" % len(termlist)
295 termlist.sort(lambda x, y: cmp(len(x),len(y)))
296 for term in termlist:
297 words = term.split()
298 if len(words) <= 2:
299 continue
300 while len(words) > 2:
301 words.pop()
302 if terms[term][0] == terms.get(' '.join(words),[0])[0]:
303 del terms[' '.join(words)]
304 words = term.split()
305 while len(words) > 2:
306 words.pop(0)
307 if terms[term][0] == terms.get(' '.join(words),[0])[0]:
308 del terms[' '.join(words)]
309 print "%d terms after subphrase reduction" % len(terms.keys())
310 termitems = terms.values()
311 if options.sortorders == None:
312 options.sortorders = self.sortorders
313 while len(options.sortorders) > 0:
314 order = options.sortorders.pop()
315 if order == "frequency":
316 termitems.sort(lambda x, y: cmp(y[0],x[0]))
317 elif order == "dictionary":
318 termitems.sort(lambda x, y: cmp(x[1].source.lower(), y[1].source.lower()))
319 elif order == "length":
320 termitems.sort(lambda x, y: cmp(len(x[1].source), len(y[1].source)))
321 else:
322 self.warning("unknown sort order %s" % order, options)
323 for count, unit in termitems:
324 termfile.units.append(unit)
325 open(options.output, "w").write(str(termfile))
326
328 formats = {"po":("po", None), None:("po", None)}
329 parser = TerminologyOptionParser(formats)
330 parser.add_option("-I", "--ignore-case", dest="ignorecase",
331 action="store_true", default=False, help="make all terms lowercase")
332 parser.add_option("-F", "--fold-titlecase", dest="foldtitle",
333 action="store_true", default=False, help="fold \"Title Case\" to lowercase")
334 parser.add_option("", "--accelerator", dest="accelchars", default="",
335 metavar="ACCELERATORS", help="ignores the given accelerator characters when matching")
336 parser.add_option("-t", "--term-words", type="int", dest="termlength", default="3",
337 help="generate terms of up to LENGTH words (default 3)", metavar="LENGTH")
338 parser.add_option("", "--inputs-needed", type="int", dest="inputmin", default="2",
339 help="omit terms appearing in less than MIN input files (default 2)", metavar="MIN")
340 parser.add_option("", "--fullmsg-needed", type="int", dest="fullmsgmin", default="1",
341 help="omit full message terms appearing in less than MIN different messages (default 1)", metavar="MIN")
342 parser.add_option("", "--substr-needed", type="int", dest="substrmin", default="2",
343 help="omit substring-only terms appearing in less than MIN different messages (default 2)", metavar="MIN")
344 parser.add_option("", "--locs-needed", type="int", dest="locmin", default="2",
345 help="omit terms appearing in less than MIN different original source files (default 2)", metavar="MIN")
346 parser.add_option("", "--sort", dest="sortorders", action="append",
347 type="choice", choices=parser.sortorders, metavar="ORDER",
348 help="output sort order(s): %s (default is all orders in the above priority)" % ', '.join(parser.sortorders))
349 parser.add_option("-S", "--stopword-list", type="string", dest="stopwordfile",
350 help="name of file containing stopword list", metavar="FILENAME")
351 parser.add_option("", "--source-language", dest="sourcelanguage", default="en",
352 help="the source language code (default 'en')", metavar="LANG")
353 parser.add_option("-v", "--invert", dest="invert",
354 action="store_true", default=False, help="invert the source and target languages for terminology")
355 parser.set_usage()
356 parser.description = __doc__
357 parser.run()
358
359
360 if __name__ == '__main__':
361 main()
362