1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 import re
18 try:
19 import fintl
20 _ = fintl.gettext
21 except ImportError:
22 _ = lambda s: s
23
24 __doc__ = _("""pygettext -- Python equivalent of xgettext(1)
25
26 Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
27 internationalization of C programs. Most of these tools are independent of
28 the programming language and can be used from within Python programs.
29 Martin von Loewis' work[1] helps considerably in this regard.
30
31 There's one problem though; xgettext is the program that scans source code
32 looking for message strings, but it groks only C (or C++). Python
33 introduces a few wrinkles, such as dual quoting characters, triple quoted
34 strings, and raw strings. xgettext understands none of this.
35
36 Enter pygettext, which uses Python's standard tokenize module to scan
37 Python source code, generating .pot files identical to what GNU xgettext[2]
38 generates for C and C++ code. From there, the standard GNU tools can be
39 used.
40
41 A word about marking Python strings as candidates for translation. GNU
42 xgettext recognizes the following keywords: gettext, dgettext, dcgettext,
43 and gettext_noop. But those can be a lot of text to include all over your
44 code. C and C++ have a trick: they use the C preprocessor. Most
45 internationalized C source includes a #define for gettext() to _() so that
46 what has to be written in the source is much less. Thus these are both
47 translatable strings:
48
49 gettext("Translatable String")
50 _("Translatable String")
51
52 Python of course has no preprocessor so this doesn't work so well. Thus,
53 pygettext searches only for _() by default, but see the -k/--keyword flag
54 below for how to augment this.
55
56 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
57 [2] http://www.gnu.org/software/gettext/gettext.html
58
59 NOTE: pygettext attempts to be option and feature compatible with GNU
60 xgettext where ever possible. However some options are still missing or are
61 not fully implemented. Also, xgettext's use of command line switches with
62 option arguments is broken, and in these cases, pygettext just defines
63 additional switches.
64
65 Usage: pygettext [options] inputfile ...
66
67 Options:
68
69 -a
70 --extract-all
71 Extract all strings.
72
73 -d name
74 --default-domain=name
75 Rename the default output file from messages.pot to name.pot.
76
77 -E
78 --escape
79 Replace non-ASCII characters with octal escape sequences.
80
81 -D
82 --docstrings
83 Extract module, class, method, and function docstrings. These do
84 not need to be wrapped in _() markers, and in fact cannot be for
85 Python to consider them docstrings. (See also the -X option).
86
87 -h
88 --help
89 Print this help message and exit.
90
91 -k word
92 --keyword=word
93 Keywords to look for in addition to the default set, which are:
94 %(DEFAULTKEYWORDS)s
95
96 You can have multiple -k flags on the command line.
97
98 -K
99 --no-default-keywords
100 Disable the default set of keywords (see above). Any keywords
101 explicitly added with the -k/--keyword option are still recognized.
102
103 --no-location
104 Do not write filename/lineno location comments.
105
106 -n
107 --add-location
108 Write filename/lineno location comments indicating where each
109 extracted string is found in the source. These lines appear before
110 each msgid. The style of comments is controlled by the -S/--style
111 option. This is the default.
112
113 -o filename
114 --output=filename
115 Rename the default output file from messages.pot to filename. If
116 filename is `-' then the output is sent to standard out.
117
118 -p dir
119 --output-dir=dir
120 Output files will be placed in directory dir.
121
122 -S stylename
123 --style stylename
124 Specify which style to use for location comments. Two styles are
125 supported:
126
127 Solaris # File: filename, line: line-number
128 GNU #: filename:line
129
130 The style name is case insensitive. GNU style is the default.
131
132 -v
133 --verbose
134 Print the names of the files being processed.
135
136 -V
137 --version
138 Print the version of pygettext and exit.
139
140 -w columns
141 --width=columns
142 Set width of output to columns.
143
144 -x filename
145 --exclude-file=filename
146     Specify a file that contains a list of strings that are not to be
147 extracted from the input files. Each string to be excluded must
148 appear on a line by itself in the file.
149
150 -X filename
151 --no-docstrings=filename
152 Specify a file that contains a list of files (one per line) that
153 should not have their docstrings extracted. This is only useful in
154 conjunction with the -D option above.
155
156 If `inputfile' is -, standard input is read.
157 """)
158
159 import os
160 import imp
161 import sys
162 import glob
163 import time
164 import getopt
165 import token
166 import tokenize
167 import operator
168
169 try:
170 import kid.parser as kid_parser
171 except ImportError:
172 kid_parser = None
173
174 try:
175 from genshi.template import MarkupTemplate as GenshiMarkupTemplate
176 from genshi.filters.i18n import Translator as GenshiTranslator
177 except ImportError:
178 GenshiMarkupTemplate = None
179
__version__ = '1.5'

# Keywords searched for by default; -k/--keyword adds to this set and
# -K/--no-default-keywords clears it (see main()).
default_keywords = ['_']
DEFAULTKEYWORDS = ', '.join(default_keywords)

# Used with str.join() to concatenate adjacent string tokens.
EMPTYSTRING = ''
186
187
188
# Header template for the generated .pot file; %-interpolated with
# 'time', 'version' and 'charset' in TokenEater.write().
# NOTE(review): Content-Transfer-Encoding is filled with the charset name
# rather than '8bit'; this mirrors old upstream pygettext behavior —
# confirm before changing, as downstream tooling may expect it.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %(time)s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=%(charset)s\\n"
"Content-Transfer-Encoding: %(charset)s\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')
207
209 print >> sys.stderr, __doc__ % globals()
210 if msg:
211 print >> sys.stderr, msg
212 sys.exit(code)
213
214
# Table mapping each byte value 0-255 to its PO-file representation;
# populated by make_escapes() before any extraction happens.
escapes = []

def make_escapes(pass_iso8859):
    """Populate the global `escapes` table.

    Printable ASCII characters map to themselves; every other byte maps
    to a three-digit octal escape.  If `pass_iso8859` is true, bytes with
    the high bit set are treated like their low-7-bit counterparts, so
    latin-1 accented characters pass through unescaped.
    """
    global escapes
    if pass_iso8859:
        # Allow iso-8859 characters through: a byte >= 128 is escaped only
        # if its low 7 bits fall outside the printable 32..126 range.
        mod = 128
    else:
        mod = 256
    for i in range(256):
        if 32 <= (i % mod) <= 126:
            escapes.append(chr(i))
        else:
            escapes.append("\\%03o" % i)
    # The conventional short escapes override the generic octal form.
    escapes[ord('\\')] = '\\\\'
    escapes[ord('\t')] = '\\t'
    escapes[ord('\r')] = '\\r'
    escapes[ord('\n')] = '\\n'
    escapes[ord('\"')] = '\\"'
236
237
def escape(s):
    """Escape all non-ascii text plus control chars and Python literals.

    Each character of `s` is replaced via the global `escapes` table, so
    make_escapes() must have been called first.
    """
    return EMPTYSTRING.join([escapes[ord(c)] for c in s])
244
245
def escape_unicode(s):
    """Escape control chars and Python literals, leave non-ascii text intact."""
    s = s.replace('\\', '\\\\').replace('\t', '\\t').replace(
        '\r', '\\r').replace('\n', '\\n').replace('\"', '\\"')

    # Octal-escape any remaining C0 control characters (SOH..US).
    def repl(m):
        return "\\%03o" % ord(m.group(0))
    return re.sub('[\001-\037]', repl, s)
254
255
def safe_eval(s):
    """Evaluate the string token `s` with builtins disabled.

    Used to unwrap quoted string tokens from the tokenizer.  Hiding
    __builtins__ is a guard against accidents, not a security boundary.
    """
    return eval(s, {'__builtins__': {}}, {})
259
260
def normalize(s, escape):
    """Convert `s` into the form used for msgid lines in a PO file.

    A single-line string becomes one quoted line; a multi-line string
    becomes an empty-string first line followed by one quoted line per
    source line.  If `escape` is true, bytes >= 0200 are octal-escaped.
    """
    lines = s.split('\n')
    if len(lines) == 1:
        s = '"' + escape_unicode(s) + '"'
    else:
        if not lines[-1]:
            # Trailing newline: fold it back onto the last real line.
            del lines[-1]
            lines[-1] = lines[-1] + '\n'
        for i in range(len(lines)):
            lines[i] = escape_unicode(lines[i])
        lineterm = '\\n"\n"'
        s = '""\n"' + lineterm.join(lines) + '"'
    if isinstance(s, unicode):
        s = s.encode('utf-8')
    if escape:
        def repl(m):
            return "\\%03o" % ord(m.group(0))
        s = re.sub('[\200-\377]', repl, s)
    return s
282
283
def containsAny(str, set):
    """Check whether 'str' contains ANY of the chars in 'set'.

    NOTE: the parameter names shadow the builtins; they are kept for
    interface compatibility with existing callers.
    """
    # any() short-circuits; the original built a full list first.
    return any(c in str for c in set)
287
288
def _visit_pyfiles(list, dirname, names):
    """Helper for getFilesForName().

    os.path.walk() callback: appends the Python source files found in
    `dirname` to `list`, pruning version-control directories.
    """
    # Determine the extension for Python source files once, lazily.
    if '_py_ext' not in globals():
        global _py_ext
        _py_ext = [triple[0] for triple in imp.get_suffixes()
                   if triple[2] == imp.PY_SOURCE][0]

    # Don't recurse into version-control directories.
    if 'CVS' in names:
        names.remove('CVS')
    if '.svn' in names:
        names.remove('.svn')

    # Add all *.py files found in this directory.
    list.extend([os.path.join(dirname, file) for file in names
                 if os.path.splitext(file)[1] == _py_ext])
304
305
def _get_modpkg_path(dotted_name, pathlist=None):
    """Get the filesystem path for a module or a package.

    Return the file system path to a file for a module, and to a directory for
    a package. Return None if the name is not found, or is a builtin or
    extension module.
    """
    # Split off the top-most name component.
    parts = dotted_name.split('.', 1)

    if len(parts) > 1:
        # Dotted path: locate the top-level package first.
        try:
            file, pathname, description = imp.find_module(parts[0], pathlist)
            if file:
                file.close()
        except ImportError:
            return None

        # Recurse into the package only if it really is one.
        if description[2] == imp.PKG_DIRECTORY:
            pathname = _get_modpkg_path(parts[1], [pathname])
        else:
            pathname = None
    else:
        # Plain name: accept only source modules and package directories
        # (builtins and extension modules have no useful source path).
        try:
            file, pathname, description = imp.find_module(
                dotted_name, pathlist)
            if file:
                file.close()
            if description[2] not in [imp.PY_SOURCE, imp.PKG_DIRECTORY]:
                pathname = None
        except ImportError:
            pathname = None

    return pathname
344
345
def getFilesForName(name):
    """Get a list of module files for a filename, a module or package name,
    or a directory.
    """
    if not os.path.exists(name):
        # Not a path on disk: maybe a glob pattern?
        if containsAny(name, "*?[]"):
            files = glob.glob(name)
            list = []
            for file in files:
                list.extend(getFilesForName(file))
            return list

        # Otherwise try to resolve it as a module or package name.
        name = _get_modpkg_path(name)
        if not name:
            return []

    if os.path.isdir(name):
        # Collect all Python files under the directory.
        list = []
        os.path.walk(name, _visit_pyfiles, list)
        return list
    elif os.path.exists(name):
        # A single existing file.
        return [name]

    return []
374
375
def extract_genshi_strings(filename, options=None):
    """Extract translatable strings from a Genshi template.

    The extractor will get all the text inside all elements which are
    not in the ignore list (see options) and the values of all
    attributes named in the include list.

    Options:

    `ignore_tags` -- `'script style'`
        List of element names. Content inside elements named in
        this list is not extracted as translatable text. Can be a
        space-separated string or a list of string.
    `include_attrs` -- `'abbr alt label prompt standby summary title'`
        List of attribute names. Only values of the attributes named in this
        list are extracted as translatable text. Can be a space-separated
        string or a list of string.

    See http://genshi.edgewall.org/wiki/Documentation/0.5.x/i18n.html for
    more information.
    """
    if not GenshiMarkupTemplate:
        raise ImportError("Genshi templating is not installed.")

    if options is None:
        options = {}

    try:
        stream = GenshiMarkupTemplate(
            open(filename), filename=filename, filepath='.').stream
        translator = GenshiTranslator(**options)
        return translator.extract(stream)
    except Exception:
        # Re-raise, but name the offending template first for diagnosis.
        print >> sys.stderr, "Extracting from Genshi template", filename
        raise
413
414
class TokenEater:
    """Token-stream consumer that collects translatable strings.

    An instance is the callback passed to tokenize.tokenize(); it runs a
    small state machine over the token stream, recording strings wrapped
    in the configured keywords (and, optionally, docstrings).  Extra
    entry points handle Kid and Genshi templates.  Call write() at the
    end to emit the .pot output.
    """

    def __init__(self, options):
        self.__options = options
        self.__messages = {}          # msg -> {(filename, lineno): isdocstring}
        self.__state = self.__waiting
        self.__data = []              # string fragments of the current call
        self.__lineno = -1
        self.__freshmodule = 1        # still before the module docstring?
        self.__curfile = None
        self.__encoding = None

    def __call__(self, ttype, tstring, stup, etup, line):
        # Dispatch to the current state handler with the start line number.
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        opts = self.__options
        # Do docstring extractions, if enabled.
        if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
            # Module docstring?
            if self.__freshmodule:
                if ttype == tokenize.STRING:
                    self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
                    self.__freshmodule = 0
                elif ttype not in (tokenize.COMMENT, tokenize.NL):
                    self.__freshmodule = 0
                return
            # Class or function/method docstring?
            if ttype == tokenize.NAME and tstring in ('class', 'def'):
                self.__state = self.__suiteseen
                return
        if ttype == tokenize.NAME and tstring in opts.keywords:
            self.__state = self.__keywordseen

    def __suiteseen(self, ttype, tstring, lineno):
        # Ignore anything until we see the colon introducing the suite.
        if ttype == tokenize.OP and tstring == ':':
            self.__state = self.__suitedocstring

    def __suitedocstring(self, ttype, tstring, lineno):
        # The first string after the colon is the suite's docstring.
        if ttype == tokenize.STRING:
            self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
            self.__state = self.__waiting
        elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
                           tokenize.COMMENT):
            # There was no docstring.
            self.__state = self.__waiting

    def __keywordseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == ',':
            # Only the first argument is the translatable string.
            if self.__data:
                self.__addentry(EMPTYSTRING.join(self.__data))
            self.__state = self.__waiting
        elif ttype == tokenize.OP and tstring == ')':
            # Matched the closing paren; adjacent string literals were
            # already joined into one entry.
            if self.__data:
                self.__addentry(EMPTYSTRING.join(self.__data))
            self.__state = self.__waiting
        elif ttype == tokenize.STRING:
            self.__data.append(safe_eval(tstring))
        elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT,
                           token.NEWLINE, tokenize.NL]:
            # Warn if we see anything other than strings and whitespace.
            print >> sys.stderr, _(
                '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"'
                ) % {
                'token': tstring,
                'file': self.__curfile,
                'lineno': self.__lineno
                }
            self.__state = self.__waiting

    def __addentry(self, msg, lineno=None, isdocstring=0, istemplatestring=0):
        """The tokenize module always returns unicode strings even when they
        are in fact coded string instances. To deal with this we use a hack:
        evaluate string's representation without leading "u" to force
        interpretation as a coded string, then we decode it using the already
        known file encoding.
        """
        if not istemplatestring:
            if type(msg) is str:
                msg = eval(repr(msg))
            else:
                msg = eval(repr(msg)[1:])

            msg = msg.decode(self.__encoding)

        if lineno is None:
            lineno = self.__lineno

        if not msg in self.__options.toexclude:
            entry = (self.__curfile, lineno)
            self.__messages.setdefault(msg, {})[entry] = isdocstring

    def set_filename(self, filename):
        self.__curfile = filename
        self.__freshmodule = 1

    def set_file_encoding(self, fp):
        """Search for -*- coding: -*- magic comment to find out file encoding"""
        self.__encoding = 'utf-8'
        for line in fp.readlines()[:5]:
            m = re.match('#\s*-\*-\s+coding:\s+(\w+)\s+-\*-', line)
            if m:
                self.__encoding = m.group(1)
                break
        fp.seek(0)

    def __contains_inline_python(self, msg):
        # Kid inline expressions look like ${...}; $${ is the escaped form.
        return '${' in msg and not '$${' in msg

    def __strip_namespace_uri(self, tag):
        # '{http://...}tag' -> 'tag'
        return tag.split('}', 1)[-1]

    def extract_genshi_strings(self):
        """Extract translatable strings from a Genshi template.

        See the docstring of the eponymous module function for documentation.
        """
        if self.__curfile:
            # Delegate to the module-level extractor and record each
            # message it yields.
            for msg in extract_genshi_strings(self.__curfile):
                lineno, text = msg[0], msg[2]
                if text:
                    if isinstance(text, tuple):
                        for subtext in text:
                            if subtext:
                                self.__addentry(subtext, lineno,
                                                istemplatestring=1)
                    else:
                        self.__addentry(text, lineno, istemplatestring=1)

    def extract_kid_strings(self):
        """Extract translatable text nodes from a Kid template."""
        if not self.__curfile:
            return
        if not kid_parser:
            raise ImportError("Kid templating is not installed.")
        tag = None
        tags = []
        for ev, item in kid_parser.document(self.__curfile):
            if ev == kid_parser.TEXT:
                if tag:
                    item = item.strip()
                    if item and not self.__contains_inline_python(item):
                        self.__addentry(item, tag, istemplatestring=1)
            elif ev == kid_parser.START:
                tag = item.tag
                if isinstance(tag, basestring):
                    tag = self.__strip_namespace_uri(tag)
                    # Text inside script/style elements is never
                    # translatable.
                    if tag in ('script', 'style'):
                        tag = None
                else:
                    tag = None
                tags.append(tag)
            elif ev == kid_parser.END:
                if tags:
                    tag = tags.pop()

    def write(self, fp):
        """Emit the collected messages to `fp` in .pot format."""
        options = self.__options
        timestamp = time.strftime('%Y-%m-%d %H:%M')
        # The time stamp in the header doesn't have the same format as
        # that generated by xgettext...
        t = {'time': timestamp, 'version': __version__, 'charset': 'utf-8'}
        print >> fp, pot_header % t

        # Sort the entries: first each entry's own location keys, then
        # all entries grouped by identical location sets.
        reverse = {}
        for k, v in self.__messages.items():
            keys = v.keys()
            keys.sort()
            reverse.setdefault(tuple(keys), []).append((k, v))
        rkeys = reverse.keys()
        rkeys.sort()
        for rkey in rkeys:
            rentries = reverse[rkey]
            rentries.sort()
            for k, v in rentries:
                isdocstring = 0
                # If the entry was gleaned out of a docstring, then add a
                # comment stating so.
                if reduce(operator.__add__, v.values()):
                    isdocstring = 1
                # k is the message string, v is a dictionary-set of
                # (filename, lineno) tuples.
                v = v.keys()
                v.sort()
                if not options.writelocations:
                    pass
                elif options.locationstyle == options.SOLARIS:
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        print >> fp, _(
                            '# File: %(filename)s, line: %(lineno)s') % d
                elif options.locationstyle == options.GNU:
                    # Fit as many locations on one line as will fit within
                    # options.width.
                    locline = '#:'
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        s = _(' %(filename)s:%(lineno)s') % d
                        if len(locline) + len(s) <= options.width:
                            locline = locline + s
                        else:
                            print >> fp, locline
                            locline = "#:" + s
                    if len(locline) > 2:
                        print >> fp, locline
                if isdocstring:
                    print >> fp, '#, docstring'
                if k:
                    print >> fp, 'msgid', normalize(k, options.escape)
                    print >> fp, 'msgstr ""\n'
653
655 global default_keywords
656 try:
657 opts, args = getopt.getopt(
658 sys.argv[1:],
659 'ad:UDEhk:Kno:p:S:Vvw:x:X:',
660 ['extract-all', 'default-domain=', 'escape', 'help',
661 'keyword=', 'no-default-keywords',
662 'add-location', 'no-location', 'output=', 'output-dir=',
663 'style=', 'verbose', 'version', 'width=', 'exclude-file=',
664 'docstrings', 'no-docstrings', 'support-unicode',
665 ])
666 except getopt.error, msg:
667 usage(1, msg)
668
669
670 class Options:
671
672 GNU = 1
673 SOLARIS = 2
674
675 extractall = 0
676 escape = 0
677 keywords = []
678 outpath = ''
679 outfile = 'messages.pot'
680 writelocations = 1
681 locationstyle = GNU
682 verbose = 0
683 width = 78
684 excludefilename = ''
685 docstrings = 0
686 nodocstrings = {}
687
688 options = Options()
689 locations = {'gnu' : options.GNU,
690 'solaris' : options.SOLARIS,
691 }
692
693
694 for opt, arg in opts:
695 if opt in ('-h', '--help'):
696 usage(0)
697 elif opt in ('-a', '--extract-all'):
698 options.extractall = 1
699 elif opt in ('-d', '--default-domain'):
700 options.outfile = arg + '.pot'
701 elif opt in ('-E', '--escape'):
702 options.escape = 1
703 elif opt in ('-D', '--docstrings'):
704 options.docstrings = 1
705 elif opt in ('-k', '--keyword'):
706 options.keywords.append(arg)
707 elif opt in ('-K', '--no-default-keywords'):
708 default_keywords = []
709 elif opt in ('-n', '--add-location'):
710 options.writelocations = 1
711 elif opt in ('--no-location',):
712 options.writelocations = 0
713 elif opt in ('-S', '--style'):
714 options.locationstyle = locations.get(arg.lower())
715 if options.locationstyle is None:
716 usage(1, _('Invalid value for --style: %s') % arg)
717 elif opt in ('-o', '--output'):
718 options.outfile = arg
719 elif opt in ('-p', '--output-dir'):
720 options.outpath = arg
721 elif opt in ('-v', '--verbose'):
722 options.verbose = 1
723 elif opt in ('-V', '--version'):
724 print _('pygettext.py (xgettext for Python) %s') % __version__
725 sys.exit(0)
726 elif opt in ('-w', '--width'):
727 try:
728 options.width = int(arg)
729 except ValueError:
730 usage(1, _('--width argument must be an integer: %s') % arg)
731 elif opt in ('-x', '--exclude-file'):
732 options.excludefilename = arg
733 elif opt in ('-X', '--no-docstrings'):
734 fp = open(arg)
735 try:
736 while 1:
737 line = fp.readline()
738 if not line:
739 break
740 options.nodocstrings[line[:-1]] = 1
741 finally:
742 fp.close()
743
744
745 make_escapes(0)
746
747
748 options.keywords.extend(default_keywords)
749
750
751 if options.excludefilename:
752 try:
753 fp = open(options.excludefilename)
754 options.toexclude = fp.readlines()
755 fp.close()
756 except IOError:
757 print >> sys.stderr, _(
758 "Can't read --exclude-file: %s") % options.excludefilename
759 sys.exit(1)
760 else:
761 options.toexclude = []
762
763
764 expanded = []
765 for arg in args:
766 if arg == '-':
767 expanded.append(arg)
768 else:
769 expanded.extend(getFilesForName(arg))
770 args = expanded
771
772
773 eater = TokenEater(options)
774 for filename in args:
775 if filename == '-':
776 if options.verbose:
777 print _('Reading standard input')
778 fp = sys.stdin
779 closep = 0
780
781 else:
782 if options.verbose:
783 print _('Working on %s') % filename
784 fp = open(filename)
785 eater.set_file_encoding(fp)
786 closep = 1
787
788 try:
789 eater.set_filename(filename)
790 if os.path.splitext(filename)[-1].lower() == '.kid':
791 try:
792 eater.extract_kid_strings()
793 except Exception, e:
794 print >> sys.stderr, "Kid eater exception:", e
795
796 elif os.path.splitext(filename)[-1].lower() == '.html':
797 try:
798 eater.extract_genshi_strings()
799 except Exception, e:
800 print >> sys.stderr, "Genshi eater exception:", e
801
802 else:
803 try:
804 tokenize.tokenize(fp.readline, eater)
805 except tokenize.TokenError, e:
806 print >> sys.stderr, '%s: %s, line %d, column %d' % (
807 e[0], filename, e[1][0], e[1][1])
808
809 finally:
810 if closep:
811 fp.close()
812
813
814 if options.outfile == '-':
815 fp = sys.stdout
816 closep = 0
817
818 else:
819 if options.outpath:
820 options.outfile = os.path.join(options.outpath, options.outfile)
821
822 fp = open(options.outfile, 'w+')
823 closep = 1
824
825 try:
826 eater.write(fp)
827
828 finally:
829 if closep:
830 fp.close()
831
832
if __name__ == '__main__':
    main()
    # Some more test strings, for the benefit of running pygettext over
    # its own source:
    _(u'a unicode string')
    # This one triggers the "unexpected token" warning path.
    _('*** Seen unexpected token "%(token)s"') % {'token': 'test'}
    _('more' 'than' 'one' 'string')
840