diff options
author | astokes_rhn <astokes_rhn@ef72aa8b-4018-0410-8976-d6e080ef94d8> | 2007-07-25 15:10:20 +0000 |
---|---|---|
committer | astokes_rhn <astokes_rhn@ef72aa8b-4018-0410-8976-d6e080ef94d8> | 2007-07-25 15:10:20 +0000 |
commit | 7e9c9b84a6449f70c768a2334c1a68ce73499cd0 (patch) | |
tree | 7072d53ee8d1ac0aa4d848dfc47640a6bbcde2ce /trunk/src/tools/pygettext.py | |
parent | 28f13dc4aec9d474447a5021eb45409bcba81182 (diff) | |
download | sos-7e9c9b84a6449f70c768a2334c1a68ce73499cd0.tar.gz |
branching off for stokes work
git-svn-id: svn+ssh://svn.fedorahosted.org/svn/sos/trunk@223 ef72aa8b-4018-0410-8976-d6e080ef94d8
Diffstat (limited to 'trunk/src/tools/pygettext.py')
-rw-r--r-- | trunk/src/tools/pygettext.py | 762 |
1 files changed, 762 insertions, 0 deletions
diff --git a/trunk/src/tools/pygettext.py b/trunk/src/tools/pygettext.py new file mode 100644 index 00000000..040b5c7f --- /dev/null +++ b/trunk/src/tools/pygettext.py @@ -0,0 +1,762 @@ +#! /usr/bin/env python +# -*- coding: iso-8859-1 -*- +# Originally written by Barry Warsaw <barry@zope.com> +# +# Minimally patched to make it even more xgettext compatible +# by Peter Funk <pf@artcom-gmbh.de> +# +# 2002-11-22 Jürgen Hermann <jh@web.de> +# Added checks that _() only contains string literals, and +# command line args are resolved to module lists, i.e. you +# can now pass a filename, a module or package name, or a +# directory (including globbing chars, important for Win32). +# Made docstring fit in 80 chars wide displays using pydoc. +# + +import codecs + +# for selftesting +import re +try: + import fintl + _ = fintl.gettext +except ImportError: + _ = lambda s: s + +__doc__ = _("""pygettext -- Python equivalent of xgettext(1) + +Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the +internationalization of C programs. Most of these tools are independent of +the programming language and can be used from within Python programs. +Martin von Loewis' work[1] helps considerably in this regard. + +There's one problem though; xgettext is the program that scans source code +looking for message strings, but it groks only C (or C++). Python +introduces a few wrinkles, such as dual quoting characters, triple quoted +strings, and raw strings. xgettext understands none of this. + +Enter pygettext, which uses Python's standard tokenize module to scan +Python source code, generating .pot files identical to what GNU xgettext[2] +generates for C and C++ code. From there, the standard GNU tools can be +used. + +A word about marking Python strings as candidates for translation. GNU +xgettext recognizes the following keywords: gettext, dgettext, dcgettext, +and gettext_noop. But those can be a lot of text to include all over your +code. C and C++ have a trick: they use the C preprocessor. Most +internationalized C source includes a #define for gettext() to _() so that +what has to be written in the source is much less. Thus these are both +translatable strings: + + gettext("Translatable String") + _("Translatable String") + +Python of course has no preprocessor so this doesn't work so well. Thus, +pygettext searches only for _() by default, but see the -k/--keyword flag +below for how to augment this. + + [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html + [2] http://www.gnu.org/software/gettext/gettext.html + +NOTE: pygettext attempts to be option and feature compatible with GNU +xgettext where ever possible. However some options are still missing or are +not fully implemented. Also, xgettext's use of command line switches with +option arguments is broken, and in these cases, pygettext just defines +additional switches. + +Usage: pygettext [options] inputfile ... + +Options: + + -a + --extract-all + Extract all strings. + + -d name + --default-domain=name + Rename the default output file from messages.pot to name.pot. + + -E + --escape + Replace non-ASCII characters with octal escape sequences. + + -D + --docstrings + Extract module, class, method, and function docstrings. These do + not need to be wrapped in _() markers, and in fact cannot be for + Python to consider them docstrings. (See also the -X option). + + -h + --help + Print this help message and exit. + + -k word + --keyword=word + Keywords to look for in addition to the default set, which are: + %(DEFAULTKEYWORDS)s + + You can have multiple -k flags on the command line. + + -K + --no-default-keywords + Disable the default set of keywords (see above). Any keywords + explicitly added with the -k/--keyword option are still recognized. + + --no-location + Do not write filename/lineno location comments. + + -n + --add-location + Write filename/lineno location comments indicating where each + extracted string is found in the source. These lines appear before + each msgid. The style of comments is controlled by the -S/--style + option. This is the default. + + -o filename + --output=filename + Rename the default output file from messages.pot to filename. If + filename is `-' then the output is sent to standard out. + + -p dir + --output-dir=dir + Output files will be placed in directory dir. + + -S stylename + --style stylename + Specify which style to use for location comments. Two styles are + supported: + + Solaris # File: filename, line: line-number + GNU #: filename:line + + The style name is case insensitive. GNU style is the default. + + -v + --verbose + Print the names of the files being processed. + + -V + --version + Print the version of pygettext and exit. + + -w columns + --width=columns + Set width of output to columns. + + -x filename + --exclude-file=filename + Specify a file that contains a list of strings that are not be + extracted from the input files. Each string to be excluded must + appear on a line by itself in the file. + + -X filename + --no-docstrings=filename + Specify a file that contains a list of files (one per line) that + should not have their docstrings extracted. This is only useful in + conjunction with the -D option above. + +If `inputfile' is -, standard input is read. +""") + +import os +import imp +import sys +import glob +import time +import getopt +import token +import tokenize +import operator +import codecs + +from elementtree.ElementTree import ElementTree, XML + +__version__ = '1.5' + +default_keywords = ['_'] +DEFAULTKEYWORDS = ', '.join(default_keywords) + +EMPTYSTRING = '' + + + +# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's +# there. +pot_header = _('''\ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) YEAR ORGANIZATION +# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR. +# +msgid "" +msgstr "" +"Project-Id-Version: PACKAGE VERSION\\n" +"POT-Creation-Date: %(time)s\\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n" +"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n" +"Language-Team: LANGUAGE <LL@li.org>\\n" +"MIME-Version: 1.0\\n" +"Content-Type: text/plain; charset=%(charset)s\\n" +"Content-Transfer-Encoding: %(charset)s\\n" +"Generated-By: pygettext.py %(version)s\\n" + +''') + + +def usage(code, msg=''): + print >> sys.stderr, __doc__ % globals() + if msg: + print >> sys.stderr, msg + sys.exit(code) + + + +escapes = [] + +def make_escapes(pass_iso8859): + global escapes + if pass_iso8859: + # Allow iso-8859 characters to pass through so that e.g. 'msgid + # "Höhe"' would result not result in 'msgid "H\366he"'. Otherwise we + # escape any character outside the 32..126 range. + mod = 128 + else: + mod = 256 + for i in range(256): + if 32 <= (i % mod) <= 126: + escapes.append(chr(i)) + else: + escapes.append("\\%03o" % i) + escapes[ord('\\')] = '\\\\' + escapes[ord('\t')] = '\\t' + escapes[ord('\r')] = '\\r' + escapes[ord('\n')] = '\\n' + escapes[ord('\"')] = '\\"' + + +def escape_ascii(s): + "Escapes all text outside of 7bit ASCII plus control characters and Python literals." + global escapes + s = list(s) + for i in range(len(s)): + s[i] = escapes[ord(s[i])] + return EMPTYSTRING.join(s) + +def escape_unicode(s): + "Escapes control characters and Python literals only leaving non-ascii text intact." + #for sp in ('\t', '\r', '\n', '\"', '\\'): + s = s.replace('\\', '\\\\') + s = s.replace('\t', '\\t') + s = s.replace('\r', '\\r') + s = s.replace('\n', '\\n') + s = s.replace('\"', '\\"') + # escape control chars + def repl(m): return "\\%03o" % ord(m.group(0)) + s = re.sub('[\001-\037]', repl, s) + return s + +def safe_eval(s): + # unwrap quotes, safely + return eval(s, {'__builtins__':{}}, {}) + + +def normalize(s, escape=False): + # This converts the various Python string types into a format that is + # appropriate for .po files, namely much closer to C style. + lines = s.split('\n') + if len(lines) == 1: + s = '"' + escape_unicode(s) + '"' + else: + if not lines[-1]: + del lines[-1] + lines[-1] = lines[-1] + '\n' + for i in range(len(lines)): + lines[i] = escape_unicode(lines[i]) + lineterm = '\\n"\n"' + s = '""\n"' + lineterm.join(lines) + '"' + if isinstance(s, unicode): + s = s.encode('utf-8') + if escape: + def repl(m): return "\\%03o" % ord(m.group(0)) + s = re.sub('[\200-\377]', repl, s) + return s + + +def containsAny(str, set): + """Check whether 'str' contains ANY of the chars in 'set'""" + return 1 in [c in str for c in set] + + +def _visit_pyfiles(list, dirname, names): + """Helper for getFilesForName().""" + # get extension for python source files + if not globals().has_key('_py_ext'): + global _py_ext + _py_ext = [triple[0] for triple in imp.get_suffixes() + if triple[2] == imp.PY_SOURCE][0] + + # don't recurse into CVS directories + if 'CVS' in names: + names.remove('CVS') + if '.svn' in names: + names.remove('.svn') + + # add all *.py files to list + list.extend( + [os.path.join(dirname, file) for file in names + if os.path.splitext(file)[1] == _py_ext] + ) + + +def _get_modpkg_path(dotted_name, pathlist=None): + """Get the filesystem path for a module or a package. + + Return the file system path to a file for a module, and to a directory for + a package. Return None if the name is not found, or is a builtin or + extension module. + """ + # split off top-most name + parts = dotted_name.split('.', 1) + + if len(parts) > 1: + # we have a dotted path, import top-level package + try: + file, pathname, description = imp.find_module(parts[0], pathlist) + if file: file.close() + except ImportError: + return None + + # check if it's indeed a package + if description[2] == imp.PKG_DIRECTORY: + # recursively handle the remaining name parts + pathname = _get_modpkg_path(parts[1], [pathname]) + else: + pathname = None + else: + # plain name + try: + file, pathname, description = imp.find_module( + dotted_name, pathlist) + if file: + file.close() + if description[2] not in [imp.PY_SOURCE, imp.PKG_DIRECTORY]: + pathname = None + except ImportError: + pathname = None + + return pathname + + +def getFilesForName(name): + """Get a list of module files for a filename, a module or package name, + or a directory. + """ + if not os.path.exists(name): + # check for glob chars + if containsAny(name, "*?[]"): + files = glob.glob(name) + list = [] + for file in files: + list.extend(getFilesForName(file)) + return list + + # try to find module or package + name = _get_modpkg_path(name) + if not name: + return [] + + if os.path.isdir(name): + # find all python files in directory + list = [] + os.path.walk(name, _visit_pyfiles, list) + return list + elif os.path.exists(name): + # a single file + return [name] + + return [] + + +class TokenEater: + def __init__(self, options): + self.__options = options + self.__messages = {} + self.__state = self.__waiting + self.__data = [] + self.__lineno = -1 + self.__freshmodule = 1 + self.__curfile = None + self.__encoding = None + + def __call__(self, ttype, tstring, stup, etup, line): + # dispatch +## import token +## print >> sys.stderr, 'ttype:', token.tok_name[ttype], \ +## 'tstring:', tstring + self.__state(ttype, tstring, stup[0]) + + def __waiting(self, ttype, tstring, lineno): + opts = self.__options + # Do docstring extractions, if enabled + if opts.docstrings and not opts.nodocstrings.get(self.__curfile): + # module docstring? + if self.__freshmodule: + if ttype == tokenize.STRING: + self.__addentry(safe_eval(tstring), lineno, isdocstring=1) + self.__freshmodule = 0 + elif ttype not in (tokenize.COMMENT, tokenize.NL): + self.__freshmodule = 0 + return + # class docstring? + if ttype == tokenize.NAME and tstring in ('class', 'def'): + self.__state = self.__suiteseen + return + if ttype == tokenize.NAME and tstring in opts.keywords: + self.__state = self.__keywordseen + + def __suiteseen(self, ttype, tstring, lineno): + # ignore anything until we see the colon + if ttype == tokenize.OP and tstring == ':': + self.__state = self.__suitedocstring + + def __suitedocstring(self, ttype, tstring, lineno): + # ignore any intervening noise + if ttype == tokenize.STRING: + self.__addentry(safe_eval(tstring), lineno, isdocstring=1) + self.__state = self.__waiting + elif ttype not in (tokenize.NEWLINE, tokenize.INDENT, + tokenize.COMMENT): + # there was no class docstring + self.__state = self.__waiting + + def __keywordseen(self, ttype, tstring, lineno): + if ttype == tokenize.OP and tstring == '(': + self.__data = [] + self.__lineno = lineno + self.__state = self.__openseen + else: + self.__state = self.__waiting + + def __openseen(self, ttype, tstring, lineno): + if ttype == tokenize.OP and tstring == ')': + # We've seen the last of the translatable strings. Record the + # line number of the first line of the strings and update the list + # of messages seen. Reset state for the next batch. If there + # were no strings inside _(), then just ignore this entry. + if self.__data: + self.__addentry(EMPTYSTRING.join(self.__data)) + self.__state = self.__waiting + elif ttype == tokenize.STRING: + self.__data.append(safe_eval(tstring)) + elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT, + token.NEWLINE, tokenize.NL]: + # warn if we see anything else than STRING or whitespace + print >> sys.stderr, _( + '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"' + ) % { + 'token': tstring, + 'file': self.__curfile, + 'lineno': self.__lineno + } + self.__state = self.__waiting + + def __addentry(self, msg, lineno=None, isdocstring=0, iskidstring=0): + # tokenize module always return unicode strings + # even when they are in fact coded string instances + # to deal with this we use a hack: + # evaluate string's representation without leading "u" + # to force interpration as coded string + # then we decode it using already known file's encoding + if not iskidstring: + if type(msg) is str: + msg = eval(repr(msg)) + else: + msg = eval(repr(msg)[1:]) + msg = msg.decode(self.__encoding) + if lineno is None: + lineno = self.__lineno + if not msg in self.__options.toexclude: + entry = (self.__curfile, lineno) + self.__messages.setdefault(msg, {})[entry] = isdocstring + + def set_filename(self, filename): + self.__curfile = filename + self.__freshmodule = 1 + + def set_file_encoding(self, fp): + """Searches for -*- coding: -*- magic comment to find out file encoding.""" + self.__encoding = 'utf-8' # reset to default for each new file + for line in fp.readlines()[:5]: + m = re.match('#\s*-\*-\s+coding:\s+(\w+)\s+-\*-', line) + if m: + self.__encoding = m.group(1) + break + fp.seek(0) + + def contains_inline_python(self,msg): + if '${' in msg and not '$${' in msg: return True + return False + + def strip_namespace_uri(self,tag): + return tag.split('}')[-1] + + def get_text_node(self,node): + tag = re.sub('({[^}]+})?(\w+)', '\\2', node.tag) + + if node.text: + msg = node.text.strip() + if msg and not self.contains_inline_python(msg): + if tag not in ['script','style']: + self.__addentry(msg,self.strip_namespace_uri(node.tag), iskidstring=1) + + if node.getchildren(): + for child in node: self.get_text_node(child) + + if node.tail: + msg = node.tail.strip() + if msg and not self.contains_inline_python(msg): + self.__addentry(msg,self.strip_namespace_uri(node.tag), iskidstring=1) + + def extract_kid_strings(self): + if not self.__curfile: return + f = None + try: + file = open(self.__curfile) + f = ElementTree(XML( fixentities(file.read() ))) + except Exception, e: + print 'Skip %s: %s' % (self.__curfile, e) + return + + node = f.getroot() + self.get_text_node(node) + + def write(self, fp): + options = self.__options + # format without tz information + # because %Z is timezone's name, not offset + # and, say, on localized Windows XP this is non-ascii string + timestamp = time.strftime('%Y-%m-%d %H:%M') + # The time stamp in the header doesn't have the same format as that + # generated by xgettext... + t = {'time': timestamp, 'version': __version__, 'charset':'utf-8'} + print >> fp, pot_header % t + # Sort the entries. First sort each particular entry's keys, then + # sort all the entries by their first item. + reverse = {} + for k, v in self.__messages.items(): + keys = v.keys() + keys.sort() + reverse.setdefault(tuple(keys), []).append((k, v)) + rkeys = reverse.keys() + rkeys.sort() + for rkey in rkeys: + rentries = reverse[rkey] + rentries.sort() + for k, v in rentries: + isdocstring = 0 + # If the entry was gleaned out of a docstring, then add a + # comment stating so. This is to aid translators who may wish + # to skip translating some unimportant docstrings. + if reduce(operator.__add__, v.values()): + isdocstring = 1 + # k is the message string, v is a dictionary-set of (filename, + # lineno) tuples. We want to sort the entries in v first by + # file name and then by line number. + v = v.keys() + v.sort() + if not options.writelocations: + pass + # location comments are different b/w Solaris and GNU: + elif options.locationstyle == options.SOLARIS: + for filename, lineno in v: + d = {'filename': filename, 'lineno': lineno} + print >>fp, _( + '# File: %(filename)s, line: %(lineno)s') % d + elif options.locationstyle == options.GNU: + # fit as many locations on one line, as long as the + # resulting line length doesn't exceeds 'options.width' + locline = '#:' + for filename, lineno in v: + d = {'filename': filename, 'lineno': lineno} + s = _(' %(filename)s:%(lineno)s') % d + if len(locline) + len(s) <= options.width: + locline = locline + s + else: + print >> fp, locline + locline = "#:" + s + if len(locline) > 2: + print >> fp, locline + if isdocstring: + print >> fp, '#, docstring' + if k: # do not output empty msgid + print >> fp, 'msgid', normalize(k, options.escape) + print >> fp, 'msgstr ""\n' + + +def main(): + global default_keywords + try: + opts, args = getopt.getopt( + sys.argv[1:], + 'ad:UDEhk:Kno:p:S:Vvw:x:X:', + ['extract-all', 'default-domain=', 'escape', 'help', + 'keyword=', 'no-default-keywords', + 'add-location', 'no-location', 'output=', 'output-dir=', + 'style=', 'verbose', 'version', 'width=', 'exclude-file=', + 'docstrings', 'no-docstrings', 'support-unicode', + ]) + except getopt.error, msg: + usage(1, msg) + + # for holding option values + class Options: + # constants + GNU = 1 + SOLARIS = 2 + # defaults + extractall = 0 # FIXME: currently this option has no effect at all. + escape = 0 + keywords = [] + outpath = '' + outfile = 'messages.pot' + writelocations = 1 + locationstyle = GNU + verbose = 0 + width = 78 + excludefilename = '' + docstrings = 0 + nodocstrings = {} + + options = Options() + locations = {'gnu' : options.GNU, + 'solaris' : options.SOLARIS, + } + + # parse options + for opt, arg in opts: + if opt in ('-h', '--help'): + usage(0) + elif opt in ('-a', '--extract-all'): + options.extractall = 1 + elif opt in ('-d', '--default-domain'): + options.outfile = arg + '.pot' + elif opt in ('-E', '--escape'): + options.escape = 1 + elif opt in ('-D', '--docstrings'): + options.docstrings = 1 + elif opt in ('-k', '--keyword'): + options.keywords.append(arg) + elif opt in ('-K', '--no-default-keywords'): + default_keywords = [] + elif opt in ('-n', '--add-location'): + options.writelocations = 1 + elif opt in ('--no-location',): + options.writelocations = 0 + elif opt in ('-S', '--style'): + options.locationstyle = locations.get(arg.lower()) + if options.locationstyle is None: + usage(1, _('Invalid value for --style: %s') % arg) + elif opt in ('-o', '--output'): + options.outfile = arg + elif opt in ('-p', '--output-dir'): + options.outpath = arg + elif opt in ('-v', '--verbose'): + options.verbose = 1 + elif opt in ('-V', '--version'): + print _('pygettext.py (xgettext for Python) %s') % __version__ + sys.exit(0) + elif opt in ('-w', '--width'): + try: + options.width = int(arg) + except ValueError: + usage(1, _('--width argument must be an integer: %s') % arg) + elif opt in ('-x', '--exclude-file'): + options.excludefilename = arg + elif opt in ('-X', '--no-docstrings'): + fp = open(arg) + try: + while 1: + line = fp.readline() + if not line: + break + options.nodocstrings[line[:-1]] = 1 + finally: + fp.close() + + # calculate escapes + make_escapes(0) + + # calculate all keywords + options.keywords.extend(default_keywords) + + # initialize list of strings to exclude + if options.excludefilename: + try: + fp = open(options.excludefilename) + options.toexclude = fp.readlines() + fp.close() + except IOError: + print >> sys.stderr, _( + "Can't read --exclude-file: %s") % options.excludefilename + sys.exit(1) + else: + options.toexclude = [] + + # resolve args to module lists + expanded = [] + for arg in args: + if arg == '-': + expanded.append(arg) + else: + expanded.extend(getFilesForName(arg)) + args = expanded + + # slurp through all the files + eater = TokenEater(options) + for filename in args: + if filename == '-': + if options.verbose: + print _('Reading standard input') + fp = sys.stdin + closep = 0 + else: + if options.verbose: + print _('Working on %s') % filename + fp = open(filename) + eater.set_file_encoding(fp) + closep = 1 + try: + eater.set_filename(filename) + try: + tokenize.tokenize(fp.readline, eater) + except tokenize.TokenError, e: + print >> sys.stderr, '%s: %s, line %d, column %d' % ( + e[0], filename, e[1][0], e[1][1]) + finally: + if closep: + fp.close() + + if os.path.splitext(filename)[-1].lower() == '.kid': eater.extract_kid_strings() + + # write the output + if options.outfile == '-': + fp = sys.stdout + closep = 0 + else: + if options.outpath: + options.outfile = os.path.join(options.outpath, options.outfile) + fp = open(options.outfile, 'wt') + closep = 1 + try: + eater.write(fp) + finally: + if closep: + fp.close() + + +if __name__ == '__main__': + main() + # some more test strings + _(u'a unicode string') + # this one creates a warning + _('*** Seen unexpected token "%(token)s"') % {'token': 'test'} + _('more' 'than' 'one' 'string') |