diff options
-rw-r--r-- | README.rst | 6 | ||||
-rwxr-xr-x | bin/wikitrans | 45 | ||||
-rw-r--r-- | wikitrans/wikimarkup.py | 35 | ||||
-rw-r--r-- | wikitrans/wikitoken.py | 2 |
4 files changed, 62 insertions, 26 deletions
@@ -37,6 +37,12 @@ media_base = *url* Base URL for media files. Default is ``http://www.mediawiki.org/xml/export-0.3`` +debug_level = *int* + Debug verbosity level (0 - no debug info, 100 - excessively + copious debug messages). Default is 0. + +strict = *bool* + Strict parsing mode. Throw exceptions on syntax errors. Default is False. class ``TextWikiMarkup`` ------------------------ diff --git a/bin/wikitrans b/bin/wikitrans index 4a0fc06..09ba0b3 100755 --- a/bin/wikitrans +++ b/bin/wikitrans @@ -1,17 +1,17 @@ #!/usr/bin/python # -*- coding: utf-8 -*- # Copyright (C) 2008-2018 Sergey Poznyakoff -# +# # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. -# +# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. -# +# # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. @@ -32,13 +32,13 @@ from wikitrans.wiki2texi import TexiWikiMarkup from wikitrans.wikimarkup import WikiMarkup from wikitrans.wikidump import DumpWikiMarkup -# Set utf-8 as the default encoding for Python 2.7. -# Trying to do so using encode('utf_8')/unicode, which is +# Set utf-8 as the default encoding for Python 2.7. +# Trying to do so using encode('utf_8')/unicode, which is # supposed to be the right way, does not work in Python 2.7 # Simply calling sys.setdefaultencoding is not possible, -# because, for some obscure reason, Python chooses to delete -# this symbol from the namespace after setting its default -# encoding in site.py. That's why reload is needed. +# because, for some obscure reason, Python chooses to delete +# this symbol from the namespace after setting its default +# encoding in site.py. That's why reload is needed. try: reload(sys) sys.setdefaultencoding('utf-8') @@ -71,8 +71,13 @@ def setkw(option, opt, value, parser): if val: parser.values.kwdict[kw] = val +def setdebug(option, opt, value, parser): + if not parser.values.kwdict: + parser.values.kwdict = {} + parser.values.kwdict['debug_level'] = value + def getwiki(url, options): - tmp = tempfile.NamedTemporaryFile() + tmp = tempfile.NamedTemporaryFile() if sys.version_info[0] > 2: import urllib.request with urllib.request.urlopen(url) as u: @@ -95,9 +100,9 @@ def getwiki(url, options): options.kwdict['html_base'] = m.group('url') + '/wiki/' if m.group('root') == 'wiktionary': options.itype = 'wiktionary' - + options.kwdict['text'] = text.text.encode() - + def main(): usage = '%prog [OPTIONS] ARG' version = '%prog 1.2' @@ -106,10 +111,10 @@ If ARG looks like a URL, the wiki text to be converted will be downloaded from that URL. Otherwise, if --base-url is given, ARG is treated as the name of the page to get from the WikiMedia istallation at that URL. -Otherwise, ARG is name of the file to read wiki material from. +Otherwise, ARG is name of the file to read wiki material from. """ epilog = "Report bugs to: <gray+wikitrans@gnu.org.ua>" - + parser = OptionParser(usage=usage, version=version, description=description, @@ -135,8 +140,8 @@ Otherwise, ARG is name of the file to read wiki material from. default={}, help='set keyword option for the parser class constructor') parser.add_option('-d', '--debug', - action='store', type='int', dest='debug', - default=0, + action='callback', callback=setdebug, + type='int', dest='kwdict', help='set debug level (0..100)') parser.add_option('-D', '--dump', action='store_const', const='dump', @@ -145,12 +150,13 @@ Otherwise, ARG is name of the file to read wiki material from. parser.add_option('-b', '--base-url', action='store', type='string', dest='base_url', help='set base url') - + (options, args) = parser.parse_args() if len(args) == 1: if options.base_url: - getwiki(options.base_url + '/wiki/Special:Export/' + args[0], options) + getwiki(options.base_url + '/wiki/Special:Export/' + args[0], + options) elif args[0] == '-': options.kwdict['file'] = sys.stdin elif re.match('^(http|ftp)s?://',args[0]): @@ -159,7 +165,7 @@ Otherwise, ARG is name of the file to read wiki material from. options.kwdict['filename'] = args[0] else: parser.error("bad number of arguments") - + options.kwdict['lang'] = options.lang # FIXME if options.otype == 'dump' and not 'indent' in options.kwdict: @@ -167,7 +173,6 @@ Otherwise, ARG is name of the file to read wiki material from. if options.otype in handlers: if options.itype in handlers[options.otype]: markup = handlers[options.otype][options.itype](**options.kwdict) - markup.debug_level = options.debug markup.parse() print("%s" % str(markup)) exit(0) @@ -178,4 +183,4 @@ Otherwise, ARG is name of the file to read wiki material from. exit(1) if __name__ == '__main__': - main() + main() diff --git a/wikitrans/wikimarkup.py b/wikitrans/wikimarkup.py index 19f69e6..1e2429f 100644 --- a/wikitrans/wikimarkup.py +++ b/wikitrans/wikimarkup.py @@ -145,6 +145,13 @@ class WikiMarkupParser(object): Public attributes: + Input: + debug_level -- debug verbosity level (0 - no debug info, 100 - excessively + copious debug messages). Default is 0. + strict -- if True, parser will throw exception upon encountering + invalid markup tag (mostly for future use) + + Output: tree -- constructed parse tree (a subclass of WikiNode) """ @@ -175,6 +182,7 @@ class WikiMarkupParser(object): tags = [ 'code', 'nowiki', 'tt', 'div', 'ref', 'references' ] debug_level = 0 + strict = False def dprint(self, lev, fmt, *argv): """If current debug level is greater than or equal to lev, print *argv @@ -187,7 +195,7 @@ class WikiMarkupParser(object): inline_delims = [ "''", "'''", "[", "]", "[[", "]]", "{{", "}}", "|" ] token_class = {} - + def _new_node(self, **kwarg): return self.token_class[kwarg['type']](self, **kwarg) @@ -374,7 +382,7 @@ class WikiMarkupParser(object): stack.append(i) # Redefine all non-matched tokens as TEXT for i in stack: - # FIXME + # FIXME: How to convert node to TEXT? self.toklist[i] = self._new_node(type='TEXT', content=str(self.toklist[i])) @@ -492,7 +500,10 @@ class WikiMarkupParser(object): flush() acc['seq'].append(self.parse_inline_delim(tok)) else: - raise UnexpectedTokenError(tok) + if self.strict: + raise UnexpectedTokenError(tok) + # FIXME: Another possible variant of handling this case is to + # convert tok to TEXT node and append it to acc['seq'] tok = self.getkn() flush() if acc['seq']: @@ -976,6 +987,13 @@ class WikiMarkup(WikiMarkupParser): media_base=URL Base URL for media files. Default is 'http://www.mediawiki.org/xml/export-0.3' + + debug_level=INT + debug verbosity level (0 - no debug info, 100 - excessively + copious debug messages). Default is 0. + strict=BOOL + Strict parsing mode. Throw exceptions on syntax errors. Default + is False. """ self.token_class = { 'NIL': WikiNode, @@ -999,14 +1017,17 @@ class WikiMarkup(WikiMarkupParser): 'LINK': WikiSeqNode, 'HDR': WikiHdrNode } - + for kw in keywords: if kw == 'file': self.file = keywords[kw] elif kw == 'filename': self.file = open(keywords[kw]) elif kw == 'text': - self.text = keywords[kw].split("\n") + if sys.version_info[0] > 2: + self.text = keywords[kw].decode('utf-8').split("\n") + else: + self.text = keywords[kw].split("\n") elif kw == 'lang': self.lang = keywords[kw] elif kw == 'html_base': @@ -1015,6 +1036,10 @@ class WikiMarkup(WikiMarkupParser): self.image_base = keywords[kw] elif kw == 'media_base': self.media_base = keywords[kw] + elif kw == 'strict': + self.strict = keywords[kw] + elif kw == 'debug_level': + self.debug_level = keywords[kw] def __del__(self): if self.file: diff --git a/wikitrans/wikitoken.py b/wikitrans/wikitoken.py index deedea8..1f81092 100644 --- a/wikitrans/wikitoken.py +++ b/wikitrans/wikitoken.py @@ -311,7 +311,7 @@ class WikiEnvNode(WikiContentNode): return { 'envtype': self.envtype, 'level': self.level, - 'content': map(lambda x: x.json_encode(), self.content) + 'content': [x for x in map(lambda x: x.json_encode(), self.content)] } |