1 files changed, 1285 insertions, 0 deletions
diff --git a/wikitrans/wikimarkup.py b/wikitrans/wikimarkup.py
new file mode 100644
index 0000000..77c3b30
--- /dev/null
+++ b/wikitrans/wikimarkup.py
@@ -0,0 +1,1285 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2008-2018 Sergey Poznyakoff
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+"""
+Wiki markup parser.
+
+This module provides two class:
+
+WikiMarkupParser:
+   An abstract parser class, which serves as a base class for all markup
+   classes in this package.
+
+WikiMarkup
+   A subclass of the above, providing basic input method.
+
+"""
+
+from __future__ import print_function
+import sys
+import re
+from types import *
+from wikitrans.wikitoken import *
+
+__all__ = [ "WikiMarkupParser", "WikiMarkup",
+            "TagAttributes", "TagAttributeSyntaxError" ]
+
+class UnexpectedTokenError(Exception):
+    def __init__(self, value):
+        self.value = value
+
+class TagAttributeSyntaxError(Exception):
+    def __init__(self, value):
+        self.value = value
+    def __str__(self):
+        return repr(self.value)
+
+class TagAttributes(object):
+    """A dictionary-like collection of tag attributes.
+
+    Example:
+
+      attr = TagAttributes('href="foo" length=2')
+      if 'href' in attr:
+          print(x['href'])   # returns "foo"
+      for a in attr:
+          ...
+    """
+    
+    attrstart = re.compile("^(?P<attr>[a-zA-Z0-9_-]+)(?P<eq>=\")?")
+    valseg = re.compile("^[^\\\"]+")
+    tab = {}
+    printable = None
+    def __init__(self, string):
+        if not string:
+            self.printable = ''
+            return
+        self.printable = string
+        s = string
+        self.tab = {}
+        while s != '':
+            s = s.strip()
+            m = self.attrstart.match(s)
+            if m:
+                name = m.group('attr')
+                val = ''
+                s = s[m.end(0):]
+                if m.group('eq'):
+                    while 1:
+                        m = self.valseg.match(s)
+                        val += m.group(0)
+                        s = s[m.end(0):]
+                        if s[0] == '\\':
+                            val += s[1]
+                            s += 2
+                        elif s[0] == '"':
+                            s = s[1:]
+                            break
+                else:
+                    val = 1
+                self.tab[name] = val
+            else:
+                raise TagAttributeSyntaxError(s)
+    def __len__(self):
+        return len(self.tab)
+    def __getitem__(self, key):
+        return self.tab[key]
+    def __contains__(self, key):
+        return key in self.tab
+    def __iter__(self):
+        for key in self.tab:
+            yield(key)
+    def has_key(self, key):
+        return self.__contains__(key)
+    def __setitem__(self, key, value):
+        self.tab[key] = value
+    def __delitem__(self, key):
+        del self.tab[key]
+    def __str__(self):
+        return self.printable
+    def __repr__(self):
+        return self.printable
+
+class WikiMarkupParser(object):
+    """Parser for Wiki markup language.
+
+    Given input in Wiki markup language creates an abstract parse tree for it.
+    This is a base class for actual parsers. The subclasses must provide the
+    input method.
+
+    Public methods:
+
+    parse()  --  parse the input.
+
+    Abstract methods (must be overridden by the subclass):
+
+    input()  --  returns next physical line from the input material.
+
+    Public attributes:
+
+    tree     --  constructed parse tree (a subclass of WikiNode)
+
+    """
+
+    delim = re.compile("^==+[ \t]*|[ \t]*==+[ \t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)|<")
+    otag = re.compile("<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>[^/][^>]+))?\s*(?P<closed>/)?>")
+    ctag = re.compile("</(?P<tag>[a-zA-Z0-9_]+)\s*>")
+    refstart = re.compile("^https?://")
+
+    close_delim = {
+        '[': ']',
+        '[[': ']]',
+        '{{': '}}'
+    }
+
+    # Environment types:
+    envtypes = { "*": [ "unnumbered", 0 ],
+                 "#": [ "numbered", 0 ],
+                 ";": [ "defn", 0 ],
+                 ":": [ "defn", 1 ]
+    }
+
+    toklist = None
+    tokind = 0
+    newline = 0
+    tree = None
+
+    tags = [ 'code', 'nowiki', 'tt', 'div', 'ref', 'references' ]
+
+    debug_level = 0
+
+    def dprint(self, lev, fmt, *argv):
+        """If current debug level is greater than or equal to lev, print *argv
+        according to format.
+        """
+        if self.debug_level >= lev:
+            for l in (fmt % argv).split('\n'):
+                print("[DEBUG] %s" % l)
+
+    inline_delims = [ "''", "'''", "[", "]", "[[", "]]", "{{", "}}", "|" ]
+
+    token_class = {
+        'NIL':   WikiNode,
+        'NL':    WikiNode,
+        'OTAG':  WikiTagNode,
+        'CTAG':  WikiTagNode,
+        'TAG':   WikiTagNode,
+        'DELIM': WikiDelimNode,
+        'TEXT':  WikiTextNode,
+        'PRE':   WikiContentNode,
+        'PARA':  WikiSeqNode,
+        'BAR':   WikiNode,
+        'SEQ':   WikiSeqNode,
+        'IND':   WikiIndNode,
+        'REF':   WikiRefNode,
+        'TMPL':  WikiSeqNode,
+        'IT':    WikiSeqNode,
+        'BOLD':  WikiSeqNode,
+        'ELT':   WikiEltNode,
+        'ENV':   WikiEnvNode,
+        'LINK':  WikiSeqNode,
+        'HDR':   WikiHdrNode
+    }
+
+    def _new_node(self,**kwarg):
+        return self.token_class[kwarg['type']](self, **kwarg)
+
+    def tokread(self):
+        """Read next token from the input. Return it as a subclass of WikiNode."""
+        line = None
+        pos = 0
+        while 1:
+            if (not line or pos == len(line)):
+                try:
+                    line = self.input()
+                    pos = 0
+                except StopIteration:
+                    line = u''
+
+            if not line or line == "":
+                yield(self._new_node(type='NIL'))
+                break
+
+            if line == '\n':
+                yield(self._new_node(type='NL'))
+                line = None
+                continue
+
+            self.dprint(100, "LINE: %s", line[pos:])
+            m = self.delim.search(line, pos)
+
+            if m:
+                if (pos < m.start(0)):
+                    yield(self._new_node(type='TEXT',
+                                         content=line[pos:m.start(0)]))
+                pos = m.start(0)
+                t = None
+
+                if line[m.start(0)] == '<':
+                    m = self.otag.match(line, pos)
+                    if m:
+                        pos = m.end(0)
+                        if m.group('tag') == 'nowiki':
+                            if not m.group('closed'):
+                                while 1:
+                                    try:
+                                        m = self.ctag.search(line, pos)
+                                        if m and m.group('tag') == 'nowiki':
+                                            yield(self._new_node(type='TEXT',
+                                                                 content=line[pos:m.start(0)] ))
+                                            pos = m.end(0)
+                                            break
+
+                                        yield(self._new_node(type='TEXT',
+                                                             content=line[pos:]))
+
+                                        line = self.input()
+                                        pos = 0
+                                    except StopIteration:
+                                        break
+                            continue
+                        elif m.group('tag') in self.tags:
+                            try:
+                                yield(self._new_node(type='OTAG',
+                                                  tag=m.group('tag'),
+                                                  isblock=(line[pos] == '\n'),
+                                                  args=TagAttributes(m.group('args'))))
+                                if m.group('closed'):
+                                    yield(self._new_node(type='CTAG',
+                                                         tag=m.group('tag')))
+                            except TagAttributeSyntaxError:
+                                yield(self._new_node(type='TEXT',
+                                                     content=m.group(0)))
+                            continue
+                        else:
+                            yield(self._new_node(type='TEXT',content=m.group(0)))
+                            continue
+                    else:
+                        m = self.ctag.match(line, pos)
+                        if m:
+                            if m.group('tag') in self.tags:
+                                yield(self._new_node(type='CTAG',
+                                                     tag=m.group('tag')))
+                                pos = m.end(0)
+                                continue
+                        else:
+                            yield(self._new_node(type='TEXT',
+                                                 content=line[pos:pos+1]))
+                            pos += 1
+                            continue
+                else:
+                    pos = m.end(0)
+                    content = m.group(0)
+                    if content[0] in self.envtypes:
+                        node = self._new_node(type='DELIM',
+                                              content=content,
+                                              isblock=True,
+                                              continuation=pos < len(line) and line[pos] == ":")
+                        if node.continuation:
+                            node.content += node.content[0]
+                            pos += 1
+
+                        yield(node)
+
+                        while pos < len(line) and line[pos] in [' ', '\t']:
+                            pos += 1
+                    else:
+                        yield(self._new_node(type='DELIM',
+                                             isblock=(content.strip() not in self.inline_delims),
+                                             content=content.strip()))
+                    continue
+
+            if line:
+                if line[-1] == '\n':
+                    if line[pos:-1] != '':
+                        yield(self._new_node(type='TEXT',content=line[pos:-1]))
+                    yield(self._new_node(type='NL'))
+                else:
+                    yield(self._new_node(type='TEXT',content=line[pos:]))
+                line = None
+
+
+    def input(self):
+        """Return next physical line from the input.
+
+        This method must be overridden by the subclass.
+        """
+        return None
+
+    def swaptkn(self, i, j):
+        """Swap tokens at indices i and j in toklist."""
+        self.dprint(80, "SWAPPING %s <-> %s", i, j)
+        x = self.toklist[i]
+        self.toklist[i] = self.toklist[j]
+        self.toklist[j] = x
+
+    def tokenize(self):
+        """Tokenize the input.
+
+        Read tokens from the input (supplied by the input() method). Place the
+        obtained tokens in the toklist array.
+        """
+        self.toklist = []
+        for tok in self.tokread():
+            self.dprint(100, "TOK: %s", tok)
+            self.toklist.append(tok)
+        # Determine and fix up the ordering of bold and italic markers
+        # There are three possible cases:
+        #
+        # 1a. '''a b ''c'' d'''
+        # 1b. ''a b '''c''' d''
+        #
+        # 2a. '''''a b'' c d'''
+        # 2b. '''''a b''' c d''
+        #
+        # 3a. '''a b ''c d'''''
+        # 3b. ''a b '''c d'''''
+        stack = []
+        for i in range(0,len(self.toklist)):
+            if (self.toklist[i].type == 'DELIM'
+                and (self.toklist[i].content == "''"
+                     or self.toklist[i].content == "'''")):
+                if len(stack) > 0:
+                    if self.toklist[stack[-1]].content == self.toklist[i].content:
+                        # Case 1: just pop the matching delimiter off the stack
+                        stack.pop()
+                    elif len(stack) == 2 and stack[-2] + 1 == stack[-1]:
+                        # Case 2: swap delimiters saved on stack ...
+                        self.swaptkn(stack[-2], stack[-1])
+                        #         and pop off the matching one
+                        stack.pop()
+                    elif (i < len(self.toklist)
+                          and self.toklist[i+1].type == 'DELIM'
+                          and self.toklist[stack[-1]].content
+                                      == self.toklist[i+1].content):
+                        # Case 3: swap current and next tokens
+                        self.swaptkn(i, i+1)
+                        #         and pop off the matching one
+                        stack.pop()
+                    else:
+                        # Push the token on stack
+                        stack.append(i)
+                else:
+                    # Push the token on stack
+                    stack.append(i)
+        # Redefine all non-matched tokens as TEXT
+        for i in stack:
+            self.toklist[i].type = 'TEXT' # FIXME
+
+    mark = []
+
+    def push_mark(self):
+        """Save the current token index on stack."""
+        self.mark.append(self.tokind)
+
+    def pop_mark(self):
+        """Restore the token index from top of stack."""
+        self.tokind = self.mark.pop()
+
+    def clear_mark(self):
+        """Forget the last mark."""
+        self.mark.pop()
+
+    def lookahead(self, off=0):
+        """Peek a token at index (tokind+off)."""
+        tok = self.toklist[self.tokind+off]
+        self.dprint(20, "lookahead(%s): %s", off, tok)
+        return tok
+
+    def setkn(self,val):
+        """Store token val at the current token index."""
+        self.toklist[self.tokind] = val
+
+    def getkn(self):
+        """Get next token from the toklist. Advance tokind."""
+        self.newline = self.tokind == 0 or self.toklist[self.tokind-1].type == 'NL'
+        if self.tokind == len(self.toklist):
+            return self._new_node(type='NIL')
+        tok = self.toklist[self.tokind]
+        self.tokind = self.tokind + 1
+        self.dprint(20, "getkn: %s", tok)
+        return tok
+
+    def ungetkn(self, tok=None):
+        """Unget the last read token.
+
+        Decrease the tokind by one, so the last read token will be read again.
+        If optional argument is supplied and is not None, store it in the toklist
+        in place of the current token.
+        """
+        self.tokind = self.tokind - 1
+        self.newline = self.tokind == 0 or self.toklist[self.tokind-1].type == 'NL'
+        if tok:
+             self.toklist[self.tokind] = tok
+        self.dprint(20, "ungetkn: %s", tok)
+        return self.toklist[self.tokind]
+
+    def fixuptkn(self, tok):
+        """Replace the recently read token by tok."""
+        if self.tokind == 0:
+            raise IndexError('WikiMarkupParser.fixuptkn called at start of input')
+        self.toklist[self.tokind-1] = tok
+        return tok
+
+    def dump(self, tree, file=sys.stdout):
+        """Dump the tree to file, node by node."""
+        for node in tree:
+            file.write(str(node))
+            file.write('\n')
+
+    def is_block_end(self, tok):
+        """Return True if tok ends a block environment."""
+        if tok.type == 'NIL':
+            return True
+        elif tok.type == 'NL':
+            if self.lookahead().type == 'NIL':
+                return True
+            elif self.lookahead().type == 'NL':
+                self.getkn()
+                return True
+        elif tok.type in ['DELIM', 'CTAG', 'TAG']:
+            if tok.isblock:
+                self.ungetkn(tok)
+                return True
+        return False
+
+    def parse_para(self, tok):
+        """Read paragraph starting at tok."""
+        self.dprint(80, "ENTER parse_para: %s", tok)
+
+        acc = { 'seq': [],
+                'textlist': [] }
+
+        def flush():
+            if acc['textlist']:
+                acc['seq'].append(self._new_node(type='TEXT',
+                                                 content=''.join(acc['textlist'])))
+                acc['textlist'] = []
+
+        if (isinstance(tok, WikiContentNode)
+            and isinstance(tok.content,str)
+            and re.match("^[ \t]", tok.content)):
+            type = 'PRE'
+            rx = re.compile("^\S")
+        else:
+            type = 'PARA'
+            rx = re.compile("^[ \t]")
+
+        while not self.is_block_end(tok):
+            if tok.type == 'TEXT':
+                if rx and self.newline and rx.match(tok.content):
+                    self.ungetkn()
+                    break
+                acc['textlist'].append(tok.content)
+            elif tok.type == 'NL':
+                acc['textlist'].append('\n')
+            elif tok.type == 'OTAG':
+                flush()
+                acc['seq'].append(self.parse_tag(tok))
+            elif tok.type == 'DELIM':
+                flush()
+                acc['seq'].append(self.parse_inline_delim(tok))
+            else:
+                raise UnexpectedTokenError(tok)
+            tok = self.getkn()
+        flush()
+        if acc['seq']:
+            tok = self._new_node(type=type, content=acc['seq'])
+        else:
+            tok = None
+        self.dprint(80, "LEAVE parse_para=%s", tok)
+        return tok
+
+    def parse_block_delim(self, tok):
+        """Parse block environment starting at tok."""
+        self.dprint(80, "ENTER parse_block_delim")
+        assert(tok.type == 'DELIM')
+        if tok.content == "----":
+            node = self._new_node(type = 'BAR')
+        elif tok.content[0:2] == "==":
+            node = self.parse_header(tok)
+            if not node:
+                tok = self.ungetkn(self._new_node(type='TEXT',
+                                                  content=tok.content))
+        elif tok.content[0] in self.envtypes:
+            node = None
+            if tok.content[0] == ':':
+                t = self.lookahead(-2)
+                if not (t.type == 'DELIM' and t.content == ';'):
+                    node = self.parse_indent(tok)
+            if not node:
+                node = self.parse_env(tok)
+        else:
+            self.ungetkn(tok)
+            node = None
+        self.dprint(80, "LEAVE parse_block_delim=%s", node)
+        return node
+
+    def parse_line(self):
+        """Parse the input line."""
+        self.dprint(80, "ENTER parse_line")
+        list = []
+        while True:
+            tok = self.getkn()
+            if tok.type == 'NL' or tok.type == 'NIL':
+                break
+            elif tok.type == 'TEXT':
+                list.append(tok)
+            elif tok.type == 'DELIM':
+                if tok.isblock:
+                    tok = self._new_node(type = 'TEXT', content = tok.content)
+                    self.fixuptkn(tok)
+                    list.append(tok)
+                elif tok.content[0] == ":":
+                    # FIXME
+                    list.append(self.parse_indent(tok))
+                    break
+                else:
+                    x = self.parse_inline_delim(tok)
+                    if x:
+                        list.append(x)
+                    else:
+                        list.append(self.fixuptkn(self._new_node(type = 'TEXT',
+                                                                 content = tok.content)))
+            elif tok.type == 'OTAG':
+                if tok.isblock:
+                    self.ungetkn()
+                    break
+                list.append(self.parse_tag(tok))
+            else:
+                list.append(tok)
+        ret = self._new_node(type='SEQ', content=list)
+        self.dprint(80, "LEAVE parse_line=%s", ret)
+        return ret
+
+    def parse_indent(self, tok):
+        """Parse indented block starting at tok."""
+        lev = len(tok.content)
+        self.dprint(80, "ENTER parse_indent(%s)", lev)
+        x = self._new_node(type='IND', level=lev, content=self.parse_line())
+        self.dprint(80, "LEAVE parse_indent=%s", x)
+        return x
+
+    def parse_fontmod(self,delim,what):
+        """Parse font modification directive (bold or italics).
+
+        Arguments:
+
+        delim    --  starting delimiter ("''" or "'''")
+        what     --  'IT' or 'BOLD'
+        """
+        self.dprint(80, "ENTER parse_fontmod(%s,%s), tok %s",
+                    delim, what, self.lookahead())
+        seq = []
+        text = ''
+        while True:
+            tok = self.getkn()
+            if tok.type == 'TEXT':
+                text += tok.content
+            elif self.is_block_end(tok):
+                self.dprint(80, "LEAVE parse_fontmod=%s", "None")
+                return None
+            elif tok.type == 'DELIM':
+#                self.dprint(80, "got %s, want %s", tok.content, delim)
+                if tok.content == delim:
+                    break
+                else:
+                    if text:
+                        seq.append(self._new_node(type='TEXT', content=text))
+                        text = ''
+                    x = self.parse_inline_delim(tok)
+                    if x:
+                        seq.append(x)
+                    else:
+                        self.dprint(80, "LEAVE parse_fontmod=%s", "None")
+                        return None
+            elif tok.type == 'NL':
+                seq.append(self._new_node(type='TEXT', content='\n'))
+            else:
+                self.dprint(80, "LEAVE parse_fontmod=None")
+                return None
+        if text:
+            seq.append(self._new_node(type='TEXT', content=text))
+        res = self._new_node(type=what, content=seq)
+        self.dprint(80, "LEAVE parse_fontmod=%s", res)
+        return res
+
+    def parse_ref(self):
+        """Parse a reference block ([...])"""
+        self.dprint(80, "ENTER parse_ref")
+        tok = self.getkn()
+        if not (tok.type == 'TEXT' and self.refstart.match(tok.content)):
+            self.dprint(80, "LEAVE parse_ref=None")
+            return None
+
+        seq = []
+        (ref,sep,text) = tok.content.partition(' ')
+        if text:
+            seq.insert(0, self._new_node(type='TEXT', content=text))
+
+        while True:
+            tok = self.getkn()
+            if tok.type == 'NIL':
+                self.dprint(80, "LEAVE parse_ref=None")
+                return None
+            elif self.is_block_end(tok):
+                self.dprint(80, "LEAVE parse_ref=None")
+                return None
+            elif tok.type == 'DELIM':
+                if tok.content == ']':
+                    break
+                else:
+                    tok = self.parse_inline_delim(tok)
+                    if tok:
+                        seq.append(tok)
+                    else:
+                        self.dprint(80, "LEAVE parse_ref=None")
+                        return None
+            elif tok.type == 'OTAG':
+                list.append(self.parse_tag(tok))
+            else:
+                seq.append(tok)
+
+        ret = self._new_node(type='REF', ref=ref,
+                             content=self._new_node(type='SEQ', content=seq))
+        self.dprint(80, "LEAVE parse_ref= %s", ret)
+        return ret
+
+    def parse_link(self, type, delim):
+        """Parse an external link ([[...]]).
+
+        In this implementation, it is also used to parse template
+        references ({{...}}).
+
+        Arguments:
+
+        type   -- 'LINK' or 'TMPL'
+        delim  -- expected closing delimiter.
+        """
+        self.dprint(80, "ENTER parse_link(%s,%s)", type, delim)
+        subtree = []
+        list = []
+        while True:
+            tok = self.getkn()
+            if tok.type == 'NIL':
+                self.dprint(80, "LEAVE parse_link=None [EOF]")
+                return None
+            if tok.type == 'DELIM':
+                if tok.content == delim:
+                    if list:
+                        subtree.append(self._new_node(type='SEQ',
+                                                      content=list))
+                    break
+                elif tok.content == "|":
+                    if len(list) > 1:
+                        subtree.append(self._new_node(type='SEQ',
+                                                      content=list))
+                    elif list:
+                        subtree.append(list[0])
+                    list = []
+                else:
+                    x = self.parse_inline_delim(tok)
+                    if x:
+                        list.append(x)
+                    else:
+                        self.dprint(80, "LEAVE parse_link=None [bad inline]")
+                        return None
+            elif tok.type == 'TEXT':
+                list.append(tok)
+            else:
+                self.dprint(80, "LEAVE parse_link=None [unexpected token]")
+                return None
+        ret = self._new_node(type=type, content=subtree)
+        self.dprint(80, "LEAVE parse_link=%s", ret)
+        return ret
+
+    def parse_inline_delim(self, tok):
+        """Parse an inline block."""
+        self.dprint(80, "ENTER parse_inline_delim")
+        assert(tok.type == 'DELIM')
+        self.push_mark()
+        if tok.content == "''":
+            x = self.parse_fontmod(tok.content, 'IT')
+        elif tok.content == "'''":
+            x = self.parse_fontmod(tok.content, 'BOLD')
+        elif tok.content == "[":
+            x = self.parse_ref()
+        elif tok.content == "[[":
+            x = self.parse_link('LINK', "]]")
+        elif tok.content == "{{":
+            x = self.parse_link('TMPL', "}}")
+        else:
+            x = None
+
+        if x:
+            self.clear_mark()
+        else:
+            self.dprint(80, "BEGIN DELIMITER RECOVERY: %s", tok)
+            self.pop_mark()
+            x = self.fixuptkn(self._new_node(type='TEXT', content=tok.content))
+            od = tok.content
+            if od in self.close_delim:
+                cd = self.close_delim[od]
+                lev = 0
+                for i,tok in enumerate(self.toklist[self.tokind+1:]):
+                    if tok.type == 'NIL':
+                        break
+                    elif tok.type == 'DELIM':
+                        if tok.content == od:
+                            lev += 1
+                        elif tok.content == cd:
+                            if lev == 0:
+                                tok = self._new_node(type='TEXT',
+                                                     content=tok.content)
+                                self.toklist[self.tokind+1+i] = tok
+                            lev -= 1
+                            break
+            self.dprint(80, "END DELIMITER RECOVERY: %s", tok)
+
+        self.dprint(80, "LEAVE parse_inline_delim=%s", x)
+        return x
+
+    def parse_tag(self, tag):
+        """Parse an xml-like tag (such as, e.g. "<tt>...</tt>")."""
+        self.dprint(80, "ENTER parse_tag")
+        list = []
+        self.push_mark()
+        while True:
+            tok = self.getkn()
+            if tok.type == 'NIL':
+                self.pop_mark()
+                s = '<' + tag.tag
+                if tag.args:
+                    s += ' ' + str(tag.args)
+                s += '>'
+                node = self._new_node(type='TEXT',content=s)
+                if tag.content:
+                    self.tree[self.tokind:self.tokind] = tag.content
+                self.dprint(80, "LEAVE parse_tag = %s (tree modified)", node)
+                return node
+            elif tok.type == 'DELIM':
+                if tok.isblock:
+                    tok = self.parse_block_delim(tok)
+                else:
+                    tok = self.parse_inline_delim(tok)
+                if not tok:
+                    tok = self.getkn()
+            elif tok.type == 'CTAG':
+                if tag.tag == tok.tag:
+                    break
+                s = '</' + tag.tag + '>'
+                tok = self.fixuptkn(self._new_node(type='TEXT', content=s))
+            elif tok.type == 'NL':
+                tok = self._new_node(type = 'TEXT', content = '\n')
+            list.append(tok)
+
+        self.clear_mark()
+        ret = self._new_node(type = 'TAG',
+                             tag  = tag.tag,
+                             args = tag.args,
+                             isblock = tag.isblock,
+                             content = self._new_node(type = 'SEQ', content = list))
+        self.dprint(80, "LEAVE parse_tag = %s", ret)
+        return ret
+
+    def parse_env(self, tok):
+        """Parse a block environment (numbered, unnumbered, or definition list)."""
+        type = self.envtypes[tok.content[0]][0]
+        lev = len(tok.content)
+        self.dprint(80, "ENTER parse_env(%s,%s)",type,lev)
+        list = []
+        while True:
+            if (tok.type == 'DELIM'
+                and tok.content[0] in self.envtypes
+                and type == self.envtypes[tok.content[0]][0]):
+                if len(tok.content) < lev:
+                    self.ungetkn()
+                    break
+                elif len(tok.content) > lev:
+                    elt = self.parse_env(tok)
+                else:
+                    elt = self.parse_line()
+                    if not tok.continuation:
+                        list.append(self._new_node(type='ELT',
+                                        subtype=self.envtypes[tok.content[0]][1],
+                                        content=elt))
+                        tok = self.getkn()
+                        continue
+
+                if list:
+                    if list[-1].content.type != 'SEQ':
+                        x = list[-1].content.content
+                        # FIXME:
+                        list[-1].content = self._new_node(type='SEQ', content=[x])
+                    list[-1].content.content.append(elt)
+            else:
+                self.ungetkn()
+                break
+
+            tok = self.getkn()
+
+        ret = self._new_node(type='ENV',
+                             envtype=type,
+                             level=lev,
+                             content=list)
+        self.dprint(80, "LEAVE parse_env=%s", ret)
+        return ret
+
+    def parse_header(self, tok):
+        """Parse a Wiki header."""
+        self.dprint(80, "ENTER parse_header")
+        self.push_mark()
+        list = []
+        delim = tok.content
+        while True:
+            tok = self.getkn()
+            if tok.type == 'NL':
+                self.pop_mark()
+                self.dprint(80, "LEAVE parse_header=None")
+                return None
+            elif tok.type == 'TEXT':
+                list.append(tok)
+            elif tok.type == 'DELIM':
+                if tok.content == delim:
+                    if self.lookahead().type == 'NL':
+                        self.getkn()
+                        if self.lookahead().type == 'NL':
+                            self.getkn()
+                        break
+                    else:
+                        self.pop_mark()
+                        self.dprint(80, "LEAVE parse_header=None")
+                        return None
+                elif tok.isblock:
+                    self.pop_mark()
+                    self.dprint(80, "LEAVE parse_header=None")
+                    return None
+                else:
+                    list.append(self.parse_inline_delim(tok))
+            elif tok.type == 'OTAG':
+                if tok.isblock:
+                    self.pop_mark()
+                    self.dprint(80, "LEAVE parse_header=None")
+                    return None
+                list.append(self.parse_tag(tok))
+        self.clear_mark()
+        ret = self._new_node(type='HDR',
+                             level=len(delim),
+                             content=self._new_node(type='SEQ', content=list))
+        self.dprint(80, "LEAVE parse_header=%s", ret)
+        return ret
+
+    def parse_block(self):
+        """Parse next block: newline, delimiter, tag, or paragraph."""
+        tok = self.getkn()
+        while tok.type == 'NL':
+            tok = self.getkn()
+        if tok == None or tok.type == 'NIL':
+            return None
+        elif tok.type == 'DELIM':
+            tok = self.parse_block_delim(tok)
+            if tok:
+                return tok
+            else:
+                tok = self.getkn()
+        elif tok.type == 'OTAG' and tok.isblock:
+            return self.parse_tag(tok)
+
+        return self.parse_para(tok)
+
+    def parse(self):
+        """Parse Wiki material supplied by the input() method.
+
+        Store the resulting abstract parsing tree in the tree attribute.
+        """
+        if not self.toklist:
+            self.tokenize()
+        if self.debug_level >= 90:
+            print("TOKEN DUMP BEGIN")
+            self.dump(self.toklist)
+            print("TOKEN DUMP END")
+        self.tokind = 0
+        self.tree = []
+        while 1:
+            subtree = self.parse_block()
+            if subtree == None:
+                break
+            self.tree.append(subtree)
+        if self.debug_level >= 70:
+            print("TREE DUMP BEGIN")
+            self.dump(self.tree)
+            print("TREE DUMP END")
+
+    def __str__(self):
+        return str(self.tree)
+
+
+class WikiMarkup(WikiMarkupParser):
+    """
+    A derived parser class that supplies a basic input method.
+
+    Three types of inputs are available:
+
+    1. filename=<file>
+    The file <file> is opened and used for input.
+    2. file=<file>
+    The already opened file <file> is used for input.
+    3. text=<string>
+    Input is taken from <string>, line by line.
+
+    Usage:
+
+    obj = WikiMarkup(arg=val)
+    obj.parse
+    ... Do whatever you need with obj.tree ...
+
+    """
+
+    file = None
+    text = None
+    lang = 'en'
+    html_base = 'http://%(lang)s.wiktionary.org/wiki/'
+    image_base = 'http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf'
+    media_base = 'http://www.mediawiki.org/xml/export-0.3'
+
+    def __init__(self, *args, **keywords):
+        for kw in keywords:
+            if kw == 'file':
+                self.file = keywords[kw]
+            elif kw == 'filename':
+                self.file = open(keywords[kw])
+            elif kw == 'text':
+                self.text = keywords[kw].split("\n")
+            elif kw == 'lang':
+                self.lang = keywords[kw]
+            elif kw == 'html_base':
+                self.html_base = keywords[kw]
+            elif kw == 'image_base':
+                self.image_base = keywords[kw]
+            elif kw == 'media_base':
+                self.media_base = keywords[kw]
+
+    def __del__(self):
+        if self.file:
+            self.file.close()
+
+    def input(self):
+        if self.file:
+            return self.file.readline()
+        elif self.text:
+            return self.text.pop(0) + '\n'
+        else:
+            return None
+
+    # ISO 639
+    langtab = {
+        "aa": "Afar",            # Afar
+        "ab": "Аҧсуа",           # Abkhazian
+        "ae": None,              # Avestan
+        "af": "Afrikaans",       # Afrikaans
+        "ak": "Akana",           # Akan
+        "als": "Alemannisch",
+        "am": "አማርኛ",            # Amharic
+        "an": "Aragonés",        # Aragonese
+        "ang": "Englisc",
+        "ar": "العربية" ,        # Arabic
+        "arc": "ܐܪܡܝܐ",
+        "as": "অসমীয়া",         # Assamese
+        "ast": "Asturian",
+        "av": "Авар",            # Avaric
+        "ay": "Aymara",           # Aymara
+        "az": "Azərbaycan" ,     # Azerbaijani
+
+        "ba": "Башҡорт",         # Bashkir
+        "bar": "Boarisch",
+        "bat-smg": "Žemaitėška",
+        "bcl": "Bikol",
+        "be": "Беларуская",      # Byelorussian; Belarusian
+        "be-x-old": "Беларуская (тарашкевіца)",
+        "bg": "Български",       # Bulgarian
+        "bh": "भोजपुरी",  # Bihari
+        "bi": "Bislama",         # Bislama
+        "bm": "Bamanankan",      # Bambara
+        "bn": "বাংলা" ,          # Bengali; Bangla
+        "bo": "བོད་སྐད",         # Tibetan
+        "bpy": "ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী" ,
+        "br": "Brezhoneg" ,      # Breton