diff options
Diffstat (limited to 'wikitrans/wikimarkup.py')
-rw-r--r-- | wikitrans/wikimarkup.py | 1285 |
1 files changed, 1285 insertions, 0 deletions
diff --git a/wikitrans/wikimarkup.py b/wikitrans/wikimarkup.py new file mode 100644 index 0000000..77c3b30 --- /dev/null +++ b/wikitrans/wikimarkup.py @@ -0,0 +1,1285 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +# Copyright (C) 2008-2018 Sergey Poznyakoff +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +""" +Wiki markup parser. + +This module provides two class: + +WikiMarkupParser: + An abstract parser class, which serves as a base class for all markup + classes in this package. + +WikiMarkup + A subclass of the above, providing basic input method. + +""" + +from __future__ import print_function +import sys +import re +from types import * +from wikitrans.wikitoken import * + +__all__ = [ "WikiMarkupParser", "WikiMarkup", + "TagAttributes", "TagAttributeSyntaxError" ] + +class UnexpectedTokenError(Exception): + def __init__(self, value): + self.value = value + +class TagAttributeSyntaxError(Exception): + def __init__(self, value): + self.value = value + def __str__(self): + return repr(self.value) + +class TagAttributes(object): + """A dictionary-like collection of tag attributes. + + Example: + + attr = TagAttributes('href="foo" length=2') + if 'href' in attr: + print(x['href']) # returns "foo" + for a in attr: + ... + """ + + attrstart = re.compile("^(?P<attr>[a-zA-Z0-9_-]+)(?P<eq>=\")?") + valseg = re.compile("^[^\\\"]+") + tab = {} + printable = None + def __init__(self, string): + if not string: + self.printable = '' + return + self.printable = string + s = string + self.tab = {} + while s != '': + s = s.strip() + m = self.attrstart.match(s) + if m: + name = m.group('attr') + val = '' + s = s[m.end(0):] + if m.group('eq'): + while 1: + m = self.valseg.match(s) + val += m.group(0) + s = s[m.end(0):] + if s[0] == '\\': + val += s[1] + s += 2 + elif s[0] == '"': + s = s[1:] + break + else: + val = 1 + self.tab[name] = val + else: + raise TagAttributeSyntaxError(s) + def __len__(self): + return len(self.tab) + def __getitem__(self, key): + return self.tab[key] + def __contains__(self, key): + return key in self.tab + def __iter__(self): + for key in self.tab: + yield(key) + def has_key(self, key): + return self.__contains__(key) + def __setitem__(self, key, value): + self.tab[key] = value + def __delitem__(self, key): + del self.tab[key] + def __str__(self): + return self.printable + def __repr__(self): + return self.printable + +class WikiMarkupParser(object): + """Parser for Wiki markup language. + + Given input in Wiki markup language creates an abstract parse tree for it. + This is a base class for actual parsers. The subclasses must provide the + input method. + + Public methods: + + parse() -- parse the input. + + Abstract methods (must be overridden by the subclass): + + input() -- returns next physical line from the input material. + + Public attributes: + + tree -- constructed parse tree (a subclass of WikiNode) + + """ + + delim = re.compile("^==+[ \t]*|[ \t]*==+[ \t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)|<") + otag = re.compile("<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>[^/][^>]+))?\s*(?P<closed>/)?>") + ctag = re.compile("</(?P<tag>[a-zA-Z0-9_]+)\s*>") + refstart = re.compile("^https?://") + + close_delim = { + '[': ']', + '[[': ']]', + '{{': '}}' + } + + # Environment types: + envtypes = { "*": [ "unnumbered", 0 ], + "#": [ "numbered", 0 ], + ";": [ "defn", 0 ], + ":": [ "defn", 1 ] + } + + toklist = None + tokind = 0 + newline = 0 + tree = None + + tags = [ 'code', 'nowiki', 'tt', 'div', 'ref', 'references' ] + + debug_level = 0 + + def dprint(self, lev, fmt, *argv): + """If current debug level is greater than or equal to lev, print *argv + according to format. + """ + if self.debug_level >= lev: + for l in (fmt % argv).split('\n'): + print("[DEBUG] %s" % l) + + inline_delims = [ "''", "'''", "[", "]", "[[", "]]", "{{", "}}", "|" ] + + token_class = { + 'NIL': WikiNode, + 'NL': WikiNode, + 'OTAG': WikiTagNode, + 'CTAG': WikiTagNode, + 'TAG': WikiTagNode, + 'DELIM': WikiDelimNode, + 'TEXT': WikiTextNode, + 'PRE': WikiContentNode, + 'PARA': WikiSeqNode, + 'BAR': WikiNode, + 'SEQ': WikiSeqNode, + 'IND': WikiIndNode, + 'REF': WikiRefNode, + 'TMPL': WikiSeqNode, + 'IT': WikiSeqNode, + 'BOLD': WikiSeqNode, + 'ELT': WikiEltNode, + 'ENV': WikiEnvNode, + 'LINK': WikiSeqNode, + 'HDR': WikiHdrNode + } + + def _new_node(self,**kwarg): + return self.token_class[kwarg['type']](self, **kwarg) + + def tokread(self): + """Read next token from the input. Return it as a subclass of WikiNode.""" + line = None + pos = 0 + while 1: + if (not line or pos == len(line)): + try: + line = self.input() + pos = 0 + except StopIteration: + line = u'' + + if not line or line == "": + yield(self._new_node(type='NIL')) + break + + if line == '\n': + yield(self._new_node(type='NL')) + line = None + continue + + self.dprint(100, "LINE: %s", line[pos:]) + m = self.delim.search(line, pos) + + if m: + if (pos < m.start(0)): + yield(self._new_node(type='TEXT', + content=line[pos:m.start(0)])) + pos = m.start(0) + t = None + + if line[m.start(0)] == '<': + m = self.otag.match(line, pos) + if m: + pos = m.end(0) + if m.group('tag') == 'nowiki': + if not m.group('closed'): + while 1: + try: + m = self.ctag.search(line, pos) + if m and m.group('tag') == 'nowiki': + yield(self._new_node(type='TEXT', + content=line[pos:m.start(0)] )) + pos = m.end(0) + break + + yield(self._new_node(type='TEXT', + content=line[pos:])) + + line = self.input() + pos = 0 + except StopIteration: + break + continue + elif m.group('tag') in self.tags: + try: + yield(self._new_node(type='OTAG', + tag=m.group('tag'), + isblock=(line[pos] == '\n'), + args=TagAttributes(m.group('args')))) + if m.group('closed'): + yield(self._new_node(type='CTAG', + tag=m.group('tag'))) + except TagAttributeSyntaxError: + yield(self._new_node(type='TEXT', + content=m.group(0))) + continue + else: + yield(self._new_node(type='TEXT',content=m.group(0))) + continue + else: + m = self.ctag.match(line, pos) + if m: + if m.group('tag') in self.tags: + yield(self._new_node(type='CTAG', + tag=m.group('tag'))) + pos = m.end(0) + continue + else: + yield(self._new_node(type='TEXT', + content=line[pos:pos+1])) + pos += 1 + continue + else: + pos = m.end(0) + content = m.group(0) + if content[0] in self.envtypes: + node = self._new_node(type='DELIM', + content=content, + isblock=True, + continuation=pos < len(line) and line[pos] == ":") + if node.continuation: + node.content += node.content[0] + pos += 1 + + yield(node) + + while pos < len(line) and line[pos] in [' ', '\t']: + pos += 1 + else: + yield(self._new_node(type='DELIM', + isblock=(content.strip() not in self.inline_delims), + content=content.strip())) + continue + + if line: + if line[-1] == '\n': + if line[pos:-1] != '': + yield(self._new_node(type='TEXT',content=line[pos:-1])) + yield(self._new_node(type='NL')) + else: + yield(self._new_node(type='TEXT',content=line[pos:])) + line = None + + + def input(self): + """Return next physical line from the input. + + This method must be overridden by the subclass. + """ + return None + + def swaptkn(self, i, j): + """Swap tokens at indices i and j in toklist.""" + self.dprint(80, "SWAPPING %s <-> %s", i, j) + x = self.toklist[i] + self.toklist[i] = self.toklist[j] + self.toklist[j] = x + + def tokenize(self): + """Tokenize the input. + + Read tokens from the input (supplied by the input() method). Place the + obtained tokens in the toklist array. + """ + self.toklist = [] + for tok in self.tokread(): + self.dprint(100, "TOK: %s", tok) + self.toklist.append(tok) + # Determine and fix up the ordering of bold and italic markers + # There are three possible cases: + # + # 1a. '''a b ''c'' d''' + # 1b. ''a b '''c''' d'' + # + # 2a. '''''a b'' c d''' + # 2b. '''''a b''' c d'' + # + # 3a. '''a b ''c d''''' + # 3b. ''a b '''c d''''' + stack = [] + for i in range(0,len(self.toklist)): + if (self.toklist[i].type == 'DELIM' + and (self.toklist[i].content == "''" + or self.toklist[i].content == "'''")): + if len(stack) > 0: + if self.toklist[stack[-1]].content == self.toklist[i].content: + # Case 1: just pop the matching delimiter off the stack + stack.pop() + elif len(stack) == 2 and stack[-2] + 1 == stack[-1]: + # Case 2: swap delimiters saved on stack ... + self.swaptkn(stack[-2], stack[-1]) + # and pop off the matching one + stack.pop() + elif (i < len(self.toklist) + and self.toklist[i+1].type == 'DELIM' + and self.toklist[stack[-1]].content + == self.toklist[i+1].content): + # Case 3: swap current and next tokens + self.swaptkn(i, i+1) + # and pop off the matching one + stack.pop() + else: + # Push the token on stack + stack.append(i) + else: + # Push the token on stack + stack.append(i) + # Redefine all non-matched tokens as TEXT + for i in stack: + self.toklist[i].type = 'TEXT' # FIXME + + mark = [] + + def push_mark(self): + """Save the current token index on stack.""" + self.mark.append(self.tokind) + + def pop_mark(self): + """Restore the token index from top of stack.""" + self.tokind = self.mark.pop() + + def clear_mark(self): + """Forget the last mark.""" + self.mark.pop() + + def lookahead(self, off=0): + """Peek a token at index (tokind+off).""" + tok = self.toklist[self.tokind+off] + self.dprint(20, "lookahead(%s): %s", off, tok) + return tok + + def setkn(self,val): + """Store token val at the current token index.""" + self.toklist[self.tokind] = val + + def getkn(self): + """Get next token from the toklist. Advance tokind.""" + self.newline = self.tokind == 0 or self.toklist[self.tokind-1].type == 'NL' + if self.tokind == len(self.toklist): + return self._new_node(type='NIL') + tok = self.toklist[self.tokind] + self.tokind = self.tokind + 1 + self.dprint(20, "getkn: %s", tok) + return tok + + def ungetkn(self, tok=None): + """Unget the last read token. + + Decrease the tokind by one, so the last read token will be read again. + If optional argument is supplied and is not None, store it in the toklist + in place of the current token. + """ + self.tokind = self.tokind - 1 + self.newline = self.tokind == 0 or self.toklist[self.tokind-1].type == 'NL' + if tok: + self.toklist[self.tokind] = tok + self.dprint(20, "ungetkn: %s", tok) + return self.toklist[self.tokind] + + def fixuptkn(self, tok): + """Replace the recently read token by tok.""" + if self.tokind == 0: + raise IndexError('WikiMarkupParser.fixuptkn called at start of input') + self.toklist[self.tokind-1] = tok + return tok + + def dump(self, tree, file=sys.stdout): + """Dump the tree to file, node by node.""" + for node in tree: + file.write(str(node)) + file.write('\n') + + def is_block_end(self, tok): + """Return True if tok ends a block environment.""" + if tok.type == 'NIL': + return True + elif tok.type == 'NL': + if self.lookahead().type == 'NIL': + return True + elif self.lookahead().type == 'NL': + self.getkn() + return True + elif tok.type in ['DELIM', 'CTAG', 'TAG']: + if tok.isblock: + self.ungetkn(tok) + return True + return False + + def parse_para(self, tok): + """Read paragraph starting at tok.""" + self.dprint(80, "ENTER parse_para: %s", tok) + + acc = { 'seq': [], + 'textlist': [] } + + def flush(): + if acc['textlist']: + acc['seq'].append(self._new_node(type='TEXT', + content=''.join(acc['textlist']))) + acc['textlist'] = [] + + if (isinstance(tok, WikiContentNode) + and isinstance(tok.content,str) + and re.match("^[ \t]", tok.content)): + type = 'PRE' + rx = re.compile("^\S") + else: + type = 'PARA' + rx = re.compile("^[ \t]") + + while not self.is_block_end(tok): + if tok.type == 'TEXT': + if rx and self.newline and rx.match(tok.content): + self.ungetkn() + break + acc['textlist'].append(tok.content) + elif tok.type == 'NL': + acc['textlist'].append('\n') + elif tok.type == 'OTAG': + flush() + acc['seq'].append(self.parse_tag(tok)) + elif tok.type == 'DELIM': + flush() + acc['seq'].append(self.parse_inline_delim(tok)) + else: + raise UnexpectedTokenError(tok) + tok = self.getkn() + flush() + if acc['seq']: + tok = self._new_node(type=type, content=acc['seq']) + else: + tok = None + self.dprint(80, "LEAVE parse_para=%s", tok) + return tok + + def parse_block_delim(self, tok): + """Parse block environment starting at tok.""" + self.dprint(80, "ENTER parse_block_delim") + assert(tok.type == 'DELIM') + if tok.content == "----": + node = self._new_node(type = 'BAR') + elif tok.content[0:2] == "==": + node = self.parse_header(tok) + if not node: + tok = self.ungetkn(self._new_node(type='TEXT', + content=tok.content)) + elif tok.content[0] in self.envtypes: + node = None + if tok.content[0] == ':': + t = self.lookahead(-2) + if not (t.type == 'DELIM' and t.content == ';'): + node = self.parse_indent(tok) + if not node: + node = self.parse_env(tok) + else: + self.ungetkn(tok) + node = None + self.dprint(80, "LEAVE parse_block_delim=%s", node) + return node + + def parse_line(self): + """Parse the input line.""" + self.dprint(80, "ENTER parse_line") + list = [] + while True: + tok = self.getkn() + if tok.type == 'NL' or tok.type == 'NIL': + break + elif tok.type == 'TEXT': + list.append(tok) + elif tok.type == 'DELIM': + if tok.isblock: + tok = self._new_node(type = 'TEXT', content = tok.content) + self.fixuptkn(tok) + list.append(tok) + elif tok.content[0] == ":": + # FIXME + list.append(self.parse_indent(tok)) + break + else: + x = self.parse_inline_delim(tok) + if x: + list.append(x) + else: + list.append(self.fixuptkn(self._new_node(type = 'TEXT', + content = tok.content))) + elif tok.type == 'OTAG': + if tok.isblock: + self.ungetkn() + break + list.append(self.parse_tag(tok)) + else: + list.append(tok) + ret = self._new_node(type='SEQ', content=list) + self.dprint(80, "LEAVE parse_line=%s", ret) + return ret + + def parse_indent(self, tok): + """Parse indented block starting at tok.""" + lev = len(tok.content) + self.dprint(80, "ENTER parse_indent(%s)", lev) + x = self._new_node(type='IND', level=lev, content=self.parse_line()) + self.dprint(80, "LEAVE parse_indent=%s", x) + return x + + def parse_fontmod(self,delim,what): + """Parse font modification directive (bold or italics). + + Arguments: + + delim -- starting delimiter ("''" or "'''") + what -- 'IT' or 'BOLD' + """ + self.dprint(80, "ENTER parse_fontmod(%s,%s), tok %s", + delim, what, self.lookahead()) + seq = [] + text = '' + while True: + tok = self.getkn() + if tok.type == 'TEXT': + text += tok.content + elif self.is_block_end(tok): + self.dprint(80, "LEAVE parse_fontmod=%s", "None") + return None + elif tok.type == 'DELIM': +# self.dprint(80, "got %s, want %s", tok.content, delim) + if tok.content == delim: + break + else: + if text: + seq.append(self._new_node(type='TEXT', content=text)) + text = '' + x = self.parse_inline_delim(tok) + if x: + seq.append(x) + else: + self.dprint(80, "LEAVE parse_fontmod=%s", "None") + return None + elif tok.type == 'NL': + seq.append(self._new_node(type='TEXT', content='\n')) + else: + self.dprint(80, "LEAVE parse_fontmod=None") + return None + if text: + seq.append(self._new_node(type='TEXT', content=text)) + res = self._new_node(type=what, content=seq) + self.dprint(80, "LEAVE parse_fontmod=%s", res) + return res + + def parse_ref(self): + """Parse a reference block ([...])""" + self.dprint(80, "ENTER parse_ref") + tok = self.getkn() + if not (tok.type == 'TEXT' and self.refstart.match(tok.content)): + self.dprint(80, "LEAVE parse_ref=None") + return None + + seq = [] + (ref,sep,text) = tok.content.partition(' ') + if text: + seq.insert(0, self._new_node(type='TEXT', content=text)) + + while True: + tok = self.getkn() + if tok.type == 'NIL': + self.dprint(80, "LEAVE parse_ref=None") + return None + elif self.is_block_end(tok): + self.dprint(80, "LEAVE parse_ref=None") + return None + elif tok.type == 'DELIM': + if tok.content == ']': + break + else: + tok = self.parse_inline_delim(tok) + if tok: + seq.append(tok) + else: + self.dprint(80, "LEAVE parse_ref=None") + return None + elif tok.type == 'OTAG': + list.append(self.parse_tag(tok)) + else: + seq.append(tok) + + ret = self._new_node(type='REF', ref=ref, + content=self._new_node(type='SEQ', content=seq)) + self.dprint(80, "LEAVE parse_ref= %s", ret) + return ret + + def parse_link(self, type, delim): + """Parse an external link ([[...]]). + + In this implementation, it is also used to parse template + references ({{...}}). + + Arguments: + + type -- 'LINK' or 'TMPL' + delim -- expected closing delimiter. + """ + self.dprint(80, "ENTER parse_link(%s,%s)", type, delim) + subtree = [] + list = [] + while True: + tok = self.getkn() + if tok.type == 'NIL': + self.dprint(80, "LEAVE parse_link=None [EOF]") + return None + if tok.type == 'DELIM': + if tok.content == delim: + if list: + subtree.append(self._new_node(type='SEQ', + content=list)) + break + elif tok.content == "|": + if len(list) > 1: + subtree.append(self._new_node(type='SEQ', + content=list)) + elif list: + subtree.append(list[0]) + list = [] + else: + x = self.parse_inline_delim(tok) + if x: + list.append(x) + else: + self.dprint(80, "LEAVE parse_link=None [bad inline]") + return None + elif tok.type == 'TEXT': + list.append(tok) + else: + self.dprint(80, "LEAVE parse_link=None [unexpected token]") + return None + ret = self._new_node(type=type, content=subtree) + self.dprint(80, "LEAVE parse_link=%s", ret) + return ret + + def parse_inline_delim(self, tok): + """Parse an inline block.""" + self.dprint(80, "ENTER parse_inline_delim") + assert(tok.type == 'DELIM') + self.push_mark() + if tok.content == "''": + x = self.parse_fontmod(tok.content, 'IT') + elif tok.content == "'''": + x = self.parse_fontmod(tok.content, 'BOLD') + elif tok.content == "[": + x = self.parse_ref() + elif tok.content == "[[": + x = self.parse_link('LINK', "]]") + elif tok.content == "{{": + x = self.parse_link('TMPL', "}}") + else: + x = None + + if x: + self.clear_mark() + else: + self.dprint(80, "BEGIN DELIMITER RECOVERY: %s", tok) + self.pop_mark() + x = self.fixuptkn(self._new_node(type='TEXT', content=tok.content)) + od = tok.content + if od in self.close_delim: + cd = self.close_delim[od] + lev = 0 + for i,tok in enumerate(self.toklist[self.tokind+1:]): + if tok.type == 'NIL': + break + elif tok.type == 'DELIM': + if tok.content == od: + lev += 1 + elif tok.content == cd: + if lev == 0: + tok = self._new_node(type='TEXT', + content=tok.content) + self.toklist[self.tokind+1+i] = tok + lev -= 1 + break + self.dprint(80, "END DELIMITER RECOVERY: %s", tok) + + self.dprint(80, "LEAVE parse_inline_delim=%s", x) + return x + + def parse_tag(self, tag): + """Parse an xml-like tag (such as, e.g. "<tt>...</tt>").""" + self.dprint(80, "ENTER parse_tag") + list = [] + self.push_mark() + while True: + tok = self.getkn() + if tok.type == 'NIL': + self.pop_mark() + s = '<' + tag.tag + if tag.args: + s += ' ' + str(tag.args) + s += '>' + node = self._new_node(type='TEXT',content=s) + if tag.content: + self.tree[self.tokind:self.tokind] = tag.content + self.dprint(80, "LEAVE parse_tag = %s (tree modified)", node) + return node + elif tok.type == 'DELIM': + if tok.isblock: + tok = self.parse_block_delim(tok) + else: + tok = self.parse_inline_delim(tok) + if not tok: + tok = self.getkn() + elif tok.type == 'CTAG': + if tag.tag == tok.tag: + break + s = '</' + tag.tag + '>' + tok = self.fixuptkn(self._new_node(type='TEXT', content=s)) + elif tok.type == 'NL': + tok = self._new_node(type = 'TEXT', content = '\n') + list.append(tok) + + self.clear_mark() + ret = self._new_node(type = 'TAG', + tag = tag.tag, + args = tag.args, + isblock = tag.isblock, + content = self._new_node(type = 'SEQ', content = list)) + self.dprint(80, "LEAVE parse_tag = %s", ret) + return ret + + def parse_env(self, tok): + """Parse a block environment (numbered, unnumbered, or definition list).""" + type = self.envtypes[tok.content[0]][0] + lev = len(tok.content) + self.dprint(80, "ENTER parse_env(%s,%s)",type,lev) + list = [] + while True: + if (tok.type == 'DELIM' + and tok.content[0] in self.envtypes + and type == self.envtypes[tok.content[0]][0]): + if len(tok.content) < lev: + self.ungetkn() + break + elif len(tok.content) > lev: + elt = self.parse_env(tok) + else: + elt = self.parse_line() + if not tok.continuation: + list.append(self._new_node(type='ELT', + subtype=self.envtypes[tok.content[0]][1], + content=elt)) + tok = self.getkn() + continue + + if list: + if list[-1].content.type != 'SEQ': + x = list[-1].content.content + # FIXME: + list[-1].content = self._new_node(type='SEQ', content=[x]) + list[-1].content.content.append(elt) + else: + self.ungetkn() + break + + tok = self.getkn() + + ret = self._new_node(type='ENV', + envtype=type, + level=lev, + content=list) + self.dprint(80, "LEAVE parse_env=%s", ret) + return ret + + def parse_header(self, tok): + """Parse a Wiki header.""" + self.dprint(80, "ENTER parse_header") + self.push_mark() + list = [] + delim = tok.content + while True: + tok = self.getkn() + if tok.type == 'NL': + self.pop_mark() + self.dprint(80, "LEAVE parse_header=None") + return None + elif tok.type == 'TEXT': + list.append(tok) + elif tok.type == 'DELIM': + if tok.content == delim: + if self.lookahead().type == 'NL': + self.getkn() + if self.lookahead().type == 'NL': + self.getkn() + break + else: + self.pop_mark() + self.dprint(80, "LEAVE parse_header=None") + return None + elif tok.isblock: + self.pop_mark() + self.dprint(80, "LEAVE parse_header=None") + return None + else: + list.append(self.parse_inline_delim(tok)) + elif tok.type == 'OTAG': + if tok.isblock: + self.pop_mark() + self.dprint(80, "LEAVE parse_header=None") + return None + list.append(self.parse_tag(tok)) + self.clear_mark() + ret = self._new_node(type='HDR', + level=len(delim), + content=self._new_node(type='SEQ', content=list)) + self.dprint(80, "LEAVE parse_header=%s", ret) + return ret + + def parse_block(self): + """Parse next block: newline, delimiter, tag, or paragraph.""" + tok = self.getkn() + while tok.type == 'NL': + tok = self.getkn() + if tok == None or tok.type == 'NIL': + return None + elif tok.type == 'DELIM': + tok = self.parse_block_delim(tok) + if tok: + return tok + else: + tok = self.getkn() + elif tok.type == 'OTAG' and tok.isblock: + return self.parse_tag(tok) + + return self.parse_para(tok) + + def parse(self): + """Parse Wiki material supplied by the input() method. + + Store the resulting abstract parsing tree in the tree attribute. + """ + if not self.toklist: + self.tokenize() + if self.debug_level >= 90: + print("TOKEN DUMP BEGIN") + self.dump(self.toklist) + print("TOKEN DUMP END") + self.tokind = 0 + self.tree = [] + while 1: + subtree = self.parse_block() + if subtree == None: + break + self.tree.append(subtree) + if self.debug_level >= 70: + print("TREE DUMP BEGIN") + self.dump(self.tree) + print("TREE DUMP END") + + def __str__(self): + return str(self.tree) + + +class WikiMarkup(WikiMarkupParser): + """ + A derived parser class that supplies a basic input method. + + Three types of inputs are available: + + 1. filename=<file> + The file <file> is opened and used for input. + 2. file=<file> + The already opened file <file> is used for input. + 3. text=<string> + Input is taken from <string>, line by line. + + Usage: + + obj = WikiMarkup(arg=val) + obj.parse + ... Do whatever you need with obj.tree ... + + """ + + file = None + text = None + lang = 'en' + html_base = 'http://%(lang)s.wiktionary.org/wiki/' + image_base = 'http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf' + media_base = 'http://www.mediawiki.org/xml/export-0.3' + + def __init__(self, *args, **keywords): + for kw in keywords: + if kw == 'file': + self.file = keywords[kw] + elif kw == 'filename': + self.file = open(keywords[kw]) + elif kw == 'text': + self.text = keywords[kw].split("\n") + elif kw == 'lang': + self.lang = keywords[kw] + elif kw == 'html_base': + self.html_base = keywords[kw] + elif kw == 'image_base': + self.image_base = keywords[kw] + elif kw == 'media_base': + self.media_base = keywords[kw] + + def __del__(self): + if self.file: + self.file.close() + + def input(self): + if self.file: + return self.file.readline() + elif self.text: + return self.text.pop(0) + '\n' + else: + return None + + # ISO 639 + langtab = { + "aa": "Afar", # Afar + "ab": "Аҧсуа", # Abkhazian + "ae": None, # Avestan + "af": "Afrikaans", # Afrikaans + "ak": "Akana", # Akan + "als": "Alemannisch", + "am": "አማርኛ", # Amharic + "an": "Aragonés", # Aragonese + "ang": "Englisc", + "ar": "العربية" , # Arabic + "arc": "ܐܪܡܝܐ", + "as": "অসমীয়া", # Assamese + "ast": "Asturian", + "av": "Авар", # Avaric + "ay": "Aymara", # Aymara + "az": "Azərbaycan" , # Azerbaijani + + "ba": "Башҡорт", # Bashkir + "bar": "Boarisch", + "bat-smg": "Žemaitėška", + "bcl": "Bikol", + "be": "Беларуская", # Byelorussian; Belarusian + "be-x-old": "Беларуская (тарашкевіца)", + "bg": "Български", # Bulgarian + "bh": "भोजपुरी", # Bihari + "bi": "Bislama", # Bislama + "bm": "Bamanankan", # Bambara + "bn": "বাংলা" , # Bengali; Bangla + "bo": "བོད་སྐད", # Tibetan + "bpy": "ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী" , + "br": "Brezhoneg" , # Breton |