summaryrefslogtreecommitdiff
path: root/wikitrans/wikimarkup.py
diff options
context:
space:
mode:
Diffstat (limited to 'wikitrans/wikimarkup.py')
-rw-r--r--wikitrans/wikimarkup.py1285
1 files changed, 1285 insertions, 0 deletions
diff --git a/wikitrans/wikimarkup.py b/wikitrans/wikimarkup.py
new file mode 100644
index 0000000..77c3b30
--- /dev/null
+++ b/wikitrans/wikimarkup.py
@@ -0,0 +1,1285 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2008-2018 Sergey Poznyakoff
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+Wiki markup parser.
+
+This module provides two classes:
+
+WikiMarkupParser:
+ An abstract parser class, which serves as a base class for all markup
+ classes in this package.
+
+WikiMarkup
+ A subclass of the above, providing basic input method.
+
+"""
+
+from __future__ import print_function
+import sys
+import re
+from types import *
+from wikitrans.wikitoken import *
+
+__all__ = [ "WikiMarkupParser", "WikiMarkup",
+ "TagAttributes", "TagAttributeSyntaxError" ]
+
class UnexpectedTokenError(Exception):
    """Raised when the parser meets a token it cannot handle.

    Attributes:

      value -- the offending token.
    """

    def __init__(self, value):
        # Initialise the base class so that args (and hence tracebacks
        # and pickling) carry the offending token.
        Exception.__init__(self, value)
        self.value = value

    def __str__(self):
        # Same presentation as TagAttributeSyntaxError.
        return repr(self.value)
+
class TagAttributeSyntaxError(Exception):
    """Signals a malformed tag attribute string.

    Attributes:

      value -- the part of the attribute string that could not be parsed.
    """

    def __init__(self, value):
        self.value = value

    def __str__(self):
        return '%r' % (self.value,)
+
class TagAttributes(object):
    """A dictionary-like collection of tag attributes.

    The constructor parses a string of whitespace-separated attributes.
    Each attribute is either NAME="VALUE" (VALUE may contain
    backslash-escaped characters, e.g. \\") or a bare NAME, which is
    stored with the value 1.

    Raises TagAttributeSyntaxError if the string cannot be parsed.

    Example:

      attr = TagAttributes('href="foo" length="2"')
      if 'href' in attr:
          print(attr['href'])  # prints "foo"
      for a in attr:
          ...
    """

    # Attribute name, optionally followed by =" (start of a quoted value).
    attrstart = re.compile(r'^(?P<attr>[a-zA-Z0-9_-]+)(?P<eq>=")?')
    # A run of value characters containing neither a backslash nor a quote.
    valseg = re.compile(r'^[^\\"]+')
    # Class-level defaults, kept for backward compatibility; __init__
    # always gives each instance its own table.
    tab = {}
    printable = None

    def __init__(self, string):
        # Never fall back to the class-level dictionary: it would be
        # shared (and mutated) by every instance created from an empty
        # string.
        self.tab = {}
        if not string:
            self.printable = ''
            return
        self.printable = string
        s = string.strip()
        while s:
            m = self.attrstart.match(s)
            if not m:
                raise TagAttributeSyntaxError(s)
            name = m.group('attr')
            s = s[m.end(0):]
            if m.group('eq'):
                val, s = self._parse_value(s)
            else:
                val = 1
            self.tab[name] = val
            # Strip after each attribute so that trailing whitespace
            # terminates the loop instead of raising a syntax error.
            s = s.strip()

    def _parse_value(self, s):
        """Consume a quoted value from the head of s.

        The opening quote must already have been removed.  Return a
        tuple (value, rest), where rest is the input following the
        closing quote.
        """
        parts = []
        while True:
            m = self.valseg.match(s)
            if m:
                parts.append(m.group(0))
                s = s[m.end(0):]
            if not s:
                # Ran out of input before the closing quote.
                raise TagAttributeSyntaxError(self.printable)
            if s[0] == '"':
                return ''.join(parts), s[1:]
            # Otherwise s[0] is a backslash: take the next character
            # literally.
            if len(s) < 2:
                raise TagAttributeSyntaxError(self.printable)
            parts.append(s[1])
            s = s[2:]

    def __len__(self):
        return len(self.tab)

    def __getitem__(self, key):
        return self.tab[key]

    def __contains__(self, key):
        return key in self.tab

    def __iter__(self):
        return iter(self.tab)

    def has_key(self, key):
        return key in self.tab

    def __setitem__(self, key, value):
        self.tab[key] = value

    def __delitem__(self, key):
        del self.tab[key]

    def __str__(self):
        return self.printable

    def __repr__(self):
        return self.printable
+
+class WikiMarkupParser(object):
+ """Parser for Wiki markup language.
+
+ Given input in Wiki markup language creates an abstract parse tree for it.
+ This is a base class for actual parsers. The subclasses must provide the
+ input method.
+
+ Public methods:
+
+ parse() -- parse the input.
+
+ Abstract methods (must be overridden by the subclass):
+
+ input() -- returns next physical line from the input material.
+
+ Public attributes:
+
+ tree -- constructed parse tree (a subclass of WikiNode)
+
+ """
+
+ delim = re.compile("^==+[ \t]*|[ \t]*==+[ \t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)|<")
+ otag = re.compile("<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>[^/][^>]+))?\s*(?P<closed>/)?>")
+ ctag = re.compile("</(?P<tag>[a-zA-Z0-9_]+)\s*>")
+ refstart = re.compile("^https?://")
+
+ close_delim = {
+ '[': ']',
+ '[[': ']]',
+ '{{': '}}'
+ }
+
+ # Environment types:
+ envtypes = { "*": [ "unnumbered", 0 ],
+ "#": [ "numbered", 0 ],
+ ";": [ "defn", 0 ],
+ ":": [ "defn", 1 ]
+ }
+
+ toklist = None
+ tokind = 0
+ newline = 0
+ tree = None
+
+ tags = [ 'code', 'nowiki', 'tt', 'div', 'ref', 'references' ]
+
+ debug_level = 0
+
+ def dprint(self, lev, fmt, *argv):
+ """If current debug level is greater than or equal to lev, print *argv
+ according to format.
+ """
+ if self.debug_level >= lev:
+ for l in (fmt % argv).split('\n'):
+ print("[DEBUG] %s" % l)
+
+ inline_delims = [ "''", "'''", "[", "]", "[[", "]]", "{{", "}}", "|" ]
+
+ token_class = {
+ 'NIL': WikiNode,
+ 'NL': WikiNode,
+ 'OTAG': WikiTagNode,
+ 'CTAG': WikiTagNode,
+ 'TAG': WikiTagNode,
+ 'DELIM': WikiDelimNode,
+ 'TEXT': WikiTextNode,
+ 'PRE': WikiContentNode,
+ 'PARA': WikiSeqNode,
+ 'BAR': WikiNode,
+ 'SEQ': WikiSeqNode,
+ 'IND': WikiIndNode,
+ 'REF': WikiRefNode,
+ 'TMPL': WikiSeqNode,
+ 'IT': WikiSeqNode,
+ 'BOLD': WikiSeqNode,
+ 'ELT': WikiEltNode,
+ 'ENV': WikiEnvNode,
+ 'LINK': WikiSeqNode,
+ 'HDR': WikiHdrNode
+ }
+
+ def _new_node(self,**kwarg):
+ return self.token_class[kwarg['type']](self, **kwarg)
+
+ def tokread(self):
+ """Read next token from the input. Return it as a subclass of WikiNode."""
+ line = None
+ pos = 0
+ while 1:
+ if (not line or pos == len(line)):
+ try:
+ line = self.input()
+ pos = 0
+ except StopIteration:
+ line = u''
+
+ if not line or line == "":
+ yield(self._new_node(type='NIL'))
+ break
+
+ if line == '\n':
+ yield(self._new_node(type='NL'))
+ line = None
+ continue
+
+ self.dprint(100, "LINE: %s", line[pos:])
+ m = self.delim.search(line, pos)
+
+ if m:
+ if (pos < m.start(0)):
+ yield(self._new_node(type='TEXT',
+ content=line[pos:m.start(0)]))
+ pos = m.start(0)
+ t = None
+
+ if line[m.start(0)] == '<':
+ m = self.otag.match(line, pos)
+ if m:
+ pos = m.end(0)
+ if m.group('tag') == 'nowiki':
+ if not m.group('closed'):
+ while 1:
+ try:
+ m = self.ctag.search(line, pos)
+ if m and m.group('tag') == 'nowiki':
+ yield(self._new_node(type='TEXT',
+ content=line[pos:m.start(0)] ))
+ pos = m.end(0)
+ break
+
+ yield(self._new_node(type='TEXT',
+ content=line[pos:]))
+
+ line = self.input()
+ pos = 0
+ except StopIteration:
+ break
+ continue
+ elif m.group('tag') in self.tags:
+ try:
+ yield(self._new_node(type='OTAG',
+ tag=m.group('tag'),
+ isblock=(line[pos] == '\n'),
+ args=TagAttributes(m.group('args'))))
+ if m.group('closed'):
+ yield(self._new_node(type='CTAG',
+ tag=m.group('tag')))
+ except TagAttributeSyntaxError:
+ yield(self._new_node(type='TEXT',
+ content=m.group(0)))
+ continue
+ else:
+ yield(self._new_node(type='TEXT',content=m.group(0)))
+ continue
+ else:
+ m = self.ctag.match(line, pos)
+ if m:
+ if m.group('tag') in self.tags:
+ yield(self._new_node(type='CTAG',
+ tag=m.group('tag')))
+ pos = m.end(0)
+ continue
+ else:
+ yield(self._new_node(type='TEXT',
+ content=line[pos:pos+1]))
+ pos += 1
+ continue
+ else:
+ pos = m.end(0)
+ content = m.group(0)
+ if content[0] in self.envtypes:
+ node = self._new_node(type='DELIM',
+ content=content,
+ isblock=True,
+ continuation=pos < len(line) and line[pos] == ":")
+ if node.continuation:
+ node.content += node.content[0]
+ pos += 1
+
+ yield(node)
+
+ while pos < len(line) and line[pos] in [' ', '\t']:
+ pos += 1
+ else:
+ yield(self._new_node(type='DELIM',
+ isblock=(content.strip() not in self.inline_delims),
+ content=content.strip()))
+ continue
+
+ if line:
+ if line[-1] == '\n':
+ if line[pos:-1] != '':
+ yield(self._new_node(type='TEXT',content=line[pos:-1]))
+ yield(self._new_node(type='NL'))
+ else:
+ yield(self._new_node(type='TEXT',content=line[pos:]))
+ line = None
+
+
+ def input(self):
+ """Return next physical line from the input.
+
+ This method must be overridden by the subclass.
+ """
+ return None
+
+ def swaptkn(self, i, j):
+ """Swap tokens at indices i and j in toklist."""
+ self.dprint(80, "SWAPPING %s <-> %s", i, j)
+ x = self.toklist[i]
+ self.toklist[i] = self.toklist[j]
+ self.toklist[j] = x
+
+ def tokenize(self):
+ """Tokenize the input.
+
+ Read tokens from the input (supplied by the input() method). Place the
+ obtained tokens in the toklist array.
+ """
+ self.toklist = []
+ for tok in self.tokread():
+ self.dprint(100, "TOK: %s", tok)
+ self.toklist.append(tok)
+ # Determine and fix up the ordering of bold and italic markers
+ # There are three possible cases:
+ #
+ # 1a. '''a b ''c'' d'''
+ # 1b. ''a b '''c''' d''
+ #
+ # 2a. '''''a b'' c d'''
+ # 2b. '''''a b''' c d''
+ #
+ # 3a. '''a b ''c d'''''
+ # 3b. ''a b '''c d'''''
+ stack = []
+ for i in range(0,len(self.toklist)):
+ if (self.toklist[i].type == 'DELIM'
+ and (self.toklist[i].content == "''"
+ or self.toklist[i].content == "'''")):
+ if len(stack) > 0:
+ if self.toklist[stack[-1]].content == self.toklist[i].content:
+ # Case 1: just pop the matching delimiter off the stack
+ stack.pop()
+ elif len(stack) == 2 and stack[-2] + 1 == stack[-1]:
+ # Case 2: swap delimiters saved on stack ...
+ self.swaptkn(stack[-2], stack[-1])
+ # and pop off the matching one
+ stack.pop()
+ elif (i < len(self.toklist)
+ and self.toklist[i+1].type == 'DELIM'
+ and self.toklist[stack[-1]].content
+ == self.toklist[i+1].content):
+ # Case 3: swap current and next tokens
+ self.swaptkn(i, i+1)
+ # and pop off the matching one
+ stack.pop()
+ else:
+ # Push the token on stack
+ stack.append(i)
+ else:
+ # Push the token on stack
+ stack.append(i)
+ # Redefine all non-matched tokens as TEXT
+ for i in stack:
+ self.toklist[i].type = 'TEXT' # FIXME
+
+ mark = []
+
+ def push_mark(self):
+ """Save the current token index on stack."""
+ self.mark.append(self.tokind)
+
+ def pop_mark(self):
+ """Restore the token index from top of stack."""
+ self.tokind = self.mark.pop()
+
+ def clear_mark(self):
+ """Forget the last mark."""
+ self.mark.pop()
+
+ def lookahead(self, off=0):
+ """Peek a token at index (tokind+off)."""
+ tok = self.toklist[self.tokind+off]
+ self.dprint(20, "lookahead(%s): %s", off, tok)
+ return tok
+
+ def setkn(self,val):
+ """Store token val at the current token index."""
+ self.toklist[self.tokind] = val
+
+ def getkn(self):
+ """Get next token from the toklist. Advance tokind."""
+ self.newline = self.tokind == 0 or self.toklist[self.tokind-1].type == 'NL'
+ if self.tokind == len(self.toklist):
+ return self._new_node(type='NIL')
+ tok = self.toklist[self.tokind]
+ self.tokind = self.tokind + 1
+ self.dprint(20, "getkn: %s", tok)
+ return tok
+
+ def ungetkn(self, tok=None):
+ """Unget the last read token.
+
+ Decrease the tokind by one, so the last read token will be read again.
+ If optional argument is supplied and is not None, store it in the toklist
+ in place of the current token.
+ """
+ self.tokind = self.tokind - 1
+ self.newline = self.tokind == 0 or self.toklist[self.tokind-1].type == 'NL'
+ if tok:
+ self.toklist[self.tokind] = tok
+ self.dprint(20, "ungetkn: %s", tok)
+ return self.toklist[self.tokind]
+
+ def fixuptkn(self, tok):
+ """Replace the recently read token by tok."""
+ if self.tokind == 0:
+ raise IndexError('WikiMarkupParser.fixuptkn called at start of input')
+ self.toklist[self.tokind-1] = tok
+ return tok
+
+ def dump(self, tree, file=sys.stdout):
+ """Dump the tree to file, node by node."""
+ for node in tree:
+ file.write(str(node))
+ file.write('\n')
+
+ def is_block_end(self, tok):
+ """Return True if tok ends a block environment."""
+ if tok.type == 'NIL':
+ return True
+ elif tok.type == 'NL':
+ if self.lookahead().type == 'NIL':
+ return True
+ elif self.lookahead().type == 'NL':
+ self.getkn()
+ return True
+ elif tok.type in ['DELIM', 'CTAG', 'TAG']:
+ if tok.isblock:
+ self.ungetkn(tok)
+ return True
+ return False
+
+ def parse_para(self, tok):
+ """Read paragraph starting at tok."""
+ self.dprint(80, "ENTER parse_para: %s", tok)
+
+ acc = { 'seq': [],
+ 'textlist': [] }
+
+ def flush():
+ if acc['textlist']:
+ acc['seq'].append(self._new_node(type='TEXT',
+ content=''.join(acc['textlist'])))
+ acc['textlist'] = []
+
+ if (isinstance(tok, WikiContentNode)
+ and isinstance(tok.content,str)
+ and re.match("^[ \t]", tok.content)):
+ type = 'PRE'
+ rx = re.compile("^\S")
+ else:
+ type = 'PARA'
+ rx = re.compile("^[ \t]")
+
+ while not self.is_block_end(tok):
+ if tok.type == 'TEXT':
+ if rx and self.newline and rx.match(tok.content):
+ self.ungetkn()
+ break
+ acc['textlist'].append(tok.content)
+ elif tok.type == 'NL':
+ acc['textlist'].append('\n')
+ elif tok.type == 'OTAG':
+ flush()
+ acc['seq'].append(self.parse_tag(tok))
+ elif tok.type == 'DELIM':
+ flush()
+ acc['seq'].append(self.parse_inline_delim(tok))
+ else:
+ raise UnexpectedTokenError(tok)
+ tok = self.getkn()
+ flush()
+ if acc['seq']:
+ tok = self._new_node(type=type, content=acc['seq'])
+ else:
+ tok = None
+ self.dprint(80, "LEAVE parse_para=%s", tok)
+ return tok
+
+ def parse_block_delim(self, tok):
+ """Parse block environment starting at tok."""
+ self.dprint(80, "ENTER parse_block_delim")
+ assert(tok.type == 'DELIM')
+ if tok.content == "----":
+ node = self._new_node(type = 'BAR')
+ elif tok.content[0:2] == "==":
+ node = self.parse_header(tok)
+ if not node:
+ tok = self.ungetkn(self._new_node(type='TEXT',
+ content=tok.content))
+ elif tok.content[0] in self.envtypes:
+ node = None
+ if tok.content[0] == ':':
+ t = self.lookahead(-2)
+ if not (t.type == 'DELIM' and t.content == ';'):
+ node = self.parse_indent(tok)
+ if not node:
+ node = self.parse_env(tok)
+ else:
+ self.ungetkn(tok)
+ node = None
+ self.dprint(80, "LEAVE parse_block_delim=%s", node)
+ return node
+
+ def parse_line(self):
+ """Parse the input line."""
+ self.dprint(80, "ENTER parse_line")
+ list = []
+ while True:
+ tok = self.getkn()
+ if tok.type == 'NL' or tok.type == 'NIL':
+ break
+ elif tok.type == 'TEXT':
+ list.append(tok)
+ elif tok.type == 'DELIM':
+ if tok.isblock:
+ tok = self._new_node(type = 'TEXT', content = tok.content)
+ self.fixuptkn(tok)
+ list.append(tok)
+ elif tok.content[0] == ":":
+ # FIXME
+ list.append(self.parse_indent(tok))
+ break
+ else:
+ x = self.parse_inline_delim(tok)
+ if x:
+ list.append(x)
+ else:
+ list.append(self.fixuptkn(self._new_node(type = 'TEXT',
+ content = tok.content)))
+ elif tok.type == 'OTAG':
+ if tok.isblock:
+ self.ungetkn()
+ break
+ list.append(self.parse_tag(tok))
+ else:
+ list.append(tok)
+ ret = self._new_node(type='SEQ', content=list)
+ self.dprint(80, "LEAVE parse_line=%s", ret)
+ return ret
+
+ def parse_indent(self, tok):
+ """Parse indented block starting at tok."""
+ lev = len(tok.content)
+ self.dprint(80, "ENTER parse_indent(%s)", lev)
+ x = self._new_node(type='IND', level=lev, content=self.parse_line())
+ self.dprint(80, "LEAVE parse_indent=%s", x)
+ return x
+
+ def parse_fontmod(self,delim,what):
+ """Parse font modification directive (bold or italics).
+
+ Arguments:
+
+ delim -- starting delimiter ("''" or "'''")
+ what -- 'IT' or 'BOLD'
+ """
+ self.dprint(80, "ENTER parse_fontmod(%s,%s), tok %s",
+ delim, what, self.lookahead())
+ seq = []
+ text = ''
+ while True:
+ tok = self.getkn()
+ if tok.type == 'TEXT':
+ text += tok.content
+ elif self.is_block_end(tok):
+ self.dprint(80, "LEAVE parse_fontmod=%s", "None")
+ return None
+ elif tok.type == 'DELIM':
+# self.dprint(80, "got %s, want %s", tok.content, delim)
+ if tok.content == delim:
+ break
+ else:
+ if text:
+ seq.append(self._new_node(type='TEXT', content=text))
+ text = ''
+ x = self.parse_inline_delim(tok)
+ if x:
+ seq.append(x)
+ else:
+ self.dprint(80, "LEAVE parse_fontmod=%s", "None")
+ return None
+ elif tok.type == 'NL':
+ seq.append(self._new_node(type='TEXT', content='\n'))
+ else:
+ self.dprint(80, "LEAVE parse_fontmod=None")
+ return None
+ if text:
+ seq.append(self._new_node(type='TEXT', content=text))
+ res = self._new_node(type=what, content=seq)
+ self.dprint(80, "LEAVE parse_fontmod=%s", res)
+ return res
+
+ def parse_ref(self):
+ """Parse a reference block ([...])"""
+ self.dprint(80, "ENTER parse_ref")
+ tok = self.getkn()
+ if not (tok.type == 'TEXT' and self.refstart.match(tok.content)):
+ self.dprint(80, "LEAVE parse_ref=None")
+ return None
+
+ seq = []
+ (ref,sep,text) = tok.content.partition(' ')
+ if text:
+ seq.insert(0, self._new_node(type='TEXT', content=text))
+
+ while True:
+ tok = self.getkn()
+ if tok.type == 'NIL':
+ self.dprint(80, "LEAVE parse_ref=None")
+ return None
+ elif self.is_block_end(tok):
+ self.dprint(80, "LEAVE parse_ref=None")
+ return None
+ elif tok.type == 'DELIM':
+ if tok.content == ']':
+ break
+ else:
+ tok = self.parse_inline_delim(tok)
+ if tok:
+ seq.append(tok)
+ else:
+ self.dprint(80, "LEAVE parse_ref=None")
+ return None
+ elif tok.type == 'OTAG':
+ list.append(self.parse_tag(tok))
+ else:
+ seq.append(tok)
+
+ ret = self._new_node(type='REF', ref=ref,
+ content=self._new_node(type='SEQ', content=seq))
+ self.dprint(80, "LEAVE parse_ref= %s", ret)
+ return ret
+
+ def parse_link(self, type, delim):
+ """Parse an external link ([[...]]).
+
+ In this implementation, it is also used to parse template
+ references ({{...}}).
+
+ Arguments:
+
+ type -- 'LINK' or 'TMPL'
+ delim -- expected closing delimiter.
+ """
+ self.dprint(80, "ENTER parse_link(%s,%s)", type, delim)
+ subtree = []
+ list = []
+ while True:
+ tok = self.getkn()
+ if tok.type == 'NIL':
+ self.dprint(80, "LEAVE parse_link=None [EOF]")
+ return None
+ if tok.type == 'DELIM':
+ if tok.content == delim:
+ if list:
+ subtree.append(self._new_node(type='SEQ',
+ content=list))
+ break
+ elif tok.content == "|":
+ if len(list) > 1:
+ subtree.append(self._new_node(type='SEQ',
+ content=list))
+ elif list:
+ subtree.append(list[0])
+ list = []
+ else:
+ x = self.parse_inline_delim(tok)
+ if x:
+ list.append(x)
+ else:
+ self.dprint(80, "LEAVE parse_link=None [bad inline]")
+ return None
+ elif tok.type == 'TEXT':
+ list.append(tok)
+ else:
+ self.dprint(80, "LEAVE parse_link=None [unexpected token]")
+ return None
+ ret = self._new_node(type=type, content=subtree)
+ self.dprint(80, "LEAVE parse_link=%s", ret)
+ return ret
+
    def parse_inline_delim(self, tok):
        """Parse an inline block introduced by the delimiter tok.

        Dispatches on the delimiter text: "''" and "'''" are handled by
        parse_fontmod, "[" by parse_ref, "[[" and "{{" by parse_link.

        On success returns the parsed node.  On failure the token index
        is restored to the position saved on entry, the delimiter is
        replaced in toklist by a TEXT node with the same content, and
        that TEXT node is returned.
        """
        self.dprint(80, "ENTER parse_inline_delim")
        assert(tok.type == 'DELIM')
        # Remember the position right after the delimiter, so that it can
        # be restored if parsing fails.
        self.push_mark()
        if tok.content == "''":
            x = self.parse_fontmod(tok.content, 'IT')
        elif tok.content == "'''":
            x = self.parse_fontmod(tok.content, 'BOLD')
        elif tok.content == "[":
            x = self.parse_ref()
        elif tok.content == "[[":
            x = self.parse_link('LINK', "]]")
        elif tok.content == "{{":
            x = self.parse_link('TMPL', "}}")
        else:
            x = None

        if x:
            self.clear_mark()
        else:
            self.dprint(80, "BEGIN DELIMITER RECOVERY: %s", tok)
            self.pop_mark()
            # Demote the opening delimiter to plain text.
            x = self.fixuptkn(self._new_node(type='TEXT', content=tok.content))
            od = tok.content
            if od in self.close_delim:
                cd = self.close_delim[od]
                lev = 0
                # Look for the closing delimiter among the tokens that
                # follow, starting at index tokind+1; if it is found at
                # nesting level 0, demote it to plain text as well.
                # NOTE(review): the scan stops at the first closing
                # delimiter whatever the value of lev, so the level
                # counter only decides whether that delimiter is
                # converted -- confirm this is intended.
                for i,tok in enumerate(self.toklist[self.tokind+1:]):
                    if tok.type == 'NIL':
                        break
                    elif tok.type == 'DELIM':
                        if tok.content == od:
                            lev += 1
                        elif tok.content == cd:
                            if lev == 0:
                                tok = self._new_node(type='TEXT',
                                                     content=tok.content)
                                self.toklist[self.tokind+1+i] = tok
                            lev -= 1
                            break
            self.dprint(80, "END DELIMITER RECOVERY: %s", tok)

        self.dprint(80, "LEAVE parse_inline_delim=%s", x)
        return x
+
+ def parse_tag(self, tag):
+ """Parse an xml-like tag (such as, e.g. "<tt>...</tt>")."""
+ self.dprint(80, "ENTER parse_tag")
+ list = []
+ self.push_mark()
+ while True:
+ tok = self.getkn()
+ if tok.type == 'NIL':
+ self.pop_mark()
+ s = '<' + tag.tag
+ if tag.args:
+ s += ' ' + str(tag.args)
+ s += '>'
+ node = self._new_node(type='TEXT',content=s)
+ if tag.content:
+ self.tree[self.tokind:self.tokind] = tag.content
+ self.dprint(80, "LEAVE parse_tag = %s (tree modified)", node)
+ return node
+ elif tok.type == 'DELIM':
+ if tok.isblock:
+ tok = self.parse_block_delim(tok)
+ else:
+ tok = self.parse_inline_delim(tok)
+ if not tok:
+ tok = self.getkn()
+ elif tok.type == 'CTAG':
+ if tag.tag == tok.tag:
+ break
+ s = '</' + tag.tag + '>'
+ tok = self.fixuptkn(self._new_node(type='TEXT', content=s))
+ elif tok.type == 'NL':
+ tok = self._new_node(type = 'TEXT', content = '\n')
+ list.append(tok)
+
+ self.clear_mark()
+ ret = self._new_node(type = 'TAG',
+ tag = tag.tag,
+ args = tag.args,
+ isblock = tag.isblock,
+ content = self._new_node(type = 'SEQ', content = list))
+ self.dprint(80, "LEAVE parse_tag = %s", ret)
+ return ret
+
    def parse_env(self, tok):
        """Parse a block environment (numbered, unnumbered, or definition list).

        tok is the delimiter that opens the environment.  Its first
        character selects the environment type (see envtypes) and its
        length gives the nesting level.

        Returns an ENV node whose content is a list of ELT nodes.  The
        first token that does not belong to the environment is pushed
        back.
        """
        type = self.envtypes[tok.content[0]][0]
        lev = len(tok.content)
        self.dprint(80, "ENTER parse_env(%s,%s)",type,lev)
        list = []
        while True:
            if (tok.type == 'DELIM'
                and tok.content[0] in self.envtypes
                and type == self.envtypes[tok.content[0]][0]):
                if len(tok.content) < lev:
                    # A shallower delimiter ends this environment and is
                    # left for the caller.
                    self.ungetkn()
                    break
                elif len(tok.content) > lev:
                    # A deeper delimiter starts a nested environment.
                    elt = self.parse_env(tok)
                else:
                    elt = self.parse_line()
                    if not tok.continuation:
                        # Regular item at this level.
                        list.append(self._new_node(type='ELT',
                                                   subtype=self.envtypes[tok.content[0]][1],
                                                   content=elt))
                        tok = self.getkn()
                        continue

                # Reached for a nested environment or a continuation
                # line: attach elt to the previous item, first wrapping
                # that item's content into a SEQ node if necessary.  If
                # there is no previous item, elt is dropped.
                if list:
                    if list[-1].content.type != 'SEQ':
                        x = list[-1].content.content
                        # FIXME:
                        list[-1].content = self._new_node(type='SEQ', content=[x])
                    list[-1].content.content.append(elt)
            else:
                # Not a delimiter of this environment type: stop here.
                self.ungetkn()
                break

            tok = self.getkn()

        ret = self._new_node(type='ENV',
                             envtype=type,
                             level=lev,
                             content=list)
        self.dprint(80, "LEAVE parse_env=%s", ret)
        return ret
+
+ def parse_header(self, tok):
+ """Parse a Wiki header."""
+ self.dprint(80, "ENTER parse_header")
+ self.push_mark()
+ list = []
+ delim = tok.content
+ while True:
+ tok = self.getkn()
+ if tok.type == 'NL':
+ self.pop_mark()
+ self.dprint(80, "LEAVE parse_header=None")
+ return None
+ elif tok.type == 'TEXT':
+ list.append(tok)
+ elif tok.type == 'DELIM':
+ if tok.content == delim:
+ if self.lookahead().type == 'NL':
+ self.getkn()
+ if self.lookahead().type == 'NL':
+ self.getkn()
+ break
+ else:
+ self.pop_mark()
+ self.dprint(80, "LEAVE parse_header=None")
+ return None
+ elif tok.isblock:
+ self.pop_mark()
+ self.dprint(80, "LEAVE parse_header=None")
+ return None
+ else:
+ list.append(self.parse_inline_delim(tok))
+ elif tok.type == 'OTAG':
+ if tok.isblock:
+ self.pop_mark()
+ self.dprint(80, "LEAVE parse_header=None")
+ return None
+ list.append(self.parse_tag(tok))
+ self.clear_mark()
+ ret = self._new_node(type='HDR',
+ level=len(delim),
+ content=self._new_node(type='SEQ', content=list))
+ self.dprint(80, "LEAVE parse_header=%s", ret)
+ return ret
+
+ def parse_block(self):
+ """Parse next block: newline, delimiter, tag, or paragraph."""
+ tok = self.getkn()
+ while tok.type == 'NL':
+ tok = self.getkn()
+ if tok == None or tok.type == 'NIL':
+ return None
+ elif tok.type == 'DELIM':
+ tok = self.parse_block_delim(tok)
+ if tok:
+ return tok
+ else:
+ tok = self.getkn()
+ elif tok.type == 'OTAG' and tok.isblock:
+ return self.parse_tag(tok)
+
+ return self.parse_para(tok)
+
+ def parse(self):
+ """Parse Wiki material supplied by the input() method.
+
+ Store the resulting abstract parsing tree in the tree attribute.
+ """
+ if not self.toklist:
+ self.tokenize()
+ if self.debug_level >= 90:
+ print("TOKEN DUMP BEGIN")
+ self.dump(self.toklist)
+ print("TOKEN DUMP END")
+ self.tokind = 0
+ self.tree = []
+ while 1:
+ subtree = self.parse_block()
+ if subtree == None:
+ break
+ self.tree.append(subtree)
+ if self.debug_level >= 70:
+ print("TREE DUMP BEGIN")
+ self.dump(self.tree)
+ print("TREE DUMP END")
+
+ def __str__(self):
+ return str(self.tree)
+
+
+class WikiMarkup(WikiMarkupParser):
+ """
+ A derived parser class that supplies a basic input method.
+
+ Three types of inputs are available:
+
+ 1. filename=<file>
+ The file <file> is opened and used for input.
+ 2. file=<file>
+ The already opened file <file> is used for input.
+ 3. text=<string>
+ Input is taken from <string>, line by line.
+
+ Usage:
+
+ obj = WikiMarkup(arg=val)
+ obj.parse()
+ ... Do whatever you need with obj.tree ...
+
+ """
+
+ file = None
+ text = None
+ lang = 'en'
+ html_base = 'http://%(lang)s.wiktionary.org/wiki/'
+ image_base = 'http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf'
+ media_base = 'http://www.mediawiki.org/xml/export-0.3'
+
+ def __init__(self, *args, **keywords):
+ for kw in keywords:
+ if kw == 'file':
+ self.file = keywords[kw]
+ elif kw == 'filename':
+ self.file = open(keywords[kw])
+ elif kw == 'text':
+ self.text = keywords[kw].split("\n")
+ elif kw == 'lang':
+ self.lang = keywords[kw]
+ elif kw == 'html_base':
+ self.html_base = keywords[kw]
+ elif kw == 'image_base':
+ self.image_base = keywords[kw]
+ elif kw == 'media_base':
+ self.media_base = keywords[kw]
+
+ def __del__(self):
+ if self.file:
+ self.file.close()
+
+ def input(self):
+ if self.file:
+ return self.file.readline()
+ elif self.text:
+ return self.text.pop(0) + '\n'
+ else:
+ return None
+
+ # ISO 639
+ langtab = {
+ "aa": "Afar", # Afar
+ "ab": "Аҧсуа", # Abkhazian
+ "ae": None, # Avestan
+ "af": "Afrikaans", # Afrikaans
+ "ak": "Akana", # Akan
+ "als": "Alemannisch",
+ "am": "አማርኛ", # Amharic
+ "an": "Aragonés", # Aragonese
+ "ang": "Englisc",
+ "ar": "العربية" , # Arabic
+ "arc": "ܐܪܡܝܐ",
+ "as": "অসমীয়া", # Assamese
+ "ast": "Asturian",
+ "av": "Авар", # Avaric
+ "ay": "Aymara", # Aymara
+ "az": "Azərbaycan" , # Azerbaijani
+
+ "ba": "Башҡорт", # Bashkir
+ "bar": "Boarisch",
+ "bat-smg": "Žemaitėška",
+ "bcl": "Bikol",
+ "be": "Беларуская", # Byelorussian; Belarusian
+ "be-x-old": "Беларуская (тарашкевіца)",
+ "bg": "Български", # Bulgarian
+ "bh": "भोजपुरी", # Bihari
+ "bi": "Bislama", # Bislama
+ "bm": "Bamanankan", # Bambara
+ "bn": "বাংলা" , # Bengali; Bangla
+ "bo": "བོད་སྐད", # Tibetan
+ "bpy": "ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী" ,
+ "br": "Brezhoneg" , # Breton