diff options
Diffstat (limited to 'wikimarkup.py')
-rw-r--r-- | wikimarkup.py | 747 |
1 files changed, 415 insertions, 332 deletions
diff --git a/wikimarkup.py b/wikimarkup.py index 4fd4e44..9cfdb09 100644 --- a/wikimarkup.py +++ b/wikimarkup.py @@ -1,6 +1,6 @@ #!/usr/bin/python # -*- coding: utf-8 -*- -# Copyright (C) 2008 Sergey Poznyakoff +# Copyright (C) 2008, 2009 Sergey Poznyakoff # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -19,327 +19,451 @@ import sys import re from types import * -__all__ = [ "BaseWikiMarkup", "WikiMarkup" ] - -eltbeg = re.compile("=+|(^----$)|^[\\*#:]+") -eltre = re.compile("(\\[\\[)|(\\{\\{)|\\[|(\\'\\'\\'?)") -delims = { "[[" : re.compile("\\||(\\]\\])"), - "{{" : re.compile("\\||(\\}\\})") } -term = { "[[" : "]]" , "{{" : "}}" } -ends = { "[[" : re.compile("(\\[\\[)|(\\]\\])"), - "{{" : re.compile("(\\{\\{)|(\\}\\})") } -itend = re.compile("\\'\\'($|[^\\'])") -boend = re.compile("\\'\\'\\'($|[^\\'])") +__all__ = [ "BaseWikiMarkup", "WikiMarkup", + "NIL", "TEXT", "DELIM", "NL", "PARA", + "IT", "BOLD", "LINK", "TMPL", + "BAR", "HDR", "REF", "ENV", "IND", "SEQ", + "ENVUNNUM", "ENVNUM", "envtypes" ] + +delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^:+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)") + +NIL = 0 +TEXT = 1 +DELIM = 2 +NL = 3 + +PARA = 4 +IT = 5 +BOLD = 6 +LINK = 7 +TMPL = 8 +BAR = 9 +HDR = 10 +REF = 11 +ENV = 12 +IND = 13 +SEQ = 14 + +# Environment types: +# Unnumbered list +ENVUNNUM = 0 +# Numbered list +ENVNUM = 1 +envtypes = [ "*", "#" ] class BaseWikiMarkup: - """ -A base class for handling Wiki markups. -It handles: - 1. paragraphs; - 2. basic block markup (headers, numbered and unnumbered lists, - indentations); - 3. basic inline markup (bold, italic); - 4. basic reference markup (links, templates, external links). - It does NOT handle: - 1. pseudo-html markup (<nowiki></nowiki>, and similar); - 2. leading spaces meaning ``preserve formatting''; - 3. tables and math. - The above rests for FIXME. - - This class relies on its derived classes for providing input. They must - overload method `input', which must return one physical line of input for - each call. - - Variables: - - 1. tree - The parse tree. Valid after parse() finishes (see below). - - Methods: - - 1. parse() - Parse the input and build parse tree - - 2. input() - Virtual function. Return next line of input or None on EOF. - - 3. output() - Print the tree in internal representation. - """ - ## Token classes - # NIL: nothing - NIL = 0 - # TEXT: text - TEXT = 1 - # LINK: target, text - LINK = 2 - # Template: target, text - TMPL = 3 - # External ref: target, text - REF = 4 - # Italics: text - IT = 5 - # Bold: text - BOLD = 6 - # Header: level, text - HDR = 7 - # Horizontal bar: - BAR = 8 - # Environment: type, level - ENV = 9 - # Item: text - ITEM = 10 - # Sequence: seq - SEQ = 11 - # Paragraph - PARA = 12 - - # Environment types: - # Unnumbered list - ENVUNNUM = 0 - # Numbered list - ENVNUM = 1 - # Indent - INDENT = 2 - envtypes = [ "*", "#", ":" ] + toklist = None + tokind = 0 tree = None + + debug_level = 0 - def itend(self, line, pos): - while 1: - d = itend.search(line, pos) - if not d: - return -1 - elif d.start(0) == pos or line[d.start(0)-1] != "'": - return d.start(0) - else: - pos = d.start(0) + 1 - - def linkend(self, paren, line, pos): - r = ends[paren] - count = 1 - while count > 0: - m = r.search(line, pos); - if not m: - return len(line), len(line) - else: - pos = m.end(0) - if m.group(0) == paren: - count += 1 - else: - count -= 1 - return m.start(0), m.end(0) - - la = None - def putback(self, line): - self.la = line - - def nextkn(self, curlev=0, type = -1): + def dprint(self, lev, fmt, *argv): + if self.debug_level >= lev: + print "[DEBUG]", fmt % argv + + def tokread(self): + line = None + pos = 0 while 1: - if self.la: - line = self.la - self.putback(None) - else: + if (not line or pos == len(line)): try: line = self.input() + pos = 0 except StopIteration: line = u'' + if not line or line == "": - self.putback(line) + self.dprint(100, "YIELD: NIL") + yield(NIL,) break if line == '\n': - yield(self.PARA,) + self.dprint(100, "YIELD: NL") + yield(NL,line) + line = None continue + + self.dprint(100, "LINE: %s", line[pos:]) + m = delim.search(line, pos) - m = eltbeg.match(line) if m: - if m.group(0)[0] in self.envtypes: - btype = self.envtypes.index(m.group(0)[0]) - lev = len(m.group(0)) - if btype == type: - if lev == curlev: - yield(self.ITEM, - (self.SEQ, self.getkn(line[m.end(0):]))) - elif lev > curlev: - self.putback(line) - yield(self.ENV, btype, curlev + 1, - (self.SEQ, self.nextkn(curlev + 1, btype))) - else: - self.putback(line) + if (pos < m.start(0)): + self.dprint(100, "YIELD: TEXT %s", line[pos:m.start(0)]) + yield(TEXT, line[pos:m.start(0)]) + pos = m.end(0) + if m.group(0)[0] in envtypes and line[pos] == ":": + self.dprint(100, "YIELD: DELIM %s, True", m.group(0)) + yield(DELIM, m.group(0), True) + pos += 1 + else: + self.dprint(100, "YIELD: DELIM %s", m.group(0)) + yield(DELIM, m.group(0)) + else: + if line[-1] == '\n': + self.dprint(100, "YIELD: TEXT %s", line[pos:-1]) + if line[pos:-1] != '': + yield(TEXT, line[pos:-1]) + self.dprint(100, "YIELD: NL") + yield(NL,'\n') + else: + self.dprint(100, "YIELD: TEXT %s", line[pos:]) + yield(TEXT, line[pos:]) + line = None + + def input(self): + return None + + def tokenize(self): + self.toklist = [] + for tok in self.tokread(): + self.toklist.append(tok) + + def peektkn(self): + return self.toklist[self.tokind] + + def setkn(self,val): + self.toklist[self.tokind] = val + + def getkn(self): + tok = self.toklist[self.tokind] + if tok[0] != NIL: + self.tokind = self.tokind + 1 + return tok + + def ungetkn(self): + self.tokind = self.tokind - 1 + return self.toklist[self.tokind] + + def parse_bold(self, nested = False): + self.dprint(80, "ENTER parse_bold(%s), tok %s", nested, self.peektkn()) + seq = [] + textlist = [] + while 1: + tok = self.getkn() + if tok[0] == TEXT: + textlist.append(tok[1]) + elif tok[0] == DELIM: + if tok[1] == "'''": + break + elif tok[1] == "''" and not nested: + if textlist: + seq.append((TEXT, textlist)) + textlist = [] + x = self.parse_it(True) + if not x: + self.dprint(80, "LEAVE parse_bold=None") + return None + seq.append(x) + else: + self.dprint(80, "LEAVE parse_bold=None") + return None + elif tok[0] == NL: + if self.peektkn()[0] == NL: + self.dprint(80, "LEAVE parse_bold=None") + return None + else: + self.dprint(80, "LEAVE parse_bold=None") + return None + if textlist: + seq.append((TEXT, textlist)) + self.dprint(80, "LEAVE parse_bold=(BOLD, %s", seq) + return (BOLD, seq) + + def parse_it(self, nested = False): + self.dprint(80, "ENTER parse_it(%s), tok %s", nested, self.peektkn()) + seq = [] + textlist = [] + while 1: + tok = self.getkn() + if tok[0] == TEXT: + textlist.append(tok[1]) + elif tok[0] == DELIM: + if tok[1] == "''": + break + elif tok[1] == "'''": + if nested: + # The tokenizer always puts longest match before the + # shortest one, so "'''" goes before "''". Swap + # them if the need is: + ntok = self.peektkn() + if ntok[0] == DELIM and ntok[1] == "''": + self.setkn((DELIM, "'''")) break + else: + self.dprint(80, "LEAVE parse_it=%s", "None") + return None else: - self.putback(line) - yield(self.ENV, btype, 1, self.nextkn(1, btype)) - + if textlist: + seq.append((TEXT, textlist)) + textlist = [] + x = self.parse_bold(True) + if not x: + self.dprint(80, "LEAVE parse_it=%s", "None") + return None + seq.append(x) else: - if curlev > 0: - self.putback(line) - break - elif m.group(0)[0:2] == "==" \ - and line.rstrip('\n').endswith(m.group(0)): - yield(self.HDR, len(m.group(0))-1, - self.getkn(line[m.end(0):-(1+len(m.group(0)))])) - elif m.group(0) == "----": - yield(self.BAR,) + self.dprint(80, "LEAVE parse_it=%s", "None") + return None + elif tok[0] == NL: + if self.peektkn()[0] == NL: + self.dprint(80, "LEAVE parse_it=%s", "None") + return None else: - if curlev > 0: - self.putback(line) + self.dprint(80, "LEAVE parse_it=%s", "None") + return None + if textlist: + seq.append((TEXT, textlist)) + self.dprint(80, "LEAVE parse_it=(IT,%s)", seq) + return (IT, seq) + + def parse_link(self, type, delim): + self.dprint(80, "ENTER parse_link(%s,%s), tok %s", + type, delim, self.peektkn()) + subtree = [] + list = [] + while 1: + tok = self.getkn() + if tok[0] == DELIM: + if tok[1] == delim: + if list: + subtree.append((SEQ,list)) break - yield(self.getkn(line)) + elif tok[1] == "|": + if len(list) > 1: + subtree.append((SEQ,list)) + else: + subtree.append(list[0]) + list = [] + else: + x = self.parse_inline(tok) + if x: + list.append(x) + else: + self.dprint(80, "LEAVE parse_link=%s", "None") + return None + elif tok[0] == TEXT: + list.append(tok) + else: + self.dprint(80, "LEAVE parse_link=%s", "None") + return None + self.dprint(80, "LEAVE parse_link=(%s,%s)", type, subtree) + return (type, subtree) - def getkn(self, line): - pos = 0 + def parse_ref(self): + self.dprint(80, "ENTER parse_ref, tok %s", self.peektkn()) + list = [] while 1: - if pos == len(line): - break; - m = eltre.search(line, pos) - if not m: - yield(self.TEXT, line[pos:]) - pos = len(line) + tok = self.getkn() + if tok[0] == DELIM: + if tok[1] == "]": + break + else: + x = self.parse_inline(tok) + if x: + list.append(x) + else: + self.dprint(80, "LEAVE parse_ref=%s", "None") + return None + elif tok[0] == TEXT: + list.append(tok) + elif tok[0] == NL: + continue else: - yield(self.TEXT, line[pos:m.start(0)]) - pos = m.end(0) - if m.group(0) == "[[" or m.group(0) == "{{": - d = delims[m.group(0)].search(line, pos) - if d.group(0) == "|": - target = (self.TEXT, line[pos:d.start(0)]) - (start,pos) = self.linkend(m.group(0), line, m.end(0)) - text = (self.SEQ, self.getkn(line[d.end(0):start])) - elif d.group(0) == term[m.group(0)]: - target = (self.TEXT, line[pos:d.start(0)]) - text = (self.NIL,) - pos = d.end(0) - if m.group(0) == "[[": - yield(self.LINK, target, text) + self.dprint(80, "LEAVE parse_ref=%s", "None") + return None + if len(list) == 0 or list[0][0] != TEXT: + self.dprint(80, "LEAVE parse_ref=%s", "None") + return None + (ref,sep,text) = list[0][1].partition(' ') + ret = (REF, ref, (SEQ, [(TEXT, text)] + list[1:])) + self.dprint(80, "LEAVE parse_ref= %s", ret) + return ret + + inline_delims = [ "''", "'''", "[", "[[", "{{" ] + def parse_inline(self, tok): + self.dprint(80, "ENTER parse_inline(%s), tok %s", tok, self.peektkn()) + tokind = self.tokind + if tok[1] == "''": + x = self.parse_it() + elif tok[1] == "'''": + x = self.parse_bold() + elif tok[1] == "[": + x = self.parse_ref() + elif tok[1] == "[[": + x = self.parse_link(LINK, "]]") + elif tok[1] == "{{": + x = self.parse_link(TMPL, "}}") + else: # FIXME + self.dprint(80, "LEAVE parse_inline=%s", "None") + x = None + if not x: + self.tokind = tokind + self.dprint(80, "LEAVE parse_inline=%s", x) + return x + + def parse_para(self): + self.dprint(80, "ENTER parse_para, tok %s", self.peektkn()) + seq = [] + textlist = [] + while 1: + tok = self.getkn() + if tok[0] == TEXT: + textlist.append(tok[1]) + elif tok[0] == NL: + tok = self.getkn() + if tok[0] == NL or tok[0] == NIL: + break + else: + self.ungetkn() + elif tok[0] == NIL: + break + elif tok[0] == DELIM: + if tok[1] in self.inline_delims: + if textlist: + seq.append((TEXT, textlist)) + textlist = [] + x = self.parse_inline(tok) + if x: + seq.append(x) else: - yield(self.TMPL, target, text) - elif m.group(0) == "[": - i = line.find("]", m.end(0)) - if i == -1: - i = len(line) - (target,sep,text) = line[m.end(0):i].partition(' ') - yield(self.REF, - (self.TEXT, target), - (self.SEQ, self.getkn(text))) - pos = i + 1 - elif m.group(0) == "'''": - e = boend.search(line, m.end(0)) - if e: - i = e.start(0) - pos = i + 3 + seq.append(tok) + break + else: + self.ungetkn() + break + if textlist: + seq.append((TEXT, textlist)) + self.dprint(80, "LEAVE parse_para=%s", seq) + return (PARA, seq) + + def parse_header(self, delim): + self.dprint(80, "ENTER parse_header(%s), tok %s", delim, self.peektkn()) + list = [] + while 1: + tok = self.getkn() + if tok[0] == NIL: + self.dprint(80, "LEAVE parse_header=%s", "None") + return None + elif tok[0] == TEXT: + list.append(tok) + elif tok[0] == DELIM: + if tok[1] == delim: + if self.peektkn()[0] == NL: + break else: - pos = len(line) - i = pos - yield(self.BOLD, - (self.SEQ, self.getkn(line[m.end(0):i]))) - elif m.group(0) == "''": - i = self.itend(line, m.end(0)) - if i == -1: - pos = len(line) - i = pos + self.dprint(80, "LEAVE parse_header=%s", "None") + return None + else: + x = self.parse_inline(tok) + if x: + list.append(x) else: - pos = i + 2 - yield(self.IT, - (self.SEQ, self.getkn(line[m.end(0):i]))) + self.dprint(80, "LEAVE parse_header=%s", "None") + return None #FIXME? + else: + self.dprint(80, "LEAVE parse_header=%s", "None") + return None + self.dprint(80, "LEAVE parse_header=(HDR, %s, (SEQ,%s))",len(delim)-1,list) + return (HDR,len(delim)-1,(SEQ,list)) - def input(self): - return None - def expandtok(self, tok): - if type(tok) == GeneratorType: - subtree = [self.SEQ] - for t in tok: - x = self.expandtok(t) + def parse_line(self): + self.dprint(80, "ENTER parse_line, tok %s", self.peektkn()) + list = [] + while 1: + tok = self.getkn() + if tok[0] == NL or tok[0] == NIL: + break + elif tok[0] == TEXT: + list.append(tok) + elif tok[0] == DELIM and tok[1][0] == ":": + list.append(self.parse_indent(len(tok[1]))) + break + else: + x = self.parse_inline(tok) if x: - subtree.append(x) - return tuple(subtree) if len(subtree) > 2 else \ - subtree[1] if len(subtree) == 2 else None + list.append(x) + else: + list.append(tok) + self.dprint(80, "LEAVE parse_line=(SEQ, %s)", list) + return (SEQ, list) + + def parse_env(self, type, lev): + self.dprint(80, "ENTER parse_env(%s,%s), tok %s",type,lev,self.peektkn()) + list = [] + while 1: + tok = self.getkn() + if tok[0] == DELIM and tok[1][0] in envtypes and type == envtypes.index(tok[1][0]): + if len(tok[1]) < lev: + self.ungetkn() + break + elif len(tok[1]) > lev: + self.ungetkn() + elt = self.parse_env(type, len(tok[1])) + else: + elt = self.parse_line() + if len(tok) == 3: + if list[-1][0] != SEQ: + x = list[-1] + list[-1] = (SEQ, [x]) + list[-1][1].append(elt) + continue + list.append(elt) + else: + self.ungetkn() + break + self.dprint(80, "LEAVE parse_env=(ENV, %s, %s, %s)", type, lev, list) + return (ENV, type, lev, list) + + def parse_indent(self, lev): + self.dprint(80, "ENTER parse_indent(%s), tok %s", lev, self.peektkn()) + x = (IND, lev, self.parse_line()) + self.dprint(80, "LEAVE parse_indent=%s", x) + return x + + def parse0(self): + tok = self.getkn() toktype = tok[0] - if toktype == self.NIL: + if toktype == NIL: return None - if toktype == self.TEXT: - return tok if tok[1] != '' else None - elif toktype == self.LINK or toktype == self.TMPL \ - or toktype == self.REF: - return toktype, self.expandtok(tok[1]), self.expandtok(tok[2]) - elif toktype == self.IT or toktype == self.BOLD \ - or toktype == self.ITEM: - return toktype, self.expandtok(tok[1]) - elif toktype == self.HDR: - return toktype, tok[1], self.expandtok(tok[2]) - elif toktype == self.ENV: - return toktype,tok[1],tok[2],self.expandtok(tok[3]) - elif toktype == self.SEQ: - if len(tok) == 2: - return self.expandtok(tok[1]) - elif len(tok) == 1: - return None + elif toktype == TEXT: + self.ungetkn() + return self.parse_para() + elif toktype == DELIM: + if tok[1] == "----": + return (BAR,) + elif tok[1][0:2] == "==": + return self.parse_header(tok[1]) + elif tok[1][0] in envtypes: + type = envtypes.index(tok[1][0]) + lev = len(tok[1]) + self.ungetkn() + return self.parse_env(type, lev) + elif tok[1][0] == ":": + return self.parse_indent(len(tok[1])) else: - subtree = [self.SEQ] - for t in tok[1:]: - x = self.expandtok(t) - if x: - subtree.append(x) - return tuple(subtree) if len(subtree) > 2 else \ - subtree[1] if len(subtree) == 2 else None - else: - return tok - - def parse(self): - tree = [self.SEQ] - for tok in self.nextkn(): - tree.append(self.expandtok(tok)) - self.tree = tuple(tree) - - def prtok(self, tok, indent): - if not tok: - print " " * indent, "None" - return - toktype = tok[0] - if toktype == self.SEQ: - for t in tok[1:]: - self.prtok(t, indent) - else: - print " " * indent, - if toktype == self.NIL: - print "NIL" - if toktype == self.TEXT: - print "TEXT \"%s\"" % (tok[1].encode('string_escape')) - elif toktype == self.LINK: - print "LINK " - self.prtok(tok[1], indent+1) # target - self.prtok(tok[2], indent+1) # text - elif toktype == self.TMPL: - print "TMPL" - self.prtok(tok[1], indent+1) # target - self.prtok(tok[2], indent+1) # text - elif toktype == self.REF: - print "REF" - self.prtok(tok[1], indent+1) # target - self.prtok(tok[2], indent+1) # text - elif toktype == self.IT: - print "IT" - self.prtok(tok[1], indent+1) - elif toktype == self.BOLD: - print "BOLD" - self.prtok(tok[1], indent+1) - elif toktype == self.HDR: - print "HDR", tok[1] - self.prtok(tok[2], indent+1) - elif toktype == self.BAR: - print "BAR" - elif toktype == self.ENV: - print "ENV ",self.envtypes[tok[1]],tok[2] - self.prtok(tok[3], indent+1) - elif toktype == self.ITEM: - print "ITEM" - self.prtok(tok[1], indent+1) - elif toktype == self.PARA: - print "PARA" + self.ungetkn() + return self.parse_para() + elif toktype == NL: + return self.parse0() - def output(self): - self.prtok(self.tree, 0) + def parse(self): + if not self.toklist: + self.tokenize() + self.dprint(90, "TOKLIST: %s", self.toklist) + self.tokind = 0 + self.tree = [] + while 1: + subtree = self.parse0() + if subtree == None: + break + self.tree.append(subtree) + self.dprint(70, "TREE: %s", self.tree) + + def __str__(self): + return str(self.tree) class WikiMarkup (BaseWikiMarkup): @@ -698,52 +822,11 @@ class WikiMarkup (BaseWikiMarkup): "zu": "isiZulu" # Zulu } - def str_nil(self, tok, env): - return None - - def str_text(self, tok, env): - return tok[1] - - def str_seq(self, tok, env): - str = "" - for t in tok[1:]: - s = self.fmtok(t, env) - if s: - str += s - return str + - def fmtok(self, tok, env): - if type(tok) != TupleType: - return "" - toktype = tok[0] - if toktype == self.NIL: - return self.str_nil(tok, env) - if toktype == self.TEXT: - return self.str_text(tok, env) - elif toktype == self.LINK: - return self.str_link(tok, env) - elif toktype == self.TMPL: - return self.str_tmpl(tok, env) - elif toktype == self.REF: - return self.str_ref(tok, env) - elif toktype == self.IT: - return self.str_it(tok, env) - elif toktype == self.BOLD: - return self.str_bold(tok, env) - elif toktype == self.HDR: - return self.str_hdr(tok, env) - elif toktype == self.BAR: - return self.str_bar(tok, env) - elif toktype == self.ENV: - return self.str_env(tok, env) - elif toktype == self.ITEM: - return self.str_item(tok, env) - elif toktype == self.SEQ: - return self.str_seq(tok, env) - elif toktype == self.PARA: - return self.str_para(tok, env) + + + - def __str__(self): - return self.fmtok(self.tree, None) |