diff options
author | Sergey Poznyakoff <gray@Pirx.gnu.org.ua> | 2008-11-26 08:06:06 +0200 |
---|---|---|
committer | Sergey Poznyakoff <gray@Pirx.gnu.org.ua> | 2008-11-26 08:06:06 +0200 |
commit | 5dc93e466efaaa243e6490961b6e545eaa65f06c (patch) | |
tree | 844b75613cabb2c0394828492038546c1f9806d8 /wikimarkup.py | |
download | wikitrans-5dc93e466efaaa243e6490961b6e545eaa65f06c.tar.gz wikitrans-5dc93e466efaaa243e6490961b6e545eaa65f06c.tar.bz2 |
Initial commit
Diffstat (limited to 'wikimarkup.py')
-rw-r--r-- | wikimarkup.py | 362 |
1 files changed, 362 insertions, 0 deletions
#!/usr/bin/python
# Copyright (C) 2008 Sergey Poznyakoff
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import sys
import re
from types import GeneratorType

__all__ = ["BaseWikiMarkup", "WikiMarkup"]

# Block-level markup recognised at the very start of a line: a run of
# "=" (header candidate), a horizontal bar, or a run of list/indent marks.
eltbeg = re.compile(r"=+|(^----$)|^[\*#:]+")
# Openers of inline markup: link, template, external reference, bold/italic.
eltre = re.compile(r"(\[\[)|(\{\{)|\[|(\'\'\'?)")
# First delimiter after a link/template opener: "|" or the terminator.
delims = {"[[": re.compile(r"\||(\]\])"),
          "{{": re.compile(r"\||(\}\})")}
term = {"[[": "]]", "{{": "}}"}
ends = {"[[": re.compile(r"\]\]"),
        "{{": re.compile(r"\}\}")}
# Closing italic / bold marks (not followed by a further quote).
itend = re.compile(r"\'\'($|[^\'])")
boend = re.compile(r"\'\'\'($|[^\'])")


def _escape(text):
    """Return TEXT with special characters backslash-escaped for display."""
    try:
        # Python 2 byte strings.
        return text.encode('string_escape')
    except LookupError:
        # The 'string_escape' codec does not exist on Python 3.
        return text.encode('unicode_escape').decode('ascii')


class BaseWikiMarkup:
    """
    A base class for handling Wiki markups.

    It handles:
     1. basic block markup (headers, numbered and unnumbered lists,
        indentations);
     2. basic inline markup (bold, italic);
     3. basic reference markup (links, templates, external links).

    It does NOT handle:
     1. pseudo-html markup (<nowiki></nowiki>, and similar);
     2. leading spaces meaning ``preserve formatting'';
     3. tables and math.
    The above are left as FIXMEs.

    This class relies on its derived classes for providing input.  They must
    overload method `input', which must return one physical line of input for
    each call.

    Variables:

     1. tree
        The parse tree.  Valid after parse() finishes (see below).

    Methods:

     1. parse()
        Parse the input and build parse tree

     2. input()
        Virtual function.  Return next line of input or None on EOF.

     3. output()
        Print the tree in internal representation.
    """
    ## Token classes
    # NIL: nothing
    NIL = 0
    # TEXT: text
    TEXT = 1
    # LINK: target, text
    LINK = 2
    # Template: target, text
    TMPL = 3
    # External ref: target, text
    REF = 4
    # Italics: text
    IT = 5
    # Bold: text
    BOLD = 6
    # Header: level, text
    HDR = 7
    # Horizontal bar:
    BAR = 8
    # Environment: type, level
    ENV = 9
    # Item: text
    ITEM = 10
    # Sequence: seq
    SEQ = 11

    # Environment types:
    # Unnumbered list
    ENVUNNUM = 0
    # Numbered list
    ENVNUM = 1
    # Indent
    INDENT = 2
    envtypes = ["*", "#", ":"]

    tree = None

    def itend(self, line, pos):
        """Return the index of the '' closing an italic span, or -1.

        The search starts at POS.  A candidate directly preceded by another
        quote is skipped, because it is the tail of a ''' (bold) mark.
        """
        while True:
            d = itend.search(line, pos)
            if not d:
                return -1
            if d.start(0) == pos or line[d.start(0) - 1] != "'":
                return d.start(0)
            pos = d.start(0) + 1

    # One-line look-ahead buffer shared by the nested nextkn() generators.
    la = None

    def putback(self, line):
        """Push LINE back so that the next read returns it again."""
        self.la = line

    def nextkn(self, curlev=0, type=-1):
        """Generate block-level tokens, one input line at a time.

        CURLEV is the nesting level of the environment being collected
        (0 at top level) and TYPE its index in `envtypes' (-1 at top level).
        Nested environments are yielded as (ENV, type, level, tokens) where
        TOKENS is another, lazily evaluated, nextkn() generator; a line that
        does not belong to the current environment is put back and ends
        the generator.
        """
        while True:
            if self.la:
                line = self.la
                self.putback(None)
            else:
                try:
                    line = self.input()
                except StopIteration:
                    line = ''
            if not line:
                # End of input: leave the marker for the enclosing levels.
                self.putback(line)
                break

            m = eltbeg.match(line)
            if not m:
                if curlev > 0:
                    self.putback(line)
                    break
                yield self.getkn(line)
                continue

            mark = m.group(0)
            if mark[0] in self.envtypes:
                btype = self.envtypes.index(mark[0])
                lev = len(mark)
                if btype == type:
                    if lev == curlev:
                        yield (self.ITEM,
                               (self.SEQ, self.getkn(line[m.end(0):])))
                    elif lev > curlev:
                        # Deeper nesting: re-read the line one level down.
                        self.putback(line)
                        yield (self.ENV, btype, curlev + 1,
                               (self.SEQ, self.nextkn(curlev + 1, btype)))
                    else:
                        self.putback(line)
                        break
                else:
                    self.putback(line)
                    yield (self.ENV, btype, 1, self.nextkn(1, btype))
                continue

            if curlev > 0:
                # Headers, bars and text terminate any open environment.
                self.putback(line)
                break

            # Strip the line terminator before looking at the tail, so that
            # input without trailing newlines is sliced correctly too.
            body = line.rstrip('\r\n')
            if mark.startswith("==") and body.endswith(mark):
                yield (self.HDR, len(mark) - 1,
                       self.getkn(body[m.end(0):len(body) - len(mark)]))
            elif mark == "----":
                yield (self.BAR,)
            else:
                # Not a well-formed header after all: keep it as plain
                # text instead of silently dropping the line.
                yield self.getkn(line)

    def getkn(self, line):
        """Generate inline tokens (text, links, bold, ...) found in LINE.

        Markup that is not terminated extends to the end of LINE.
        """
        pos = 0
        end = len(line)
        while pos < end:
            m = eltre.search(line, pos)
            if not m:
                yield (self.TEXT, line[pos:])
                break

            yield (self.TEXT, line[pos:m.start(0)])
            opener = m.group(0)
            start = m.end(0)
            pos = start

            if opener == "[[" or opener == "{{":
                d = delims[opener].search(line, start)
                if d is None:
                    # Unterminated: the target runs to the end of line.
                    target = (self.TEXT, line[start:])
                    text = (self.NIL,)
                    pos = end
                elif d.group(0) == "|":
                    e = ends[opener].search(line, d.end(0))
                    stop = e.start(0) if e else end
                    target = (self.TEXT, line[start:d.start(0)])
                    text = (self.SEQ, self.getkn(line[d.end(0):stop]))
                    pos = e.end(0) if e else end
                else:
                    target = (self.TEXT, line[start:d.start(0)])
                    text = (self.NIL,)
                    pos = d.end(0)
                if opener == "[[":
                    yield (self.LINK, target, text)
                else:
                    yield (self.TMPL, target, text)
            elif opener == "[":
                i = line.find("]", start)
                if i == -1:
                    i = end
                (target, sep, text) = line[start:i].partition(' ')
                yield (self.REF,
                       (self.TEXT, target),
                       (self.SEQ, self.getkn(text)))
                pos = i + 1
            elif opener == "'''":
                e = boend.search(line, start)
                i = e.start(0) if e else end
                yield (self.BOLD,
                       (self.SEQ, self.getkn(line[start:i])))
                # Skip only the closing quotes; the regexp match may also
                # include the character following them.
                pos = i + 3
            elif opener == "''":
                i = self.itend(line, start)
                if i == -1:
                    i = end
                yield (self.IT,
                       (self.SEQ, self.getkn(line[start:i])))
                pos = i + 2

    def input(self):
        """Return the next line of input, or None on EOF (virtual)."""
        return None

    def _collapse(self, toks):
        """Expand each token of TOKS and fold the results into one node.

        Empty results are dropped.  Return None if nothing is left, the
        only node if exactly one is left, and a SEQ tuple otherwise.
        """
        subtree = [self.SEQ]
        for t in toks:
            x = self.expandtok(t)
            if x:
                subtree.append(x)
        if len(subtree) > 2:
            return tuple(subtree)
        if len(subtree) == 2:
            return subtree[1]
        return None

    def expandtok(self, tok):
        """Convert TOK, which may contain generators, into nested tuples.

        Return None for NIL tokens, empty text and empty sequences.
        """
        if isinstance(tok, GeneratorType):
            return self._collapse(tok)
        toktype = tok[0]
        if toktype == self.NIL:
            return None
        if toktype == self.TEXT:
            return tok if tok[1] != '' else None
        if toktype in (self.LINK, self.TMPL, self.REF):
            return toktype, self.expandtok(tok[1]), self.expandtok(tok[2])
        if toktype in (self.IT, self.BOLD, self.ITEM):
            return toktype, self.expandtok(tok[1])
        if toktype == self.HDR:
            return toktype, tok[1], self.expandtok(tok[2])
        if toktype == self.BAR:
            return tok
        if toktype == self.ENV:
            return toktype, tok[1], tok[2], self.expandtok(tok[3])
        if toktype == self.SEQ:
            return self._collapse(tok[1:])
        return None

    def parse(self):
        """Read all the input and store the resulting tree in self.tree."""
        tree = [self.SEQ]
        for tok in self.nextkn():
            tree.append(self.expandtok(tok))
        self.tree = tuple(tree)

    def prtok(self, tok, indent):
        """Print TOK and its children on stdout, indented by INDENT."""
        write = sys.stdout.write
        # The extra blank reproduces the separator that the former
        # `print " " * indent, ...' statements emitted.
        pad = " " * indent + " "
        if not tok:
            write(pad + "None\n")
            return
        toktype = tok[0]
        if toktype == self.SEQ:
            for t in tok[1:]:
                self.prtok(t, indent)
        elif toktype == self.NIL:
            write(pad + "NIL\n")
        elif toktype == self.TEXT:
            write(pad + "TEXT \"%s\"\n" % _escape(tok[1]))
        elif toktype == self.LINK:
            write(pad + "LINK \n")
            self.prtok(tok[1], indent + 1)  # target
            self.prtok(tok[2], indent + 1)  # text
        elif toktype == self.TMPL:
            write(pad + "TMPL\n")
            self.prtok(tok[1], indent + 1)  # target
            self.prtok(tok[2], indent + 1)  # text
        elif toktype == self.REF:
            write(pad + "REF\n")
            self.prtok(tok[1], indent + 1)  # target
            self.prtok(tok[2], indent + 1)  # text
        elif toktype == self.IT:
            write(pad + "IT\n")
            self.prtok(tok[1], indent + 1)
        elif toktype == self.BOLD:
            write(pad + "BOLD\n")
            self.prtok(tok[1], indent + 1)
        elif toktype == self.HDR:
            write(pad + "HDR %s\n" % (tok[1],))
            self.prtok(tok[2], indent + 1)
        elif toktype == self.BAR:
            write(pad + "BAR\n")
        elif toktype == self.ENV:
            write(pad + "ENV  %s %s\n" % (self.envtypes[tok[1]], tok[2]))
            self.prtok(tok[3], indent + 1)
        elif toktype == self.ITEM:
            write(pad + "ITEM\n")
            self.prtok(tok[1], indent + 1)

    def output(self):
        """Print the parse tree in its internal representation."""
        self.prtok(self.tree, 0)


class WikiMarkup (BaseWikiMarkup):
    """
    A derived class, that supplies a basic input method.

    Three types of inputs are available:

    1. filename=<file>
       The file <file> is opened and used for input.
    2. file=<file>
       The already opened file <file> is used for input.
    3. text=<string>
       Input is taken from <string>, line by line.

    Usage:

       obj = WikiMarkup(arg=val)
       obj.parse()
       ... Do whatever you need with obj.tree ...

    """
    file = None
    text = None

    def __init__(self, *args, **keywords):
        if 'file' in keywords:
            self.file = keywords['file']
        elif 'filename' in keywords:
            self.file = open(keywords['filename'])
        elif 'text' in keywords:
            self.text = keywords['text'].split("\n")

    def __del__(self):
        # NOTE(review): this also closes a file object handed in by the
        # caller via file=...; kept as is, callers may rely on it.
        if self.file:
            self.file.close()

    def input(self):
        """Return the next input line including its newline, None on EOF."""
        if self.file:
            return self.file.readline()
        elif self.text:
            line = self.text.pop(0)
            if not self.text and line == '':
                # Empty remainder after the final newline of the text.
                return None
            # Re-attach the newline removed by split(), so that blank
            # lines are not mistaken for EOF and text input looks
            # exactly like file input to the parser.
            return line + "\n"
        else:
            return None