summary | refs | log | tree | commit | diff
path: root/wikimarkup.py
diff options
context:
space:
mode:
author: Sergey Poznyakoff <gray@Pirx.gnu.org.ua> 2008-11-26 08:06:06 +0200
committer: Sergey Poznyakoff <gray@Pirx.gnu.org.ua> 2008-11-26 08:06:06 +0200
commit: 5dc93e466efaaa243e6490961b6e545eaa65f06c (patch)
tree: 844b75613cabb2c0394828492038546c1f9806d8 /wikimarkup.py
download: wikitrans-5dc93e466efaaa243e6490961b6e545eaa65f06c.tar.gz
wikitrans-5dc93e466efaaa243e6490961b6e545eaa65f06c.tar.bz2
Initial commit
Diffstat (limited to 'wikimarkup.py')
-rw-r--r-- wikimarkup.py 362
1 files changed, 362 insertions, 0 deletions
diff --git a/wikimarkup.py b/wikimarkup.py
new file mode 100644
index 0000000..3308da2
--- /dev/null
+++ b/wikimarkup.py
@@ -0,0 +1,362 @@
+#!/usr/bin/python
+# Copyright (C) 2008 Sergey Poznyakoff
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import sys
+import re
+from types import *
+
+__all__ = [ "BaseWikiMarkup", "WikiMarkup" ]
+
+# Block-level markup introducers.  Applied with match(), i.e. anchored at
+# the start of the line: a run of `=', a line that is exactly `----', or a
+# run of `*', `#' and `:' characters.
+eltbeg = re.compile("=+|(^----$)|^[\\*#:]+")
+# Inline markup openers: `[[', `{{', a single `[', or two or three
+# apostrophes.
+eltre = re.compile("(\\[\\[)|(\\{\\{)|\\[|(\\'\\'\\'?)")
+# Keyed by opener: matches either the `|' separator or the closing
+# delimiter of that opener.
+delims = { "[[" : re.compile("\\||(\\]\\])"),
+ "{{" : re.compile("\\||(\\}\\})") }
+# Keyed by opener: the closing delimiter as a plain string.
+term = { "[[" : "]]" , "{{" : "}}" }
+# Keyed by opener: matches the closing delimiter only.
+ends = { "[[" : re.compile("\\]\\]"),
+ "{{" : re.compile("\\}\\}") }
+# Two apostrophes followed by end of line or by a non-apostrophe character.
+itend = re.compile("\\'\\'($|[^\\'])")
+# Three apostrophes followed by end of line or by a non-apostrophe
+# character.  Note that the match includes that following character.
+boend = re.compile("\\'\\'\\'($|[^\\'])")
+
+class BaseWikiMarkup:
+ """
+A base class for handling Wiki markups.
+It handles:
+ 1. basic block markup (headers, numbered and unnumbered lists,
+ indentations);
+ 2. basic inline markup (bold, italic);
+ 3. basic reference markup (links, templates, external links).
+ It does NOT handle:
+ 1. pseudo-html markup (<nowiki></nowiki>, and similar);
+ 2. leading spaces meaning ``preserve formatting'';
+ 3. tables and math.
+ The above rests for FIXME.
+
+ This class relies on its derived classes for providing input. They must
+ overload method `input', which must return one physical line of input for
+ each call.
+
+ Variables:
+
+ 1. tree
+ The parse tree. Valid after parse() finishes (see below).
+
+ Methods:
+
+ 1. parse()
+ Parse the input and build parse tree
+
+ 2. input()
+ Virtual function. Return next line of input or None on EOF.
+
+ 3. output()
+ Print the tree in internal representation.
+ """
+ ## Token classes
+ # NIL: nothing
+ NIL = 0
+ # TEXT: text
+ TEXT = 1
+ # LINK: target, text
+ LINK = 2
+ # Template: target, text
+ TMPL = 3
+ # External ref: target, text
+ REF = 4
+ # Italics: text
+ IT = 5
+ # Bold: text
+ BOLD = 6
+ # Header: level, text
+ HDR = 7
+ # Horizontal bar:
+ BAR = 8
+ # Environment: type, level
+ ENV = 9
+ # Item: text
+ ITEM = 10
+ # Sequence: seq
+ SEQ = 11
+
+ # Environment types:
+ # Unnumbered list
+ ENVUNNUM = 0
+ # Numbered list
+ ENVNUM = 1
+ # Indent
+ INDENT = 2
+ envtypes = [ "*", "#", ":" ]
+
+ tree = None
+
+ def itend(self, line, pos):
+ while 1:
+ d = itend.search(line, pos)
+ if not d:
+ return -1
+ elif d.start(0) == pos or line[d.start(0)-1] != "'":
+ return d.start(0)
+ else:
+ pos = d.start(0) + 1
+
+ la = None
+ def putback(self, line):
+ self.la = line
+
+ def nextkn(self, curlev=0, type = -1):
+ while 1:
+ if self.la:
+ line = self.la
+ self.putback(None)
+ else:
+ try:
+ line = self.input()
+ except StopIteration:
+ line = u''
+ if not line or line == "":
+ self.putback(line)
+ break
+
+ m = eltbeg.match(line)
+ if m:
+ if m.group(0)[0] in self.envtypes:
+ btype = self.envtypes.index(m.group(0)[0])
+ lev = len(m.group(0))
+ if btype == type:
+ if lev == curlev:
+ yield(self.ITEM,
+ (self.SEQ, self.getkn(line[m.end(0):])))
+ elif lev > curlev:
+ self.putback(line)
+ yield(self.ENV, btype, curlev + 1,
+ (self.SEQ, self.nextkn(curlev + 1, btype)))
+ else:
+ self.putback(line)
+ break
+ else:
+ self.putback(line)
+ yield(self.ENV, btype, 1, self.nextkn(1, btype))
+
+ else:
+ if curlev > 0:
+ self.putback(line)
+ break
+ elif m.group(0)[0:2] == "==" \
+ and line.rstrip('\n').endswith(m.group(0)):
+ yield(self.HDR, len(m.group(0))-1,
+ self.getkn(line[m.end(0):-(1+len(m.group(0)))]))
+ elif m.group(0) == "----":
+ yield(self.BAR,)
+ else:
+ if curlev > 0:
+ self.putback(line)
+ break
+ yield(self.getkn(line))
+
+
+ def getkn(self, line):
+ pos = 0
+ while 1:
+ if pos == len(line):
+ break;
+ m = eltre.search(line, pos)
+ if not m:
+ yield(self.TEXT, line[pos:])
+ pos = len(line)
+ else:
+ yield(self.TEXT, line[pos:m.start(0)])
+ pos = m.end(0)
+ if m.group(0) == "[[" or m.group(0) == "{{":
+ d = delims[m.group(0)].search(line, pos)
+ if d.group(0) == "|":
+ e = ends[m.group(0)].search(line, d.end(0))
+ target = (self.TEXT, line[pos:d.start(0)])
+ text = (self.SEQ, self.getkn(line[d.end(0):e.start(0)]))
+ pos = e.end(0)
+ elif d.group(0) == term[m.group(0)]:
+ target = (self.TEXT, line[pos:d.start(0)])
+ text = (self.NIL,)
+ pos = d.end(0)
+ if m.group(0) == "[[":
+ yield(self.LINK, target, text)
+ else:
+ yield(self.TMPL, target, text)
+ elif m.group(0) == "[":
+ i = line.find("]", m.end(0))
+ if i == -1:
+ i = len(line)
+ (target,sep,text) = line[m.end(0):i].partition(' ')
+ yield(self.REF,
+ (self.TEXT, target),
+ (self.SEQ, self.getkn(text)))
+ pos = i + 1
+ elif m.group(0) == "'''":
+ e = boend.search(line, m.end(0))
+ if e:
+ i = e.start(0)
+ pos = e.end(0)
+ else:
+ pos = len(line)
+ i = pos
+ yield(self.BOLD,
+ (self.SEQ, self.getkn(line[m.end(0):i])))
+ pos = e.end(0)
+ elif m.group(0) == "''":
+ i = self.itend(line, m.end(0))
+ if i == -1:
+ i = len(line)
+ yield(self.IT,
+ (self.SEQ, self.getkn(line[m.end(0):i])))
+ pos = i + 2
+
+ def input(self):
+ return None
+
+ def expandtok(self, tok):
+ if type(tok) == GeneratorType:
+ subtree = [self.SEQ]
+ for t in tok:
+ x = self.expandtok(t)
+ if x:
+ subtree.append(x)
+ return tuple(subtree) if len(subtree) > 2 else \
+ subtree[1] if len(subtree) == 2 else None
+ toktype = tok[0]
+ if toktype == self.NIL:
+ return None
+ if toktype == self.TEXT:
+ return tok if tok[1] != '' else None
+ elif toktype == self.LINK or toktype == self.TMPL \
+ or toktype == self.REF:
+ return toktype, self.expandtok(tok[1]), self.expandtok(tok[2])
+ elif toktype == self.IT or toktype == self.BOLD \
+ or toktype == self.ITEM:
+ return toktype, self.expandtok(tok[1])
+ elif toktype == self.HDR:
+ return toktype, tok[1], self.expandtok(tok[2])
+ elif toktype == self.BAR:
+ return tok
+ elif toktype == self.ENV:
+ return toktype,tok[1],tok[2],self.expandtok(tok[3])
+ elif toktype == self.SEQ:
+ if len(tok) == 2:
+ return self.expandtok(tok[1])
+ elif len(tok) == 1:
+ return None
+ else:
+ subtree = [self.SEQ]
+ for t in tok[1:]:
+ x = self.expandtok(t)
+ if x:
+ subtree.append(x)
+ return tuple(subtree) if len(subtree) > 2 else \
+ subtree[1] if len(subtree) == 2 else None
+
+ def parse(self):
+ tree = [self.SEQ]
+ for tok in self.nextkn():
+ tree.append(self.expandtok(tok))
+ self.tree = tuple(tree)
+
+ def prtok(self, tok, indent):
+ if not tok:
+ print " " * indent, "None"
+ return
+ toktype = tok[0]
+ if toktype == self.SEQ:
+ for t in tok[1:]:
+ self.prtok(t, indent)
+ else:
+ print " " * indent,
+ if toktype == self.NIL:
+ print "NIL"
+ if toktype == self.TEXT:
+ print "TEXT \"%s\"" % (tok[1].encode('string_escape'))
+ elif toktype == self.LINK:
+ print "LINK "
+ self.prtok(tok[1], indent+1) # target
+ self.prtok(tok[2], indent+1) # text
+ elif toktype == self.TMPL:
+ print "TMPL"
+ self.prtok(tok[1], indent+1) # target
+ self.prtok(tok[2], indent+1) # text
+ elif toktype == self.REF:
+ print "REF"
+ self.prtok(tok[1], indent+1) # target
+ self.prtok(tok[2], indent+1) # text
+ elif toktype == self.IT:
+ print "IT"
+ self.prtok(tok[1], indent+1)
+ elif toktype == self.BOLD:
+ print "BOLD"
+ self.prtok(tok[1], indent+1)
+ elif toktype == self.HDR:
+ print "HDR", tok[1]
+ self.prtok(tok[2], indent+1)
+ elif toktype == self.BAR:
+ print "BAR"
+ elif toktype == self.ENV:
+ print "ENV ",self.envtypes[tok[1]],tok[2]
+ self.prtok(tok[3], indent+1)
+ elif toktype == self.ITEM:
+ print "ITEM"
+ self.prtok(tok[1], indent+1)
+
+ def output(self):
+ self.prtok(self.tree, 0)
+
+
+class WikiMarkup (BaseWikiMarkup):
+ """
+ A derived class, that supplies a basic input method.
+
+ Three types of inputs are available:
+
+ 1. filename=<file>
+ The file <file> is opened and used for input.
+ 2. file=<file>
+ The already opened file <file> is used for input.
+ 3. text=<string>
+ Input is taken from <string>, line by line.
+
+ Usage:
+
+ obj = WikiMarkup(arg=val)
+ obj.parse
+ ... Do whatever you need with obj.tree ...
+
+ """
+ file = None
+ text = None
+ def __init__(self, *args, **keywords):
+ if 'file' in keywords:
+ self.file = keywords['file']
+ elif 'filename' in keywords:
+ self.file = open(keywords['filename'])
+ elif 'text' in keywords:
+ self.text = keywords['text'].split("\n")
+
+ def __del__(self):
+ if self.file:
+ self.file.close()
+
+ def input(self):
+ if self.file:
+ return self.file.readline()
+ elif self.text:
+ return self.text.pop(0)
+ else:
+ return None
+

Return to:

Send suggestions and report system problems to the System administrator.