summary | refs | log | tree | commit | diff | about
author Sergey Poznyakoff <gray@gnu.org.ua> 2009-03-02 18:58:09 (GMT)
committer Sergey Poznyakoff <gray@gnu.org.ua> 2009-03-02 22:31:18 (GMT)
commit ae8b8bc81eab08b2ebe9f8c0957c085b5d45fc2b (patch) (side-by-side diff)
tree 551e90f993a83674faa367b776538c44704e78a6
parent 86ee544f442aa3c4a0516a620890ec64de0770cc (diff)
download wikitrans-ae8b8bc81eab08b2ebe9f8c0957c085b5d45fc2b.tar.gz
wikitrans-ae8b8bc81eab08b2ebe9f8c0957c085b5d45fc2b.tar.bz2
Rewrite from scratch. Text conversion almost(TM) works
Diffstat (more/less context) (ignore whitespace changes)
-rw-r--r-- testdata/bold.wiki 1
-rw-r--r-- testdata/boldit0.wiki 1
-rw-r--r-- testdata/boldit1.wiki 1
-rw-r--r-- testdata/boldit2.wiki 1
-rw-r--r-- testdata/boldit3.wiki 1
-rw-r--r-- testdata/header0.wiki 3
-rw-r--r-- testdata/it.wiki 1
-rw-r--r-- testdata/link.wiki 1
-rw-r--r-- testdata/para.wiki 5
-rw-r--r-- testdata/reclink.wiki 1
-rw-r--r-- wiki2text.py 177
-rw-r--r-- wikicvt.py 24
-rw-r--r-- wikimarkup.py 747
13 files changed, 561 insertions, 403 deletions
diff --git a/testdata/bold.wiki b/testdata/bold.wiki
new file mode 100644
index 0000000..90317f2
--- /dev/null
+++ b/testdata/bold.wiki
@@ -0,0 +1 @@
+now is the time for '''all good''' men to come to
diff --git a/testdata/boldit0.wiki b/testdata/boldit0.wiki
new file mode 100644
index 0000000..e1317e9
--- /dev/null
+++ b/testdata/boldit0.wiki
@@ -0,0 +1 @@
+now is the time for '''''all good''''' men to come to
diff --git a/testdata/boldit1.wiki b/testdata/boldit1.wiki
new file mode 100644
index 0000000..6ac9262
--- /dev/null
+++ b/testdata/boldit1.wiki
@@ -0,0 +1 @@
+now is the time for ''all '''good''''' men to come to
diff --git a/testdata/boldit2.wiki b/testdata/boldit2.wiki
new file mode 100644
index 0000000..0cca5c3
--- /dev/null
+++ b/testdata/boldit2.wiki
@@ -0,0 +1 @@
+now is the time for '''all ''good''''' men to come to
diff --git a/testdata/boldit3.wiki b/testdata/boldit3.wiki
new file mode 100644
index 0000000..49d8a7e
--- /dev/null
+++ b/testdata/boldit3.wiki
@@ -0,0 +1 @@
+now is the time for ''all '''good''' men'' to come to
diff --git a/testdata/header0.wiki b/testdata/header0.wiki
new file mode 100644
index 0000000..e9bea57
--- /dev/null
+++ b/testdata/header0.wiki
@@ -0,0 +1,3 @@
+== Header ==
+
+Paragraph.
diff --git a/testdata/it.wiki b/testdata/it.wiki
new file mode 100644
index 0000000..8e9e4f2
--- /dev/null
+++ b/testdata/it.wiki
@@ -0,0 +1 @@
+now is the time for ''all good'' men to come to
diff --git a/testdata/link.wiki b/testdata/link.wiki
new file mode 100644
index 0000000..3168c45
--- /dev/null
+++ b/testdata/link.wiki
@@ -0,0 +1 @@
+[[link|foo|bar|baz|text]] is a simple link.
diff --git a/testdata/para.wiki b/testdata/para.wiki
new file mode 100644
index 0000000..04395d4
--- /dev/null
+++ b/testdata/para.wiki
@@ -0,0 +1,5 @@
+First paragraph consists of two sentences.
+Each sentence occupies a line.
+
+Second paragraph consists of two sentences as well.
+Each of them, again, occupies its own line.
diff --git a/testdata/reclink.wiki b/testdata/reclink.wiki
new file mode 100644
index 0000000..a03db16
--- /dev/null
+++ b/testdata/reclink.wiki
@@ -0,0 +1 @@
+[[link|foo|bar|baz|text [[inny link|znów text]] słowo [[jeszcze link]]]]
diff --git a/wiki2text.py b/wiki2text.py
index f28c343..c41c4e0 100644
--- a/wiki2text.py
+++ b/wiki2text.py
@@ -26,7 +26,7 @@ class TextWikiMarkup (WikiMarkup):
"""
# Output width
- width = 80
+ width = 78
# Do not show references.
references = False
# Provide a minimum markup
@@ -57,22 +57,26 @@ class TextWikiMarkup (WikiMarkup):
for elt in wiki_ns_re[self.lang][str]:
if str.beginswith(elt[0]) and str.endswith(elt[1]):
return elt[2]
- return None
+ return None
+
def mktgt(self, tgt, lang = None):
if not lang:
lang = self.lang
return self.html_base % { 'lang' : lang } + urllib.quote(tgt)
- def link(self, tok, env, istmpl):
- arg = self.fmtok(tok[1], env)
- text = self.fmtok(tok[2], env)
+ def fmtlink(self, elt, istmpl):
+ arg = self.format(elt[1][0])
+ if len(elt[1]) > 1:
+ text = self.format(elt[1][1])
+ else:
+ text = None
(qual,sep,tgt) = arg.partition(':')
if tgt != '':
ns = self.wiki_ns_name(qual)
if ns:
if ns == 'NS_IMAGE':
if not self.references:
- return None
+ return ""
text = "[%s: %s]" % (qual, text if text else arg)
tgt = self.image_base + '/' + \
urllib.quote(tgt) + \
@@ -94,41 +98,9 @@ class TextWikiMarkup (WikiMarkup):
return arg
else:
return text
-
- def str_link(self, tok, env):
- return self.link(tok, env, False)
-
- def str_tmpl(self, tok, env):
- return self.link(tok, env, True)
-
- def str_ref(self, tok, env):
- return self.xref(self.fmtok(tok[2], env), self.fmtok(tok[1], env))
-
- def str_it(self, tok, env):
- if self.markup:
- return "_" + self.fmtok(tok[1], env) + "_"
- return self.fmtok(tok[1], env);
-
- def str_bold(self, tok, env):
- if self.markup:
- return self.fmtok(tok[1], env).upper()
- return self.fmtok(tok[1], env);
-
- def str_hdr(self, tok, env):
- level = tok[1]
- return "\n\n" + ("*" * level) + " " + self.fmtok(tok[2], env) + "\n\n"
-
- def str_bar(self, tok, env):
- w = self.width
- if w < 5:
- w = 5
- return "\n" + ("-" * (w - 5)).center(w - 1) + "\n"
-
- def str_env(self, tok, env):
- self.num = 1
- return "\n" + self.fmtok(tok[3], tok)
def indent (self, lev, text):
+ print "T \"",text,"\""
w = self.width
self.width = w - lev
if text.find('\n') == -1:
@@ -136,34 +108,117 @@ class TextWikiMarkup (WikiMarkup):
else:
s = ""
for elt in text.split('\n'):
- s += (" " * lev) + elt
- if elt == '':
- s += "\n"
+ s += (" " * lev) + elt + '\n'
self.width = w
return s
+
+ def fmtpara(self, input):
+ output = ""
+ linebuf = ""
+ length = 0
+ for s in input.split():
+ wlen = len(s)
+ if linebuf.endswith("."):
+ wsc = 2
+ else:
+ wsc = 1
+ if length + wsc + wlen > self.width:
+ # FIXME: fill out linebuf
+ output += linebuf + '\n'
+ wsc = 0
+ length = 0
+ linebuf = ""
+ linebuf += " " * wsc + s
+ length += wsc + wlen
+ return output + linebuf
- def str_item(self, tok, env):
- t = env[1]
- lev = env[2]
- if lev > self.width - 4:
- lev = 1
- if t == self.INDENT:
- return self.indent(lev, self.fmtok(tok[1], env))
- elif t == self.ENVNUM:
- n = self.num
- self.num += 1
- return "" + self.indent(lev,
- "%d. %s" % (n, self.fmtok(tok[1], env)))
- elif t == self.ENVUNNUM:
- return "" + self.indent(lev,
- "- " + self.fmtok(tok[1], env))
+ def fmtelt(self, elt, indent=0):
+ if elt[0] == TEXT:
+ if isinstance(elt[1],list):
+ string = ""
+ for s in elt[1]:
+ if string:
+ if string.endswith("."):
+ string += " "
+ else:
+ string += " "
+ string += s.rstrip(" ")
+ else:
+ string = elt[1]
+ elif elt[0] == PARA:
+ string = "";
+ for x in elt[1]:
+ string += self.format(x)
+ string = self.fmtpara(string) + '\n\n'
+ elif elt[0] == IT:
+ string = ""
+ for x in elt[1]:
+ s = self.format(x)
+ if s:
+ string += " " + s.rstrip(" ")
+ string = "_" + string.lstrip(" ") + "_"
+ elif elt[0] == BOLD:
+ string = ""
+ for x in elt[1]:
+ s = self.format(x)
+ if s:
+ if string.endswith("."):
+ string += " "
+ else:
+ string += " "
+ string += s.rstrip(" ")
+ string = string.upper()
+ elif elt[0] == LINK:
+ string = self.fmtlink(elt, False)
+ elif elt[0] == TMPL:
+ string = '\n' + self.fmtlink(elt, True) + '\n'
+ elif elt[0] == BAR:
+ w = self.width
+ if w < 5:
+ w = 5
+ string = "\n" + ("-" * (w - 5)).center(w - 1) + "\n"
+ elif elt[0] == HDR:
+ level = elt[1]
+ string = "\n" + ("*" * level) + " " + \
+ self.format(elt[2]).lstrip(" ") + "\n\n"
+ elif elt[0] == REF:
+ string = self.xref(self.format(elt[2]), elt[1])
+ elif elt[0] == ENV:
+ type = elt[1]
+ lev = elt[2]
+ if lev > self.width - 4:
+ lev = 1
+ string = "\n"
+ n = 1
+ for s in elt[3]:
+ x = self.format(s)
+# print "X",x
+ if type == ENVUNNUM:
+ string += self.indent(lev, "*" + x.lstrip(" ")) + '\n'
+ elif type == ENVNUM:
+ string += self.indent(lev, "%d. %s" % (n, x)) + '\n'
+ n += 1
+ elif elt[0] == IND:
+ string = (" " * elt[1]) + self.format(elt[2]) + '\n'
+ else:
+ string = str(elt)
+ return string
+
+ def format(self, elt, indent=0):
+ string = ""
+ if elt[0] == SEQ:
+ for x in elt[1]:
+ string += " " + self.format(x, indent)
+ else:
+ string += " " + self.fmtelt(elt, indent)
+ return string
- def str_para(self, tok, env):
- return "\n"
-
def __str__(self):
- return self.fmtok(self.tree, None)
+ str = ""
+ for elt in self.tree:
+ str += self.format(elt)
+ return str
class TextWiktionaryMarkup (TextWikiMarkup):
"""
diff --git a/wikicvt.py b/wikicvt.py
index 758bcb1..a2e95e4 100644
--- a/wikicvt.py
+++ b/wikicvt.py
@@ -32,9 +32,11 @@ def main():
html = 1
lang = "pl"
kwdict = {}
+ debug = 0
+
try:
- opts, args = getopt.getopt(sys.argv[1:], "hl:o:tv",
- ["help", "lang=", "option=",
+ opts, args = getopt.getopt(sys.argv[1:], "d:hl:o:tv",
+ ["debug=", "help", "lang=", "option=",
"text", "input-text", "verbose" ])
except getopt.GetoptError:
usage(1)
@@ -42,18 +44,20 @@ def main():
for o, a in opts:
if o in ("-h", "--help"):
usage()
- if o in ("-v", "--verbose"):
+ elif o in ("-v", "--verbose"):
verbose_flag = verbose_flag + 1
- if o in ("-t", "--text"):
+ elif o in ("-t", "--text"):
html = 0
- if o in ("-l", "--lang"):
+ elif o in ("-l", "--lang"):
lang = a
- if o in ("-o", "--option"):
+ elif o in ("-o", "--option"):
(kw,sep,val) = a.partition('=')
if val != '':
kwdict[kw] = eval(val)
- if o == "--input-text":
+ elif o == "--input-text":
input_text = True
+ elif o in ("-d", "--debug"):
+ debug = eval(a)
if len(args) == 1:
if args[0] == '-':
@@ -68,11 +72,11 @@ def main():
markup = HtmlWiktionaryMarkup(**kwdict)
else:
markup = TextWiktionaryMarkup(**kwdict)
-
+ markup.debug_level = debug
markup.parse()
print str(markup)
- if verbose_flag > 0:
- markup.output()
+# if verbose_flag > 0:
+# markup.output()
if __name__ == '__main__':
main()
diff --git a/wikimarkup.py b/wikimarkup.py
index 4fd4e44..9cfdb09 100644
--- a/wikimarkup.py
+++ b/wikimarkup.py
@@ -1,6 +1,6 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
-# Copyright (C) 2008 Sergey Poznyakoff
+# Copyright (C) 2008, 2009 Sergey Poznyakoff
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -19,327 +19,451 @@ import sys
import re
from types import *
-__all__ = [ "BaseWikiMarkup", "WikiMarkup" ]
-
-eltbeg = re.compile("=+|(^----$)|^[\\*#:]+")
-eltre = re.compile("(\\[\\[)|(\\{\\{)|\\[|(\\'\\'\\'?)")
-delims = { "[[" : re.compile("\\||(\\]\\])"),
- "{{" : re.compile("\\||(\\}\\})") }
-term = { "[[" : "]]" , "{{" : "}}" }
-ends = { "[[" : re.compile("(\\[\\[)|(\\]\\])"),
- "{{" : re.compile("(\\{\\{)|(\\}\\})") }
-itend = re.compile("\\'\\'($|[^\\'])")
-boend = re.compile("\\'\\'\\'($|[^\\'])")
+__all__ = [ "BaseWikiMarkup", "WikiMarkup",
+ "NIL", "TEXT", "DELIM", "NL", "PARA",
+ "IT", "BOLD", "LINK", "TMPL",
+ "BAR", "HDR", "REF", "ENV", "IND", "SEQ",
+ "ENVUNNUM", "ENVNUM", "envtypes" ]
+
+delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^:+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)")
+
+NIL = 0
+TEXT = 1
+DELIM = 2
+NL = 3
+
+PARA = 4
+IT = 5
+BOLD = 6
+LINK = 7
+TMPL = 8
+BAR = 9
+HDR = 10
+REF = 11
+ENV = 12
+IND = 13
+SEQ = 14
+
+# Environment types:
+# Unnumbered list
+ENVUNNUM = 0
+# Numbered list
+ENVNUM = 1
+envtypes = [ "*", "#" ]
class BaseWikiMarkup:
- """
-A base class for handling Wiki markups.
-It handles:
- 1. paragraphs;
- 2. basic block markup (headers, numbered and unnumbered lists,
- indentations);
- 3. basic inline markup (bold, italic);
- 4. basic reference markup (links, templates, external links).
- It does NOT handle:
- 1. pseudo-html markup (<nowiki></nowiki>, and similar);
- 2. leading spaces meaning ``preserve formatting'';
- 3. tables and math.
- The above rests for FIXME.
-
- This class relies on its derived classes for providing input. They must
- overload method `input', which must return one physical line of input for
- each call.
-
- Variables:
-
- 1. tree
- The parse tree. Valid after parse() finishes (see below).
-
- Methods:
-
- 1. parse()
- Parse the input and build parse tree
-
- 2. input()
- Virtual function. Return next line of input or None on EOF.
-
- 3. output()
- Print the tree in internal representation.
- """
- ## Token classes
- # NIL: nothing
- NIL = 0
- # TEXT: text
- TEXT = 1
- # LINK: target, text
- LINK = 2
- # Template: target, text
- TMPL = 3
- # External ref: target, text
- REF = 4
- # Italics: text
- IT = 5
- # Bold: text
- BOLD = 6
- # Header: level, text
- HDR = 7
- # Horizontal bar:
- BAR = 8
- # Environment: type, level
- ENV = 9
- # Item: text
- ITEM = 10
- # Sequence: seq
- SEQ = 11
- # Paragraph
- PARA = 12
-
- # Environment types:
- # Unnumbered list
- ENVUNNUM = 0
- # Numbered list
- ENVNUM = 1
- # Indent
- INDENT = 2
- envtypes = [ "*", "#", ":" ]
+ toklist = None
+ tokind = 0
tree = None
+
+ debug_level = 0
- def itend(self, line, pos):
- while 1:
- d = itend.search(line, pos)
- if not d:
- return -1
- elif d.start(0) == pos or line[d.start(0)-1] != "'":
- return d.start(0)
- else:
- pos = d.start(0) + 1
-
- def linkend(self, paren, line, pos):
- r = ends[paren]
- count = 1
- while count > 0:
- m = r.search(line, pos);
- if not m:
- return len(line), len(line)
- else:
- pos = m.end(0)
- if m.group(0) == paren:
- count += 1
- else:
- count -= 1
- return m.start(0), m.end(0)
-
- la = None
- def putback(self, line):
- self.la = line
-
- def nextkn(self, curlev=0, type = -1):
+ def dprint(self, lev, fmt, *argv):
+ if self.debug_level >= lev:
+ print "[DEBUG]", fmt % argv
+
+ def tokread(self):
+ line = None
+ pos = 0
while 1:
- if self.la:
- line = self.la
- self.putback(None)
- else:
+ if (not line or pos == len(line)):
try:
line = self.input()
+ pos = 0
except StopIteration:
line = u''
+
if not line or line == "":
- self.putback(line)
+ self.dprint(100, "YIELD: NIL")
+ yield(NIL,)
break
if line == '\n':
- yield(self.PARA,)
+ self.dprint(100, "YIELD: NL")
+ yield(NL,line)
+ line = None
continue
+
+ self.dprint(100, "LINE: %s", line[pos:])
+ m = delim.search(line, pos)
- m = eltbeg.match(line)
if m:
- if m.group(0)[0] in self.envtypes:
- btype = self.envtypes.index(m.group(0)[0])
- lev = len(m.group(0))
- if btype == type:
- if lev == curlev:
- yield(self.ITEM,
- (self.SEQ, self.getkn(line[m.end(0):])))
- elif lev > curlev:
- self.putback(line)
- yield(self.ENV, btype, curlev + 1,
- (self.SEQ, self.nextkn(curlev + 1, btype)))
- else:
- self.putback(line)
+ if (pos < m.start(0)):
+ self.dprint(100, "YIELD: TEXT %s", line[pos:m.start(0)])
+ yield(TEXT, line[pos:m.start(0)])
+ pos = m.end(0)
+ if m.group(0)[0] in envtypes and line[pos] == ":":
+ self.dprint(100, "YIELD: DELIM %s, True", m.group(0))
+ yield(DELIM, m.group(0), True)
+ pos += 1
+ else:
+ self.dprint(100, "YIELD: DELIM %s", m.group(0))
+ yield(DELIM, m.group(0))
+ else:
+ if line[-1] == '\n':
+ self.dprint(100, "YIELD: TEXT %s", line[pos:-1])
+ if line[pos:-1] != '':
+ yield(TEXT, line[pos:-1])
+ self.dprint(100, "YIELD: NL")
+ yield(NL,'\n')
+ else:
+ self.dprint(100, "YIELD: TEXT %s", line[pos:])
+ yield(TEXT, line[pos:])
+ line = None
+
+ def input(self):
+ return None
+
+ def tokenize(self):
+ self.toklist = []
+ for tok in self.tokread():
+ self.toklist.append(tok)
+
+ def peektkn(self):
+ return self.toklist[self.tokind]
+
+ def setkn(self,val):
+ self.toklist[self.tokind] = val
+
+ def getkn(self):
+ tok = self.toklist[self.tokind]
+ if tok[0] != NIL:
+ self.tokind = self.tokind + 1
+ return tok
+
+ def ungetkn(self):
+ self.tokind = self.tokind - 1
+ return self.toklist[self.tokind]
+
+ def parse_bold(self, nested = False):
+ self.dprint(80, "ENTER parse_bold(%s), tok %s", nested, self.peektkn())
+ seq = []
+ textlist = []
+ while 1:
+ tok = self.getkn()
+ if tok[0] == TEXT:
+ textlist.append(tok[1])
+ elif tok[0] == DELIM:
+ if tok[1] == "'''":
+ break
+ elif tok[1] == "''" and not nested:
+ if textlist:
+ seq.append((TEXT, textlist))
+ textlist = []
+ x = self.parse_it(True)
+ if not x:
+ self.dprint(80, "LEAVE parse_bold=None")
+ return None
+ seq.append(x)
+ else:
+ self.dprint(80, "LEAVE parse_bold=None")
+ return None
+ elif tok[0] == NL:
+ if self.peektkn()[0] == NL:
+ self.dprint(80, "LEAVE parse_bold=None")
+ return None
+ else:
+ self.dprint(80, "LEAVE parse_bold=None")
+ return None
+ if textlist:
+ seq.append((TEXT, textlist))
+ self.dprint(80, "LEAVE parse_bold=(BOLD, %s", seq)
+ return (BOLD, seq)
+
+ def parse_it(self, nested = False):
+ self.dprint(80, "ENTER parse_it(%s), tok %s", nested, self.peektkn())
+ seq = []
+ textlist = []
+ while 1:
+ tok = self.getkn()
+ if tok[0] == TEXT:
+ textlist.append(tok[1])
+ elif tok[0] == DELIM:
+ if tok[1] == "''":
+ break
+ elif tok[1] == "'''":
+ if nested:
+ # The tokenizer always puts longest match before the
+ # shortest one, so "'''" goes before "''". Swap
+ # them if the need is:
+ ntok = self.peektkn()
+ if ntok[0] == DELIM and ntok[1] == "''":
+ self.setkn((DELIM, "'''"))
break
+ else:
+ self.dprint(80, "LEAVE parse_it=%s", "None")
+ return None
else:
- self.putback(line)
- yield(self.ENV, btype, 1, self.nextkn(1, btype))
-
+ if textlist:
+ seq.append((TEXT, textlist))
+ textlist = []
+ x = self.parse_bold(True)
+ if not x:
+ self.dprint(80, "LEAVE parse_it=%s", "None")
+ return None
+ seq.append(x)
else:
- if curlev > 0:
- self.putback(line)
- break
- elif m.group(0)[0:2] == "==" \
- and line.rstrip('\n').endswith(m.group(0)):
- yield(self.HDR, len(m.group(0))-1,
- self.getkn(line[m.end(0):-(1+len(m.group(0)))]))
- elif m.group(0) == "----":
- yield(self.BAR,)
+ self.dprint(80, "LEAVE parse_it=%s", "None")
+ return None
+ elif tok[0] == NL:
+ if self.peektkn()[0] == NL:
+ self.dprint(80, "LEAVE parse_it=%s", "None")
+ return None
else:
- if curlev > 0:
- self.putback(line)
+ self.dprint(80, "LEAVE parse_it=%s", "None")
+ return None
+ if textlist:
+ seq.append((TEXT, textlist))
+ self.dprint(80, "LEAVE parse_it=(IT,%s)", seq)
+ return (IT, seq)
+
+ def parse_link(self, type, delim):
+ self.dprint(80, "ENTER parse_link(%s,%s), tok %s",
+ type, delim, self.peektkn())
+ subtree = []
+ list = []
+ while 1:
+ tok = self.getkn()
+ if tok[0] == DELIM:
+ if tok[1] == delim:
+ if list:
+ subtree.append((SEQ,list))
break
- yield(self.getkn(line))
+ elif tok[1] == "|":
+ if len(list) > 1:
+ subtree.append((SEQ,list))
+ else:
+ subtree.append(list[0])
+ list = []
+ else:
+ x = self.parse_inline(tok)
+ if x:
+ list.append(x)
+ else:
+ self.dprint(80, "LEAVE parse_link=%s", "None")
+ return None
+ elif tok[0] == TEXT:
+ list.append(tok)
+ else:
+ self.dprint(80, "LEAVE parse_link=%s", "None")
+ return None
+ self.dprint(80, "LEAVE parse_link=(%s,%s)", type, subtree)
+ return (type, subtree)
- def getkn(self, line):
- pos = 0
+ def parse_ref(self):
+ self.dprint(80, "ENTER parse_ref, tok %s", self.peektkn())
+ list = []
while 1:
- if pos == len(line):
- break;
- m = eltre.search(line, pos)
- if not m:
- yield(self.TEXT, line[pos:])
- pos = len(line)
+ tok = self.getkn()
+ if tok[0] == DELIM:
+ if tok[1] == "]":
+ break
+ else:
+ x = self.parse_inline(tok)
+ if x:
+ list.append(x)
+ else:
+ self.dprint(80, "LEAVE parse_ref=%s", "None")
+ return None
+ elif tok[0] == TEXT:
+ list.append(tok)
+ elif tok[0] == NL:
+ continue
else:
- yield(self.TEXT, line[pos:m.start(0)])
- pos = m.end(0)
- if m.group(0) == "[[" or m.group(0) == "{{":
- d = delims[m.group(0)].search(line, pos)
- if d.group(0) == "|":
- target = (self.TEXT, line[pos:d.start(0)])
- (start,pos) = self.linkend(m.group(0), line, m.end(0))
- text = (self.SEQ, self.getkn(line[d.end(0):start]))
- elif d.group(0) == term[m.group(0)]:
- target = (self.TEXT, line[pos:d.start(0)])
- text = (self.NIL,)
- pos = d.end(0)
- if m.group(0) == "[[":
- yield(self.LINK, target, text)
+ self.dprint(80, "LEAVE parse_ref=%s", "None")
+ return None
+ if len(list) == 0 or list[0][0] != TEXT:
+ self.dprint(80, "LEAVE parse_ref=%s", "None")
+ return None
+ (ref,sep,text) = list[0][1].partition(' ')
+ ret = (REF, ref, (SEQ, [(TEXT, text)] + list[1:]))
+ self.dprint(80, "LEAVE parse_ref= %s", ret)
+ return ret
+
+ inline_delims = [ "''", "'''", "[", "[[", "{{" ]
+ def parse_inline(self, tok):
+ self.dprint(80, "ENTER parse_inline(%s), tok %s", tok, self.peektkn())
+ tokind = self.tokind
+ if tok[1] == "''":
+ x = self.parse_it()
+ elif tok[1] == "'''":
+ x = self.parse_bold()
+ elif tok[1] == "[":
+ x = self.parse_ref()
+ elif tok[1] == "[[":
+ x = self.parse_link(LINK, "]]")
+ elif tok[1] == "{{":
+ x = self.parse_link(TMPL, "}}")
+ else: # FIXME
+ self.dprint(80, "LEAVE parse_inline=%s", "None")
+ x = None
+ if not x:
+ self.tokind = tokind
+ self.dprint(80, "LEAVE parse_inline=%s", x)
+ return x
+
+ def parse_para(self):
+ self.dprint(80, "ENTER parse_para, tok %s", self.peektkn())
+ seq = []
+ textlist = []
+ while 1:
+ tok = self.getkn()
+ if tok[0] == TEXT:
+ textlist.append(tok[1])
+ elif tok[0] == NL:
+ tok = self.getkn()
+ if tok[0] == NL or tok[0] == NIL:
+ break
+ else:
+ self.ungetkn()
+ elif tok[0] == NIL:
+ break
+ elif tok[0] == DELIM:
+ if tok[1] in self.inline_delims:
+ if textlist:
+ seq.append((TEXT, textlist))
+ textlist = []
+ x = self.parse_inline(tok)
+ if x:
+ seq.append(x)
else:
- yield(self.TMPL, target, text)
- elif m.group(0) == "[":
- i = line.find("]", m.end(0))
- if i == -1:
- i = len(line)
- (target,sep,text) = line[m.end(0):i].partition(' ')
- yield(self.REF,
- (self.TEXT, target),
- (self.SEQ, self.getkn(text)))
- pos = i + 1
- elif m.group(0) == "'''":
- e = boend.search(line, m.end(0))
- if e:
- i = e.start(0)
- pos = i + 3
+ seq.append(tok)
+ break
+ else:
+ self.ungetkn()
+ break
+ if textlist:
+ seq.append((TEXT, textlist))
+ self.dprint(80, "LEAVE parse_para=%s", seq)
+ return (PARA, seq)
+
+ def parse_header(self, delim):
+ self.dprint(80, "ENTER parse_header(%s), tok %s", delim, self.peektkn())
+ list = []
+ while 1:
+ tok = self.getkn()
+ if tok[0] == NIL:
+ self.dprint(80, "LEAVE parse_header=%s", "None")
+ return None
+ elif tok[0] == TEXT:
+ list.append(tok)
+ elif tok[0] == DELIM:
+ if tok[1] == delim:
+ if self.peektkn()[0] == NL:
+ break
else:
- pos = len(line)
- i = pos
- yield(self.BOLD,
- (self.SEQ, self.getkn(line[m.end(0):i])))
- elif m.group(0) == "''":
- i = self.itend(line, m.end(0))
- if i == -1:
- pos = len(line)
- i = pos
+ self.dprint(80, "LEAVE parse_header=%s", "None")
+ return None
+ else:
+ x = self.parse_inline(tok)
+ if x:
+ list.append(x)
else:
- pos = i + 2
- yield(self.IT,
- (self.SEQ, self.getkn(line[m.end(0):i])))
+ self.dprint(80, "LEAVE parse_header=%s", "None")
+ return None #FIXME?
+ else:
+ self.dprint(80, "LEAVE parse_header=%s", "None")
+ return None
+ self.dprint(80, "LEAVE parse_header=(HDR, %s, (SEQ,%s))",len(delim)-1,list)
+ return (HDR,len(delim)-1,(SEQ,list))
- def input(self):
- return None
- def expandtok(self, tok):
- if type(tok) == GeneratorType:
- subtree = [self.SEQ]
- for t in tok:
- x = self.expandtok(t)
+ def parse_line(self):
+ self.dprint(80, "ENTER parse_line, tok %s", self.peektkn())
+ list = []
+ while 1:
+ tok = self.getkn()
+ if tok[0] == NL or tok[0] == NIL:
+ break
+ elif tok[0] == TEXT:
+ list.append(tok)
+ elif tok[0] == DELIM and tok[1][0] == ":":
+ list.append(self.parse_indent(len(tok[1])))
+ break
+ else:
+ x = self.parse_inline(tok)
if x:
- subtree.append(x)
- return tuple(subtree) if len(subtree) > 2 else \
- subtree[1] if len(subtree) == 2 else None
+ list.append(x)
+ else:
+ list.append(tok)
+ self.dprint(80, "LEAVE parse_line=(SEQ, %s)", list)
+ return (SEQ, list)
+
+ def parse_env(self, type, lev):
+ self.dprint(80, "ENTER parse_env(%s,%s), tok %s",type,lev,self.peektkn())
+ list = []
+ while 1:
+ tok = self.getkn()
+ if tok[0] == DELIM and tok[1][0] in envtypes and type == envtypes.index(tok[1][0]):
+ if len(tok[1]) < lev:
+ self.ungetkn()
+ break
+ elif len(tok[1]) > lev:
+ self.ungetkn()
+ elt = self.parse_env(type, len(tok[1]))
+ else:
+ elt = self.parse_line()
+ if len(tok) == 3:
+ if list[-1][0] != SEQ:
+ x = list[-1]
+ list[-1] = (SEQ, [x])
+ list[-1][1].append(elt)
+ continue
+ list.append(elt)
+ else:
+ self.ungetkn()
+ break
+ self.dprint(80, "LEAVE parse_env=(ENV, %s, %s, %s)", type, lev, list)
+ return (ENV, type, lev, list)
+
+ def parse_indent(self, lev):
+ self.dprint(80, "ENTER parse_indent(%s), tok %s", lev, self.peektkn())
+ x = (IND, lev, self.parse_line())
+ self.dprint(80, "LEAVE parse_indent=%s", x)
+ return x
+
+ def parse0(self):
+ tok = self.getkn()
toktype = tok[0]
- if toktype == self.NIL:
+ if toktype == NIL:
return None
- if toktype == self.TEXT:
- return tok if tok[1] != '' else None
- elif toktype == self.LINK or toktype == self.TMPL \
- or toktype == self.REF:
- return toktype, self.expandtok(tok[1]), self.expandtok(tok[2])
- elif toktype == self.IT or toktype == self.BOLD \
- or toktype == self.ITEM:
- return toktype, self.expandtok(tok[1])
- elif toktype == self.HDR:
- return toktype, tok[1], self.expandtok(tok[2])
- elif toktype == self.ENV:
- return toktype,tok[1],tok[2],self.expandtok(tok[3])
- elif toktype == self.SEQ:
- if len(tok) == 2:
- return self.expandtok(tok[1])
- elif len(tok) == 1:
- return None
+ elif toktype == TEXT:
+ self.ungetkn()
+ return self.parse_para()
+ elif toktype == DELIM:
+ if tok[1] == "----":
+ return (BAR,)
+ elif tok[1][0:2] == "==":
+ return self.parse_header(tok[1])
+ elif tok[1][0] in envtypes:
+ type = envtypes.index(tok[1][0])
+ lev = len(tok[1])
+ self.ungetkn()
+ return self.parse_env(type, lev)
+ elif tok[1][0] == ":":
+ return self.parse_indent(len(tok[1]))
else:
- subtree = [self.SEQ]
- for t in tok[1:]:
- x = self.expandtok(t)
- if x:
- subtree.append(x)
- return tuple(subtree) if len(subtree) > 2 else \
- subtree[1] if len(subtree) == 2 else None
- else:
- return tok
-
- def parse(self):
- tree = [self.SEQ]
- for tok in self.nextkn():
- tree.append(self.expandtok(tok))
- self.tree = tuple(tree)
-
- def prtok(self, tok, indent):
- if not tok:
- print " " * indent, "None"
- return
- toktype = tok[0]
- if toktype == self.SEQ:
- for t in tok[1:]:
- self.prtok(t, indent)
- else:
- print " " * indent,
- if toktype == self.NIL:
- print "NIL"
- if toktype == self.TEXT:
- print "TEXT \"%s\"" % (tok[1].encode('string_escape'))
- elif toktype == self.LINK:
- print "LINK "
- self.prtok(tok[1], indent+1) # target
- self.prtok(tok[2], indent+1) # text
- elif toktype == self.TMPL:
- print "TMPL"
- self.prtok(tok[1], indent+1) # target
- self.prtok(tok[2], indent+1) # text
- elif toktype == self.REF:
- print "REF"
- self.prtok(tok[1], indent+1) # target
- self.prtok(tok[2], indent+1) # text
- elif toktype == self.IT:
- print "IT"
- self.prtok(tok[1], indent+1)
- elif toktype == self.BOLD:
- print "BOLD"
- self.prtok(tok[1], indent+1)
- elif toktype == self.HDR:
- print "HDR", tok[1]
- self.prtok(tok[2], indent+1)
- elif toktype == self.BAR:
- print "BAR"
- elif toktype == self.ENV:
- print "ENV ",self.envtypes[tok[1]],tok[2]
- self.prtok(tok[3], indent+1)
- elif toktype == self.ITEM:
- print "ITEM"
- self.prtok(tok[1], indent+1)
- elif toktype == self.PARA:
- print "PARA"
+ self.ungetkn()
+ return self.parse_para()
+ elif toktype == NL:
+ return self.parse0()
- def output(self):
- self.prtok(self.tree, 0)
+ def parse(self):
+ if not self.toklist:
+ self.tokenize()
+ self.dprint(90, "TOKLIST: %s", self.toklist)
+ self.tokind = 0
+ self.tree = []
+ while 1:
+ subtree = self.parse0()
+ if subtree == None:
+ break
+ self.tree.append(subtree)
+ self.dprint(70, "TREE: %s", self.tree)
+
+ def __str__(self):
+ return str(self.tree)
class WikiMarkup (BaseWikiMarkup):
@@ -698,52 +822,11 @@ class WikiMarkup (BaseWikiMarkup):
"zu": "isiZulu" # Zulu
}
- def str_nil(self, tok, env):
- return None
-
- def str_text(self, tok, env):
- return tok[1]
-
- def str_seq(self, tok, env):
- str = ""
- for t in tok[1:]:
- s = self.fmtok(t, env)
- if s:
- str += s
- return str
+
- def fmtok(self, tok, env):
- if type(tok) != TupleType:
- return ""
- toktype = tok[0]
- if toktype == self.NIL:
- return self.str_nil(tok, env)
- if toktype == self.TEXT:
- return self.str_text(tok, env)
- elif toktype == self.LINK:
- return self.str_link(tok, env)
- elif toktype == self.TMPL:
- return self.str_tmpl(tok, env)
- elif toktype == self.REF:
- return self.str_ref(tok, env)
- elif toktype == self.IT:
- return self.str_it(tok, env)
- elif toktype == self.BOLD:
- return self.str_bold(tok, env)
- elif toktype == self.HDR:
- return self.str_hdr(tok, env)
- elif toktype == self.BAR:
- return self.str_bar(tok, env)
- elif toktype == self.ENV:
- return self.str_env(tok, env)
- elif toktype == self.ITEM:
- return self.str_item(tok, env)
- elif toktype == self.SEQ:
- return self.str_seq(tok, env)
- elif toktype == self.PARA:
- return self.str_para(tok, env)
+
+
+
- def __str__(self):
- return self.fmtok(self.tree, None)

Return to:

Send suggestions and report system problems to the System administrator.