From ae8b8bc81eab08b2ebe9f8c0957c085b5d45fc2b Mon Sep 17 00:00:00 2001 From: Sergey Poznyakoff Date: Mon, 2 Mar 2009 20:58:09 +0200 Subject: Rewrite from scratch. Text conversion almost(TM) works --- testdata/bold.wiki | 1 + testdata/boldit0.wiki | 1 + testdata/boldit1.wiki | 1 + testdata/boldit2.wiki | 1 + testdata/boldit3.wiki | 1 + testdata/header0.wiki | 3 + testdata/it.wiki | 1 + testdata/link.wiki | 1 + testdata/para.wiki | 5 + testdata/reclink.wiki | 1 + wiki2text.py | 177 +++++++----- wikicvt.py | 24 +- wikimarkup.py | 747 ++++++++++++++++++++++++++++---------------------- 13 files changed, 561 insertions(+), 403 deletions(-) create mode 100644 testdata/bold.wiki create mode 100644 testdata/boldit0.wiki create mode 100644 testdata/boldit1.wiki create mode 100644 testdata/boldit2.wiki create mode 100644 testdata/boldit3.wiki create mode 100644 testdata/header0.wiki create mode 100644 testdata/it.wiki create mode 100644 testdata/link.wiki create mode 100644 testdata/para.wiki create mode 100644 testdata/reclink.wiki diff --git a/testdata/bold.wiki b/testdata/bold.wiki new file mode 100644 index 0000000..90317f2 --- /dev/null +++ b/testdata/bold.wiki @@ -0,0 +1 @@ +now is the time for '''all good''' men to come to diff --git a/testdata/boldit0.wiki b/testdata/boldit0.wiki new file mode 100644 index 0000000..e1317e9 --- /dev/null +++ b/testdata/boldit0.wiki @@ -0,0 +1 @@ +now is the time for '''''all good''''' men to come to diff --git a/testdata/boldit1.wiki b/testdata/boldit1.wiki new file mode 100644 index 0000000..6ac9262 --- /dev/null +++ b/testdata/boldit1.wiki @@ -0,0 +1 @@ +now is the time for ''all '''good''''' men to come to diff --git a/testdata/boldit2.wiki b/testdata/boldit2.wiki new file mode 100644 index 0000000..0cca5c3 --- /dev/null +++ b/testdata/boldit2.wiki @@ -0,0 +1 @@ +now is the time for '''all ''good''''' men to come to diff --git a/testdata/boldit3.wiki b/testdata/boldit3.wiki new file mode 100644 index 0000000..49d8a7e --- /dev/null +++ b/testdata/boldit3.wiki @@ -0,0 +1 @@ +now is the time for ''all '''good''' men'' to come to diff --git a/testdata/header0.wiki b/testdata/header0.wiki new file mode 100644 index 0000000..e9bea57 --- /dev/null +++ b/testdata/header0.wiki @@ -0,0 +1,3 @@ +== Header == + +Paragraph. diff --git a/testdata/it.wiki b/testdata/it.wiki new file mode 100644 index 0000000..8e9e4f2 --- /dev/null +++ b/testdata/it.wiki @@ -0,0 +1 @@ +now is the time for ''all good'' men to come to diff --git a/testdata/link.wiki b/testdata/link.wiki new file mode 100644 index 0000000..3168c45 --- /dev/null +++ b/testdata/link.wiki @@ -0,0 +1 @@ +[[link|foo|bar|baz|text]] is a simple link. diff --git a/testdata/para.wiki b/testdata/para.wiki new file mode 100644 index 0000000..04395d4 --- /dev/null +++ b/testdata/para.wiki @@ -0,0 +1,5 @@ +First paragraph consists of two sentences. +Each sentence occupies a line. + +Second paragraph consists of two sentences as well. +Each of them, again, occupies its own line. diff --git a/testdata/reclink.wiki b/testdata/reclink.wiki new file mode 100644 index 0000000..a03db16 --- /dev/null +++ b/testdata/reclink.wiki @@ -0,0 +1 @@ +[[link|foo|bar|baz|text [[inny link|znów text]] słowo [[jeszcze link]]]] diff --git a/wiki2text.py b/wiki2text.py index f28c343..c41c4e0 100644 --- a/wiki2text.py +++ b/wiki2text.py @@ -26,7 +26,7 @@ class TextWikiMarkup (WikiMarkup): """ # Output width - width = 80 + width = 78 # Do not show references. references = False # Provide a minimum markup @@ -57,22 +57,26 @@ class TextWikiMarkup (WikiMarkup): for elt in wiki_ns_re[self.lang][str]: if str.beginswith(elt[0]) and str.endswith(elt[1]): return elt[2] - return None + return None + def mktgt(self, tgt, lang = None): if not lang: lang = self.lang return self.html_base % { 'lang' : lang } + urllib.quote(tgt) - def link(self, tok, env, istmpl): - arg = self.fmtok(tok[1], env) - text = self.fmtok(tok[2], env) + def fmtlink(self, elt, istmpl): + arg = self.format(elt[1][0]) + if len(elt[1]) > 1: + text = self.format(elt[1][1]) + else: + text = None (qual,sep,tgt) = arg.partition(':') if tgt != '': ns = self.wiki_ns_name(qual) if ns: if ns == 'NS_IMAGE': if not self.references: - return None + return "" text = "[%s: %s]" % (qual, text if text else arg) tgt = self.image_base + '/' + \ urllib.quote(tgt) + \ @@ -94,41 +98,9 @@ class TextWikiMarkup (WikiMarkup): return arg else: return text - - def str_link(self, tok, env): - return self.link(tok, env, False) - - def str_tmpl(self, tok, env): - return self.link(tok, env, True) - - def str_ref(self, tok, env): - return self.xref(self.fmtok(tok[2], env), self.fmtok(tok[1], env)) - - def str_it(self, tok, env): - if self.markup: - return "_" + self.fmtok(tok[1], env) + "_" - return self.fmtok(tok[1], env); - - def str_bold(self, tok, env): - if self.markup: - return self.fmtok(tok[1], env).upper() - return self.fmtok(tok[1], env); - - def str_hdr(self, tok, env): - level = tok[1] - return "\n\n" + ("*" * level) + " " + self.fmtok(tok[2], env) + "\n\n" - - def str_bar(self, tok, env): - w = self.width - if w < 5: - w = 5 - return "\n" + ("-" * (w - 5)).center(w - 1) + "\n" - - def str_env(self, tok, env): - self.num = 1 - return "\n" + self.fmtok(tok[3], tok) def indent (self, lev, text): + print "T \"",text,"\"" w = self.width self.width = w - lev if text.find('\n') == -1: @@ -136,34 +108,117 @@ class TextWikiMarkup (WikiMarkup): else: s = "" for elt in text.split('\n'): - s += (" " * lev) + elt - if elt == '': - s += "\n" + s += (" " * lev) + elt + '\n' self.width = w return s + + def fmtpara(self, input): + output = "" + linebuf = "" + length = 0 + for s in input.split(): + wlen = len(s) + if linebuf.endswith("."): + wsc = 2 + else: + wsc = 1 + if length + wsc + wlen > self.width: + # FIXME: fill out linebuf + output += linebuf + '\n' + wsc = 0 + length = 0 + linebuf = "" + linebuf += " " * wsc + s + length += wsc + wlen + return output + linebuf - def str_item(self, tok, env): - t = env[1] - lev = env[2] - if lev > self.width - 4: - lev = 1 - if t == self.INDENT: - return self.indent(lev, self.fmtok(tok[1], env)) - elif t == self.ENVNUM: - n = self.num - self.num += 1 - return "" + self.indent(lev, - "%d. %s" % (n, self.fmtok(tok[1], env))) - elif t == self.ENVUNNUM: - return "" + self.indent(lev, - "- " + self.fmtok(tok[1], env)) + def fmtelt(self, elt, indent=0): + if elt[0] == TEXT: + if isinstance(elt[1],list): + string = "" + for s in elt[1]: + if string: + if string.endswith("."): + string += " " + else: + string += " " + string += s.rstrip(" ") + else: + string = elt[1] + elif elt[0] == PARA: + string = ""; + for x in elt[1]: + string += self.format(x) + string = self.fmtpara(string) + '\n\n' + elif elt[0] == IT: + string = "" + for x in elt[1]: + s = self.format(x) + if s: + string += " " + s.rstrip(" ") + string = "_" + string.lstrip(" ") + "_" + elif elt[0] == BOLD: + string = "" + for x in elt[1]: + s = self.format(x) + if s: + if string.endswith("."): + string += " " + else: + string += " " + string += s.rstrip(" ") + string = string.upper() + elif elt[0] == LINK: + string = self.fmtlink(elt, False) + elif elt[0] == TMPL: + string = '\n' + self.fmtlink(elt, True) + '\n' + elif elt[0] == BAR: + w = self.width + if w < 5: + w = 5 + string = "\n" + ("-" * (w - 5)).center(w - 1) + "\n" + elif elt[0] == HDR: + level = elt[1] + string = "\n" + ("*" * level) + " " + \ + self.format(elt[2]).lstrip(" ") + "\n\n" + elif elt[0] == REF: + string = self.xref(self.format(elt[2]), elt[1]) + elif elt[0] == ENV: + type = elt[1] + lev = elt[2] + if lev > self.width - 4: + lev = 1 + string = "\n" + n = 1 + for s in elt[3]: + x = self.format(s) +# print "X",x + if type == ENVUNNUM: + string += self.indent(lev, "*" + x.lstrip(" ")) + '\n' + elif type == ENVNUM: + string += self.indent(lev, "%d. %s" % (n, x)) + '\n' + n += 1 + elif elt[0] == IND: + string = (" " * elt[1]) + self.format(elt[2]) + '\n' + else: + string = str(elt) + return string + + def format(self, elt, indent=0): + string = "" + if elt[0] == SEQ: + for x in elt[1]: + string += " " + self.format(x, indent) + else: + string += " " + self.fmtelt(elt, indent) + return string - def str_para(self, tok, env): - return "\n" - def __str__(self): - return self.fmtok(self.tree, None) + str = "" + for elt in self.tree: + str += self.format(elt) + return str class TextWiktionaryMarkup (TextWikiMarkup): """ diff --git a/wikicvt.py b/wikicvt.py index 758bcb1..a2e95e4 100644 --- a/wikicvt.py +++ b/wikicvt.py @@ -32,9 +32,11 @@ def main(): html = 1 lang = "pl" kwdict = {} + debug = 0 + try: - opts, args = getopt.getopt(sys.argv[1:], "hl:o:tv", - ["help", "lang=", "option=", + opts, args = getopt.getopt(sys.argv[1:], "d:hl:o:tv", + ["debug=", "help", "lang=", "option=", "text", "input-text", "verbose" ]) except getopt.GetoptError: usage(1) @@ -42,18 +44,20 @@ def main(): for o, a in opts: if o in ("-h", "--help"): usage() - if o in ("-v", "--verbose"): + elif o in ("-v", "--verbose"): verbose_flag = verbose_flag + 1 - if o in ("-t", "--text"): + elif o in ("-t", "--text"): html = 0 - if o in ("-l", "--lang"): + elif o in ("-l", "--lang"): lang = a - if o in ("-o", "--option"): + elif o in ("-o", "--option"): (kw,sep,val) = a.partition('=') if val != '': kwdict[kw] = eval(val) - if o == "--input-text": + elif o == "--input-text": input_text = True + elif o in ("-d", "--debug"): + debug = eval(a) if len(args) == 1: if args[0] == '-': @@ -68,11 +72,11 @@ def main(): markup = HtmlWiktionaryMarkup(**kwdict) else: markup = TextWiktionaryMarkup(**kwdict) - + markup.debug_level = debug markup.parse() print str(markup) - if verbose_flag > 0: - markup.output() +# if verbose_flag > 0: +# markup.output() if __name__ == '__main__': main() diff --git a/wikimarkup.py b/wikimarkup.py index 4fd4e44..9cfdb09 100644 --- a/wikimarkup.py +++ b/wikimarkup.py @@ -1,6 +1,6 @@ #!/usr/bin/python # -*- coding: utf-8 -*- -# Copyright (C) 2008 Sergey Poznyakoff +# Copyright (C) 2008, 2009 Sergey Poznyakoff # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -19,327 +19,451 @@ import sys import re from types import * -__all__ = [ "BaseWikiMarkup", "WikiMarkup" ] - -eltbeg = re.compile("=+|(^----$)|^[\\*#:]+") -eltre = re.compile("(\\[\\[)|(\\{\\{)|\\[|(\\'\\'\\'?)") -delims = { "[[" : re.compile("\\||(\\]\\])"), - "{{" : re.compile("\\||(\\}\\})") } -term = { "[[" : "]]" , "{{" : "}}" } -ends = { "[[" : re.compile("(\\[\\[)|(\\]\\])"), - "{{" : re.compile("(\\{\\{)|(\\}\\})") } -itend = re.compile("\\'\\'($|[^\\'])") -boend = re.compile("\\'\\'\\'($|[^\\'])") +__all__ = [ "BaseWikiMarkup", "WikiMarkup", + "NIL", "TEXT", "DELIM", "NL", "PARA", + "IT", "BOLD", "LINK", "TMPL", + "BAR", "HDR", "REF", "ENV", "IND", "SEQ", + "ENVUNNUM", "ENVNUM", "envtypes" ] + +delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^:+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)") + +NIL = 0 +TEXT = 1 +DELIM = 2 +NL = 3 + +PARA = 4 +IT = 5 +BOLD = 6 +LINK = 7 +TMPL = 8 +BAR = 9 +HDR = 10 +REF = 11 +ENV = 12 +IND = 13 +SEQ = 14 + +# Environment types: +# Unnumbered list +ENVUNNUM = 0 +# Numbered list +ENVNUM = 1 +envtypes = [ "*", "#" ] class BaseWikiMarkup: - """ -A base class for handling Wiki markups. -It handles: - 1. paragraphs; - 2. basic block markup (headers, numbered and unnumbered lists, - indentations); - 3. basic inline markup (bold, italic); - 4. basic reference markup (links, templates, external links). - It does NOT handle: - 1. pseudo-html markup (, and similar); - 2. leading spaces meaning ``preserve formatting''; - 3. tables and math. - The above rests for FIXME. - - This class relies on its derived classes for providing input. They must - overload method `input', which must return one physical line of input for - each call. - - Variables: - - 1. tree - The parse tree. Valid after parse() finishes (see below). - - Methods: - - 1. parse() - Parse the input and build parse tree - - 2. input() - Virtual function. Return next line of input or None on EOF. - - 3. output() - Print the tree in internal representation. - """ - ## Token classes - # NIL: nothing - NIL = 0 - # TEXT: text - TEXT = 1 - # LINK: target, text - LINK = 2 - # Template: target, text - TMPL = 3 - # External ref: target, text - REF = 4 - # Italics: text - IT = 5 - # Bold: text - BOLD = 6 - # Header: level, text - HDR = 7 - # Horizontal bar: - BAR = 8 - # Environment: type, level - ENV = 9 - # Item: text - ITEM = 10 - # Sequence: seq - SEQ = 11 - # Paragraph - PARA = 12 - - # Environment types: - # Unnumbered list - ENVUNNUM = 0 - # Numbered list - ENVNUM = 1 - # Indent - INDENT = 2 - envtypes = [ "*", "#", ":" ] + toklist = None + tokind = 0 tree = None + + debug_level = 0 - def itend(self, line, pos): - while 1: - d = itend.search(line, pos) - if not d: - return -1 - elif d.start(0) == pos or line[d.start(0)-1] != "'": - return d.start(0) - else: - pos = d.start(0) + 1 - - def linkend(self, paren, line, pos): - r = ends[paren] - count = 1 - while count > 0: - m = r.search(line, pos); - if not m: - return len(line), len(line) - else: - pos = m.end(0) - if m.group(0) == paren: - count += 1 - else: - count -= 1 - return m.start(0), m.end(0) - - la = None - def putback(self, line): - self.la = line - - def nextkn(self, curlev=0, type = -1): + def dprint(self, lev, fmt, *argv): + if self.debug_level >= lev: + print "[DEBUG]", fmt % argv + + def tokread(self): + line = None + pos = 0 while 1: - if self.la: - line = self.la - self.putback(None) - else: + if (not line or pos == len(line)): try: line = self.input() + pos = 0 except StopIteration: line = u'' + if not line or line == "": - self.putback(line) + self.dprint(100, "YIELD: NIL") + yield(NIL,) break if line == '\n': - yield(self.PARA,) + self.dprint(100, "YIELD: NL") + yield(NL,line) + line = None continue + + self.dprint(100, "LINE: %s", line[pos:]) + m = delim.search(line, pos) - m = eltbeg.match(line) if m: - if m.group(0)[0] in self.envtypes: - btype = self.envtypes.index(m.group(0)[0]) - lev = len(m.group(0)) - if btype == type: - if lev == curlev: - yield(self.ITEM, - (self.SEQ, self.getkn(line[m.end(0):]))) - elif lev > curlev: - self.putback(line) - yield(self.ENV, btype, curlev + 1, - (self.SEQ, self.nextkn(curlev + 1, btype))) - else: - self.putback(line) + if (pos < m.start(0)): + self.dprint(100, "YIELD: TEXT %s", line[pos:m.start(0)]) + yield(TEXT, line[pos:m.start(0)]) + pos = m.end(0) + if m.group(0)[0] in envtypes and line[pos] == ":": + self.dprint(100, "YIELD: DELIM %s, True", m.group(0)) + yield(DELIM, m.group(0), True) + pos += 1 + else: + self.dprint(100, "YIELD: DELIM %s", m.group(0)) + yield(DELIM, m.group(0)) + else: + if line[-1] == '\n': + self.dprint(100, "YIELD: TEXT %s", line[pos:-1]) + if line[pos:-1] != '': + yield(TEXT, line[pos:-1]) + self.dprint(100, "YIELD: NL") + yield(NL,'\n') + else: + self.dprint(100, "YIELD: TEXT %s", line[pos:]) + yield(TEXT, line[pos:]) + line = None + + def input(self): + return None + + def tokenize(self): + self.toklist = [] + for tok in self.tokread(): + self.toklist.append(tok) + + def peektkn(self): + return self.toklist[self.tokind] + + def setkn(self,val): + self.toklist[self.tokind] = val + + def getkn(self): + tok = self.toklist[self.tokind] + if tok[0] != NIL: + self.tokind = self.tokind + 1 + return tok + + def ungetkn(self): + self.tokind = self.tokind - 1 + return self.toklist[self.tokind] + + def parse_bold(self, nested = False): + self.dprint(80, "ENTER parse_bold(%s), tok %s", nested, self.peektkn()) + seq = [] + textlist = [] + while 1: + tok = self.getkn() + if tok[0] == TEXT: + textlist.append(tok[1]) + elif tok[0] == DELIM: + if tok[1] == "'''": + break + elif tok[1] == "''" and not nested: + if textlist: + seq.append((TEXT, textlist)) + textlist = [] + x = self.parse_it(True) + if not x: + self.dprint(80, "LEAVE parse_bold=None") + return None + seq.append(x) + else: + self.dprint(80, "LEAVE parse_bold=None") + return None + elif tok[0] == NL: + if self.peektkn()[0] == NL: + self.dprint(80, "LEAVE parse_bold=None") + return None + else: + self.dprint(80, "LEAVE parse_bold=None") + return None + if textlist: + seq.append((TEXT, textlist)) + self.dprint(80, "LEAVE parse_bold=(BOLD, %s", seq) + return (BOLD, seq) + + def parse_it(self, nested = False): + self.dprint(80, "ENTER parse_it(%s), tok %s", nested, self.peektkn()) + seq = [] + textlist = [] + while 1: + tok = self.getkn() + if tok[0] == TEXT: + textlist.append(tok[1]) + elif tok[0] == DELIM: + if tok[1] == "''": + break + elif tok[1] == "'''": + if nested: + # The tokenizer always puts longest match before the + # shortest one, so "'''" goes before "''". Swap + # them if the need is: + ntok = self.peektkn() + if ntok[0] == DELIM and ntok[1] == "''": + self.setkn((DELIM, "'''")) break + else: + self.dprint(80, "LEAVE parse_it=%s", "None") + return None else: - self.putback(line) - yield(self.ENV, btype, 1, self.nextkn(1, btype)) - + if textlist: + seq.append((TEXT, textlist)) + textlist = [] + x = self.parse_bold(True) + if not x: + self.dprint(80, "LEAVE parse_it=%s", "None") + return None + seq.append(x) else: - if curlev > 0: - self.putback(line) - break - elif m.group(0)[0:2] == "==" \ - and line.rstrip('\n').endswith(m.group(0)): - yield(self.HDR, len(m.group(0))-1, - self.getkn(line[m.end(0):-(1+len(m.group(0)))])) - elif m.group(0) == "----": - yield(self.BAR,) + self.dprint(80, "LEAVE parse_it=%s", "None") + return None + elif tok[0] == NL: + if self.peektkn()[0] == NL: + self.dprint(80, "LEAVE parse_it=%s", "None") + return None else: - if curlev > 0: - self.putback(line) + self.dprint(80, "LEAVE parse_it=%s", "None") + return None + if textlist: + seq.append((TEXT, textlist)) + self.dprint(80, "LEAVE parse_it=(IT,%s)", seq) + return (IT, seq) + + def parse_link(self, type, delim): + self.dprint(80, "ENTER parse_link(%s,%s), tok %s", + type, delim, self.peektkn()) + subtree = [] + list = [] + while 1: + tok = self.getkn() + if tok[0] == DELIM: + if tok[1] == delim: + if list: + subtree.append((SEQ,list)) break - yield(self.getkn(line)) + elif tok[1] == "|": + if len(list) > 1: + subtree.append((SEQ,list)) + else: + subtree.append(list[0]) + list = [] + else: + x = self.parse_inline(tok) + if x: + list.append(x) + else: + self.dprint(80, "LEAVE parse_link=%s", "None") + return None + elif tok[0] == TEXT: + list.append(tok) + else: + self.dprint(80, "LEAVE parse_link=%s", "None") + return None + self.dprint(80, "LEAVE parse_link=(%s,%s)", type, subtree) + return (type, subtree) - def getkn(self, line): - pos = 0 + def parse_ref(self): + self.dprint(80, "ENTER parse_ref, tok %s", self.peektkn()) + list = [] while 1: - if pos == len(line): - break; - m = eltre.search(line, pos) - if not m: - yield(self.TEXT, line[pos:]) - pos = len(line) + tok = self.getkn() + if tok[0] == DELIM: + if tok[1] == "]": + break + else: + x = self.parse_inline(tok) + if x: + list.append(x) + else: + self.dprint(80, "LEAVE parse_ref=%s", "None") + return None + elif tok[0] == TEXT: + list.append(tok) + elif tok[0] == NL: + continue else: - yield(self.TEXT, line[pos:m.start(0)]) - pos = m.end(0) - if m.group(0) == "[[" or m.group(0) == "{{": - d = delims[m.group(0)].search(line, pos) - if d.group(0) == "|": - target = (self.TEXT, line[pos:d.start(0)]) - (start,pos) = self.linkend(m.group(0), line, m.end(0)) - text = (self.SEQ, self.getkn(line[d.end(0):start])) - elif d.group(0) == term[m.group(0)]: - target = (self.TEXT, line[pos:d.start(0)]) - text = (self.NIL,) - pos = d.end(0) - if m.group(0) == "[[": - yield(self.LINK, target, text) + self.dprint(80, "LEAVE parse_ref=%s", "None") + return None + if len(list) == 0 or list[0][0] != TEXT: + self.dprint(80, "LEAVE parse_ref=%s", "None") + return None + (ref,sep,text) = list[0][1].partition(' ') + ret = (REF, ref, (SEQ, [(TEXT, text)] + list[1:])) + self.dprint(80, "LEAVE parse_ref= %s", ret) + return ret + + inline_delims = [ "''", "'''", "[", "[[", "{{" ] + def parse_inline(self, tok): + self.dprint(80, "ENTER parse_inline(%s), tok %s", tok, self.peektkn()) + tokind = self.tokind + if tok[1] == "''": + x = self.parse_it() + elif tok[1] == "'''": + x = self.parse_bold() + elif tok[1] == "[": + x = self.parse_ref() + elif tok[1] == "[[": + x = self.parse_link(LINK, "]]") + elif tok[1] == "{{": + x = self.parse_link(TMPL, "}}") + else: # FIXME + self.dprint(80, "LEAVE parse_inline=%s", "None") + x = None + if not x: + self.tokind = tokind + self.dprint(80, "LEAVE parse_inline=%s", x) + return x + + def parse_para(self): + self.dprint(80, "ENTER parse_para, tok %s", self.peektkn()) + seq = [] + textlist = [] + while 1: + tok = self.getkn() + if tok[0] == TEXT: + textlist.append(tok[1]) + elif tok[0] == NL: + tok = self.getkn() + if tok[0] == NL or tok[0] == NIL: + break + else: + self.ungetkn() + elif tok[0] == NIL: + break + elif tok[0] == DELIM: + if tok[1] in self.inline_delims: + if textlist: + seq.append((TEXT, textlist)) + textlist = [] + x = self.parse_inline(tok) + if x: + seq.append(x) else: - yield(self.TMPL, target, text) - elif m.group(0) == "[": - i = line.find("]", m.end(0)) - if i == -1: - i = len(line) - (target,sep,text) = line[m.end(0):i].partition(' ') - yield(self.REF, - (self.TEXT, target), - (self.SEQ, self.getkn(text))) - pos = i + 1 - elif m.group(0) == "'''": - e = boend.search(line, m.end(0)) - if e: - i = e.start(0) - pos = i + 3 + seq.append(tok) + break + else: + self.ungetkn() + break + if textlist: + seq.append((TEXT, textlist)) + self.dprint(80, "LEAVE parse_para=%s", seq) + return (PARA, seq) + + def parse_header(self, delim): + self.dprint(80, "ENTER parse_header(%s), tok %s", delim, self.peektkn()) + list = [] + while 1: + tok = self.getkn() + if tok[0] == NIL: + self.dprint(80, "LEAVE parse_header=%s", "None") + return None + elif tok[0] == TEXT: + list.append(tok) + elif tok[0] == DELIM: + if tok[1] == delim: + if self.peektkn()[0] == NL: + break else: - pos = len(line) - i = pos - yield(self.BOLD, - (self.SEQ, self.getkn(line[m.end(0):i]))) - elif m.group(0) == "''": - i = self.itend(line, m.end(0)) - if i == -1: - pos = len(line) - i = pos + self.dprint(80, "LEAVE parse_header=%s", "None") + return None + else: + x = self.parse_inline(tok) + if x: + list.append(x) else: - pos = i + 2 - yield(self.IT, - (self.SEQ, self.getkn(line[m.end(0):i]))) + self.dprint(80, "LEAVE parse_header=%s", "None") + return None #FIXME? + else: + self.dprint(80, "LEAVE parse_header=%s", "None") + return None + self.dprint(80, "LEAVE parse_header=(HDR, %s, (SEQ,%s))",len(delim)-1,list) + return (HDR,len(delim)-1,(SEQ,list)) - def input(self): - return None - def expandtok(self, tok): - if type(tok) == GeneratorType: - subtree = [self.SEQ] - for t in tok: - x = self.expandtok(t) + def parse_line(self): + self.dprint(80, "ENTER parse_line, tok %s", self.peektkn()) + list = [] + while 1: + tok = self.getkn() + if tok[0] == NL or tok[0] == NIL: + break + elif tok[0] == TEXT: + list.append(tok) + elif tok[0] == DELIM and tok[1][0] == ":": + list.append(self.parse_indent(len(tok[1]))) + break + else: + x = self.parse_inline(tok) if x: - subtree.append(x) - return tuple(subtree) if len(subtree) > 2 else \ - subtree[1] if len(subtree) == 2 else None + list.append(x) + else: + list.append(tok) + self.dprint(80, "LEAVE parse_line=(SEQ, %s)", list) + return (SEQ, list) + + def parse_env(self, type, lev): + self.dprint(80, "ENTER parse_env(%s,%s), tok %s",type,lev,self.peektkn()) + list = [] + while 1: + tok = self.getkn() + if tok[0] == DELIM and tok[1][0] in envtypes and type == envtypes.index(tok[1][0]): + if len(tok[1]) < lev: + self.ungetkn() + break + elif len(tok[1]) > lev: + self.ungetkn() + elt = self.parse_env(type, len(tok[1])) + else: + elt = self.parse_line() + if len(tok) == 3: + if list[-1][0] != SEQ: + x = list[-1] + list[-1] = (SEQ, [x]) + list[-1][1].append(elt) + continue + list.append(elt) + else: + self.ungetkn() + break + self.dprint(80, "LEAVE parse_env=(ENV, %s, %s, %s)", type, lev, list) + return (ENV, type, lev, list) + + def parse_indent(self, lev): + self.dprint(80, "ENTER parse_indent(%s), tok %s", lev, self.peektkn()) + x = (IND, lev, self.parse_line()) + self.dprint(80, "LEAVE parse_indent=%s", x) + return x + + def parse0(self): + tok = self.getkn() toktype = tok[0] - if toktype == self.NIL: + if toktype == NIL: return None - if toktype == self.TEXT: - return tok if tok[1] != '' else None - elif toktype == self.LINK or toktype == self.TMPL \ - or toktype == self.REF: - return toktype, self.expandtok(tok[1]), self.expandtok(tok[2]) - elif toktype == self.IT or toktype == self.BOLD \ - or toktype == self.ITEM: - return toktype, self.expandtok(tok[1]) - elif toktype == self.HDR: - return toktype, tok[1], self.expandtok(tok[2]) - elif toktype == self.ENV: - return toktype,tok[1],tok[2],self.expandtok(tok[3]) - elif toktype == self.SEQ: - if len(tok) == 2: - return self.expandtok(tok[1]) - elif len(tok) == 1: - return None + elif toktype == TEXT: + self.ungetkn() + return self.parse_para() + elif toktype == DELIM: + if tok[1] == "----": + return (BAR,) + elif tok[1][0:2] == "==": + return self.parse_header(tok[1]) + elif tok[1][0] in envtypes: + type = envtypes.index(tok[1][0]) + lev = len(tok[1]) + self.ungetkn() + return self.parse_env(type, lev) + elif tok[1][0] == ":": + return self.parse_indent(len(tok[1])) else: - subtree = [self.SEQ] - for t in tok[1:]: - x = self.expandtok(t) - if x: - subtree.append(x) - return tuple(subtree) if len(subtree) > 2 else \ - subtree[1] if len(subtree) == 2 else None - else: - return tok - - def parse(self): - tree = [self.SEQ] - for tok in self.nextkn(): - tree.append(self.expandtok(tok)) - self.tree = tuple(tree) - - def prtok(self, tok, indent): - if not tok: - print " " * indent, "None" - return - toktype = tok[0] - if toktype == self.SEQ: - for t in tok[1:]: - self.prtok(t, indent) - else: - print " " * indent, - if toktype == self.NIL: - print "NIL" - if toktype == self.TEXT: - print "TEXT \"%s\"" % (tok[1].encode('string_escape')) - elif toktype == self.LINK: - print "LINK " - self.prtok(tok[1], indent+1) # target - self.prtok(tok[2], indent+1) # text - elif toktype == self.TMPL: - print "TMPL" - self.prtok(tok[1], indent+1) # target - self.prtok(tok[2], indent+1) # text - elif toktype == self.REF: - print "REF" - self.prtok(tok[1], indent+1) # target - self.prtok(tok[2], indent+1) # text - elif toktype == self.IT: - print "IT" - self.prtok(tok[1], indent+1) - elif toktype == self.BOLD: - print "BOLD" - self.prtok(tok[1], indent+1) - elif toktype == self.HDR: - print "HDR", tok[1] - self.prtok(tok[2], indent+1) - elif toktype == self.BAR: - print "BAR" - elif toktype == self.ENV: - print "ENV ",self.envtypes[tok[1]],tok[2] - self.prtok(tok[3], indent+1) - elif toktype == self.ITEM: - print "ITEM" - self.prtok(tok[1], indent+1) - elif toktype == self.PARA: - print "PARA" + self.ungetkn() + return self.parse_para() + elif toktype == NL: + return self.parse0() - def output(self): - self.prtok(self.tree, 0) + def parse(self): + if not self.toklist: + self.tokenize() + self.dprint(90, "TOKLIST: %s", self.toklist) + self.tokind = 0 + self.tree = [] + while 1: + subtree = self.parse0() + if subtree == None: + break + self.tree.append(subtree) + self.dprint(70, "TREE: %s", self.tree) + + def __str__(self): + return str(self.tree) class WikiMarkup (BaseWikiMarkup): @@ -698,52 +822,11 @@ class WikiMarkup (BaseWikiMarkup): "zu": "isiZulu" # Zulu } - def str_nil(self, tok, env): - return None - - def str_text(self, tok, env): - return tok[1] - - def str_seq(self, tok, env): - str = "" - for t in tok[1:]: - s = self.fmtok(t, env) - if s: - str += s - return str + - def fmtok(self, tok, env): - if type(tok) != TupleType: - return "" - toktype = tok[0] - if toktype == self.NIL: - return self.str_nil(tok, env) - if toktype == self.TEXT: - return self.str_text(tok, env) - elif toktype == self.LINK: - return self.str_link(tok, env) - elif toktype == self.TMPL: - return self.str_tmpl(tok, env) - elif toktype == self.REF: - return self.str_ref(tok, env) - elif toktype == self.IT: - return self.str_it(tok, env) - elif toktype == self.BOLD: - return self.str_bold(tok, env) - elif toktype == self.HDR: - return self.str_hdr(tok, env) - elif toktype == self.BAR: - return self.str_bar(tok, env) - elif toktype == self.ENV: - return self.str_env(tok, env) - elif toktype == self.ITEM: - return self.str_item(tok, env) - elif toktype == self.SEQ: - return self.str_seq(tok, env) - elif toktype == self.PARA: - return self.str_para(tok, env) + + + - def __str__(self): - return self.fmtok(self.tree, None) -- cgit v1.2.1