summary | refs | log | tree | commit | diff | about
author Sergey Poznyakoff <gray@gnu.org> 2015-07-22 09:16:38 (GMT)
committer Sergey Poznyakoff <gray@gnu.org> 2015-07-22 09:16:38 (GMT)
commit 32be559549aab3d71bee6be566782eef6594442d (patch) (side-by-side diff)
tree 71bde2c14190a742832b9a2591c3fb0aa4c2f75d
parent 64cf5fdb880815ff21652ddb74e48490dd2f56fe (diff)
download wikitrans-32be559549aab3d71bee6be566782eef6594442d.tar.gz
wikitrans-32be559549aab3d71bee6be566782eef6594442d.tar.bz2
Major rewrite
Use dedicated classes, instead of dictionaries, to represent markup tokens.
* WikiTrans/wikitoken.py: New file. Defines Wiki markup tokens.
* WikiTrans/wikimarkup.py: Rewrite.
* WikiTrans/wiki2html.py: Update.
* WikiTrans/wiki2texi.py: Update.
* WikiTrans/wiki2text.py: Update.
* bin/wikitrans: Update.
Diffstat (more/less context) (ignore whitespace changes)
-rw-r--r-- WikiTrans/wiki2html.py 88
-rw-r--r-- WikiTrans/wiki2texi.py 112
-rw-r--r-- WikiTrans/wiki2text.py 81
-rw-r--r-- WikiTrans/wikimarkup.py 923
-rw-r--r-- WikiTrans/wikitoken.py 166
-rwxr-xr-x bin/wikitrans 2
-rw-r--r-- testdata/colon.html 5
-rw-r--r-- testdata/headings.html 11
-rw-r--r-- testdata/hz.html 5
-rw-r--r-- testdata/nowiki-ind.html 7
-rw-r--r-- testdata/nowiki.html 1
-rw-r--r-- testdata/numlist.html 4
-rw-r--r-- testdata/para.html 4
-rw-r--r-- testdata/unlist.html 4
14 files changed, 770 insertions, 643 deletions
diff --git a/WikiTrans/wiki2html.py b/WikiTrans/wiki2html.py
index 754fa9b..122c91c 100644
--- a/WikiTrans/wiki2html.py
+++ b/WikiTrans/wiki2html.py
@@ -85,10 +85,10 @@ class HtmlWikiMarkup (WikiMarkup):
def fmtlink(self, elt, istmpl):
- arg = self.format(elt['content'][0])
+ arg = self.format(elt.content[0])
text = None
- if len(elt['content']) > 1:
- s = [x for x in map(self.format, elt['content'])]
+ if len(elt.content) > 1:
+ s = [x for x in map(self.format, elt.content)]
if s[0] == 'disambigR' or s[0] == 'wikiquote':
return ""
elif len(s) > 1 and s[1] == 'thumb':
@@ -133,8 +133,8 @@ class HtmlWikiMarkup (WikiMarkup):
return self.fmtlink(elt, True)
def str_ref(self, elt):
- target = elt['ref']
- text = self.format(elt['content'])
+ target = elt.ref
+ text = self.format(elt.content)
return "<a href=\"%s\">%s</a>" % (target,
text if (text and text != '') \
else target)
@@ -146,30 +146,30 @@ class HtmlWikiMarkup (WikiMarkup):
return string
def str_it(self, elt):
- return "<i>" + self.concat(elt['content']) + "</i>"
+ return "<i>" + self.concat(elt.content) + "</i>"
def str_bold(self, elt):
- return "<b>" + self.concat(elt['content']) + "</b>"
+ return "<b>" + self.concat(elt.content) + "</b>"
def str_hdr(self, elt):
- level = elt['level'] + 1
+ level = elt.level
if level > 4:
level = 4
- return "<h%s>%s</h%s>" % (level, self.format(elt['content']), level)
+ return "<h%s>%s</h%s>\n\n" % (level, self.format(elt.content), level)
def str_bar(self):
- return "<hr/>"
+ return "<hr/>\n"
def str_env(self, elt):
- type = elt['envtype']
- lev = elt['level']
+ type = elt.envtype
+ lev = elt.level
if lev > 4:
lev = 2
string = ""
- for s in elt['content']:
- n = s['subtype'];
+ for s in elt.content:
+ n = s.subtype;
string += "<%s>%s</%s>" % (self.envt[type]["elt"][n],
- self.format(s['content']),
+ self.format(s.content),
self.envt[type]["elt"][n])
return "<%s>%s</%s>" % (self.envt[type]["hdr"],
string,
@@ -177,72 +177,72 @@ class HtmlWikiMarkup (WikiMarkup):
return string
def str_tag(self, elt):
- if elt['tag'] == 'code':
+ if elt.tag == 'code':
self.nested += 1
- s = self.format(elt['content'])
+ s = self.format(elt.content)
self.nested -= 1
return '<pre><code>' + s + '</code></pre>' #FIXME
else:
- s = '<' + elt['tag']
- if elt['args']:
- s += ' ' + str(elt['args'])
+ s = '<' + elt.tag
+ if elt.args:
+ s += ' ' + str(elt.args)
s += '>'
- s += self.format(elt['content'])
- return s + '</' + elt['tag'] + '>'
+ s += self.format(elt.content)
+ return s + '</' + elt.tag + '>'
def str_para(self, elt):
string = "";
- for x in elt['content']:
+ for x in elt.content:
string += self.format(x)
- return "<p>" + string + "</p>"
+ return "<p>" + string + "</p>\n"
def str_pre(self, elt):
string = "";
- for x in elt['content']:
+ for x in elt.content:
string += self.format(x)
if self.nested:
return string
return '<pre>' + string + '</pre>'
def str_ind(self, elt):
- return ("<dl><dd>" * elt['level']) + self.format(elt['content']) + "</dd></dl>" * elt['level']
+ return ("<dl><dd>" * elt.level) + self.format(elt.content) + "</dd></dl>" * elt.level
def format(self, elt):
- if elt['type'] == 'TEXT':
- if isinstance(elt['content'],list):
+ if elt.type == 'TEXT':
+ if isinstance(elt.content,list):
string = ""
- for s in elt['content']:
+ for s in elt.content:
string += s
else:
- string = elt['content']
+ string = elt.content
return string
- elif elt['type'] == 'TAG':
+ elif elt.type == 'TAG':
return self.str_tag(elt)
- elif elt['type'] == 'PARA':
+ elif elt.type == 'PARA':
return self.str_para(elt)
- elif elt['type'] == 'PRE':
+ elif elt.type == 'PRE':
return self.str_pre(elt)
- elif elt['type'] == 'IT':
+ elif elt.type == 'IT':
return self.str_it(elt)
- elif elt['type'] == 'BOLD':
+ elif elt.type == 'BOLD':
return self.str_bold(elt)
- elif elt['type'] == 'LINK':
+ elif elt.type == 'LINK':
return self.str_link(elt)
- elif elt['type'] == 'TMPL':
+ elif elt.type == 'TMPL':
return self.str_tmpl(elt)
- elif elt['type'] == 'BAR':
+ elif elt.type == 'BAR':
return self.str_bar()
- elif elt['type'] == 'HDR':
+ elif elt.type == 'HDR':
return self.str_hdr(elt)
- elif elt['type'] == 'REF':
+ elif elt.type == 'REF':
return self.str_ref(elt)
- elif elt['type'] == 'ENV':
+ elif elt.type == 'ENV':
return self.str_env(elt)
- elif elt['type'] == 'IND':
+ elif elt.type == 'IND':
return self.str_ind(elt)
- elif elt['type'] == 'SEQ':
+ elif elt.type == 'SEQ':
string = ""
- for x in elt['content']:
+ for x in elt.content:
string += self.format(x)
return string
else:
diff --git a/WikiTrans/wiki2texi.py b/WikiTrans/wiki2texi.py
index f36c0a1..dfde565 100644
--- a/WikiTrans/wiki2texi.py
+++ b/WikiTrans/wiki2texi.py
@@ -101,72 +101,72 @@ class TexiWikiMarkup (WikiMarkup):
return self._end_print()
def format(self, elt):
- if elt['type'] == 'TEXT':
- if isinstance(elt['content'],list):
- for s in elt['content']:
+ if elt.type == 'TEXT':
+ if isinstance(elt.content,list):
+ for s in elt.content:
self._print(s)
else:
- self._print(elt['content'])
- elif elt['type'] == 'TAG':
+ self._print(elt.content)
+ elif elt.type == 'TAG':
self.str_tag(elt)
- elif elt['type'] == 'PARA':
+ elif elt.type == 'PARA':
self.str_para(elt)
- elif elt['type'] == 'PRE':
+ elif elt.type == 'PRE':
self.str_pre(elt)
- elif elt['type'] == 'IT':
+ elif elt.type == 'IT':
self.str_it(elt)
- elif elt['type'] == 'BOLD':
+ elif elt.type == 'BOLD':
self.str_bold(elt)
- elif elt['type'] == 'LINK':
+ elif elt.type == 'LINK':
self.str_link(elt)
- elif elt['type'] == 'TMPL':
+ elif elt.type == 'TMPL':
self.str_tmpl(elt)
- elif elt['type'] == 'BAR':
+ elif elt.type == 'BAR':
self.str_bar()
- elif elt['type'] == 'HDR':
+ elif elt.type == 'HDR':
self.str_hdr(elt)
- elif elt['type'] == 'REF':
+ elif elt.type == 'REF':
self.str_ref(elt)
- elif elt['type'] == 'ENV':
+ elif elt.type == 'ENV':
self.str_env(elt)
- elif elt['type'] == 'IND':
+ elif elt.type == 'IND':
self.str_ind(elt)
- elif elt['type'] == 'SEQ':
- for x in elt['content']:
+ elif elt.type == 'SEQ':
+ for x in elt.content:
self.format(x)
else:
self._print(str(elt))
def str_tag(self, elt):
- if elt['tag'] in ['code', 'tt']:
+ if elt.tag in ['code', 'tt']:
save = self._begin_print()
self.nested += 1
- self.format(elt['content'])
+ self.format(elt.content)
self.nested -= 1
s = self._end_print(save)
- if elt['isblock']:
+ if elt.isblock:
self._print('@example', nl=True, escape=False)
self._print(s, escape=False)
self._print('@end example\n', nl=True, escape=False)
else:
self._print('@code{%s}' % s, escape=False)
- elif elt['tag'] == 'div':
- if 'args' in elt and 'id' in elt['args']:
- self._print("@anchor{%s}\n" % elt['args']['id'],
+ elif elt.tag == 'div':
+ if elt.args and 'id' in elt.args:
+ self._print("@anchor{%s}\n" % elt.args['id'],
nl=True, escape=False)
- self.format(elt['content'])
+ self.format(elt.content)
else:
- self._print('<' + elt['tag'])
- if elt['args']:
- self._print(' ' + elt['args'])
+ self._print('<' + elt.tag)
+ if elt.args:
+ self._print(' ' + elt.args)
self._print('>');
- self.format(elt['content']);
- self._print('</' + elt['tag'] + '>')
+ self.format(elt.content);
+ self._print('</' + elt.tag + '>')
def str_para(self, elt):
if self.acc and not self.acc.endswith('\n\n'):
self._print('\n', nl=True)
- for x in elt['content']:
+ for x in elt.content:
self.format(x)
if self.acc and not self.acc.endswith('\n\n'):
self._print('\n', nl=True)
@@ -174,7 +174,7 @@ class TexiWikiMarkup (WikiMarkup):
def str_pre(self, elt):
if not self.nested:
self._print('@example\n', nl=True, escape=False)
- for x in elt['content']:
+ for x in elt.content:
self.format(x)
if not self.nested:
self._print('@end example\n', nl=True, escape=False)
@@ -185,26 +185,26 @@ class TexiWikiMarkup (WikiMarkup):
def str_it(self, elt):
self._print('@i{', escape=False)
- self.concat(elt['content'])
+ self.concat(elt.content)
self._print('}', escape=False)
def str_bold(self, elt):
self._print('@b{', escape=False)
- self.concat(elt['content'])
+ self.concat(elt.content)
self._print('}', escape=False)
def str_hdr(self, elt):
- level = elt['level']
+ level = elt.level
if level > len(self.sectcomm[self.sectioning_model]) - 1 - self.sectioning_start:
self._print("@* ", nl=True, escape=False)
- self.format(elt['content'])
+ self.format(elt.content)
else:
self._print(self.sectcomm[self.sectioning_model][level - self.sectioning_start] + " ", nl=True, escape=False)
- self.format(elt['content'])
+ self.format(elt.content)
self._print(None, nl=True)
if self.sectcomm[self.sectioning_model][0] == '@top':
self._print('@node ', nl=True, escape=False)
- self.format(elt['content'])
+ self.format(elt.content)
self._print('\n')
self._print(None, nl=True)
@@ -212,47 +212,47 @@ class TexiWikiMarkup (WikiMarkup):
self._print("\n-----\n")
def str_ind(self, elt):
- self._print("@w{ }" * elt['level'], nl=True, escape=False)
- self.format(elt['content'])
+ self._print("@w{ }" * elt.level, nl=True, escape=False)
+ self.format(elt.content)
self._print(None, nl=True)
def str_env(self, elt):
- if elt['envtype'] == 'unnumbered':
+ if elt.envtype == 'unnumbered':
self._print('@itemize @bullet\n', nl=True, escape=False)
- for s in elt['content']:
+ for s in elt.content:
self._print('@item ', nl=True, escape=False)
- self.format(s['content'])
+ self.format(s.content)
self._print(None, nl=True)
self._print('\n')
self._print('@end itemize\n', nl=True, escape=False)
- elif elt['envtype'] == 'numbered':
+ elif elt.envtype == 'numbered':
self._print('@enumerate\n', nl=True, escape=False)
- for s in elt['content']:
+ for s in elt.content:
self._print('@item ', nl=True, escape=False)
- self.format(s['content'])
+ self.format(s.content)
self._print(None, nl=True)
self._print('\n')
self._print('@end enumerate\n', nl=True, escape=False)
- elif elt['envtype'] == 'defn':
+ elif elt.envtype == 'defn':
self._print('@table @asis\n', nl=True, escape=False)
- for s in elt['content']:
- if s['subtype'] == 0:
+ for s in elt.content:
+ if s.subtype == 0:
self._print('@item ', nl=True, escape=False)
- self.format(s['content'])
+ self.format(s.content)
self._print(None, nl=True)
else:
- self.format(s['content'])
+ self.format(s.content)
self._print(None, nl=True)
self._print('\n')
self._print('@end table\n', nl=True, escape=False)
def str_link(self, elt):
save = self._begin_print()
- self.format(elt['content'][0])
+ self.format(elt.content[0])
arg = self._end_print()
- if len(elt['content']) > 1:
+ if len(elt.content) > 1:
s = []
- for x in elt['content'][0:2]:
+ for x in elt.content[0:2]:
self._begin_print()
self.format(x)
s.append(self._end_print())
@@ -279,9 +279,9 @@ class TexiWikiMarkup (WikiMarkup):
self._print("FIXME: str_tmpl not implemented\n")
def str_ref(self, elt):
- target = elt['ref']
+ target = elt.ref
save = self._begin_print()
- self.format(elt['content'])
+ self.format(elt.content)
text = self._end_print(save)
if text and text != '':
self._print("@uref{%s,%s}" % (target, text), escape=False)
diff --git a/WikiTrans/wiki2text.py b/WikiTrans/wiki2text.py
index 916391e..ee1748c 100644
--- a/WikiTrans/wiki2text.py
+++ b/WikiTrans/wiki2text.py
@@ -65,9 +65,9 @@ class TextWikiMarkup (WikiMarkup):
return self.html_base % { 'lang' : lang } + urllib.quote(tgt)
def fmtlink(self, elt, istmpl):
- arg = self.format(elt['content'][0])
- if len(elt['content']) > 1:
- s = [x for x in map(self.format, elt['content'])]
+ arg = self.format(elt.content[0])
+ if len(elt.content) > 1:
+ s = [x for x in map(self.format, elt.content)]
text = s[1]
else:
s = None
@@ -142,23 +142,23 @@ class TextWikiMarkup (WikiMarkup):
return output + linebuf
def str_tag(self, elt):
- if elt['tag'] == 'code':
+ if elt.tag == 'code':
self.nested += 1
- s = self.format(elt['content'])
+ s = self.format(elt.content)
self.nested -= 1
return s #FIXME
else:
- s = '<' + elt['tag']
- if elt['args']:
- s += ' ' + str(elt['args'])
- s += '>' + self.format(elt['content']) + '</' + elt['tag'] + '>'
+ s = '<' + elt.tag
+ if elt.args:
+ s += ' ' + str(elt.args)
+ s += '>' + self.format(elt.content) + '</' + elt.tag + '>'
return s
def format(self, elt):
- if elt['type'] == 'TEXT':
- if isinstance(elt['content'],list):
+ if elt.type == 'TEXT':
+ if isinstance(elt.content,list):
string = ""
- for s in elt['content']:
+ for s in elt.content:
if string:
if string.endswith("."):
string += " "
@@ -166,29 +166,29 @@ class TextWikiMarkup (WikiMarkup):
string += " "
string += s
else:
- string = elt['content']
- elif elt['type'] == 'PRE':
+ string = elt.content
+ elif elt.type == 'PRE':
string = ""
- for x in elt['content']:
+ for x in elt.content:
string += self.format(x)
string += '\n'
- elif elt['type'] == 'PARA':
+ elif elt.type == 'PARA':
string = "";
- for x in elt['content']:
+ for x in elt.content:
string += self.format(x)
string = self.fmtpara(string) + '\n\n'
- elif elt['type'] == 'TAG':
+ elif elt.type == 'TAG':
string = self.str_tag(elt)
- elif elt['type'] == 'IT':
+ elif elt.type == 'IT':
string = ""
- for x in elt['content']:
+ for x in elt.content:
s = self.format(x)
if s:
string += " " + s
string = "_" + string.lstrip(" ") + "_"
- elif elt['type'] == 'BOLD':
+ elif elt.type == 'BOLD':
string = ""
- for x in elt['content']:
+ for x in elt.content:
s = self.format(x)
if s:
if string.endswith("."):
@@ -197,54 +197,53 @@ class TextWikiMarkup (WikiMarkup):
string += " "
string += s
string = string.upper()
- elif elt['type'] == 'LINK':
+ elif elt.type == 'LINK':
string = self.fmtlink(elt, False)
- elif elt['type'] == 'TMPL':
+ elif elt.type == 'TMPL':
s = self.fmtlink(elt, True)
if s:
string = '[' + s + ']'
else:
string = s
- elif elt['type'] == 'BAR':
+ elif elt.type == 'BAR':
w = self.width
if w < 5:
w = 5
string = "\n" + ("-" * (w - 5)).center(w - 1) + "\n"
- elif elt['type'] == 'HDR':
- level = elt['level']
- string = "\n" + ("*" * level) + " " + \
- self.format(elt['content']).lstrip(" ") + "\n\n"
- elif elt['type'] == 'REF':
- string = self.xref(self.format(elt['content']), elt['ref'])
- elif elt['type'] == 'ENV':
- type = elt['envtype']
- lev = elt['level']
+ elif elt.type == 'HDR':
+ string = "\n" + ("*" * elt.level) + " " + \
+ self.format(elt.content).lstrip(" ") + "\n\n"
+ elif elt.type == 'REF':
+ string = self.xref(self.format(elt.content), elt.ref)
+ elif elt.type == 'ENV':
+ type = elt.envtype
+ lev = elt.level
if lev > self.width - 4:
lev = 1
string = ""
n = 1
- for s in elt['content']:
+ for s in elt.content:
if not string.endswith("\n"):
string += "\n"
- x = self.format(s['content'])
+ x = self.format(s.content)
if type == "unnumbered":
string += self.fmtpara(self.indent(lev, "- " + x.lstrip(" ")))
elif type == "numbered":
string += self.fmtpara(self.indent(lev, "%d. %s" % (n, x)))
n += 1
elif type == "defn":
- if s['subtype'] == 0:
+ if s.subtype == 0:
string += self.indent(lev-1, x)
else:
string += self.indent(lev+3, x)
if not string.endswith("\n"):
string += "\n"
- elif elt['type'] == 'IND':
- string = (" " * elt['level']) + self.format(elt['content']) + '\n'
- elif elt['type'] == 'SEQ':
+ elif elt.type == 'IND':
+ string = (" " * elt.level) + self.format(elt.content) + '\n'
+ elif elt.type == 'SEQ':
string = ""
- for x in elt['content']:
+ for x in elt.content:
if len(string) > 1 and not string[-1].isspace():
string += ' '
string += self.format(x)
diff --git a/WikiTrans/wikimarkup.py b/WikiTrans/wikimarkup.py
index 2fad0af..f971347 100644
--- a/WikiTrans/wikimarkup.py
+++ b/WikiTrans/wikimarkup.py
@@ -19,10 +19,15 @@ from __future__ import print_function
import sys
import re
from types import *
+from wikitoken import *
__all__ = [ "BaseWikiMarkup", "WikiMarkup",
"TagAttributes", "TagAttributeSyntax" ]
+class UnexpectedToken(Exception):
+ def __init__(self, value):
+ self.value = value
+
class TagAttributeSyntax(Exception):
def __init__(self, value):
self.value = value
@@ -116,111 +121,11 @@ class BaseWikiMarkup(object):
def dprint(self, lev, fmt, *argv):
if self.debug_level >= lev:
- print("[DEBUG]", fmt % argv)
-
- def print_dump_prefix(self, level, file):
- file.write("[DUMP]" + ' ' * (2*level + 1))
+ for l in (fmt % argv).split('\n'):
+ print("[DEBUG] %s" % l)
- def dump_nil(self, node, level, file):
- pass
-
- def dump_text(self, node, level, file):
- self.print_dump_prefix(level, file)
- file.write("CONTENT: \"%s\"\n" % node['content'])
-
- def dump_delim(self, node, level, file):
- file.write("'%s'" % node['content'])
- if 'continuation' in node and node['continuation']:
- file.write(" (cont)")
- file.write("\n")
-
- def dump_tag(self, node, level, file):
- self.print_dump_prefix(level, file)
- file.write("TAG: %s\n" % node['tag'])
- if 'isblock' in node:
- self.print_dump_prefix(level, file)
- file.write("PLACEMENT: %s\n" % ('BLOCK' if node['isblock'] else 'INLINE'))
- if 'args' in node:
- self.print_dump_prefix(level, file)
- file.write("ARGS: %s\n" % node['args'])
- if 'content' in node:
- self.dump_node(node['content'], level + 1, file)
-
- def dump_seq(self, node, level, file):
- self.dump(node['content'], level + 1, file)
-
- def dump_ref(self, node, level, file):
- self.print_dump_prefix(level, file)
- file.write("REF: %s\n" % node['ref'])
- self.dump_node(node['content'], level + 1, file)
-
- def dump_hdr(self, node, level, file):
- self.print_dump_prefix(level, file)
- file.write("LEVEL: %s\n" % node['level'])
- self.dump_node(node['content'], level + 1, file)
-
- def dump_elt(self, node, level, file):
- self.print_dump_prefix(level, file)
- file.write("SUBTYPE: %s\n" % node['subtype'])
- self.dump_node(node['content'], level + 1, file)
-
- def dump_env(self, node, level, file):
- self.print_dump_prefix(level, file)
- file.write("ENVTYPE: %s\n" % node['envtype'])
- self.print_dump_prefix(level, file)
- file.write("LEVEL: %s\n" % node['level'])
- self.dump(node['content'], level + 1, file)
-
- def dump_ind(self, node, level, file):
- self.print_dump_prefix(level, file)
- file.write("LEVEL: %s\n" % node['level'])
- self.dump_node(node['content'], level + 1, file)
-
- def dump_link(self, node, level, file):
- self.dump(node['content'], level + 1, file)
-
- dump_type = {
- 'NIL': dump_nil,
- 'NL': dump_nil,
- 'TEXT': dump_text,
- 'DELIM': dump_delim,
- 'OTAG': dump_tag,
- 'CTAG': dump_tag,
- 'TAG': dump_tag,
- 'SEQ': dump_seq,
- 'REF': dump_ref,
- 'HDR': dump_hdr,
- 'ELT': dump_elt,
- 'ENV': dump_env,
- 'IND': dump_ind,
- 'BAR': dump_nil,
- 'PARA': dump_seq,
- 'PRE': dump_text,
- 'BOLD': dump_seq,
- 'IT': dump_seq,
- 'LINK': dump_link,
- }
+ inline_delims = [ "''", "'''", "[", "]", "[[", "]]", "{{", "}}", "|" ]
- def dump_node(self, node, level, file):
- if type(node) != dict:
- file.write("UNHANDLED NODE: %s, %s\n" % (type(node),node))
- return
-
- self.print_dump_prefix(level, file)
- file.write("NODE " + node['type'] + ":\n")
- if node['type'] in self.dump_type:
- self.dump_type[node['type']](self, node, level, file)
- else:
- self.print_dump_prefix(level, file)
- file.write("(UNHANDLED) ")
- file.write("%s\n" % node)
- self.print_dump_prefix(level, file)
- file.write("END NODE " + node['type'] + "\n")
-
- def dump(self, tree, level=0, file=sys.stdout):
- for node in tree:
- self.dump_node(node, level, file)
-
def tokread(self):
line = None
pos = 0
@@ -233,11 +138,11 @@ class BaseWikiMarkup(object):
line = u''
if not line or line == "":
- yield({ 'type': 'NIL' })
+ yield(WikiNode(type='NIL'))
break
if line == '\n':
- yield({ 'type': 'NL', 'content': line })
+ yield(WikiNode(type='NL'))
line = None
continue
@@ -246,7 +151,7 @@ class BaseWikiMarkup(object):
if m:
if (pos < m.start(0)):
- yield({'type': 'TEXT', 'content': line[pos:m.start(0)]})
+ yield(WikiTextNode(content=line[pos:m.start(0)]))
pos = m.start(0)
t = None
@@ -260,13 +165,11 @@ class BaseWikiMarkup(object):
try:
m = self.ctag.search(line, pos)
if m and m.group('tag') == 'nowiki':
- yield({ 'type': 'TEXT',
- 'content': line[pos:m.start(0)] })
+ yield(WikiTextNode(content=line[pos:m.start(0)] ))
pos = m.end(0)
break
- yield({ 'type': 'TEXT',
- 'content': line[pos:] })
+ yield(WikiTextNode(content=line[pos:]))
line = self.input()
pos = 0
@@ -275,65 +178,60 @@ class BaseWikiMarkup(object):
continue
elif m.group('tag') in self.tags:
try:
- t = { 'type': 'OTAG',
- 'tag': m.group('tag'),
- 'args': TagAttributes(m.group('args')) }
- yield(t)
+ yield(WikiTagNode(type='OTAG',
+ tag=m.group('tag'),
+ isblock=(line[pos] == '\n'),
+ args=TagAttributes(m.group('args'))))
if m.group('closed'):
- t['type'] = 'CTAG'
- yield(t)
+ yield(WikiTagNode(type='CTAG',
+ tag=m.group('tag')))
except TagAttributeSyntax:
- yield({'type': 'TEXT',
- 'content': m.group(0)})
+ yield(WikiTextNode(content=m.group(0)))
continue
else:
- yield({ 'type': 'TEXT',
- 'content': m.group(0) })
+ yield(WikiTextNode(content=m.group(0)))
continue
else:
m = self.ctag.match(line, pos)
if m:
if m.group('tag') in self.tags:
- yield( { 'type': 'CTAG',
- 'tag': m.group('tag') } )
+ yield(WikiTagNode(type='CTAG',
+ tag=m.group('tag')))
pos = m.end(0)
continue
else:
- yield( { 'type': 'TEXT',
- 'content': line[pos:pos+1] })
+ yield(WikiTextNode(content=line[pos:pos+1]))
pos += 1
continue
else:
pos = m.end(0)
content = m.group(0)
if content[0] in self.envtypes:
- t = { 'type': 'DELIM',
- 'content': content,
- 'continuation': pos < len(line) and line[pos] == ":" }
- if t['continuation']:
- t['content'] += t['content'][0]
+ node = WikiDelimNode(type='DELIM',
+ content=content,
+ isblock=True,
+ continuation=pos < len(line) and line[pos] == ":")
+ if node.continuation:
+ node.content += node.content[0]
pos += 1
- yield(t)
+ yield(node)
while pos < len(line) and line[pos] in [' ', '\t']:
pos += 1
else:
- yield({ 'type': 'DELIM',
- 'content': content.strip(),
- 'continuation': False})
+ yield(WikiDelimNode(type='DELIM',
+ isblock=(content.strip() not in self.inline_delims),
+ content=content.strip()))
continue
if line:
if line[-1] == '\n':
if line[pos:-1] != '':
- yield({ 'type': 'TEXT',
- 'content': line[pos:-1] })
- yield({ 'type': 'NL',
- 'content': '\n' })
+ yield(WikiTextNode(content=line[pos:-1]))
+ yield(WikiNode(type='NL'))
else:
- yield({ 'type': 'TEXT',
- 'content': line[pos:] })
+ yield(WikiTextNode(content=line[pos:]))
line = None
@@ -364,11 +262,11 @@ class BaseWikiMarkup(object):
# 3b. ''a b '''c d'''''
stack = []
for i in range(0,len(self.toklist)):
- if self.toklist[i]['type'] == 'DELIM' \
- and (self.toklist[i]['content'] == "''" \
- or self.toklist[i]['content'] == "'''"):
+ if self.toklist[i].type == 'DELIM' \
+ and (self.toklist[i].content == "''" \
+ or self.toklist[i].content == "'''"):
if len(stack) > 0:
- if self.toklist[stack[-1]]['content'] == self.toklist[i]['content']:
+ if self.toklist[stack[-1]].content == self.toklist[i].content:
# Case 1: just pop the matching delimiter off the stack
stack.pop()
elif len(stack) == 2 and stack[-2] + 1 == stack[-1]:
@@ -377,8 +275,8 @@ class BaseWikiMarkup(object):
# and pop off the matching one
stack.pop()
elif i < len(self.toklist) \
- and self.toklist[i+1]['type'] == 'DELIM' \
- and self.toklist[stack[-1]]['content'] == self.toklist[i+1]['content']:
+ and self.toklist[i+1].type == 'DELIM' \
+ and self.toklist[stack[-1]].content == self.toklist[i+1].content:
# Case 3: swap current and next tokens
self.swaptkn(i, i+1)
# and pop off the matching one
@@ -391,440 +289,487 @@ class BaseWikiMarkup(object):
stack.append(i)
# Redefine all non-matched tokens as TEXT
for i in stack:
- self.toklist[i]['type'] = 'TEXT'
+ self.toklist[i].type = 'TEXT' # FIXME
+
+ mark = []
+
+ def push_mark(self):
+ self.mark.append(self.tokind)
- def peektkn(self, off=0):
- return self.toklist[self.tokind-off]
+ def pop_mark(self):
+ self.tokind = self.mark.pop()
+
+ def clear_mark(self):
+ self.mark.pop()
+
+ def lookahead(self, off=0):
+ tok = self.toklist[self.tokind+off]
+ self.dprint(20, "lookahead(%s): %s", off, tok)
+ return tok
def setkn(self,val):
self.toklist[self.tokind] = val
def getkn(self):
- self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL'
+ self.newline = self.tokind == 0 or self.toklist[self.tokind-1].type == 'NL'
if self.tokind == len(self.toklist):
- return { 'type': 'NIL' }
+ return WikiNode(type='NIL')
tok = self.toklist[self.tokind]
self.tokind = self.tokind + 1
+ self.dprint(20, "getkn: %s", tok)
return tok
- def ungetkn(self):
+ def ungetkn(self, tok=None):
self.tokind = self.tokind - 1
- self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL'
+ self.newline = self.tokind == 0 or self.toklist[self.tokind-1].type == 'NL'
+ if tok:
+ self.toklist[self.tokind] = tok
+ self.dprint(20, "ungetkn: %s", tok)
return self.toklist[self.tokind]
+ def fixuptkn(self, tok):
+ if self.tokind == 0:
+ raise IndexError('wikimarkup.fixuptkn called at start of input')
+ self.toklist[self.tokind-1] = tok
+ return tok
+
+ def dump(self, tree, file=sys.stdout):
+ for node in tree:
+ file.write(str(node))
+ file.write('\n')
+
+ def is_block_end(self, tok):
+ if tok.type == 'NIL':
+ return True
+ elif tok.type == 'NL':
+ if self.lookahead().type == 'NIL':
+ return True
+ elif self.lookahead().type == 'NL':
+ self.getkn()
+ return True
+ elif tok.type in ['DELIM', 'CTAG', 'TAG']:
+ if tok.isblock:
+ self.ungetkn(tok)
+ return True
+ return False
+
+ def parse_para(self, tok):
+ self.dprint(80, "ENTER parse_para: %s", tok)
+
+ acc = { 'seq': [],
+ 'textlist': [] }
+
+ def flush():
+ if acc['textlist']:
+ acc['seq'].append(WikiContentNode(type='TEXT',
+ content=''.join(acc['textlist'])))
+ acc['textlist'] = []
+
+ if isinstance(tok, WikiContentNode) \
+ and isinstance(tok.content,str) \
+ and re.match("^[ \t]", tok.content):
+ type = 'PRE'
+ rx = re.compile("^\S")
+ else:
+ type = 'PARA'
+ rx = re.compile("^[ \t]")
+
+ while not self.is_block_end(tok):
+ if tok.type == 'TEXT':
+ if rx and self.newline and rx.match(tok.content):
+ self.ungetkn()
+ break
+ acc['textlist'].append(tok.content)
+ elif tok.type == 'NL':
+ acc['textlist'].append('\n')
+ elif tok.type == 'OTAG':
+ flush()
+ acc['seq'].append(self.parse_tag(tok))
+ elif tok.type == 'DELIM':
+ flush()
+ acc['seq'].append(self.parse_inline_delim(tok))
+ else:
+ raise UnexpectedToken(tok)
+ tok = self.getkn()
+ flush()
+ if acc['seq']:
+ tok = WikiSeqNode(type=type, content=acc['seq'])
+ else:
+ tok = None
+ self.dprint(80, "LEAVE parse_para=%s", tok)
+ return tok
+
+ def parse_block_delim(self, tok):
+ self.dprint(80, "ENTER parse_block_delim")
+ assert(tok.type == 'DELIM')
+ if tok.content == "----":
+ node = WikiNode(type = 'BAR')
+ elif tok.content[0:2] == "==":
+ node = self.parse_header(tok)
+ if not node:
+ tok = self.ungetkn(WikiTextNode(content=tok.content))
+ elif tok.content[0] in self.envtypes:
+ node = None
+ if tok.content[0] == ':':
+ t = self.lookahead(-2)
+ if not (t.type == 'DELIM' and t.content == ';'):
+ node = self.parse_indent(tok)
+ if not node:
+ node = self.parse_env(tok)
+ else:
+ self.ungetkn(tok)
+ node = None
+ self.dprint(80, "LEAVE parse_block_delim=%s", node)
+ return node
+
+ def parse_line(self):
+ self.dprint(80, "ENTER parse_line")
+ list = []
+ while True:
+ tok = self.getkn()
+ if tok.type == 'NL' or tok.type == 'NIL':
+ break
+ elif tok.type == 'TEXT':
+ list.append(tok)
+ elif tok.type == 'DELIM':
+ if tok.isblock:
+ tok = WikiContentNode(type = 'TEXT', content = tok.content)
+ self.fixuptkn(tok)
+ list.append(tok)
+ elif tok.content[0] == ":":
+ # FIXME
+ list.append(self.parse_indent(tok))
+ break
+ else:
+ x = self.parse_inline_delim(tok)
+ if x:
+ list.append(x)
+ else:
+ list.append(self.fixuptkn(WikiContentNode(type = 'TEXT', content = tok.content)))
+ elif tok.type == 'OTAG':
+ if tok.isblock:
+ self.ungetkn()
+ break
+ list.append(self.parse_tag(tok))
+ else:
+ list.append(tok)
+ ret = WikiSeqNode(type='SEQ', content=list)
+ self.dprint(80, "LEAVE parse_line=%s", ret)
+ return ret
+
+ def parse_indent(self, tok):
+ lev = len(tok.content)
+ self.dprint(80, "ENTER parse_indent(%s)", lev)
+ x = WikiIndNode(type='IND', level=lev, content=self.parse_line())
+ self.dprint(80, "LEAVE parse_indent=%s", x)
+ return x
+
def parse_fontmod(self,delim,what):
self.dprint(80, "ENTER parse_fontmod(%s,%s), tok %s",
- delim, what, self.peektkn())
+ delim, what, self.lookahead())
seq = []
text = ''
- while 1:
+ while True:
tok = self.getkn()
- if tok['type'] == 'TEXT':
- text += tok['content']
- elif tok['type'] == 'DELIM':
- if tok['content'] == delim:
+ if tok.type == 'TEXT':
+ text += tok.content
+ elif self.is_block_end(tok):
+ self.dprint(80, "LEAVE parse_fontmod=%s", "None")
+ return None
+ elif tok.type == 'DELIM':
+# self.dprint(80, "got %s, want %s", tok.content, delim)
+ if tok.content == delim:
break
- elif self.is_inline_delim(tok):
+ else:
if text:
- seq.append({ 'type': 'TEXT', 'content': text })
+ seq.append(WikiContentNode(type='TEXT', content=text))
text = ''
- x = self.parse_inline(tok)
+ x = self.parse_inline_delim(tok)
if x:
seq.append(x)
else:
self.dprint(80, "LEAVE parse_fontmod=%s", "None")
return None
- else:
- self.dprint(80, "LEAVE parse_fontmod=None")
- return None
- elif tok['type'] == 'NL':
- if self.peektkn()['type'] == 'NL':
- self.dprint(80, "LEAVE parse_fontmod=None")
- return None
- seq.append({ 'type': 'TEXT', 'content': '\n' })
+ elif tok.type == 'NL':
+ seq.append(WikiContentNode(type='TEXT', content='\n'))
else:
self.dprint(80, "LEAVE parse_fontmod=None")
return None
if text:
- seq.append({ 'type': 'TEXT', 'content': text })
- res = { 'type': what, 'content': seq }
+ seq.append(WikiContentNode(type='TEXT', content=text))
+ res = WikiSeqNode(type=what, content=seq)
self.dprint(80, "LEAVE parse_fontmod=%s", res)
return res
- def parse_link(self, type, delim):
- self.dprint(80, "ENTER parse_link(%s,%s), tok %s",
- type, delim, self.peektkn())
- subtree = []
- list = []
- while 1:
- tok = self.getkn()
- if tok['type'] == 'DELIM':
- if tok['content'] == delim:
- if list:
- subtree.append({ 'type': 'SEQ', 'content': list })
- break
- elif tok['content'] == "|":
- if len(list) > 1:
- subtree.append({ 'type': 'SEQ', 'content': list })
- elif list:
- subtree.append(list[0])
- list = []
- else:
- x = self.parse_inline(tok)
- if x:
- list.append(x)
- else:
- self.dprint(80, "LEAVE parse_link=%s", "None")
- return None
- elif tok['type'] == 'TEXT':
- list.append(tok)
- else:
- self.dprint(80, "LEAVE parse_link=%s", "None")
- return None
- self.dprint(80, "LEAVE parse_link=(%s,%s)", type, subtree)
- return { 'type': type, 'content': subtree }
-
def parse_ref(self):
+ self.dprint(80, "ENTER parse_ref")
tok = self.getkn()
- self.dprint(80, "ENTER parse_ref, tok %s", tok)
- if not (tok['type'] == 'TEXT' and self.refstart.match(tok['content'])):
+ if not (tok.type == 'TEXT' and self.refstart.match(tok.content)):
self.dprint(80, "LEAVE parse_ref=None")
return None
seq = []
- (ref,sep,text) = tok['content'].partition(' ')
+ (ref,sep,text) = tok.content.partition(' ')
if text:
- seq.insert(0, {'type': 'TEXT', 'content': text })
+ seq.insert(0, WikiContentNode(type='TEXT', content=text))
- while 1:
+ while True:
tok = self.getkn()
- if tok == None or tok['type'] == 'NIL':
+ if tok.type == 'NIL':
self.dprint(80, "LEAVE parse_ref=None")
return None
- if tok['type'] == 'DELIM':
- if tok['content'] == ']':
+ elif self.is_block_end(tok):
+ self.dprint(80, "LEAVE parse_ref=None")
+ return None
+ elif tok.type == 'DELIM':
+ if tok.content == ']':
break
else:
- tok = self.parse_inline(tok)
+ tok = self.parse_inline_delim(tok)
if tok:
seq.append(tok)
else:
self.dprint(80, "LEAVE parse_ref=None")
return None
- elif tok['type'] == 'OTAG':
+ elif tok.type == 'OTAG':
list.append(self.parse_tag(tok))
else:
seq.append(tok)
- ret = { 'type': 'REF',
- 'ref': ref,
- 'content': { 'type': 'SEQ', 'content': seq } }
+ ret = WikiRefNode(type='REF',
+ ref=ref,
+ content=WikiSeqNode(type='SEQ', content=seq))
self.dprint(80, "LEAVE parse_ref= %s", ret)
return ret
- inline_delims = [ "''", "'''", "[", "[[", "{{", "|" ]
-
- def is_inline_delim(self, tok):
- return tok['type'] == 'DELIM' and tok['content'] in self.inline_delims
- def is_block_delim(self, tok):
- return tok['type'] == 'DELIM' and tok['content'] not in self.inline_delims
+ def parse_link(self, type, delim):
+ self.dprint(80, "ENTER parse_link(%s,%s)", type, delim)
+ subtree = []
+ list = []
+ while True:
+ tok = self.getkn()
+ if tok.type == 'NIL':
+ self.dprint(80, "LEAVE parse_link=None [EOF]")
+ return None
+ if tok.type == 'DELIM':
+ if tok.content == delim:
+ if list:
+ subtree.append(WikiSeqNode(type='SEQ',
+ content=list))
+ break
+ elif tok.content == "|":
+ if len(list) > 1:
+ subtree.append(WikiSeqNode(type='SEQ',
+ content=list))
+ elif list:
+ subtree.append(list[0])
+ list = []
+ else:
+ x = self.parse_inline_delim(tok)
+ if x:
+ list.append(x)
+ else:
+ self.dprint(80, "LEAVE parse_link=None [bad inline]")
+ return None
+ elif tok.type == 'TEXT':
+ list.append(tok)
+ else:
+ self.dprint(80, "LEAVE parse_link=None [unexpected token]")
+ return None
+ ret = WikiSeqNode(type=type, content=subtree)
+ self.dprint(80, "LEAVE parse_link=%s", ret)
+ return ret
- def parse_inline(self, tok):
- self.dprint(80, "ENTER parse_inline(%s), tok %s", tok, self.peektkn())
- tokind = self.tokind
- if tok['content'] == "''":
- x = self.parse_fontmod(tok['content'], 'IT')
- elif tok['content'] == "'''":
- x = self.parse_fontmod(tok['content'], 'BOLD')
- elif tok['content'] == "[":
+ def parse_inline_delim(self, tok):
+ self.dprint(80, "ENTER parse_inline_delim")
+ assert(tok.type == 'DELIM')
+ self.push_mark()
+ if tok.content == "''":
+ x = self.parse_fontmod(tok.content, 'IT')
+ elif tok.content == "'''":
+ x = self.parse_fontmod(tok.content, 'BOLD')
+ elif tok.content == "[":
x = self.parse_ref()
- elif tok['content'] == "[[":
+ elif tok.content == "[[":
x = self.parse_link('LINK', "]]")
- elif tok['content'] == "{{":
+ elif tok.content == "{{":
x = self.parse_link('TMPL', "}}")
else:
- self.dprint(80, "LEAVE parse_inline=%s (unhandled delimiter)", "None")
x = None
- if not x:
- self.tokind = tokind
- tok['type'] = 'TEXT'
+
+ if x:
+ self.clear_mark()
+ else:
self.dprint(80, "BEGIN DELIMITER RECOVERY: %s", tok)
- od = tok['content']
+ self.pop_mark()
+ x = self.fixuptkn(WikiTextNode(content=tok.content))
+ od = tok.content
if od in self.close_delim:
cd = self.close_delim[od]
lev = 0
- for tok in self.toklist[self.tokind+1:]:
- if tok['type'] == 'NIL':
+ for i,tok in enumerate(self.toklist[self.tokind+1:]):
+ if tok.type == 'NIL':
break
- elif tok['type'] == 'DELIM':
- if tok['content'] == od:
+ elif tok.type == 'DELIM':
+ if tok.content == od:
lev += 1
- elif tok['content'] == cd:
+ elif tok.content == cd:
if lev == 0:
- tok['type'] = 'TEXT'
+ tok = WikiTextNode(content=tok.content)
+ self.toklist[self.tokind+1+i] = tok
lev -= 1
break
self.dprint(80, "END DELIMITER RECOVERY: %s", tok)
- self.dprint(80, "LEAVE parse_inline=%s", x)
+ self.dprint(80, "LEAVE parse_inline_delim=%s", x)
return x
-
- def parse_para(self):
- self.dprint(80, "ENTER parse_para, tok %s", self.peektkn())
- seq = []
- textlist = []
- tok = self.peektkn()
-
- if self.newline:
- if 'content' in tok and re.match("^\s", tok['content']):
- type = 'PRE'
- rx = re.compile("^\S")
- else:
- type = 'PARA'
- rx = re.compile("^\s")
- else:
- type = 'SEQ'
- rx = None
- self.dprint(80, "IN parse_para, type %s", type)
- while 1:
- tok = self.getkn()
- if tok['type'] == 'TEXT':
- if rx and self.newline and rx.match(tok['content']):
- self.ungetkn()
- break
- textlist.append(tok['content'])
- elif tok['type'] == 'NL':
- tok = self.getkn()
- if tok['type'] == 'NL' or tok['type'] == 'NIL':
- break
- else:
- self.ungetkn()
- if self.is_block_delim(tok):
- break
- textlist.append('\n')
- elif tok['type'] == 'NIL':
- break
- elif tok['type'] == 'OTAG':
- save = (self.tokind,self.newline)
- t = self.parse_tag(tok)
- if t['type'] == 'TAG' and t['isblock']:
- del self.toklist[save[0]:self.tokind]
- self.tokind = save[0]
- self.toklist[self.tokind] = t
- self.newline = save[1]
- break
- else:
- if textlist:
- seq.append({ 'type': 'TEXT',
- 'content': ''.join(textlist) })
- textlist = []
- seq.append(t)
- elif tok['type'] == 'TAG':
- if tok['isblock']:
- break
- else:
- if textlist:
- seq.append({ 'type': 'TEXT',
- 'content': ''.join(textlist) })
- textlist = []
- seq.append(tok)
- elif tok['type'] == 'CTAG':
- self.ungetkn()
- break
- elif tok['type'] == 'DELIM':
- if self.is_inline_delim(tok):
- if textlist:
- seq.append({ 'type': 'TEXT',
- 'content': ''.join(textlist) })
- textlist = []
- x = self.parse_inline(tok)
- if x:
- seq.append(x)
- else:
- self.ungetkn()
- # restart
- else:
- seq.append({ 'type': 'TEXT', 'content': tok['content'] })
- # self.ungetkn()
- break
- if textlist:
- seq.append({ 'type': 'TEXT', 'content': ''.join(textlist) })
- self.dprint(80, "LEAVE parse_para=%s", seq)
- return { 'type': type, 'content': seq }
-
- def parse_header(self, delim):
- self.dprint(80, "ENTER parse_header(%s), tok %s", delim, self.peektkn())
+
+ def parse_tag(self, tag):
+ self.dprint(80, "ENTER parse_tag")
list = []
- while 1:
+ self.push_mark()
+ while True:
tok = self.getkn()
- if tok['type'] == 'NIL':
- self.dprint(80, "LEAVE parse_header=%s", "None")
- return None
- elif tok['type'] == 'TEXT':
- list.append(tok)
- elif tok['type'] == 'DELIM':
- if tok['content'] == delim:
- if self.peektkn()['type'] == 'NL':
- break
- else:
- self.dprint(80, "LEAVE parse_header=%s, tok=%s",
- "None", self.peektkn())
- return None
+ if tok.type == 'NIL':
+ self.pop_mark()
+ s = '<' + tag.tag
+ if tag.args:
+ s += ' ' + str(tag.args)
+ s += '>'
+ node = WikiTextNode(content=s)
+ if tag.content:
+ self.tree[self.tokind:self.tokind] = tag.content
+ self.dprint(80, "LEAVE parse_tag = %s (tree modified)", node)
+ return node
+ elif tok.type == 'DELIM':
+ if tok.isblock:
+ tok = self.parse_block_delim(tok)
else:
- x = self.parse_inline(tok)
- if x:
- list.append(x)
- else:
- self.ungetkn()
- self.dprint(80, "LEAVE parse_header=%s", "None")
- return None #FIXME?
- else:
- self.dprint(80, "LEAVE parse_header=%s", "None")
- return None
- self.dprint(80, "LEAVE parse_header=(HDR, %s, (SEQ,%s))",len(delim)-1,list)
- return { 'type': 'HDR',
- 'level': len(delim)-1,
- 'content': { 'type': 'SEQ', 'content': list } }
-
-
- def parse_line(self):
- self.dprint(80, "ENTER parse_line, tok %s", self.peektkn())
- list = []
- while 1:
- tok = self.getkn()
- if tok['type'] == 'NL' or tok['type'] == 'NIL':
- break
- elif tok['type'] == 'TEXT':
- list.append(tok)
- elif tok['type'] == 'DELIM':
- if tok['content'][0] == ":":
- list.append(self.parse_indent(len(tok['content'])))
+ tok = self.parse_inline_delim(tok)
+ if not tok:
+ tok = self.getkn()
+ elif tok.type == 'CTAG':
+ if tag.tag == tok.tag:
break
- else:
- x = self.parse_inline(tok)
- if x:
- list.append(x)
- else:
- list.append(tok)
- elif tok['type'] == 'OTAG':
- list.append(self.parse_tag(tok))
- else:
- list.append(tok)
- self.dprint(80, "LEAVE parse_line=(SEQ, %s)", list)
- return { 'type': 'SEQ', 'content': list }
-
- def parse_env(self, type, lev):
- self.dprint(80, "ENTER parse_env(%s,%s), tok %s",type,lev,self.peektkn())
+ s = '</' + tag.tag + '>'
+ tok = self.fixuptkn(WikiTextNode(content=s))
+ elif tok.type == 'NL':
+ tok = WikiContentNode(type = 'TEXT', content = '\n')
+ list.append(tok)
+
+ self.clear_mark()
+ ret = WikiTagNode(type = 'TAG',
+ tag = tag.tag,
+ args = tag.args,
+ isblock = tag.isblock,
+ content = WikiSeqNode(type = 'SEQ', content = list))
+ self.dprint(80, "LEAVE parse_tag = %s", ret)
+ return ret
+
+ def parse_env(self, tok):
+ type = self.envtypes[tok.content[0]][0]
+ lev = len(tok.content)
+ self.dprint(80, "ENTER parse_env(%s,%s)",type,lev)
list = []
- while 1:
- tok = self.getkn()
- if tok['type'] == 'DELIM' \
- and tok['content'][0] in self.envtypes \
- and type == self.envtypes[tok['content'][0]][0]:
- if len(tok['content']) < lev:
+ while True:
+ if tok.type == 'DELIM' \
+ and tok.content[0] in self.envtypes \
+ and type == self.envtypes[tok.content[0]][0]:
+ if len(tok.content) < lev:
self.ungetkn()
break
- elif len(tok['content']) > lev:
- self.ungetkn()
- elt = self.parse_env(type, len(tok['content']))
+ elif len(tok.content) > lev:
+ elt = self.parse_env(tok)
else:
elt = self.parse_line()
- if not tok['continuation']:
- list.append({ 'type': 'ELT',
- 'subtype': self.envtypes[tok['content'][0]][1],
- 'content': elt })
+ if not tok.continuation:
+ list.append(WikiEltNode(type='ELT',
+ subtype=self.envtypes[tok.content[0]][1],
+ content=elt))
+ tok = self.getkn()
continue
if list:
- if list[-1]['content']['type'] != 'SEQ':
- x = list[-1]['content']['content']
+ if list[-1].content.type != 'SEQ':
+ x = list[-1].content.content
# FIXME:
- list[-1]['content'] = { 'type': 'SEQ', 'content': [x] }
- list[-1]['content']['content'].append(elt)
+ list[-1].content = WikiNode(type='SEQ', content=[x])
+ list[-1].content.content.append(elt)
else:
self.ungetkn()
break
- self.dprint(80, "LEAVE parse_env=(ENV, %s, %s, %s)", type, lev, list)
- return { 'type': 'ENV', 'envtype': type, 'level': lev, 'content': list }
- def parse_indent(self, lev):
- self.dprint(80, "ENTER parse_indent(%s), tok %s", lev, self.peektkn())
- x = { 'type': 'IND', 'level': lev, 'content': self.parse_line() }
- self.dprint(80, "LEAVE parse_indent=%s", x)
- return x
+ tok = self.getkn()
- def parse_tag(self, tag):
- self.dprint(80, "ENTER parse_tag(%s)", tag)
- seq = []
- save = self.tokind
- t = self.peektkn()
- isblock = t['type'] == 'NL'
- while 1:
- t = self.parse0()
- if t == None or t['type'] == 'NIL':
- self.tokind = save
- s = '<' + tag['tag']
- if 'args' in tag and tag['args']:
- s += ' ' + str(tag['args'])
- del tag['args']
- s += '>'
- if 'content' in tag:
- subtree = tag['content']
- else:
- subtree = None
- tag['type'] = 'TEXT'
- tag['content'] = s
- if subtree:
- self.tree[self.tokind:self.tokind] = subtree
- self.dprint(80, "LEAVE parse_tag = %s (tree modified)", tag)
- self.ungetkn()
- return self.parse0()
+ ret = WikiEnvNode(type='ENV',
+ envtype=type,
+ level=lev,
+ content=list)
+ self.dprint(80, "LEAVE parse_env=%s", ret)
+ return ret
+
+ def parse_header(self, tok):
+ self.dprint(80, "ENTER parse_header")
+ self.push_mark()
+ list = []
+ delim = tok.content
+ while True:
+ tok = self.getkn()
- if t['type'] == 'CTAG' and tag['tag'] == t['tag']:
- break
- seq.append(t)
+ if tok.type == 'NL':
+ self.pop_mark()
+ self.dprint(80, "LEAVE parse_header=None")
+ return None
+ elif tok.type == 'TEXT':
+ list.append(tok)
+ elif tok.type == 'DELIM':
+ if tok.content == delim:
+ if self.lookahead().type == 'NL':
+ self.getkn()
+ if self.lookahead().type == 'NL':
+ self.getkn()
+ break
+ else:
+ self.pop_mark()
+ self.dprint(80, "LEAVE parse_header=None")
+ return None
+ elif tok.isblock:
+ self.pop_mark()
+ self.dprint(80, "LEAVE parse_header=None")
+ return None
+ else:
+ list.append(self.parse_inline_delim(tok))
+ elif tok.type == 'OTAG':
+ if tok.isblock:
+ self.pop_mark()
+ self.dprint(80, "LEAVE parse_header=None")
+ return None
+ list.append(self.parse_tag(tok))
+
- ret = { 'type': 'TAG',
- 'tag': tag['tag'],
- 'args': tag['args'],
- 'isblock': isblock,
- 'content': { 'type': 'SEQ', 'content': seq } }
- self.dprint(80, "LEAVE parse_tag = %s", ret)
+ self.clear_mark()
+ ret = WikiHdrNode(level = len(delim),
+ content = WikiSeqNode(type='SEQ', content=list))
+ self.dprint(80, "LEAVE parse_header=%s", ret)
return ret
-
- def parse0(self):
+
+ def parse_block(self):
tok = self.getkn()
- self.dprint(80, "ENTER parse0(%s)", tok)
- toktype = tok['type']
- if toktype == 'NIL':
+ while tok.type == 'NL':
+ tok = self.getkn()
+ if tok == None or tok.type == 'NIL':
return None
- elif toktype == 'TEXT':
- self.ungetkn()
- return self.parse_para()
- elif toktype == 'DELIM':
- if tok['content'] == "----":
- return { 'type': 'BAR' }
- elif tok['content'][0:2] == "==":
- return self.parse_header(tok['content'])
- elif tok['content'][0] in self.envtypes:
- type = self.envtypes[tok['content'][0]][0]
- lev = len(tok['content'])
- if tok['content'][0] == ':':
- t = self.peektkn(2)
- if not (t['type'] == 'DELIM' and t['content'] == ';'):
- return self.parse_indent(lev)
- self.ungetkn()
- return self.parse_env(type, lev)
+ elif tok.type == 'DELIM':
+ tok = self.parse_block_delim(tok)
+ if tok:
+ return tok
else:
- self.ungetkn()
- return self.parse_para()
- elif toktype == 'NL':
- return { 'type': 'TEXT', 'content': '\n' }
- elif toktype == 'OTAG':
+ tok = self.getkn()
+ elif tok.type == 'OTAG' and tok.isblock:
return self.parse_tag(tok)
- else:
- return tok
+ return self.parse_para(tok)
+
def parse(self):
if not self.toklist:
self.tokenize()
@@ -832,15 +777,13 @@ class BaseWikiMarkup(object):
print("TOKEN DUMP BEGIN")
self.dump(self.toklist)
print("TOKEN DUMP END")
-
self.tokind = 0
self.tree = []
while 1:
- subtree = self.parse0()
+ subtree = self.parse_block()
if subtree == None:
break
self.tree.append(subtree)
-
if self.debug_level >= 70:
print("TREE DUMP BEGIN")
self.dump(self.tree)
@@ -909,24 +852,24 @@ class WikiMarkup (BaseWikiMarkup):
return None
def is_lang_link(self, elt):
- if elt['type'] == 'LINK' \
- and isinstance(elt['content'], list) \
- and len(elt['content']) == 1:
- if elt['content'][0]['type'] == TEXT:
- m = re.match('([\w-]+):', elt['content'][0]['content'])
+ if elt.type == 'LINK' \
+ and isinstance(elt.content, list) \
+ and len(elt.content) == 1:
+ if elt.content[0].type == TEXT:
+ m = re.match('([\w-]+):', elt.content[0].content)
if m: # and m.group(1) in self.langtab:
return True
- elif elt['content'][0]['type'] == 'SEQ' \
- and len(elt['content'][0]['content']) == 1 and\
- elt['content'][0]['content'][0]['type'] == TEXT:
- m = re.match('([\w-]+):',elt['content'][0]['content'][0]['content'])
+ elif elt.content[0].type == 'SEQ' \
+ and len(elt.content[0].content) == 1 and\
+ elt.content[0].content[0].type == TEXT:
+ m = re.match('([\w-]+):',elt.content[0].content[0].content)
if m: # and m.group(1) in self.langtab:
return True
return False
def is_empty_text(self, elt):
- if elt['type'] == 'TEXT':
- if re.search('\w', elt['content']):
+ if elt.type == 'TEXT':
+ if re.search('\w', elt.content):
return False
return True
return False
diff --git a/WikiTrans/wikitoken.py b/WikiTrans/wikitoken.py
new file mode 100644
index 0000000..88c168e
--- a/dev/null
+++ b/WikiTrans/wikitoken.py
@@ -0,0 +1,166 @@
+# Wiki tokens. -*- coding: utf-8 -*-
+# Copyright (C) 2015 Sergey Poznyakoff
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from __future__ import print_function
+import re
+
+def mkstr(func):
+ def _indent(s, n=0):
+ return ' ' * n + s
+ def _do_str(self):
+ ret = _indent('NODE %s' % self.type)
+ s = func(self)
+ if s:
+ if not s.startswith('\n'):
+ ret += '\n'
+ ret += '\n'.join(map(lambda x: _indent(x, 1), s.split("\n")))
+ if not ret.endswith('\n'):
+ ret += '\n'
+ return ret + _indent('END NODE %s' % self.type)
+ return _do_str
+
+def mkcontentstr(func):
+ def _indent(s, n=0):
+ return ' ' * n + s
+ def _do_str(self):
+ ret = _indent('NODE %s' % self.type)
+ s = func(self)
+ if s:
+ if not s.startswith('\n'):
+ ret += '\n'
+ ret += '\n'.join(map(lambda x: _indent(x,1), s.split("\n")))
+ if not ret.endswith('\n'):
+ ret += '\n'
+ if self.content:
+ ret += _indent('CONTENT BEGIN\n',1)
+ if isinstance(self.content,list):
+ ret += ',\n'.join(map(lambda x: re.sub('^', _indent('',2), str(x), 0, re.MULTILINE), self.content))
+ else:
+ ret += '\n'.join(map(lambda x: _indent(x, 2), str(self.content).split("\n")))
+ if not ret.endswith('\n'):
+ ret += '\n'
+ ret += _indent('CONTENT END\n',1)
+ return ret + _indent('END NODE %s' % self.type)
+ return _do_str
+
+class WikiNode(object):
+ type = 'UNDEF'
+ nesting = 0
+
+ def __init__(self, **kwargs):
+ for key in kwargs:
+ if hasattr(self,key):
+ self.__dict__[key] = kwargs[key]
+ else:
+ raise AttributeError("'%s' has no attribute '%s'" % (self.__class__.__name__, key))
+
+ @mkstr
+ def __str__(self):
+ s = ''
+ for x in dir(self):
+ if x == 'type' or x.startswith('_') or type(x) == 'function':
+ continue
+ if x in self.__dict__:
+ s += '\n%s=%s' % (x, self.__dict__[x])
+ return s
+
+class WikiContentNode(WikiNode):
+ content = None
+ @mkcontentstr
+ def __str__(self):
+ pass
+
+class WikiSeqNode(WikiContentNode):
+ pass
+
+# ##############
+
+class WikiTextNode(WikiContentNode):
+ type = 'TEXT'
+ @mkstr
+ def __str__(self):
+ return '\nCONTENT: "%s"' % self.content
+
+class WikiDelimNode(WikiContentNode):
+ type = 'DELIM'
+ isblock=False
+ continuation = False
+ @mkstr
+ def __str__(self):
+ s = '\nPLACEMENT: %s' % ('BLOCK' if self.isblock else 'INLINE')
+ if self.content:
+ s += '\nCONTENT: "%s"' % self.content
+ if self.continuation:
+ s += '\nCONTINUATION'
+ return s
+
+class WikiTagNode(WikiContentNode):
+ tag = None
+ isblock = False
+ args = None
+ @mkcontentstr
+ def __str__(self):
+ s = '\nTAG: %s' % self.tag
+ s += '\nPLACEMENT: %s' % ('BLOCK' if self.isblock else 'INLINE')
+ if self.args:
+ s += '\nARGS: %s' % str(self.args)
+ return s
+
+class WikiRefNode(WikiContentNode):
+ type = 'REF'
+ ref = None
+ @mkcontentstr
+ def __str__(self):
+ s = '\nREF: %s' % self.ref
+ return s
+
+class WikiHdrNode(WikiContentNode):
+ type = 'HDR'
+ level = None
+ @mkcontentstr
+ def __str__(self):
+ s = '\nLEVEL: %s' % self.level
+ return s
+
+class WikiEltNode(WikiContentNode):
+ type = 'ELT'
+ subtype = None
+ @mkcontentstr
+ def __str__(self):
+ s = '\nSUBTYPE: %s' % self.subtype
+ return s
+
+class WikiEnvNode(WikiContentNode):
+ type = 'ENV'
+ envtype = None
+ level = None
+ @mkcontentstr
+ def __str__(self):
+ s = '\nLEVEL: %s' % self.level
+ s += '\nENVTYPE: %s' % self.envtype
+ return s
+
+class WikiIndNode(WikiContentNode):
+ type = 'IND'
+ level = None
+ @mkcontentstr
+ def __str__(self):
+ return '\nLEVEL: %s' % self.level
+
+class WikiLinkNode(WikiContentNode):
+ type = 'LINK'
+
+
diff --git a/bin/wikitrans b/bin/wikitrans
index 0cb26a4..7f9789f 100755
--- a/bin/wikitrans
+++ b/bin/wikitrans
@@ -31,7 +31,7 @@ class DumpWikiMarkup (WikiMarkup):
def __str__(self):
if self.tree:
s = StringIO()
- self.dump(self.tree, 0, s)
+ self.dump(self.tree, s)
return s.getvalue()
else:
return ""
diff --git a/testdata/colon.html b/testdata/colon.html
index 2fb6ed3..2e973c1 100644
--- a/testdata/colon.html
+++ b/testdata/colon.html
@@ -1,4 +1,5 @@
<dl><dd>A colon (:) indents a line or paragraph.</dd></dl><p>A newline starts a new paragraph.
Should only be used on talk pages.
-For articles, you probably want the blockquote tag.</p><dl><dd>We use 1 colon to indent once.</dd></dl><dl><dd><dl><dd>We use 2 colons to indent twice.</dd></dl></dd></dl><dl><dd><dl><dd><dl><dd>3 colons to indent 3 times, and so on.</dd></dl></dd></dl></dd></dl>
-
+For articles, you probably want the blockquote tag.
+</p>
+<dl><dd>We use 1 colon to indent once.</dd></dl><dl><dd><dl><dd>We use 2 colons to indent twice.</dd></dl></dd></dl><dl><dd><dl><dd><dl><dd>3 colons to indent 3 times, and so on.</dd></dl></dd></dl></dd></dl>
diff --git a/testdata/headings.html b/testdata/headings.html
index 4dd1203..445c821 100644
--- a/testdata/headings.html
+++ b/testdata/headings.html
@@ -2,10 +2,15 @@
<p><i>Headings</i> organize your writing into
sections. The Wiki software can automatically
-generate a <a href="http://pl.wiktionary.org/wiki/table%20of%20contents">table of contents</a> from them.</p><h3>Subsection</h3>
-<p>Using more "equals" (=) signs creates a subsection.</p><h4>A smaller subsection</h4>
+generate a <a href="http://pl.wiktionary.org/wiki/table%20of%20contents">table of contents</a> from them.</p>
+<h3>Subsection</h3>
+
+<p>Using more "equals" (=) signs creates a subsection.</p>
+<h4>A smaller subsection</h4>
<p>Don't skip levels,
-like from two to four equals signs.</p><p>Start with 2 equals signs not 1
+like from two to four equals signs.</p>
+<p>Start with 2 equals signs not 1
because 1 creates H1 tags
which should be reserved for page title.</p>
+
diff --git a/testdata/hz.html b/testdata/hz.html
index 507a730..3e8941a 100644
--- a/testdata/hz.html
+++ b/testdata/hz.html
@@ -1,4 +1,7 @@
<p>You can make horizontal dividing lines (----)
-to separate text.</p><hr/>
+to separate text.
+</p>
+<hr/>
<p>But you should usually use sections instead,
so that they go in the table of contents.</p>
+
diff --git a/testdata/nowiki-ind.html b/testdata/nowiki-ind.html
index b2bd72c..7c7bdd7 100644
--- a/testdata/nowiki-ind.html
+++ b/testdata/nowiki-ind.html
@@ -1,5 +1,8 @@
-<p>Para</p><pre>
+<p>Para</p>
+<p>
a
b
c
-</pre><p>para</p>
+</p>
+<p>para</p>
+
diff --git a/testdata/nowiki.html b/testdata/nowiki.html
index 346d0c3..126fa01 100644
--- a/testdata/nowiki.html
+++ b/testdata/nowiki.html
@@ -1 +1,2 @@
<p>#:version=1.0<i>rest</i> of line</p>
+
diff --git a/testdata/numlist.html b/testdata/numlist.html
index 0ce8a9a..ac6aab2 100644
--- a/testdata/numlist.html
+++ b/testdata/numlist.html
@@ -1,2 +1,4 @@
-<ol><li><i>Numbered lists</i> are:<ol><li>Very organized</li><li>Easy to follow</li></ol></li></ol><p>A newline marks the end of the list.</p><ol><li>New numbering starts with 1.</li></ol>
+<ol><li><i>Numbered lists</i> are:<ol><li>Very organized</li><li>Easy to follow</li></ol></li></ol><p>A newline marks the end of the list.
+</p>
+<ol><li>New numbering starts with 1.</li></ol>
diff --git a/testdata/para.html b/testdata/para.html
index cd3f732..65819c1 100644
--- a/testdata/para.html
+++ b/testdata/para.html
@@ -1,3 +1,5 @@
<p>First paragraph consists of two sentences.
-Each sentence occupies a line.</p><p>Second paragraph consists of two sentences as well.
+Each sentence occupies a line.</p>
+<p>Second paragraph consists of two sentences as well.
Each of them, again, occupies its own line.</p>
+
diff --git a/testdata/unlist.html b/testdata/unlist.html
index ac4d32b..06693e8 100644
--- a/testdata/unlist.html
+++ b/testdata/unlist.html
@@ -1,2 +1,4 @@
-<ul><li><i>Unordered lists</i> are easy to do:<ul><li>Start every line with a star.<ul><li>More stars indicate a deeper level.</li></ul>Previous item continues.</li><li>A newline</li></ul></li><li>in a list </li></ul><p>marks the end of the list.</p><ul><li>Of course you can start again.</li></ul>
+<ul><li><i>Unordered lists</i> are easy to do:<ul><li>Start every line with a star.<ul><li>More stars indicate a deeper level.</li></ul>Previous item continues.</li><li>A newline</li></ul></li><li>in a list </li></ul><p>marks the end of the list.
+</p>
+<ul><li>Of course you can start again.</li></ul>

Return to:

Send suggestions and report system problems to the System administrator.