summary | refs | log | tree | commit | diff | about
path: root/wikimarkup.py
Side-by-side diff
Diffstat (limited to 'wikimarkup.py') (more/less context) (ignore whitespace changes)
-rw-r--r-- wikimarkup.py 317
1 files changed, 247 insertions, 70 deletions
diff --git a/wikimarkup.py b/wikimarkup.py
index fde1ec1..9a79d1e 100644
--- a/wikimarkup.py
+++ b/wikimarkup.py
@@ -19,15 +19,15 @@ import sys
import re
from types import *
__all__ = [ "BaseWikiMarkup", "WikiMarkup",
"envtypes" ]
-delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)")
-otag = re.compile("^\s*<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>.+))?\s*(?P<closed>/)?>")
-ctag = re.compile("^\s*</(?P<tag>[a-zA-Z0-9_]+)\s*>")
+delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)|<")
+otag = re.compile("(?P<pfx>[^<]*)<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>.+))?\s*(?P<closed>/)?>")
+ctag = re.compile("(?P<pfx>[^<]*)</(?P<tag>[a-zA-Z0-9_]+)\s*>")
close_delim = {
'[': ']',
'[[': ']]',
'{{': '}}'
}
@@ -43,22 +43,121 @@ class BaseWikiMarkup(object):
toklist = None
tokind = 0
newline = 0
tree = None
+ tags = [ 'code', 'nowiki', 'tt', 'div' ]
+
nested = 0
debug_level = 0
def dprint(self, lev, fmt, *argv):
if self.debug_level >= lev:
print "[DEBUG]", fmt % argv
- def input_tag(self, tag):
+ def print_dump_prefix(self, level, file):
+ file.write("[DUMP]" + ' ' * (2*level + 1))
+
+ def dump_nil(self, node, level, file):
pass
+ def dump_text(self, node, level, file):
+ self.print_dump_prefix(level, file)
+ file.write("CONTENT: \"%s\"\n" % node['content'])
+
+ def dump_delim(self, node, level, file):
+ file.write("'%s'" % node['content'])
+ if 'continuation' in node:
+ file.write(" (cont)")
+ file.write("\n")
+
+ def dump_tag(self, node, level, file):
+ self.print_dump_prefix(level, file)
+ file.write("TAG: %s\n" % node['tag'])
+ if 'args' in node:
+ self.print_dump_prefix(level, file)
+ file.write("ARGS: %s\n" % node['args'])
+ if 'content' in node:
+ self.dump_node(node['content'], level + 1, file)
+
+ def dump_seq(self, node, level, file):
+ self.dump(node['content'], level + 1, file)
+
+ def dump_ref(self, node, level, file):
+ self.print_dump_prefix(level, file)
+ file.write("REF: %s\n" % node['ref'])
+ self.dump_node(node['content'], level + 1, file)
+
+ def dump_hdr(self, node, level, file):
+ self.print_dump_prefix(level, file)
+ file.write("LEVEL: %s\n" % node['level'])
+ self.dump_node(node['content'], level + 1, file)
+
+ def dump_elt(self, node, level, file):
+ self.print_dump_prefix(level, file)
+ file.write("SUBTYPE: %s\n" % node['subtype'])
+ self.dump_node(node['content'], level + 1, file)
+
+ def dump_env(self, node, level, file):
+ self.print_dump_prefix(level, file)
+ file.write("ENVTYPE: %s\n" % node['envtype'])
+ self.print_dump_prefix(level, file)
+ file.write("LEVEL: %s\n" % node['level'])
+ self.dump(node['content'], level + 1, file)
+
+ def dump_ind(self, node, level, file):
+ self.print_dump_prefix(level, file)
+ file.write("LEVEL: %s\n" % node['level'])
+ self.dump_node(node['content'], level + 1, file)
+
+ def dump_link(self, node, level, file):
+ self.dump(node['content'], level + 1, file)
+
+ dump_type = {
+ 'NIL': dump_nil,
+ 'NL': dump_nil,
+ 'TEXT': dump_text,
+ 'DELIM': dump_delim,
+ 'OTAG': dump_tag,
+ 'CTAG': dump_tag,
+ 'TAG': dump_tag,
+ 'SEQ': dump_seq,
+ 'REF': dump_ref,
+ 'HDR': dump_hdr,
+ 'ELT': dump_elt,
+ 'ENV': dump_env,
+ 'IND': dump_ind,
+ 'BAR': dump_nil,
+ 'PARA': dump_seq,
+ 'PRE': dump_text,
+ 'BOLD': dump_seq,
+ 'IT': dump_seq,
+ 'LINK': dump_link,
+ }
+
+ def dump_node(self, node, level, file):
+ if type(node) != dict:
+ file.write("UNHANDLED NODE: %s, %s\n" % (type(node),node))
+ return
+
+ self.print_dump_prefix(level, file)
+ file.write("NODE " + node['type'] + ":\n")
+ if node['type'] in self.dump_type:
+ self.dump_type[node['type']](self, node, level, file)
+ else:
+ self.print_dump_prefix(level, file)
+ file.write("(UNHANDLED) ")
+ file.write("%s\n" % node)
+ self.print_dump_prefix(level, file)
+ file.write("END NODE " + node['type'] + "\n")
+
+ def dump(self, tree, level=0, file=sys.stdout):
+ for node in tree:
+ self.dump_node(node, level, file)
+
def tokread(self):
line = None
pos = 0
while 1:
if (not line or pos == len(line)):
try:
@@ -80,58 +179,78 @@ class BaseWikiMarkup(object):
m = delim.search(line, pos)
if m:
if (pos < m.start(0)):
yield({'type': 'TEXT', 'content': line[pos:m.start(0)]})
pos = m.end(0)
- if envtypes.has_key(m.group(0)[0]) and line[pos] == ":":
- # FIXME?
- # FIXME: What's "extra"?
+
+ if m and line[m.start(0)] != '<':
+ if m.group(0)[0] in envtypes and pos < len(line) and line[pos] == ":":
yield({ 'type': 'DELIM',
- 'content': m.group(0) })
+ 'content': m.group(0),
+ 'continuation': True })
pos += 1
else:
yield({ 'type': 'DELIM',
'content': m.group(0) })
else:
- m = otag.match(line)
if m:
- t = { 'type': 'TAG',
+ pos -= 1
+ t = None
+ m = otag.match(line, pos)
+ if m and m.group('tag') in self.tags:
+ rest = line[m.end(0):]
+ line = m.group('pfx')
+ pos = 0
+ t = { 'type': 'OTAG',
'tag': m.group('tag'),
'args': m.group('args') }
-
- if self.input_tag(t):
+ else:
+ m = ctag.match(line, pos)
+ if m and m.group('tag') in self.tags:
+ rest = line[m.end(0):]
+ line = m.group('pfx')
+ pos = 0
+ t = { 'type': 'CTAG',
+ 'tag': m.group('tag') }
+
+ if line:
+ if line[-1] == '\n':
+ if line[pos:-1] != '':
+ yield({ 'type': 'TEXT',
+ 'content': line[pos:-1] })
+ yield({ 'type': 'NL',
+ 'content': '\n' })
+ else:
+ yield({ 'type': 'TEXT',
+ 'content': line[pos:] })
+
+ if t:
+ if t['type'] == 'OTAG' and t['tag'] == 'nowiki':
s = ''
if not m.group('closed'):
while 1:
try:
l = self.input()
m = ctag.match(l)
if m and m.group('tag') == t['tag']:
break
s += l
except StopIteration:
break
- yield({ 'type': 'TAG',
- 'tag': t['tag'],
- 'args': t['args'],
- 'content': s
- })
- line = None
- continue
-
- if line[-1] == '\n':
- if line[pos:-1] != '':
- yield({ 'type': 'TEXT',
- 'content': line[pos:-1] })
- yield({ 'type': 'NL',
- 'content': '\n' })
+ t['type'] = 'TAG'
+ t['content'] = {'type': 'TEXT', 'content': s}
+
+ yield(t)
+ if t['type'] == 'OTAG' and m.group('closed'):
+ t['type'] = 'CTAG'
+ yield(t)
+ line = rest
+ pos = 0
else:
- yield({ 'type': 'TEXT',
- 'content': line[pos:] })
- line = None
+ line = None
def input(self):
return None
def swaptkn(self, i, j):
self.dprint(80, "SWAPPING %s <-> %s", i, j)
@@ -191,38 +310,39 @@ class BaseWikiMarkup(object):
def setkn(self,val):
self.toklist[self.tokind] = val
def getkn(self):
self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL'
+ if self.tokind == len(self.toklist):
+ return { 'type': 'NIL' }
tok = self.toklist[self.tokind]
- if tok['type'] != 'NIL':
- self.tokind = self.tokind + 1
+ self.tokind = self.tokind + 1
return tok
def ungetkn(self):
self.tokind = self.tokind - 1
self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL'
return self.toklist[self.tokind]
def parse_fontmod(self,delim,what):
self.dprint(80, "ENTER parse_fontmod(%s,%s), tok %s",
delim, what, self.peektkn())
seq = []
- textlist = []
+ text = ''
while 1:
tok = self.getkn()
if tok['type'] == 'TEXT':
- textlist.append(tok['content'])
+ text += tok['content']
elif tok['type'] == 'DELIM':
if tok['content'] == delim:
break
elif self.is_inline_delim(tok):
- if textlist:
- seq.append({ 'type': 'TEXT', 'content': textlist })
- textlist = []
+ if text:
+ seq.append({ 'type': 'TEXT', 'content': text })
+ text = ''
x = self.parse_inline(tok)
if x:
seq.append(x)
else:
self.dprint(80, "LEAVE parse_fontmod=%s", "None")
return None
@@ -234,14 +354,14 @@ class BaseWikiMarkup(object):
self.dprint(80, "LEAVE parse_fontmod=None")
return None
seq.append({ 'type': 'TEXT', 'content': '\n' })
else:
self.dprint(80, "LEAVE parse_fontmod=None")
return None
- if textlist:
- seq.append({ 'type': 'TEXT', 'content': textlist })
+ if text:
+ seq.append({ 'type': 'TEXT', 'content': text })
res = { 'type': what, 'content': seq }
self.dprint(80, "LEAVE parse_fontmod=%s", res)
return res
def parse_link(self, type, delim):
self.dprint(80, "ENTER parse_link(%s,%s), tok %s",
@@ -340,22 +460,28 @@ class BaseWikiMarkup(object):
def parse_para(self):
self.dprint(80, "ENTER parse_para, tok %s", self.peektkn())
seq = []
textlist = []
tok = self.peektkn()
- if re.match("^\s", tok['content']):
- type = 'PRE'
- rx = re.compile("^\S")
+
+ if self.newline:
+ if re.match("^\s", tok['content']):
+ type = 'PRE'
+ rx = re.compile("^\S")
+ else:
+ type = 'PARA'
+ rx = re.compile("^\s")
else:
- type = 'PARA'
- rx = re.compile("^\s")
+ type = 'SEQ'
+ rx = None
+
while 1:
tok = self.getkn()
if tok['type'] == 'TEXT':
- if self.newline and rx.match(tok['content']):
+ if rx and self.newline and rx.match(tok['content']):
self.ungetkn()
break
textlist.append(tok['content'])
elif tok['type'] == 'NL':
tok = self.getkn()
if tok['type'] == 'NL' or tok['type'] == 'NIL':
@@ -364,16 +490,20 @@ class BaseWikiMarkup(object):
self.ungetkn()
if self.is_block_delim(tok):
break
textlist.append('\n')
elif tok['type'] == 'NIL':
break
+ elif tok['type'] == 'OTAG' or tok['type'] == 'CTAG' or tok['type'] == 'TAG':
+ self.ungetkn()
+ break
elif tok['type'] == 'DELIM':
if self.is_inline_delim(tok):
if textlist:
- seq.append({ 'type': 'TEXT', 'content': textlist })
+ seq.append({ 'type': 'TEXT',
+ 'content': ''.join(textlist) })
textlist = []
x = self.parse_inline(tok)
if x:
seq.append(x)
else:
self.dprint(80, "ROLLBACK parse_para=%s", tok)
@@ -394,13 +524,13 @@ class BaseWikiMarkup(object):
break
else:
seq.append({ 'type': 'TEXT', 'content': tok['content'] })
# self.ungetkn()
break
if textlist:
- seq.append({ 'type': 'TEXT', 'content': textlist })
+ seq.append({ 'type': 'TEXT', 'content': ''.join(textlist) })
self.dprint(80, "LEAVE parse_para=%s", seq)
return { 'type': type, 'content': seq }
def parse_header(self, delim):
self.dprint(80, "ENTER parse_header(%s), tok %s", delim, self.peektkn())
list = []
@@ -440,107 +570,158 @@ class BaseWikiMarkup(object):
while 1:
tok = self.getkn()
if tok['type'] == 'NL' or tok['type'] == 'NIL':
break
elif tok['type'] == 'TEXT':
list.append(tok)
- elif tok['type'] == 'DELIM' and tok['content'][0] == ":":
- list.append(self.parse_indent(len(tok['content'])))
- break
- else:
- x = self.parse_inline(tok)
- if x:
- list.append(x)
+ elif tok['type'] == 'DELIM':
+ if tok['content'][0] == ":":
+ list.append(self.parse_indent(len(tok['content'])))
+ break
else:
- list.append(tok)
+ x = self.parse_inline(tok)
+ if x:
+ list.append(x)
+ else:
+ list.append(tok)
+ else:
+ list.append(tok)
self.dprint(80, "LEAVE parse_line=(SEQ, %s)", list)
return { 'type': 'SEQ', 'content': list }
def parse_env(self, type, lev):
self.dprint(80, "ENTER parse_env(%s,%s), tok %s",type,lev,self.peektkn())
list = []
while 1:
tok = self.getkn()
if tok['type'] == 'DELIM' \
- and envtypes.has_key(tok['content'][0]) \
+ and tok['content'][0] in envtypes \
and type == envtypes[tok['content'][0]][0]:
if len(tok['content']) < lev:
self.ungetkn()
break
elif len(tok['content']) > lev:
self.ungetkn()
elt = self.parse_env(type, len(tok['content']))
else:
elt = self.parse_line()
- if len(tok.keys()) == 2:
+ if 'continuation' not in tok:
list.append({ 'type': 'ELT',
'subtype': envtypes[tok['content'][0]][1],
'content': elt })
continue
- if list[-1]['content']['type'] != 'SEQ':
- x = list[-1]['content']['content']
- # FIXME:
- list[-1]['content'] = { 'type': 'SEQ', 'content': [x] }
- list[-1]['content']['content'].append(elt)
+ if list:
+ if list[-1]['content']['type'] != 'SEQ':
+ x = list[-1]['content']['content']
+ # FIXME:
+ list[-1]['content'] = { 'type': 'SEQ', 'content': [x] }
+ list[-1]['content']['content'].append(elt)
else:
self.ungetkn()
break
self.dprint(80, "LEAVE parse_env=(ENV, %s, %s, %s)", type, lev, list)
return { 'type': 'ENV', 'envtype': type, 'level': lev, 'content': list }
def parse_indent(self, lev):
self.dprint(80, "ENTER parse_indent(%s), tok %s", lev, self.peektkn())
x = { 'type': 'IND', 'level': lev, 'content': self.parse_line() }
self.dprint(80, "LEAVE parse_indent=%s", x)
return x
+ def parse_til(self, tag):
+ self.dprint(80, "ENTER parse_til(%s)", tag)
+ seq = []
+ save = self.tokind
+ while 1:
+ t = self.parse0()
+ if t == None or t['type'] == 'NIL':
+ self.tokind = save
+ s = '<' + tag['tag']
+ if 'args' in tag and tag['args']:
+ s += ' ' + tag['args']
+ del tag['args']
+ s += '>'
+ if 'content' in tag:
+ subtree = tag['content']
+ else:
+ subtree = None
+ tag['type'] = 'TEXT'
+ tag['content'] = s
+ if subtree:
+ self.tree[self.tokind:self.tokind] = subtree
+ self.dprint(80, "LEAVE parse_til = %s (tree modified)", tag)
+ self.ungetkn()
+ return self.parse0()
+
+ if t['type'] == 'CTAG' and tag['tag'] == t['tag']:
+ break
+ seq.append(t)
+
+ ret = { 'type': 'TAG',
+ 'tag': tag['tag'],
+ 'args': tag['args'],
+ 'content': { 'type': 'SEQ', 'content': seq } }
+ self.dprint(80, "LEAVE parse_til = %s", ret)
+ return ret
+
def parse0(self):
tok = self.getkn()
+ self.dprint(80, "parse0: %s", tok)
toktype = tok['type']
if toktype == 'NIL':
return None
elif toktype == 'TEXT':
self.ungetkn()
return self.parse_para()
elif toktype == 'DELIM':
if tok['content'] == "----":
return { 'type': 'BAR' }
elif tok['content'][0:2] == "==":
return self.parse_header(tok['content'])
- elif envtypes.has_key(tok['content'][0]):
+ elif tok['content'][0] in envtypes:
type = envtypes[tok['content'][0]][0]
lev = len(tok['content'])
self.ungetkn()
return self.parse_env(type, lev)
elif tok['content'][0] == ":":
return self.parse_indent(len(tok['content']))
else:
self.ungetkn()
return self.parse_para()
elif toktype == 'NL':
return { 'type': 'TEXT', 'content': '\n' }
-# return self.parse0()
+ elif toktype == 'OTAG':
+ return self.parse_til(tok)
else:
return tok
def parse(self):
if not self.toklist:
self.tokenize()
- self.dprint(90, "TOKLIST: %s", self.toklist)
+ if self.debug_level >= 90:
+ print("TOKEN DUMP BEGIN")
+ self.dump(self.toklist)
+ print("TOKEN DUMP END")
+
self.tokind = 0
self.tree = []
while 1:
subtree = self.parse0()
if subtree == None:
break
self.tree.append(subtree)
+
if self.nested:
if self.tree[0]['type'] == 'PARA':
self.tree[0]['type'] = 'SEQ'
- self.dprint(70, "TREE: %s", self.tree)
+
+ if self.debug_level >= 70:
+ print("TREE DUMP BEGIN")
+ self.dump(self.tree)
+ print("TREE DUMP END")
def __str__(self):
return str(self.tree)
class WikiMarkup (BaseWikiMarkup):
@@ -616,17 +797,13 @@ class WikiMarkup (BaseWikiMarkup):
if m: # and m.group(1) in self.langtab:
return True
return False
def is_empty_text(self, elt):
if elt['type'] == 'TEXT':
- if isinstance(elt['content'],list):
- for s in elt['content']:
- if re.search('\w', s):
- return False
- elif re.search('\w', elt['content']):
+ if re.search('\w', elt['content']):
return False
return True
return False
def is_empty_para(self, seq):
for x in seq:

Return to:

Send suggestions and report system problems to the System administrator.