aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSergey Poznyakoff <gray@gnu.org.ua>2015-07-15 14:52:15 +0300
committerSergey Poznyakoff <gray@gnu.org.ua>2015-07-15 14:52:15 +0300
commit288d3c09c06af73ca6413b9692c06d379de319b1 (patch)
tree0ef58f2d868a230cff8490e2821636f0b16d927b
parentf97542b428b1a008e2df955cf2047e4b6b9d73d3 (diff)
downloadwit-288d3c09c06af73ca6413b9692c06d379de319b1.tar.gz
wit-288d3c09c06af73ca6413b9692c06d379de319b1.tar.bz2
Improve tokenizer and parser.
* wikimarkup.py (ctag, otag): pfx group not needed anymore.
(refstart): New global.
(tokread): Clean up logic.  Handle <</nowiki>tag> properly.
(parse_ref): Rewrite.
(parse_inline): Recover in case of unmatched delimiters.
(parse_line): Handle OTAG tokens.
-rw-r--r--wikimarkup.py260
1 files changed, 141 insertions, 119 deletions
diff --git a/wikimarkup.py b/wikimarkup.py
index 815e89d..b765594 100644
--- a/wikimarkup.py
+++ b/wikimarkup.py
@@ -20,15 +20,16 @@ import re
from types import *
__all__ = [ "BaseWikiMarkup", "WikiMarkup",
"envtypes" ]
delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)|<")
-otag = re.compile("(?P<pfx>[^<]*)<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>.+))?\s*(?P<closed>/)?>")
-ctag = re.compile("(?P<pfx>[^<]*)</(?P<tag>[a-zA-Z0-9_]+)\s*>")
-
+otag = re.compile("<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>.+))?\s*(?P<closed>/)?>")
+ctag = re.compile("</(?P<tag>[a-zA-Z0-9_]+)\s*>")
+refstart = re.compile("^https?://")
+
close_delim = {
'[': ']',
'[[': ']]',
'{{': '}}'
}
@@ -64,13 +65,13 @@ class BaseWikiMarkup(object):
def dump_text(self, node, level, file):
self.print_dump_prefix(level, file)
file.write("CONTENT: \"%s\"\n" % node['content'])
def dump_delim(self, node, level, file):
file.write("'%s'" % node['content'])
- if 'continuation' in node:
+ if 'continuation' in node and node['continuation']:
file.write(" (cont)")
file.write("\n")
def dump_tag(self, node, level, file):
self.print_dump_prefix(level, file)
file.write("TAG: %s\n" % node['tag'])
@@ -151,13 +152,23 @@ class BaseWikiMarkup(object):
self.print_dump_prefix(level, file)
file.write("END NODE " + node['type'] + "\n")
def dump(self, tree, level=0, file=sys.stdout):
for node in tree:
self.dump_node(node, level, file)
-
+
+ def rettext(self, text):
+ if text[-1] == '\n':
+ if text[0:-1] != '':
+ yield({ 'type': 'TEXT',
+ 'content': text[0:-1] })
+ yield({ 'type': 'NL',
+ 'content': '\n' })
+ else:
+ yield({ 'type': 'TEXT', 'content': text })
+
def tokread(self):
line = None
pos = 0
while 1:
if (not line or pos == len(line)):
try:
@@ -178,94 +189,93 @@ class BaseWikiMarkup(object):
self.dprint(100, "LINE: %s", line[pos:])
m = delim.search(line, pos)
if m:
if (pos < m.start(0)):
yield({'type': 'TEXT', 'content': line[pos:m.start(0)]})
- pos = m.end(0)
-
- if m and line[m.start(0)] != '<':
- content = m.group(0)
- if content[0] in envtypes:
- t = { 'type': 'DELIM',
- 'content': content,
- 'continuation': pos < len(line) and line[pos] == ":" }
- if t['continuation']:
- t['content'] += t['content'][0]
- pos += 1
-
- yield(t)
-
- while pos < len(line) and line[pos] in [' ', '\t']:
- pos += 1
- else:
- yield({ 'type': 'DELIM',
- 'content': content,
- 'continuation': False})
- else:
- if m:
- pos -= 1
+ pos = m.start(0)
t = None
- m = otag.match(line, pos)
- if m and m.group('tag') in self.tags:
- rest = line[m.end(0):]
- line = m.group('pfx')
- pos = 0
- t = { 'type': 'OTAG',
- 'tag': m.group('tag'),
- 'args': m.group('args') }
- else:
- m = ctag.match(line, pos)
- if m and m.group('tag') in self.tags:
- rest = line[m.end(0):]
- line = m.group('pfx')
- pos = 0
- t = { 'type': 'CTAG',
- 'tag': m.group('tag') }
-
- if line:
- if line[-1] == '\n':
- if line[pos:-1] != '':
- yield({ 'type': 'TEXT',
- 'content': line[pos:-1] })
- yield({ 'type': 'NL',
- 'content': '\n' })
- else:
- yield({ 'type': 'TEXT',
- 'content': line[pos:] })
- if t:
- line = rest
- pos = 0
- if t['type'] == 'OTAG' and t['tag'] == 'nowiki':
- if m.group('closed'):
- pass
- else:
- while 1:
- try:
- m = ctag.match(line)
- if m and m.group('tag') == t['tag']:
+ if line[m.start(0)] == '<':
+ m = otag.match(line, pos)
+ if m:
+ pos = m.end(0)
+ if m.group('tag') == 'nowiki':
+ if not m.group('closed'):
+ while 1:
+ try:
+ m = ctag.match(line)
+ if m and m.group('tag') == 'nowiki':
+ yield({ 'type': 'TEXT',
+ 'content': line[pos:m.start(0)] })
+ pos = m.end(0)
+ break
+
yield({ 'type': 'TEXT',
- 'content': m.group('pfx') })
- pos = m.end(0)
+ 'content': line[pos:] })
+
+ line = self.input()
+ pos = 0
+ except StopIteration:
break
-
- yield({ 'type': 'TEXT',
- 'content': line })
-
- line = self.input()
- except StopIteration:
- break
continue
-
- yield(t)
- if t['type'] == 'OTAG' and m.group('closed'):
- t['type'] = 'CTAG'
+ elif m.group('tag') in self.tags:
+ t = { 'type': 'OTAG',
+ 'tag': m.group('tag'),
+ 'args': m.group('args') }
+ yield(t)
+ if m.group('closed'):
+ t['type'] = 'CTAG'
+ yield(t)
+ continue
+ else:
+ m = ctag.match(line, pos)
+ if m:
+ if m.group('tag') in self.tags:
+ yield( { 'type': 'CTAG',
+ 'tag': m.group('tag') } )
+ pos = m.end(0)
+ continue
+ else:
+ yield( { 'type': 'TEXT',
+ 'content': line[pos:pos+1] })
+ pos += 1
+ continue
+ else:
+ pos = m.end(0)
+ content = m.group(0)
+ if content[0] in envtypes:
+ t = { 'type': 'DELIM',
+ 'content': content,
+ 'continuation': pos < len(line) and line[pos] == ":" }
+ if t['continuation']:
+ t['content'] += t['content'][0]
+ pos += 1
+
yield(t)
+
+ while pos < len(line) and line[pos] in [' ', '\t']:
+ pos += 1
+ else:
+ yield({ 'type': 'DELIM',
+ 'content': content,
+ 'continuation': False})
+ continue
+
+ if line:
+ if line[-1] == '\n':
+ if line[pos:-1] != '':
+ yield({ 'type': 'TEXT',
+ 'content': line[pos:-1] })
+ yield({ 'type': 'NL',
+ 'content': '\n' })
else:
- line = None
+ yield({ 'type': 'TEXT',
+ 'content': line[pos:] })
+ line = None
+
def input(self):
return None
def swaptkn(self, i, j):
self.dprint(80, "SWAPPING %s <-> %s", i, j)
@@ -409,42 +419,46 @@ class BaseWikiMarkup(object):
self.dprint(80, "LEAVE parse_link=%s", "None")
return None
self.dprint(80, "LEAVE parse_link=(%s,%s)", type, subtree)
return { 'type': type, 'content': subtree }
def parse_ref(self):
- self.dprint(80, "ENTER parse_ref, tok %s", self.peektkn())
- list = []
+ tok = self.getkn()
+ self.dprint(80, "ENTER parse_ref, tok %s", tok)
+ if not (tok['type'] == 'TEXT' and refstart.match(tok['content'])):
+ self.dprint(80, "LEAVE parse_ref=None")
+ return None
+
+ seq = []
+ (ref,sep,text) = tok['content'].partition(' ')
+ if text:
+ seq.insert(0, {'type': 'TEXT', 'content': text })
+
while 1:
tok = self.getkn()
+ if tok == None or tok['type'] == 'NIL':
+ self.dprint(80, "LEAVE parse_ref=None")
+ return None
if tok['type'] == 'DELIM':
- if tok['content'] == "]":
+ if tok['content'] == ']':
break
else:
- x = self.parse_inline(tok)
- if x:
- list.append(x)
+ tok = self.parse_inline(tok)
+ if tok:
+ seq.append(tok)
else:
- self.dprint(80, "LEAVE parse_ref=%s", "None")
+ self.dprint(80, "LEAVE parse_ref=None")
return None
- elif tok['type'] == 'TEXT':
- list.append(tok)
- elif tok['type'] == 'NL':
- list.append({ 'type': 'TEXT', 'content': '\n' })
- continue
+ elif tok['type'] == 'OTAG':
+ list.append(self.parse_til(tok))
else:
- self.dprint(80, "LEAVE parse_ref=%s", "None")
- return None
- if len(list) == 0 or list[0]['type'] != 'TEXT':
- self.dprint(80, "LEAVE parse_ref=%s", "None")
- return None
- (ref,sep,text) = list[0]['content'].partition(' ')
+ seq.append(tok)
+
ret = { 'type': 'REF',
'ref': ref,
- 'content': { 'type': 'SEQ',
- 'content': [{ 'type': 'TEXT', 'content': text }] + list[1:] } }
+ 'content': { 'type': 'SEQ', 'content': seq } }
self.dprint(80, "LEAVE parse_ref= %s", ret)
return ret
inline_delims = [ "''", "'''", "[", "[[", "{{", "|" ]
def is_inline_delim(self, tok):
@@ -462,17 +476,36 @@ class BaseWikiMarkup(object):
elif tok['content'] == "[":
x = self.parse_ref()
elif tok['content'] == "[[":
x = self.parse_link('LINK', "]]")
elif tok['content'] == "{{":
x = self.parse_link('TMPL', "}}")
- else: # FIXME
- self.dprint(80, "LEAVE parse_inline=%s", "None")
+ else:
+ self.dprint(80, "LEAVE parse_inline=%s (unhandled delimiter)", "None")
x = None
if not x:
self.tokind = tokind
+ tok['type'] = 'TEXT'
+ self.dprint(80, "BEGIN DELIMITER RECOVERY: %s", tok)
+ od = tok['content']
+ if od in close_delim:
+ cd = close_delim[od]
+ lev = 0
+ for tok in self.toklist[self.tokind+1:]:
+ if tok['type'] == 'NIL':
+ break
+ elif tok['type'] == 'DELIM':
+ if tok['content'] == od:
+ lev += 1
+ elif tok['content'] == cd:
+ if lev == 0:
+ tok['type'] = 'TEXT'
+ lev -= 1
+ break
+ self.dprint(80, "END DELIMITER RECOVERY: %s", tok)
+
self.dprint(80, "LEAVE parse_inline=%s", x)
return x
def parse_para(self):
self.dprint(80, "ENTER parse_para, tok %s", self.peektkn())
seq = []
@@ -518,28 +551,14 @@ class BaseWikiMarkup(object):
'content': ''.join(textlist) })
textlist = []
x = self.parse_inline(tok)
if x:
seq.append(x)
else:
- self.dprint(80, "ROLLBACK parse_para=%s", tok)
- od = tok['content']
- textlist.append(od)
- if close_delim.has_key(od):
- cd = close_delim[od]
- lev = 0
- for tok in self.toklist[self.tokind:]:
- if tok['type'] == 'NIL':
- break
- elif tok['type'] == 'DELIM':
- if tok['content'] == od:
- lev += 1
- elif tok['content'] == cd:
- if lev == 0:
- tok['type'] = 'TEXT'
- break
+ self.ungetkn()
+ # restart
else:
seq.append({ 'type': 'TEXT', 'content': tok['content'] })
# self.ungetkn()
break
if textlist:
seq.append({ 'type': 'TEXT', 'content': ''.join(textlist) })
@@ -565,12 +584,13 @@ class BaseWikiMarkup(object):
return None
else:
x = self.parse_inline(tok)
if x:
list.append(x)
else:
+ self.ungetkn()
self.dprint(80, "LEAVE parse_header=%s", "None")
return None #FIXME?
else:
self.dprint(80, "LEAVE parse_header=%s", "None")
return None
self.dprint(80, "LEAVE parse_header=(HDR, %s, (SEQ,%s))",len(delim)-1,list)
@@ -595,12 +615,14 @@ class BaseWikiMarkup(object):
else:
x = self.parse_inline(tok)
if x:
list.append(x)
else:
list.append(tok)
+ elif tok['type'] == 'OTAG':
+ list.append(self.parse_til(tok))
else:
list.append(tok)
self.dprint(80, "LEAVE parse_line=(SEQ, %s)", list)
return { 'type': 'SEQ', 'content': list }
def parse_env(self, type, lev):
@@ -678,13 +700,13 @@ class BaseWikiMarkup(object):
'content': { 'type': 'SEQ', 'content': seq } }
self.dprint(80, "LEAVE parse_til = %s", ret)
return ret
def parse0(self):
tok = self.getkn()
- self.dprint(80, "parse0: %s", tok)
+ self.dprint(80, "ENTER parse0(%s)", tok)
toktype = tok['type']
if toktype == 'NIL':
return None
elif toktype == 'TEXT':
self.ungetkn()
return self.parse_para()

Return to:

Send suggestions and report system problems to the System administrator.