diff options
author | Sergey Poznyakoff <gray@gnu.org.ua> | 2015-07-15 14:52:15 +0300 |
---|---|---|
committer | Sergey Poznyakoff <gray@gnu.org.ua> | 2015-07-15 14:52:15 +0300 |
commit | 288d3c09c06af73ca6413b9692c06d379de319b1 (patch) | |
tree | 0ef58f2d868a230cff8490e2821636f0b16d927b | |
parent | f97542b428b1a008e2df955cf2047e4b6b9d73d3 (diff) | |
download | wit-288d3c09c06af73ca6413b9692c06d379de319b1.tar.gz wit-288d3c09c06af73ca6413b9692c06d379de319b1.tar.bz2 |
Improve tokenizer and parser.
* wikimarkup.py (ctag,otag): pfx group not needed anymore
(refstart): New global
(tokread): Clean up logic. Handle <<nowiki/>tag> properly.
(parse_ref): Rewrite.
(parse_inline): Recover in case of unmatched delimiters
(parse_line): Handle OTAG tokens.
-rw-r--r-- | wikimarkup.py | 260 |
1 file changed, 141 insertions, 119 deletions
diff --git a/wikimarkup.py b/wikimarkup.py index 815e89d..b765594 100644 --- a/wikimarkup.py +++ b/wikimarkup.py @@ -20,15 +20,16 @@ import re from types import * __all__ = [ "BaseWikiMarkup", "WikiMarkup", "envtypes" ] delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)|<") -otag = re.compile("(?P<pfx>[^<]*)<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>.+))?\s*(?P<closed>/)?>") -ctag = re.compile("(?P<pfx>[^<]*)</(?P<tag>[a-zA-Z0-9_]+)\s*>") - +otag = re.compile("<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>.+))?\s*(?P<closed>/)?>") +ctag = re.compile("</(?P<tag>[a-zA-Z0-9_]+)\s*>") +refstart = re.compile("^https?://") + close_delim = { '[': ']', '[[': ']]', '{{': '}}' } @@ -64,13 +65,13 @@ class BaseWikiMarkup(object): def dump_text(self, node, level, file): self.print_dump_prefix(level, file) file.write("CONTENT: \"%s\"\n" % node['content']) def dump_delim(self, node, level, file): file.write("'%s'" % node['content']) - if 'continuation' in node: + if 'continuation' in node and node['continuation']: file.write(" (cont)") file.write("\n") def dump_tag(self, node, level, file): self.print_dump_prefix(level, file) file.write("TAG: %s\n" % node['tag']) @@ -151,13 +152,23 @@ class BaseWikiMarkup(object): self.print_dump_prefix(level, file) file.write("END NODE " + node['type'] + "\n") def dump(self, tree, level=0, file=sys.stdout): for node in tree: self.dump_node(node, level, file) - + + def rettext(self, text): + if text[-1] == '\n': + if text[0:-1] != '': + yield({ 'type': 'TEXT', + 'content': text[0:-1] }) + yield({ 'type': 'NL', + 'content': '\n' }) + else: + yield({ 'type': 'TEXT', 'content': text }) + def tokread(self): line = None pos = 0 while 1: if (not line or pos == len(line)): try: @@ -178,94 +189,93 @@ class BaseWikiMarkup(object): self.dprint(100, "LINE: %s", line[pos:]) m = delim.search(line, pos) if m: if (pos < m.start(0)): yield({'type': 'TEXT', 'content': line[pos:m.start(0)]}) - pos = 
m.end(0) - - if m and line[m.start(0)] != '<': - content = m.group(0) - if content[0] in envtypes: - t = { 'type': 'DELIM', - 'content': content, - 'continuation': pos < len(line) and line[pos] == ":" } - if t['continuation']: - t['content'] += t['content'][0] - pos += 1 - - yield(t) - - while pos < len(line) and line[pos] in [' ', '\t']: - pos += 1 - else: - yield({ 'type': 'DELIM', - 'content': content, - 'continuation': False}) - else: - if m: - pos -= 1 + pos = m.start(0) t = None - m = otag.match(line, pos) - if m and m.group('tag') in self.tags: - rest = line[m.end(0):] - line = m.group('pfx') - pos = 0 - t = { 'type': 'OTAG', - 'tag': m.group('tag'), - 'args': m.group('args') } - else: - m = ctag.match(line, pos) - if m and m.group('tag') in self.tags: - rest = line[m.end(0):] - line = m.group('pfx') - pos = 0 - t = { 'type': 'CTAG', - 'tag': m.group('tag') } - - if line: - if line[-1] == '\n': - if line[pos:-1] != '': - yield({ 'type': 'TEXT', - 'content': line[pos:-1] }) - yield({ 'type': 'NL', - 'content': '\n' }) - else: - yield({ 'type': 'TEXT', - 'content': line[pos:] }) - if t: - line = rest - pos = 0 - if t['type'] == 'OTAG' and t['tag'] == 'nowiki': - if m.group('closed'): - pass - else: - while 1: - try: - m = ctag.match(line) - if m and m.group('tag') == t['tag']: + if line[m.start(0)] == '<': + m = otag.match(line, pos) + if m: + pos = m.end(0) + if m.group('tag') == 'nowiki': + if not m.group('closed'): + while 1: + try: + m = ctag.match(line) + if m and m.group('tag') == 'nowiki': + yield({ 'type': 'TEXT', + 'content': line[pos:m.start(0)] }) + pos = m.end(0) + break + yield({ 'type': 'TEXT', - 'content': m.group('pfx') }) - pos = m.end(0) + 'content': line[pos:] }) + + line = self.input() + pos = 0 + except StopIteration: break - - yield({ 'type': 'TEXT', - 'content': line }) - - line = self.input() - except StopIteration: - break continue - - yield(t) - if t['type'] == 'OTAG' and m.group('closed'): - t['type'] = 'CTAG' + elif m.group('tag') 
in self.tags: + t = { 'type': 'OTAG', + 'tag': m.group('tag'), + 'args': m.group('args') } + yield(t) + if m.group('closed'): + t['type'] = 'CTAG' + yield(t) + continue + else: + m = ctag.match(line, pos) + if m: + if m.group('tag') in self.tags: + yield( { 'type': 'CTAG', + 'tag': m.group('tag') } ) + pos = m.end(0) + continue + else: + yield( { 'type': 'TEXT', + 'content': line[pos:pos+1] }) + pos += 1 + continue + else: + pos = m.end(0) + content = m.group(0) + if content[0] in envtypes: + t = { 'type': 'DELIM', + 'content': content, + 'continuation': pos < len(line) and line[pos] == ":" } + if t['continuation']: + t['content'] += t['content'][0] + pos += 1 + yield(t) + + while pos < len(line) and line[pos] in [' ', '\t']: + pos += 1 + else: + yield({ 'type': 'DELIM', + 'content': content, + 'continuation': False}) + continue + + if line: + if line[-1] == '\n': + if line[pos:-1] != '': + yield({ 'type': 'TEXT', + 'content': line[pos:-1] }) + yield({ 'type': 'NL', + 'content': '\n' }) else: - line = None + yield({ 'type': 'TEXT', + 'content': line[pos:] }) + line = None + def input(self): return None def swaptkn(self, i, j): self.dprint(80, "SWAPPING %s <-> %s", i, j) @@ -409,42 +419,46 @@ class BaseWikiMarkup(object): self.dprint(80, "LEAVE parse_link=%s", "None") return None self.dprint(80, "LEAVE parse_link=(%s,%s)", type, subtree) return { 'type': type, 'content': subtree } def parse_ref(self): - self.dprint(80, "ENTER parse_ref, tok %s", self.peektkn()) - list = [] + tok = self.getkn() + self.dprint(80, "ENTER parse_ref, tok %s", tok) + if not (tok['type'] == 'TEXT' and refstart.match(tok['content'])): + self.dprint(80, "LEAVE parse_ref=None") + return None + + seq = [] + (ref,sep,text) = tok['content'].partition(' ') + if text: + seq.insert(0, {'type': 'TEXT', 'content': text }) + while 1: tok = self.getkn() + if tok == None or tok['type'] == 'NIL': + self.dprint(80, "LEAVE parse_ref=None") + return None if tok['type'] == 'DELIM': - if tok['content'] == 
"]": + if tok['content'] == ']': break else: - x = self.parse_inline(tok) - if x: - list.append(x) + tok = self.parse_inline(tok) + if tok: + seq.append(tok) else: - self.dprint(80, "LEAVE parse_ref=%s", "None") + self.dprint(80, "LEAVE parse_ref=None") return None - elif tok['type'] == 'TEXT': - list.append(tok) - elif tok['type'] == 'NL': - list.append({ 'type': 'TEXT', 'content': '\n' }) - continue + elif tok['type'] == 'OTAG': + list.append(self.parse_til(tok)) else: - self.dprint(80, "LEAVE parse_ref=%s", "None") - return None - if len(list) == 0 or list[0]['type'] != 'TEXT': - self.dprint(80, "LEAVE parse_ref=%s", "None") - return None - (ref,sep,text) = list[0]['content'].partition(' ') + seq.append(tok) + ret = { 'type': 'REF', 'ref': ref, - 'content': { 'type': 'SEQ', - 'content': [{ 'type': 'TEXT', 'content': text }] + list[1:] } } + 'content': { 'type': 'SEQ', 'content': seq } } self.dprint(80, "LEAVE parse_ref= %s", ret) return ret inline_delims = [ "''", "'''", "[", "[[", "{{", "|" ] def is_inline_delim(self, tok): @@ -462,17 +476,36 @@ class BaseWikiMarkup(object): elif tok['content'] == "[": x = self.parse_ref() elif tok['content'] == "[[": x = self.parse_link('LINK', "]]") elif tok['content'] == "{{": x = self.parse_link('TMPL', "}}") - else: # FIXME - self.dprint(80, "LEAVE parse_inline=%s", "None") + else: + self.dprint(80, "LEAVE parse_inline=%s (unhandled delimiter)", "None") x = None if not x: self.tokind = tokind + tok['type'] = 'TEXT' + self.dprint(80, "BEGIN DELIMITER RECOVERY: %s", tok) + od = tok['content'] + if od in close_delim: + cd = close_delim[od] + lev = 0 + for tok in self.toklist[self.tokind+1:]: + if tok['type'] == 'NIL': + break + elif tok['type'] == 'DELIM': + if tok['content'] == od: + lev += 1 + elif tok['content'] == cd: + if lev == 0: + tok['type'] = 'TEXT' + lev -= 1 + break + self.dprint(80, "END DELIMITER RECOVERY: %s", tok) + self.dprint(80, "LEAVE parse_inline=%s", x) return x def parse_para(self): self.dprint(80, 
"ENTER parse_para, tok %s", self.peektkn()) seq = [] @@ -518,28 +551,14 @@ class BaseWikiMarkup(object): 'content': ''.join(textlist) }) textlist = [] x = self.parse_inline(tok) if x: seq.append(x) else: - self.dprint(80, "ROLLBACK parse_para=%s", tok) - od = tok['content'] - textlist.append(od) - if close_delim.has_key(od): - cd = close_delim[od] - lev = 0 - for tok in self.toklist[self.tokind:]: - if tok['type'] == 'NIL': - break - elif tok['type'] == 'DELIM': - if tok['content'] == od: - lev += 1 - elif tok['content'] == cd: - if lev == 0: - tok['type'] = 'TEXT' - break + self.ungetkn() + # restart else: seq.append({ 'type': 'TEXT', 'content': tok['content'] }) # self.ungetkn() break if textlist: seq.append({ 'type': 'TEXT', 'content': ''.join(textlist) }) @@ -565,12 +584,13 @@ class BaseWikiMarkup(object): return None else: x = self.parse_inline(tok) if x: list.append(x) else: + self.ungetkn() self.dprint(80, "LEAVE parse_header=%s", "None") return None #FIXME? else: self.dprint(80, "LEAVE parse_header=%s", "None") return None self.dprint(80, "LEAVE parse_header=(HDR, %s, (SEQ,%s))",len(delim)-1,list) @@ -595,12 +615,14 @@ class BaseWikiMarkup(object): else: x = self.parse_inline(tok) if x: list.append(x) else: list.append(tok) + elif tok['type'] == 'OTAG': + list.append(self.parse_til(tok)) else: list.append(tok) self.dprint(80, "LEAVE parse_line=(SEQ, %s)", list) return { 'type': 'SEQ', 'content': list } def parse_env(self, type, lev): @@ -678,13 +700,13 @@ class BaseWikiMarkup(object): 'content': { 'type': 'SEQ', 'content': seq } } self.dprint(80, "LEAVE parse_til = %s", ret) return ret def parse0(self): tok = self.getkn() - self.dprint(80, "parse0: %s", tok) + self.dprint(80, "ENTER parse0(%s)", tok) toktype = tok['type'] if toktype == 'NIL': return None elif toktype == 'TEXT': self.ungetkn() return self.parse_para() |