diff options
author | Sergey Poznyakoff <gray@gnu.org.ua> | 2015-07-15 14:52:15 +0300 |
---|---|---|
committer | Sergey Poznyakoff <gray@gnu.org.ua> | 2015-07-15 14:52:15 +0300 |
commit | 288d3c09c06af73ca6413b9692c06d379de319b1 (patch) | |
tree | 0ef58f2d868a230cff8490e2821636f0b16d927b | |
parent | f97542b428b1a008e2df955cf2047e4b6b9d73d3 (diff) | |
download | wit-288d3c09c06af73ca6413b9692c06d379de319b1.tar.gz wit-288d3c09c06af73ca6413b9692c06d379de319b1.tar.bz2 |
Improve tokenizer and parser.
* wikimarkup.py (ctag,otag): pfx group not needed anymore
(refstart): New global
(tokread): Clean up logic. Handle <<nowiki/>tag> properly.
(parse_ref): Rewrite.
(parse_inline): Recover in case of unmatched delimiters
(parse_line): Handle OTAG tokens.
-rw-r--r-- | wikimarkup.py | 208 |
1 files changed, 115 insertions, 93 deletions
diff --git a/wikimarkup.py b/wikimarkup.py index 815e89d..b765594 100644 --- a/wikimarkup.py +++ b/wikimarkup.py @@ -23,8 +23,9 @@ __all__ = [ "BaseWikiMarkup", "WikiMarkup", "envtypes" ] delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)|<") -otag = re.compile("(?P<pfx>[^<]*)<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>.+))?\s*(?P<closed>/)?>") -ctag = re.compile("(?P<pfx>[^<]*)</(?P<tag>[a-zA-Z0-9_]+)\s*>") +otag = re.compile("<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>.+))?\s*(?P<closed>/)?>") +ctag = re.compile("</(?P<tag>[a-zA-Z0-9_]+)\s*>") +refstart = re.compile("^https?://") close_delim = { '[': ']', @@ -67,7 +68,7 @@ class BaseWikiMarkup(object): def dump_delim(self, node, level, file): file.write("'%s'" % node['content']) - if 'continuation' in node: + if 'continuation' in node and node['continuation']: file.write(" (cont)") file.write("\n") @@ -155,6 +156,16 @@ class BaseWikiMarkup(object): for node in tree: self.dump_node(node, level, file) + def rettext(self, text): + if text[-1] == '\n': + if text[0:-1] != '': + yield({ 'type': 'TEXT', + 'content': text[0:-1] }) + yield({ 'type': 'NL', + 'content': '\n' }) + else: + yield({ 'type': 'TEXT', 'content': text }) + def tokread(self): line = None pos = 0 @@ -181,9 +192,56 @@ class BaseWikiMarkup(object): if m: if (pos < m.start(0)): yield({'type': 'TEXT', 'content': line[pos:m.start(0)]}) + pos = m.start(0) + t = None + + if line[m.start(0)] == '<': + m = otag.match(line, pos) + if m: + pos = m.end(0) + if m.group('tag') == 'nowiki': + if not m.group('closed'): + while 1: + try: + m = ctag.match(line) + if m and m.group('tag') == 'nowiki': + yield({ 'type': 'TEXT', + 'content': line[pos:m.start(0)] }) pos = m.end(0) + break + + yield({ 'type': 'TEXT', + 'content': line[pos:] }) - if m and line[m.start(0)] != '<': + line = self.input() + pos = 0 + except StopIteration: + break + continue + elif m.group('tag') in self.tags: + t = { 'type': 'OTAG', + 
'tag': m.group('tag'), + 'args': m.group('args') } + yield(t) + if m.group('closed'): + t['type'] = 'CTAG' + yield(t) + continue + else: + m = ctag.match(line, pos) + if m: + if m.group('tag') in self.tags: + yield( { 'type': 'CTAG', + 'tag': m.group('tag') } ) + pos = m.end(0) + continue + else: + yield( { 'type': 'TEXT', + 'content': line[pos:pos+1] }) + pos += 1 + continue + else: + pos = m.end(0) content = m.group(0) if content[0] in envtypes: t = { 'type': 'DELIM', @@ -201,26 +259,7 @@ class BaseWikiMarkup(object): yield({ 'type': 'DELIM', 'content': content, 'continuation': False}) - else: - if m: - pos -= 1 - t = None - m = otag.match(line, pos) - if m and m.group('tag') in self.tags: - rest = line[m.end(0):] - line = m.group('pfx') - pos = 0 - t = { 'type': 'OTAG', - 'tag': m.group('tag'), - 'args': m.group('args') } - else: - m = ctag.match(line, pos) - if m and m.group('tag') in self.tags: - rest = line[m.end(0):] - line = m.group('pfx') - pos = 0 - t = { 'type': 'CTAG', - 'tag': m.group('tag') } + continue if line: if line[-1] == '\n': @@ -232,38 +271,9 @@ class BaseWikiMarkup(object): else: yield({ 'type': 'TEXT', 'content': line[pos:] }) - - if t: - line = rest - pos = 0 - if t['type'] == 'OTAG' and t['tag'] == 'nowiki': - if m.group('closed'): - pass - else: - while 1: - try: - m = ctag.match(line) - if m and m.group('tag') == t['tag']: - yield({ 'type': 'TEXT', - 'content': m.group('pfx') }) - pos = m.end(0) - break - - yield({ 'type': 'TEXT', - 'content': line }) - - line = self.input() - except StopIteration: - break - continue - - yield(t) - if t['type'] == 'OTAG' and m.group('closed'): - t['type'] = 'CTAG' - yield(t) - else: line = None + def input(self): return None @@ -412,36 +422,40 @@ class BaseWikiMarkup(object): return { 'type': type, 'content': subtree } def parse_ref(self): - self.dprint(80, "ENTER parse_ref, tok %s", self.peektkn()) - list = [] + tok = self.getkn() + self.dprint(80, "ENTER parse_ref, tok %s", tok) + if not (tok['type'] 
== 'TEXT' and refstart.match(tok['content'])): + self.dprint(80, "LEAVE parse_ref=None") + return None + + seq = [] + (ref,sep,text) = tok['content'].partition(' ') + if text: + seq.insert(0, {'type': 'TEXT', 'content': text }) + while 1: tok = self.getkn() + if tok == None or tok['type'] == 'NIL': + self.dprint(80, "LEAVE parse_ref=None") + return None if tok['type'] == 'DELIM': - if tok['content'] == "]": + if tok['content'] == ']': break else: - x = self.parse_inline(tok) - if x: - list.append(x) + tok = self.parse_inline(tok) + if tok: + seq.append(tok) else: - self.dprint(80, "LEAVE parse_ref=%s", "None") + self.dprint(80, "LEAVE parse_ref=None") return None - elif tok['type'] == 'TEXT': - list.append(tok) - elif tok['type'] == 'NL': - list.append({ 'type': 'TEXT', 'content': '\n' }) - continue + elif tok['type'] == 'OTAG': + list.append(self.parse_til(tok)) else: - self.dprint(80, "LEAVE parse_ref=%s", "None") - return None - if len(list) == 0 or list[0]['type'] != 'TEXT': - self.dprint(80, "LEAVE parse_ref=%s", "None") - return None - (ref,sep,text) = list[0]['content'].partition(' ') + seq.append(tok) + ret = { 'type': 'REF', 'ref': ref, - 'content': { 'type': 'SEQ', - 'content': [{ 'type': 'TEXT', 'content': text }] + list[1:] } } + 'content': { 'type': 'SEQ', 'content': seq } } self.dprint(80, "LEAVE parse_ref= %s", ret) return ret @@ -465,11 +479,30 @@ class BaseWikiMarkup(object): x = self.parse_link('LINK', "]]") elif tok['content'] == "{{": x = self.parse_link('TMPL', "}}") - else: # FIXME - self.dprint(80, "LEAVE parse_inline=%s", "None") + else: + self.dprint(80, "LEAVE parse_inline=%s (unhandled delimiter)", "None") x = None if not x: self.tokind = tokind + tok['type'] = 'TEXT' + self.dprint(80, "BEGIN DELIMITER RECOVERY: %s", tok) + od = tok['content'] + if od in close_delim: + cd = close_delim[od] + lev = 0 + for tok in self.toklist[self.tokind+1:]: + if tok['type'] == 'NIL': + break + elif tok['type'] == 'DELIM': + if tok['content'] == od: + lev 
+= 1 + elif tok['content'] == cd: + if lev == 0: + tok['type'] = 'TEXT' + lev -= 1 + break + self.dprint(80, "END DELIMITER RECOVERY: %s", tok) + self.dprint(80, "LEAVE parse_inline=%s", x) return x @@ -521,22 +554,8 @@ class BaseWikiMarkup(object): if x: seq.append(x) else: - self.dprint(80, "ROLLBACK parse_para=%s", tok) - od = tok['content'] - textlist.append(od) - if close_delim.has_key(od): - cd = close_delim[od] - lev = 0 - for tok in self.toklist[self.tokind:]: - if tok['type'] == 'NIL': - break - elif tok['type'] == 'DELIM': - if tok['content'] == od: - lev += 1 - elif tok['content'] == cd: - if lev == 0: - tok['type'] = 'TEXT' - break + self.ungetkn() + # restart else: seq.append({ 'type': 'TEXT', 'content': tok['content'] }) # self.ungetkn() @@ -568,6 +587,7 @@ class BaseWikiMarkup(object): if x: list.append(x) else: + self.ungetkn() self.dprint(80, "LEAVE parse_header=%s", "None") return None #FIXME? else: @@ -598,6 +618,8 @@ class BaseWikiMarkup(object): list.append(x) else: list.append(tok) + elif tok['type'] == 'OTAG': + list.append(self.parse_til(tok)) else: list.append(tok) self.dprint(80, "LEAVE parse_line=(SEQ, %s)", list) @@ -681,7 +703,7 @@ class BaseWikiMarkup(object): def parse0(self): tok = self.getkn() - self.dprint(80, "parse0: %s", tok) + self.dprint(80, "ENTER parse0(%s)", tok) toktype = tok['type'] if toktype == 'NIL': return None |