diff options
-rw-r--r-- | wikimarkup.py | 260 |
1 files changed, 141 insertions, 119 deletions
diff --git a/wikimarkup.py b/wikimarkup.py index 815e89d..b765594 100644 --- a/wikimarkup.py +++ b/wikimarkup.py @@ -23,9 +23,10 @@ __all__ = [ "BaseWikiMarkup", "WikiMarkup", "envtypes" ] delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)|<") -otag = re.compile("(?P<pfx>[^<]*)<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>.+))?\s*(?P<closed>/)?>") -ctag = re.compile("(?P<pfx>[^<]*)</(?P<tag>[a-zA-Z0-9_]+)\s*>") - +otag = re.compile("<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>.+))?\s*(?P<closed>/)?>") +ctag = re.compile("</(?P<tag>[a-zA-Z0-9_]+)\s*>") +refstart = re.compile("^https?://") + close_delim = { '[': ']', '[[': ']]', @@ -67,7 +68,7 @@ class BaseWikiMarkup(object): def dump_delim(self, node, level, file): file.write("'%s'" % node['content']) - if 'continuation' in node: + if 'continuation' in node and node['continuation']: file.write(" (cont)") file.write("\n") @@ -154,7 +155,17 @@ class BaseWikiMarkup(object): def dump(self, tree, level=0, file=sys.stdout): for node in tree: self.dump_node(node, level, file) - + + def rettext(self, text): + if text[-1] == '\n': + if text[0:-1] != '': + yield({ 'type': 'TEXT', + 'content': text[0:-1] }) + yield({ 'type': 'NL', + 'content': '\n' }) + else: + yield({ 'type': 'TEXT', 'content': text }) + def tokread(self): line = None pos = 0 @@ -181,88 +192,87 @@ class BaseWikiMarkup(object): if m: if (pos < m.start(0)): yield({'type': 'TEXT', 'content': line[pos:m.start(0)]}) - pos = m.end(0) - - if m and line[m.start(0)] != '<': - content = m.group(0) - if content[0] in envtypes: - t = { 'type': 'DELIM', - 'content': content, - 'continuation': pos < len(line) and line[pos] == ":" } - if t['continuation']: - t['content'] += t['content'][0] - pos += 1 - - yield(t) - - while pos < len(line) and line[pos] in [' ', '\t']: - pos += 1 - else: - yield({ 'type': 'DELIM', - 'content': content, - 'continuation': False}) - else: - if m: - pos -= 1 + pos = m.start(0) t = None - m = otag.match(line, pos) - if m and m.group('tag') in self.tags: - rest = line[m.end(0):] - line = m.group('pfx') - pos = 0 - t = { 'type': 'OTAG', - 'tag': m.group('tag'), - 'args': m.group('args') } - else: - m = ctag.match(line, pos) - if m and m.group('tag') in self.tags: - rest = line[m.end(0):] - line = m.group('pfx') - pos = 0 - t = { 'type': 'CTAG', - 'tag': m.group('tag') } - - if line: - if line[-1] == '\n': - if line[pos:-1] != '': - yield({ 'type': 'TEXT', - 'content': line[pos:-1] }) - yield({ 'type': 'NL', - 'content': '\n' }) - else: - yield({ 'type': 'TEXT', - 'content': line[pos:] }) - if t: - line = rest - pos = 0 - if t['type'] == 'OTAG' and t['tag'] == 'nowiki': - if m.group('closed'): - pass - else: - while 1: - try: - m = ctag.match(line) - if m and m.group('tag') == t['tag']: + if line[m.start(0)] == '<': + m = otag.match(line, pos) + if m: + pos = m.end(0) + if m.group('tag') == 'nowiki': + if not m.group('closed'): + while 1: + try: + m = ctag.match(line) + if m and m.group('tag') == 'nowiki': + yield({ 'type': 'TEXT', + 'content': line[pos:m.start(0)] }) + pos = m.end(0) + break + yield({ 'type': 'TEXT', - 'content': m.group('pfx') }) - pos = m.end(0) + 'content': line[pos:] }) + + line = self.input() + pos = 0 + except StopIteration: break - - yield({ 'type': 'TEXT', - 'content': line }) - - line = self.input() - except StopIteration: - break continue - - yield(t) - if t['type'] == 'OTAG' and m.group('closed'): - t['type'] = 'CTAG' + elif m.group('tag') in self.tags: + t = { 'type': 'OTAG', + 'tag': m.group('tag'), + 'args': m.group('args') } + yield(t) + if m.group('closed'): + t['type'] = 'CTAG' + yield(t) + continue + else: + m = ctag.match(line, pos) + if m: + if m.group('tag') in self.tags: + yield( { 'type': 'CTAG', + 'tag': m.group('tag') } ) + pos = m.end(0) + continue + else: + yield( { 'type': 'TEXT', + 'content': line[pos:pos+1] }) + pos += 1 + continue + else: + pos = m.end(0) + content = m.group(0) + if content[0] in envtypes: + t = { 'type': 'DELIM', + 'content': content, + 'continuation': pos < len(line) and line[pos] == ":" } + if t['continuation']: + t['content'] += t['content'][0] + pos += 1 + yield(t) + + while pos < len(line) and line[pos] in [' ', '\t']: + pos += 1 + else: + yield({ 'type': 'DELIM', + 'content': content, + 'continuation': False}) + continue + + if line: + if line[-1] == '\n': + if line[pos:-1] != '': + yield({ 'type': 'TEXT', + 'content': line[pos:-1] }) + yield({ 'type': 'NL', + 'content': '\n' }) else: - line = None + yield({ 'type': 'TEXT', + 'content': line[pos:] }) + line = None + def input(self): return None @@ -412,36 +422,40 @@ class BaseWikiMarkup(object): return { 'type': type, 'content': subtree } def parse_ref(self): - self.dprint(80, "ENTER parse_ref, tok %s", self.peektkn()) - list = [] + tok = self.getkn() + self.dprint(80, "ENTER parse_ref, tok %s", tok) + if not (tok['type'] == 'TEXT' and refstart.match(tok['content'])): + self.dprint(80, "LEAVE parse_ref=None") + return None + + seq = [] + (ref,sep,text) = tok['content'].partition(' ') + if text: + seq.insert(0, {'type': 'TEXT', 'content': text }) + while 1: tok = self.getkn() + if tok == None or tok['type'] == 'NIL': + self.dprint(80, "LEAVE parse_ref=None") + return None if tok['type'] == 'DELIM': - if tok['content'] == "]": + if tok['content'] == ']': break else: - x = self.parse_inline(tok) - if x: - list.append(x) + tok = self.parse_inline(tok) + if tok: + seq.append(tok) else: - self.dprint(80, "LEAVE parse_ref=%s", "None") + self.dprint(80, "LEAVE parse_ref=None") return None - elif tok['type'] == 'TEXT': - list.append(tok) - elif tok['type'] == 'NL': - list.append({ 'type': 'TEXT', 'content': '\n' }) - continue + elif tok['type'] == 'OTAG': + list.append(self.parse_til(tok)) else: - self.dprint(80, "LEAVE parse_ref=%s", "None") - return None - if len(list) == 0 or list[0]['type'] != 'TEXT': - self.dprint(80, "LEAVE parse_ref=%s", "None") - return None - (ref,sep,text) = list[0]['content'].partition(' ') + seq.append(tok) + ret = { 'type': 'REF', 'ref': ref, - 'content': { 'type': 'SEQ', - 'content': [{ 'type': 'TEXT', 'content': text }] + list[1:] } } + 'content': { 'type': 'SEQ', 'content': seq } } self.dprint(80, "LEAVE parse_ref= %s", ret) return ret @@ -465,11 +479,30 @@ class BaseWikiMarkup(object): x = self.parse_link('LINK', "]]") elif tok['content'] == "{{": x = self.parse_link('TMPL', "}}") - else: # FIXME - self.dprint(80, "LEAVE parse_inline=%s", "None") + else: + self.dprint(80, "LEAVE parse_inline=%s (unhandled delimiter)", "None") x = None if not x: self.tokind = tokind + tok['type'] = 'TEXT' + self.dprint(80, "BEGIN DELIMITER RECOVERY: %s", tok) + od = tok['content'] + if od in close_delim: + cd = close_delim[od] + lev = 0 + for tok in self.toklist[self.tokind+1:]: + if tok['type'] == 'NIL': + break + elif tok['type'] == 'DELIM': + if tok['content'] == od: + lev += 1 + elif tok['content'] == cd: + if lev == 0: + tok['type'] = 'TEXT' + lev -= 1 + break + self.dprint(80, "END DELIMITER RECOVERY: %s", tok) + self.dprint(80, "LEAVE parse_inline=%s", x) return x @@ -521,22 +554,8 @@ class BaseWikiMarkup(object): if x: seq.append(x) else: - self.dprint(80, "ROLLBACK parse_para=%s", tok) - od = tok['content'] - textlist.append(od) - if close_delim.has_key(od): - cd = close_delim[od] - lev = 0 - for tok in self.toklist[self.tokind:]: - if tok['type'] == 'NIL': - break - elif tok['type'] == 'DELIM': - if tok['content'] == od: - lev += 1 - elif tok['content'] == cd: - if lev == 0: - tok['type'] = 'TEXT' - break + self.ungetkn() + # restart else: seq.append({ 'type': 'TEXT', 'content': tok['content'] }) # self.ungetkn() @@ -568,6 +587,7 @@ class BaseWikiMarkup(object): if x: list.append(x) else: + self.ungetkn() self.dprint(80, "LEAVE parse_header=%s", "None") return None #FIXME? else: @@ -598,6 +618,8 @@ class BaseWikiMarkup(object): list.append(x) else: list.append(tok) + elif tok['type'] == 'OTAG': + list.append(self.parse_til(tok)) else: list.append(tok) self.dprint(80, "LEAVE parse_line=(SEQ, %s)", list) @@ -681,7 +703,7 @@ class BaseWikiMarkup(object): def parse0(self): tok = self.getkn() - self.dprint(80, "parse0: %s", tok) + self.dprint(80, "ENTER parse0(%s)", tok) toktype = tok['type'] if toktype == 'NIL': return None |