1 files changed, 141 insertions, 119 deletions
diff --git a/wikimarkup.py b/wikimarkup.py
index 815e89d..b765594 100644
--- a/wikimarkup.py
+++ b/wikimarkup.py
@@ -23,9 +23,10 @@ __all__ = [ "BaseWikiMarkup", "WikiMarkup",
             "envtypes" ]
 
 delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)|<")
-otag = re.compile("(?P<pfx>[^<]*)<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>.+))?\s*(?P<closed>/)?>")
-ctag = re.compile("(?P<pfx>[^<]*)</(?P<tag>[a-zA-Z0-9_]+)\s*>")
-
+otag = re.compile(r"<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>[^>]*?[^>/\s]))?\s*(?P<closed>/)?>")  # opening tag; args stop at the first '>' and exclude a closing '/'
+ctag = re.compile(r"</(?P<tag>[a-zA-Z0-9_]+)\s*>")  # closing tag
+refstart = re.compile(r"^https?://")  # text that begins an external link target
+
 close_delim = {
     '[': ']',
     '[[': ']]',
@@ -67,7 +68,7 @@ class BaseWikiMarkup(object):
     
     def dump_delim(self, node, level, file):
         file.write("'%s'" % node['content'])
-        if 'continuation' in node:
+        if 'continuation' in node and node['continuation']:
             file.write(" (cont)")
         file.write("\n")
                        
@@ -154,7 +155,17 @@ class BaseWikiMarkup(object):
     def dump(self, tree, level=0, file=sys.stdout):
         for node in tree:
             self.dump_node(node, level, file)
-            
+
+    def rettext(self, text):
+        """Yield tokens for `text`: a TEXT token for its body, then an NL
+        token if it ends with a newline.  An empty body yields no TEXT
+        token, so '' produces nothing instead of raising IndexError."""
+        body, has_nl = (text[:-1], True) if text[-1:] == '\n' else (text, False)
+        if body:
+            yield({ 'type': 'TEXT', 'content': body })
+        if has_nl:
+            yield({ 'type': 'NL', 'content': '\n' })
+
     def tokread(self):
         line = None
         pos = 0
@@ -181,88 +192,87 @@ class BaseWikiMarkup(object):
             if m:
                 if (pos < m.start(0)):
                     yield({'type': 'TEXT', 'content': line[pos:m.start(0)]})
-                pos = m.end(0)
-
-            if m and line[m.start(0)] != '<':
-                content = m.group(0)
-                if content[0] in envtypes:
-                    t = { 'type': 'DELIM',
-                           'content': content,
-                           'continuation': pos < len(line) and line[pos] == ":" }
-                    if t['continuation']:
-                        t['content'] += t['content'][0]
-                        pos += 1
-                        
-                    yield(t)
-
-                    while pos < len(line) and line[pos] in [' ', '\t']:
-                        pos += 1 
-                else:
-                    yield({ 'type': 'DELIM',
-                            'content': content,
-                            'continuation': False})
-            else:
-                if m:
-                    pos -= 1
+                pos = m.start(0)
                 t = None
-                m = otag.match(line, pos)
-                if m and m.group('tag') in self.tags:
-                    rest = line[m.end(0):]
-                    line = m.group('pfx')
-                    pos = 0
-                    t = { 'type': 'OTAG',
-                          'tag': m.group('tag'),
-                          'args': m.group('args') }
-                else:
-                    m = ctag.match(line, pos)
-                    if m and m.group('tag') in self.tags:
-                        rest = line[m.end(0):]
-                        line = m.group('pfx')
-                        pos = 0
-                        t = { 'type': 'CTAG',
-                              'tag': m.group('tag') }
-
-                if line:
-                    if line[-1] == '\n':
-                        if line[pos:-1] != '':
-                            yield({ 'type': 'TEXT',
-                                    'content': line[pos:-1] })
-                        yield({ 'type': 'NL',
-                                'content': '\n' })
-                    else:
-                        yield({ 'type': 'TEXT',
-                                'content': line[pos:] })
 
-                if t:
-                    line = rest
-                    pos = 0
-                    if t['type'] == 'OTAG' and t['tag'] == 'nowiki':
-                        if m.group('closed'):
-                            pass
-                        else:
-                            while 1:
-                                try:
-                                    m = ctag.match(line)
-                                    if m and m.group('tag') == t['tag']:
+                if line[m.start(0)] == '<':
+                    m = otag.match(line, pos)
+                    if m:
+                        pos = m.end(0)
+                        if m.group('tag') == 'nowiki':
+                            if not m.group('closed'):
+                                while 1:
+                                    try:
+                                        m = ctag.match(line)
+                                        if m and m.group('tag') == 'nowiki':
+                                            yield({ 'type': 'TEXT',
+                                                    'content': line[pos:m.start(0)] })
+                                            pos = m.end(0)
+                                            break
+
                                         yield({ 'type': 'TEXT',
-                                                'content': m.group('pfx') })
-                                        pos = m.end(0)
+                                              'content': line[pos:] })
+
+                                        line = self.input()
+                                        pos = 0
+                                    except StopIteration:
                                         break
-                                    
-                                    yield({ 'type': 'TEXT',
-                                            'content': line })
-                                    
-                                    line = self.input()
-                                except StopIteration:
-                                    break
                             continue
-                    
-                    yield(t)
-                    if t['type'] == 'OTAG' and m.group('closed'):
-                        t['type'] = 'CTAG'
+                        elif m.group('tag') in self.tags:
+                            t = { 'type': 'OTAG',
+                                  'tag': m.group('tag'),
+                                  'args': m.group('args') }
+                            yield(t)
+                            if m.group('closed'):
+                                t['type'] = 'CTAG'
+                                yield(t)
+                            continue
+                    else:
+                        m = ctag.match(line, pos)
+                        if m:
+                            if m.group('tag') in self.tags:
+                                yield( { 'type': 'CTAG',
+                                         'tag': m.group('tag') } )
+                                pos = m.end(0)
+                                continue
+                        else:
+                            yield( { 'type': 'TEXT',
+                                     'content': line[pos:pos+1] })
+                            pos += 1
+                            continue
+                else:
+                    pos = m.end(0)
+                    content = m.group(0)
+                    if content[0] in envtypes:
+                        t = { 'type': 'DELIM',
+                              'content': content,
+                              'continuation': pos < len(line) and line[pos] == ":" }
+                        if t['continuation']:
+                            t['content'] += t['content'][0]
+                            pos += 1
+
                         yield(t)
+
+                        while pos < len(line) and line[pos] in [' ', '\t']:
+                            pos += 1 
+                    else:
+                        yield({ 'type': 'DELIM',
+                                'content': content,
+                                'continuation': False})
+                    continue
+                
+            if line:
+                if line[-1] == '\n':
+                    if line[pos:-1] != '':
+                        yield({ 'type': 'TEXT',
+                                'content': line[pos:-1] })
+                    yield({ 'type': 'NL',
+                            'content': '\n' })
                 else:
-                    line = None
+                    yield({ 'type': 'TEXT',
+                            'content': line[pos:] })
+                line = None
+                
 
     def input(self):
         return None
@@ -412,36 +422,40 @@ class BaseWikiMarkup(object):
         return { 'type': type, 'content': subtree }
 
     def parse_ref(self):
-        self.dprint(80, "ENTER parse_ref, tok %s", self.peektkn())
-        list = []
+        tok = self.getkn()  # consumed here; must be TEXT starting with http(s)://
+        self.dprint(80, "ENTER parse_ref, tok %s", tok)
+        if not (tok['type'] == 'TEXT' and refstart.match(tok['content'])):
+            self.dprint(80, "LEAVE parse_ref=None")
+            return None
+
+        seq = []
+        (ref,sep,text) = tok['content'].partition(' ')  # ref = up to first space; text = remainder
+        if text:
+            seq.insert(0, {'type': 'TEXT', 'content': text })
+
         while 1:
             tok = self.getkn()
+            if tok is None or tok['type'] == 'NIL':  # tokens ran out before ']'
+                self.dprint(80, "LEAVE parse_ref=None")
+                return None
             if tok['type'] == 'DELIM':
-                if tok['content'] == "]":
+                if tok['content'] == ']':
                     break
                 else:
-                    x = self.parse_inline(tok)
-                    if x:
-                        list.append(x)
+                    tok = self.parse_inline(tok)
+                    if tok:
+                        seq.append(tok)
                     else:
-                        self.dprint(80, "LEAVE parse_ref=%s", "None")
+                        self.dprint(80, "LEAVE parse_ref=None")
                         return None
-            elif tok['type'] == 'TEXT':
-                list.append(tok)
-            elif tok['type'] == 'NL':
-                list.append({ 'type': 'TEXT', 'content': '\n' })
-                continue
+            elif tok['type'] == 'OTAG':
+                seq.append(self.parse_til(tok))
             else:
-                self.dprint(80, "LEAVE parse_ref=%s", "None")
-                return None
-        if len(list) == 0 or list[0]['type'] != 'TEXT':
-            self.dprint(80, "LEAVE parse_ref=%s", "None")
-            return None
-        (ref,sep,text) = list[0]['content'].partition(' ')
+                seq.append(tok)
+
         ret = { 'type': 'REF', 
                 'ref': ref,
-                'content': { 'type': 'SEQ',
-                           'content': [{ 'type': 'TEXT', 'content': text }] + list[1:] } }
+                'content': { 'type': 'SEQ', 'content': seq } }
         self.dprint(80, "LEAVE parse_ref= %s", ret)
         return ret
 
@@ -465,11 +479,30 @@ class BaseWikiMarkup(object):
             x = self.parse_link('LINK', "]]")
         elif tok['content'] == "{{":
             x = self.parse_link('TMPL', "}}")
-        else: # FIXME
-            self.dprint(80, "LEAVE parse_inline=%s", "None")
+        else:
+            self.dprint(80, "LEAVE parse_inline=%s (unhandled delimiter)", "None")
             x = None
         if not x:
             self.tokind = tokind
+            tok['type'] = 'TEXT'
+            self.dprint(80, "BEGIN DELIMITER RECOVERY: %s", tok)
+            od = tok['content']
+            if od in close_delim:
+                cd = close_delim[od]
+                lev = 0
+                for tok in self.toklist[self.tokind+1:]:
+                    if tok['type'] == 'NIL':
+                        break
+                    elif tok['type'] == 'DELIM':
+                        if tok['content'] == od:
+                            lev += 1
+                        elif tok['content'] == cd:
+                            if lev == 0:
+                                tok['type'] = 'TEXT'
+                            lev -= 1
+                            break
+            self.dprint(80, "END DELIMITER RECOVERY: %s", tok)
+
         self.dprint(80, "LEAVE parse_inline=%s", x)
         return x
 
@@ -521,22 +554,8 @@ class BaseWikiMarkup(object):
                     if x:
                         seq.append(x)
                     else:
-                        self.dprint(80, "ROLLBACK parse_para=%s", tok)
-                        od = tok['content']
-                        textlist.append(od)
-                        if close_delim.has_key(od):
-                            cd = close_delim[od]
-                            lev = 0
-                            for tok in self.toklist[self.tokind:]:
-                                if tok['type'] == 'NIL':
-                                    break
-                                elif tok['type'] == 'DELIM':
-                                    if tok['content'] == od:
-                                        lev += 1
-                                    elif tok['content'] == cd:
-                                        if lev == 0:
-                                            tok['type'] = 'TEXT'
-                                            break
+                        self.ungetkn()
+                        # restart
                 else:
                     seq.append({ 'type': 'TEXT', 'content': tok['content'] })
                 #    self.ungetkn()
@@ -568,6 +587,7 @@ class BaseWikiMarkup(object):
                     if x:
                         list.append(x)
                     else:
+                        self.ungetkn()
                         self.dprint(80, "LEAVE parse_header=%s", "None")
                         return None #FIXME?
             else:
@@ -598,6 +618,8 @@ class BaseWikiMarkup(object):
                         list.append(x)
                     else:
                         list.append(tok)
+            elif tok['type'] == 'OTAG':
+                list.append(self.parse_til(tok))
             else:
                 list.append(tok)
         self.dprint(80, "LEAVE parse_line=(SEQ, %s)", list)
@@ -681,7 +703,7 @@ class BaseWikiMarkup(object):
             
     def parse0(self):
         tok = self.getkn()
-        self.dprint(80, "parse0: %s", tok)
+        self.dprint(80, "ENTER parse0(%s)", tok)
         toktype = tok['type']
         if toktype == 'NIL':
             return None