Improve tokenizer and parser.

* wikimarkup.py (ctag, otag): The pfx group is not needed anymore. (refstart): New global. (tokread): Clean up logic. Handle <</nowiki>tag> properly. (parse_ref): Rewrite. (parse_inline): Recover in case of unmatched delimiters. (parse_line): Handle OTAG tokens.
author: Sergey Poznyakoff <gray@gnu.org.ua> 2015-07-15 14:52:15 +0300
committer: Sergey Poznyakoff <gray@gnu.org.ua> 2015-07-15 14:52:15 +0300
commit: 288d3c09c06af73ca6413b9692c06d379de319b1 (patch)
tree: 0ef58f2d868a230cff8490e2821636f0b16d927b
parent: f97542b428b1a008e2df955cf2047e4b6b9d73d3 (diff)
download: wit-288d3c09c06af73ca6413b9692c06d379de319b1.tar.gz
wit-288d3c09c06af73ca6413b9692c06d379de319b1.tar.bz2
1 files changed, 141 insertions, 119 deletions
diff --git a/wikimarkup.py b/wikimarkup.py
index 815e89d..b765594 100644
--- a/wikimarkup.py
+++ b/wikimarkup.py
@@ -20,15 +20,16 @@ import re
 from types import *
 
 __all__ = [ "BaseWikiMarkup", "WikiMarkup",
             "envtypes" ]
 
 delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)|<")
-otag = re.compile("(?P<pfx>[^<]*)<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>.+))?\s*(?P<closed>/)?>")
-ctag = re.compile("(?P<pfx>[^<]*)</(?P<tag>[a-zA-Z0-9_]+)\s*>")
-
+otag = re.compile("<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>.+))?\s*(?P<closed>/)?>")
+ctag = re.compile("</(?P<tag>[a-zA-Z0-9_]+)\s*>")
+refstart = re.compile("^https?://")
+    
 close_delim = {
     '[': ']',
     '[[': ']]',
     '{{': '}}'
 }
 
@@ -64,13 +65,13 @@ class BaseWikiMarkup(object):
     def dump_text(self, node, level, file):
         self.print_dump_prefix(level, file)
         file.write("CONTENT: \"%s\"\n" % node['content'])
     
     def dump_delim(self, node, level, file):
         file.write("'%s'" % node['content'])
-        if 'continuation' in node:
+        if 'continuation' in node and node['continuation']:
             file.write(" (cont)")
         file.write("\n")
                        
     def dump_tag(self, node, level, file):
         self.print_dump_prefix(level, file)
         file.write("TAG: %s\n" % node['tag'])
@@ -151,13 +152,23 @@ class BaseWikiMarkup(object):
         self.print_dump_prefix(level, file)
         file.write("END NODE " + node['type'] + "\n")
                 
     def dump(self, tree, level=0, file=sys.stdout):
         for node in tree:
             self.dump_node(node, level, file)
-            
+
+    def rettext(self, text):
+        if text[-1] == '\n':
+            if text[0:-1] != '':
+                yield({ 'type': 'TEXT',
+                        'content': text[0:-1] })
+            yield({ 'type': 'NL',
+                    'content': '\n' })
+        else:
+            yield({ 'type': 'TEXT', 'content': text })
+
     def tokread(self):
         line = None
         pos = 0
         while 1:
             if (not line or pos == len(line)):
                 try:
@@ -178,94 +189,93 @@ class BaseWikiMarkup(object):
             self.dprint(100, "LINE: %s", line[pos:])
             m = delim.search(line, pos)
             
             if m:
                 if (pos < m.start(0)):
                     yield({'type': 'TEXT', 'content': line[pos:m.start(0)]})
-                pos = m.end(0)
-
-            if m and line[m.start(0)] != '<':
-                content = m.group(0)
-                if content[0] in envtypes:
-                    t = { 'type': 'DELIM',
-                           'content': content,
-                           'continuation': pos < len(line) and line[pos] == ":" }
-                    if t['continuation']:
-                        t['content'] += t['content'][0]
-                        pos += 1
-                        
-                    yield(t)
-
-                    while pos < len(line) and line[pos] in [' ', '\t']:
-                        pos += 1 
-                else:
-                    yield({ 'type': 'DELIM',
-                            'content': content,
-                            'continuation': False})
-            else:
-                if m:
-                    pos -= 1
+                pos = m.start(0)
                 t = None
-                m = otag.match(line, pos)
-                if m and m.group('tag') in self.tags:
-                    rest = line[m.end(0):]
-                    line = m.group('pfx')
-                    pos = 0
-                    t = { 'type': 'OTAG',
-                          'tag': m.group('tag'),
-                          'args': m.group('args') }
-                else:
-                    m = ctag.match(line, pos)
-                    if m and m.group('tag') in self.tags:
-                        rest = line[m.end(0):]
-                        line = m.group('pfx')
-                        pos = 0
-                        t = { 'type': 'CTAG',
-                              'tag': m.group('tag') }
-
-                if line:
-                    if line[-1] == '\n':
-                        if line[pos:-1] != '':
-                            yield({ 'type': 'TEXT',
-                                    'content': line[pos:-1] })
-                        yield({ 'type': 'NL',
-                                'content': '\n' })
-                    else:
-                        yield({ 'type': 'TEXT',
-                                'content': line[pos:] })
 
-                if t:
-                    line = rest
-                    pos = 0
-                    if t['type'] == 'OTAG' and t['tag'] == 'nowiki':
-                        if m.group('closed'):
-                            pass
-                        else:
-                            while 1:
-                                try:
-                                    m = ctag.match(line)
-                                    if m and m.group('tag') == t['tag']:
+                if line[m.start(0)] == '<':
+                    m = otag.match(line, pos)
+                    if m:
+                        pos = m.end(0)
+                        if m.group('tag') == 'nowiki':
+                            if not m.group('closed'):
+                                while 1:
+                                    try:
+                                        m = ctag.match(line)
+                                        if m and m.group('tag') == 'nowiki':
+                                            yield({ 'type': 'TEXT',
+                                                    'content': line[pos:m.start(0)] })
+                                            pos = m.end(0)
+                                            break
+
                                         yield({ 'type': 'TEXT',
-                                                'content': m.group('pfx') })
-                                        pos = m.end(0)
+                                              'content': line[pos:] })
+
+                                        line = self.input()
+                                        pos = 0
+                                    except StopIteration:
                                         break
-                                    
-                                    yield({ 'type': 'TEXT',
-                                            'content': line })
-                                    
-                                    line = self.input()
-                                except StopIteration:
-                                    break
                             continue
-                    
-                    yield(t)
-                    if t['type'] == 'OTAG' and m.group('closed'):
-                        t['type'] = 'CTAG'
+                        elif m.group('tag') in self.tags:
+                            t = { 'type': 'OTAG',
+                                  'tag': m.group('tag'),
+                                  'args': m.group('args') }
+                            yield(t)
+                            if m.group('closed'):
+                                t['type'] = 'CTAG'
+                                yield(t)
+                            continue
+                    else:
+                        m = ctag.match(line, pos)
+                        if m:
+                            if m.group('tag') in self.tags:
+                                yield( { 'type': 'CTAG',
+                                         'tag': m.group('tag') } )
+                                pos = m.end(0)
+                                continue
+                        else:
+                            yield( { 'type': 'TEXT',
+                                     'content': line[pos:pos+1] })
+                            pos += 1
+                            continue
+                else:
+                    pos = m.end(0)
+                    content = m.group(0)
+                    if content[0] in envtypes:
+                        t = { 'type': 'DELIM',
+                              'content': content,
+                              'continuation': pos < len(line) and line[pos] == ":" }
+                        if t['continuation']:
+                            t['content'] += t['content'][0]
+                            pos += 1
+
                         yield(t)
+
+                        while pos < len(line) and line[pos] in [' ', '\t']:
+                            pos += 1 
+                    else:
+                        yield({ 'type': 'DELIM',
+                                'content': content,
+                                'continuation': False})
+                    continue
+                
+            if line:
+                if line[-1] == '\n':
+                    if line[pos:-1] != '':
+                        yield({ 'type': 'TEXT',
+                                'content': line[pos:-1] })
+                    yield({ 'type': 'NL',
+                            'content': '\n' })
                 else:
-                    line = None
+                    yield({ 'type': 'TEXT',
+                            'content': line[pos:] })
+                line = None
+                
 
     def input(self):
         return None
 
     def swaptkn(self, i, j):
         self.dprint(80, "SWAPPING %s <-> %s", i, j)
@@ -409,42 +419,46 @@ class BaseWikiMarkup(object):
                 self.dprint(80, "LEAVE parse_link=%s", "None")
                 return None
         self.dprint(80, "LEAVE parse_link=(%s,%s)", type, subtree)
         return { 'type': type, 'content': subtree }
 
     def parse_ref(self):
-        self.dprint(80, "ENTER parse_ref, tok %s", self.peektkn())
-        list = []
+        tok = self.getkn()
+        self.dprint(80, "ENTER parse_ref, tok %s", tok)
+        if not (tok['type'] == 'TEXT' and refstart.match(tok['content'])):
+            self.dprint(80, "LEAVE parse_ref=None")
+            return None
+
+        seq = []
+        (ref,sep,text) = tok['content'].partition(' ')
+        if text:
+            seq.insert(0, {'type': 'TEXT', 'content': text })
+
         while 1:
             tok = self.getkn()
+            if tok == None or tok['type'] == 'NIL':
+                self.dprint(80, "LEAVE parse_ref=None")
+                return None
             if tok['type'] == 'DELIM':
-                if tok['content'] == "]":
+                if tok['content'] == ']':
                     break
                 else:
-                    x = self.parse_inline(tok)
-                    if x:
-                        list.append(x)
+                    tok = self.parse_inline(tok)
+                    if tok:
+                        seq.append(tok)
                     else:
-                        self.dprint(80, "LEAVE parse_ref=%s", "None")
+                        self.dprint(80, "LEAVE parse_ref=None")
                         return None
-            elif tok['type'] == 'TEXT':
-                list.append(tok)
-            elif tok['type'] == 'NL':
-                list.append({ 'type': 'TEXT', 'content': '\n' })
-                continue
+            elif tok['type'] == 'OTAG':
+                list.append(self.parse_til(tok))
             else:
-                self.dprint(80, "LEAVE parse_ref=%s", "None")
-                return None
-        if len(list) == 0 or list[0]['type'] != 'TEXT':
-            self.dprint(80, "LEAVE parse_ref=%s", "None")
-            return None
-        (ref,sep,text) = list[0]['content'].partition(' ')
+                seq.append(tok)
+
         ret = { 'type': 'REF', 
                 'ref': ref,
-                'content': { 'type': 'SEQ',
-                           'content': [{ 'type': 'TEXT', 'content': text }] + list[1:] } }
+                'content': { 'type': 'SEQ', 'content': seq } }
         self.dprint(80, "LEAVE parse_ref= %s", ret)
         return ret
 
     inline_delims = [ "''", "'''", "[", "[[", "{{", "|" ]
 
     def is_inline_delim(self, tok):
@@ -462,17 +476,36 @@ class BaseWikiMarkup(object):
         elif tok['content'] == "[":
             x = self.parse_ref()
         elif tok['content'] == "[[":
             x = self.parse_link('LINK', "]]")
         elif tok['content'] == "{{":
             x = self.parse_link('TMPL', "}}")
-        else: # FIXME
-            self.dprint(80, "LEAVE parse_inline=%s", "None")
+        else:
+            self.dprint(80, "LEAVE parse_inline=%s (unhandled delimiter)", "None")
             x = None
         if not x:
             self.tokind = tokind
+            tok['type'] = 'TEXT'
+            self.dprint(80, "BEGIN DELIMITER RECOVERY: %s", tok)
+            od = tok['content']
+            if od in close_delim:
+                cd = close_delim[od]
+                lev = 0
+                for tok in self.toklist[self.tokind+1:]:
+                    if tok['type'] == 'NIL':
+                        break
+                    elif tok['type'] == 'DELIM':
+                        if tok['content'] == od:
+                            lev += 1
+                        elif tok['content'] == cd:
+                            if lev == 0:
+                                tok['type'] = 'TEXT'
+                            lev -= 1
+                            break
+            self.dprint(80, "END DELIMITER RECOVERY: %s", tok)
+
         self.dprint(80, "LEAVE parse_inline=%s", x)
         return x
 
     def parse_para(self):
         self.dprint(80, "ENTER parse_para, tok %s", self.peektkn())
         seq = []
@@ -518,28 +551,14 @@ class BaseWikiMarkup(object):
                                      'content': ''.join(textlist) })
                         textlist = []
                     x = self.parse_inline(tok)
                     if x:
                         seq.append(x)
                     else:
-                        self.dprint(80, "ROLLBACK parse_para=%s", tok)
-                        od = tok['content']
-                        textlist.append(od)
-                        if close_delim.has_key(od):
-                            cd = close_delim[od]
-                            lev = 0
-                            for tok in self.toklist[self.tokind:]:
-                                if tok['type'] == 'NIL':
-                                    break
-                                elif tok['type'] == 'DELIM':
-                                    if tok['content'] == od:
-                                        lev += 1
-                                    elif tok['content'] == cd:
-                                        if lev == 0:
-                                            tok['type'] = 'TEXT'
-                                            break
+                        self.ungetkn()
+                        # restart
                 else:
                     seq.append({ 'type': 'TEXT', 'content': tok['content'] })
                 #    self.ungetkn()
                     break
         if textlist:
             seq.append({ 'type': 'TEXT', 'content': ''.join(textlist) })
@@ -565,12 +584,13 @@ class BaseWikiMarkup(object):
                         return None
                 else:
                     x = self.parse_inline(tok)
                     if x:
                         list.append(x)
                     else:
+                        self.ungetkn()
                         self.dprint(80, "LEAVE parse_header=%s", "None")
                         return None #FIXME?
             else:
                 self.dprint(80, "LEAVE parse_header=%s", "None")
                 return None
         self.dprint(80, "LEAVE parse_header=(HDR, %s, (SEQ,%s))",len(delim)-1,list)
@@ -595,12 +615,14 @@ class BaseWikiMarkup(object):
                 else:
                     x = self.parse_inline(tok)
                     if x:
                         list.append(x)
                     else:
                         list.append(tok)
+            elif tok['type'] == 'OTAG':
+                list.append(self.parse_til(tok))
             else:
                 list.append(tok)
         self.dprint(80, "LEAVE parse_line=(SEQ, %s)", list)
         return { 'type': 'SEQ', 'content': list }
     
     def parse_env(self, type, lev):
@@ -678,13 +700,13 @@ class BaseWikiMarkup(object):
                 'content': { 'type': 'SEQ', 'content': seq } }
         self.dprint(80, "LEAVE parse_til = %s", ret)
         return ret
             
     def parse0(self):
         tok = self.getkn()
-        self.dprint(80, "parse0: %s", tok)
+        self.dprint(80, "ENTER parse0(%s)", tok)
         toktype = tok['type']
         if toktype == 'NIL':
             return None
         elif toktype == 'TEXT':
             self.ungetkn()
             return self.parse_para()
author	Sergey Poznyakoff <gray@gnu.org.ua>	2015-07-15 14:52:15 +0300
committer	Sergey Poznyakoff <gray@gnu.org.ua>	2015-07-15 14:52:15 +0300
commit	288d3c09c06af73ca6413b9692c06d379de319b1 (patch)
tree	0ef58f2d868a230cff8490e2821636f0b16d927b
parent	f97542b428b1a008e2df955cf2047e4b6b9d73d3 (diff)
download	wit-288d3c09c06af73ca6413b9692c06d379de319b1.tar.gz wit-288d3c09c06af73ca6413b9692c06d379de319b1.tar.bz2