Improve tag handling and debugging

* wikimarkup.py: Rewrite tag recognition. Implement dump method.
* wikicvt.py: New options -D (--dump) and -t dump.
* wiki2html.py (input_tag): Remove method.
(str_tag): Change handling of tags.
* wiki2texi.py: Likewise.
* wiki2text.py: Likewise.
author: Sergey Poznyakoff <gray@gnu.org.ua> 2015-07-12 23:11:40 +0300
committer: Sergey Poznyakoff <gray@gnu.org.ua> 2015-07-12 23:11:40 +0300
commit: 28072898f1bd9a925d73ac187d560198d6345524 (patch)
tree: a46d781fb85d9dda61fc8f68e0ba6ec43d60ce55
parent: 75672b57a2d63f01d00795fe8d661d1efe7b6e8d (diff)
download: wit-28072898f1bd9a925d73ac187d560198d6345524.tar.gz
wit-28072898f1bd9a925d73ac187d560198d6345524.tar.bz2
5 files changed, 309 insertions, 126 deletions
diff --git a/wiki2html.py b/wiki2html.py
index 441bc76..66939c4 100644
--- a/wiki2html.py
+++ b/wiki2html.py
@@ -169,31 +169,27 @@ class HtmlWikiMarkup (WikiMarkup):
                                        self.envt[type]["elt"][n])
         return "<%s>%s</%s>" % (self.envt[type]["hdr"],
                                 string,
                                 self.envt[type]["hdr"])
         return string
 
-    supported_tags = [ 'nowiki', 'code' ]
-    def input_tag(self, tag):
-        return tag['tag'] in self.supported_tags
-    
     def str_tag(self, elt):
         if elt['tag'] == 'nowiki':
-            return '<pre>' + elt['content'] + '</pre>'
+            return '<pre>' + self.format(elt['content']) + '</pre>'
         elif elt['tag'] == 'code':
-            kwdict = {
-                'nested': self.nested + 1,
-                'lang': self.lang,
-                'text': elt['content'],
-                'html_base': self.html_base,
-                'image_base': self.image_base,
-                'media_base': self.media_base }
-            markup = HtmlWiktionaryMarkup(**kwdict)
-            markup.debug_level = self.debug_level
-            markup.parse()
-            return '<pre><code>' + str(markup) + '</code></pre>' #FIXME
+            self.nested += 1
+            s = self.format(elt['content'])
+            self.nested -= 1
+            return '<pre><code>' + s + '</code></pre>' #FIXME
+        else:
+            s = '<' + elt['tag']
+            if elt['args']:
+                s += ' ' + elt['args']
+            s += '>'
+            s += self.format(elt['content'])
+            return s + '</' + elt['tag'] + '>'
     
     def str_para(self, elt):
         string = "";
         for x in elt['content']:
             string += self.format(x)
         return "<p>" + string + "</p>"
diff --git a/wiki2texi.py b/wiki2texi.py
index 7cc67bd..0b3eb77 100644
--- a/wiki2texi.py
+++ b/wiki2texi.py
@@ -116,35 +116,34 @@ class TexiWikiMarkup (WikiMarkup):
             for x in elt['content']:
                 string += self.format(x)
             return string
         else:
             return str(elt)
 
-    supported_tags = [ 'nowiki', 'code' ]
-    def input_tag(self, tag):
-        return tag['tag'] in self.supported_tags
-    
     def str_tag(self, elt):
         if elt['tag'] == 'nowiki':
-            return '@example\n' + elt['content'] + '@end example\n'
+            return '@example\n' + self.format(elt['content']) + '@end example\n'
         elif elt['tag'] == 'code':
-            kwdict = {
-                'nested': self.nested + 1,
-                'lang': self.lang,
-                'text': elt['content'],
-                'html_base': self.html_base,
-                'image_base': self.image_base,
-                'media_base': self.media_base }
-            markup = TexiWikiMarkup(**kwdict)
-            markup.debug_level = self.debug_level
-            markup.parse()
-            s = str(markup)
+            self.nested += 1
+            s = self.format(elt['content'])
+            self.nested -= 1
             if not s.endswith("\n"):
-                s += "\n";
+                s += "\n"            
             return '@example\n' + s + '@end example\n'
-
+        elif elt['tag'] == 'tt':
+            self.nested += 1
+            s = self.format(elt['content'])
+            self.nested -= 1
+            return "@code{%s}" % s
+        else:
+            s = '<' + elt['tag']
+            if elt['args']:
+                s += ' ' + elt['args']
+            s += '>' + self.format(elt['content']) + '</' + elt['tag'] + '>'
+            return s
+ 
     def str_para(self, elt):
         string = "";
         for x in elt['content']:
             string += self.format(x)
         return "\n" + string + "\n"
 
@@ -153,13 +152,13 @@ class TexiWikiMarkup (WikiMarkup):
         for x in elt['content']:
             string += self.format(x)
         if self.nested:
             return string
         if not string.endswith("\n"):
             string += "\n";
-        return '@example\n' + string + '@end example\n'
+        return '\n@example\n' + string + '@end example\n'
 
     def concat(self, eltlist):
         string = ""
         for x in eltlist:
             string += self.format(x)
         return string
diff --git a/wiki2text.py b/wiki2text.py
index 27a7051..d4cab81 100644
--- a/wiki2text.py
+++ b/wiki2text.py
@@ -139,31 +139,26 @@ class TextWikiMarkup (WikiMarkup):
                 length = 0
                 linebuf = ""
             linebuf += " " * wsc + s
             length += wsc + wlen
         return output + linebuf
         
-    supported_tags = [ 'nowiki', 'code' ]
-    def input_tag(self, tag):
-        return tag['tag'] in self.supported_tags
-    
     def str_tag(self, elt):
         if elt['tag'] == 'nowiki':
-            return elt['content']
+            return self.format(elt['content'])
         elif elt['tag'] == 'code':
-            kwdict = {
-                'nested': self.nested + 1,
-                'lang': self.lang,
-                'text': elt['content'],
-                'html_base': self.html_base,
-                'image_base': self.image_base,
-                'media_base': self.media_base }
-            markup = TextWiktionaryMarkup(**kwdict)
-            markup.debug_level = self.debug_level
-            markup.parse()
-            return str(markup)
+            self.nested += 1
+            s = self.format(elt['content'])
+            self.nested -= 1
+            return s #FIXME
+        else:
+            s = '<' + elt['tag']
+            if elt['args']:
+                s += ' ' + elt['args']
+            s += '>' + self.format(elt['content']) + '</' + elt['tag'] + '>'
+            return s            
         
     def format(self, elt):
         if elt['type'] == 'TEXT':
             if isinstance(elt['content'],list):
                 string = ""
                 for s in elt['content']:
diff --git a/wikicvt.py b/wikicvt.py
index e61e28b..c8ca887 100755
--- a/wikicvt.py
+++ b/wikicvt.py
@@ -14,24 +14,37 @@
 # 
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 import sys
 import getopt
+import StringIO
 from wiki2html import *
 from wiki2text import *
 from wiki2texi import *
 
+class DumpWikiMarkup (WikiMarkup):
+    def __str__(self):
+        if self.tree:
+            s = StringIO.StringIO()
+            self.dump(self.tree, 0, s)
+            return s.getvalue()
+        else:
+            return ""
+
 def usage(code=0):
     print """
 usage: %s [-hvt] [-I INTYPE] [-l lang] [-o kw=val] [--lang=lang] [--option kw=val]
           [--input-type=INTYPE] [--type=OUTTYPE] [--help] [--verbose] file
 """ % (sys.argv[0])
     sys.exit(code)
 
 handlers = {
+    'dump': {
+        'default': DumpWikiMarkup
+    },
     'html': {
         'default': HtmlWikiMarkup,
         'wiktionary': HtmlWiktionaryMarkup
     },
     'text': {
         'default': TextWikiMarkup,
@@ -48,15 +61,16 @@ def main():
     otype = 'html'
     lang = "pl"
     kwdict = {}
     debug = 0
     
     try:
-        opts, args = getopt.getopt(sys.argv[1:], "d:I:hl:o:t:v",
-                                   ["debug=", "help", "lang=", "option=",
-                                    "to", "type", "input-text", "input-type",
+        opts, args = getopt.getopt(sys.argv[1:], "Dd:I:hl:o:t:v",
+                                   ["dump",
+                                    "debug=", "help", "lang=", "option=",
+                                    "to=", "type=", "input-text", "input-type=",
                                     "verbose" ])
     except getopt.GetoptError:
         usage(1)
 
     for o, a in opts:
         if o in ("-h", "--help"):
@@ -74,25 +88,27 @@ def main():
             if val != '':
                 kwdict[kw] = val
         elif o == "--input-text":
             input_text = True
         elif o in ("-d", "--debug"):
             debug = eval(a)
+        elif o in ("-D", "--dump"):
+            otype = 'dump'
 
     if len(args) == 1:
         if args[0] == '-':
             kwdict['file'] = sys.stdin
         else:
             kwdict['filename'] = args[0]
     else:
         usage(1)
 
     kwdict['lang']=lang
 
-    if handlers.has_key(otype):
-        if handlers[otype].has_key(itype):
+    if otype in handlers:
+        if itype in handlers[otype]:
             markup = handlers[otype][itype](**kwdict)
             markup.debug_level = debug
             markup.parse()
             print str(markup)
             exit(0)
         else:
diff --git a/wikimarkup.py b/wikimarkup.py
index fde1ec1..9a79d1e 100644
--- a/wikimarkup.py
+++ b/wikimarkup.py
@@ -19,15 +19,15 @@ import sys
 import re
 from types import *
 
 __all__ = [ "BaseWikiMarkup", "WikiMarkup",
             "envtypes" ]
 
-delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)")
-otag = re.compile("^\s*<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>.+))?\s*(?P<closed>/)?>")
-ctag = re.compile("^\s*</(?P<tag>[a-zA-Z0-9_]+)\s*>")
+delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)|<")
+otag = re.compile("(?P<pfx>[^<]*)<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>.+))?\s*(?P<closed>/)?>")
+ctag = re.compile("(?P<pfx>[^<]*)</(?P<tag>[a-zA-Z0-9_]+)\s*>")
 
 close_delim = {
     '[': ']',
     '[[': ']]',
     '{{': '}}'
 }
@@ -43,22 +43,121 @@ class BaseWikiMarkup(object):
 
     toklist = None
     tokind = 0
     newline = 0
     tree = None
 
+    tags = [ 'code', 'nowiki', 'tt', 'div' ]
+    
     nested = 0
     debug_level = 0
     
     def dprint(self, lev, fmt, *argv):
         if self.debug_level >= lev:
             print "[DEBUG]", fmt % argv
 
-    def input_tag(self, tag):
+    def print_dump_prefix(self, level, file):
+        file.write("[DUMP]" + ' ' * (2*level + 1))
+
+    def dump_nil(self, node, level, file):
         pass
     
+    def dump_text(self, node, level, file):
+        self.print_dump_prefix(level, file)
+        file.write("CONTENT: \"%s\"\n" % node['content'])
+    
+    def dump_delim(self, node, level, file):
+        file.write("'%s'" % node['content'])
+        if 'continuation' in node:
+            file.write(" (cont)")
+        file.write("\n")
+                       
+    def dump_tag(self, node, level, file):
+        self.print_dump_prefix(level, file)
+        file.write("TAG: %s\n" % node['tag'])
+        if 'args' in node:
+            self.print_dump_prefix(level, file)
+            file.write("ARGS: %s\n" % node['args'])
+        if 'content' in node:
+            self.dump_node(node['content'], level + 1, file)
+    
+    def dump_seq(self, node, level, file):
+        self.dump(node['content'], level + 1, file)
+        
+    def dump_ref(self, node, level, file):
+        self.print_dump_prefix(level, file)
+        file.write("REF: %s\n" % node['ref'])
+        self.dump_node(node['content'], level + 1, file)
+    
+    def dump_hdr(self, node, level, file):
+        self.print_dump_prefix(level, file)
+        file.write("LEVEL: %s\n" % node['level'])
+        self.dump_node(node['content'], level + 1, file)
+             
+    def dump_elt(self, node, level, file):
+        self.print_dump_prefix(level, file)
+        file.write("SUBTYPE: %s\n" % node['subtype'])
+        self.dump_node(node['content'], level + 1, file)
+        
+    def dump_env(self, node, level, file):
+        self.print_dump_prefix(level, file)
+        file.write("ENVTYPE: %s\n" % node['envtype'])
+        self.print_dump_prefix(level, file)
+        file.write("LEVEL: %s\n" % node['level'])
+        self.dump(node['content'], level + 1, file)
+    
+    def dump_ind(self, node, level, file):
+        self.print_dump_prefix(level, file)
+        file.write("LEVEL: %s\n" % node['level'])
+        self.dump_node(node['content'], level + 1, file)
+    
+    def dump_link(self, node, level, file):
+        self.dump(node['content'], level + 1, file)
+    
+    dump_type = {
+        'NIL': dump_nil,
+        'NL': dump_nil,
+        'TEXT': dump_text,
+        'DELIM': dump_delim,
+        'OTAG': dump_tag,
+        'CTAG': dump_tag,
+        'TAG': dump_tag,
+        'SEQ': dump_seq,
+        'REF': dump_ref,
+        'HDR': dump_hdr,
+        'ELT': dump_elt,
+        'ENV': dump_env,
+        'IND': dump_ind,
+        'BAR': dump_nil,
+        'PARA': dump_seq,
+        'PRE': dump_text,
+        'BOLD': dump_seq,
+        'IT': dump_seq,
+        'LINK': dump_link,
+    }
+    
+    def dump_node(self, node, level, file):
+        if type(node) != dict:
+            file.write("UNHANDLED NODE: %s, %s\n" % (type(node),node))
+            return
+         
+        self.print_dump_prefix(level, file)
+        file.write("NODE " + node['type'] + ":\n")
+        if node['type'] in self.dump_type:
+            self.dump_type[node['type']](self, node, level, file)
+        else:
+            self.print_dump_prefix(level, file)
+            file.write("(UNHANDLED) ")
+            file.write("%s\n" % node)
+        self.print_dump_prefix(level, file)
+        file.write("END NODE " + node['type'] + "\n")
+                
+    def dump(self, tree, level=0, file=sys.stdout):
+        for node in tree:
+            self.dump_node(node, level, file)
+            
     def tokread(self):
         line = None
         pos = 0
         while 1:
             if (not line or pos == len(line)):
                 try:
@@ -80,58 +179,78 @@ class BaseWikiMarkup(object):
             m = delim.search(line, pos)
             
             if m:
                 if (pos < m.start(0)):
                     yield({'type': 'TEXT', 'content': line[pos:m.start(0)]})
                 pos = m.end(0)
-                if envtypes.has_key(m.group(0)[0]) and line[pos] == ":":
-                    # FIXME?
-                    # FIXME: What's "extra"?
+
+            if m and line[m.start(0)] != '<':
+                if m.group(0)[0] in envtypes and pos < len(line) and line[pos] == ":":
                     yield({ 'type': 'DELIM',
-                            'content': m.group(0) })
+                            'content': m.group(0),
+                            'continuation': True })
                     pos += 1
                 else:
                     yield({ 'type': 'DELIM',
                             'content': m.group(0) })
             else:
-                m = otag.match(line)
                 if m:
-                    t = { 'type': 'TAG',
+                    pos -= 1
+                t = None
+                m = otag.match(line, pos)
+                if m and m.group('tag') in self.tags:
+                    rest = line[m.end(0):]
+                    line = m.group('pfx')
+                    pos = 0
+                    t = { 'type': 'OTAG',
                           'tag': m.group('tag'),
                           'args': m.group('args') }
-                    
-                    if self.input_tag(t):
+                else:
+                    m = ctag.match(line, pos)
+                    if m and m.group('tag') in self.tags:
+                        rest = line[m.end(0):]
+                        line = m.group('pfx')
+                        pos = 0
+                        t = { 'type': 'CTAG',
+                              'tag': m.group('tag') }
+
+                if line:
+                    if line[-1] == '\n':
+                        if line[pos:-1] != '':
+                            yield({ 'type': 'TEXT',
+                                    'content': line[pos:-1] })
+                        yield({ 'type': 'NL',
+                                'content': '\n' })
+                    else:
+                        yield({ 'type': 'TEXT',
+                                'content': line[pos:] })
+
+                if t:
+                    if t['type'] == 'OTAG' and t['tag'] == 'nowiki':
                         s = ''
                         if not m.group('closed'):
                             while 1:
                                 try:
                                     l = self.input()
                                     m = ctag.match(l)
                                     if m and m.group('tag') == t['tag']:
                                         break
                                     s += l
                                 except StopIteration:
                                     break
-                        yield({ 'type': 'TAG',
-                                'tag': t['tag'],
-                                'args': t['args'],
-                                'content': s
-                              })
-                        line = None
-                        continue
-                                
-                if line[-1] == '\n':
-                    if line[pos:-1] != '':
-                        yield({ 'type': 'TEXT',
-                                'content': line[pos:-1] })
-                    yield({ 'type': 'NL',
-                            'content': '\n' })
+                        t['type'] = 'TAG'
+                        t['content'] = {'type': 'TEXT', 'content': s}
+                    
+                    yield(t)
+                    if t['type'] == 'OTAG' and m.group('closed'):
+                        t['type'] = 'CTAG'
+                        yield(t)
+                    line = rest
+                    pos = 0
                 else:
-                    yield({ 'type': 'TEXT',
-                            'content': line[pos:] })
-                line = None
+                    line = None
 
     def input(self):
         return None
 
     def swaptkn(self, i, j):
         self.dprint(80, "SWAPPING %s <-> %s", i, j)
@@ -191,38 +310,39 @@ class BaseWikiMarkup(object):
 
     def setkn(self,val):
         self.toklist[self.tokind] = val
     
     def getkn(self):
         self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL'
+        if self.tokind == len(self.toklist):
+            return { 'type': 'NIL' }
         tok = self.toklist[self.tokind]
-        if tok['type'] != 'NIL':
-            self.tokind = self.tokind + 1
+        self.tokind = self.tokind + 1
         return tok
     
     def ungetkn(self):
         self.tokind = self.tokind - 1
         self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL'
         return self.toklist[self.tokind]
 
     def parse_fontmod(self,delim,what):
         self.dprint(80, "ENTER parse_fontmod(%s,%s), tok %s",
                     delim, what, self.peektkn())
         seq = []
-        textlist = []
+        text = ''
         while 1:
             tok = self.getkn()
             if tok['type'] == 'TEXT':
-                textlist.append(tok['content'])
+                text += tok['content']
             elif tok['type'] == 'DELIM':
                 if tok['content'] == delim:
                     break
                 elif self.is_inline_delim(tok):
-                    if textlist:
-                        seq.append({ 'type': 'TEXT', 'content': textlist })
-                        textlist = []
+                    if text:
+                        seq.append({ 'type': 'TEXT', 'content': text })
+                        text = ''
                     x = self.parse_inline(tok)
                     if x:
                         seq.append(x)
                     else:
                         self.dprint(80, "LEAVE parse_fontmod=%s", "None")
                         return None
@@ -234,14 +354,14 @@ class BaseWikiMarkup(object):
                     self.dprint(80, "LEAVE parse_fontmod=None")
                     return None
                 seq.append({ 'type': 'TEXT', 'content': '\n' })
             else:
                 self.dprint(80, "LEAVE parse_fontmod=None")
                 return None
-        if textlist:
-            seq.append({ 'type': 'TEXT', 'content': textlist })
+        if text:
+            seq.append({ 'type': 'TEXT', 'content': text })
         res = { 'type': what, 'content': seq }
         self.dprint(80, "LEAVE parse_fontmod=%s", res)    
         return res
 
     def parse_link(self, type, delim):
         self.dprint(80, "ENTER parse_link(%s,%s), tok %s",
@@ -340,22 +460,28 @@ class BaseWikiMarkup(object):
 
     def parse_para(self):
         self.dprint(80, "ENTER parse_para, tok %s", self.peektkn())
         seq = []
         textlist = []
         tok = self.peektkn()
-        if re.match("^\s", tok['content']):
-            type = 'PRE'
-            rx = re.compile("^\S")
+
+        if self.newline:
+            if re.match("^\s", tok['content']):
+                type = 'PRE'
+                rx = re.compile("^\S")
+            else:
+                type = 'PARA'
+                rx = re.compile("^\s")
         else:
-            type = 'PARA'
-            rx = re.compile("^\s")
+            type = 'SEQ'
+            rx = None
+            
         while 1:
             tok = self.getkn()
             if tok['type'] == 'TEXT':
-                if self.newline and rx.match(tok['content']):
+                if rx and self.newline and rx.match(tok['content']):
                     self.ungetkn()
                     break
                 textlist.append(tok['content'])
             elif tok['type'] == 'NL':
                 tok = self.getkn()
                 if tok['type'] == 'NL' or tok['type'] == 'NIL':
@@ -364,16 +490,20 @@ class BaseWikiMarkup(object):
                     self.ungetkn()
                     if self.is_block_delim(tok):
                         break
                 textlist.append('\n')
             elif tok['type'] == 'NIL':
                 break
+            elif tok['type'] == 'OTAG' or tok['type'] == 'CTAG' or tok['type'] == 'TAG':
+                self.ungetkn()
+                break
             elif tok['type'] == 'DELIM':
                 if self.is_inline_delim(tok):
                     if textlist:
-                        seq.append({ 'type': 'TEXT', 'content': textlist })
+                        seq.append({ 'type': 'TEXT',
+                                     'content': ''.join(textlist) })
                         textlist = []
                     x = self.parse_inline(tok)
                     if x:
                         seq.append(x)
                     else:
                         self.dprint(80, "ROLLBACK parse_para=%s", tok)
@@ -394,13 +524,13 @@ class BaseWikiMarkup(object):
                                             break
                 else:
                     seq.append({ 'type': 'TEXT', 'content': tok['content'] })
                 #    self.ungetkn()
                     break
         if textlist:
-            seq.append({ 'type': 'TEXT', 'content': textlist })
+            seq.append({ 'type': 'TEXT', 'content': ''.join(textlist) })
         self.dprint(80, "LEAVE parse_para=%s", seq)
         return { 'type': type, 'content': seq }
 
     def parse_header(self, delim):
         self.dprint(80, "ENTER parse_header(%s), tok %s", delim, self.peektkn())
         list = []
@@ -440,107 +570,158 @@ class BaseWikiMarkup(object):
         while 1:
             tok = self.getkn()
             if tok['type'] == 'NL' or tok['type'] == 'NIL':
                 break
             elif tok['type'] == 'TEXT':
                 list.append(tok)
-            elif tok['type'] == 'DELIM' and tok['content'][0] == ":":
-                list.append(self.parse_indent(len(tok['content'])))
-                break
-            else:
-                x = self.parse_inline(tok)
-                if x:
-                    list.append(x)
+            elif tok['type'] == 'DELIM':
+                if tok['content'][0] == ":":
+                    list.append(self.parse_indent(len(tok['content'])))
+                    break
                 else:
-                    list.append(tok)
+                    x = self.parse_inline(tok)
+                    if x:
+                        list.append(x)
+                    else:
+                        list.append(tok)
+            else:
+                list.append(tok)
         self.dprint(80, "LEAVE parse_line=(SEQ, %s)", list)
         return { 'type': 'SEQ', 'content': list }
     
     def parse_env(self, type, lev):
         self.dprint(80, "ENTER parse_env(%s,%s), tok %s",type,lev,self.peektkn())
         list = []
         while 1:
             tok = self.getkn()
             if tok['type'] == 'DELIM' \
-               and envtypes.has_key(tok['content'][0]) \
+               and tok['content'][0] in envtypes \
                and type == envtypes[tok['content'][0]][0]:
                 if len(tok['content']) < lev:
                     self.ungetkn()
                     break
                 elif len(tok['content']) > lev:
                     self.ungetkn()
                     elt = self.parse_env(type, len(tok['content']))
                 else:
                     elt = self.parse_line()
-                    if len(tok.keys()) == 2:
+                    if 'continuation' not in tok:
                         list.append({ 'type': 'ELT',
                                       'subtype': envtypes[tok['content'][0]][1],
                                       'content': elt })
                         continue
 
-                if list[-1]['content']['type'] != 'SEQ':
-                    x = list[-1]['content']['content']
-                    # FIXME:
-                    list[-1]['content'] = { 'type': 'SEQ', 'content': [x] }
-                list[-1]['content']['content'].append(elt)
+                if list:
+                    if list[-1]['content']['type'] != 'SEQ':
+                        x = list[-1]['content']['content']
+                        # FIXME:
+                        list[-1]['content'] = { 'type': 'SEQ', 'content': [x] }
+                    list[-1]['content']['content'].append(elt)
             else:
                 self.ungetkn()
                 break
         self.dprint(80, "LEAVE parse_env=(ENV, %s, %s, %s)", type, lev, list)
         return { 'type': 'ENV', 'envtype': type, 'level': lev, 'content': list }
 
     def parse_indent(self, lev):
         self.dprint(80, "ENTER parse_indent(%s), tok %s", lev, self.peektkn())
         x = { 'type': 'IND', 'level': lev, 'content': self.parse_line() }
         self.dprint(80, "LEAVE parse_indent=%s", x)
         return x
 
+    def parse_til(self, tag):
+        self.dprint(80, "ENTER parse_til(%s)", tag)
+        seq = []
+        save = self.tokind
+        while 1:
+            t = self.parse0()
+            if t == None or t['type'] == 'NIL':
+                self.tokind = save
+                s = '<' + tag['tag']
+                if 'args' in tag and tag['args']:
+                    s += ' ' + tag['args']
+                del tag['args']
+                s += '>'
+                if 'content' in tag:
+                    subtree = tag['content']
+                else:
+                    subtree = None
+                tag['type'] = 'TEXT'
+                tag['content'] = s
+                if subtree:
+                    self.tree[self.tokind:self.tokind] = subtree
+                self.dprint(80, "LEAVE parse_til = %s (tree modified)", tag)
+                self.ungetkn()
+                return self.parse0()
+            
+            if t['type'] == 'CTAG' and tag['tag'] == t['tag']:
+                break
+            seq.append(t)
+
+        ret = { 'type': 'TAG',
+                'tag': tag['tag'],
+                'args': tag['args'],
+                'content': { 'type': 'SEQ', 'content': seq } }
+        self.dprint(80, "LEAVE parse_til = %s", ret)
+        return ret
+            
     def parse0(self):
         tok = self.getkn()
+        self.dprint(80, "parse0: %s", tok)
         toktype = tok['type']
         if toktype == 'NIL':
             return None
         elif toktype == 'TEXT':
             self.ungetkn()
             return self.parse_para()
         elif toktype == 'DELIM':
             if tok['content'] == "----":
                 return { 'type': 'BAR' }
             elif tok['content'][0:2] == "==":
                 return self.parse_header(tok['content'])
-            elif envtypes.has_key(tok['content'][0]):
+            elif tok['content'][0] in envtypes:
                 type = envtypes[tok['content'][0]][0]
                 lev = len(tok['content'])
                 self.ungetkn()
                 return self.parse_env(type, lev)
             elif tok['content'][0] == ":":
                 return self.parse_indent(len(tok['content']))
             else:
                 self.ungetkn()
                 return self.parse_para()
         elif toktype == 'NL':
             return { 'type': 'TEXT', 'content': '\n' }
-#            return self.parse0()
+        elif toktype == 'OTAG':
+            return self.parse_til(tok)
         else:
             return tok
 
     def parse(self):
         if not self.toklist:
             self.tokenize()
-        self.dprint(90, "TOKLIST: %s", self.toklist)
+        if self.debug_level >= 90:
+            print("TOKEN DUMP BEGIN")
+            self.dump(self.toklist)
+            print("TOKEN DUMP END")
+
         self.tokind = 0
         self.tree = []
         while 1:
             subtree = self.parse0()
             if subtree == None:
                 break
             self.tree.append(subtree)
+
         if self.nested:
             if self.tree[0]['type'] == 'PARA':
                 self.tree[0]['type'] = 'SEQ'
-        self.dprint(70, "TREE: %s", self.tree)
+                
+        if self.debug_level >= 70:
+            print("TREE DUMP BEGIN")
+            self.dump(self.tree)
+            print("TREE DUMP END")
 
     def __str__(self):
         return str(self.tree)
 
 
 class WikiMarkup (BaseWikiMarkup):
@@ -616,17 +797,13 @@ class WikiMarkup (BaseWikiMarkup):
                 if m: # and m.group(1) in self.langtab:
                     return True
         return False
     
     def is_empty_text(self, elt):
         if elt['type'] == 'TEXT':
-            if isinstance(elt['content'],list):
-                for s in elt['content']:
-                    if re.search('\w', s):
-                        return False
-            elif re.search('\w', elt['content']):
+            if re.search('\w', elt['content']):
                 return False
             return True
         return False
 
     def is_empty_para(self, seq):
         for x in seq:
author	Sergey Poznyakoff <gray@gnu.org.ua>	2015-07-12 23:11:40 +0300
committer	Sergey Poznyakoff <gray@gnu.org.ua>	2015-07-12 23:11:40 +0300
commit	28072898f1bd9a925d73ac187d560198d6345524 (patch)
tree	a46d781fb85d9dda61fc8f68e0ba6ec43d60ce55
parent	75672b57a2d63f01d00795fe8d661d1efe7b6e8d (diff)
download	wit-28072898f1bd9a925d73ac187d560198d6345524.tar.gz wit-28072898f1bd9a925d73ac187d560198d6345524.tar.bz2