Redo parse tree as a sequence of dictionaries, instead of arrays.

author: Sergey Poznyakoff <gray@gnu.org> 2015-07-06 08:05:31 +0300
committer: Sergey Poznyakoff <gray@gnu.org> 2015-07-06 08:05:31 +0300
commit: f3378aebac7e89000ff097ac51c49b62eb6e9f08 (patch)
tree: cdf7a9b58b52cd6e995ddf63ef05526e60a918f1 /wikimarkup.py
parent: 7ab9949e2c038ee6a7215d91896f2b47a5e7c06d (diff)
download: wikitrans-f3378aebac7e89000ff097ac51c49b62eb6e9f08.tar.gz
wikitrans-f3378aebac7e89000ff097ac51c49b62eb6e9f08.tar.bz2
1 files changed, 128 insertions, 129 deletions
diff --git a/wikimarkup.py b/wikimarkup.py
index 060b7eb..09c48eb 100644
--- a/wikimarkup.py
+++ b/wikimarkup.py
@@ -20,31 +20,10 @@ import re
 from types import *
 
 __all__ = [ "BaseWikiMarkup", "WikiMarkup",
-            "NIL", "TEXT", "DELIM", "NL", "PARA",
-            "IT", "BOLD", "LINK", "TMPL",
-            "BAR", "HDR", "REF", "ENV", "IND", "SEQ",
             "envtypes" ]
 
 delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)")
 
-NIL = 0
-TEXT = 1
-DELIM = 2
-NL = 3
-
-PARA = 4
-IT = 5
-BOLD = 6
-LINK = 7
-TMPL = 8
-BAR = 9
-HDR = 10
-REF = 11
-ENV = 12
-IND = 13
-SEQ = 14
-ELT = 15
-
 # Environment types:
 envtypes = { "*": [ "unnumbered", 0 ],
              "#": [ "numbered", 0 ],
@@ -77,12 +56,12 @@ class BaseWikiMarkup:
                     
             if not line or line == "":
                 self.dprint(100, "YIELD: NIL")
-                yield(NIL,)
+                yield({ 'type': 'NIL' })
                 break
 
             if line == '\n':
                 self.dprint(100, "YIELD: NL")
-                yield(NL,line)
+                yield({ 'type': 'NL', 'content': line })
                 line = None
                 continue
 
@@ -92,26 +71,33 @@ class BaseWikiMarkup:
             if m:
                 if (pos < m.start(0)):
                     self.dprint(100, "YIELD: TEXT %s", line[pos:m.start(0)])
-                    yield(TEXT, line[pos:m.start(0)])
+                    yield({'type': 'TEXT', 'content': line[pos:m.start(0)]})
                 pos = m.end(0)
                 if envtypes.has_key(m.group(0)[0]) and line[pos] == ":":
                     # FIXME?
                     self.dprint(100, "YIELD: DELIM %s, True", m.group(0))
-                    yield(DELIM, m.group(0), True)
+                    # FIXME: What's "extra"?
+                    yield({ 'type': 'DELIM',
+                            'content': m.group(0),
+                            'extra': True })
                     pos += 1
                 else:
                     self.dprint(100, "YIELD: DELIM %s", m.group(0))
-                    yield(DELIM, m.group(0))
+                    yield({ 'type': 'DELIM',
+                            'content': m.group(0) })
             else:
                 if line[-1] == '\n':
                     self.dprint(100, "YIELD: TEXT %s", line[pos:-1])
                     if line[pos:-1] != '':
-                        yield(TEXT, line[pos:-1])
+                        yield({ 'type': 'TEXT',
+                                'content': line[pos:-1] })
                     self.dprint(100, "YIELD: NL")
-                    yield(NL,'\n')
+                    yield({ 'type': 'NL',
+                            'content': '\n' })
                 else:
                     self.dprint(100, "YIELD: TEXT %s", line[pos:])
-                    yield(TEXT, line[pos:])
+                    yield({ 'type': 'TEXT',
+                            'content': line[pos:] })
                 line = None
 
     def input(self):
@@ -126,11 +112,11 @@ class BaseWikiMarkup:
         #   '''''Door''' files kan ik niet op tijd komen.''
         stack = []
         for i in range(0,len(self.toklist)):
-            if self.toklist[i][0] == DELIM \
-                    and (self.toklist[i][1] == "''" \
-                             or self.toklist[i][1] == "'''"):
+            if self.toklist[i]['type'] == 'DELIM' \
+                    and (self.toklist[i]['content'] == "''" \
+                             or self.toklist[i]['content'] == "'''"):
                 if len(stack) > 0 \
-                        and self.toklist[stack[-1]][1] == self.toklist[i][1]:
+                        and self.toklist[stack[-1]]['content'] == self.toklist[i]['content']:
                     stack.pop()
                 elif len(stack) > 1:
                     x = self.toklist[stack[-2]]
@@ -148,7 +134,7 @@ class BaseWikiMarkup:
     
     def getkn(self):
         tok = self.toklist[self.tokind]
-        if tok[0] != NIL:
+        if tok['type'] != 'NIL':
             self.tokind = self.tokind + 1
         return tok
 
@@ -163,14 +149,14 @@ class BaseWikiMarkup:
         textlist = []
         while 1:
             tok = self.getkn()
-            if tok[0] == TEXT:
-                textlist.append(tok[1])
-            elif tok[0] == DELIM:
-                if tok[1] == delim:
+            if tok['type'] == 'TEXT':
+                textlist.append(tok['content'])
+            elif tok['type'] == 'DELIM':
+                if tok['content'] == delim:
                     break
                 elif self.is_inline_delim(tok):
                     if textlist:
-                        seq.append((TEXT, textlist))
+                        seq.append({ 'type': 'TEXT', 'content': textlist })
                         textlist = []
                     x = self.parse_inline(tok)
                     if x:
@@ -181,17 +167,17 @@ class BaseWikiMarkup:
                 else:
                     self.dprint(80, "LEAVE parse_fontmod=None")
                     return None
-            elif tok[0] == NL:
-                if self.peektkn()[0] == NL:
+            elif tok['type'] == 'NL':
+                if self.peektkn()['type'] == 'NL':
                     self.dprint(80, "LEAVE parse_fontmod=None")
                     return None
-                seq.append((TEXT, '\n'))
+                seq.append({ 'type': 'TEXT', 'content': '\n' })
             else:
                 self.dprint(80, "LEAVE parse_fontmod=None")
                 return None
         if textlist:
-            seq.append((TEXT, textlist))
-        res = (what, seq)
+            seq.append({ 'type': 'TEXT', 'content': textlist })
+        res = { 'type': what, 'content': seq }
         self.dprint(80, "LEAVE parse_fontmod=%s", res)    
         return res
 
@@ -202,14 +188,14 @@ class BaseWikiMarkup:
         list = []
         while 1:
             tok = self.getkn()
-            if tok[0] == DELIM:
-                if tok[1] == delim:
+            if tok['type'] == 'DELIM':
+                if tok['content'] == delim:
                     if list:
-                        subtree.append((SEQ,list))
+                        subtree.append({ 'type': 'SEQ', 'content': list })
                     break
-                elif tok[1] == "|":
+                elif tok['content'] == "|":
                     if len(list) > 1:
-                        subtree.append((SEQ,list))
+                        subtree.append({ 'type': 'SEQ', 'content': list })
                     elif list:
                         subtree.append(list[0])
                     list = []
@@ -220,21 +206,21 @@ class BaseWikiMarkup:
                     else:
                         self.dprint(80, "LEAVE parse_link=%s", "None")
                         return None
-            elif tok[0] == TEXT:
+            elif tok['type'] == 'TEXT':
                 list.append(tok)
             else:
                 self.dprint(80, "LEAVE parse_link=%s", "None")
                 return None
         self.dprint(80, "LEAVE parse_link=(%s,%s)", type, subtree)
-        return (type, subtree)
+        return { 'type': type, 'content': subtree }
 
     def parse_ref(self):
         self.dprint(80, "ENTER parse_ref, tok %s", self.peektkn())
         list = []
         while 1:
             tok = self.getkn()
-            if tok[0] == DELIM:
-                if tok[1] == "]":
+            if tok['type'] == 'DELIM':
+                if tok['content'] == "]":
                     break
                 else:
                     x = self.parse_inline(tok)
@@ -243,42 +229,45 @@ class BaseWikiMarkup:
                     else:
                         self.dprint(80, "LEAVE parse_ref=%s", "None")
                         return None
-            elif tok[0] == TEXT:
+            elif tok['type'] == 'TEXT':
                 list.append(tok)
-            elif tok[0] == NL:
-                list.append((TEXT, '\n'))
+            elif tok['type'] == 'NL':
+                list.append({ 'type': 'TEXT', 'content': '\n' })
                 continue
             else:
                 self.dprint(80, "LEAVE parse_ref=%s", "None")
                 return None
-        if len(list) == 0 or list[0][0] != TEXT:
+        if len(list) == 0 or list[0]['type'] != 'TEXT':
             self.dprint(80, "LEAVE parse_ref=%s", "None")
             return None
-        (ref,sep,text) = list[0][1].partition(' ')
-        ret = (REF, ref, (SEQ, [(TEXT, text)] + list[1:]))
+        (ref,sep,text) = list[0]['content'].partition(' ')
+        ret = { 'type': 'REF', 
+                'ref': ref,
+                'content': { 'type': 'SEQ',
+                           'content': [{ 'type': 'TEXT', 'content': text }] + list[1:] } }
         self.dprint(80, "LEAVE parse_ref= %s", ret)
         return ret
 
     inline_delims = [ "''", "'''", "[", "[[", "{{" ]
 
     def is_inline_delim(self, tok):
-        return tok[0] == DELIM and tok[1] in self.inline_delims
+        return tok['type'] == 'DELIM' and tok['content'] in self.inline_delims
     def is_block_delim(self, tok):
-        return tok[0] == DELIM and tok[1] not in self.inline_delims
+        return tok['type'] == 'DELIM' and tok['content'] not in self.inline_delims
     
     def parse_inline(self, tok):
         self.dprint(80, "ENTER parse_inline(%s), tok %s", tok, self.peektkn())
         tokind = self.tokind
-        if tok[1] == "''":
-            x = self.parse_fontmod(tok[1], IT)
-        elif tok[1] == "'''":
-            x = self.parse_fontmod(tok[1], BOLD)
-        elif tok[1] == "[":
+        if tok['content'] == "''":
+            x = self.parse_fontmod(tok['content'], 'IT')
+        elif tok['content'] == "'''":
+            x = self.parse_fontmod(tok['content'], 'BOLD')
+        elif tok['content'] == "[":
             x = self.parse_ref()
-        elif tok[1] == "[[":
-            x = self.parse_link(LINK, "]]")
-        elif tok[1] == "{{":
-            x = self.parse_link(TMPL, "}}")
+        elif tok['content'] == "[[":
+            x = self.parse_link('LINK', "]]")
+        elif tok['content'] == "{{":
+            x = self.parse_link('TMPL', "}}")
         else: # FIXME
             self.dprint(80, "LEAVE parse_inline=%s", "None")
             x = None
@@ -293,23 +282,23 @@ class BaseWikiMarkup:
         textlist = []
         while 1:
             tok = self.getkn()
-            if tok[0] == TEXT:
-                textlist.append(tok[1])
-            elif tok[0] == NL:
+            if tok['type'] == 'TEXT':
+                textlist.append(tok['content'])
+            elif tok['type'] == 'NL':
                 tok = self.getkn()
-                if tok[0] == NL or tok[0] == NIL:
+                if tok['type'] == 'NL' or tok['type'] == 'NIL':
                     break
                 else:
                     self.ungetkn()
                     if self.is_block_delim(tok):
                         break
                 textlist.append('\n')
-            elif tok[0] == NIL:
+            elif tok['type'] == 'NIL':
                 break
-            elif tok[0] == DELIM:
+            elif tok['type'] == 'DELIM':
                 if self.is_inline_delim(tok):
                     if textlist:
-                        seq.append((TEXT, textlist))
+                        seq.append({ 'type': 'TEXT', 'content': textlist })
                         textlist = []
                     x = self.parse_inline(tok)
                     if x:
@@ -318,27 +307,27 @@ class BaseWikiMarkup:
                         seq.append(tok)
                         break
                 else:
-                    seq.append((TEXT,tok[1]))
+                    seq.append({ 'type': 'TEXT', 'content': tok['content'] })
                 #    self.ungetkn()
                     break
         if textlist:
-            seq.append((TEXT, textlist))
+            seq.append({ 'type': 'TEXT', 'content': textlist })
         self.dprint(80, "LEAVE parse_para=%s", seq)
-        return (PARA, seq)
+        return { 'type': 'PARA', 'content': seq }
 
     def parse_header(self, delim):
         self.dprint(80, "ENTER parse_header(%s), tok %s", delim, self.peektkn())
         list = []
         while 1:
             tok = self.getkn()
-            if tok[0] == NIL:
+            if tok['type'] == 'NIL':
                 self.dprint(80, "LEAVE parse_header=%s", "None")        
                 return None
-            elif tok[0] == TEXT:
+            elif tok['type'] == 'TEXT':
                 list.append(tok)
-            elif tok[0] == DELIM:
-                if tok[1] == delim:
-                    if self.peektkn()[0] == NL:
+            elif tok['type'] == 'DELIM':
+                if tok['content'] == delim:
+                    if self.peektkn()['type'] == 'NL':
                         break
                     else:
                         self.dprint(80, "LEAVE parse_header=%s", "None")
@@ -354,7 +343,9 @@ class BaseWikiMarkup:
                 self.dprint(80, "LEAVE parse_header=%s", "None")
                 return None
         self.dprint(80, "LEAVE parse_header=(HDR, %s, (SEQ,%s))",len(delim)-1,list)
-        return (HDR,len(delim)-1,(SEQ,list))
+        return { 'type': 'HDR',
+                 'level': len(delim)-1,
+                 'content': { 'type': 'SEQ', 'content': list } }
 
 
     def parse_line(self):
@@ -362,12 +353,12 @@ class BaseWikiMarkup:
         list = []
         while 1:
             tok = self.getkn()
-            if tok[0] == NL or tok[0] == NIL:
+            if tok['type'] == 'NL' or tok['type'] == 'NIL':
                 break
-            elif tok[0] == TEXT:
+            elif tok['type'] == 'TEXT':
                 list.append(tok)
-            elif tok[0] == DELIM and tok[1][0] == ":":
-                list.append(self.parse_indent(len(tok[1])))
+            elif tok['type'] == 'DELIM' and tok['content'][0] == ":":
+                list.append(self.parse_indent(len(tok['content'])))
                 break
             else:
                 x = self.parse_inline(tok)
@@ -376,67 +367,72 @@ class BaseWikiMarkup:
                 else:
                     list.append(tok)
         self.dprint(80, "LEAVE parse_line=(SEQ, %s)", list)
-        return (SEQ, list)
+        return { 'type': 'SEQ', 'content': list }
     
     def parse_env(self, type, lev):
         self.dprint(80, "ENTER parse_env(%s,%s), tok %s",type,lev,self.peektkn())
         list = []
         while 1:
             tok = self.getkn()
-            if tok[0] == DELIM and envtypes.has_key(tok[1][0]) and type == envtypes[tok[1][0]][0]:
-                if len(tok[1]) < lev:
+            if tok['type'] == 'DELIM' \
+               and envtypes.has_key(tok['content'][0]) \
+               and type == envtypes[tok['content'][0]][0]:
+                if len(tok['content']) < lev:
                     self.ungetkn()
                     break
-                elif len(tok[1]) > lev:
+                elif len(tok['content']) > lev:
                     self.ungetkn()
-                    elt = self.parse_env(type, len(tok[1]))
+                    elt = self.parse_env(type, len(tok['content']))
                 else:
                     elt = self.parse_line()
-                    if len(tok) == 2:
-                        list.append((ELT, envtypes[tok[1][0]][1], elt))
+                    if len(tok.keys()) == 2:
+                        list.append({ 'type': 'ELT',
+                                      'subtype': envtypes[tok['content'][0]][1],
+                                      'content': elt })
                         continue
-                    
-                if list[-1][2][0] != SEQ:
-                    x = list[-1][2][1]
-                    list[-1][2] = (SEQ, [x])
-                list[-1][2][1].append(elt)
+
+                if list[-1]['content']['type'] != 'SEQ':
+                    x = list[-1]['content']['content']
+                    # FIXME:
+                    list[-1]['content'] = { 'type': 'SEQ', 'content': [x] }
+                list[-1]['content']['content'].append(elt)
             else:
                 self.ungetkn()
                 break
         self.dprint(80, "LEAVE parse_env=(ENV, %s, %s, %s)", type, lev, list)
-        return (ENV, type, lev, list)
+        return { 'type': 'ENV', 'envtype': type, 'level': lev, 'content': list }
 
     def parse_indent(self, lev):
         self.dprint(80, "ENTER parse_indent(%s), tok %s", lev, self.peektkn())
-        x = (IND, lev, self.parse_line())
+        x = { 'type': 'IND', 'level': lev, 'content': self.parse_line() }
         self.dprint(80, "LEAVE parse_indent=%s", x)
         return x
     
     def parse0(self):
         tok = self.getkn()
-        toktype = tok[0]
-        if toktype == NIL:
+        toktype = tok['type']
+        if toktype == 'NIL':
             return None
-        elif toktype == TEXT:
+        elif toktype == 'TEXT':
             self.ungetkn()
             return self.parse_para()
-        elif toktype == DELIM:
-            if tok[1] == "----":
-                return (BAR,)
-            elif tok[1][0:2] == "==":
-                return self.parse_header(tok[1])
-            elif envtypes.has_key(tok[1][0]):
-                type = envtypes[tok[1][0]][0]
-                lev = len(tok[1])
+        elif toktype == 'DELIM':
+            if tok['content'] == "----":
+                return { 'type': 'BAR' }
+            elif tok['content'][0:2] == "==":
+                return self.parse_header(tok['content'])
+            elif envtypes.has_key(tok['content'][0]):
+                type = envtypes[tok['content'][0]][0]
+                lev = len(tok['content'])
                 self.ungetkn()
                 return self.parse_env(type, lev)
-            elif tok[1][0] == ":":
-                return self.parse_indent(len(tok[1]))
+            elif tok['content'][0] == ":":
+                return self.parse_indent(len(tok['content']))
             else:
                 self.ungetkn()
                 return self.parse_para()
-        elif toktype == NL:
-            return (TEXT, '\n')
+        elif toktype == 'NL':
+            return { 'type': 'TEXT', 'content': '\n' }
 #            return self.parse0()
     
     def parse(self):
@@ -513,25 +509,28 @@ class WikiMarkup (BaseWikiMarkup):
             return None
 
     def is_lang_link(self, elt):
-        if elt[0] == LINK and isinstance(elt[1],list) and len(elt[1]) == 1:
-            if elt[1][0][0] == TEXT:
-                m = re.match('([\w-]+):', elt[1][0][1])
+        if elt['type'] == 'LINK' \
+           and isinstance(elt['content'], list) \
+           and len(elt['content']) == 1:
+            if elt['content'][0]['type'] == 'TEXT':
+                m = re.match('([\w-]+):', elt['content'][0]['content'])
                 if m: # and m.group(1) in self.langtab:
                     return True
-            elif elt[1][0][0] == SEQ and len(elt[1][0][1]) == 1 and\
-                    elt[1][0][1][0][0] == TEXT:
-                m = re.match('([\w-]+):',elt[1][0][1][0][1])
+            elif elt['content'][0]['type'] == 'SEQ' \
+                    and len(elt['content'][0]['content']) == 1 and\
+                    elt['content'][0]['content'][0]['type'] == 'TEXT':
+                m = re.match('([\w-]+):',elt['content'][0]['content'][0]['content'])
                 if m: # and m.group(1) in self.langtab:
                     return True
         return False
     
     def is_empty_text(self, elt):
-        if elt[0] == TEXT:
-            if isinstance(elt[1],list):
-                for s in elt[1]:
+        if elt['type'] == 'TEXT':
+            if isinstance(elt['content'],list):
+                for s in elt['content']:
                     if re.search('\w', s):
                         return False
-            elif re.search('\w', elt[1]):
+            elif re.search('\w', elt['content']):
                 return False
             return True
         return False
author	Sergey Poznyakoff <gray@gnu.org>	2015-07-06 08:05:31 +0300
committer	Sergey Poznyakoff <gray@gnu.org>	2015-07-06 08:05:31 +0300
commit	f3378aebac7e89000ff097ac51c49b62eb6e9f08 (patch)
tree	cdf7a9b58b52cd6e995ddf63ef05526e60a918f1 /wikimarkup.py
parent	7ab9949e2c038ee6a7215d91896f2b47a5e7c06d (diff)
download	wikitrans-f3378aebac7e89000ff097ac51c49b62eb6e9f08.tar.gz wikitrans-f3378aebac7e89000ff097ac51c49b62eb6e9f08.tar.bz2