diff options
author | Sergey Poznyakoff <gray@gnu.org.ua> | 2015-07-06 17:01:23 +0300 |
---|---|---|
committer | Sergey Poznyakoff <gray@gnu.org.ua> | 2015-07-06 17:36:49 +0300 |
commit | b74b1d5fe2326f56a2e37f57c38b929307c71282 (patch) | |
tree | e6029ae08f00bc7affcd1d7aec75d1288f9184ea /wikimarkup.py | |
parent | f3378aebac7e89000ff097ac51c49b62eb6e9f08 (diff) | |
download | wikitrans-b74b1d5fe2326f56a2e37f57c38b929307c71282.tar.gz wikitrans-b74b1d5fe2326f56a2e37f57c38b929307c71282.tar.bz2 |
Handle <tags> and implicit preformatted blocks
Among <tags>, this commit handles <nowiki> and <code>. General tag handling
mechanism is provided.
* wikimarkup.py (otag, ctag, close_delim): New variables.
(BaseWikiMarkup)<newline,nested>: New attributes.
(input_tag): New abstract method.
(tokread): Remove calls to dprint, now done by the callers.
Handle xml-style tags.
(getkn,ungetkn): Set newline.
(inline_delims): Add '|'
(parse_para): Decide whether it is going to be a PRE or
PARA. Don't mix the two.
Fix recovery in case of unmatched/incorrect inline constructs.
(parse): Eliminate initial PARA, if called as a nested instance.
(WikiMarkup): Remove parse method. Rely on the parent class.
* wiki2html.py (input_tag, str_tag, str_pre): New methods.
(format): Handle PRE and TAG tokens.
* wiki2text.py: Similar changes. Needs some more work.
Diffstat (limited to 'wikimarkup.py')
-rw-r--r-- | wikimarkup.py | 112 |
1 files changed, 81 insertions, 31 deletions
diff --git a/wikimarkup.py b/wikimarkup.py index 09c48eb..636012e 100644 --- a/wikimarkup.py +++ b/wikimarkup.py @@ -23,6 +23,14 @@ __all__ = [ "BaseWikiMarkup", "WikiMarkup", "envtypes" ] delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)") +otag = re.compile("^\s*<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>.+))?\s*(?P<closed>/)?>") +ctag = re.compile("^\s*</(?P<tag>[a-zA-Z0-9_]+)\s*>") + +close_delim = { + '[': ']', + '[[': ']]', + '{{': '}}' +} # Environment types: envtypes = { "*": [ "unnumbered", 0 ], @@ -35,13 +43,18 @@ class BaseWikiMarkup: toklist = None tokind = 0 + newline = 0 tree = None + nested = 0 debug_level = 0 def dprint(self, lev, fmt, *argv): if self.debug_level >= lev: print "[DEBUG]", fmt % argv + + def input_tag(self, tag): + pass def tokread(self): line = None @@ -55,12 +68,10 @@ class BaseWikiMarkup: line = u'' if not line or line == "": - self.dprint(100, "YIELD: NIL") yield({ 'type': 'NIL' }) break if line == '\n': - self.dprint(100, "YIELD: NL") yield({ 'type': 'NL', 'content': line }) line = None continue @@ -70,32 +81,52 @@ class BaseWikiMarkup: if m: if (pos < m.start(0)): - self.dprint(100, "YIELD: TEXT %s", line[pos:m.start(0)]) yield({'type': 'TEXT', 'content': line[pos:m.start(0)]}) pos = m.end(0) if envtypes.has_key(m.group(0)[0]) and line[pos] == ":": # FIXME? - self.dprint(100, "YIELD: DELIM %s, True", m.group(0)) # FIXME: What's "extra"? 
yield({ 'type': 'DELIM', 'content': m.group(0), 'extra': True }) pos += 1 else: - self.dprint(100, "YIELD: DELIM %s", m.group(0)) yield({ 'type': 'DELIM', 'content': m.group(0) }) else: + m = otag.match(line) + if m: + t = { 'type': 'TAG', + 'tag': m.group('tag'), + 'args': m.group('args') } + + if self.input_tag(t): + s = '' + if not m.group('closed'): + while 1: + try: + l = self.input() + m = ctag.match(l) + if m and m.group('tag') == t['tag']: + break + s += l + except StopIteration: + break + yield({ 'type': 'TAG', + 'tag': t['tag'], + 'args': t['args'], + 'content': s + }) + line = None + continue + if line[-1] == '\n': - self.dprint(100, "YIELD: TEXT %s", line[pos:-1]) if line[pos:-1] != '': yield({ 'type': 'TEXT', 'content': line[pos:-1] }) - self.dprint(100, "YIELD: NL") yield({ 'type': 'NL', 'content': '\n' }) else: - self.dprint(100, "YIELD: TEXT %s", line[pos:]) yield({ 'type': 'TEXT', 'content': line[pos:] }) line = None @@ -106,6 +137,7 @@ class BaseWikiMarkup: def tokenize(self): self.toklist = [] for tok in self.tokread(): + self.dprint(100, "TOK: %s", tok) self.toklist.append(tok) # Determine and fix up the ordering of bold and italic markers # This helps correctly parse inputs like: @@ -133,13 +165,15 @@ class BaseWikiMarkup: self.toklist[self.tokind] = val def getkn(self): + self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL' tok = self.toklist[self.tokind] if tok['type'] != 'NIL': self.tokind = self.tokind + 1 return tok - + def ungetkn(self): self.tokind = self.tokind - 1 + self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL' return self.toklist[self.tokind] def parse_fontmod(self,delim,what): @@ -248,7 +282,7 @@ class BaseWikiMarkup: self.dprint(80, "LEAVE parse_ref= %s", ret) return ret - inline_delims = [ "''", "'''", "[", "[[", "{{" ] + inline_delims = [ "''", "'''", "[", "[[", "{{", "|" ] def is_inline_delim(self, tok): return tok['type'] == 'DELIM' and tok['content'] in 
self.inline_delims @@ -280,9 +314,19 @@ class BaseWikiMarkup: self.dprint(80, "ENTER parse_para, tok %s", self.peektkn()) seq = [] textlist = [] + tok = self.peektkn() + if re.match("^\s", tok['content']): + type = 'PRE' + rx = re.compile("^\S") + else: + type = 'PARA' + rx = re.compile("^\s") while 1: tok = self.getkn() if tok['type'] == 'TEXT': + if self.newline and rx.match(tok['content']): + self.ungetkn() + break textlist.append(tok['content']) elif tok['type'] == 'NL': tok = self.getkn() @@ -304,8 +348,22 @@ class BaseWikiMarkup: if x: seq.append(x) else: - seq.append(tok) - break + self.dprint(80, "ROLLBACK parse_para=%s", tok) + od = tok['content'] + textlist.append(od) + if close_delim.has_key(od): + cd = close_delim[od] + lev = 0 + for tok in self.toklist[self.tokind:]: + if tok['type'] == 'NIL': + break + elif tok['type'] == 'DELIM': + if tok['content'] == od: + lev += 1 + elif tok['content'] == cd: + if lev == 0: + tok['type'] = 'TEXT' + break else: seq.append({ 'type': 'TEXT', 'content': tok['content'] }) # self.ungetkn() @@ -313,7 +371,7 @@ class BaseWikiMarkup: if textlist: seq.append({ 'type': 'TEXT', 'content': textlist }) self.dprint(80, "LEAVE parse_para=%s", seq) - return { 'type': 'PARA', 'content': seq } + return { 'type': type, 'content': seq } def parse_header(self, delim): self.dprint(80, "ENTER parse_header(%s), tok %s", delim, self.peektkn()) @@ -407,7 +465,7 @@ class BaseWikiMarkup: x = { 'type': 'IND', 'level': lev, 'content': self.parse_line() } self.dprint(80, "LEAVE parse_indent=%s", x) return x - + def parse0(self): tok = self.getkn() toktype = tok['type'] @@ -434,7 +492,9 @@ class BaseWikiMarkup: elif toktype == 'NL': return { 'type': 'TEXT', 'content': '\n' } # return self.parse0() - + else: + return tok + def parse(self): if not self.toklist: self.tokenize() @@ -446,6 +506,9 @@ class BaseWikiMarkup: if subtree == None: break self.tree.append(subtree) + if self.nested: + if self.tree[0]['type'] == 'PARA': + self.tree[0]['type'] = 
'SEQ' self.dprint(70, "TREE: %s", self.tree) def __str__(self): @@ -495,6 +558,8 @@ class WikiMarkup (BaseWikiMarkup): self.image_base = keywords[kw] elif kw == 'media_base': self.media_base = keywords[kw] + elif kw == 'nested': + self.nested = keywords[kw] def __del__(self): if self.file: @@ -541,21 +606,6 @@ class WikiMarkup (BaseWikiMarkup): return False return True - def parse(self): - BaseWikiMarkup.parse(self) - # # Remove everything before the first header - # for i in range(0, len(self.tree)): - # if self.tree[i][0] == HDR: - # self.tree = self.tree[i:] - # break - # # Remove trailing links - # for i in range(len(self.tree)-1, 0, -1): - # if self.tree[i][0] == PARA \ - # and not self.is_empty_para(self.tree[i][1]): - # self.tree = self.tree[0:i+1] - # break - - # ISO 639 langtab = { "aa": "Afar", # Afar @@ -572,7 +622,7 @@ class WikiMarkup (BaseWikiMarkup): "as": "অসমীয়া", # Assamese "ast": "Asturian", "av": "Авар", # Avaric - "ay": "Aymar", # Aymara + "ay": "Aymara", # Aymara "az": "Azərbaycan" , # Azerbaijani "ba": "Башҡорт", # Bashkir |