Handle <tags> and implicit preformatted blocks

Among <tags>, this commit handles <nowiki> and <code>. A general tag handling mechanism is provided. * wikimarkup.py (otag, ctag, close_delim): New variables. (BaseWikiMarkup)<newline,nested>: New attributes. (input_tag): New abstract method. (tokread): Remove calls to dprint, now done by the callers. Handle XML-style tags. (getkn, ungetkn): Set newline. (inline_delims): Add '|'. (parse_para): Decide whether it is going to be a PRE or PARA. Don't mix the two. Fix recovery in case of unmatched/incorrect inline constructs. (parse): Eliminate initial PARA, if called as a nested instance. (WikiMarkup): Remove parse method. Rely on the parent class. * wiki2html.py (input_tag, str_tag, str_pre): New methods. (format): Handle PRE and TAG tokens. * wiki2text.py: Similar changes. Needs some more work.
author: Sergey Poznyakoff <gray@gnu.org.ua> 2015-07-06 17:01:23 +0300
committer: Sergey Poznyakoff <gray@gnu.org.ua> 2015-07-06 17:36:49 +0300
commit: b74b1d5fe2326f56a2e37f57c38b929307c71282 (patch)
tree: e6029ae08f00bc7affcd1d7aec75d1288f9184ea
parent: f3378aebac7e89000ff097ac51c49b62eb6e9f08 (diff)
download: wikitrans-b74b1d5fe2326f56a2e37f57c38b929307c71282.tar.gz
wikitrans-b74b1d5fe2326f56a2e37f57c38b929307c71282.tar.bz2
3 files changed, 139 insertions, 32 deletions
diff --git a/wiki2html.py b/wiki2html.py
index eee592d..061377b 100644
--- a/wiki2html.py
+++ b/wiki2html.py
@@ -174,2 +174,22 @@ class HtmlWikiMarkup (WikiMarkup):
 
+    supported_tags = [ 'nowiki', 'code' ]
+    def input_tag(self, tag):
+        return tag['tag'] in self.supported_tags
+    
+    def str_tag(self, elt):
+        if elt['tag'] == 'nowiki':
+            return '<pre>' + elt['content'] + '</pre>'
+        elif elt['tag'] == 'code':
+            kwdict = {
+                'nested': self.nested + 1,
+                'lang': self.lang,
+                'text': elt['content'],
+                'html_base': self.html_base,
+                'image_base': self.image_base,
+                'media_base': self.media_base }
+            markup = HtmlWiktionaryMarkup(**kwdict)
+            markup.debug_level = self.debug_level
+            markup.parse()
+            return '<pre><code>' + str(markup) + '</code></pre>' #FIXME
+    
     def str_para(self, elt):
@@ -180,2 +200,8 @@ class HtmlWikiMarkup (WikiMarkup):
 
+    def str_pre(self, elt):
+        string = "";
+        for x in elt['content']:
+            string += self.format(x)
+        return '<pre>' + string + '</pre>'
+    
     def str_ind(self, elt):
@@ -192,4 +218,8 @@ class HtmlWikiMarkup (WikiMarkup):
             return string
+        elif elt['type'] == 'TAG':
+            return self.str_tag(elt)
         elif elt['type'] == 'PARA':
             return self.str_para(elt)
+        elif elt['type'] == 'PRE':
+            return self.str_pre(elt)
         elif elt['type'] == 'IT':
diff --git a/wiki2text.py b/wiki2text.py
index c94ae51..3084ee4 100644
--- a/wiki2text.py
+++ b/wiki2text.py
@@ -144,2 +144,22 @@ class TextWikiMarkup (WikiMarkup):
         
+    supported_tags = [ 'nowiki', 'code' ]
+    def input_tag(self, tag):
+        return tag['tag'] in self.supported_tags
+    
+    def str_tag(self, elt):
+        if elt['tag'] == 'nowiki':
+            return elt['content']
+        elif elt['tag'] == 'code':
+            kwdict = {
+                'nested': self.nested + 1,
+                'lang': self.lang,
+                'text': elt['content'],
+                'html_base': self.html_base,
+                'image_base': self.image_base,
+                'media_base': self.media_base }
+            markup = TextWiktionaryMarkup(**kwdict)
+            markup.debug_level = self.debug_level
+            markup.parse()
+            return str(markup)
+        
     def format(self, elt):
@@ -157,2 +177,7 @@ class TextWikiMarkup (WikiMarkup):
                 string = elt['content']
+        elif elt['type'] == 'PRE':
+            string = ""
+            for x in elt['content']:
+                string += self.format(x)
+            string += '\n'
         elif elt['type'] == 'PARA':
@@ -162,2 +187,4 @@ class TextWikiMarkup (WikiMarkup):
             string = self.fmtpara(string) + '\n\n'
+        elif elt['type'] == 'TAG':
+            string = self.str_tag(elt)
         elif elt['type'] == 'IT':
@@ -216,3 +243,3 @@ class TextWikiMarkup (WikiMarkup):
                 elif type == "defn":
-                    if s[1] == 0:
+                    if s['subtype'] == 0:
                         string += self.indent(lev-1, x)
diff --git a/wikimarkup.py b/wikimarkup.py
index 09c48eb..636012e 100644
--- a/wikimarkup.py
+++ b/wikimarkup.py
@@ -25,2 +25,10 @@ __all__ = [ "BaseWikiMarkup", "WikiMarkup",
 delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)")
+otag = re.compile("^\s*<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>.+))?\s*(?P<closed>/)?>")
+ctag = re.compile("^\s*</(?P<tag>[a-zA-Z0-9_]+)\s*>")
+
+close_delim = {
+    '[': ']',
+    '[[': ']]',
+    '{{': '}}'
+}
 
@@ -37,4 +45,6 @@ class BaseWikiMarkup:
     tokind = 0
+    newline = 0
     tree = None
 
+    nested = 0
     debug_level = 0
@@ -44,2 +54,5 @@ class BaseWikiMarkup:
             print "[DEBUG]", fmt % argv
+
+    def input_tag(self, tag):
+        pass
     
@@ -57,3 +70,2 @@ class BaseWikiMarkup:
             if not line or line == "":
-                self.dprint(100, "YIELD: NIL")
                 yield({ 'type': 'NIL' })
@@ -62,3 +74,2 @@ class BaseWikiMarkup:
             if line == '\n':
-                self.dprint(100, "YIELD: NL")
                 yield({ 'type': 'NL', 'content': line })
@@ -72,3 +83,2 @@ class BaseWikiMarkup:
                 if (pos < m.start(0)):
-                    self.dprint(100, "YIELD: TEXT %s", line[pos:m.start(0)])
                     yield({'type': 'TEXT', 'content': line[pos:m.start(0)]})
@@ -77,3 +87,2 @@ class BaseWikiMarkup:
                     # FIXME?
-                    self.dprint(100, "YIELD: DELIM %s, True", m.group(0))
                     # FIXME: What's "extra"?
@@ -84,3 +93,2 @@ class BaseWikiMarkup:
                 else:
-                    self.dprint(100, "YIELD: DELIM %s", m.group(0))
                     yield({ 'type': 'DELIM',
@@ -88,4 +96,29 @@ class BaseWikiMarkup:
             else:
+                m = otag.match(line)
+                if m:
+                    t = { 'type': 'TAG',
+                          'tag': m.group('tag'),
+                          'args': m.group('args') }
+                    
+                    if self.input_tag(t):
+                        s = ''
+                        if not m.group('closed'):
+                            while 1:
+                                try:
+                                    l = self.input()
+                                    m = ctag.match(l)
+                                    if m and m.group('tag') == t['tag']:
+                                        break
+                                    s += l
+                                except StopIteration:
+                                    break
+                        yield({ 'type': 'TAG',
+                                'tag': t['tag'],
+                                'args': t['args'],
+                                'content': s
+                              })
+                        line = None
+                        continue
+                                
                 if line[-1] == '\n':
-                    self.dprint(100, "YIELD: TEXT %s", line[pos:-1])
                     if line[pos:-1] != '':
@@ -93,3 +126,2 @@ class BaseWikiMarkup:
                                 'content': line[pos:-1] })
-                    self.dprint(100, "YIELD: NL")
                     yield({ 'type': 'NL',
@@ -97,3 +129,2 @@ class BaseWikiMarkup:
                 else:
-                    self.dprint(100, "YIELD: TEXT %s", line[pos:])
                     yield({ 'type': 'TEXT',
@@ -108,2 +139,3 @@ class BaseWikiMarkup:
         for tok in self.tokread():
+            self.dprint(100, "TOK: %s", tok)
             self.toklist.append(tok)
@@ -135,2 +167,3 @@ class BaseWikiMarkup:
     def getkn(self):
+        self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL'
         tok = self.toklist[self.tokind]
@@ -139,5 +172,6 @@ class BaseWikiMarkup:
         return tok
-
+    
     def ungetkn(self):
         self.tokind = self.tokind - 1
+        self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL'
         return self.toklist[self.tokind]
@@ -250,3 +284,3 @@ class BaseWikiMarkup:
 
-    inline_delims = [ "''", "'''", "[", "[[", "{{" ]
+    inline_delims = [ "''", "'''", "[", "[[", "{{", "|" ]
 
@@ -282,2 +316,9 @@ class BaseWikiMarkup:
         textlist = []
+        tok = self.peektkn()
+        if re.match("^\s", tok['content']):
+            type = 'PRE'
+            rx = re.compile("^\S")
+        else:
+            type = 'PARA'
+            rx = re.compile("^\s")
         while 1:
@@ -285,2 +326,5 @@ class BaseWikiMarkup:
             if tok['type'] == 'TEXT':
+                if self.newline and rx.match(tok['content']):
+                    self.ungetkn()
+                    break
                 textlist.append(tok['content'])
@@ -306,4 +350,18 @@ class BaseWikiMarkup:
                     else:
-                        seq.append(tok)
-                        break
+                        self.dprint(80, "ROLLBACK parse_para=%s", tok)
+                        od = tok['content']
+                        textlist.append(od)
+                        if close_delim.has_key(od):
+                            cd = close_delim[od]
+                            lev = 0
+                            for tok in self.toklist[self.tokind:]:
+                                if tok['type'] == 'NIL':
+                                    break
+                                elif tok['type'] == 'DELIM':
+                                    if tok['content'] == od:
+                                        lev += 1
+                                    elif tok['content'] == cd:
+                                        if lev == 0:
+                                            tok['type'] = 'TEXT'
+                                            break
                 else:
@@ -315,3 +373,3 @@ class BaseWikiMarkup:
         self.dprint(80, "LEAVE parse_para=%s", seq)
-        return { 'type': 'PARA', 'content': seq }
+        return { 'type': type, 'content': seq }
 
@@ -409,3 +467,3 @@ class BaseWikiMarkup:
         return x
-    
+
     def parse0(self):
@@ -436,3 +494,5 @@ class BaseWikiMarkup:
 #            return self.parse0()
-    
+        else:
+            return tok
+
     def parse(self):
@@ -448,2 +508,5 @@ class BaseWikiMarkup:
             self.tree.append(subtree)
+        if self.nested:
+            if self.tree[0]['type'] == 'PARA':
+                self.tree[0]['type'] = 'SEQ'
         self.dprint(70, "TREE: %s", self.tree)
@@ -497,2 +560,4 @@ class WikiMarkup (BaseWikiMarkup):
                 self.media_base = keywords[kw]
+            elif kw == 'nested':
+                self.nested = keywords[kw]
 
@@ -543,17 +608,2 @@ class WikiMarkup (BaseWikiMarkup):
     
-    def parse(self):
-        BaseWikiMarkup.parse(self)
-        # # Remove everything before the first header
-        # for i in range(0, len(self.tree)):
-        #     if self.tree[i][0] == HDR:
-        #         self.tree = self.tree[i:]
-        #         break
-        # # Remove trailing links
-        # for i in range(len(self.tree)-1, 0, -1):
-        #     if self.tree[i][0] == PARA \
-        #             and not self.is_empty_para(self.tree[i][1]):
-        #         self.tree = self.tree[0:i+1]
-        #         break
-                    
-        
     # ISO 639 
@@ -574,3 +624,3 @@ class WikiMarkup (BaseWikiMarkup):
 	"av": "Авар",            # Avaric
-	"ay": "Aymar",           # Aymara
+	"ay": "Aymara",           # Aymara
 	"az": "Azərbaycan" ,     # Azerbaijani
author	Sergey Poznyakoff <gray@gnu.org.ua>	2015-07-06 17:01:23 +0300
committer	Sergey Poznyakoff <gray@gnu.org.ua>	2015-07-06 17:36:49 +0300
commit	b74b1d5fe2326f56a2e37f57c38b929307c71282 (patch)
tree	e6029ae08f00bc7affcd1d7aec75d1288f9184ea
parent	f3378aebac7e89000ff097ac51c49b62eb6e9f08 (diff)
download	wikitrans-b74b1d5fe2326f56a2e37f57c38b929307c71282.tar.gz wikitrans-b74b1d5fe2326f56a2e37f57c38b929307c71282.tar.bz2