diff options
author | Sergey Poznyakoff <gray@gnu.org.ua> | 2015-07-06 17:01:23 +0300 |
---|---|---|
committer | Sergey Poznyakoff <gray@gnu.org.ua> | 2015-07-06 17:36:49 +0300 |
commit | b74b1d5fe2326f56a2e37f57c38b929307c71282 (patch) | |
tree | e6029ae08f00bc7affcd1d7aec75d1288f9184ea | |
parent | f3378aebac7e89000ff097ac51c49b62eb6e9f08 (diff) | |
download | wit-b74b1d5fe2326f56a2e37f57c38b929307c71282.tar.gz wit-b74b1d5fe2326f56a2e37f57c38b929307c71282.tar.bz2 |
Handle <tags> and implicit preformatted blocks
Among <tags>, this commit handles <nowiki> and <code>. General tag handling
mechanism is provided.
* wikimarkup.py (otag, ctag, close_delim): New variables.
(BaseWikiMarkup) <newline,nested>: New attributes.
(input_tag): New abstract method.
(tokread): Remove calls to dprint, now done by the callers.
Handle xml-style tags.
(getkn,ungetkn): Set newline.
(inline_delims): Add '|'
(parse_para): Decide whether it is going to be a PRE or
PARA. Don't mix the two.
Fix recovery in case of unmatched/incorrect inline constructs.
(parse): eliminate initial PARA, if called as a nested instance.
(WikiMarkup): Remove parse method. Rely on the parent class.
* wiki2html.py (input_tag, str_tag, str_pre): New methods.
(format): Handle PRE and TAG tokens
* wiki2text.py: Similar changes. Needs some more work.
-rw-r--r-- | wiki2html.py | 30 | ||||
-rw-r--r-- | wiki2text.py | 29 | ||||
-rw-r--r-- | wikimarkup.py | 104 |
3 files changed, 135 insertions, 28 deletions
diff --git a/wiki2html.py b/wiki2html.py index eee592d..061377b 100644 --- a/wiki2html.py +++ b/wiki2html.py @@ -172,12 +172,38 @@ class HtmlWikiMarkup (WikiMarkup): self.envt[type]["hdr"]) return string + supported_tags = [ 'nowiki', 'code' ] + def input_tag(self, tag): + return tag['tag'] in self.supported_tags + + def str_tag(self, elt): + if elt['tag'] == 'nowiki': + return '<pre>' + elt['content'] + '</pre>' + elif elt['tag'] == 'code': + kwdict = { + 'nested': self.nested + 1, + 'lang': self.lang, + 'text': elt['content'], + 'html_base': self.html_base, + 'image_base': self.image_base, + 'media_base': self.media_base } + markup = HtmlWiktionaryMarkup(**kwdict) + markup.debug_level = self.debug_level + markup.parse() + return '<pre><code>' + str(markup) + '</code></pre>' #FIXME + def str_para(self, elt): string = ""; for x in elt['content']: string += self.format(x) return "<p>" + string + "</p>" + def str_pre(self, elt): + string = ""; + for x in elt['content']: + string += self.format(x) + return '<pre>' + string + '</pre>' + def str_ind(self, elt): return (" " * 2 * elt['level']) + self.format(elt['content']) @@ -190,8 +216,12 @@ class HtmlWikiMarkup (WikiMarkup): else: string = elt['content'] return string + elif elt['type'] == 'TAG': + return self.str_tag(elt) elif elt['type'] == 'PARA': return self.str_para(elt) + elif elt['type'] == 'PRE': + return self.str_pre(elt) elif elt['type'] == 'IT': return self.str_it(elt) elif elt['type'] == 'BOLD': diff --git a/wiki2text.py b/wiki2text.py index c94ae51..3084ee4 100644 --- a/wiki2text.py +++ b/wiki2text.py @@ -142,6 +142,26 @@ class TextWikiMarkup (WikiMarkup): length += wsc + wlen return output + linebuf + supported_tags = [ 'nowiki', 'code' ] + def input_tag(self, tag): + return tag['tag'] in self.supported_tags + + def str_tag(self, elt): + if elt['tag'] == 'nowiki': + return elt['content'] + elif elt['tag'] == 'code': + kwdict = { + 'nested': self.nested + 1, + 'lang': self.lang, + 'text': elt['content'], 
+ 'html_base': self.html_base, + 'image_base': self.image_base, + 'media_base': self.media_base } + markup = TextWiktionaryMarkup(**kwdict) + markup.debug_level = self.debug_level + markup.parse() + return str(markup) + def format(self, elt): if elt['type'] == 'TEXT': if isinstance(elt['content'],list): @@ -155,11 +175,18 @@ class TextWikiMarkup (WikiMarkup): string += s else: string = elt['content'] + elif elt['type'] == 'PRE': + string = "" + for x in elt['content']: + string += self.format(x) + string += '\n' elif elt['type'] == 'PARA': string = ""; for x in elt['content']: string += self.format(x) string = self.fmtpara(string) + '\n\n' + elif elt['type'] == 'TAG': + string = self.str_tag(elt) elif elt['type'] == 'IT': string = "" for x in elt['content']: @@ -214,7 +241,7 @@ class TextWikiMarkup (WikiMarkup): string += self.fmtpara(self.indent(lev, "%d. %s" % (n, x))) n += 1 elif type == "defn": - if s[1] == 0: + if s['subtype'] == 0: string += self.indent(lev-1, x) else: string += self.indent(lev+3, x) diff --git a/wikimarkup.py b/wikimarkup.py index 09c48eb..636012e 100644 --- a/wikimarkup.py +++ b/wikimarkup.py @@ -23,6 +23,14 @@ __all__ = [ "BaseWikiMarkup", "WikiMarkup", "envtypes" ] delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)") +otag = re.compile("^\s*<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>.+))?\s*(?P<closed>/)?>") +ctag = re.compile("^\s*</(?P<tag>[a-zA-Z0-9_]+)\s*>") + +close_delim = { + '[': ']', + '[[': ']]', + '{{': '}}' +} # Environment types: envtypes = { "*": [ "unnumbered", 0 ], @@ -35,14 +43,19 @@ class BaseWikiMarkup: toklist = None tokind = 0 + newline = 0 tree = None + nested = 0 debug_level = 0 def dprint(self, lev, fmt, *argv): if self.debug_level >= lev: print "[DEBUG]", fmt % argv + def input_tag(self, tag): + pass + def tokread(self): line = None pos = 0 @@ -55,12 +68,10 @@ class BaseWikiMarkup: line = u'' if not line or line == "": - self.dprint(100, "YIELD: 
NIL") yield({ 'type': 'NIL' }) break if line == '\n': - self.dprint(100, "YIELD: NL") yield({ 'type': 'NL', 'content': line }) line = None continue @@ -70,32 +81,52 @@ class BaseWikiMarkup: if m: if (pos < m.start(0)): - self.dprint(100, "YIELD: TEXT %s", line[pos:m.start(0)]) yield({'type': 'TEXT', 'content': line[pos:m.start(0)]}) pos = m.end(0) if envtypes.has_key(m.group(0)[0]) and line[pos] == ":": # FIXME? - self.dprint(100, "YIELD: DELIM %s, True", m.group(0)) # FIXME: What's "extra"? yield({ 'type': 'DELIM', 'content': m.group(0), 'extra': True }) pos += 1 else: - self.dprint(100, "YIELD: DELIM %s", m.group(0)) yield({ 'type': 'DELIM', 'content': m.group(0) }) else: + m = otag.match(line) + if m: + t = { 'type': 'TAG', + 'tag': m.group('tag'), + 'args': m.group('args') } + + if self.input_tag(t): + s = '' + if not m.group('closed'): + while 1: + try: + l = self.input() + m = ctag.match(l) + if m and m.group('tag') == t['tag']: + break + s += l + except StopIteration: + break + yield({ 'type': 'TAG', + 'tag': t['tag'], + 'args': t['args'], + 'content': s + }) + line = None + continue + if line[-1] == '\n': - self.dprint(100, "YIELD: TEXT %s", line[pos:-1]) if line[pos:-1] != '': yield({ 'type': 'TEXT', 'content': line[pos:-1] }) - self.dprint(100, "YIELD: NL") yield({ 'type': 'NL', 'content': '\n' }) else: - self.dprint(100, "YIELD: TEXT %s", line[pos:]) yield({ 'type': 'TEXT', 'content': line[pos:] }) line = None @@ -106,6 +137,7 @@ class BaseWikiMarkup: def tokenize(self): self.toklist = [] for tok in self.tokread(): + self.dprint(100, "TOK: %s", tok) self.toklist.append(tok) # Determine and fix up the ordering of bold and italic markers # This helps correctly parse inputs like: @@ -133,6 +165,7 @@ class BaseWikiMarkup: self.toklist[self.tokind] = val def getkn(self): + self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL' tok = self.toklist[self.tokind] if tok['type'] != 'NIL': self.tokind = self.tokind + 1 @@ -140,6 +173,7 @@ 
class BaseWikiMarkup: def ungetkn(self): self.tokind = self.tokind - 1 + self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL' return self.toklist[self.tokind] def parse_fontmod(self,delim,what): @@ -248,7 +282,7 @@ class BaseWikiMarkup: self.dprint(80, "LEAVE parse_ref= %s", ret) return ret - inline_delims = [ "''", "'''", "[", "[[", "{{" ] + inline_delims = [ "''", "'''", "[", "[[", "{{", "|" ] def is_inline_delim(self, tok): return tok['type'] == 'DELIM' and tok['content'] in self.inline_delims @@ -280,9 +314,19 @@ class BaseWikiMarkup: self.dprint(80, "ENTER parse_para, tok %s", self.peektkn()) seq = [] textlist = [] + tok = self.peektkn() + if re.match("^\s", tok['content']): + type = 'PRE' + rx = re.compile("^\S") + else: + type = 'PARA' + rx = re.compile("^\s") while 1: tok = self.getkn() if tok['type'] == 'TEXT': + if self.newline and rx.match(tok['content']): + self.ungetkn() + break textlist.append(tok['content']) elif tok['type'] == 'NL': tok = self.getkn() @@ -304,7 +348,21 @@ class BaseWikiMarkup: if x: seq.append(x) else: - seq.append(tok) + self.dprint(80, "ROLLBACK parse_para=%s", tok) + od = tok['content'] + textlist.append(od) + if close_delim.has_key(od): + cd = close_delim[od] + lev = 0 + for tok in self.toklist[self.tokind:]: + if tok['type'] == 'NIL': + break + elif tok['type'] == 'DELIM': + if tok['content'] == od: + lev += 1 + elif tok['content'] == cd: + if lev == 0: + tok['type'] = 'TEXT' break else: seq.append({ 'type': 'TEXT', 'content': tok['content'] }) @@ -313,7 +371,7 @@ class BaseWikiMarkup: if textlist: seq.append({ 'type': 'TEXT', 'content': textlist }) self.dprint(80, "LEAVE parse_para=%s", seq) - return { 'type': 'PARA', 'content': seq } + return { 'type': type, 'content': seq } def parse_header(self, delim): self.dprint(80, "ENTER parse_header(%s), tok %s", delim, self.peektkn()) @@ -434,6 +492,8 @@ class BaseWikiMarkup: elif toktype == 'NL': return { 'type': 'TEXT', 'content': '\n' } # return 
self.parse0() + else: + return tok def parse(self): if not self.toklist: @@ -446,6 +506,9 @@ class BaseWikiMarkup: if subtree == None: break self.tree.append(subtree) + if self.nested: + if self.tree[0]['type'] == 'PARA': + self.tree[0]['type'] = 'SEQ' self.dprint(70, "TREE: %s", self.tree) def __str__(self): @@ -495,6 +558,8 @@ class WikiMarkup (BaseWikiMarkup): self.image_base = keywords[kw] elif kw == 'media_base': self.media_base = keywords[kw] + elif kw == 'nested': + self.nested = keywords[kw] def __del__(self): if self.file: @@ -541,21 +606,6 @@ class WikiMarkup (BaseWikiMarkup): return False return True - def parse(self): - BaseWikiMarkup.parse(self) - # # Remove everything before the first header - # for i in range(0, len(self.tree)): - # if self.tree[i][0] == HDR: - # self.tree = self.tree[i:] - # break - # # Remove trailing links - # for i in range(len(self.tree)-1, 0, -1): - # if self.tree[i][0] == PARA \ - # and not self.is_empty_para(self.tree[i][1]): - # self.tree = self.tree[0:i+1] - # break - - # ISO 639 langtab = { "aa": "Afar", # Afar @@ -572,7 +622,7 @@ class WikiMarkup (BaseWikiMarkup): "as": "অসমীয়া", # Assamese "ast": "Asturian", "av": "Авар", # Avaric - "ay": "Aymar", # Aymara + "ay": "Aymara", # Aymara "az": "Azərbaycan" , # Azerbaijani "ba": "Башҡорт", # Bashkir |