Restructure the package.

The idea is to switch from using this project as a git submodule to having it distributed via PyPI. Since the name 'wit' is already registered there, the package is renamed to 'wikitrans'.

* setup.py: Use setuptools. Rename package to wikitrans.
* wikicvt.py: Remove. Replaced with:
* bin/wikitrans: New file.
* __init__.py: Move to WikiTrans/__init__.py
* wiki2html.py: Move to WikiTrans/wiki2html.py
* wiki2texi.py: Move to WikiTrans/wiki2texi.py
* wiki2text.py: Move to WikiTrans/wiki2text.py
* wikimarkup.py: Move to WikiTrans/wikimarkup.py
* wikins.py: Move to WikiTrans/wikins.py
* test.py: Move to tests/test.py
* MANIFEST.in: New file.
* README.rst: New file.
* .gitignore: Update.
author: Sergey Poznyakoff <gray@gnu.org.ua> 2015-07-16 13:20:06 +0300
committer: Sergey Poznyakoff <gray@gnu.org.ua> 2015-07-16 13:26:15 +0300
commit: eaf9325ddcff786f3fcd5b9047327ef6e397e778 (patch)
tree: 2f0336efbb1deab9651c5eeb1b5dd753538a5c8e /WikiTrans
parent: 8e11d7f20459697c883df1e421df02006f749792 (diff)
download: wikitrans-eaf9325ddcff786f3fcd5b9047327ef6e397e778.tar.gz
wikitrans-eaf9325ddcff786f3fcd5b9047327ef6e397e778.tar.bz2
6 files changed, 5073 insertions, 0 deletions
diff --git a/WikiTrans/__init__.py b/WikiTrans/__init__.py
new file mode 100644
index 0000000..ad99ce3
--- /dev/null
+++ b/WikiTrans/__init__.py
@@ -0,0 +1,18 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2008, 2015 Sergey Poznyakoff
+# 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+__all__ = [ "wikimarkup", "wiki2html", "wiki2text", "wiki2texi", "wikins" ]
diff --git a/WikiTrans/wiki2html.py b/WikiTrans/wiki2html.py
new file mode 100644
index 0000000..754fa9b
--- /dev/null
+++ b/WikiTrans/wiki2html.py
@@ -0,0 +1,283 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2008,2015 Sergey Poznyakoff
+# 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from wikimarkup import *
+from wikins import wiki_ns_re, wiki_ns
+import re
+try:
+    from urllib import quote as url_quote
+except ImportError:
+    from urllib.parse import quote as url_quote
+
+__all__ = [ "HtmlWikiMarkup", "HtmlWiktionaryMarkup" ]
+    
+class HtmlWikiMarkup (WikiMarkup):
+    """
+    A (hopefully) general-purpose Wiki->HTML translator class.
+    FIXME: 1. See WikiMarkup for a list
+           2. [[official position]]s : final 's' gets after closing </a> tag.
+           Should be before.
+    """
+
+    def wiki_ns_name(self, str):  # namespace prefix -> wiki_ns value or None
+        if str in wiki_ns[self.lang]:
+            return wiki_ns[self.lang][str]
+        elif str in wiki_ns_re[self.lang]:
+            for elt in wiki_ns_re[self.lang][str]:
+                if str.startswith(elt[0]) and str.endswith(elt[1]):
+                    return elt[2]
+        return None
+
+    envt = { "unnumbered": { "hdr": "ul",
+                             "elt": ["li"] },
+             "numbered":   { "hdr": "ol",
+                             "elt": ["li"] },
+             "defn":       { "hdr": "dl",
+                             "elt": ["dt","dd"] } }
+
+    def mktgt(self, tgt, lang = None):  # URL of page tgt (lang: self.lang)
+        if not lang:
+            lang = self.lang
+        return self.html_base % { 'lang' : lang } + url_quote(tgt)
+
+    def tmpl_term(self, s):
+        # Render a "term" template: first positional argument, followed by
+        # the value of an optional tr= argument in a "trans" span.
+        if len(s) == 2:
+            return s[1]
+        text = None
+        trans = None
+        for x in s[1:]:
+            m = re.match(r'(\w+)=', x)
+            if m:
+                if m.group(1) == "tr":
+                    trans = x[m.end(1)+1:]
+            elif not text:
+                text = x
+        if text and trans:
+            text += ' <span class="trans">[' + trans + ']</span>'
+        return text
+
+    def tmpl_proto(self, s):
+        # s[1]: language name; s[2:-2]: proto-words; s[-2]: meaning.
+        # NOTE(review): the meaning is repeated after every word - confirm.
+        text = '<span class="proto-lang">Proto-' + s[1] + '</span>'
+        if len(s) >= 4:
+            for n, x in enumerate(s[2:-2]):
+                if n > 0:
+                    text += ','
+                text += ' <span class="proto">' + x + '</span>'
+                text += ' <span class="meaning">(' + s[-2] + ')</span>'
+        return text
+
+
+    def fmtlink(self, elt, istmpl):
+        # Render a LINK (istmpl false) or TMPL (istmpl true) node as HTML.
+        arg = self.format(elt['content'][0])
+        text = None
+        if len(elt['content']) > 1:
+            s = [self.format(x) for x in elt['content']]
+            if s[0] == 'disambigR' or s[0] == 'wikiquote':
+                return ""
+            elif len(s) > 1 and s[1] == 'thumb':
+                return ""
+            text = '<span class="template">' + s[1] + '</span>'
+            if istmpl:
+                if re.match("t[+-]$", s[0]):
+                    if len(s) > 2:
+                        text = s[2]
+                elif s[0] == "term":
+                    text = self.tmpl_term(s)
+                elif s[0] == "proto":
+                    text = self.tmpl_proto(s)
+                return text
+
+        (qual,sep,tgt) = arg.partition(':')
+        if tgt != '':
+            ns = self.wiki_ns_name(qual)
+            if ns:
+                if ns == 'NS_IMAGE':
+                    return ''
+                elif ns == 'NS_MEDIA':
+                    tgt = self.media_base + '/' + tgt
+                else:
+                    tgt = self.mktgt(tgt)
+            elif not istmpl and qual in self.langtab:
+                tgt = self.mktgt(tgt, qual)
+                if not text:
+                    text = self.langtab[qual]
+            else:
+                tgt = self.mktgt(tgt)
+        else:
+            tgt = self.mktgt(arg)
+        # Fall back to the raw link target when there is no link text.
+        return "<a href=\"%s\">%s</a>" % (tgt, text or arg)
+
+    def str_link(self, elt):  # LINK node
+        return self.fmtlink(elt, False)
+
+    def str_tmpl(self, elt):  # TMPL node
+        return self.fmtlink(elt, True)
+
+    def str_ref(self, elt):
+        # Render an external reference; the target URL doubles as the link
+        # text when the reference has none.
+        target = elt['ref']
+        text = self.format(elt['content'])
+        return "<a href=\"%s\">%s</a>" % (target, text or target)
+
+    def concat(self, eltlist):
+        """
+        Format every node in eltlist and return the concatenated output.
+        """
+        return "".join(self.format(x) for x in eltlist)
+
+    def str_it(self, elt):  # italics
+        return "<i>" + self.concat(elt['content']) + "</i>"
+
+    def str_bold(self, elt):  # bold face
+        return "<b>" + self.concat(elt['content']) + "</b>"
+
+    def str_hdr(self, elt):
+        # Wiki heading level N is rendered as <h(N+1)>; deeper levels are
+        # clamped to <h4>.
+        level = min(elt['level'] + 1, 4)
+        return "<h%s>%s</h%s>" % (level, self.format(elt['content']), level)
+
+    def str_bar(self):  # horizontal rule
+        return "<hr/>"
+
+    def str_env(self, elt):
+        """
+        Render a list environment (ENV node) with the tags from envt.
+
+        Every item is wrapped in the element tag selected by its
+        'subtype' and the result is enclosed in the container ("hdr")
+        tag.  The nesting level (elt['level']) does not affect the
+        output.
+        """
+        env = self.envt[elt['envtype']]
+        string = ""
+        for s in elt['content']:
+            tag = env["elt"][s['subtype']]
+            string += "<%s>%s</%s>" % (tag, self.format(s['content']), tag)
+        return "<%s>%s</%s>" % (env["hdr"], string, env["hdr"])
+
+    def str_tag(self, elt):  # 'code' -> <pre><code>; other tags pass through
+        if elt['tag'] == 'code':
+            self.nested += 1
+            s = self.format(elt['content'])
+            self.nested -= 1
+            return '<pre><code>' + s + '</code></pre>' #FIXME
+        else:
+            s = '<' + elt['tag']
+            if elt['args']:
+                s += ' ' + str(elt['args'])
+            s += '>'
+            s += self.format(elt['content'])
+            return s + '</' + elt['tag'] + '>'
+
+    def str_para(self, elt):
+        """
+        Render a paragraph node as <p>...</p>.
+        """
+        return "<p>" + self.concat(elt['content']) + "</p>"
+
+    def str_pre(self, elt):
+        # Preformatted text.  Inside a 'code' tag (self.nested is set by
+        # str_tag) the enclosing markup already supplies <pre>.
+        string = self.concat(elt['content'])
+        if self.nested:
+            return string
+        return '<pre>' + string + '</pre>'
+
+    def str_ind(self, elt):  # indentation as nested <dl><dd> pairs
+        return ("<dl><dd>" * elt['level']) + self.format(elt['content']) + "</dd></dl>" * elt['level']
+
+    def format(self, elt):
+        """
+        Render one parse-tree node as HTML.
+
+        Dispatches on elt['type'] to the matching str_* method; a node of
+        an unknown type is rendered as str(elt).
+        """
+        kind = elt['type']
+        if kind == 'TEXT':
+            if isinstance(elt['content'], list):
+                return "".join(elt['content'])
+            return elt['content']
+        elif kind == 'TAG':
+            return self.str_tag(elt)
+        elif kind == 'PARA':
+            return self.str_para(elt)
+        elif kind == 'PRE':
+            return self.str_pre(elt)
+        elif kind == 'IT':
+            return self.str_it(elt)
+        elif kind == 'BOLD':
+            return self.str_bold(elt)
+        elif kind == 'LINK':
+            return self.str_link(elt)
+        elif kind == 'TMPL':
+            return self.str_tmpl(elt)
+        elif kind == 'BAR':
+            return self.str_bar()
+        elif kind == 'HDR':
+            return self.str_hdr(elt)
+        elif kind == 'REF':
+            return self.str_ref(elt)
+        elif kind == 'ENV':
+            return self.str_env(elt)
+        elif kind == 'IND':
+            return self.str_ind(elt)
+        elif kind == 'SEQ':
+            return self.concat(elt['content'])
+        else:
+            return str(elt)
+
+    def __str__(self):
+        """
+        Return the HTML rendering of the whole parse tree (self.tree).
+        """
+        return "".join(self.format(elt) for elt in self.tree)
+
+class HtmlWiktionaryMarkup (HtmlWikiMarkup):
+    """
+ A class for translating Wiktionary articles into HTML.
+ This version does not do much, except that it tries to correctly
+ format templates. But "tries" does not mean "does". The heuristics
+ used here are clearly not enough to cope with it.
+
+ 1. FIXME:
+ The right solution would be to have a database of templates with their
+ semantics and to decide on their rendering depending on that. E.g.
+ {{term}} in en.wiktionary means "replace this with the search term".
+ This, however, does not work in other wiktionaries. There are
+ also more complex templates, e.g.: {{t+|bg|врата|n|p|tr=vrata|sc=Cyrl}}
+ I don't know what it means. Couldn't find any documentation either.
+ Again, this template does not work in other dictionaries.
+
+ 2. Capitulation notice:
+ Given the:
+   1. vast number of wiktionaries available,
+   2. abundance of various templates for each wiktionary,
+   3. apparent lack of documentation thereof,
+   4. the lack of standardized language-independent templates,
+ I don't see any way to cope with the template-rendering task within a
+ reasonable amount of time.
+
+ Feci quod potui, faciant meliora potentes.
+    """
diff --git a/WikiTrans/wiki2texi.py b/WikiTrans/wiki2texi.py
new file mode 100644
index 0000000..6e32c56
--- /dev/null
+++ b/WikiTrans/wiki2texi.py
@@ -0,0 +1,251 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2015 Sergey Poznyakoff
+# 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from wikimarkup import *
+from wikins import wiki_ns_re, wiki_ns
+import re
+import urllib
+
+class TexiWikiMarkup (WikiMarkup):
+    """
+    A Wiki->Texinfo translator class.
+    """
+    sectcomm = {
+        'numbered': [
+            '@top',
+            '@chapter',
+            '@section',
+            '@subsection',
+            '@subsubsection'
+        ],
+        'unnumbered': [
+            '@top',
+            '@unnumbered',
+            '@unnumberedsec',
+            '@unnumberedsubsec',
+            '@unnumberedsubsubsec'
+        ],
+        'appendix': [
+            '@top',
+            '@appendix',
+            '@appendixsec',
+            '@appendixsubsec',
+            '@appendixsubsubsec'
+        ],
+        'heading': [
+            '@majorheading',
+            '@chapheading',
+            '@heading',
+            '@subheading',
+            '@subsubheading'
+        ]
+    }
+
+    sectioning_model = 'numbered'  # key into sectcomm
+    sectioning_start = 0  # subtracted from the header level in str_hdr
+
+    def __init__(self, *args, **keywords):
+        # Accepts the optional "sectioning-model" and "sectioning-start"
+        # keywords; all arguments are also passed on to WikiMarkup.
+        super(TexiWikiMarkup, self).__init__(*args, **keywords)
+        if "sectioning-model" in keywords:
+            val = keywords["sectioning-model"]
+            if val not in self.sectcomm:
+                raise ValueError("Invalid value for sectioning model: %s" % val)
+            self.sectioning_model = val
+        if "sectioning-start" in keywords:
+            val = keywords["sectioning-start"]
+            if val < 0 or val > 4:
+                raise ValueError("Invalid value for sectioning start: %s" % val)
+            self.sectioning_start = val
+
+
+    def __str__(self):
+        """
+        Return the Texinfo rendering of the whole parse tree (self.tree).
+        """
+        return "".join(self.format(elt) for elt in self.tree)
+
+    def format(self, elt):
+        """
+        Render one parse-tree node as Texinfo, dispatching on its 'type'.
+        """
+        kind = elt['type']
+        if kind == 'TEXT':
+            if isinstance(elt['content'], list):
+                return "".join(elt['content'])
+            return elt['content']
+        elif kind == 'TAG':
+            return self.str_tag(elt)
+        elif kind == 'PARA':
+            return self.str_para(elt)
+        elif kind == 'PRE':
+            return self.str_pre(elt)
+        elif kind == 'IT':
+            return self.str_it(elt)
+        elif kind == 'BOLD':
+            return self.str_bold(elt)
+        elif kind == 'LINK':
+            return self.str_link(elt)
+        elif kind == 'TMPL':
+            return self.str_tmpl(elt)
+        elif kind == 'BAR':
+            return self.str_bar()
+        elif kind == 'HDR':
+            return self.str_hdr(elt)
+        elif kind == 'REF':
+            return self.str_ref(elt)
+        elif kind == 'ENV':
+            return self.str_env(elt)
+        elif kind == 'IND':
+            return self.str_ind(elt)
+        elif kind == 'SEQ':
+            return self.concat(elt['content'])
+        else:
+            return str(elt)
+
+    def str_tag(self, elt):  # code -> @example, tt -> @code, div -> @anchor
+        if elt['tag'] == 'code':
+            self.nested += 1
+            s = self.format(elt['content'])
+            self.nested -= 1
+            if not s.endswith("\n"):
+                s += "\n"
+            return '@example\n' + s + '@end example\n'
+        elif elt['tag'] == 'tt':
+            self.nested += 1
+            s = self.format(elt['content'])
+            self.nested -= 1
+            return "@code{%s}" % s
+        elif elt['tag'] == 'div':
+            s = ''
+            if 'args' in elt and elt['args'] and 'id' in elt['args']:
+                s += "\n@anchor{%s}\n" % elt['args']['id']
+            s += self.format(elt['content'])
+            return s
+        else:
+            s = '<' + elt['tag']
+            if elt['args']:
+                s += ' ' + str(elt['args'])
+            s += '>' + self.format(elt['content']) + '</' + elt['tag'] + '>'
+            return s
+
+    def str_para(self, elt):
+        """
+        Render a paragraph node, set off by a newline on either side.
+        """
+        return "\n" + self.concat(elt['content']) + "\n"
+
+    def str_pre(self, elt):
+        # Preformatted text becomes an @example block, unless we are inside
+        # a 'code' or 'tt' tag (self.nested is set by str_tag).
+        string = self.concat(elt['content'])
+        if self.nested:
+            return string
+        if not string.endswith("\n"):
+            string += "\n"
+        return '\n@example\n' + string + '@end example\n'
+
+    def concat(self, eltlist):
+        """
+        Format every node in eltlist and return the concatenated output.
+        """
+        return "".join(self.format(x) for x in eltlist)
+
+    def str_it(self, elt):  # italics
+        return "@i{" + self.concat(elt['content']) + "}"
+
+    def str_bold(self, elt):  # bold face
+        return "@b{" + self.concat(elt['content']) + "}"
+
+    def nodename(self, elt):  # text used on the @node line
+        return self.format(elt) # FIXME
+
+    def str_hdr(self, elt):  # levels past the command table become "@* title"
+        cmds = self.sectcomm[self.sectioning_model]
+        if elt['level'] > len(cmds) - 1 - self.sectioning_start:
+            s ="\n@* %s" % (self.format(elt['content']))
+        else:
+            s = cmds[elt['level'] - self.sectioning_start] + " " + self.format(elt['content']) + "\n"
+            if cmds[0] == '@top':
+                s += "@node %s\n" % (self.nodename(elt['content']))
+        return s + "\n"
+
+    def str_bar(self):  # horizontal rule
+        return "\n-----\n" # FIXME
+
+    def str_ind(self, elt):  # one "@w{ }" per indentation level
+        return ("@w{ }" * elt['level']) + self.format(elt['content']) + '\n'
+
+    def str_env(self, elt):  # lists: @itemize, @enumerate or @table @asis
+        if elt['envtype'] == 'unnumbered':
+            string = '\n@itemize @bullet\n'
+            for s in elt['content']:
+                string += '@item ' + self.format(s['content']) + '\n\n'
+            string += '@end itemize\n'
+        elif elt['envtype'] == 'numbered':
+            string = '\n@enumerate\n'
+            for s in elt['content']:
+                string += '@item ' + self.format(s['content']) + '\n\n'
+            string += '@end enumerate\n'
+        elif elt['envtype'] == 'defn':
+            string = "\n@table @asis\n"
+            for s in elt['content']:
+                if s['subtype'] == 0:
+                    string += "@item " + self.format(s['content']) + '\n'
+                else:
+                    string += self.format(s['content']) + '\n'
+            string += '@end table\n'
+        return string
+
+    def str_link(self, elt):
+        # FIXME: A very crude version
+        arg = self.format(elt['content'][0])
+        if len(elt['content']) > 1:
+            s = [self.format(x) for x in elt['content']]
+            text = s[1]
+        else:
+            s = None
+            text = None
+
+        if s:
+            if s[0] == 'disambigR' or s[0] == 'wikiquote':
+                return ""
+            if len(s) > 1 and s[1] == 'thumb':
+                return ""
+
+        (qual,sep,tgt) = arg.partition(':')
+        if text:
+            return "@ref{%s,%s}" % (qual, text)
+        else:
+            return "@ref{%s}" % qual
+
+    def str_tmpl(self, elt):  # templates are not supported yet
+        return "FIXME: str_tmpl not implemented\n"
+
+    def str_ref(self, elt):
+        # External reference: @uref{URL,TEXT}, or @uref{URL} without text.
+        target = elt['ref']
+        text = self.format(elt['content'])
+        if text:
+            return "@uref{%s,%s}" % (target, text)
+        return "@uref{%s}" % target
+    
+    
+        
+    
+    
diff --git a/WikiTrans/wiki2text.py b/WikiTrans/wiki2text.py
new file mode 100644
index 0000000..916391e
--- /dev/null
+++ b/WikiTrans/wiki2text.py
@@ -0,0 +1,266 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2008,2015 Sergey Poznyakoff
+# 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from wikimarkup import *
+from wikins import wiki_ns_re, wiki_ns
+import re
+import urllib
+
+class TextWikiMarkup (WikiMarkup):
+    """
+    A general-purpose Wiki->Text translator class.
+    """
+
+    # Output width
+    width = 78
+    # Do not show references.
+    references = False
+    # Provide a minimum markup
+    markup = True
+
+    # Number of current element in the environment
+    num = 0
+
+    def __init__(self, *args, **keywords):  # honours width, refs and markup
+        WikiMarkup.__init__(self, *args, **keywords)
+        if 'width' in keywords:
+            self.width = keywords['width']
+        if 'refs' in keywords:
+            self.references = keywords['refs']
+        if 'markup' in keywords:
+            self.markup = keywords['markup']
+
+    def xref(self, text, target):  # "text (see target) " or "see target"
+        if text:
+            return "%s (see %s) " % (text, target)
+        else:
+            return "see " + target
+
+    def wiki_ns_name(self, str):  # namespace prefix -> wiki_ns value or None
+        if str in wiki_ns[self.lang]:
+            return wiki_ns[self.lang][str]
+        elif str in wiki_ns_re[self.lang]:
+            for elt in wiki_ns_re[self.lang][str]:
+                if str.startswith(elt[0]) and str.endswith(elt[1]):
+                    return elt[2]
+        return None
+
+    def mktgt(self, tgt, lang = None):  # URL of page tgt (lang: self.lang)
+        if not lang:
+            lang = self.lang
+        return self.html_base % { 'lang' : lang } + urllib.quote(tgt)
+
+    def fmtlink(self, elt, istmpl):
+        # Render a LINK (istmpl false) or TMPL (istmpl true) node.  With
+        # self.references set, the target URL is appended as "(see URL)".
+        arg = self.format(elt['content'][0])
+        if len(elt['content']) > 1:
+            s = [self.format(x) for x in elt['content']]
+            text = s[1]
+        else:
+            s = None
+            text = None
+
+        if s:
+            if s[0] == 'disambigR' or s[0] == 'wikiquote':
+                return ""
+            if len(s) > 1 and s[1] == 'thumb':
+                return ""
+        (qual,sep,tgt) = arg.partition(':')
+        if tgt != '':
+            ns = self.wiki_ns_name(qual)
+            if ns:
+                if ns == 'NS_IMAGE':
+                    if not self.references:
+                        return ""
+                    text = "[%s: %s]" % (qual, text if text else arg)
+                    tgt = self.image_base + '/' + \
+                                 urllib.quote(tgt) + \
+                                 '/250px-' + urllib.quote(tgt)
+                elif ns == 'NS_MEDIA':
+                    text = "[%s]" % (qual)
+                else:
+                    tgt = self.mktgt(tgt)
+            elif not istmpl and qual in self.langtab:
+                text = self.langtab[qual] + ": " + tgt
+                tgt = self.mktgt(tgt, qual)
+            else:
+                tgt = self.mktgt(tgt)
+        else:
+            tgt = self.mktgt(arg)
+        if self.references:
+            # Never print a literal "None" for a link without display text.
+            return "%s (see %s) " % (text or arg, tgt)
+        return text or arg
+
+    def indent (self, lev, text):
+        # Indent text by lev spaces.  In multi-line text empty lines are
+        # dropped and a trailing newline is kept only if text had one.
+        if text.find('\n') == -1:
+            s = (" " * lev) + text
+        else:
+            s = ""
+            for elt in text.split('\n'):
+                if elt:
+                    s += (" " * lev) + elt + '\n'
+            if not text.endswith('\n'):
+                s = s.rstrip('\n')
+        return s
+
+    def fmtpara(self, input):  # re-wrap the words of input to self.width
+        output = ""
+        linebuf = ""
+        length = 0
+        for s in input.split():
+            wlen = len(s)
+            if linebuf.endswith("."):
+                wsc = 2
+            else:
+                wsc = 1
+            if length + wsc + wlen > self.width:
+                # FIXME: fill out linebuf
+                output += linebuf + '\n'
+                wsc = 0
+                length = 0
+                linebuf = ""
+            linebuf += " " * wsc + s
+            length += wsc + wlen
+        return output + linebuf
+
+    def str_tag(self, elt):  # 'code' content is emitted bare; others as-is
+        if elt['tag'] == 'code':
+            self.nested += 1
+            s = self.format(elt['content'])
+            self.nested -= 1
+            return s #FIXME
+        else:
+            s = '<' + elt['tag']
+            if elt['args']:
+                s += ' ' + str(elt['args'])
+            s += '>' + self.format(elt['content']) + '</' + elt['tag'] + '>'
+            return s
+
+    def format(self, elt):
+        """
+        Render one parse-tree node as plain text, dispatching on its
+        'type'; a node of an unknown type is rendered as str(elt).
+        """
+        if elt['type'] == 'TEXT':
+            if isinstance(elt['content'],list):
+                string = ""
+                for s in elt['content']:
+                    if string:
+                        if string.endswith("."):
+                            string += "  "
+                        else:
+                            string += " "
+                    string += s
+            else:
+                string = elt['content']
+        elif elt['type'] == 'PRE':
+            string = "".join(self.format(x) for x in elt['content']) + '\n'
+        elif elt['type'] == 'PARA':
+            string = "".join(self.format(x) for x in elt['content'])
+            string = self.fmtpara(string) + '\n\n'
+        elif elt['type'] == 'TAG':
+            string = self.str_tag(elt)
+        elif elt['type'] == 'IT':
+            string = ""
+            for x in elt['content']:
+                s = self.format(x)
+                if s:
+                    string += " " + s
+            string = "_" + string.lstrip(" ") + "_"
+        elif elt['type'] == 'BOLD':
+            string = ""
+            for x in elt['content']:
+                s = self.format(x)
+                if s:
+                    if string.endswith("."):
+                        string += "  "
+                    else:
+                        string += " "
+                string += s
+            string = string.upper()
+        elif elt['type'] == 'LINK':
+            string = self.fmtlink(elt, False)
+        elif elt['type'] == 'TMPL':
+            s = self.fmtlink(elt, True)
+            if s:
+                string = '[' + s + ']'
+            else:
+                string = s
+        elif elt['type'] == 'BAR':
+            w = self.width
+            if w < 5:
+                w = 5
+            string = "\n" + ("-" * (w - 5)).center(w - 1) + "\n"
+        elif elt['type'] == 'HDR':
+            level = elt['level']
+            string = "\n" + ("*" * level) + " " + \
+                      self.format(elt['content']).lstrip(" ") + "\n\n"
+        elif elt['type'] == 'REF':
+            string = self.xref(self.format(elt['content']), elt['ref'])
+        elif elt['type'] == 'ENV':
+            envtype = elt['envtype']
+            lev = elt['level']
+            # A nesting level too deep for the output width is reset to 1.
+            if lev > self.width - 4:
+                lev = 1
+            string = ""
+            n = 1
+            for s in elt['content']:
+                if not string.endswith("\n"):
+                    string += "\n"
+                x = self.format(s['content'])
+                if envtype == "unnumbered":
+                    string += self.fmtpara(self.indent(lev, "- " + x.lstrip(" ")))
+                elif envtype == "numbered":
+                    string += self.fmtpara(self.indent(lev, "%d. %s" % (n, x)))
+                    n += 1
+                elif envtype == "defn":
+                    if s['subtype'] == 0:
+                        string += self.indent(lev-1, x)
+                    else:
+                        string += self.indent(lev+3, x)
+
+            if not string.endswith("\n"):
+                string += "\n"
+        elif elt['type'] == 'IND':
+            string = (" " * elt['level']) + self.format(elt['content']) + '\n'
+        elif elt['type'] == 'SEQ':
+            string = ""
+            for x in elt['content']:
+                if len(string) > 1 and not string[-1].isspace():
+                    string += ' '
+                string += self.format(x)
+        else:
+            string = str(elt)
+        return string
+
+    def __str__(self):
+        """
+        Return the text rendering of the whole parse tree (self.tree).
+        """
+        return "".join(self.format(elt) for elt in self.tree)
+
+class TextWiktionaryMarkup (TextWikiMarkup):
+    """
+ See the documentation of HtmlWiktionaryMarkup.
+    """
+    # FIXME: It is supposed to do something about templates
+
diff --git a/WikiTrans/wikimarkup.py b/WikiTrans/wikimarkup.py
new file mode 100644
index 0000000..2ef6be1
--- /dev/null
+++ b/WikiTrans/wikimarkup.py
@@ -0,0 +1,1215 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2008, 2009, 2015 Sergey Poznyakoff
+# 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from __future__ import print_function
+import sys
+import re
+from types import *
+
+__all__ = [ "BaseWikiMarkup", "WikiMarkup",
+            "TagAttributes", "TagAttributeSyntax" ]
+
+class TagAttributeSyntax(Exception):  # raised by TagAttributes on bad input
+    def __init__(self, value):  # value: the text that could not be parsed
+        self.value = value
+    def __str__(self):
+        return repr(self.value)
+
+class TagAttributes(object):
+    """Tag attributes parsed from a string such as 'id="x" flag'."""
+    attrstart = re.compile("^(?P<attr>[a-zA-Z0-9_-]+)(?P<eq>=\")?")
+    valseg = re.compile(r'^[^\\"]+')  # run without backslash or quote
+    tab = {}
+    printable = None
+    def __init__(self, string):
+        self.tab = {}  # per-instance table; the class-level one is shared
+        self.printable = string or ''
+        s = self.printable.strip()
+        while s != '':
+            m = self.attrstart.match(s)
+            if not m:
+                raise TagAttributeSyntax(s)
+            name = m.group('attr')
+            val = 1  # value of an attribute given without ="..."
+            s = s[m.end(0):]
+            if m.group('eq'):
+                val = ''
+                while 1:
+                    m = self.valseg.match(s)
+                    if m:
+                        val += m.group(0)
+                        s = s[m.end(0):]
+                    if len(s) > 1 and s[0] == '\\':
+                        val += s[1]
+                        s = s[2:]
+                    elif s[:1] == '"':
+                        s = s[1:]
+                        break
+                    else:
+                        # Unterminated value or dangling backslash.
+                        raise TagAttributeSyntax(string)
+            self.tab[name] = val
+            s = s.lstrip()
+    def __len__(self):
+        return len(self.tab)
+    def __getitem__(self, key):
+        return self.tab[key]
+    def __contains__(self, key):
+        return key in self.tab
+    def __iter__(self):
+        for key in self.tab:
+            yield(key)
+    def has_key(self, key):
+        return self.__contains__(key)
+    def __setitem__(self, key, value):
+        self.tab[key] = value
+    def __delitem__(self, key):
+        del self.tab[key]
+    def __str__(self):
+        return self.printable
+    def __repr__(self):
+        return self.printable
+
+class BaseWikiMarkup(object):
+
+    delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)|<")
+    otag = re.compile("<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>[^>]+))?\s*(?P<closed>/)?>")
+    ctag = re.compile("</(?P<tag>[a-zA-Z0-9_]+)\s*>")
+    refstart = re.compile("^https?://")
+    
+    close_delim = {
+        '[': ']',
+        '[[': ']]',
+        '{{': '}}'
+    }
+
+    # Environment types:
+    envtypes = { "*": [ "unnumbered", 0 ],
+                 "#": [ "numbered", 0 ],
+                 ";": [ "defn", 0 ],
+                 ":": [ "defn", 1 ]
+    }
+
+    toklist = None
+    tokind = 0
+    newline = 0
+    tree = None
+
+    tags = [ 'code', 'nowiki', 'tt', 'div' ]
+    
+    nested = 0
+    debug_level = 0
+    
+    def dprint(self, lev, fmt, *argv):
+        if self.debug_level >= lev:
+            print("[DEBUG]", fmt % argv)
+
+    def print_dump_prefix(self, level, file):
+        file.write("[DUMP]" + ' ' * (2*level + 1))
+
+    def dump_nil(self, node, level, file):
+        pass
+    
+    def dump_text(self, node, level, file):
+        self.print_dump_prefix(level, file)
+        file.write("CONTENT: \"%s\"\n" % node['content'])
+    
+    def dump_delim(self, node, level, file):
+        file.write("'%s'" % node['content'])
+        if 'continuation' in node and node['continuation']:
+            file.write(" (cont)")
+        file.write("\n")
+                       
+    def dump_tag(self, node, level, file):
+        self.print_dump_prefix(level, file)
+        file.write("TAG: %s\n" % node['tag'])
+        if 'args' in node:
+            self.print_dump_prefix(level, file)
+            file.write("ARGS: %s\n" % node['args'])
+        if 'content' in node:
+            self.dump_node(node['content'], level + 1, file)
+    
+    def dump_seq(self, n
author	Sergey Poznyakoff <gray@gnu.org.ua>	2015-07-16 13:20:06 +0300
committer	Sergey Poznyakoff <gray@gnu.org.ua>	2015-07-16 13:26:15 +0300
commit	eaf9325ddcff786f3fcd5b9047327ef6e397e778 (patch)
tree	2f0336efbb1deab9651c5eeb1b5dd753538a5c8e /WikiTrans
parent	8e11d7f20459697c883df1e421df02006f749792 (diff)
download	wikitrans-eaf9325ddcff786f3fcd5b9047327ef6e397e778.tar.gz wikitrans-eaf9325ddcff786f3fcd5b9047327ef6e397e778.tar.bz2