diff options
author | Sergey Poznyakoff <gray@gnu.org.ua> | 2015-07-16 13:20:06 +0300 |
---|---|---|
committer | Sergey Poznyakoff <gray@gnu.org.ua> | 2015-07-16 13:26:15 +0300 |
commit | eaf9325ddcff786f3fcd5b9047327ef6e397e778 (patch) | |
tree | 2f0336efbb1deab9651c5eeb1b5dd753538a5c8e /WikiTrans | |
parent | 8e11d7f20459697c883df1e421df02006f749792 (diff) | |
download | wikitrans-eaf9325ddcff786f3fcd5b9047327ef6e397e778.tar.gz wikitrans-eaf9325ddcff786f3fcd5b9047327ef6e397e778.tar.bz2 |
Restructure the package.
The idea is to switch from using this project as a git submodule
to having it distributed via PyPI. Since the name 'wit' is already
registered there, the package is renamed to 'wikitrans'.
* setup.py: Use setuptools
Rename package to wikitrans.
* wikicvt.py: Remove. Replaced with:
* bin/wikitrans: New file.
* __init__.py: Move to WikiTrans/__init__.py
* wiki2html.py: Move to WikiTrans/wiki2html.py
* wiki2texi.py: Move to WikiTrans/wiki2texi.py
* wiki2text.py: Move to WikiTrans/wiki2text.py
* wikimarkup.py: Move to WikiTrans/wikimarkup.py
* wikins.py: Move to WikiTrans/wikins.py
* test.py: Move to tests/test.py
* MANIFEST.in: New file.
* README.rst: New file.
* .gitignore: Update.
Diffstat (limited to 'WikiTrans')
-rw-r--r-- | WikiTrans/__init__.py | 18 | ||||
-rw-r--r-- | WikiTrans/wiki2html.py | 283 | ||||
-rw-r--r-- | WikiTrans/wiki2texi.py | 251 | ||||
-rw-r--r-- | WikiTrans/wiki2text.py | 266 | ||||
-rw-r--r-- | WikiTrans/wikimarkup.py | 1215 | ||||
-rw-r--r-- | WikiTrans/wikins.py | 3040 |
6 files changed, 5073 insertions, 0 deletions
diff --git a/WikiTrans/__init__.py b/WikiTrans/__init__.py new file mode 100644 index 0000000..ad99ce3 --- /dev/null +++ b/WikiTrans/__init__.py @@ -0,0 +1,18 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +# Copyright (C) 2008, 2015 Sergey Poznyakoff +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +__all__ = [ "wikimarkup", "wiki2html", "wiki2text", "wiki2texi", "wikins" ] diff --git a/WikiTrans/wiki2html.py b/WikiTrans/wiki2html.py new file mode 100644 index 0000000..754fa9b --- /dev/null +++ b/WikiTrans/wiki2html.py @@ -0,0 +1,283 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +# Copyright (C) 2008,2015 Sergey Poznyakoff +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ +from wikimarkup import * +from wikins import wiki_ns_re, wiki_ns +import re +try: + from urllib import quote as url_quote +except ImportError: + from urllib.parse import quote as url_quote + +__all__ = [ "HtmlWikiMarkup", "HtmlWiktionaryMarkup" ] + +class HtmlWikiMarkup (WikiMarkup): + """ + A (hopefully) general-purpose Wiki->HTML translator class. + FIXME: 1. See WikiMarkup for a list + 2. [[official position]]s : final 's' gets after closing </a> tag. + Should be before. + """ + + def wiki_ns_name(self, str): + if str in wiki_ns[self.lang]: + return wiki_ns[self.lang][str] + elif str in wiki_ns_re[self.lang]: + for elt in wiki_ns_re[self.lang][str]: + if str.beginswith(elt[0]) and str.endswith(elt[1]): + return elt[2] + return None + + envt = { "unnumbered": { "hdr": "ul", + "elt": ["li"] }, + "numbered": { "hdr": "ol", + "elt": ["li"] }, + "defn": { "hdr": "dl", + "elt": ["dt","dd"] } } + + def mktgt(self, tgt, lang = None): + if not lang: + lang = self.lang + return self.html_base % { 'lang' : lang } + url_quote(tgt) + + def tmpl_term(self, s): + if len(s) == 2: + return s[1] + text = None + trans = None + for x in s[1:]: + m = re.match('(\w+)=', x) + if m: + if m.group(1) == "tr": + trans = x[m.end(1)+1:] + elif not text: + text = x + if text: + if trans: + text += ' <span class="trans">[' + trans + ']</span>' + return text + + def tmpl_proto(self, s): + text = '<span class="proto-lang">Proto-' + s[1] + '</span>' + if len(s) >= 4: + n = 0 + for x in s[2:-2]: + if n > 0: + text += ',' + n += 1 + text += ' <span class="proto">' + x + '</span>' + text += ' <span class="meaning">(' + s[-2] + ')</span>' + return text + + + def fmtlink(self, elt, istmpl): + arg = self.format(elt['content'][0]) + text = None + if len(elt['content']) > 1: + s = [x for x in map(self.format, elt['content'])] + if s[0] == 'disambigR' or s[0] == 'wikiquote': + return "" + elif len(s) > 1 and s[1] == 'thumb': + return "" + text = '<span class="template">' + s[1] + '</span>' + if 
istmpl: + if re.match("t[+-]$", s[0]): + if len(s) > 2: + text = s[2] + elif s[0] == "term": + text = self.tmpl_term(s) + elif s[0] == "proto": + text = self.tmpl_proto(s) + return text + + (qual,sep,tgt) = arg.partition(':') + if tgt != '': + ns = self.wiki_ns_name(qual) + if ns: + if ns == 'NS_IMAGE': + return '' + elif ns == 'NS_MEDIA': + tgt = self.media_base + '/' + tgt + else: + tgt = self.mktgt(tgt) + elif not istmpl and qual in self.langtab: + tgt = self.mktgt(tgt, qual) + if not text or text == '': + text = self.langtab[qual] + else: + tgt = self.mktgt(tgt) + else: + tgt = self.mktgt(arg) + return "<a href=\"%s\">%s</a>" % (tgt, + text if (text and text != '') \ + else arg) + + def str_link(self, elt): + return self.fmtlink(elt, False) + + def str_tmpl(self, elt): + return self.fmtlink(elt, True) + + def str_ref(self, elt): + target = elt['ref'] + text = self.format(elt['content']) + return "<a href=\"%s\">%s</a>" % (target, + text if (text and text != '') \ + else target) + + def concat(self, eltlist): + string = "" + for x in eltlist: + string += self.format(x) + return string + + def str_it(self, elt): + return "<i>" + self.concat(elt['content']) + "</i>" + + def str_bold(self, elt): + return "<b>" + self.concat(elt['content']) + "</b>" + + def str_hdr(self, elt): + level = elt['level'] + 1 + if level > 4: + level = 4 + return "<h%s>%s</h%s>" % (level, self.format(elt['content']), level) + + def str_bar(self): + return "<hr/>" + + def str_env(self, elt): + type = elt['envtype'] + lev = elt['level'] + if lev > 4: + lev = 2 + string = "" + for s in elt['content']: + n = s['subtype']; + string += "<%s>%s</%s>" % (self.envt[type]["elt"][n], + self.format(s['content']), + self.envt[type]["elt"][n]) + return "<%s>%s</%s>" % (self.envt[type]["hdr"], + string, + self.envt[type]["hdr"]) + return string + + def str_tag(self, elt): + if elt['tag'] == 'code': + self.nested += 1 + s = self.format(elt['content']) + self.nested -= 1 + return '<pre><code>' + s + 
'</code></pre>' #FIXME + else: + s = '<' + elt['tag'] + if elt['args']: + s += ' ' + str(elt['args']) + s += '>' + s += self.format(elt['content']) + return s + '</' + elt['tag'] + '>' + + def str_para(self, elt): + string = ""; + for x in elt['content']: + string += self.format(x) + return "<p>" + string + "</p>" + + def str_pre(self, elt): + string = ""; + for x in elt['content']: + string += self.format(x) + if self.nested: + return string + return '<pre>' + string + '</pre>' + + def str_ind(self, elt): + return ("<dl><dd>" * elt['level']) + self.format(elt['content']) + "</dd></dl>" * elt['level'] + + def format(self, elt): + if elt['type'] == 'TEXT': + if isinstance(elt['content'],list): + string = "" + for s in elt['content']: + string += s + else: + string = elt['content'] + return string + elif elt['type'] == 'TAG': + return self.str_tag(elt) + elif elt['type'] == 'PARA': + return self.str_para(elt) + elif elt['type'] == 'PRE': + return self.str_pre(elt) + elif elt['type'] == 'IT': + return self.str_it(elt) + elif elt['type'] == 'BOLD': + return self.str_bold(elt) + elif elt['type'] == 'LINK': + return self.str_link(elt) + elif elt['type'] == 'TMPL': + return self.str_tmpl(elt) + elif elt['type'] == 'BAR': + return self.str_bar() + elif elt['type'] == 'HDR': + return self.str_hdr(elt) + elif elt['type'] == 'REF': + return self.str_ref(elt) + elif elt['type'] == 'ENV': + return self.str_env(elt) + elif elt['type'] == 'IND': + return self.str_ind(elt) + elif elt['type'] == 'SEQ': + string = "" + for x in elt['content']: + string += self.format(x) + return string + else: + return str(elt) + + def __str__(self): + str = "" + for elt in self.tree: + str += self.format(elt) + return str + +class HtmlWiktionaryMarkup (HtmlWikiMarkup): + """ + A class for translating Wiktionary articles into HTML. + This version does not do much, except that it tries to correctly + format templates. But "tries" does not mean "does". 
The heuristics + used here are clearly not enough to cope with it. + + 1. FIXME: + The right solution would be to have a database of templates with their + semantics and to decide on their rendering depending on that. E.g. + {{term}} in en.wiktionary means "replace this with the search term". + This, however, does not work in other wiktionaries. There are + also more complex templates, e.g.: {{t+|bg|врата|n|p|tr=vrata|sc=Cyrl}} + I don't know what it means. Couldn't find any documentation either. + Again, this template does not work in other dictionaries. + + 2. Capitulation notice: + Given the: + 1. vast amount of wiktionaries available, + 2. abundance of various templates for each wiktionary, + 3. apparent lack of documentation thereof, + 4. the lack of standardized language-independent templates, + I don't see any way to cope with the template-rendering task within a + reasonable amount of time. + + Feci quod potui, faciant meliora potentes. + """ diff --git a/WikiTrans/wiki2texi.py b/WikiTrans/wiki2texi.py new file mode 100644 index 0000000..6e32c56 --- /dev/null +++ b/WikiTrans/wiki2texi.py @@ -0,0 +1,251 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +# Copyright (C) 2015 Sergey Poznyakoff +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ +from wikimarkup import * +from wikins import wiki_ns_re, wiki_ns +import re +import urllib + +class TexiWikiMarkup (WikiMarkup): + sectcomm = { + 'numbered': [ + '@top', + '@chapter', + '@section', + '@subsection', + '@subsubsection' + ], + 'unnumbered': [ + '@top', + '@unnumbered', + '@unnumberedsec', + '@unnumberedsubsec', + '@unnumberedsubsubsec' + ], + 'appendix': [ + '@top', + '@appendix', + '@appendixsec', + '@appendixsubsec', + '@appendixsubsubsec' + ], + 'heading': [ + '@majorheading' + '@chapheading', + '@heading', + '@subheading', + '@subsubheading' + ] + } + + sectioning_model = 'numbered' + sectioning_start = 0 + + def __init__(self, *args, **keywords): + super(TexiWikiMarkup, self).__init__(*args, **keywords) + if "sectioning-model" in keywords: + val = keywords["sectioning-model"] + if val in self.sectcomm: + self.sectioning_model = val + else: + raise ValueError("Invalid value for sectioning model: %s" % val) + if "sectioning-start" in keywords: + val = keywords["sectioning-start"] + if val < 0 or val > 4: + raise ValueError("Invalid value for sectioning start: %s" % val) + else: + self.sectioning_start = val + + + def __str__(self): + str = "" + for elt in self.tree: + str += self.format(elt) + return str + + def format(self, elt): + if elt['type'] == 'TEXT': + if isinstance(elt['content'],list): + string = "" + for s in elt['content']: + string += s + else: + string = elt['content'] + return string + elif elt['type'] == 'TAG': + return self.str_tag(elt) + elif elt['type'] == 'PARA': + return self.str_para(elt) + elif elt['type'] == 'PRE': + return self.str_pre(elt) + elif elt['type'] == 'IT': + return self.str_it(elt) + elif elt['type'] == 'BOLD': + return self.str_bold(elt) + elif elt['type'] == 'LINK': + return self.str_link(elt) + elif elt['type'] == 'TMPL': + return self.str_tmpl(elt) + elif elt['type'] == 'BAR': + return self.str_bar() + elif elt['type'] == 'HDR': + return self.str_hdr(elt) + elif elt['type'] == 'REF': + return 
self.str_ref(elt) + elif elt['type'] == 'ENV': + return self.str_env(elt) + elif elt['type'] == 'IND': + return self.str_ind(elt) + elif elt['type'] == 'SEQ': + string = "" + for x in elt['content']: + string += self.format(x) + return string + else: + return str(elt) + + def str_tag(self, elt): + if elt['tag'] == 'code': + self.nested += 1 + s = self.format(elt['content']) + self.nested -= 1 + if not s.endswith("\n"): + s += "\n" + return '@example\n' + s + '@end example\n' + elif elt['tag'] == 'tt': + self.nested += 1 + s = self.format(elt['content']) + self.nested -= 1 + return "@code{%s}" % s + elif elt['tag'] == 'div': + s = '' + if 'args' in elt and 'id' in elt['args']: + s += "\n@anchor{%s}\n" % elt['args']['id'] + s += self.format(elt['content']) + return s + else: + s = '<' + elt['tag'] + if elt['args']: + s += ' ' + elt['args'] + s += '>' + self.format(elt['content']) + '</' + elt['tag'] + '>' + return s + + def str_para(self, elt): + string = ""; + for x in elt['content']: + string += self.format(x) + return "\n" + string + "\n" + + def str_pre(self, elt): + string = ""; + for x in elt['content']: + string += self.format(x) + if self.nested: + return string + if not string.endswith("\n"): + string += "\n"; + return '\n@example\n' + string + '@end example\n' + + def concat(self, eltlist): + string = "" + for x in eltlist: + string += self.format(x) + return string + + def str_it(self, elt): + return "@i{" + self.concat(elt['content']) + "}" + + def str_bold(self, elt): + return "@b{" + self.concat(elt['content']) + "}" + + def nodename(self, elt): + return self.format(elt) # FIXME + + def str_hdr(self, elt): + level = elt['level'] + if level > len(self.sectcomm[self.sectioning_model]) - 1 - self.sectioning_start: + s ="\n@* %s" % (self.format(elt['content'])) + else: + s = self.sectcomm[self.sectioning_model][level - self.sectioning_start] + " " + self.format(elt['content']) + "\n" + if self.sectcomm[self.sectioning_model][0] == '@top': + s += "@node 
%s\n" % (self.nodename(elt['content'])) + return s + "\n" + + def str_bar(self): + return "\n-----\n" # FIXME + + def str_ind(self, elt): + return ("@w{ }" * elt['level']) + self.format(elt['content']) + '\n' + + def str_env(self, elt): + if elt['envtype'] == 'unnumbered': + string = '\n@itemize @bullet\n' + for s in elt['content']: + string += '@item ' + self.format(s['content']) + '\n\n' + string += '@end itemize\n' + elif elt['envtype'] == 'numbered': + string = '\n@enumerate\n' + for s in elt['content']: + string += '@item ' + self.format(s['content']) + '\n\n' + string += '@end enumerate\n' + elif elt['envtype'] == 'defn': + string = "\n@table @asis\n" + for s in elt['content']: + if s['subtype'] == 0: + string += "@item " + self.format(s['content']) + '\n' + else: + string += self.format(s['content']) + '\n' + string += '@end table\n' + return string + + def str_link(self, elt): + # FIXME: A very crude version + arg = self.format(elt['content'][0]) + if len(elt['content']) > 1: + s = [x for x in map(self.format, elt['content'])] + text = s[1] + else: + s = None + text = None + + if s: + if s[0] == 'disambigR' or s[0] == 'wikiquote': + return "" + if len(s) > 1 and s[1] == 'thumb': + return "" + + (qual,sep,tgt) = arg.partition(':') + if text: + return "@ref{%s,%s}" % (qual, text) + else: + return "@ref{%s}" % qual + + def str_tmpl(self, elt): + return "FIXME: str_tmpl not implemented\n" + + def str_ref(self, elt): + target = elt['ref'] + text = self.format(elt['content']) + if text and text != '': + return "@uref{%s,%s}" % (target, text) + else: + return "@uref{%s}" % target + + + + + diff --git a/WikiTrans/wiki2text.py b/WikiTrans/wiki2text.py new file mode 100644 index 0000000..916391e --- /dev/null +++ b/WikiTrans/wiki2text.py @@ -0,0 +1,266 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +# Copyright (C) 2008,2015 Sergey Poznyakoff +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public 
License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +from wikimarkup import * +from wikins import wiki_ns_re, wiki_ns +import re +import urllib + +class TextWikiMarkup (WikiMarkup): + """ + A (general-purpose Wiki->Text translator class. + """ + + # Output width + width = 78 + # Do not show references. + references = False + # Provide a minimum markup + markup = True + + # Number of current element in the environment + num = 0 + + def __init__(self, *args, **keywords): + WikiMarkup.__init__(self, *args, **keywords) + if 'width' in keywords: + self.width = keywords['width'] + if 'refs' in keywords: + self.references = keywords['refs'] + if 'markup' in keywords: + self.markup = keywords['markup'] + + def xref(self, text, target): + if text: + return "%s (see %s) " % (text, target) + else: + return "see " + target + + def wiki_ns_name(self, str): + if str in wiki_ns[self.lang]: + return wiki_ns[self.lang][str] + elif str in wiki_ns_re[self.lang]: + for elt in wiki_ns_re[self.lang][str]: + if str.beginswith(elt[0]) and str.endswith(elt[1]): + return elt[2] + return None + + def mktgt(self, tgt, lang = None): + if not lang: + lang = self.lang + return self.html_base % { 'lang' : lang } + urllib.quote(tgt) + + def fmtlink(self, elt, istmpl): + arg = self.format(elt['content'][0]) + if len(elt['content']) > 1: + s = [x for x in map(self.format, elt['content'])] + text = s[1] + else: + s = None + text = None + + if s: + if s[0] == 'disambigR' or s[0] == 'wikiquote': + return "" + if len(s) > 1 and s[1] == 'thumb': + return 
"" + (qual,sep,tgt) = arg.partition(':') + if tgt != '': + ns = self.wiki_ns_name(qual) + if ns: + if ns == 'NS_IMAGE': + if not self.references: + return "" + text = "[%s: %s]" % (qual, text if text else arg) + tgt = self.image_base + '/' + \ + urllib.quote(tgt) + \ + '/250px-' + urllib.quote(tgt) + elif ns == 'NS_MEDIA': + text = "[%s]" % (qual) + else: + tgt = self.mktgt(tgt) + elif not istmpl and qual in self.langtab: + text = self.langtab[qual] + ": " + tgt + tgt = self.mktgt(tgt, qual) + else: + tgt = self.mktgt(tgt) + else: + tgt = self.mktgt(arg) + if self.references: + return "%s (see %s) " % (text, tgt) + elif not text or text == '': + return arg + else: + return text + + def indent (self, lev, text): + if text.find('\n') == -1: + s = (" " * lev) + text + else: + s = "" + for elt in text.split('\n'): + if elt: + s += (" " * lev) + elt + '\n' + if not text.endswith('\n'): + s = s.rstrip('\n') +# print "IN: '%s'" % (text) +# print "OUT: '%s'" % (s) + return s + + def fmtpara(self, input): + output = "" + linebuf = "" + length = 0 + for s in input.split(): + wlen = len(s) + if linebuf.endswith("."): + wsc = 2 + else: + wsc = 1 + if length + wsc + wlen > self.width: + # FIXME: fill out linebuf + output += linebuf + '\n' + wsc = 0 + length = 0 + linebuf = "" + linebuf += " " * wsc + s + length += wsc + wlen + return output + linebuf + + def str_tag(self, elt): + if elt['tag'] == 'code': + self.nested += 1 + s = self.format(elt['content']) + self.nested -= 1 + return s #FIXME + else: + s = '<' + elt['tag'] + if elt['args']: + s += ' ' + str(elt['args']) + s += '>' + self.format(elt['content']) + '</' + elt['tag'] + '>' + return s + + def format(self, elt): + if elt['type'] == 'TEXT': + if isinstance(elt['content'],list): + string = "" + for s in elt['content']: + if string: + if string.endswith("."): + string += " " + else: + string += " " + string += s + else: + string = elt['content'] + elif elt['type'] == 'PRE': + string = "" + for x in elt['content']: + 
string += self.format(x) + string += '\n' + elif elt['type'] == 'PARA': + string = ""; + for x in elt['content']: + string += self.format(x) + string = self.fmtpara(string) + '\n\n' + elif elt['type'] == 'TAG': + string = self.str_tag(elt) + elif elt['type'] == 'IT': + string = "" + for x in elt['content']: + s = self.format(x) + if s: + string += " " + s + string = "_" + string.lstrip(" ") + "_" + elif elt['type'] == 'BOLD': + string = "" + for x in elt['content']: + s = self.format(x) + if s: + if string.endswith("."): + string += " " + else: + string += " " + string += s + string = string.upper() + elif elt['type'] == 'LINK': + string = self.fmtlink(elt, False) + elif elt['type'] == 'TMPL': + s = self.fmtlink(elt, True) + if s: + string = '[' + s + ']' + else: + string = s + elif elt['type'] == 'BAR': + w = self.width + if w < 5: + w = 5 + string = "\n" + ("-" * (w - 5)).center(w - 1) + "\n" + elif elt['type'] == 'HDR': + level = elt['level'] + string = "\n" + ("*" * level) + " " + \ + self.format(elt['content']).lstrip(" ") + "\n\n" + elif elt['type'] == 'REF': + string = self.xref(self.format(elt['content']), elt['ref']) + elif elt['type'] == 'ENV': + type = elt['envtype'] + lev = elt['level'] + if lev > self.width - 4: + lev = 1 + string = "" + n = 1 + for s in elt['content']: + if not string.endswith("\n"): + string += "\n" + x = self.format(s['content']) + if type == "unnumbered": + string += self.fmtpara(self.indent(lev, "- " + x.lstrip(" "))) + elif type == "numbered": + string += self.fmtpara(self.indent(lev, "%d. 
%s" % (n, x))) + n += 1 + elif type == "defn": + if s['subtype'] == 0: + string += self.indent(lev-1, x) + else: + string += self.indent(lev+3, x) + + if not string.endswith("\n"): + string += "\n" + elif elt['type'] == 'IND': + string = (" " * elt['level']) + self.format(elt['content']) + '\n' + elif elt['type'] == 'SEQ': + string = "" + for x in elt['content']: + if len(string) > 1 and not string[-1].isspace(): + string += ' ' + string += self.format(x) + else: + string = str(elt) + return string + + def __str__(self): + str = "" + for elt in self.tree: + str += self.format(elt) + return str + +class TextWiktionaryMarkup (TextWikiMarkup): + """ + See documentation for HtmlWiktionaryMarkup + """ + # FIXME: It is supposed to do something about templates + diff --git a/WikiTrans/wikimarkup.py b/WikiTrans/wikimarkup.py new file mode 100644 index 0000000..2ef6be1 --- /dev/null +++ b/WikiTrans/wikimarkup.py @@ -0,0 +1,1215 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +# Copyright (C) 2008, 2009, 2015 Sergey Poznyakoff +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ +from __future__ import print_function +import sys +import re +from types import * + +__all__ = [ "BaseWikiMarkup", "WikiMarkup", + "TagAttributes", "TagAttributeSyntax" ] + +class TagAttributeSyntax(Exception): + def __init__(self, value): + self.value = value + def __str__(self): + return repr(self.value) + +class TagAttributes(object): + attrstart = re.compile("^(?P<attr>[a-zA-Z0-9_-]+)(?P<eq>=\")?") + valseg = re.compile("^[^\\\"]+") + tab = {} + printable = None + def __init__(self, string): + if not string: + self.printable = '' + return + self.printable = string + s = string + self.tab = {} + while s != '': + s = s.strip() + m = self.attrstart.match(s) + if m: + name = m.group('attr') + val = '' + s = s[m.end(0):] + if m.group('eq'): + while 1: + m = self.valseg.match(s) + val += m.group(0) + s = s[m.end(0):] + if s[0] == '\\': + val += s[1] + s += 2 + elif s[0] == '"': + s = s[1:] + break + else: + val = 1 + self.tab[name] = val + else: + raise TagAttributeSyntax(s) + def __len__(self): + return len(self.tab) + def __getitem__(self, key): + return self.tab[key] + def __contains__(self, key): + return key in self.tab + def __iter__(self): + for key in self.tab: + yield(key) + def has_key(self, key): + return self.__contains__(key) + def __setitem__(self, key, value): + self.tab[key] = value + def __delitem__(self, key): + del self.tab[key] + def __str__(self): + return self.printable + def __repr__(self): + return self.printable + +class BaseWikiMarkup(object): + + delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)|<") + otag = re.compile("<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>[^>]+))?\s*(?P<closed>/)?>") + ctag = re.compile("</(?P<tag>[a-zA-Z0-9_]+)\s*>") + refstart = re.compile("^https?://") + + close_delim = { + '[': ']', + '[[': ']]', + '{{': '}}' + } + + # Environment types: + envtypes = { "*": [ "unnumbered", 0 ], + "#": [ "numbered", 0 ], + ";": [ "defn", 0 ], + ":": [ "defn", 
1 ] + } + + toklist = None + tokind = 0 + newline = 0 + tree = None + + tags = [ 'code', 'nowiki', 'tt', 'div' ] + + nested = 0 + debug_level = 0 + + def dprint(self, lev, fmt, *argv): + if self.debug_level >= lev: + print("[DEBUG]", fmt % argv) + + def print_dump_prefix(self, level, file): + file.write("[DUMP]" + ' ' * (2*level + 1)) + + def dump_nil(self, node, level, file): + pass + + def dump_text(self, node, level, file): + self.print_dump_prefix(level, file) + file.write("CONTENT: \"%s\"\n" % node['content']) + + def dump_delim(self, node, level, file): + file.write("'%s'" % node['content']) + if 'continuation' in node and node['continuation']: + file.write(" (cont)") + file.write("\n") + + def dump_tag(self, node, level, file): + self.print_dump_prefix(level, file) + file.write("TAG: %s\n" % node['tag']) + if 'args' in node: + self.print_dump_prefix(level, file) + file.write("ARGS: %s\n" % node['args']) + if 'content' in node: + self.dump_node(node['content'], level + 1, file) + + def dump_seq(self, n |