summaryrefslogtreecommitdiff
path: root/WikiTrans
diff options
context:
space:
mode:
authorSergey Poznyakoff <gray@gnu.org.ua>2015-07-16 13:20:06 +0300
committerSergey Poznyakoff <gray@gnu.org.ua>2015-07-16 13:26:15 +0300
commiteaf9325ddcff786f3fcd5b9047327ef6e397e778 (patch)
tree2f0336efbb1deab9651c5eeb1b5dd753538a5c8e /WikiTrans
parent8e11d7f20459697c883df1e421df02006f749792 (diff)
downloadwikitrans-eaf9325ddcff786f3fcd5b9047327ef6e397e778.tar.gz
wikitrans-eaf9325ddcff786f3fcd5b9047327ef6e397e778.tar.bz2
Restructure the package.
The idea is to switch from using this project as a git submodule to having it distributed via PyPI. Since the name 'wit' is already registered there, the package is renamed to 'wikitrans'. * setup.py: Use setuptools Rename package to wikitrans. * wikicvt.py: Remove. Replaced with: * bin/wikitrans: New file. * __init__.py: Move to WikiTrans/__init__.py * wiki2html.py: Move to WikiTrans/wiki2html.py * wiki2texi.py: Move to WikiTrans/wiki2texi.py * wiki2text.py: Move to WikiTrans/wiki2text.py * wikimarkup.py: Move to WikiTrans/wikimarkup.py * wikins.py: Move to WikiTrans/wikins.py * test.py: Move to tests/test.py * MANIFEST.in: New file. * README.rst: New file. * .gitignore: Update.
Diffstat (limited to 'WikiTrans')
-rw-r--r--WikiTrans/__init__.py18
-rw-r--r--WikiTrans/wiki2html.py283
-rw-r--r--WikiTrans/wiki2texi.py251
-rw-r--r--WikiTrans/wiki2text.py266
-rw-r--r--WikiTrans/wikimarkup.py1215
-rw-r--r--WikiTrans/wikins.py3040
6 files changed, 5073 insertions, 0 deletions
diff --git a/WikiTrans/__init__.py b/WikiTrans/__init__.py
new file mode 100644
index 0000000..ad99ce3
--- /dev/null
+++ b/WikiTrans/__init__.py
@@ -0,0 +1,18 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2008, 2015 Sergey Poznyakoff
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+__all__ = [ "wikimarkup", "wiki2html", "wiki2text", "wiki2texi", "wikins" ]
diff --git a/WikiTrans/wiki2html.py b/WikiTrans/wiki2html.py
new file mode 100644
index 0000000..754fa9b
--- /dev/null
+++ b/WikiTrans/wiki2html.py
@@ -0,0 +1,283 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2008,2015 Sergey Poznyakoff
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from wikimarkup import *
+from wikins import wiki_ns_re, wiki_ns
+import re
+try:
+ from urllib import quote as url_quote
+except ImportError:
+ from urllib.parse import quote as url_quote
+
+__all__ = [ "HtmlWikiMarkup", "HtmlWiktionaryMarkup" ]
+
+class HtmlWikiMarkup (WikiMarkup):
+    """
+    A (hopefully) general-purpose Wiki->HTML translator class.
+
+    Each element of self.tree is rendered by format(), which dispatches
+    on the element's 'type' key to one of the str_* methods.  str(obj)
+    returns the concatenation of all rendered elements.
+
+    FIXME: 1. See WikiMarkup for a list
+           2. [[official position]]s : final 's' gets after closing </a> tag.
+              Should be before.
+    """
+
+    def wiki_ns_name(self, str):
+        """
+        Look up the namespace prefix str for the current language.
+
+        Returns the value registered in wiki_ns, or the third member of
+        the first matching (prefix, suffix, name) entry in wiki_ns_re,
+        or None if nothing matches.
+        """
+        if str in wiki_ns[self.lang]:
+            return wiki_ns[self.lang][str]
+        elif str in wiki_ns_re[self.lang]:
+            for elt in wiki_ns_re[self.lang][str]:
+                # Strings have no 'beginswith' method; 'startswith' is
+                # what was meant.
+                if str.startswith(elt[0]) and str.endswith(elt[1]):
+                    return elt[2]
+        return None
+
+    # For each environment type: the enclosing tag ("hdr") and the item
+    # tags ("elt"), the latter indexed by the item's 'subtype'.
+    envt = { "unnumbered": { "hdr": "ul",
+                             "elt": ["li"] },
+             "numbered":   { "hdr": "ol",
+                             "elt": ["li"] },
+             "defn":       { "hdr": "dl",
+                             "elt": ["dt","dd"] } }
+
+    def mktgt(self, tgt, lang = None):
+        """
+        Build the URL of article tgt: self.html_base expanded with the
+        language code (self.lang unless lang is given) followed by the
+        URL-quoted article name.
+        """
+        if not lang:
+            lang = self.lang
+        return self.html_base % { 'lang' : lang } + url_quote(tgt)
+
+    def tmpl_term(self, s):
+        """
+        Render a {{term}} template.  s is the list of formatted template
+        arguments, s[0] being the template name.
+
+        Returns the first argument that is not of the form name=value,
+        followed by the value of tr= (if any) in a "trans" span.  Returns
+        None if there is no such argument.
+        """
+        if len(s) == 2:
+            return s[1]
+        text = None
+        trans = None
+        for x in s[1:]:
+            m = re.match(r'(\w+)=', x)
+            if m:
+                if m.group(1) == "tr":
+                    trans = x[m.end(1)+1:]
+            elif not text:
+                text = x
+        if text and trans:
+            text += ' <span class="trans">[' + trans + ']</span>'
+        return text
+
+    def tmpl_proto(self, s):
+        """
+        Render a {{proto}} template.  s is the list of formatted template
+        arguments: s[1] names the proto-language, s[2:-2] are the
+        proto-forms and s[-2] is the meaning.
+        """
+        text = '<span class="proto-lang">Proto-' + s[1] + '</span>'
+        if len(s) >= 4:
+            text += ','.join(' <span class="proto">' + x + '</span>'
+                             for x in s[2:-2])
+            text += ' <span class="meaning">(' + s[-2] + ')</span>'
+        return text
+
+    def fmtlink(self, elt, istmpl):
+        """
+        Render a LINK (istmpl false) or TMPL (istmpl true) element.
+
+        Returns an <a> element, or for a template with more than one
+        argument the bare template text (which may be None, see
+        tmpl_term).  Returns an empty string for the constructs that
+        are deliberately dropped (disambigR, wikiquote, thumbnails,
+        images).
+        """
+        arg = self.format(elt['content'][0])
+        text = None
+        if len(elt['content']) > 1:
+            s = [self.format(x) for x in elt['content']]
+            if s[0] == 'disambigR' or s[0] == 'wikiquote':
+                return ""
+            elif len(s) > 1 and s[1] == 'thumb':
+                return ""
+            text = '<span class="template">' + s[1] + '</span>'
+            if istmpl:
+                if re.match("t[+-]$", s[0]):
+                    if len(s) > 2:
+                        text = s[2]
+                elif s[0] == "term":
+                    text = self.tmpl_term(s)
+                elif s[0] == "proto":
+                    text = self.tmpl_proto(s)
+                return text
+
+        (qual,sep,tgt) = arg.partition(':')
+        if tgt != '':
+            ns = self.wiki_ns_name(qual)
+            if ns:
+                if ns == 'NS_IMAGE':
+                    return ''
+                elif ns == 'NS_MEDIA':
+                    tgt = self.media_base + '/' + tgt
+                else:
+                    tgt = self.mktgt(tgt)
+            elif not istmpl and qual in self.langtab:
+                # Interwiki link: point to the other language and, in
+                # the absence of explicit text, show the language name.
+                tgt = self.mktgt(tgt, qual)
+                if not text:
+                    text = self.langtab[qual]
+            else:
+                tgt = self.mktgt(tgt)
+        else:
+            tgt = self.mktgt(arg)
+        return "<a href=\"%s\">%s</a>" % (tgt, text if text else arg)
+
+    def str_link(self, elt):
+        """Render a LINK element."""
+        return self.fmtlink(elt, False)
+
+    def str_tmpl(self, elt):
+        """Render a TMPL element."""
+        return self.fmtlink(elt, True)
+
+    def str_ref(self, elt):
+        """
+        Render a REF element as a link to elt['ref'], using the target
+        itself as the link text if the element has no text.
+        """
+        target = elt['ref']
+        text = self.format(elt['content'])
+        return "<a href=\"%s\">%s</a>" % (target, text if text else target)
+
+    def concat(self, eltlist):
+        """Return the concatenated rendering of all elements in eltlist."""
+        return "".join(self.format(x) for x in eltlist)
+
+    def str_it(self, elt):
+        """Render an IT (italic) element."""
+        return "<i>" + self.concat(elt['content']) + "</i>"
+
+    def str_bold(self, elt):
+        """Render a BOLD element."""
+        return "<b>" + self.concat(elt['content']) + "</b>"
+
+    def str_hdr(self, elt):
+        """
+        Render a HDR element.  The wiki level is shifted by one and
+        capped, so that the output uses <h2> to <h4> for levels >= 1.
+        """
+        level = min(elt['level'] + 1, 4)
+        return "<h%s>%s</h%s>" % (level, self.format(elt['content']), level)
+
+    def str_bar(self):
+        """Render a BAR (horizontal rule) element."""
+        return "<hr/>"
+
+    def str_env(self, elt):
+        """
+        Render an ENV element (list or definition list) using the tags
+        from envt.  The nesting level of the element is not used.
+        """
+        tags = self.envt[elt['envtype']]
+        items = ""
+        for s in elt['content']:
+            tag = tags["elt"][s['subtype']]
+            items += "<%s>%s</%s>" % (tag, self.format(s['content']), tag)
+        return "<%s>%s</%s>" % (tags["hdr"], items, tags["hdr"])
+
+    def str_tag(self, elt):
+        """
+        Render a TAG element.  <code> becomes <pre><code>; any other
+        tag is reproduced with its arguments.
+        """
+        if elt['tag'] == 'code':
+            # str_pre() emits no <pre> of its own while nested is set.
+            self.nested += 1
+            s = self.format(elt['content'])
+            self.nested -= 1
+            return '<pre><code>' + s + '</code></pre>' #FIXME
+        else:
+            s = '<' + elt['tag']
+            if elt['args']:
+                s += ' ' + str(elt['args'])
+            s += '>'
+            s += self.format(elt['content'])
+            return s + '</' + elt['tag'] + '>'
+
+    def str_para(self, elt):
+        """Render a PARA element."""
+        return "<p>" + self.concat(elt['content']) + "</p>"
+
+    def str_pre(self, elt):
+        """
+        Render a PRE element.  Inside a <code> tag (self.nested set) the
+        content is returned without the enclosing <pre>.
+        """
+        string = self.concat(elt['content'])
+        if self.nested:
+            return string
+        return '<pre>' + string + '</pre>'
+
+    def str_ind(self, elt):
+        """Render an IND element as elt['level'] nested <dl><dd> pairs."""
+        return ("<dl><dd>" * elt['level']) + self.format(elt['content']) + "</dd></dl>" * elt['level']
+
+    def format(self, elt):
+        """
+        Render a single parse tree element and return the result.
+        Elements of unknown type are rendered with str().
+        """
+        kind = elt['type']
+        if kind == 'TEXT':
+            if isinstance(elt['content'],list):
+                return "".join(elt['content'])
+            return elt['content']
+        elif kind == 'TAG':
+            return self.str_tag(elt)
+        elif kind == 'PARA':
+            return self.str_para(elt)
+        elif kind == 'PRE':
+            return self.str_pre(elt)
+        elif kind == 'IT':
+            return self.str_it(elt)
+        elif kind == 'BOLD':
+            return self.str_bold(elt)
+        elif kind == 'LINK':
+            return self.str_link(elt)
+        elif kind == 'TMPL':
+            return self.str_tmpl(elt)
+        elif kind == 'BAR':
+            return self.str_bar()
+        elif kind == 'HDR':
+            return self.str_hdr(elt)
+        elif kind == 'REF':
+            return self.str_ref(elt)
+        elif kind == 'ENV':
+            return self.str_env(elt)
+        elif kind == 'IND':
+            return self.str_ind(elt)
+        elif kind == 'SEQ':
+            return self.concat(elt['content'])
+        else:
+            return str(elt)
+
+    def __str__(self):
+        """Return the HTML rendering of the whole tree."""
+        return "".join(self.format(elt) for elt in self.tree)
+
+class HtmlWiktionaryMarkup (HtmlWikiMarkup):
+ """
+ A class for translating Wiktionary articles into HTML.
+ This version does not do much, except that it tries to correctly
+ format templates. But "tries" does not mean "does". The heuristics
+ used here are clearly not enough to cope with it.
+
+ 1. FIXME:
+ The right solution would be to have a database of templates with their
+ semantics and to decide on their rendering depending on that. E.g.
+ {{term}} in en.wiktionary means "replace this with the search term".
+ This, however, does not work in other wiktionaries. There are
+ also more complex templates, e.g.: {{t+|bg|врата|n|p|tr=vrata|sc=Cyrl}}
+ I don't know what it means. Couldn't find any documentation either.
+ Again, this template does not work in other dictionaries.
+
+ 2. Capitulation notice:
+ Given:
+ 1. the vast number of wiktionaries available,
+ 2. the abundance of various templates for each wiktionary,
+ 3. the apparent lack of documentation thereof,
+ 4. the lack of standardized language-independent templates,
+ I don't see any way to cope with the template-rendering task within a
+ reasonable amount of time.
+
+ Feci quod potui, faciant meliora potentes.
+ """
diff --git a/WikiTrans/wiki2texi.py b/WikiTrans/wiki2texi.py
new file mode 100644
index 0000000..6e32c56
--- /dev/null
+++ b/WikiTrans/wiki2texi.py
@@ -0,0 +1,251 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2015 Sergey Poznyakoff
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from wikimarkup import *
+from wikins import wiki_ns_re, wiki_ns
+import re
+import urllib
+
+class TexiWikiMarkup (WikiMarkup):
+    """
+    A Wiki->Texinfo translator class.
+
+    Each element of self.tree is rendered by format(), which dispatches
+    on the element's 'type' key to one of the str_* methods.  str(obj)
+    returns the concatenation of all rendered elements.
+    """
+
+    # Sectioning commands for each sectioning model, indexed by level.
+    sectcomm = {
+        'numbered': [
+            '@top',
+            '@chapter',
+            '@section',
+            '@subsection',
+            '@subsubsection'
+        ],
+        'unnumbered': [
+            '@top',
+            '@unnumbered',
+            '@unnumberedsec',
+            '@unnumberedsubsec',
+            '@unnumberedsubsubsec'
+        ],
+        'appendix': [
+            '@top',
+            '@appendix',
+            '@appendixsec',
+            '@appendixsubsec',
+            '@appendixsubsubsec'
+        ],
+        'heading': [
+            # The comma after '@majorheading' was missing, which merged
+            # it with the next entry and shifted the rest by one level.
+            '@majorheading',
+            '@chapheading',
+            '@heading',
+            '@subheading',
+            '@subsubheading'
+        ]
+    }
+
+    # Key of sectcomm to use
+    sectioning_model = 'numbered'
+    # Offset into the selected sectcomm list (0 to 4)
+    sectioning_start = 0
+
+    def __init__(self, *args, **keywords):
+        """
+        All arguments are passed on to the base class.  Two keywords
+        are handled here as well:
+
+          sectioning-model   a key of sectcomm
+          sectioning-start   an integer from 0 to 4
+
+        Since hyphenated names can only be supplied via a dictionary,
+        the spellings sectioning_model and sectioning_start are accepted
+        too.  ValueError is raised if a value is out of range.
+        """
+        super(TexiWikiMarkup, self).__init__(*args, **keywords)
+        for key in ("sectioning-model", "sectioning_model"):
+            if key in keywords:
+                val = keywords[key]
+                if val not in self.sectcomm:
+                    raise ValueError("Invalid value for sectioning model: %s" % val)
+                self.sectioning_model = val
+        for key in ("sectioning-start", "sectioning_start"):
+            if key in keywords:
+                val = keywords[key]
+                if val < 0 or val > 4:
+                    raise ValueError("Invalid value for sectioning start: %s" % val)
+                self.sectioning_start = val
+
+    def __str__(self):
+        """Return the Texinfo rendering of the whole tree."""
+        return "".join(self.format(elt) for elt in self.tree)
+
+    def format(self, elt):
+        """
+        Render a single parse tree element and return the result.
+        Elements of unknown type are rendered with str().
+        """
+        kind = elt['type']
+        if kind == 'TEXT':
+            if isinstance(elt['content'],list):
+                return "".join(elt['content'])
+            return elt['content']
+        elif kind == 'TAG':
+            return self.str_tag(elt)
+        elif kind == 'PARA':
+            return self.str_para(elt)
+        elif kind == 'PRE':
+            return self.str_pre(elt)
+        elif kind == 'IT':
+            return self.str_it(elt)
+        elif kind == 'BOLD':
+            return self.str_bold(elt)
+        elif kind == 'LINK':
+            return self.str_link(elt)
+        elif kind == 'TMPL':
+            return self.str_tmpl(elt)
+        elif kind == 'BAR':
+            return self.str_bar()
+        elif kind == 'HDR':
+            return self.str_hdr(elt)
+        elif kind == 'REF':
+            return self.str_ref(elt)
+        elif kind == 'ENV':
+            return self.str_env(elt)
+        elif kind == 'IND':
+            return self.str_ind(elt)
+        elif kind == 'SEQ':
+            return self.concat(elt['content'])
+        else:
+            return str(elt)
+
+    def str_tag(self, elt):
+        """
+        Render a TAG element: <code> becomes an @example block, <tt>
+        becomes @code{}, <div> yields its content (preceded by an
+        @anchor if it has an id).  Any other tag is reproduced verbatim.
+        """
+        if elt['tag'] == 'code':
+            # str_pre() emits no @example of its own while nested is set.
+            self.nested += 1
+            s = self.format(elt['content'])
+            self.nested -= 1
+            if not s.endswith("\n"):
+                s += "\n"
+            return '@example\n' + s + '@end example\n'
+        elif elt['tag'] == 'tt':
+            self.nested += 1
+            s = self.format(elt['content'])
+            self.nested -= 1
+            return "@code{%s}" % s
+        elif elt['tag'] == 'div':
+            s = ''
+            if 'args' in elt and 'id' in elt['args']:
+                s += "\n@anchor{%s}\n" % elt['args']['id']
+            s += self.format(elt['content'])
+            return s
+        else:
+            s = '<' + elt['tag']
+            if elt['args']:
+                # The arguments are indexed like a mapping above, so
+                # they must be converted before being appended to a
+                # string (as the HTML and text translators do).
+                s += ' ' + str(elt['args'])
+            s += '>' + self.format(elt['content']) + '</' + elt['tag'] + '>'
+            return s
+
+    def str_para(self, elt):
+        """Render a PARA element, delimited by newlines."""
+        return "\n" + self.concat(elt['content']) + "\n"
+
+    def str_pre(self, elt):
+        """
+        Render a PRE element as an @example block.  Inside a <code> or
+        <tt> tag (self.nested set) the bare content is returned.
+        """
+        string = self.concat(elt['content'])
+        if self.nested:
+            return string
+        if not string.endswith("\n"):
+            string += "\n"
+        return '\n@example\n' + string + '@end example\n'
+
+    def concat(self, eltlist):
+        """Return the concatenated rendering of all elements in eltlist."""
+        return "".join(self.format(x) for x in eltlist)
+
+    def str_it(self, elt):
+        """Render an IT (italic) element."""
+        return "@i{" + self.concat(elt['content']) + "}"
+
+    def str_bold(self, elt):
+        """Render a BOLD element."""
+        return "@b{" + self.concat(elt['content']) + "}"
+
+    def nodename(self, elt):
+        """Return the node name for the heading content elt."""
+        return self.format(elt) # FIXME
+
+    def str_hdr(self, elt):
+        """
+        Render a HDR element using the sectioning command selected by
+        sectioning_model, sectioning_start and the heading level.
+        Levels beyond the end of the command list are rendered as a
+        line break ("@*") followed by the heading text.
+        """
+        commands = self.sectcomm[self.sectioning_model]
+        level = elt['level']
+        if level > len(commands) - 1 - self.sectioning_start:
+            s = "\n@* %s" % (self.format(elt['content']))
+        else:
+            # NOTE(review): the guard above is written in terms of
+            # level + sectioning_start, while the index subtracts the
+            # start.  Left as is; confirm the intended direction.
+            s = commands[level - self.sectioning_start] + " " + self.format(elt['content']) + "\n"
+            if commands[0] == '@top':
+                s += "@node %s\n" % (self.nodename(elt['content']))
+        return s + "\n"
+
+    def str_bar(self):
+        """Render a BAR (horizontal rule) element."""
+        return "\n-----\n" # FIXME
+
+    def str_ind(self, elt):
+        """Render an IND element, indented by elt['level'] fixed spaces."""
+        return ("@w{ }" * elt['level']) + self.format(elt['content']) + '\n'
+
+    def str_env(self, elt):
+        """
+        Render an ENV element as @itemize, @enumerate or @table,
+        depending on elt['envtype'].
+        """
+        if elt['envtype'] == 'unnumbered':
+            string = '\n@itemize @bullet\n'
+            for s in elt['content']:
+                string += '@item ' + self.format(s['content']) + '\n\n'
+            string += '@end itemize\n'
+        elif elt['envtype'] == 'numbered':
+            string = '\n@enumerate\n'
+            for s in elt['content']:
+                string += '@item ' + self.format(s['content']) + '\n\n'
+            string += '@end enumerate\n'
+        elif elt['envtype'] == 'defn':
+            string = "\n@table @asis\n"
+            for s in elt['content']:
+                # Subtype 0 is the term being defined, anything else
+                # is its description.
+                if s['subtype'] == 0:
+                    string += "@item " + self.format(s['content']) + '\n'
+                else:
+                    string += self.format(s['content']) + '\n'
+            string += '@end table\n'
+        return string
+
+    def str_link(self, elt):
+        """
+        Render a LINK element as @ref to the part of the target that
+        precedes the first colon.  Returns an empty string for the
+        constructs that are deliberately dropped.
+        """
+        # FIXME: A very crude version
+        arg = self.format(elt['content'][0])
+        if len(elt['content']) > 1:
+            s = [self.format(x) for x in elt['content']]
+            text = s[1]
+        else:
+            s = None
+            text = None
+
+        if s:
+            if s[0] == 'disambigR' or s[0] == 'wikiquote':
+                return ""
+            if len(s) > 1 and s[1] == 'thumb':
+                return ""
+
+        (qual,sep,tgt) = arg.partition(':')
+        if text:
+            return "@ref{%s,%s}" % (qual, text)
+        else:
+            return "@ref{%s}" % qual
+
+    def str_tmpl(self, elt):
+        """Render a TMPL element (not implemented: returns a marker)."""
+        return "FIXME: str_tmpl not implemented\n"
+
+    def str_ref(self, elt):
+        """Render a REF element as @uref, with its text if it has any."""
+        target = elt['ref']
+        text = self.format(elt['content'])
+        if text:
+            return "@uref{%s,%s}" % (target, text)
+        else:
+            return "@uref{%s}" % target
+
+
+
+
+
diff --git a/WikiTrans/wiki2text.py b/WikiTrans/wiki2text.py
new file mode 100644
index 0000000..916391e
--- /dev/null
+++ b/WikiTrans/wiki2text.py
@@ -0,0 +1,266 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2008,2015 Sergey Poznyakoff
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from wikimarkup import *
+from wikins import wiki_ns_re, wiki_ns
+import re
+import urllib
+
+class TextWikiMarkup (WikiMarkup):
+    """
+    A general-purpose Wiki->Text translator class.
+
+    Each element of self.tree is rendered by format(); str(obj) returns
+    the concatenation of all rendered elements.
+    """
+
+    # Output width
+    width = 78
+    # Do not show references.
+    references = False
+    # Provide a minimum markup
+    markup = True
+
+    # Number of current element in the environment
+    num = 0
+
+    def __init__(self, *args, **keywords):
+        """
+        All arguments are passed on to the base class.  The keywords
+        width, refs and markup, if present, set the width, references
+        and markup attributes, respectively.
+        """
+        WikiMarkup.__init__(self, *args, **keywords)
+        if 'width' in keywords:
+            self.width = keywords['width']
+        if 'refs' in keywords:
+            self.references = keywords['refs']
+        if 'markup' in keywords:
+            self.markup = keywords['markup']
+
+    @staticmethod
+    def _quote(s):
+        """URL-quote s, using whichever urllib layout is available."""
+        # urllib.quote exists only in Python 2; Python 3 has it in
+        # urllib.parse.
+        try:
+            from urllib import quote
+        except ImportError:
+            from urllib.parse import quote
+        return quote(s)
+
+    def xref(self, text, target):
+        """Return a textual cross-reference to target, labelled text."""
+        if text:
+            return "%s (see %s) " % (text, target)
+        else:
+            return "see " + target
+
+    def wiki_ns_name(self, str):
+        """
+        Look up the namespace prefix str for the current language.
+
+        Returns the value registered in wiki_ns, or the third member of
+        the first matching (prefix, suffix, name) entry in wiki_ns_re,
+        or None if nothing matches.
+        """
+        if str in wiki_ns[self.lang]:
+            return wiki_ns[self.lang][str]
+        elif str in wiki_ns_re[self.lang]:
+            for elt in wiki_ns_re[self.lang][str]:
+                # Strings have no 'beginswith' method; 'startswith' is
+                # what was meant.
+                if str.startswith(elt[0]) and str.endswith(elt[1]):
+                    return elt[2]
+        return None
+
+    def mktgt(self, tgt, lang = None):
+        """
+        Build the URL of article tgt: self.html_base expanded with the
+        language code (self.lang unless lang is given) followed by the
+        URL-quoted article name.
+        """
+        if not lang:
+            lang = self.lang
+        return self.html_base % { 'lang' : lang } + self._quote(tgt)
+
+    def fmtlink(self, elt, istmpl):
+        """
+        Render a LINK (istmpl false) or TMPL (istmpl true) element.
+
+        If self.references is set, the result is a cross-reference to
+        the link target; otherwise it is the link text, or the link
+        argument if there is no text.  Returns an empty string for the
+        constructs that are deliberately dropped.
+        """
+        arg = self.format(elt['content'][0])
+        if len(elt['content']) > 1:
+            s = [self.format(x) for x in elt['content']]
+            text = s[1]
+        else:
+            s = None
+            text = None
+
+        if s:
+            if s[0] == 'disambigR' or s[0] == 'wikiquote':
+                return ""
+            if len(s) > 1 and s[1] == 'thumb':
+                return ""
+        (qual,sep,tgt) = arg.partition(':')
+        if tgt != '':
+            ns = self.wiki_ns_name(qual)
+            if ns:
+                if ns == 'NS_IMAGE':
+                    if not self.references:
+                        return ""
+                    text = "[%s: %s]" % (qual, text if text else arg)
+                    tgt = self.image_base + '/' + \
+                          self._quote(tgt) + \
+                          '/250px-' + self._quote(tgt)
+                elif ns == 'NS_MEDIA':
+                    text = "[%s]" % (qual)
+                else:
+                    tgt = self.mktgt(tgt)
+            elif not istmpl and qual in self.langtab:
+                text = self.langtab[qual] + ": " + tgt
+                tgt = self.mktgt(tgt, qual)
+            else:
+                tgt = self.mktgt(tgt)
+        else:
+            tgt = self.mktgt(arg)
+        if self.references:
+            # xref() copes with a missing text; formatting it directly
+            # used to produce "None (see ...)".
+            return self.xref(text, tgt)
+        elif not text:
+            return arg
+        else:
+            return text
+
+    def indent (self, lev, text):
+        """
+        Prefix each non-empty line of text with lev spaces.  A trailing
+        newline is retained only if text had one.
+        """
+        if text.find('\n') == -1:
+            s = (" " * lev) + text
+        else:
+            s = ""
+            for elt in text.split('\n'):
+                if elt:
+                    s += (" " * lev) + elt + '\n'
+            if not text.endswith('\n'):
+                s = s.rstrip('\n')
+        return s
+
+    def fmtpara(self, input):
+        """
+        Re-fill input to lines of at most self.width characters.  Words
+        are separated by one space, or by two if the preceding word
+        ends with a period.
+        """
+        output = ""
+        linebuf = ""
+        length = 0
+        for s in input.split():
+            wlen = len(s)
+            if linebuf.endswith("."):
+                wsc = 2
+            else:
+                wsc = 1
+            if length + wsc + wlen > self.width:
+                # FIXME: fill out linebuf
+                output += linebuf + '\n'
+                wsc = 0
+                length = 0
+                linebuf = ""
+            linebuf += " " * wsc + s
+            length += wsc + wlen
+        return output + linebuf
+
+    def str_tag(self, elt):
+        """
+        Render a TAG element.  The content of <code> is returned as is;
+        any other tag is reproduced with its arguments.
+        """
+        if elt['tag'] == 'code':
+            self.nested += 1
+            s = self.format(elt['content'])
+            self.nested -= 1
+            return s #FIXME
+        else:
+            s = '<' + elt['tag']
+            if elt['args']:
+                s += ' ' + str(elt['args'])
+            s += '>' + self.format(elt['content']) + '</' + elt['tag'] + '>'
+            return s
+
+    def format(self, elt):
+        """
+        Render a single parse tree element and return the result.
+        Elements of unknown type are rendered with str().
+        """
+        kind = elt['type']
+        if kind == 'TEXT':
+            if isinstance(elt['content'],list):
+                string = ""
+                for s in elt['content']:
+                    if string:
+                        # NOTE(review): both branches append the same
+                        # separator in this listing; presumably the
+                        # first one had two spaces (cf. fmtpara) --
+                        # confirm against the repository.
+                        if string.endswith("."):
+                            string += " "
+                        else:
+                            string += " "
+                    string += s
+            else:
+                string = elt['content']
+        elif kind == 'PRE':
+            string = ""
+            for x in elt['content']:
+                string += self.format(x)
+            string += '\n'
+        elif kind == 'PARA':
+            string = ""
+            for x in elt['content']:
+                string += self.format(x)
+            string = self.fmtpara(string) + '\n\n'
+        elif kind == 'TAG':
+            string = self.str_tag(elt)
+        elif kind == 'IT':
+            string = ""
+            for x in elt['content']:
+                s = self.format(x)
+                if s:
+                    string += " " + s
+            string = "_" + string.lstrip(" ") + "_"
+        elif kind == 'BOLD':
+            string = ""
+            for x in elt['content']:
+                s = self.format(x)
+                if s:
+                    # NOTE(review): see the note in the TEXT branch.
+                    if string.endswith("."):
+                        string += " "
+                    else:
+                        string += " "
+                    string += s
+            string = string.upper()
+        elif kind == 'LINK':
+            string = self.fmtlink(elt, False)
+        elif kind == 'TMPL':
+            s = self.fmtlink(elt, True)
+            if s:
+                string = '[' + s + ']'
+            else:
+                string = s
+        elif kind == 'BAR':
+            w = self.width
+            if w < 5:
+                w = 5
+            string = "\n" + ("-" * (w - 5)).center(w - 1) + "\n"
+        elif kind == 'HDR':
+            level = elt['level']
+            string = "\n" + ("*" * level) + " " + \
+                     self.format(elt['content']).lstrip(" ") + "\n\n"
+        elif kind == 'REF':
+            string = self.xref(self.format(elt['content']), elt['ref'])
+        elif kind == 'ENV':
+            envtype = elt['envtype']
+            lev = elt['level']
+            if lev > self.width - 4:
+                lev = 1
+            string = ""
+            n = 1
+            for s in elt['content']:
+                if not string.endswith("\n"):
+                    string += "\n"
+                x = self.format(s['content'])
+                if envtype == "unnumbered":
+                    string += self.fmtpara(self.indent(lev, "- " + x.lstrip(" ")))
+                elif envtype == "numbered":
+                    string += self.fmtpara(self.indent(lev, "%d. %s" % (n, x)))
+                    n += 1
+                elif envtype == "defn":
+                    # Subtype 0 is the term being defined, anything
+                    # else is its description.
+                    if s['subtype'] == 0:
+                        string += self.indent(lev-1, x)
+                    else:
+                        string += self.indent(lev+3, x)
+
+            if not string.endswith("\n"):
+                string += "\n"
+        elif kind == 'IND':
+            string = (" " * elt['level']) + self.format(elt['content']) + '\n'
+        elif kind == 'SEQ':
+            string = ""
+            for x in elt['content']:
+                if len(string) > 1 and not string[-1].isspace():
+                    string += ' '
+                string += self.format(x)
+        else:
+            string = str(elt)
+        return string
+
+    def __str__(self):
+        """Return the plain text rendering of the whole tree."""
+        return "".join(self.format(elt) for elt in self.tree)
+
+class TextWiktionaryMarkup (TextWikiMarkup):
+ """
+ A class for translating Wiktionary articles into plain text.
+ It adds nothing to TextWikiMarkup yet; see the documentation for
+ HtmlWiktionaryMarkup (wiki2html.py) for the reasons.
+ """
+ # FIXME: It is supposed to do something about templates
+
diff --git a/WikiTrans/wikimarkup.py b/WikiTrans/wikimarkup.py
new file mode 100644
index 0000000..2ef6be1
--- /dev/null
+++ b/WikiTrans/wikimarkup.py
@@ -0,0 +1,1215 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2008, 2009, 2015 Sergey Poznyakoff
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from __future__ import print_function
+import sys
+import re
+from types import *
+
+__all__ = [ "BaseWikiMarkup", "WikiMarkup",
+ "TagAttributes", "TagAttributeSyntax" ]
+
+class TagAttributeSyntax(Exception):
+    """
+    Raised by TagAttributes when a tag attribute string cannot be
+    parsed.  The offending text is available as the value attribute.
+    """
+    def __init__(self, value):
+        # Pass the value to the base class, so that it also shows up
+        # in args (and hence in repr() and in pickled copies).
+        Exception.__init__(self, value)
+        self.value = value
+    def __str__(self):
+        return repr(self.value)
+
+class TagAttributes(object):
+    """
+    Attributes of a tag, parsed from the text that follows the tag name.
+
+    The text is a whitespace-separated list of name="value" pairs and
+    bare names.  Within a value, a backslash quotes the character that
+    follows it.  Bare names get the value 1.
+
+    The object behaves as a mapping from attribute names to values.
+    str() and repr() return the text the object was created from.
+    """
+    attrstart = re.compile(r'^(?P<attr>[a-zA-Z0-9_-]+)(?P<eq>=")?')
+    # A run of value characters up to the next backslash or quote.  The
+    # backslash used to be missing from the set, so that escapes were
+    # never recognized.
+    valseg = re.compile(r'^[^\\"]+')
+    tab = {}
+    printable = None
+    def __init__(self, string):
+        """
+        Parse string.  An empty string or None yields an empty set of
+        attributes.  TagAttributeSyntax is raised if the string is
+        malformed.
+        """
+        # Always create a private table: falling back to the class
+        # attribute would share a single dictionary among all objects
+        # created without attributes.
+        self.tab = {}
+        if not string:
+            self.printable = ''
+            return
+        self.printable = string
+        s = string.strip()
+        while s:
+            m = self.attrstart.match(s)
+            if not m:
+                raise TagAttributeSyntax(s)
+            name = m.group('attr')
+            s = s[m.end(0):]
+            if m.group('eq'):
+                val = ''
+                while True:
+                    # The segment can be missing, e.g. in an empty
+                    # value or right after an escaped character.
+                    m = self.valseg.match(s)
+                    if m:
+                        val += m.group(0)
+                        s = s[m.end(0):]
+                    if s.startswith('\\') and len(s) > 1:
+                        val += s[1]
+                        s = s[2:]
+                    elif s.startswith('"'):
+                        s = s[1:]
+                        break
+                    else:
+                        # Closing quote is missing
+                        raise TagAttributeSyntax(string)
+            else:
+                val = 1
+            self.tab[name] = val
+            # Strip here rather than at the top of the loop, so that
+            # trailing whitespace terminates it.
+            s = s.lstrip()
+    def __len__(self):
+        return len(self.tab)
+    def __getitem__(self, key):
+        return self.tab[key]
+    def __contains__(self, key):
+        return key in self.tab
+    def __iter__(self):
+        for key in self.tab:
+            yield(key)
+    def has_key(self, key):
+        return self.__contains__(key)
+    def __setitem__(self, key, value):
+        self.tab[key] = value
+    def __delitem__(self, key):
+        del self.tab[key]
+    def __str__(self):
+        return self.printable
+    def __repr__(self):
+        return self.printable
+
+class BaseWikiMarkup(object):
+
+ # Wiki markup delimiters
+ delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)|<")
+ # Opening and closing tags. These are raw strings: "\s" in an
+ # ordinary string literal is an invalid escape sequence.
+ otag = re.compile(r"<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>[^>]+))?\s*(?P<closed>/)?>")
+ ctag = re.compile(r"</(?P<tag>[a-zA-Z0-9_]+)\s*>")
+ refstart = re.compile("^https?://")
+
+ # Closing delimiter for each opening one
+ close_delim = {
+ '[': ']',
+ '[[': ']]',
+ '{{': '}}'
+ }
+
+ # Environment types:
+ envtypes = { "*": [ "unnumbered", 0 ],
+ "#": [ "numbered", 0 ],
+ ";": [ "defn", 0 ],
+ ":": [ "defn", 1 ]
+ }
+
+ toklist = None
+ tokind = 0
+ newline = 0
+ tree = None
+
+ tags = [ 'code', 'nowiki', 'tt', 'div' ]
+
+ nested = 0
+ debug_level = 0
+
+ def dprint(self, lev, fmt, *argv):
+ """Print the message fmt % argv, prefixed with "[DEBUG]", if
+ self.debug_level is at least lev."""
+ if self.debug_level >= lev:
+ print("[DEBUG]", fmt % argv)
+
+ def print_dump_prefix(self, level, file):
+ """Write to file the "[DUMP]" marker followed by 2*level + 1
+ spaces."""
+ file.write("[DUMP]" + ' ' * (2*level + 1))
+
+ def dump_nil(self, node, level, file):
+ """Dump nothing: a no-op with the same signature as the other
+ dump_* methods."""
+ pass
+
+ def dump_text(self, node, level, file):
+ """Write the content of node to file as a prefixed, quoted
+ "CONTENT:" line."""
+ self.print_dump_prefix(level, file)
+ file.write("CONTENT: \"%s\"\n" % node['content'])
+
+ def dump_delim(self, node, level, file):
+ """Write the quoted content of node to file, followed by " (cont)"
+ if the node has a true 'continuation' key, and a newline."""
+ # NOTE(review): unlike the other dump_* methods, this one writes no
+ # dump prefix; presumably the caller does that -- confirm.
+ file.write("'%s'" % node['content'])
+ if 'continuation' in node and node['continuation']:
+ file.write(" (cont)")
+ file.write("\n")
+
+ def dump_tag(self, node, level, file):
+ """Write to file the tag name of node, its arguments (if it has
+ the 'args' key) and its content (if it has the 'content' key)."""
+ self.print_dump_prefix(level, file)
+ file.write("TAG: %s\n" % node['tag'])
+ if 'args' in node:
+ self.print_dump_prefix(level, file)
+ file.write("ARGS: %s\n" % node['args'])
+ if 'content' in node:
+ # The content is dumped one level deeper.
+ self.dump_node(node['content'], level + 1, file)
+
+ def dump_seq(self, n