summaryrefslogtreecommitdiff
path: root/wikitrans
diff options
context:
space:
mode:
Diffstat (limited to 'wikitrans')
-rw-r--r--wikitrans/__init__.py26
-rw-r--r--wikitrans/wiki2html.py320
-rw-r--r--wikitrans/wiki2texi.py410
-rw-r--r--wikitrans/wiki2text.py348
-rw-r--r--wikitrans/wikidump.py77
-rw-r--r--wikitrans/wikimarkup.py1285
-rw-r--r--wikitrans/wikins.py3040
-rw-r--r--wikitrans/wikitoken.py318
8 files changed, 5824 insertions, 0 deletions
diff --git a/wikitrans/__init__.py b/wikitrans/__init__.py
new file mode 100644
index 0000000..5832e38
--- /dev/null
+++ b/wikitrans/__init__.py
@@ -0,0 +1,26 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2008-2018 Sergey Poznyakoff
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
# Submodules exported by "from wikitrans import *".
__all__ = [
    "wikitoken", "wikimarkup", "wikidump",
    "wiki2html", "wiki2text", "wiki2texi",
    "wikins",
]
diff --git a/wikitrans/wiki2html.py b/wikitrans/wiki2html.py
new file mode 100644
index 0000000..ce65bae
--- /dev/null
+++ b/wikitrans/wiki2html.py
@@ -0,0 +1,320 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2008-2018 Sergey Poznyakoff
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+Wiki markup to HTML translator.
+
+Classes:
+
+HtmlWikiMarkup -- Converts Wiki material to HTML.
+HtmlWiktionaryMarkup -- Reserved for future use. Currently does the same as
+ HtmlWikiMarkup.
+
+"""
+
+from __future__ import print_function
+from wikitrans.wikimarkup import *
+from wikitrans.wikitoken import *
+from wikitrans.wikins import wiki_ns_re, wiki_ns
+import re
+try:
+ from urllib import quote as url_quote
+except ImportError:
+ from urllib.parse import quote as url_quote
+
+try:
+ from html import escape as html_escape
+except ImportError:
+ from cgi import escape as html_escape
+
+__all__ = [ "HtmlWikiMarkup", "HtmlWiktionaryMarkup" ]
+
class HtmlSeqNode(WikiSeqNode):
    """Sequence of nodes rendered by concatenating the children's HTML."""

    def format(self):
        """Return the HTML of every child node, joined without separators."""
        return ''.join(child.format() for child in self.content)
+
class HtmlLinkNode(HtmlSeqNode):
    """Wiki link or template node rendered as HTML."""

    def format(self):
        """Return the HTML for this link/template ('' when it is suppressed).

        The first child is the link target (or template name); the second,
        if present, is the display text.
        """
        parser = self.parser
        arg = self.content[0].format()
        text = None
        if len(self.content) > 1:
            s = [child.format() for child in self.content]
            # Disambiguation/wikiquote markers and thumbnails produce nothing.
            if s[0] in ('disambigR', 'wikiquote') or s[1] == 'thumb':
                return ""
            text = '<span class="template">' + s[1] + '</span>'
            if self.type == 'TMPL':
                name = s[0]
                if re.match("t[+-]$", name):
                    if len(s) > 2:
                        text = s[2]
                elif name == "term":
                    text = parser.tmpl_term(s)
                elif name == "proto":
                    text = parser.tmpl_proto(s)
                return text

        qual, sep, tgt = arg.partition(':')
        if tgt == '':
            # No "qualifier:" prefix: plain article reference.
            tgt = parser.mktgt(arg)
        else:
            ns = parser.wiki_ns_name(qual)
            if ns:
                if ns == 'NS_IMAGE':
                    return ''
                if ns == 'NS_MEDIA':
                    tgt = parser.media_base + '/' + tgt
                else:
                    tgt = parser.mktgt(tgt)
            elif self.type == 'LINK' and qual in parser.langtab:
                # Inter-language link: point to the other language's wiki.
                tgt = parser.mktgt(tgt, qual)
                if not text:
                    text = parser.langtab[qual]
            else:
                tgt = parser.mktgt(tgt)
        return '<a href="%s">%s</a>' % (tgt, text if text else arg)
+
class HtmlRefNode(WikiRefNode):
    """External reference rendered as an HTML anchor."""

    def format(self):
        """Return '<a href=REF>TEXT</a>'; the URL doubles as text if none."""
        url = self.ref
        label = self.content.format()
        if not label:
            label = url
        return '<a href="%s">%s</a>' % (url, label)
+
class HtmlFontNode(HtmlSeqNode):
    """Italic ('IT') or bold ('BOLD') span rendered as <i> or <b>."""

    def format(self):
        """Return the children's HTML wrapped in the matching font tag."""
        tag = {'IT': 'i', 'BOLD': 'b'}[self.type]
        inner = ''.join(child.format() for child in self.content)
        return '<%s>' % tag + inner + '</%s>' % tag
+
class HtmlTextNode(HtmlSeqNode):
    """Literal text node."""

    def format(self):
        """Return the node text for inclusion in HTML.

        A plain string is HTML-escaped (quotes left alone); a list of
        strings is concatenated verbatim.
        """
        content = self.content
        if not isinstance(content, list):
            return html_escape(content, quote=False)
        # NOTE(review): list content is emitted without escaping -- confirm
        # that this asymmetry with the string case is intended.
        return ''.join(content)
+
class HtmlHdrNode(WikiHdrNode):
    """Section heading rendered as <h1>..<h6>."""

    def format(self):
        """Return the heading element; levels above 6 are clamped to 6."""
        depth = min(self.level, 6)
        return "<h%s>%s</h%s>\n\n" % (depth, self.content.format(), depth)
+
class HtmlBarNode(WikiNode):
    """Horizontal bar."""

    def format(self):
        """Return a horizontal rule element followed by a newline."""
        rule = "<hr/>\n"
        return rule
+
class HtmlEnvNode(WikiEnvNode):
    """List environment (unnumbered, numbered or definition list) as HTML."""

    def format(self):
        """Return the list element with one child element per item.

        Container and item tags are taken from the parser's ``envt`` table,
        keyed by the environment type; an item's ``subtype`` selects the
        item tag (e.g. <dt> vs <dd> in a definition list).
        """
        env = self.parser.envt[self.envtype]
        items = []
        for item in self.content:
            tag = env["elt"][item.subtype]
            items.append("<%s>%s</%s>" % (tag, item.content.format(), tag))
        return "<%s>%s</%s>" % (env["hdr"], ''.join(items), env["hdr"])
+
class HtmlTagNode(WikiTagNode):
    """Embedded tag (<code>, <ref>, <references> or any other) as HTML."""

    def format(self):
        """Return the HTML for this tag.

        'code' becomes <pre><code>, 'ref' becomes a numbered superscript
        link to its note, 'references' becomes the list of collected notes;
        any other tag is passed through with its arguments.
        """
        tag = self.tag
        if tag == 'code':
            # Tell nested PRE nodes not to emit their own <pre> wrapper.
            self.parser.nested += 1
            s = self.content.format()
            self.parser.nested -= 1
            return '<pre><code>' + s + '</code></pre>'  # FIXME
        if tag == 'ref':
            n = self.idx + 1
            return ('<sup id="cite_ref-%d" class="reference">'
                    '<a name="cite_ref-%d" href="#cite_note-%d">%d</a>'
                    '</sup>' % (n, n, n, n))
        if tag == 'references':
            parts = ['<div class="references">\n',
                     '<ol class="references">\n']
            for n, ref in enumerate(self.parser.references, 1):
                # Only the fixed template is %-formatted; the reference text
                # is appended as is, so a '%' in it cannot break formatting.
                parts.append(('<li id="cite_note-%d">'
                              '<span class="mw-cite-backlink">'
                              '<b><a href="#cite_ref-%d">^</a></b>'
                              '</span>'
                              '<span class="reference-text">') % (n, n))
                parts.append(ref.content.format())
                parts.append('</span></li>\n')
            parts.append('</ol>\n</div>\n')
            return ''.join(parts)
        s = '<' + tag
        if self.args:
            s += ' ' + str(self.args)
        s += '>'
        s += self.content.format()
        return s + '</' + tag + '>'
+
class HtmlParaNode(HtmlSeqNode):
    """Paragraph rendered as a <p> element."""

    def format(self):
        """Return the children's HTML wrapped in <p>...</p> plus a newline."""
        body = super(HtmlParaNode, self).format()
        return "<p>%s</p>\n" % body
+
class HtmlPreNode(HtmlSeqNode):
    """Preformatted text."""

    def format(self):
        """Return the content in <pre>, or bare when parser.nested is set."""
        body = super(HtmlPreNode, self).format()
        if not self.parser.nested:
            body = '<pre>' + body + '</pre>'
        return body
+
class HtmlIndNode(WikiIndNode):
    """Indented text rendered with nested <dl><dd> pairs."""

    def format(self):
        """Return the content wrapped in one <dl><dd> pair per indent level."""
        depth = self.level
        opening = "<dl><dd>" * depth
        closing = "</dd></dl>" * depth
        return opening + self.content.format() + closing
+
+
class HtmlWikiMarkup(WikiMarkup):
    """A Wiki markup to HTML translator class.

    Usage:

      x = HtmlWikiMarkup(file="input.wiki")
      # Parse the input:
      x.parse()
      # Print it as HTML:
      print(str(x))

    Known bugs:
      * [[official position]]s
        Final 's' gets after closing </a> tag. Should be before.
    """

    # Incremented by HtmlTagNode while the content of a <code> tag is being
    # formatted; HtmlPreNode omits its <pre> wrapper while it is non-zero.
    nested = 0
    # Footnote nodes listed by the <references> tag.
    # NOTE(review): class-level list, shared between instances unless the
    # base class rebinds it per instance -- confirm.
    references = []

    # Container ("hdr") and item ("elt") tags for each list environment.
    envt = {"unnumbered": {"hdr": "ul",
                           "elt": ["li"]},
            "numbered": {"hdr": "ol",
                         "elt": ["li"]},
            "defn": {"hdr": "dl",
                     "elt": ["dt", "dd"]}}

    def __init__(self, *args, **kwargs):
        """Create an HtmlWikiMarkup object.

        Arguments:

        filename=FILE
          Read Wiki material from the file named FILE.
        file=FD
          Read Wiki material from file object FD.
        text=STRING
          Read Wiki material from STRING.
        lang=CODE
          Specifies source language. Default is 'en'. This variable can be
          referred to as '%(lang)s' in the keyword arguments below.
        html_base=URL
          Base URL for cross-references. Default is
          'http://%(lang)s.wiktionary.org/wiki/'
        image_base=URL
          Base URL for images. Default is
          'http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf'
        media_base=URL
          Base URL for media files. Default is
          'http://www.mediawiki.org/xml/export-0.3'
        """

        super(HtmlWikiMarkup, self).__init__(*args, **kwargs)
        # Route every token type to its HTML-producing node class.
        self.token_class['LINK'] = HtmlLinkNode
        self.token_class['TMPL'] = HtmlLinkNode
        self.token_class['REF'] = HtmlRefNode
        self.token_class['IT'] = HtmlFontNode
        self.token_class['BOLD'] = HtmlFontNode
        self.token_class['HDR'] = HtmlHdrNode
        self.token_class['BAR'] = HtmlBarNode
        self.token_class['ENV'] = HtmlEnvNode
        self.token_class['TAG'] = HtmlTagNode
        self.token_class['PARA'] = HtmlParaNode
        self.token_class['PRE'] = HtmlPreNode
        self.token_class['IND'] = HtmlIndNode
        self.token_class['TEXT'] = HtmlTextNode
        self.token_class['SEQ'] = HtmlSeqNode

    def wiki_ns_name(self, str):
        """Return the namespace identifier for qualifier STR, or None.

        The parameter keeps its historical name (which shadows the builtin)
        so that keyword callers continue to work.
        """
        names = wiki_ns[self.lang]
        if str in names:
            return names[str]
        patterns = wiki_ns_re[self.lang]
        if str in patterns:
            for elt in patterns[str]:
                # elt is (prefix, suffix, namespace).
                if str.startswith(elt[0]) and str.endswith(elt[1]):
                    return elt[2]
        return None

    def mktgt(self, tgt, lang=None):
        """Return the URL of article TGT in the wiki for LANG.

        LANG defaults to the language of the document being translated.
        """
        if not lang:
            lang = self.lang
        return self.html_base % {'lang': lang} + url_quote(tgt)

    def tmpl_term(self, s):
        """Format a {{term}} template given its formatted arguments S.

        S[0] is the template name.  The first argument that is not of the
        form KEY=VALUE is the term; a "tr=" argument supplies the
        transliteration, shown in brackets after the term.  Returns None if
        no term is found.
        """
        if len(s) == 2:
            return s[1]
        text = None
        trans = None
        for x in s[1:]:
            m = re.match(r'(\w+)=', x)
            if m:
                if m.group(1) == "tr":
                    trans = x[m.end(1)+1:]
            elif not text:
                text = x
        if text:
            if trans:
                text += ' <span class="trans">[' + trans + ']</span>'
        return text

    def tmpl_proto(self, s):
        """Format a {{proto}} template given its formatted arguments S.

        S[1] is the proto-language name.  With four or more elements,
        S[2:-2] are the reconstructed forms and S[-2] is the meaning.
        """
        text = '<span class="proto-lang">Proto-' + s[1] + '</span>'
        if len(s) >= 4:
            forms = [' <span class="proto">' + x + '</span>' for x in s[2:-2]]
            text += ','.join(forms)
            text += ' <span class="meaning">(' + s[-2] + ')</span>'
        return text

    def __str__(self):
        """Return the whole parsed document as HTML."""
        return ''.join(elt.format() for elt in self.tree)
+
class HtmlWiktionaryMarkup(HtmlWikiMarkup):
    """Translator of Wiktionary articles into HTML.

    Placeholder reserved for future use: at present it adds nothing to
    HtmlWikiMarkup and behaves exactly like it.
    """
diff --git a/wikitrans/wiki2texi.py b/wikitrans/wiki2texi.py
new file mode 100644
index 0000000..d9e5f52
--- /dev/null
+++ b/wikitrans/wiki2texi.py
@@ -0,0 +1,410 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2015-2018 Sergey Poznyakoff
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+Wiki markup to Texinfo translator.
+
+Classes:
+
+TexiWikiMarkup -- Converts Wiki material to Texinfo.
+
+"""
+
+from wikitrans.wikimarkup import *
+from wikitrans.wikitoken import *
+from wikitrans.wikins import wiki_ns_re, wiki_ns
+import re
+import urllib
+
class Acc(list):
    """Output accumulator: a list of string fragments.

    str(acc) is the concatenation of all fragments.  The helper methods
    inspect or remove characters at the end of the accumulated text without
    joining the whole list.
    """

    def prepend(self, x):
        """Insert fragment X before all existing fragments."""
        self.insert(0, x)

    def is_empty(self):
        """Return True if the accumulator holds no fragments."""
        return len(self) == 0

    def clear(self):
        """Remove all fragments in place."""
        # Rebinding the local name (self = []) would leave the list intact.
        del self[:]

    def tail(self, n=1):
        """Return the last N characters of the accumulated text.

        Fewer than N characters are returned if the text is shorter.
        """
        parts = []
        for elt in reversed(self):
            if n <= 0:
                break
            if not elt:
                # Empty fragments contribute nothing; just skip them.
                continue
            parts.append(elt[-n:])
            n -= min(len(elt), n)
        parts.reverse()
        return ''.join(parts)

    def trim(self, n):
        """Remove the last N characters of the accumulated text in place."""
        while len(self) and n > 0:
            elt = self.pop()
            l = len(elt)
            if l == 0:
                continue
            elif l > n:
                # Keep the untrimmed head as one fragment.  ("self += str"
                # would extend the list character by character.)
                self.append(elt[0:-n])
                break
            n -= l

    def trimnl(self):
        """Remove one trailing newline, if present."""
        if self.endswith('\n'):
            self.trim(1)

    def trimpara(self):
        """Remove a trailing paragraph break (two newlines), if present."""
        if self.endswith('\n\n'):
            self.trim(2)

    def endswith(self, x):
        """Return True if the accumulated text ends with string X."""
        return self.tail(len(x)) == x

    def in_new_para(self):
        """Return True if nothing is accumulated or the text ends a paragraph."""
        return self.is_empty() or self.endswith('\n\n')

    def __str__(self):
        """Return the accumulated text."""
        return ''.join(self)
+
class TexiTextNode(WikiTextNode):
    """Literal text node for the Texinfo translator."""

    def format(self):
        """Send the node text (a string or a list of strings) to the parser."""
        emit = self.parser._print
        content = self.content
        if not isinstance(content, list):
            emit(content)
            return
        for chunk in content:
            emit(chunk)
+
class TexiTagNode(WikiTagNode):
    """Embedded tag rendered as Texinfo."""

    def format(self):
        """Print the Texinfo equivalent of this tag through the parser.

        'code'/'tt' become @example (block) or @code{} (inline), 'div' with
        an id emits an @anchor before its content, 'ref' becomes @footnote,
        'references' prints nothing; any other tag is passed through.
        """
        parser = self.parser
        if self.tag in ['code', 'tt']:
            # Capture the content separately so it can be wrapped below.
            save = parser._begin_print()
            parser.nested += 1
            self.content.format()
            parser.nested -= 1
            s = parser._end_print(save)
            if self.isblock:
                parser._print('@example', nl=True, escape=False)
                parser._print(s, escape=False)
                parser._print('@end example\n', nl=True, escape=False)
            else:
                parser._print('@code{%s}' % s, escape=False)
        elif self.tag == 'div':
            if self.args and 'id' in self.args:
                parser._print("@anchor{%s}\n" % self.args['id'],
                              nl=True, escape=False)
            self.content.format()
        elif self.tag == 'ref':
            parser._print('@footnote{', escape=False)
            self.content.format()
            parser._print('}', escape=False)
        elif self.tag == 'references':
            pass
        else:
            parser._print('<' + self.tag)
            if self.args:
                # args need not be a plain string (the 'div' branch indexes
                # it like a mapping), so convert explicitly.
                parser._print(' ' + str(self.args))
            parser._print('>')
            self.content.format()
            parser._print('</' + self.tag + '>')
+
class TexiParaNode(WikiSeqNode):
    """Paragraph for the Texinfo translator."""

    def _break_para(self):
        """Print a paragraph break unless the output is already at one."""
        parser = self.parser
        if not parser.acc.in_new_para():
            parser._print('\n', nl=True)

    def format(self):
        """Print the children, separated from surrounding text by breaks."""
        self._break_para()
        for child in self.content:
            child.format()
        self._break_para()
+
class TexiPreNode(WikiSeqNode):
    """Preformatted text for the Texinfo translator."""

    def format(self):
        """Print the children, inside @example unless parser.nested is set."""
        parser = self.parser
        wrap = not parser.nested
        if wrap:
            parser._print('@example\n', nl=True, escape=False)
        for child in self.content:
            child.format()
        # Re-read the flag, as the original does after formatting children.
        wrap = not parser.nested
        if wrap:
            parser._print('@end example\n', nl=True, escape=False)
+
class TexiFontNode(WikiSeqNode):
    """Italic ('IT') or bold ('BOLD') span rendered as @i{} or @b{}."""

    def format(self):
        """Print the children wrapped in the matching Texinfo font command."""
        parser = self.parser
        command = {'IT': 'i', 'BOLD': 'b'}[self.type]
        parser._print('@%s{' % command, escape=False)
        for child in self.content:
            child.format()
        parser._print('}', escape=False)
+
class TexiHdrNode(WikiHdrNode):
    """Section heading for the Texinfo translator."""

    def format(self):
        """Print the sectioning command that corresponds to this heading.

        The command is looked up in the parser's table for the selected
        sectioning model at position level + sectioning_start.  Headings
        deeper than the table are printed as a plain line that starts with
        "@* ".  In models that begin with '@top' a "@node" line with the
        heading text follows the sectioning command.
        """
        parser = self.parser
        commands = parser.sectcomm[parser.sectioning_model]
        # sectioning_start shifts headings towards deeper commands, so it is
        # added to the level; the depth guard below uses the same sum.
        index = self.level + parser.sectioning_start
        # FIXME
        if index > len(commands) - 1:
            parser._print("@* ", nl=True, escape=False)
            self.content.format()
        else:
            parser._print(commands[index] + " ", nl=True, escape=False)
            self.content.format()
            parser._print(None, nl=True)
            if commands[0] == '@top':
                parser._print('@node ', nl=True, escape=False)
                self.content.format()
                parser._print('\n')
        parser._print(None, nl=True)
+
class TexiBarNode(WikiNode):
    """Horizontal bar for the Texinfo translator."""

    def format(self):
        """Print a line of five dashes surrounded by newlines."""
        parser = self.parser
        parser._print("\n-----\n")
+
class TexiIndNode(WikiIndNode):
    """Indented text for the Texinfo translator."""

    def format(self):
        """Print the content on its own line, prefixed by one "@w{ }" per level."""
        emit = self.parser._print
        padding = "@w{ }" * self.level
        emit(padding, nl=True, escape=False)
        self.content.format()
        emit(None, nl=True)
+
class TexiEnvNode(WikiEnvNode):
    """List environment for the Texinfo translator."""

    def format(self):
        """Print the list as @itemize, @enumerate or @table @asis.

        Environment types other than 'unnumbered', 'numbered' and 'defn'
        print nothing.
        """
        emit = self.parser._print
        kind = self.envtype
        if kind in ('unnumbered', 'numbered'):
            if kind == 'unnumbered':
                opening, closing = '@itemize @bullet\n', '@end itemize\n'
            else:
                opening, closing = '@enumerate\n', '@end enumerate\n'
            emit(opening, nl=True, escape=False)
            for item in self.content:
                emit('@item ', nl=True, escape=False)
                item.content.format()
                emit(None, nl=True)
                emit('\n')
            emit(closing, nl=True, escape=False)
        elif kind == 'defn':
            emit('@table @asis\n', nl=True, escape=False)
            for item in self.content:
                if item.subtype == 0:
                    # Term line.
                    emit('@item ', nl=True, escape=False)
                    item.content.format()
                    emit(None, nl=True)
                else:
                    # Definition text, followed by a separating newline.
                    item.content.format()
                    emit(None, nl=True)
                    emit('\n')
            emit('@end table\n', nl=True, escape=False)
+
class TexiLinkNode(WikiSeqNode):
    """Wiki link for the Texinfo translator."""

    def format(self):
        """Print the link as @ref{QUAL} or @ref{QUAL,TEXT}.

        QUAL is the part of the target before the first ':'.  Nothing is
        printed for 'disambigR'/'wikiquote' targets or 'thumb' links.
        """
        parser = self.parser
        # Format the target into a scratch accumulator.
        saved = parser._begin_print()
        self.content[0].format()
        arg = parser._end_print()

        s = None
        text = None
        if len(self.content) > 1:
            s = []
            for node in self.content[0:2]:
                parser._begin_print()
                node.format()
                s.append(parser._end_print())
            text = s[1]

        # Put the caller's accumulator back.
        parser._end_print(saved)

        if s and (s[0] in ('disambigR', 'wikiquote') or s[1] == 'thumb'):
            return

        qual = arg.partition(':')[0]
        if not text:
            parser._print("@ref{%s}" % qual, escape=False)
        else:
            parser._print("@ref{%s,%s}" % (qual, text), escape=False)
+
class TexiRefNode(WikiRefNode):
    """External reference for the Texinfo translator."""

    def format(self):
        """Print the reference as @uref{URL} or @uref{URL,TEXT}."""
        parser = self.parser
        url = self.ref
        # Capture the link text without disturbing the current output.
        saved = parser._begin_print()
        self.content.format()
        label = parser._end_print(saved)
        if not label:
            parser._print("@uref{%s}" % url, escape=False)
        else:
            parser._print("@uref{%s,%s}" % (url, label), escape=False)
+
class TexiWikiMarkup(WikiMarkup):
    """Wiki markup to Texinfo translator class.

    Usage:

      x = TexiWikiMarkup(file="input.wiki")
      # Parse the input:
      x.parse()
      # Print it as Texi:
      print(str(x))

    """

    # Incremented while the content of a code/tt tag is being formatted;
    # TexiPreNode omits its @example wrapper while it is non-zero.
    nested = 0

    # Sectioning commands for each model, indexed by heading depth.
    sectcomm = {
        'numbered': [
            '@top',
            '@chapter',
            '@section',
            '@subsection',
            '@subsubsection'
        ],
        'unnumbered': [
            '@top',
            '@unnumbered',
            '@unnumberedsec',
            '@unnumberedsubsec',
            '@unnumberedsubsubsec'
        ],
        'appendix': [
            '@top',
            '@appendix',
            '@appendixsec',
            '@appendixsubsec',
            '@appendixsubsubsec'
        ],
        'heading': [
            # The comma after '@majorheading' matters: without it the first
            # two entries are concatenated into one bogus command.
            '@majorheading',
            '@chapheading',
            '@heading',
            '@subheading',
            '@subsubheading'
        ]
    }

    sectioning_model = 'numbered'
    sectioning_start = 0

    def __init__(self, *args, **keywords):
        """Create a TexiWikiMarkup object.

        Arguments:

        filename=FILE
          Read Wiki material from the file named FILE.
        file=FD
          Read Wiki material from file object FD.
        text=STRING
          Read Wiki material from STRING.

        sectioning_model=MODEL
          Select the Texinfo sectioning model for the output document. Possible
          values are:

            'numbered'
               Top of document is marked with "@top". Headings ("=", "==",
               "===", etc) produce "@chapter", "@section", "@subsection", etc.
            'unnumbered'
               Unnumbered sectioning: "@top", "@unnumbered", "@unnumberedsec",
               "@unnumberedsubsec".
            'appendix'
               Sectioning suitable for appendix entries: "@top", "@appendix",
               "@appendixsec", "@appendixsubsec", etc.
            'heading'
               Use heading directives to reflect sectioning: "@majorheading",
               "@chapheading", "@heading", "@subheading", etc.
        sectioning_start=N
          Shift resulting heading level by N positions. For example, supposing
          "sectioning_model='numbered'", "== A ==" normally produces
          "@section A" on output. Now, if given "sectioning_start=1", this
          directive will produce "@subsection A" instead.

        Raises ValueError if sectioning_model is not one of the names above
        or sectioning_start is outside the range 0..4.
        """

        super(TexiWikiMarkup, self).__init__(*args, **keywords)

        # Route every token type to its Texinfo-producing node class.
        self.token_class['TEXT'] = TexiTextNode
        self.token_class['TAG'] = TexiTagNode
        self.token_class['PARA'] = TexiParaNode
        self.token_class['PRE'] = TexiPreNode
        self.token_class['IT'] = TexiFontNode
        self.token_class['BOLD'] = TexiFontNode
        self.token_class['HDR'] = TexiHdrNode
        self.token_class['BAR'] = TexiBarNode
        self.token_class['IND'] = TexiIndNode
        self.token_class['ENV'] = TexiEnvNode
        self.token_class['LINK'] = TexiLinkNode
        self.token_class['REF'] = TexiRefNode

        if "sectioning_model" in keywords:
            val = keywords["sectioning_model"]
            if val in self.sectcomm:
                self.sectioning_model = val
            else:
                raise ValueError("Invalid value for sectioning model: %s" % val)
        if "sectioning_start" in keywords:
            val = keywords["sectioning_start"]
            if val < 0 or val > 4:
                raise ValueError("Invalid value for sectioning start: %s" % val)
            else:
                self.sectioning_start = val

    # Characters that are special in Texinfo and must be prefixed with '@'.
    replchars = re.compile(r'([@{}])')
    # Current output accumulator; replaced by _begin_print/_end_print.
    acc = Acc()

    def _print(self, text, **kw):
        """Append TEXT to the current accumulator.

        Keyword arguments:

        nl=BOOL
          If true, first make sure the output ends with a newline.
        escape=BOOL
          If true (the default), escape '@', '{' and '}' in TEXT.

        A false TEXT (None or '') adds nothing; combined with nl=True it
        just terminates the current line.
        """
        nl = kw.pop('nl', False)
        escape = kw.pop('escape', True)
        if nl and not self.acc.endswith('\n'):
            self.acc += '\n'
        if text:
            if escape:
                self.acc += self.replchars.sub(r'@\1', text)
            else:
                self.acc += text

    def _begin_print(self):
        """Start a fresh accumulator and return the one that was current."""
        s = self.acc
        self.acc = Acc()
        return s

    def _end_print(self, val=None):
        """Return the accumulated text and make VAL the current accumulator."""
        s = self.acc
        self.acc = val
        return str(s)

    def __str__(self):
        """Return the whole parsed document as Texinfo."""
        self._begin_print()
        for elt in self.tree:
            elt.format()
        # Drop the paragraph break left after the last element.
        self.acc.trimpara()
        return self._end_print()
+
+
+
+
+
+
+
diff --git a/wikitrans/wiki2text.py b/wikitrans/wiki2text.py
new file mode 100644
index 0000000..1fbc61b
--- /dev/null
+++ b/wikitrans/wiki2text.py
@@ -0,0 +1,348 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2008-2018 Sergey Poznyakoff
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+Wiki markup to plain text translator.
+
+Classes:
+
+TextWikiMarkup -- Converts Wiki material to plain text.
+TextWiktionaryMarkup -- Reserved for future use. Currently does the same as
+ TextWikiMarkup.
+
+"""
+
+from wikitrans.wikitoken import *
+from wikitrans.wikimarkup import *
+from wikitrans.wikins import wiki_ns_re, wiki_ns
+import re
+try:
+ from urllib import quote as url_quote
+except ImportError:
+ from urllib.parse import quote as url_quote
+
class TextSeqNode(WikiSeqNode):
    """Sequence of nodes rendered as plain text."""

    def format(self):
        """Return the children's text, space-separated where needed.

        A space is inserted before a child when the text so far is longer
        than one character and does not already end in whitespace.
        """
        out = ""
        for child in self.content:
            needs_gap = len(out) > 1 and not out[-1].isspace()
            out += (' ' if needs_gap else '') + child.format()
        return out
+
class TextTextNode(WikiTextNode):
    """Literal text node for the plain text translator."""

    def format(self):
        """Return the node text; a list of strings is joined with separators."""
        content = self.content
        if not isinstance(content, list):
            return content
        out = ""
        for piece in content:
            if out:
                # NOTE(review): both separators are a single space here; a
                # wider gap after a full stop may have been intended --
                # confirm against the upstream file.
                out += " " if out.endswith(".") else " "
            out += piece
        return out
+
class TextPreNode(WikiSeqNode):
    """Preformatted text for the plain text translator."""

    def format(self):
        """Return the children's text followed by a newline."""
        pieces = [child.format() for child in self.content]
        return ''.join(pieces) + '\n'
+
class TextParaNode(WikiSeqNode):
    """Paragraph for the plain text translator."""

    def format(self):
        """Return the paragraph passed through parser.fmtpara plus a blank line."""
        raw = ''.join(child.format() for child in self.content)
        return self.parser.fmtpara(raw) + '\n\n'
+
class TextItNode(WikiSeqNode):
    """Italic span rendered as _text_."""

    def format(self):
        """Return the non-empty children joined by spaces, inside underscores."""
        pieces = []
        for child in self.content:
            piece = child.format()
            if piece:
                pieces.append(piece)
        return "_" + " ".join(pieces).lstrip(" ") + "_"
+
class TextBoldNode(WikiSeqNode):
    """Bold span rendered in upper case."""

    def format(self):
        """Return the children's text, each preceded by a separator, upper-cased."""
        out = ""
        for child in self.content:
            # NOTE(review): both separators are a single space here; a wider
            # gap after a full stop may have been intended -- confirm
            # against the upstream file.
            out += " " if out.endswith(".") else " "
            out += child.format()
        return out.upper()
+
class TextLinkNode(WikiSeqNode):
    """Wiki link for the plain text translator."""

    def format(self):
        """Return the plain text form of this link.

        Without parser.show_urls the result is the link text (or the target
        if there is no text).  With show_urls it is "TEXT (see URL) ".
        Images are dropped unless show_urls is set; 'disambigR'/'wikiquote'
        targets and 'thumb' links always produce ''.
        """
        parser = self.parser
        arg = self.content[0].format()
        if len(self.content) > 1:
            s = [x.format() for x in self.content]
            text = s[1]
        else:
            s = None
            text = None

        if s:
            if s[0] == 'disambigR' or s[0] == 'wikiquote':
                return ""
            if len(s) > 1 and s[1] == 'thumb':
                return ""
        (qual, sep, tgt) = arg.partition(':')
        if tgt != '':
            ns = parser.wiki_ns_name(qual)
            if ns:
                if ns == 'NS_IMAGE':
                    if not parser.show_urls:
                        return ""
                    text = "[%s: %s]" % (qual, text if text else arg)
                    # image_base belongs to the translator, not to the node.
                    tgt = "%s/%s/250px-%s" % (parser.image_base,
                                              url_quote(tgt),
                                              url_quote(tgt))
                elif ns == 'NS_MEDIA':
                    text = "[%s]" % (qual)
                else:
                    tgt = parser.mktgt(tgt)
            elif self.type == 'LINK' and qual in parser.langtab:
                text = parser.langtab[qual] + ": " + tgt
                tgt = parser.mktgt(tgt, qual)
            else:
                tgt = parser.mktgt(tgt)
        else:
            tgt = parser.mktgt(arg)
        # Fall back to the target so a text-less link never prints "None".
        label = text if text else arg
        if parser.show_urls:
            return "%s (see %s) " % (label, tgt)
        return label
+
class TextTmplNode(TextLinkNode):
    """Template rendered like a link, enclosed in square brackets."""

    def format(self):
        """Return the link text of the template wrapped in '[' and ']'."""
        inner = super(TextTmplNode, self).format()
        return '[' + inner + ']'
+
class TextBarNode(WikiNode):
    """Horizontal bar for the plain text translator."""

    def format(self):
        """Return a centered line of dashes sized from the output width.

        Widths below 5 are treated as 5.
        """
        width = max(self.parser.width, 5)
        dashes = "-" * (width - 5)
        return "\n" + dashes.center(width - 1) + "\n"
+
class TextHdrNode(WikiHdrNode):
    """Section heading for the plain text translator."""

    def format(self):
        """Return the heading text prefixed by one '*' per heading level."""
        stars = "*" * self.level
        title = self.content.format().lstrip(" ")
        return "\n" + stars + " " + title + "\n\n"
+
class TextRefNode(WikiRefNode):
    """External reference for the plain text translator."""

    def format(self):
        """Return "TEXT (see URL) ", or "see URL" when there is no text."""
        label = self.content.format()
        if not label:
            return "see " + self.ref
        return "%s (see %s) " % (label, self.ref)
+
class TextEnvNode(WikiEnvNode):
    """List environment for the plain text translator."""

    def format(self):
        """Return the list items, indented through parser.indent.

        Unnumbered items get a "- " bullet, numbered items a running
        "N. " prefix; definition terms and bodies are indented by level-1
        and level+3 respectively.  A level above width-4 is reset to 1.
        """
        parser = self.parser
        kind = self.envtype
        depth = self.level
        if depth > parser.width - 4:
            depth = 1
        out = ""
        counter = 1
        for item in self.content:
            if not out.endswith("\n"):
                out += "\n"
            body = item.content.format()
            if kind == "unnumbered":
                out += parser.indent(depth, "- " + body.lstrip(" "))
            elif kind == "numbered":
                out += parser.indent(depth, "%d. %s" % (counter, body))
                counter += 1
            elif kind == "defn":
                shift = depth - 1 if item.subtype == 0 else depth + 3
                out += parser.indent(shift, body)

            if not out.endswith("\n"):
                out += "\n"

        return out
+
class TextIndNode(WikiIndNode):
    """Indented text for the plain text translator."""

    def format(self):
        """Return the content preceded by one space per level, plus a newline."""
        padding = " " * self.level
        return padding + self.content.format() + '\n'
+
class TextTagNode(WikiTagNode):
    """Embedded tag for the plain text translator."""

    def format(self):
        """Return the plain text form of this tag.

        'code' yields its bare content, 'ref' a bracketed footnote number,
        'references' the numbered list of collected footnotes; any other
        tag is passed through with its arguments.
        """
        tag = self.tag
        if tag == 'code':
            self.parser.nested += 1
            result = self.content.format()
            self.parser.nested -= 1
            return result
        if tag == 'ref':
            return '[%d]' % (self.idx+1)
        if tag == 'references':
            result = '\nReferences:\n'
            for ref in self.parser.references:
                result += ('[%d]. ' % (ref.idx+1)) + ref.content.format() + '\n'
            return result
        opening = '<' + tag
        if self.args:
            opening += ' ' + str(self.args)
        return opening + '>' + self.content.format() + '</' + tag + '>'
+
+
+class TextWikiMarkup(WikiMarkup):
+ """A Wiki markup to plain text translator.
+
+ Usage:
+
+ x = TextWikiMarkup(file="input.wiki")
+ # Parse the input:
+ x.parse()
+ # Print it as plain text:
+ print(str(x))
+
+ """
+
+ # Output width
+ width = 78
+ # Do not show references.
+ show_urls = False
+ # Provide a minimum markup
+ markup = True
+
+ # Number of current element in the environment
+ num = 0
+
+ # Array of footnote references
+ references = []
+
+ def __init__(self, *args, **keywords):
+ """Create a TextWikiMarkup object.
+
+ Arguments:
+
+ filename=FILE
+ Read Wiki material from the file named FILE.
+ file=FD
+ Read Wiki material from file object FD.
+ text=STRING
+ Read Wiki material from STRING.
+
+ width=N
+ Limit output width to N columns. Default is 78.
+ show_urls=False
+ By default, the link URLs are displayed in parentheses next to the
+ link text. If this argument is given, only the link text will be
+ displayed.
+ """
+
+ super(TextWikiMarkup,self).__init__(*args, **keywords)
+ if 'width' in keywords: