summaryrefslogtreecommitdiffabout
authorSergey Poznyakoff <gray@gnu.org>2018-08-16 12:45:00 (GMT)
committer Sergey Poznyakoff <gray@gnu.org>2018-08-17 10:17:11 (GMT)
commit7186dbab7f1c1227e9229866e086bc417e3e4e52 (patch) (side-by-side diff)
treef29114e9ff7a7b023dd3d611a9bc8808f5cf5bbd
parentd9e26129527ce84f626eb44ff95e4ecfbc5bc92a (diff)
downloadwikitrans-7186dbab7f1c1227e9229866e086bc417e3e4e52.tar.gz
wikitrans-7186dbab7f1c1227e9229866e086bc417e3e4e52.tar.bz2
Fix PEP 8 issues.
Diffstat (more/less context) (ignore whitespace changes)
-rw-r--r--WikiTrans/wikitoken.py188
-rw-r--r--tests/test_html.py8
-rw-r--r--tests/test_texi.py6
-rw-r--r--tests/test_text.py8
-rw-r--r--tests/wikitest.py (renamed from tests/WikiTest.py)6
-rw-r--r--wikitrans/__init__.py (renamed from WikiTrans/__init__.py)0
-rw-r--r--wikitrans/wiki2html.py (renamed from WikiTrans/wiki2html.py)115
-rw-r--r--wikitrans/wiki2texi.py (renamed from WikiTrans/wiki2texi.py)63
-rw-r--r--wikitrans/wiki2text.py (renamed from WikiTrans/wiki2text.py)74
-rw-r--r--wikitrans/wikidump.py (renamed from WikiTrans/wikidump.py)41
-rw-r--r--wikitrans/wikimarkup.py (renamed from WikiTrans/wikimarkup.py)784
-rw-r--r--wikitrans/wikins.py (renamed from WikiTrans/wikins.py)0
-rw-r--r--wikitrans/wikitoken.py318
13 files changed, 978 insertions, 633 deletions
diff --git a/WikiTrans/wikitoken.py b/WikiTrans/wikitoken.py
deleted file mode 100644
index 2238a66..0000000
--- a/WikiTrans/wikitoken.py
+++ b/dev/null
@@ -1,188 +0,0 @@
-# Wiki tokens. -*- coding: utf-8 -*-
-# Copyright (C) 2015-2018 Sergey Poznyakoff
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 3, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-
-from __future__ import print_function
-import re
-import json
-
-class WikiNodeEncoder(json.JSONEncoder):
- def default(self, obj):
- if isinstance(obj,WikiNode):
- return obj.jsonEncode()
- return json.JSONEncoder.default(self, obj)
-
-def jsonencoder(func):
- def _mkencoder(self):
- json = func(self)
- json['wikinode'] = self.__class__.__name__
- json['type'] = self.type
- return json
- return _mkencoder
-
-class WikiNode(object):
- type = 'UNDEF'
- parser = None
- def __init__(self, parser, **kwargs):
- self.parser = parser
- for key in kwargs:
- if hasattr(self,key):
- self.__dict__[key] = kwargs[key]
- else:
- raise AttributeError("'%s' has no attribute '%s'" % (self.__class__.__name__, key))
-
- def __str__(self):
- return json.dumps(self, cls=WikiNodeEncoder, sort_keys=True)
-
- @jsonencoder
- def jsonEncode(self):
- ret = {}
- for x in dir(self):
- if x == 'parser' or x.startswith('_') or type(x) == 'function':
- continue
- if x in self.__dict__:
- ret[x] = self.__dict__[x]
- return ret
-
- def format(self):
- pass
-
-class WikiContentNode(WikiNode):
- content = None
- def format(self):
- pass
- @jsonencoder
- def jsonEncode(self):
- ret = {}
- if self.content:
- if self.type == 'TEXT':
- ret['content'] = self.content
- elif isinstance(self.content,list):
- ret['content'] = map(lambda x: x.jsonEncode(), self.content)
- elif isinstance(self.content,WikiNode):
- ret['content'] = self.content.jsonEncode()
- else:
- ret['content'] = self.content
- else:
- ret['content'] = None
- return ret
-
-class WikiSeqNode(WikiContentNode):
- def format(self):
- for x in self.content:
- x.format()
- @jsonencoder
- def jsonEncode(self):
- ret = {}
- if not self.content:
- ret['content'] = None
- elif isinstance(self.content,list):
- ret['content'] = map(lambda x: x.jsonEncode(), self.content)
- elif isinstance(self.content,WikiNode):
- ret['content'] = self.content.jsonEncode()
- else:
- ret['content'] = self.content
- return ret
-
-
-# ##############
-
-class WikiTextNode(WikiContentNode):
- type = 'TEXT'
- @jsonencoder
- def jsonEncode(self):
- return {
- 'content': self.content
- }
-
-class WikiDelimNode(WikiContentNode):
- type = 'DELIM'
- isblock=False
- continuation = False
-
-class WikiTagNode(WikiContentNode):
- tag = None
- isblock = False
- args = None
- idx = None
- def __init__(self, *args, **keywords):
- super(WikiTagNode, self).__init__(*args, **keywords)
- if self.type == 'TAG' and self.tag == 'ref' and hasattr(self.parser,'references'):
- self.idx = len(self.parser.references)
- self.parser.references.append(self)
- @jsonencoder
- def jsonEncode(self):
- return {
- 'tag': self.tag,
- 'isblock': self.isblock,
- 'args': self.args.tab if self.args else None,
- 'content': self.content.jsonEncode() if self.content else None,
- 'idx': self.idx
- }
-
-class WikiRefNode(WikiContentNode):
- type = 'REF'
- ref = None
- @jsonencoder
- def jsonEncode(self):
- return {
- 'ref': self.ref,
- 'content': self.content.jsonEncode()
- }
-
-class WikiHdrNode(WikiContentNode):
- type = 'HDR'
- level = None
- @jsonencoder
- def jsonEncode(self):
- return {
- 'level': self.level,
- 'content': self.content.jsonEncode()
- }
-
-class WikiEltNode(WikiContentNode):
- type = 'ELT'
- subtype = None
- @jsonencoder
- def jsonEncode(self):
- return {
- 'subtype': self.subtype,
- 'content': self.content.jsonEncode()
- }
-
-class WikiEnvNode(WikiContentNode):
- type = 'ENV'
- envtype = None
- level = None
- @jsonencoder
- def jsonEncode(self):
- return {
- 'envtype': self.envtype,
- 'level': self.level,
- 'content': map(lambda x: x.jsonEncode(), self.content)
- }
-
-class WikiIndNode(WikiContentNode):
- type = 'IND'
- level = None
- @jsonencoder
- def jsonEncode(self):
- return {
- 'level': self.level,
- 'content': self.content.jsonEncode()
- }
-
-
-
diff --git a/tests/test_html.py b/tests/test_html.py
index 3da57f6..5a15cb8 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -2,13 +2,13 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
import unittest
-from WikiTrans.wiki2html import HtmlWiktionaryMarkup
-from WikiTest import populateMethods
+from wikitrans.wiki2html import HtmlWikiMarkup
+from wikitest import populate_methods
-class TestWiktionaryMarkup (unittest.TestCase):
+class TestWikiMarkup (unittest.TestCase):
pass
-populateMethods(TestWiktionaryMarkup, HtmlWiktionaryMarkup, '.html')
+populate_methods(TestWikiMarkup, HtmlWikiMarkup, '.html')
if __name__ == '__main__':
unittest.main()
diff --git a/tests/test_texi.py b/tests/test_texi.py
index 75314c9..ddd26c7 100644
--- a/tests/test_texi.py
+++ b/tests/test_texi.py
@@ -2,13 +2,13 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
import unittest
-from WikiTrans.wiki2texi import TexiWikiMarkup
-from WikiTest import populateMethods
+from wikitrans.wiki2texi import TexiWikiMarkup
+from wikitest import populate_methods
class TestTexiWikiMarkup (unittest.TestCase):
pass
-populateMethods(TestTexiWikiMarkup, TexiWikiMarkup, '.texi')
+populate_methods(TestTexiWikiMarkup, TexiWikiMarkup, '.texi')
if __name__ == '__main__':
unittest.main()
diff --git a/tests/test_text.py b/tests/test_text.py
index a06f519..b3d0a12 100644
--- a/tests/test_text.py
+++ b/tests/test_text.py
@@ -2,13 +2,13 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
import unittest
-from WikiTrans.wiki2text import TextWiktionaryMarkup
-from WikiTest import populateMethods
+from wikitrans.wiki2text import TextWikiMarkup
+from wikitest import populate_methods
-class TestTextWiktionaryMarkup (unittest.TestCase):
+class TestTextWikiMarkup (unittest.TestCase):
pass
-populateMethods(TestTextWiktionaryMarkup, TextWiktionaryMarkup, '.text')
+populate_methods(TestTextWikiMarkup, TextWikiMarkup, '.text')
if __name__ == '__main__':
unittest.main()
diff --git a/tests/WikiTest.py b/tests/wikitest.py
index 1429f5e..ff26227 100644
--- a/tests/WikiTest.py
+++ b/tests/wikitest.py
@@ -4,7 +4,7 @@ from __future__ import print_function
from glob import glob
import os.path
-def MarkupTest(classname, name_in, name_out):
+def wiki_markup_test(classname, name_in, name_out):
fh = open(name_out)
buf = ''.join(fh.readlines()).strip()
fh.close()
@@ -19,10 +19,10 @@ def MarkupTest(classname, name_in, name_out):
print(">>>%s<<<" % str(hwm).strip())
return False
-def populateMethods(cls, wcls, suffix):
+def populate_methods(cls, wcls, suffix):
def settest(self, base, wiki_name, pat_name):
def dyntest(self):
- self.assertTrue(MarkupTest(wcls, wiki_name, pat_name))
+ self.assertTrue(wiki_markup_test(wcls, wiki_name, pat_name))
meth = 'test_' + wcls.__name__ + '_' + base
dyntest.__name__ = meth
setattr(cls, meth, dyntest)
diff --git a/WikiTrans/__init__.py b/wikitrans/__init__.py
index 5832e38..5832e38 100644
--- a/WikiTrans/__init__.py
+++ b/wikitrans/__init__.py
diff --git a/WikiTrans/wiki2html.py b/wikitrans/wiki2html.py
index 6147642..ce65bae 100644
--- a/WikiTrans/wiki2html.py
+++ b/wikitrans/wiki2html.py
@@ -15,10 +15,21 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
+"""
+Wiki markup to HTML translator.
+
+Classes:
+
+HtmlWikiMarkup -- Converts Wiki material to HTML.
+HtmlWiktionaryMarkup -- Reserved for future use. Currently does the same as
+ HtmlWikiMarkup.
+
+"""
+
from __future__ import print_function
-from WikiTrans.wikimarkup import *
-from WikiTrans.wikitoken import *
-from WikiTrans.wikins import wiki_ns_re, wiki_ns
+from wikitrans.wikimarkup import *
+from wikitrans.wikitoken import *
+from wikitrans.wikins import wiki_ns_re, wiki_ns
import re
try:
from urllib import quote as url_quote
@@ -79,16 +90,16 @@ class HtmlLinkNode(HtmlSeqNode):
else:
tgt = self.parser.mktgt(arg)
return "<a href=\"%s\">%s</a>" % (tgt,
- text if (text and text != '') \
- else arg)
+ text if (text and text != '') else arg)
class HtmlRefNode(WikiRefNode):
def format(self):
target = self.ref
text = self.content.format()
- return "<a href=\"%s\">%s</a>" % (target,
- text if (text and text != '') \
- else target)
+ return "<a href=\"%s\">%s</a>" % (
+ target,
+ text if (text and text != '') else target
+ )
class HtmlFontNode(HtmlSeqNode):
def format(self):
@@ -152,14 +163,14 @@ class HtmlTagNode(WikiTagNode):
n = 0
for ref in self.parser.references:
n += 1
- s += ('<li id="cite_note-%d">' + \
- '<span class="mw-cite-backlink">' + \
- '<b><a href="#cite_ref-%d">^</a></b>' + \
- '</span>' + \
- '<span class="reference-text">' + \
- ref.content.format() + \
- '</span>' + \
- '</li>\n') % (n,n)
+ s += ('<li id="cite_note-%d">'
+ + '<span class="mw-cite-backlink">'
+ + '<b><a href="#cite_ref-%d">^</a></b>'
+ + '</span>'
+ + '<span class="reference-text">'
+ + ref.content.format()
+ + '</span>'
+ + '</li>\n') % (n,n)
s += '</ol>\n</div>\n'
return s
else:
@@ -187,17 +198,49 @@ class HtmlIndNode(WikiIndNode):
return ("<dl><dd>" * self.level) + self.content.format() + "</dd></dl>" * self.level
-class HtmlWikiMarkup (WikiMarkup):
- """
- A (hopefully) general-purpose Wiki->HTML translator class.
- FIXME: 1. See WikiMarkup for a list
- 2. [[official position]]s : final 's' gets after closing </a> tag.
- Should be before.
+class HtmlWikiMarkup(WikiMarkup):
+ """A Wiki markup to HTML translator class.
+
+ Usage:
+
+ x = HtmlWikiMarkup(file="input.wiki")
+ # Parse the input:
+ x.parse()
+ # Print it as HTML:
+ print(str(x))
+
+ Known bugs:
+ * [[official position]]s
+ Final 's' gets after closing </a> tag. Should be before.
"""
nested = 0
references = []
def __init__(self, *args, **kwargs):
+ """Create a HtmlWikiMarkup object.
+
+ Arguments:
+
+ filename=FILE
+ Read Wiki material from the file named FILE.
+ file=FD
+ Read Wiki material from file object FD.
+ text=STRING
+ Read Wiki material from STRING.
+ lang=CODE
+ Specifies source language. Default is 'en'. This variable can be
+ referred to as '%(lang)s' in the keyword arguments below.
+ html_base=URL
+ Base URL for cross-references. Default is
+ 'http://%(lang)s.wiktionary.org/wiki/'
+ image_base=URL
+ Base URL for images. Default is
+ 'http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf'
+ media_base=URL
+ Base URL for media files. Default is
+ 'http://www.mediawiki.org/xml/export-0.3'
+ """
+
super(HtmlWikiMarkup, self).__init__(*args, **kwargs)
self.token_class['LINK'] = HtmlLinkNode
self.token_class['TMPL'] = HtmlLinkNode
@@ -270,30 +313,8 @@ class HtmlWikiMarkup (WikiMarkup):
str += elt.format()
return str
-class HtmlWiktionaryMarkup (HtmlWikiMarkup):
- """
- A class for translating Wiktionary articles into HTML.
- This version does not do much, except that it tries to correctly
- format templates. But "tries" does not mean "does". The heuristics
- used here is clearly not enough to cope with it.
-
- 1. FIXME:
- The right solution would be to have a database of templates with their
- semantics and to decide on their rendering depending on that. E.g.
- {{term}} in en.wiktionary means "replace this with the search term".
- This, however, does not work in other wiktionaries. There are
- also more complex templates, e.g.: {{t+|bg|врата|n|p|tr=vrata|sc=Cyrl}}
- I don't know what it means. Couldn't find any documentation either.
- Again, this template does not work in other dictionaries.
+class HtmlWiktionaryMarkup(HtmlWikiMarkup):
+ """A class for translating Wiktionary articles into HTML.
- 2. Capitulation notice:
- Given the:
- 1. vast amount of wiktionaries available,
- 2. abundance of various templates for each wictionary,
- 3. apparent lack of documentation thereof,
- 4. the lack of standardized language-independent templates,
- I dont see any way to cope with the template-rendering task within a
- reasonable amount of time.
-
- Faeci quod potui, faciant meliora potentes.
+ Reserved for future use. Currently does the same as HtmlWikiMarkup.
"""
diff --git a/WikiTrans/wiki2texi.py b/wikitrans/wiki2texi.py
index 7297195..d9e5f52 100644
--- a/WikiTrans/wiki2texi.py
+++ b/wikitrans/wiki2texi.py
@@ -15,9 +15,18 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
-from WikiTrans.wikimarkup import *
-from WikiTrans.wikitoken import *
-from WikiTrans.wikins import wiki_ns_re, wiki_ns
+"""
+Wiki markup to Texinfo translator.
+
+Classes:
+
+TexiWikiMarkup -- Converts Wiki material to Texinfo.
+
+"""
+
+from wikitrans.wikimarkup import *
+from wikitrans.wikitoken import *
+from wikitrans.wikins import wiki_ns_re, wiki_ns
import re
import urllib
@@ -251,7 +260,19 @@ class TexiRefNode(WikiRefNode):
else:
parser._print("@uref{%s}" % target, escape=False)
-class TexiWikiMarkup (WikiMarkup):
+class TexiWikiMarkup(WikiMarkup):
+ """Wiki markup to Texinfo translator class.
+
+ Usage:
+
+ x = TexiWikiMarkup(file="input.wiki")
+ # Parse the input:
+ x.parse()
+ # Print it as Texi:
+ print(str(x))
+
+ """
+
nested = 0
sectcomm = {
'numbered': [
@@ -288,6 +309,40 @@ class TexiWikiMarkup (WikiMarkup):
sectioning_start = 0
def __init__(self, *args, **keywords):
+ """Create a TexiWikiMarkup object.
+
+ Arguments:
+
+ filename=FILE
+ Read Wiki material from the file named FILE.
+ file=FD
+ Read Wiki material from file object FD.
+ text=STRING
+ Read Wiki material from STRING.
+
+ sectioning_model=MODEL
+ Select the Texinfo sectioning model for the output document. Possible
+ values are:
+
+ 'numbered'
+ Top of document is marked with "@top". Headings ("=", "==",
+ "===", etc) produce "@chapter", "@section", "@subsection", etc.
+ 'unnumbered'
+ Unnumbered sectioning: "@top", "@unnumbered", "@unnumberedsec",
+ "@unnumberedsubsec".
+ 'appendix'
+ Sectioning suitable for appendix entries: "@top", "@appendix",
+ "@appendixsec", "@appendixsubsec", etc.
+ 'heading'
+ Use heading directives to reflect sectioning: "@majorheading",
+ "@chapheading", "@heading", "@subheading", etc.
+ sectioning_start=N
+ Shift resulting heading level by N positions. For example, supposing
+ "sectioning_model='numbered'", "== A ==" normally produces
+ "@section A" on output. Now, if given "sectioning_start=1", this
+ directive will produce "@subsection A" instead.
+ """
+
super(TexiWikiMarkup, self).__init__(*args, **keywords)
self.token_class['TEXT'] = TexiTextNode
diff --git a/WikiTrans/wiki2text.py b/wikitrans/wiki2text.py
index cb3a183..1fbc61b 100644
--- a/WikiTrans/wiki2text.py
+++ b/wikitrans/wiki2text.py
@@ -15,9 +15,20 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
-from WikiTrans.wikitoken import *
-from WikiTrans.wikimarkup import *
-from WikiTrans.wikins import wiki_ns_re, wiki_ns
+"""
+Wiki markup to plain text translator.
+
+Classes:
+
+TextWikiMarkup -- Converts Wiki material to plain text.
+TextWiktionaryMarkup -- Reserved for future use. Currently does the same as
+ TextWikiMarkup.
+
+"""
+
+from wikitrans.wikitoken import *
+from wikitrans.wikimarkup import *
+from wikitrans.wikins import wiki_ns_re, wiki_ns
import re
try:
from urllib import quote as url_quote
@@ -107,9 +118,9 @@ class TextLinkNode(WikiSeqNode):
if not self.parser.show_urls:
return ""
text = "[%s: %s]" % (qual, text if text else arg)
- tgt = self.image_base + '/' + \
- url_quote(tgt) + \
- '/250px-' + url_quote(tgt)
+ tgt = "%s/%s/250px-%s" % (self.image_base,
+ url_quote(tgt),
+ url_quote(tgt))
elif ns == 'NS_MEDIA':
text = "[%s]" % (qual)
else:
@@ -141,8 +152,11 @@ class TextBarNode(WikiNode):
class TextHdrNode(WikiHdrNode):
def format(self):
- return "\n" + ("*" * self.level) + " " + \
- self.content.format().lstrip(" ") + "\n\n"
+ return ("\n"
+ + ("*" * self.level)
+ + " "
+ + self.content.format().lstrip(" ")
+ + "\n\n")
class TextRefNode(WikiRefNode):
def format(self):
@@ -204,9 +218,17 @@ class TextTagNode(WikiTagNode):
return s
-class TextWikiMarkup (WikiMarkup):
- """
- A (general-purpose Wiki->Text translator class.
+class TextWikiMarkup(WikiMarkup):
+ """A Wiki markup to plain text translator.
+
+ Usage:
+
+ x = TextWikiMarkup(file="input.wiki")
+ # Parse the input:
+ x.parse()
+ # Print it as plain text:
+ print(str(x))
+
"""
# Output width
@@ -223,6 +245,25 @@ class TextWikiMarkup (WikiMarkup):
references = []
def __init__(self, *args, **keywords):
+ """Create a TextWikiMarkup object.
+
+ Arguments:
+
+ filename=FILE
+ Read Wiki material from the file named FILE.
+ file=FD
+ Read Wiki material from file object FD.
+ text=STRING
+ Read Wiki material from STRING.
+
+ width=N
+ Limit output width to N columns. Default is 78.
+ show_urls=False
+ By default, the link URLs are displayed in parentheses next to the
+ link text. If this argument is given, only the link text will be
+ displayed.
+ """
+
super(TextWikiMarkup,self).__init__(*args, **keywords)
if 'width' in keywords:
self.width = keywords['width']
@@ -258,7 +299,7 @@ class TextWikiMarkup (WikiMarkup):
lang = self.lang
return self.html_base % { 'lang' : lang } + url_quote(tgt)
- def indent (self, lev, text):
+ def indent(self, lev, text):
if text.find('\n') == -1:
s = (" " * lev) + text
else:
@@ -298,9 +339,10 @@ class TextWikiMarkup (WikiMarkup):
str += elt.format()
return str
-class TextWiktionaryMarkup (TextWikiMarkup):
- """
- See documentation for HtmlWiktionaryMarkup
+class TextWiktionaryMarkup(TextWikiMarkup):
+ """A class for translating Wiktionary articles into plain text.
+
+ Reserved for future use. Currently does the same as TextWikiMarkup.
"""
- # FIXME: It is supposed to do something about templates
+
diff --git a/WikiTrans/wikidump.py b/wikitrans/wikidump.py
index 7457dfa..d5f651c 100644
--- a/WikiTrans/wikidump.py
+++ b/wikitrans/wikidump.py
@@ -14,10 +14,19 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
+"""
+Print Wiki parse tree as JSON.
+
+Classes:
+
+DumpWikiMarkup
+
+"""
+
from __future__ import print_function
-from WikiTrans.wikitoken import *
+from wikitrans.wikitoken import *
import json
-from WikiTrans.wikimarkup import WikiMarkup
+from wikitrans.wikimarkup import WikiMarkup
class DumpReferences(object):
idx = 0
@@ -27,13 +36,39 @@ class DumpReferences(object):
self.idx += 1
class DumpWikiMarkup(WikiMarkup):
+ """Produce a JSON dump of the Wiki markup parse tree.
+
+ Usage:
+
+ x = DumpWikiMarkup(file="input.wiki")
+ # Parse the input:
+ x.parse()
+ # Print a JSON dump of the parse tree
+ print(str(x))
+
+ """
+
indent = None
references = DumpReferences()
def __init__(self, **kwarg):
+ """Create a DumpWikiMarkup object.
+
+ Arguments:
+
+ filename=FILE
+ Read Wiki material from the file named FILE.
+ file=FD
+ Read Wiki material from file object FD.
+ text=STRING
+ Read Wiki material from STRING.
+ indent=N
+ Basic indent offset for JSON objects.
+ """
+
n = kwarg.pop('indent', None)
if n != None:
self.indent = int(n)
- WikiMarkup.__init__(self, **kwarg)
+ super(DumpWikiMarkup,self).__init__(self, **kwarg)
def __str__(self):
return json.dumps(self.tree,
cls=WikiNodeEncoder,
diff --git a/WikiTrans/wikimarkup.py b/wikitrans/wikimarkup.py
index 6cbf5de..77c3b30 100644
--- a/WikiTrans/wikimarkup.py
+++ b/wikitrans/wikimarkup.py
@@ -1,40 +1,65 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (C) 2008-2018 Sergey Poznyakoff
-#
+#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3, or (at your option)
# any later version.
-#
+#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
-#
+#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
+"""
+Wiki markup parser.
+
+This module provides two classes:
+
+WikiMarkupParser:
+ An abstract parser class, which serves as a base class for all markup
+ classes in this package.
+
+WikiMarkup
+ A subclass of the above, providing basic input method.
+
+"""
+
from __future__ import print_function
import sys
import re
from types import *
-from WikiTrans.wikitoken import *
+from wikitrans.wikitoken import *
-__all__ = [ "BaseWikiMarkup", "WikiMarkup",
- "TagAttributes", "TagAttributeSyntax" ]
+__all__ = [ "WikiMarkupParser", "WikiMarkup",
+ "TagAttributes", "TagAttributeSyntaxError" ]
-class UnexpectedToken(Exception):
+class UnexpectedTokenError(Exception):
def __init__(self, value):
self.value = value
-class TagAttributeSyntax(Exception):
+class TagAttributeSyntaxError(Exception):
def __init__(self, value):
self.value = value
def __str__(self):
return repr(self.value)
class TagAttributes(object):
+ """A dictionary-like collection of tag attributes.
+
+ Example:
+
+ attr = TagAttributes('href="foo" length=2')
+ if 'href' in attr:
+ print(x['href']) # returns "foo"
+ for a in attr:
+ ...
+ """
+
attrstart = re.compile("^(?P<attr>[a-zA-Z0-9_-]+)(?P<eq>=\")?")
valseg = re.compile("^[^\\\"]+")
tab = {}
@@ -68,7 +93,7 @@ class TagAttributes(object):
val = 1
self.tab[name] = val
else:
- raise TagAttributeSyntax(s)
+ raise TagAttributeSyntaxError(s)
def __len__(self):
return len(self.tab)
def __getitem__(self, key):
@@ -89,13 +114,32 @@ class TagAttributes(object):
def __repr__(self):
return self.printable
-class BaseWikiMarkup(object):
+class WikiMarkupParser(object):
+ """Parser for Wiki markup language.
+
+ Given input in Wiki markup language creates an abstract parse tree for it.
+ This is a base class for actual parsers. The subclasses must provide the
+ input method.
+
+ Public methods:
+
+ parse() -- parse the input.
+
+ Abstract methods (must be overridden by the subclass):
+
+ input() -- returns next physical line from the input material.
+
+ Public attributes:
+
+ tree -- constructed parse tree (a subclass of WikiNode)
+
+ """
delim = re.compile("^==+[ \t]*|[ \t]*==+[ \t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)|<")
otag = re.compile("<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>[^/][^>]+))?\s*(?P<closed>/)?>")
ctag = re.compile("</(?P<tag>[a-zA-Z0-9_]+)\s*>")
refstart = re.compile("^https?://")
-
+
close_delim = {
'[': ']',
'[[': ']]',
@@ -115,10 +159,13 @@ class BaseWikiMarkup(object):
tree = None
tags = [ 'code', 'nowiki', 'tt', 'div', 'ref', 'references' ]
-
+
debug_level = 0
-
+
def dprint(self, lev, fmt, *argv):
+ """If current debug level is greater than or equal to lev, print *argv
+ according to format.
+ """
if self.debug_level >= lev:
for l in (fmt % argv).split('\n'):
print("[DEBUG] %s" % l)
@@ -135,7 +182,7 @@ class BaseWikiMarkup(object):
'TEXT': WikiTextNode,
'PRE': WikiContentNode,
'PARA': WikiSeqNode,
- 'BAR': WikiNode,
+ 'BAR': WikiNode,
'SEQ': WikiSeqNode,
'IND': WikiIndNode,
'REF': WikiRefNode,
@@ -148,10 +195,11 @@ class BaseWikiMarkup(object):
'HDR': WikiHdrNode
}
- def __createWikiNode(self,**kwarg):
+ def _new_node(self,**kwarg):
return self.token_class[kwarg['type']](self, **kwarg)
-
+
def tokread(self):
+ """Read next token from the input. Return it as a subclass of WikiNode."""
line = None
pos = 0
while 1:
@@ -161,23 +209,23 @@ class BaseWikiMarkup(object):
pos = 0
except StopIteration:
line = u''
-
+
if not line or line == "":
- yield(self.__createWikiNode(type='NIL'))
+ yield(self._new_node(type='NIL'))
break
if line == '\n':
- yield(self.__createWikiNode(type='NL'))
+ yield(self._new_node(type='NL'))
line = None
continue
self.dprint(100, "LINE: %s", line[pos:])
m = self.delim.search(line, pos)
-
+
if m:
if (pos < m.start(0)):
- yield(self.__createWikiNode(type='TEXT',
- content=line[pos:m.start(0)]))
+ yield(self._new_node(type='TEXT',
+ content=line[pos:m.start(0)]))
pos = m.start(0)
t = None
@@ -191,13 +239,13 @@ class BaseWikiMarkup(object):
try:
m = self.ctag.search(line, pos)
if m and m.group('tag') == 'nowiki':
- yield(self.__createWikiNode(type='TEXT',
- content=line[pos:m.start(0)] ))
+ yield(self._new_node(type='TEXT',
+ content=line[pos:m.start(0)] ))
pos = m.end(0)
break
- yield(self.__createWikiNode(type='TEXT',
- content=line[pos:]))
+ yield(self._new_node(type='TEXT',
+ content=line[pos:]))
line = self.input()
pos = 0
@@ -206,40 +254,41 @@ class BaseWikiMarkup(object):
continue
elif m.group('tag') in self.tags:
try:
- yield(self.__createWikiNode(type='OTAG',
+ yield(self._new_node(type='OTAG',
tag=m.group('tag'),
isblock=(line[pos] == '\n'),
args=TagAttributes(m.group('args'))))
if m.group('closed'):
- yield(self.__createWikiNode(type='CTAG',
- tag=m.group('tag')))
- except TagAttributeSyntax:
- yield(self.__createWikiNode(type='TEXT',content=m.group(0)))
+ yield(self._new_node(type='CTAG',
+ tag=m.group('tag')))
+ except TagAttributeSyntaxError:
+ yield(self._new_node(type='TEXT',
+ content=m.group(0)))
continue
else:
- yield(self.__createWikiNode(type='TEXT',content=m.group(0)))
+ yield(self._new_node(type='TEXT',content=m.group(0)))
continue
else:
m = self.ctag.match(line, pos)
if m:
if m.group('tag') in self.tags:
- yield(self.__createWikiNode(type='CTAG',
- tag=m.group('tag')))
+ yield(self._new_node(type='CTAG',
+ tag=m.group('tag')))
pos = m.end(0)
continue
else:
- yield(self.__createWikiNode(type='TEXT',
- content=line[pos:pos+1]))
+ yield(self._new_node(type='TEXT',
+ content=line[pos:pos+1]))
pos += 1
continue
else:
pos = m.end(0)
content = m.group(0)
if content[0] in self.envtypes:
- node = self.__createWikiNode(type='DELIM',
- content=content,
- isblock=True,
- continuation=pos < len(line) and line[pos] == ":")
+ node = self._new_node(type='DELIM',
+ content=content,
+ isblock=True,
+ continuation=pos < len(line) and line[pos] == ":")
if node.continuation:
node.content += node.content[0]
pos += 1
@@ -247,33 +296,43 @@ class BaseWikiMarkup(object):
yield(node)
while pos < len(line) and line[pos] in [' ', '\t']:
- pos += 1
+ pos += 1
else:
- yield(self.__createWikiNode(type='DELIM',
- isblock=(content.strip() not in self.inline_delims),
- content=content.strip()))
+ yield(self._new_node(type='DELIM',
+ isblock=(content.strip() not in self.inline_delims),
+ content=content.strip()))
continue
if line:
if line[-1] == '\n':
if line[pos:-1] != '':
- yield(self.__createWikiNode(type='TEXT',content=line[pos:-1]))
- yield(self.__createWikiNode(type='NL'))
+ yield(self._new_node(type='TEXT',content=line[pos:-1]))
+ yield(self._new_node(type='NL'))
else:
- yield(self.__createWikiNode(type='TEXT',content=line[pos:]))
+ yield(self._new_node(type='TEXT',content=line[pos:]))
line = None
-
+
def input(self):
+ """Return next physical line from the input.
+
+ This method must be overridden by the subclass.
+ """
return None
def swaptkn(self, i, j):
+ """Swap tokens at indices i and j in toklist."""
self.dprint(80, "SWAPPING %s <-> %s", i, j)
- x = self.toklist[i]
+ x = self.toklist[i]
self.toklist[i] = self.toklist[j]
self.toklist[j] = x
def tokenize(self):
+ """Tokenize the input.
+
+ Read tokens from the input (supplied by the input() method). Place the
+ obtained tokens in the toklist array.
+ """
self.toklist = []
for tok in self.tokread():
self.dprint(100, "TOK: %s", tok)
@@ -286,14 +345,14 @@ class BaseWikiMarkup(object):
#
# 2a. '''''a b'' c d'''
# 2b. '''''a b''' c d''
- #
+ #
# 3a. '''a b ''c d'''''
# 3b. ''a b '''c d'''''
stack = []
for i in range(0,len(self.toklist)):
- if self.toklist[i].type == 'DELIM' \
- and (self.toklist[i].content == "''" \
- or self.toklist[i].content == "'''"):
+ if (self.toklist[i].type == 'DELIM'
+ and (self.toklist[i].content == "''"
+ or self.toklist[i].content == "'''")):
if len(stack) > 0:
if self.toklist[stack[-1]].content == self.toklist[i].content:
# Case 1: just pop the matching delimiter off the stack
@@ -303,12 +362,13 @@ class BaseWikiMarkup(object):
self.swaptkn(stack[-2], stack[-1])
# and pop off the matching one
stack.pop()
- elif i < len(self.toklist) \
- and self.toklist[i+1].type == 'DELIM' \
- and self.toklist[stack[-1]].content == self.toklist[i+1].content:
+ elif (i < len(self.toklist)
+ and self.toklist[i+1].type == 'DELIM'
+ and self.toklist[stack[-1]].content
+ == self.toklist[i+1].content):
# Case 3: swap current and next tokens
self.swaptkn(i, i+1)
- # and pop off the matching one
+ # and pop off the matching one
stack.pop()
else:
# Push the token on stack
@@ -321,34 +381,46 @@ class BaseWikiMarkup(object):
self.toklist[i].type = 'TEXT' # FIXME
mark = []
-
+
def push_mark(self):
+ """Save the current token index on stack."""
self.mark.append(self.tokind)
def pop_mark(self):
+ """Restore the token index from top of stack."""
self.tokind = self.mark.pop()
def clear_mark(self):
+ """Forget the last mark."""
self.mark.pop()
-
+
def lookahead(self, off=0):
+ """Peek a token at index (tokind+off)."""
tok = self.toklist[self.tokind+off]
self.dprint(20, "lookahead(%s): %s", off, tok)
return tok
def setkn(self,val):
+ """Store token val at the current token index."""
self.toklist[self.tokind] = val
-
+
def getkn(self):
+ """Get next token from the toklist. Advance tokind."""
self.newline = self.tokind == 0 or self.toklist[self.tokind-1].type == 'NL'
if self.tokind == len(self.toklist):
- return self.__createWikiNode(type='NIL')
+ return self._new_node(type='NIL')
tok = self.toklist[self.tokind]
self.tokind = self.tokind + 1
self.dprint(20, "getkn: %s", tok)
return tok
-
+
def ungetkn(self, tok=None):
+ """Unget the last read token.
+
+ Decrease the tokind by one, so the last read token will be read again.
+ If optional argument is supplied and is not None, store it in the toklist
+ in place of the current token.
+ """
self.tokind = self.tokind - 1
self.newline = self.tokind == 0 or self.toklist[self.tokind-1].type == 'NL'
if tok:
@@ -357,17 +429,20 @@ class BaseWikiMarkup(object):
return self.toklist[self.tokind]
def fixuptkn(self, tok):
+ """Replace the recently read token by tok."""
if self.tokind == 0:
- raise IndexError('wikimarkup.fixuptkn called at start of input')
+ raise IndexError('WikiMarkupParser.fixuptkn called at start of input')
self.toklist[self.tokind-1] = tok
return tok
-
- def dump(self, tree, file=sys.stdout):
+
+ def dump(self, tree, file=sys.stdout):
+ """Dump the tree to file, node by node."""
for node in tree:
file.write(str(node))
file.write('\n')
def is_block_end(self, tok):
+ """Return True if tok ends a block environment."""
if tok.type == 'NIL':
return True
elif tok.type == 'NL':
@@ -383,20 +458,21 @@ class BaseWikiMarkup(object):
return False
def parse_para(self, tok):
+ """Read paragraph starting at tok."""
self.dprint(80, "ENTER parse_para: %s", tok)
acc = { 'seq': [],
'textlist': [] }
-
+
def flush():
if acc['textlist']:
- acc['seq'].append(self.__createWikiNode(type='TEXT',
- content=''.join(acc['textlist'])))
+ acc['seq'].append(self._new_node(type='TEXT',
+ content=''.join(acc['textlist'])))
acc['textlist'] = []
- if isinstance(tok, WikiContentNode) \
- and isinstance(tok.content,str) \
- and re.match("^[ \t]", tok.content):
+ if (isinstance(tok, WikiContentNode)
+ and isinstance(tok.content,str)
+ and re.match("^[ \t]", tok.content)):
type = 'PRE'
rx = re.compile("^\S")
else:
@@ -418,26 +494,27 @@ class BaseWikiMarkup(object):
flush()
acc['seq'].append(self.parse_inline_delim(tok))
else:
- raise UnexpectedToken(tok)
+ raise UnexpectedTokenError(tok)
tok = self.getkn()
flush()
if acc['seq']:
- tok = self.__createWikiNode(type=type, content=acc['seq'])
+ tok = self._new_node(type=type, content=acc['seq'])
else:
tok = None
self.dprint(80, "LEAVE parse_para=%s", tok)
return tok
def parse_block_delim(self, tok):
+ """Parse block environment starting at tok."""
self.dprint(80, "ENTER parse_block_delim")
assert(tok.type == 'DELIM')
if tok.content == "----":
- node = self.__createWikiNode(type = 'BAR')
+ node = self._new_node(type = 'BAR')
elif tok.content[0:2] == "==":
node = self.parse_header(tok)
if not node:
- tok = self.ungetkn(self.__createWikiNode(type='TEXT',
- content=tok.content))
+ tok = self.ungetkn(self._new_node(type='TEXT',
+ content=tok.content))
elif tok.content[0] in self.envtypes:
node = None
if tok.content[0] == ':':
@@ -451,8 +528,9 @@ class BaseWikiMarkup(object):
node = None
self.dprint(80, "LEAVE parse_block_delim=%s", node)
return node
-
+
def parse_line(self):
+ """Parse the input line."""
self.dprint(80, "ENTER parse_line")
list = []
while True:
@@ -463,8 +541,7 @@ class BaseWikiMarkup(object):
list.append(tok)
elif tok.type == 'DELIM':
if tok.isblock:
- tok = self.__createWikiNode(type = 'TEXT',
- content = tok.content)
+ tok = self._new_node(type = 'TEXT', content = tok.content)
self.fixuptkn(tok)
list.append(tok)
elif tok.content[0] == ":":
@@ -476,7 +553,8 @@ class BaseWikiMarkup(object):
if x:
list.append(x)
else:
- list.append(self.fixuptkn(self.__createWikiNode(type = 'TEXT', content = tok.content)))
+ list.append(self.fixuptkn(self._new_node(type = 'TEXT',
+ content = tok.content)))
elif tok.type == 'OTAG':
if tok.isblock:
self.ungetkn()
@@ -484,18 +562,26 @@ class BaseWikiMarkup(object):
list.append(self.parse_tag(tok))
else:
list.append(tok)
- ret = self.__createWikiNode(type='SEQ', content=list)
+ ret = self._new_node(type='SEQ', content=list)
self.dprint(80, "LEAVE parse_line=%s", ret)
return ret
-
+
def parse_indent(self, tok):
+ """Parse indented block starting at tok."""
lev = len(tok.content)
self.dprint(80, "ENTER parse_indent(%s)", lev)
- x = self.__createWikiNode(type='IND', level=lev, content=self.parse_line())
+ x = self._new_node(type='IND', level=lev, content=self.parse_line())
self.dprint(80, "LEAVE parse_indent=%s", x)
return x
-
+
def parse_fontmod(self,delim,what):
+ """Parse font modification directive (bold or italics).
+
+ Arguments:
+
+ delim -- starting delimiter ("''" or "'''")
+ what -- 'IT' or 'BOLD'
+ """
self.dprint(80, "ENTER parse_fontmod(%s,%s), tok %s",
delim, what, self.lookahead())
seq = []
@@ -513,7 +599,7 @@ class BaseWikiMarkup(object):
break
else:
if text:
- seq.append(self.__createWikiNode(type='TEXT', content=text))
+ seq.append(self._new_node(type='TEXT', content=text))
text = ''
x = self.parse_inline_delim(tok)
if x:
@@ -522,17 +608,18 @@ class BaseWikiMarkup(object):
self.dprint(80, "LEAVE parse_fontmod=%s", "None")
return None
elif tok.type == 'NL':
- seq.append(self.__createWikiNode(type='TEXT', content='\n'))
+ seq.append(self._new_node(type='TEXT', content='\n'))
else:
self.dprint(80, "LEAVE parse_fontmod=None")
return None
if text:
- seq.append(self.__createWikiNode(type='TEXT', content=text))
- res = self.__createWikiNode(type=what, content=seq)
- self.dprint(80, "LEAVE parse_fontmod=%s", res)
+ seq.append(self._new_node(type='TEXT', content=text))
+ res = self._new_node(type=what, content=seq)
+ self.dprint(80, "LEAVE parse_fontmod=%s", res)
return res
def parse_ref(self):
+ """Parse a reference block ([...])"""
self.dprint(80, "ENTER parse_ref")
tok = self.getkn()
if not (tok.type == 'TEXT' and self.refstart.match(tok.content)):
@@ -542,7 +629,7 @@ class BaseWikiMarkup(object):
seq = []
(ref,sep,text) = tok.content.partition(' ')
if text:
- seq.insert(0, self.__createWikiNode(type='TEXT', content=text))
+ seq.insert(0, self._new_node(type='TEXT', content=text))
while True:
tok = self.getkn()
@@ -567,13 +654,22 @@ class BaseWikiMarkup(object):
else:
seq.append(tok)
- ret = self.__createWikiNode(type='REF',
- ref=ref,
- content=self.__createWikiNode(type='SEQ', content=seq))
+ ret = self._new_node(type='REF', ref=ref,
+ content=self._new_node(type='SEQ', content=seq))
self.dprint(80, "LEAVE parse_ref= %s", ret)
return ret
def parse_link(self, type, delim):
+ """Parse an external link ([[...]]).
+
+ In this implementation, it is also used to parse template
+ references ({{...}}).
+
+ Arguments:
+
+ type -- 'LINK' or 'TMPL'
+ delim -- expected closing delimiter.
+ """
self.dprint(80, "ENTER parse_link(%s,%s)", type, delim)
subtree = []
list = []
@@ -585,13 +681,13 @@ class BaseWikiMarkup(object):
if tok.type == 'DELIM':
if tok.content == delim:
if list:
- subtree.append(self.__createWikiNode(type='SEQ',
- content=list))
+ subtree.append(self._new_node(type='SEQ',
+ content=list))
break
elif tok.content == "|":
if len(list) > 1:
- subtree.append(self.__createWikiNode(type='SEQ',
- content=list))
+ subtree.append(self._new_node(type='SEQ',
+ content=list))
elif list:
subtree.append(list[0])
list = []
@@ -607,11 +703,12 @@ class BaseWikiMarkup(object):
else:
self.dprint(80, "LEAVE parse_link=None [unexpected token]")
return None
- ret = self.__createWikiNode(type=type, content=subtree)
+ ret = self._new_node(type=type, content=subtree)
self.dprint(80, "LEAVE parse_link=%s", ret)
return ret
-
+
def parse_inline_delim(self, tok):
+ """Parse an inline block."""
self.dprint(80, "ENTER parse_inline_delim")
assert(tok.type == 'DELIM')
self.push_mark()
@@ -633,8 +730,7 @@ class BaseWikiMarkup(object):
else:
self.dprint(80, "BEGIN DELIMITER RECOVERY: %s", tok)
self.pop_mark()
- x = self.fixuptkn(self.__createWikiNode(type='TEXT',
- content=tok.content))
+ x = self.fixuptkn(self._new_node(type='TEXT', content=tok.content))
od = tok.content
if od in self.close_delim:
cd = self.close_delim[od]
@@ -647,8 +743,8 @@ class BaseWikiMarkup(object):
lev += 1
elif tok.content == cd:
if lev == 0:
- tok = self.__createWikiNode(type='TEXT',
- content=tok.content)
+ tok = self._new_node(type='TEXT',
+ content=tok.content)
self.toklist[self.tokind+1+i] = tok
lev -= 1
break
@@ -656,8 +752,9 @@ class BaseWikiMarkup(object):
self.dprint(80, "LEAVE parse_inline_delim=%s", x)
return x
-
+
def parse_tag(self, tag):
+ """Parse an xml-like tag (such as, e.g. "<tt>...</tt>")."""
self.dprint(80, "ENTER parse_tag")
list = []
self.push_mark()
@@ -669,7 +766,7 @@ class BaseWikiMarkup(object):
if tag.args:
s += ' ' + str(tag.args)
s += '>'
- node = self.__createWikiNode(type='TEXT',content=s)
+ node = self._new_node(type='TEXT',content=s)
if tag.content:
self.tree[self.tokind:self.tokind] = tag.content
self.dprint(80, "LEAVE parse_tag = %s (tree modified)", node)
@@ -685,30 +782,30 @@ class BaseWikiMarkup(object):
if tag.tag == tok.tag:
break
s = '</' + tag.tag + '>'
- tok = self.fixuptkn(self.__createWikiNode(type='TEXT',
- content=s))
+ tok = self.fixuptkn(self._new_node(type='TEXT', content=s))
elif tok.type == 'NL':
- tok = self.__createWikiNode(type = 'TEXT', content = '\n')
+ tok = self._new_node(type = 'TEXT', content = '\n')
list.append(tok)
self.clear_mark()
- ret = self.__createWikiNode(type = 'TAG',
- tag = tag.tag,
- args = tag.args,
- isblock = tag.isblock,
- content = self.__createWikiNode(type = 'SEQ', content = list))
+ ret = self._new_node(type = 'TAG',
+ tag = tag.tag,
+ args = tag.args,
+ isblock = tag.isblock,
+ content = self._new_node(type = 'SEQ', content = list))
self.dprint(80, "LEAVE parse_tag = %s", ret)
return ret
-
+
def parse_env(self, tok):
+ """Parse a block environment (numbered, unnumbered, or definition list)."""
type = self.envtypes[tok.content[0]][0]
lev = len(tok.content)
self.dprint(80, "ENTER parse_env(%s,%s)",type,lev)
list = []
while True:
- if tok.type == 'DELIM' \
- and tok.content[0] in self.envtypes \
- and type == self.envtypes[tok.content[0]][0]:
+ if (tok.type == 'DELIM'
+ and tok.content[0] in self.envtypes
+ and type == self.envtypes[tok.content[0]][0]):
if len(tok.content) < lev:
self.ungetkn()
break
@@ -717,9 +814,9 @@ class BaseWikiMarkup(object):
else:
elt = self.parse_line()
if not tok.continuation:
- list.append(self.__createWikiNode(type='ELT',
- subtype=self.envtypes[tok.content[0]][1],
- content=elt))
+ list.append(self._new_node(type='ELT',
+ subtype=self.envtypes[tok.content[0]][1],
+ content=elt))
tok = self.getkn()
continue
@@ -727,7 +824,7 @@ class BaseWikiMarkup(object):
if list[-1].content.type != 'SEQ':
x = list[-1].content.content
# FIXME:
- list[-1].content = self.__createWikiNode(type='SEQ', content=[x])
+ list[-1].content = self._new_node(type='SEQ', content=[x])
list[-1].content.content.append(elt)
else:
self.ungetkn()
@@ -735,21 +832,21 @@ class BaseWikiMarkup(object):
tok = self.getkn()
- ret = self.__createWikiNode(type='ENV',
- envtype=type,
- level=lev,
- content=list)
+ ret = self._new_node(type='ENV',
+ envtype=type,
+ level=lev,
+ content=list)
self.dprint(80, "LEAVE parse_env=%s", ret)
return ret
-
+
def parse_header(self, tok):
+ """Parse a Wiki header."""
self.dprint(80, "ENTER parse_header")
self.push_mark()
list = []
delim = tok.content
while True:
tok = self.getkn()
-
if tok.type == 'NL':
self.pop_mark()
self.dprint(80, "LEAVE parse_header=None")
@@ -779,17 +876,15 @@ class BaseWikiMarkup(object):
self.dprint(80, "LEAVE parse_header=None")
return None
list.append(self.parse_tag(tok))
-
-
self.clear_mark()
- ret = self.__createWikiNode(type='HDR',
- level = len(delim),
- content = self.__createWikiNode(type='SEQ',
- content=list))
+ ret = self._new_node(type='HDR',
+ level=len(delim),
+ content=self._new_node(type='SEQ', content=list))
self.dprint(80, "LEAVE parse_header=%s", ret)
return ret
-
+
def parse_block(self):
+ """Parse next block: newline, delimiter, tag, or paragraph."""
tok = self.getkn()
while tok.type == 'NL':
tok = self.getkn()
@@ -805,8 +900,12 @@ class BaseWikiMarkup(object):
return self.parse_tag(tok)
return self.parse_para(tok)
-
+
def parse(self):
+ """Parse Wiki material supplied by the input() method.
+
+ Store the resulting abstract parsing tree in the tree attribute.
+ """
if not self.toklist:
self.tokenize()
if self.debug_level >= 90:
@@ -829,10 +928,10 @@ class BaseWikiMarkup(object):
return str(self.tree)
-class WikiMarkup (BaseWikiMarkup):
+class WikiMarkup(WikiMarkupParser):
"""
- A derived class, that supplies a basic input method.
-
+ A derived parser class that supplies a basic input method.
+
Three types of inputs are available:
1. filename=<file>
@@ -849,13 +948,14 @@ class WikiMarkup (BaseWikiMarkup):
... Do whatever you need with obj.tree ...
"""
+
file = None
text = None
lang = 'en'
html_base = 'http://%(lang)s.wiktionary.org/wiki/'
image_base = 'http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf'
media_base = 'http://www.mediawiki.org/xml/export-0.3'
-
+
def __init__(self, *args, **keywords):
for kw in keywords:
if kw == 'file':
@@ -885,339 +985,301 @@ class WikiMarkup (BaseWikiMarkup):
else:
return None
- def is_lang_link(self, elt):
- if elt.type == 'LINK' \
- and isinstance(elt.content, list) \
- and len(elt.content) == 1:
- if elt.content[0].type == TEXT:
- m = re.match('([\w-]+):', elt.content[0].content)
- if m: # and m.group(1) in self.langtab:
- return True
- elif elt.content[0].type == 'SEQ' \
- and len(elt.content[0].content) == 1 and\
- elt.content[0].content[0].type == TEXT:
- m = re.match('([\w-]+):',elt.content[0].content[0].content)
- if m: # and m.group(1) in self.langtab:
- return True
- return False
-
- def is_empty_text(self, elt):
- if elt.type == 'TEXT':
- if re.search('\w', elt.content):
- return False
- return True
- return False
-
- def is_empty_para(self, seq):
- for x in seq:
- if not (self.is_lang_link(x) or self.is_empty_text(x)):
- return False
- return True
-
- # ISO 639
+ # ISO 639
langtab = {
"aa": "Afar", # Afar
- "ab": "Аҧсуа", # Abkhazian
- "ae": None, # Avestan
- "af": "Afrikaans", # Afrikaans
- "ak": "Akana", # Akan
+ "ab": "Аҧсуа", # Abkhazian
+ "ae": None, # Avestan
+ "af": "Afrikaans", # Afrikaans
+ "ak": "Akana", # Akan
"als": "Alemannisch",
- "am": "አማርኛ", # Amharic
- "an": "Aragonés", # Aragonese
+ "am": "አማርኛ", # Amharic
+ "an": "Aragonés", # Aragonese
"ang": "Englisc",
- "ar": "العربية" , # Arabic
+ "ar": "العربية" , # Arabic
"arc": "ܐܪܡܝܐ",
- "as": "অসমীয়া", # Assamese
- "ast": "Asturian",
- "av": "Авар", # Avaric
- "ay": "Aymara", # Aymara
- "az": "Azərbaycan" , # Azerbaijani
-
- "ba": "Башҡорт", # Bashkir
- "bar": "Boarisch",
+ "as": "অসমীয়া", # Assamese
+ "ast": "Asturian",
+ "av": "Авар", # Avaric
+ "ay": "Aymara", # Aymara
+ "az": "Azərbaycan" , # Azerbaijani
+
+ "ba": "Башҡорт", # Bashkir
+ "bar": "Boarisch",
"bat-smg": "Žemaitėška",
"bcl": "Bikol",
- "be": "Беларуская", # Byelorussian; Belarusian
+ "be": "Беларуская", # Byelorussian; Belarusian
"be-x-old": "Беларуская (тарашкевіца)",
- "bg": "Български", # Bulgarian
- "bh": "भोजपुरी", # Bihari
- "bi": "Bislama", # Bislama
- "bm": "Bamanankan", # Bambara
- "bn": "বাংলা" , # Bengali; Bangla
- "bo": "བོད་སྐད", # Tibetan
+ "bg": "Български", # Bulgarian
+ "bh": "भोजपुरी", # Bihari
+ "bi": "Bislama", # Bislama
+ "bm": "Bamanankan", # Bambara
+ "bn": "বাংলা" , # Bengali; Bangla
+ "bo": "བོད་སྐད", # Tibetan
"bpy": "ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী" ,
- "br": "Brezhoneg" , # Breton
- "bs": "Bosanski" , # Bosnian
+ "br": "Brezhoneg" , # Breton
+ "bs": "Bosanski" , # Bosnian
"bug": "Basa Ugi",
"bxr": "Буряад",
- "ca": "Català" , # Catalan
+ "ca": "Català" , # Catalan
"cbk-zam": "Chavacano de Zamboanga",
"cdo": "Mìng-dĕ̤ng-ngṳ̄",
"cho": "Choctaw",
- "ce": "Нохчийн", # Chechen
+ "ce": "Нохчийн", # Chechen
"ceb": "Sinugboanong Binisaya" , # Cebuano
- "ch": "Chamor", # Chamorro
+ "ch": "Chamor", # Chamorro
"chr": "ᏣᎳᎩ",
"chy": "Tsetsêhestâhese",
- "co": "Cors", # Corsican
- "cr": "Nehiyaw", # Cree
+ "co": "Cors", # Corsican
+ "cr": "Nehiyaw", # Cree
"crh": "Qırımtatarca",
- "cs": "Česky" , # Czech
+ "cs": "Česky" , # Czech
"csb": "Kaszëbsczi",
- "c": "Словѣньскъ", # Church Slavic
- "cv": "Чăваш", # Chuvash
- "cy": "Cymraeg" , # Welsh
+ "c": "Словѣньскъ", # Church Slavic
+ "cv": "Чăваш", # Chuvash
+ "cy": "Cymraeg" , # Welsh
- "da": "Dansk" , # Danish
- "de": "Deutsch" , # German
+ "da": "Dansk" , # Danish
+ "de": "Deutsch" , # German
"diq": "Zazaki", # Dimli (Southern Zazaki)
"dsb": "Dolnoserbski",
- "dv": "ދިވެހިބަސް", # Divehi
- "dz": "ཇོང་ཁ", # Dzongkha; Bhutani
+ "dv": "ދިވެހިބަސް", # Divehi
+ "dz": "ཇོང་ཁ", # Dzongkha; Bhutani
- "ee": "Eʋegbe", # Ewe
- "el": "Ελληνικά" , # Greek
+ "ee": "Eʋegbe", # Ewe
+ "el": "Ελληνικά" , # Greek
"eml": "Emiliàn e rumagnòl",
- "en": "English" , # English
+ "en": "English" , # English
"eo": "Esperanto" ,
- "es": "Español" , # Spanish
- "et": "Eesti" , # Estonian
- "eu": "Euskara" , # Basque
+ "es": "Español" , # Spanish
+ "et": "Eesti" , # Estonian
+ "eu": "Euskara" , # Basque
"ext": "Estremeñ",
- "fa": "فارسی" , # Persian
- "ff": "Fulfulde", # Fulah
- "fi": "Suomi" , # Finnish
+ "fa": "فارسی" , # Persian
+ "ff": "Fulfulde", # Fulah
+ "fi": "Suomi" , # Finnish
"fiu-vro": "Võro",
- "fj": "Na Vosa Vakaviti",# Fijian; Fiji
- "fo": "Føroyskt" , # Faroese
- "fr": "Français" , # French
+ "fj": "Na Vosa Vakaviti",# Fijian; Fiji
+ "fo": "Føroyskt" , # Faroese
+ "fr": "Français" , # French
"frp": "Arpitan",
"fur": "Furlan",
- "fy": "Frysk", # Frisian
+ "fy": "Frysk", # Frisian
- "ga": "Gaeilge", # Irish
+ "ga": "Gaeilge", # Irish
"gan": "贛語 (Gànyŭ)",
- "gd": "Gàidhlig", # Scots; Gaelic
- "gl": "Gallego" , # Gallegan; Galician
+ "gd": "Gàidhlig", # Scots; Gaelic
+ "gl": "Gallego" , # Gallegan; Galician
"glk": "گیلکی",
"got": "𐌲𐌿𐍄𐌹𐍃𐌺𐍉𐍂𐌰𐌶𐌳𐌰",
- "gn": "Avañe'ẽ", # Guarani
- "g": "ગુજરાતી", # Gujarati
- "gv": "Gaelg", # Manx
+ "gn": "Avañe'ẽ", # Guarani
+ "g": "ગુજરાતી", # Gujarati
+ "gv": "Gaelg", # Manx
- "ha": "هَوُسَ", # Hausa
+ "ha": "هَوُسَ", # Hausa
"hak": "Hak-kâ-fa / 客家話",
"haw": "Hawai`i",
- "he": "עברית" , # Hebrew (formerly iw)
- "hi": "हिन्दी" , # Hindi
+ "he": "עברית" , # Hebrew (formerly iw)
+ "hi": "हिन्दी" , # Hindi
"hif": "Fiji Hindi",
- "ho": "Hiri Mot", # Hiri Motu
- "hr": "Hrvatski" , # Croatian
+ "ho": "Hiri Mot", # Hiri Motu
+ "hr": "Hrvatski" , # Croatian
"hsb": "Hornjoserbsce",
- "ht": "Krèyol ayisyen" , # Haitian; Haitian Creole
- "hu": "Magyar" , # Hungarian
- "hy": "Հայերեն", # Armenian
- "hz": "Otsiherero", # Herero
+ "ht": "Krèyol ayisyen" , # Haitian; Haitian Creole
+ "hu": "Magyar" , # Hungarian
+ "hy": "Հայերեն", # Armenian
+ "hz": "Otsiherero", # Herero
"ia": "Interlingua",
"ie": "Interlingue",
- "id": "Bahasa Indonesia",# Indonesian (formerly in)
- "ig": "Igbo", # Igbo
- "ii": "ꆇꉙ ", # Sichuan Yi
- "ik": "Iñupiak", # Inupiak
+ "id": "Bahasa Indonesia",# Indonesian (formerly in)
+ "ig": "Igbo", # Igbo
+ "ii": "ꆇꉙ ", # Sichuan Yi
+ "ik": "Iñupiak", # Inupiak
"ilo": "Ilokano",
"io": "Ido" ,
- "is": "Íslenska" , # Icelandic
- "it": "Italiano" , # Italian
- "i": "ᐃᓄᒃᑎᑐᑦ", # Inuktitut
+ "is": "Íslenska" , # Icelandic
+ "it": "Italiano" , # Italian
+ "i": "ᐃᓄᒃᑎᑐᑦ", # Inuktitut
- "ja": "日本語", # Japanese
+ "ja": "日本語", # Japanese
"jbo": "Lojban",
- "jv": "Basa Jawa", # Javanese
+ "jv": "Basa Jawa", # Javanese
- "ka": "ქართული" , # Georgian
+ "ka": "ქართული" , # Georgian
"kaa": "Qaraqalpaqsha",
"kab": "Taqbaylit",
- "kg": "KiKongo", # Kongo
- "ki": "Gĩkũyũ", # Kikuyu
- "kj": "Kuanyama", # Kuanyama
- "kk": "Қазақша", # Kazakh
- "kl": "Kalaallisut", # Kalaallisut; Greenlandic
- "km": "ភាសាខ្មែរ", # Khmer; Cambodian
- "kn": "ಕನ್ನಡ", # Kannada
- "ko": "한국어" , # Korean
- "kr": "Kanuri", # Kanuri
- "ks": "कश्मीरी / كشميري", # Kashmiri
+ "kg": "KiKongo", # Kongo
+ "ki": "Gĩkũyũ", # Kikuyu
+ "kj": "Kuanyama", # Kuanyama
+ "kk": "Қазақша", # Kazakh
+ "kl": "Kalaallisut", # Kalaallisut; Greenlandic
+ "km": "ភាសាខ្មែរ", # Khmer; Cambodian
+ "kn": "ಕನ್ನಡ", # Kannada
+ "ko": "한국어" , # Korean
+ "kr": "Kanuri", # Kanuri
+ "ks": "कश्मीरी / كشميري", # Kashmiri
"ksh": "Ripoarisch",
- "ku": "Kurdî / كوردی", # Kurdish
- "kv": "Коми", # Komi
- "kw": "Kernewek/Karnuack", # Cornish
- "ky": "Кыргызча", # Kirghiz
+ "ku": "Kurdî / كوردی", # Kurdish
+ "kv": "Коми", # Komi
+ "kw": "Kernewek/Karnuack", # Cornish
+ "ky": "Кыргызча", # Kirghiz
- "la": "Latina" , # Latin
+ "la": "Latina" , # Latin
"lad": "Dzhudezmo",
- "lb": "Lëtzebuergesch" , # Letzeburgesch
+ "lb": "Lëtzebuergesch" , # Letzeburgesch
"lbe": "Лакку",
- "lg": "Luganda", # Ganda
- "li": "Limburgs", # Limburgish; Limburger; Limburgan
- "lij": "Lígur",
- "ln": "Lingala", # Lingala
+ "lg": "Luganda", # Ganda
+ "li": "Limburgs", # Limburgish; Limburger; Limburgan
+ "lij": "Lígur",
+ "ln": "Lingala", # Lingala
"lmo": "Lumbaart",
- "lo": "ລາວ", # Lao; Laotian
- "lt": "Lietuvių" , # Lithuanian
- "lua": "Luba", # Luba
- "lv": "Latvieš" , # Latvian; Lettish
+ "lo": "ລາວ", # Lao; Laotian
+ "lt": "Lietuvių" , # Lithuanian
+ "lua": "Luba", # Luba
+ "lv": "Latvieš" , # Latvian; Lettish
"map-bms": "Basa Banyumasan",
"mdf": "Мокшень (Mokshanj Kälj)",
- "mg": "Malagasy", # Malagasy
- "mh": "Ebon", # Marshall
- "mi": "Māori", # Maori
- "mk": "Македонски" , # Macedonian
- "ml": None, # Malayalam
- "mn": "Монгол", # Mongolian
- "mo": "Молдовеняскэ", # Moldavian
- "mr": "मराठी" , # Marathi
- "ms": "Bahasa Melay" , # Malay
- "mt": "Malti", # Maltese
+ "mg": "Malagasy", # Malagasy
+ "mh": "Ebon", # Marshall
+ "mi": "Māori", # Maori
+ "mk": "Македонски" , # Macedonian
+ "ml": None, # Malayalam
+ "mn": "Монгол", # Mongolian
+ "mo": "Молдовеняскэ", # Moldavian
+ "mr": "मराठी" , # Marathi
+ "ms": "Bahasa Melay" , # Malay
+ "mt": "Malti", # Maltese
"mus": "Muskogee",
- "my": "မ္ရန္‌မာစာ", # Burmese
+ "my": "မ္ရန္‌မာစာ", # Burmese
"myv": "Эрзянь (Erzjanj Kelj)",
"mzn": "مَزِروني",
- "na": "dorerin Naoero", # Nauru
+ "na": "dorerin Naoero", # Nauru
"nah": "Nāhuatl",
"nap": "Nnapulitano",
- "nb": "Norsk (Bokmål)", # Norwegian Bokm@aa{}l
- "nd": None, # Ndebele, North
+ "nb": "Norsk (Bokmål)", # Norwegian Bokm@aa{}l
+ "nd": None, # Ndebele, North
"nds": "Plattdüütsch",
"nds-nl": "Nedersaksisch",
- "ne": "नेपाली", # Nepali
+ "ne": "नेपाली", # Nepali
"new": "नेपाल भाषा" , # Nepal Bhasa
- "ng": "Oshiwambo", # Ndonga
- "nl": "Nederlands" , # Dutch
- "nn": "Nynorsk", # Norwegian Nynorsk
- "no": "Norsk (Bokmål)" , # Norwegian
+ "ng": "Oshiwambo", # Ndonga
+ "nl": "Nederlands" , # Dutch
+ "nn": "Nynorsk", # Norwegian Nynorsk
+ "no": "Norsk (Bokmål)" , # Norwegian
"nov": "Novial",
- "nr": None, # Ndebele, South
+ "nr": None, # Ndebele, South
"nrm": "Nouormand/Normaund",
- "nv": "Diné bizaad", # Navajo
- "ny": "Chi-Chewa", # Chichewa; Nyanja
+ "nv": "Diné bizaad", # Navajo
+ "ny": "Chi-Chewa", # Chichewa; Nyanja
- "oc": "Occitan", # Occitan; Proven@,{c}al
- "oj": None, # Ojibwa
- "om": "Oromoo", # (Afan) Oromo
- "or": "ଓଡ଼ିଆ", # Oriya
- "os": "Иронау", # Ossetian; Ossetic
+ "oc": "Occitan", # Occitan; Proven@,{c}al
+ "oj": None, # Ojibwa
+ "om": "Oromoo", # (Afan) Oromo
+ "or": "ଓଡ଼ିଆ", # Oriya
+ "os": "Иронау", # Ossetian; Ossetic
- "pa": "ਪੰਜਾਬੀ" , # Panjabi; Punjabi
+ "pa": "ਪੰਜਾਬੀ" , # Panjabi; Punjabi
"pag": "Pangasinan",
"pam": "Kapampangan",
"pap": "Papiament",
"pdc": "Deitsch",
- "pi": "पाऴि", # Pali
+ "pi": "पाऴि", # Pali
"pih": "Norfuk",
- "pl": "Polski" , # Polish
+ "pl": "Polski" , # Polish
"pms": "Piemontèis" ,
- "ps": "پښتو", # Pashto, Pushto
- "pt": "Português" , # Portuguese
+ "ps": "پښتو", # Pashto, Pushto
+ "pt": "Português" , # Portuguese
- "q": "Runa Simi" , # Quechua
+ "q": "Runa Simi" , # Quechua
- "rm": "Rumantsch", # Rhaeto-Romance
+ "rm": "Rumantsch", # Rhaeto-Romance
"rmy": "romani - रोमानी",
- "rn": "Kirundi", # Rundi; Kirundi
- "ro": "Română" , # Romanian
+ "rn": "Kirundi", # Rundi; Kirundi
+ "ro": "Română" , # Romanian
"roa-rup": "Armãneashce",
"roa-tara": "Tarandíne",
- "ru": "Русский" , # Russian
- "rw": "Ikinyarwanda", # Kinyarwanda
+ "ru": "Русский" , # Russian
+ "rw": "Ikinyarwanda", # Kinyarwanda
- "sa": "संस्कृतम्", # Sanskrit
+ "sa": "संस्कृतम्", # Sanskrit
"sah": "Саха тыла (Saxa Tyla)",
- "sc": "Sardu", # Sardinian
- "scn": "Sicilian",
+ "sc": "Sardu", # Sardinian
+ "scn": "Sicilian",
"sco": "Scots",
- "sd": "سنڌي، سندھی ، सिन्ध", # Sindhi
- "se": "Sámegiella", # Northern Sami
- "sg": "Sängö", # Sango; Sangro
+ "sd": "سنڌي، سندھی ، सिन्ध", # Sindhi
+ "se": "Sámegiella", # Northern Sami
+ "sg": "Sängö", # Sango; Sangro
"sh": "Srpskohrvatski / Српскохрватски" ,
"si": "සිංහල",
"simple": "Simple English" ,
- "sk": "Slovenčina" , # Slovak
- "sl": "Slovenščina" , # Slovenian
- "sm": "Gagana Samoa", # Samoan
- "sn": "chiShona", # Shona
- "so": "Soomaaliga", # Somali
- "sr": "Српски / Srpski", # Serbian
+ "sk": "Slovenčina" , # Slovak
+ "sl": "Slovenščina" , # Slovenian
+ "sm": "Gagana Samoa", # Samoan
+ "sn": "chiShona", # Shona
+ "so": "Soomaaliga", # Somali
+ "sr": "Српски / Srpski", # Serbian
"srn": "Sranantongo",
- "ss": "SiSwati", # Swati; Siswati
- "st": "Sesotho", # Sesotho; Sotho, Southern
+ "ss": "SiSwati", # Swati; Siswati
+ "st": "Sesotho", # Sesotho; Sotho, Southern
"stk": "Seeltersk",
- "s": "Basa Sunda", # Sundanese
+ "s": "Basa Sunda", # Sundanese
"sq": "Shqip" , # Albanian
"szl": "Ślůnski",
- "sv": "Svenska" , # Swedish
- "sw": "Kiswahili", # Swahili
+ "sv": "Svenska" , # Swedish
+ "sw": "Kiswahili", # Swahili
- "ta": "தமிழ்" , # Tamil
- "te": "తెలుగు" , # Telugu
+ "ta": "தமிழ்" , # Tamil
+ "te": "తెలుగు" , # Telugu
"tet": "Tetun",
- "tg": "Тоҷикӣ", # Tajik
- "th": "ไทย" , # Thai
- "ti": "ትግርኛ", # Tigrinya
- "tk": "تركمن / Туркмен", # Turkmen
- "tl": "Tagalog" , # Tagalog
- "tn": "Setswana", # Tswana; Setswana
- "to": "faka Tonga", # Tonga (?) # Also ZW ; MW
+ "tg": "Тоҷикӣ", # Tajik
+ "th": "ไทย" , # Thai
+ "ti": "ትግርኛ", # Tigrinya
+ "tk": "تركمن / Туркмен", # Turkmen
+ "tl": "Tagalog" , # Tagalog
+ "tn": "Setswana", # Tswana; Setswana
+ "to": "faka Tonga", # Tonga (?) # Also ZW ; MW
"tokipona": "Tokipona",
"tpi": "Tok Pisin",
- "tr": "Türkçe" , # Turkish
- "ts": "Xitsonga", # Tsonga
- "tt": "Tatarça / Татарча", # Tatar
+ "tr": "Türkçe" , # Turkish
+ "ts": "Xitsonga", # Tsonga
+ "tt": "Tatarça / Татарча", # Tatar
"tum": "chiTumbuka",
- "tw": "Twi", # Twi
- "ty": "Reo Mā`ohi", # Tahitian
+ "tw": "Twi", # Twi
+ "ty": "Reo Mā`ohi", # Tahitian
"udm": "Удмурт кыл",
- "ug": "Oyghurque", # Uighur
- "uk": "Українська" , # Ukrainian
- "ur": "اردو", # Urdu
- "uz": "O‘zbek", # Uzbek
+ "ug": "Oyghurque", # Uighur
+ "uk": "Українська" , # Ukrainian
+ "ur": "اردو", # Urdu
+ "uz": "O‘zbek", # Uzbek
- "ve": "Tshivenda", # Venda
+ "ve": "Tshivenda", # Venda
"vec": "Vèneto",
- "vi": "Tiếng Việt" , # Vietnamese
+ "vi": "Tiếng Việt" , # Vietnamese
"vls": "West-Vlams",
"vo": "Volapük" ,
-
- "wa": "Walon", # Walloon
+
+ "wa": "Walon", # Walloon
"war": "Winaray",
- "wo": "Wolof", # Wolof
+ "wo": "Wolof", # Wolof
"w": "吴语",
"xal": "Хальмг",
- "xh": "isiXhosa", # Xhosa
+ "xh": "isiXhosa", # Xhosa
- "yi": "ייִדיש", # Yiddish
- "yo": "Yorùbá", # Yoruba
+ "yi": "ייִדיש", # Yiddish
+ "yo": "Yorùbá", # Yoruba
- "za": "Cuengh", # Zhuang
+ "za": "Cuengh", # Zhuang
"zea": "Zeêuws",
- "zh": "中文" , # Chinese
+ "zh": "中文" , # Chinese
"zh-classical": "古文 / 文言文",
"zm-min-nan": "Bân-lâm-gú",
"zh-yue": "粵語",
- "zu": "isiZulu" # Zulu
+ "zu": "isiZulu" # Zulu
}
-
-
-
-
-
-
-
-
-
diff --git a/WikiTrans/wikins.py b/wikitrans/wikins.py
index 4fb5315..4fb5315 100644
--- a/WikiTrans/wikins.py
+++ b/wikitrans/wikins.py
diff --git a/wikitrans/wikitoken.py b/wikitrans/wikitoken.py
new file mode 100644
index 0000000..49c6c68
--- a/dev/null
+++ b/wikitrans/wikitoken.py
@@ -0,0 +1,318 @@
+# Wiki tokens. -*- coding: utf-8 -*-
+# Copyright (C) 2015-2018 Sergey Poznyakoff
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+Wiki markup tokens and associated classes.
+
+This module defines classes for the basic nodes of the Wiki markup parse tree:
+
+WikiNode -- Abstract parse tree node.
+WikiContentNode -- A node associated with some content.
+WikiSeqNode -- A sequence of nodes.
+WikiTextNode -- Textual content.
+WikiDelimNode -- Delimiter.
+WikiTagNode -- Tag (e.g. <tt>, </tt>, <tt />, etc.)
+WikiRefNode -- Wiki reference (e.g. [target|name])
+WikiHdrNode -- Heading (e.g. == Section ==)
+WikiEltNode -- Environment element.
+WikiEnvNode -- Environment (numbered or unnumbered list, definition, etc.)
+WikiIndNode -- Indent node.
+
+Auxiliary classes:
+
+WikiNodeEncoder -- Custom JSONEncoder subclass for serializing objects of the
+ above classes.
+"""
+
+from __future__ import print_function
+import re
+import json
+
class WikiNodeEncoder(json.JSONEncoder):
    """JSONEncoder subclass that knows how to serialize WikiNode objects.

    Instances of WikiNode (or any subclass) are converted via their
    jsonEncode method; everything else is delegated to the stock encoder.
    """

    def default(self, obj):
        if not isinstance(obj, WikiNode):
            return json.JSONEncoder.default(self, obj)
        return obj.jsonEncode()
+
def jsonencoder(func):
    """Decorator for jsonEncode methods.

    Wraps *func* so that the dictionary it returns is augmented with two
    bookkeeping keys: 'wikinode' (class name of the serialized object)
    and 'type' (its node type).
    """
    def _mkencoder(self):
        # Renamed from 'json' to avoid shadowing the imported json module
        # inside the wrapper.
        result = func(self)
        result['wikinode'] = self.__class__.__name__
        result['type'] = self.type
        return result
    return _mkencoder
+
class WikiNode(object):
    """Generic parse tree node.

    Attributes:

    type -- actual type of this object (string)
    parser -- parser instance that owns this node
    """

    type = 'UNDEF'
    parser = None

    def __init__(self, parser, **kwargs):
        """Initialize the node.

        Keyword arguments may only name attributes that already exist on
        the class; any other name raises AttributeError.
        """
        self.parser = parser
        for key in kwargs:
            if hasattr(self, key):
                self.__dict__[key] = kwargs[key]
            else:
                raise AttributeError("'%s' has no attribute '%s'"
                                     % (self.__class__.__name__, key))

    def __str__(self):
        return json.dumps(self, cls=WikiNodeEncoder, sort_keys=True)

    @jsonencoder
    def jsonEncode(self):
        """Return a dict of the node's serializable instance attributes.

        The parser back-reference, private names and callables are skipped.
        """
        ret = {}
        for x in dir(self):
            # The original condition "type(x) == 'function'" compared a
            # type object with a string and was therefore always false;
            # test callable() to really skip methods.
            if x == 'parser' or x.startswith('_') or callable(getattr(self, x)):
                continue
            if x in self.__dict__:
                ret[x] = self.__dict__[x]
        return ret

    def format(self):
        """Abstract formatting function.

        Derived classes must override it.
        """
        pass
+
class WikiContentNode(WikiNode):
    """Generic content node.

    Attributes:

    content -- Actual content
    """

    content = None

    def format(self):
        pass

    @jsonencoder
    def jsonEncode(self):
        """Return a dict holding the JSON-serializable content."""
        ret = {}
        if self.content:
            if self.type == 'TEXT':
                ret['content'] = self.content
            elif isinstance(self.content, list):
                # List comprehension instead of map(): under Python 3,
                # map() returns an iterator, which json.dumps cannot
                # serialize.
                ret['content'] = [x.jsonEncode() for x in self.content]
            elif isinstance(self.content, WikiNode):
                ret['content'] = self.content.jsonEncode()
            else:
                ret['content'] = self.content
        else:
            ret['content'] = None
        return ret
+
class WikiSeqNode(WikiContentNode):
    """Generic sequence of nodes.

    Attributes:

    content -- list of nodes.
    """

    def format(self):
        for x in self.content:
            x.format()

    @jsonencoder
    def jsonEncode(self):
        """Return a dict holding the JSON-serializable content list."""
        ret = {}
        if not self.content:
            ret['content'] = None
        elif isinstance(self.content, list):
            # List comprehension instead of map(): map objects are not
            # JSON-serializable under Python 3.
            ret['content'] = [x.jsonEncode() for x in self.content]
        elif isinstance(self.content, WikiNode):
            ret['content'] = self.content.jsonEncode()
        else:
            ret['content'] = self.content
        return ret
+
+
+# ##############
+
class WikiTextNode(WikiContentNode):
    """Plain text node.

    Attributes:

    type -- 'TEXT'
    content -- the text itself (string)
    """

    type = 'TEXT'

    @jsonencoder
    def jsonEncode(self):
        """Serialize as a dict holding only the text content."""
        return {'content': self.content}
+
class WikiDelimNode(WikiContentNode):
    """Delimiter node.

    Attributes:

    type -- 'DELIM'
    content -- actual delimiter string
    isblock -- boolean indicating whether it is a block delimiter
    continuation -- True if continuation is expected
    """

    type = 'DELIM'
    # PEP 8: spaces around '=' in class-level assignments, matching the
    # sibling attribute definitions in this module.
    isblock = False
    continuation = False
+
class WikiTagNode(WikiContentNode):
    """A Wiki tag.

    Attributes:

    tag -- actual tag name (with '<', '>', and eventual '/' stripped)
    isblock -- True if this is a block tag
    args -- List of tag arguments
    idx -- If this is a "see also" reference, index of this ref in the
           list of references.
           FIXME: Perhaps this merits a subclass?
    """

    tag = None
    isblock = False
    args = None
    idx = None

    def __init__(self, *args, **keywords):
        super(WikiTagNode, self).__init__(*args, **keywords)
        # Register <ref> tags with the owning parser, provided it keeps
        # a reference list, and remember our position in that list.
        is_ref = self.type == 'TAG' and self.tag == 'ref'
        if is_ref and hasattr(self.parser, 'references'):
            self.idx = len(self.parser.references)
            self.parser.references.append(self)

    @jsonencoder
    def jsonEncode(self):
        """Serialize the tag, its arguments and its content."""
        content = self.content.jsonEncode() if self.content else None
        return {
            'tag': self.tag,
            'isblock': self.isblock,
            'args': self.args.tab if self.args else None,
            'content': content,
            'idx': self.idx
        }
+
class WikiRefNode(WikiContentNode):
    """Reference node.

    This class represents a wiki reference, such as "[ref|content]".

    Attributes:

    ref -- actual reference
    content -- content string
    """

    type = 'REF'
    ref = None

    @jsonencoder
    def jsonEncode(self):
        """Serialize the reference target together with its content."""
        return {'ref': self.ref, 'content': self.content.jsonEncode()}
+
class WikiHdrNode(WikiContentNode):
    """A wiki markup header class.

    Attributes:

    level -- header level
    content -- header content (WikiNode subclass object)
    """

    type = 'HDR'
    level = None

    @jsonencoder
    def jsonEncode(self):
        """Serialize the header level together with its content."""
        return {'level': self.level, 'content': self.content.jsonEncode()}
+
class WikiEltNode(WikiContentNode):
    """Environment element node.

    Attributes:

    subtype -- type of the environment (numbered, unnumbered, defn)
    content -- content of the element (WikiNode subclass object)
    """

    type = 'ELT'
    subtype = None

    @jsonencoder
    def jsonEncode(self):
        """Serialize the element subtype together with its content."""
        return {'subtype': self.subtype, 'content': self.content.jsonEncode()}
+
class WikiEnvNode(WikiContentNode):
    """Wiki Environment Node

    Attributes:

    envtype -- type of the environment (numbered, unnumbered, defn)
    level -- nesting level of the environment
    """

    type = 'ENV'
    envtype = None
    level = None

    @jsonencoder
    def jsonEncode(self):
        """Serialize the environment type, level and content."""
        return {
            'envtype': self.envtype,
            'level': self.level,
            # List comprehension instead of map(): map objects are not
            # JSON-serializable under Python 3.
            'content': [x.jsonEncode() for x in self.content]
        }
+
class WikiIndNode(WikiContentNode):
    """Indented block node.

    Attributes:

    level -- Indentation level.
    content -- Indented content (WikiNode subclass object).
    """

    type = 'IND'
    level = None

    @jsonencoder
    def jsonEncode(self):
        """Serialize the indent level together with its content."""
        return {'level': self.level, 'content': self.content.jsonEncode()}

Return to:

Send suggestions and report system problems to the System administrator.