author | Sergey Poznyakoff <gray@gnu.org> | 2018-08-16 12:45:00 (GMT)
---|---|---
committer | Sergey Poznyakoff <gray@gnu.org> | 2018-08-17 10:17:11 (GMT)
commit | 7186dbab7f1c1227e9229866e086bc417e3e4e52 (patch)
tree | f29114e9ff7a7b023dd3d611a9bc8808f5cf5bbd
parent | d9e26129527ce84f626eb44ff95e4ecfbc5bc92a (diff)
download | wikitrans-7186dbab7f1c1227e9229866e086bc417e3e4e52.tar.gz, wikitrans-7186dbab7f1c1227e9229866e086bc417e3e4e52.tar.bz2
Fix PEP 8 issues.
-rw-r--r-- | WikiTrans/wikitoken.py | 188
-rw-r--r-- | tests/test_html.py | 8
-rw-r--r-- | tests/test_texi.py | 6
-rw-r--r-- | tests/test_text.py | 8
-rw-r--r-- | tests/wikitest.py (renamed from tests/WikiTest.py) | 6
-rw-r--r-- | wikitrans/__init__.py (renamed from WikiTrans/__init__.py) | 0
-rw-r--r-- | wikitrans/wiki2html.py (renamed from WikiTrans/wiki2html.py) | 115
-rw-r--r-- | wikitrans/wiki2texi.py (renamed from WikiTrans/wiki2texi.py) | 63
-rw-r--r-- | wikitrans/wiki2text.py (renamed from WikiTrans/wiki2text.py) | 74
-rw-r--r-- | wikitrans/wikidump.py (renamed from WikiTrans/wikidump.py) | 41
-rw-r--r-- | wikitrans/wikimarkup.py (renamed from WikiTrans/wikimarkup.py) | 784
-rw-r--r-- | wikitrans/wikins.py (renamed from WikiTrans/wikins.py) | 0
-rw-r--r-- | wikitrans/wikitoken.py | 318
13 files changed, 978 insertions, 633 deletions
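The bulk of this change is mechanical: the package directory WikiTrans becomes wikitrans, and CamelCase helper names gain PEP 8 snake_case spellings. Downstream code has to follow the rename; a minimal sketch of the adjustment, mirroring the import changes made in the test files in the diff below:

    # Before this commit the tests imported:
    #   from WikiTrans.wiki2html import HtmlWiktionaryMarkup
    #   from WikiTest import populateMethods
    # After it they use the renamed package, class, and helper:
    from wikitrans.wiki2html import HtmlWikiMarkup
    from wikitest import populate_methods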
diff --git a/WikiTrans/wikitoken.py b/WikiTrans/wikitoken.py deleted file mode 100644 index 2238a66..0000000 --- a/WikiTrans/wikitoken.py +++ b/dev/null @@ -1,188 +0,0 @@ -# Wiki tokens. -*- coding: utf-8 -*- -# Copyright (C) 2015-2018 Sergey Poznyakoff -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3, or (at your option) -# any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see <http://www.gnu.org/licenses/>. - -from __future__ import print_function -import re -import json - -class WikiNodeEncoder(json.JSONEncoder): - def default(self, obj): - if isinstance(obj,WikiNode): - return obj.jsonEncode() - return json.JSONEncoder.default(self, obj) - -def jsonencoder(func): - def _mkencoder(self): - json = func(self) - json['wikinode'] = self.__class__.__name__ - json['type'] = self.type - return json - return _mkencoder - -class WikiNode(object): - type = 'UNDEF' - parser = None - def __init__(self, parser, **kwargs): - self.parser = parser - for key in kwargs: - if hasattr(self,key): - self.__dict__[key] = kwargs[key] - else: - raise AttributeError("'%s' has no attribute '%s'" % (self.__class__.__name__, key)) - - def __str__(self): - return json.dumps(self, cls=WikiNodeEncoder, sort_keys=True) - - @jsonencoder - def jsonEncode(self): - ret = {} - for x in dir(self): - if x == 'parser' or x.startswith('_') or type(x) == 'function': - continue - if x in self.__dict__: - ret[x] = self.__dict__[x] - return ret - - def format(self): - pass - -class WikiContentNode(WikiNode): - content = None - def format(self): - pass - @jsonencoder - def jsonEncode(self): - ret = {} - if self.content: - if self.type == 'TEXT': - ret['content'] = self.content - elif isinstance(self.content,list): - ret['content'] = map(lambda x: x.jsonEncode(), self.content) - elif isinstance(self.content,WikiNode): - ret['content'] = self.content.jsonEncode() - else: - ret['content'] = self.content - else: - ret['content'] = None - return ret - -class WikiSeqNode(WikiContentNode): - def format(self): - for x in self.content: - x.format() - @jsonencoder - def jsonEncode(self): - ret = {} - if not self.content: - ret['content'] = None - elif isinstance(self.content,list): - ret['content'] = map(lambda x: x.jsonEncode(), self.content) - elif isinstance(self.content,WikiNode): - ret['content'] = self.content.jsonEncode() - else: - ret['content'] = self.content - return ret - - -# ############## - -class WikiTextNode(WikiContentNode): - type = 'TEXT' - @jsonencoder - def jsonEncode(self): - return { - 'content': self.content - } - -class WikiDelimNode(WikiContentNode): - type = 'DELIM' - isblock=False - continuation = False - -class WikiTagNode(WikiContentNode): - tag = None - isblock = False - args = None - idx = None - def __init__(self, *args, **keywords): - super(WikiTagNode, self).__init__(*args, **keywords) - if self.type == 'TAG' and self.tag == 'ref' and hasattr(self.parser,'references'): - self.idx = len(self.parser.references) - self.parser.references.append(self) - @jsonencoder - def jsonEncode(self): - return { - 'tag': self.tag, - 'isblock': self.isblock, - 
'args': self.args.tab if self.args else None, - 'content': self.content.jsonEncode() if self.content else None, - 'idx': self.idx - } - -class WikiRefNode(WikiContentNode): - type = 'REF' - ref = None - @jsonencoder - def jsonEncode(self): - return { - 'ref': self.ref, - 'content': self.content.jsonEncode() - } - -class WikiHdrNode(WikiContentNode): - type = 'HDR' - level = None - @jsonencoder - def jsonEncode(self): - return { - 'level': self.level, - 'content': self.content.jsonEncode() - } - -class WikiEltNode(WikiContentNode): - type = 'ELT' - subtype = None - @jsonencoder - def jsonEncode(self): - return { - 'subtype': self.subtype, - 'content': self.content.jsonEncode() - } - -class WikiEnvNode(WikiContentNode): - type = 'ENV' - envtype = None - level = None - @jsonencoder - def jsonEncode(self): - return { - 'envtype': self.envtype, - 'level': self.level, - 'content': map(lambda x: x.jsonEncode(), self.content) - } - -class WikiIndNode(WikiContentNode): - type = 'IND' - level = None - @jsonencoder - def jsonEncode(self): - return { - 'level': self.level, - 'content': self.content.jsonEncode() - } - - - diff --git a/tests/test_html.py b/tests/test_html.py index 3da57f6..5a15cb8 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -1,14 +1,14 @@ #!/usr/bin/python # -*- coding: utf-8 -*- from __future__ import print_function import unittest -from WikiTrans.wiki2html import HtmlWiktionaryMarkup -from WikiTest import populateMethods +from wikitrans.wiki2html import HtmlWikiMarkup +from wikitest import populate_methods -class TestWiktionaryMarkup (unittest.TestCase): +class TestWikiMarkup (unittest.TestCase): pass -populateMethods(TestWiktionaryMarkup, HtmlWiktionaryMarkup, '.html') +populate_methods(TestWikiMarkup, HtmlWikiMarkup, '.html') if __name__ == '__main__': unittest.main() diff --git a/tests/test_texi.py b/tests/test_texi.py index 75314c9..ddd26c7 100644 --- a/tests/test_texi.py +++ b/tests/test_texi.py @@ -1,14 +1,14 @@ #!/usr/bin/python # -*- coding: utf-8 -*- from __future__ import print_function import unittest -from WikiTrans.wiki2texi import TexiWikiMarkup -from WikiTest import populateMethods +from wikitrans.wiki2texi import TexiWikiMarkup +from wikitest import populate_methods class TestTexiWikiMarkup (unittest.TestCase): pass -populateMethods(TestTexiWikiMarkup, TexiWikiMarkup, '.texi') +populate_methods(TestTexiWikiMarkup, TexiWikiMarkup, '.texi') if __name__ == '__main__': unittest.main() diff --git a/tests/test_text.py b/tests/test_text.py index a06f519..b3d0a12 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -1,14 +1,14 @@ #!/usr/bin/python # -*- coding: utf-8 -*- from __future__ import print_function import unittest -from WikiTrans.wiki2text import TextWiktionaryMarkup -from WikiTest import populateMethods +from wikitrans.wiki2text import TextWikiMarkup +from wikitest import populate_methods -class TestTextWiktionaryMarkup (unittest.TestCase): +class TestTextWikiMarkup (unittest.TestCase): pass -populateMethods(TestTextWiktionaryMarkup, TextWiktionaryMarkup, '.text') +populate_methods(TestTextWikiMarkup, TextWikiMarkup, '.text') if __name__ == '__main__': unittest.main() diff --git a/tests/WikiTest.py b/tests/wikitest.py index 1429f5e..ff26227 100644 --- a/tests/WikiTest.py +++ b/tests/wikitest.py @@ -1,13 +1,13 @@ #!/usr/bin/python # -*- coding: utf-8 -*- from __future__ import print_function from glob import glob import os.path -def MarkupTest(classname, name_in, name_out): +def wiki_markup_test(classname, name_in, name_out): fh = 
open(name_out) buf = ''.join(fh.readlines()).strip() fh.close() hwm = classname(filename=name_in, lang="en") hwm.parse() @@ -16,16 +16,16 @@ def MarkupTest(classname, name_in, name_out): # fail print("\n>>>%s<<<" % buf) print(">>>%s<<<" % str(hwm).strip()) return False -def populateMethods(cls, wcls, suffix): +def populate_methods(cls, wcls, suffix): def settest(self, base, wiki_name, pat_name): def dyntest(self): - self.assertTrue(MarkupTest(wcls, wiki_name, pat_name)) + self.assertTrue(wiki_markup_test(wcls, wiki_name, pat_name)) meth = 'test_' + wcls.__name__ + '_' + base dyntest.__name__ = meth setattr(cls, meth, dyntest) for file in glob('testdata/*.wiki'): if os.path.isfile(file): patfile = file[:len(file) - 5] + suffix diff --git a/WikiTrans/__init__.py b/wikitrans/__init__.py index 5832e38..5832e38 100644 --- a/WikiTrans/__init__.py +++ b/wikitrans/__init__.py diff --git a/WikiTrans/wiki2html.py b/wikitrans/wiki2html.py index 6147642..ce65bae 100644 --- a/WikiTrans/wiki2html.py +++ b/wikitrans/wiki2html.py @@ -12,16 +12,27 @@ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. +""" +Wiki markup to HTML translator. + +Classes: + +HtmlWikiMarkup -- Converts Wiki material to HTML. +HtmlWiktionaryMarkup -- Reserved for future use. Currently does the same as + HtmlWikiMarkup. + +""" + from __future__ import print_function -from WikiTrans.wikimarkup import * -from WikiTrans.wikitoken import * -from WikiTrans.wikins import wiki_ns_re, wiki_ns +from wikitrans.wikimarkup import * +from wikitrans.wikitoken import * +from wikitrans.wikins import wiki_ns_re, wiki_ns import re try: from urllib import quote as url_quote except ImportError: from urllib.parse import quote as url_quote @@ -76,22 +87,22 @@ class HtmlLinkNode(HtmlSeqNode): text = self.parser.langtab[qual] else: tgt = self.parser.mktgt(tgt) else: tgt = self.parser.mktgt(arg) return "<a href=\"%s\">%s</a>" % (tgt, - text if (text and text != '') \ - else arg) + text if (text and text != '') else arg) class HtmlRefNode(WikiRefNode): def format(self): target = self.ref text = self.content.format() - return "<a href=\"%s\">%s</a>" % (target, - text if (text and text != '') \ - else target) + return "<a href=\"%s\">%s</a>" % ( + target, + text if (text and text != '') else target + ) class HtmlFontNode(HtmlSeqNode): def format(self): comm = { 'IT': 'i', 'BOLD': 'b' } s = '<%s>' % comm[self.type] @@ -149,20 +160,20 @@ class HtmlTagNode(WikiTagNode): elif self.tag == 'references': s = '<div class="references">\n' s += '<ol class="references">\n' n = 0 for ref in self.parser.references: n += 1 - s += ('<li id="cite_note-%d">' + \ - '<span class="mw-cite-backlink">' + \ - '<b><a href="#cite_ref-%d">^</a></b>' + \ - '</span>' + \ - '<span class="reference-text">' + \ - ref.content.format() + \ - '</span>' + \ - '</li>\n') % (n,n) + s += ('<li id="cite_note-%d">' + + '<span class="mw-cite-backlink">' + + '<b><a href="#cite_ref-%d">^</a></b>' + + '</span>' + + '<span class="reference-text">' + + ref.content.format() + + '</span>' + + '</li>\n') % (n,n) s += '</ol>\n</div>\n' return s else: s = '<' + self.tag if self.args: s += ' ' + str(self.args) @@ -184,23 +195,55 @@ class HtmlPreNode(HtmlSeqNode): class HtmlIndNode(WikiIndNode): def format(self): return ("<dl><dd>" * self.level) + self.content.format() + "</dd></dl>" * self.level -class 
HtmlWikiMarkup (WikiMarkup): - """ - A (hopefully) general-purpose Wiki->HTML translator class. - FIXME: 1. See WikiMarkup for a list - 2. [[official position]]s : final 's' gets after closing </a> tag. - Should be before. +class HtmlWikiMarkup(WikiMarkup): + """A Wiki markup to HTML translator class. + + Usage: + + x = HtmlWikiMarkup(file="input.wiki") + # Parse the input: + x.parse() + # Print it as HTML: + print(str(x)) + + Known bugs: + * [[official position]]s + Final 's' gets after closing </a> tag. Should be before. """ nested = 0 references = [] def __init__(self, *args, **kwargs): + """Create a HtmlWikiMarkup object. + + Arguments: + + filename=FILE + Read Wiki material from the file named FILE. + file=FD + Read Wiki material from file object FD. + text=STRING + Read Wiki material from STRING. + lang=CODE + Specifies source language. Default is 'en'. This variable can be + referred to as '%(lang)s' in the keyword arguments below. + html_base=URL + Base URL for cross-references. Default is + 'http://%(lang)s.wiktionary.org/wiki/' + image_base=URL + Base URL for images. Default is + 'http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf' + media_base=URL + Base URL for media files. Default is + 'http://www.mediawiki.org/xml/export-0.3' + """ + super(HtmlWikiMarkup, self).__init__(*args, **kwargs) self.token_class['LINK'] = HtmlLinkNode self.token_class['TMPL'] = HtmlLinkNode self.token_class['REF'] = HtmlRefNode self.token_class['IT'] = HtmlFontNode self.token_class['BOLD'] = HtmlFontNode @@ -267,33 +310,11 @@ class HtmlWikiMarkup (WikiMarkup): def __str__(self): str = "" for elt in self.tree: str += elt.format() return str -class HtmlWiktionaryMarkup (HtmlWikiMarkup): - """ - A class for translating Wiktionary articles into HTML. - This version does not do much, except that it tries to correctly - format templates. But "tries" does not mean "does". The heuristics - used here is clearly not enough to cope with it. - - 1. FIXME: - The right solution would be to have a database of templates with their - semantics and to decide on their rendering depending on that. E.g. - {{term}} in en.wiktionary means "replace this with the search term". - This, however, does not work in other wiktionaries. There are - also more complex templates, e.g.: {{t+|bg|врата|n|p|tr=vrata|sc=Cyrl}} - I don't know what it means. Couldn't find any documentation either. - Again, this template does not work in other dictionaries. +class HtmlWiktionaryMarkup(HtmlWikiMarkup): + """A class for translating Wiktionary articles into HTML. - 2. Capitulation notice: - Given the: - 1. vast amount of wiktionaries available, - 2. abundance of various templates for each wictionary, - 3. apparent lack of documentation thereof, - 4. the lack of standardized language-independent templates, - I dont see any way to cope with the template-rendering task within a - reasonable amount of time. - - Faeci quod potui, faciant meliora potentes. + Reserved for future use. Currently does the same as HtmlWikiMarkup. """ diff --git a/WikiTrans/wiki2texi.py b/wikitrans/wiki2texi.py index 7297195..d9e5f52 100644 --- a/WikiTrans/wiki2texi.py +++ b/wikitrans/wiki2texi.py @@ -12,15 +12,24 @@ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. 
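To make the HtmlWikiMarkup constructor arguments documented above concrete, here is a short sketch; the input string is invented for illustration and only the documented keywords (text, lang) are used. Note that per the argument list, filename= is the way to open a file by name, while file= expects an already opened file object:

    from wikitrans.wiki2html import HtmlWikiMarkup

    x = HtmlWikiMarkup(text="''Hello'', [[wiki]] world!\n", lang="en")
    x.parse()        # build the parse tree from the Wiki text
    print(str(x))    # walk the tree and emit HTML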
-from WikiTrans.wikimarkup import * -from WikiTrans.wikitoken import * -from WikiTrans.wikins import wiki_ns_re, wiki_ns +""" +Wiki markup to Texinfo translator. + +Classes: + +TexiWikiMarkup -- Converts Wiki material to Texinfo. + +""" + +from wikitrans.wikimarkup import * +from wikitrans.wikitoken import * +from wikitrans.wikins import wiki_ns_re, wiki_ns import re import urllib class Acc(list): def prepend(self,x): self.insert(0,x) @@ -248,13 +257,25 @@ class TexiRefNode(WikiRefNode): text = parser._end_print(save) if text and text != '': parser._print("@uref{%s,%s}" % (target, text), escape=False) else: parser._print("@uref{%s}" % target, escape=False) -class TexiWikiMarkup (WikiMarkup): +class TexiWikiMarkup(WikiMarkup): + """Wiki markup to Texinfo translator class. + + Usage: + + x = TexiWikiMarkup(file="input.wiki") + # Parse the input: + x.parse() + # Print it as Texi: + print(str(x)) + + """ + nested = 0 sectcomm = { 'numbered': [ '@top', '@chapter', '@section', @@ -285,12 +306,46 @@ class TexiWikiMarkup (WikiMarkup): } sectioning_model = 'numbered' sectioning_start = 0 def __init__(self, *args, **keywords): + """Create a TexiWikiMarkup object. + + Arguments: + + filename=FILE + Read Wiki material from the file named FILE. + file=FD + Read Wiki material from file object FD. + text=STRING + Read Wiki material from STRING. + + sectioning_model=MODEL + Select the Texinfo sectioning model for the output document. Possible + values are: + + 'numbered' + Top of document is marked with "@top". Headings ("=", "==", + "===", etc) produce "@chapter", "@section", "@subsection", etc. + 'unnumbered' + Unnumbered sectioning: "@top", "@unnumbered", "@unnumberedsec", + "@unnumberedsubsec". + 'appendix' + Sectioning suitable for appendix entries: "@top", "@appendix", + "@appendixsec", "@appendixsubsec", etc. + 'heading' + Use heading directives to reflect sectioning: "@majorheading", + "@chapheading", "@heading", "@subheading", etc. + sectioning_start=N + Shift resulting heading level by N positions. For example, supposing + "sectioning_model='numbered'", "== A ==" normally produces + "@section A" on output. Now, if given "sectioning_start=1", this + directive will produce "@subsection A" instead. + """ + super(TexiWikiMarkup, self).__init__(*args, **keywords) self.token_class['TEXT'] = TexiTextNode self.token_class['TAG'] = TexiTagNode self.token_class['PARA'] = TexiParaNode self.token_class['PRE'] = TexiPreNode diff --git a/WikiTrans/wiki2text.py b/wikitrans/wiki2text.py index cb3a183..1fbc61b 100644 --- a/WikiTrans/wiki2text.py +++ b/wikitrans/wiki2text.py @@ -12,15 +12,26 @@ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. -from WikiTrans.wikitoken import * -from WikiTrans.wikimarkup import * -from WikiTrans.wikins import wiki_ns_re, wiki_ns +""" +Wiki markup to plain text translator. + +Classes: + +TextWikiMarkup -- Converts Wiki material to plain text. +TextWiktionaryMarkup -- Reserved for future use. Currently does the same as + TextWikiMarkup. 
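The sectioning_model and sectioning_start keywords documented in the TexiWikiMarkup constructor above interact; a small sketch of shifting the heading level, with made-up input text:

    from wikitrans.wiki2texi import TexiWikiMarkup

    # Under the default 'numbered' model, "== A ==" maps to @section;
    # sectioning_start=1 shifts it one level down, to @subsection.
    x = TexiWikiMarkup(text="== A ==\nSome text.\n",
                       sectioning_model="numbered",
                       sectioning_start=1)
    x.parse()
    print(str(x))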
+ +""" + +from wikitrans.wikitoken import * +from wikitrans.wikimarkup import * +from wikitrans.wikins import wiki_ns_re, wiki_ns import re try: from urllib import quote as url_quote except ImportError: from urllib.parse import quote as url_quote @@ -104,15 +115,15 @@ class TextLinkNode(WikiSeqNode): ns = self.parser.wiki_ns_name(qual) if ns: if ns == 'NS_IMAGE': if not self.parser.show_urls: return "" text = "[%s: %s]" % (qual, text if text else arg) - tgt = self.image_base + '/' + \ - url_quote(tgt) + \ - '/250px-' + url_quote(tgt) + tgt = "%s/%s/250px-%s" % (self.image_base, + url_quote(tgt), + url_quote(tgt)) elif ns == 'NS_MEDIA': text = "[%s]" % (qual) else: tgt = self.parser.mktgt(tgt) elif self.type == 'LINK' and qual in self.parser.langtab: text = self.parser.langtab[qual] + ": " + tgt @@ -138,14 +149,17 @@ class TextBarNode(WikiNode): if w < 5: w = 5 return "\n" + ("-" * (w - 5)).center(w - 1) + "\n" class TextHdrNode(WikiHdrNode): def format(self): - return "\n" + ("*" * self.level) + " " + \ - self.content.format().lstrip(" ") + "\n\n" + return ("\n" + + ("*" * self.level) + + " " + + self.content.format().lstrip(" ") + + "\n\n") class TextRefNode(WikiRefNode): def format(self): text = self.content.format() if text: return "%s (see %s) " % (text, self.ref) @@ -201,15 +215,23 @@ class TextTagNode(WikiTagNode): if self.args: s += ' ' + str(self.args) s += '>' + self.content.format() + '</' + self.tag + '>' return s -class TextWikiMarkup (WikiMarkup): - """ - A (general-purpose Wiki->Text translator class. +class TextWikiMarkup(WikiMarkup): + """A Wiki markup to plain text translator. + + Usage: + + x = TextWikiMarkup(file="input.wiki") + # Parse the input: + x.parse() + # Print it as plain text: + print(str(x)) + """ # Output width width = 78 # Do not show references. show_urls = False @@ -220,12 +242,31 @@ class TextWikiMarkup (WikiMarkup): num = 0 # Array of footnote references references = [] def __init__(self, *args, **keywords): + """Create a TextWikiMarkup object. + + Arguments: + + filename=FILE + Read Wiki material from the file named FILE. + file=FD + Read Wiki material from file object FD. + text=STRING + Read Wiki material from STRING. + + width=N + Limit output width to N columns. Default is 78. + show_urls=False + By default, the link URLs are displayed in parentheses next to the + link text. If this argument is given, only the link text will be + displayed. + """ + super(TextWikiMarkup,self).__init__(*args, **keywords) if 'width' in keywords: self.width = keywords['width'] if 'show_urls' in keywords: self.show_urls = keywords['show_urls'] self.token_class['SEQ'] = TextSeqNode @@ -255,13 +296,13 @@ class TextWikiMarkup (WikiMarkup): def mktgt(self, tgt, lang = None): if not lang: lang = self.lang return self.html_base % { 'lang' : lang } + url_quote(tgt) - def indent (self, lev, text): + def indent(self, lev, text): if text.find('\n') == -1: s = (" " * lev) + text else: s = "" for elt in text.split('\n'): if elt: @@ -295,12 +336,13 @@ class TextWikiMarkup (WikiMarkup): def __str__(self): str = "" for elt in self.tree: str += elt.format() return str -class TextWiktionaryMarkup (TextWikiMarkup): - """ - See documentation for HtmlWiktionaryMarkup +class TextWiktionaryMarkup(TextWikiMarkup): + """A class for translating Wiktionary articles into plain text. + + Reserved for future use. Currently does the same as TextWikiMarkup. 
""" - # FIXME: It is supposed to do something about templates + diff --git a/WikiTrans/wikidump.py b/wikitrans/wikidump.py index 7457dfa..d5f651c 100644 --- a/WikiTrans/wikidump.py +++ b/wikitrans/wikidump.py @@ -11,32 +11,67 @@ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. +""" +Print Wiki parse tree as JSON. + +Classes: + +DumpWikiMarkup + +""" + from __future__ import print_function -from WikiTrans.wikitoken import * +from wikitrans.wikitoken import * import json -from WikiTrans.wikimarkup import WikiMarkup +from wikitrans.wikimarkup import WikiMarkup class DumpReferences(object): idx = 0 def __len__(self): return self.idx + 1 def append(self, obj): self.idx += 1 class DumpWikiMarkup(WikiMarkup): + """Produce a JSON dump of the Wiki markup parse tree. + + Usage: + + x = DumpWikiMarkup(file="input.wiki") + # Parse the input: + x.parse() + # Print a JSON dump of the parse tree + print(str(x)) + + """ + indent = None references = DumpReferences() def __init__(self, **kwarg): + """Create a DumpWikiMarkup object. + + Arguments: + + filename=FILE + Read Wiki material from the file named FILE. + file=FD + Read Wiki material from file object FD. + text=STRING + Read Wiki material from STRING. + indent=N + Basic indent offset for JSON objects. + """ + n = kwarg.pop('indent', None) if n != None: self.indent = int(n) - WikiMarkup.__init__(self, **kwarg) + super(DumpWikiMarkup,self).__init__(self, **kwarg) def __str__(self): return json.dumps(self.tree, cls=WikiNodeEncoder, indent=self.indent, separators=(',',': '), sort_keys=True) diff --git a/WikiTrans/wikimarkup.py b/wikitrans/wikimarkup.py index 6cbf5de..77c3b30 100644 --- a/WikiTrans/wikimarkup.py +++ b/wikitrans/wikimarkup.py @@ -1,43 +1,68 @@ #!/usr/bin/python # -*- coding: utf-8 -*- # Copyright (C) 2008-2018 Sergey Poznyakoff -# +# # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. -# +# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. -# +# # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. +""" +Wiki markup parser. + +This module provides two class: + +WikiMarkupParser: + An abstract parser class, which serves as a base class for all markup + classes in this package. + +WikiMarkup + A subclass of the above, providing basic input method. 
+ +""" + from __future__ import print_function import sys import re from types import * -from WikiTrans.wikitoken import * +from wikitrans.wikitoken import * -__all__ = [ "BaseWikiMarkup", "WikiMarkup", - "TagAttributes", "TagAttributeSyntax" ] +__all__ = [ "WikiMarkupParser", "WikiMarkup", + "TagAttributes", "TagAttributeSyntaxError" ] -class UnexpectedToken(Exception): +class UnexpectedTokenError(Exception): def __init__(self, value): self.value = value -class TagAttributeSyntax(Exception): +class TagAttributeSyntaxError(Exception): def __init__(self, value): self.value = value def __str__(self): return repr(self.value) class TagAttributes(object): + """A dictionary-like collection of tag attributes. + + Example: + + attr = TagAttributes('href="foo" length=2') + if 'href' in attr: + print(x['href']) # returns "foo" + for a in attr: + ... + """ + attrstart = re.compile("^(?P<attr>[a-zA-Z0-9_-]+)(?P<eq>=\")?") valseg = re.compile("^[^\\\"]+") tab = {} printable = None def __init__(self, string): if not string: @@ -65,13 +90,13 @@ class TagAttributes(object): s = s[1:] break else: val = 1 self.tab[name] = val else: - raise TagAttributeSyntax(s) + raise TagAttributeSyntaxError(s) def __len__(self): return len(self.tab) def __getitem__(self, key): return self.tab[key] def __contains__(self, key): return key in self.tab @@ -86,19 +111,38 @@ class TagAttributes(object): del self.tab[key] def __str__(self): return self.printable def __repr__(self): return self.printable -class BaseWikiMarkup(object): +class WikiMarkupParser(object): + """Parser for Wiki markup language. + + Given input in Wiki markup language creates an abstract parse tree for it. + This is a base class for actual parsers. The subclasses must provide the + input method. + + Public methods: + + parse() -- parse the input. + + Abstract methods (must be overridden by the subclass): + + input() -- returns next physical line from the input material. + + Public attributes: + + tree -- constructed parse tree (a subclass of WikiNode) + + """ delim = re.compile("^==+[ \t]*|[ \t]*==+[ \t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)|<") otag = re.compile("<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>[^/][^>]+))?\s*(?P<closed>/)?>") ctag = re.compile("</(?P<tag>[a-zA-Z0-9_]+)\s*>") refstart = re.compile("^https?://") - + close_delim = { '[': ']', '[[': ']]', '{{': '}}' } @@ -112,16 +156,19 @@ class BaseWikiMarkup(object): toklist = None tokind = 0 newline = 0 tree = None tags = [ 'code', 'nowiki', 'tt', 'div', 'ref', 'references' ] - + debug_level = 0 - + def dprint(self, lev, fmt, *argv): + """If current debug level is greater than or equal to lev, print *argv + according to format. + """ if self.debug_level >= lev: for l in (fmt % argv).split('\n'): print("[DEBUG] %s" % l) inline_delims = [ "''", "'''", "[", "]", "[[", "]]", "{{", "}}", "|" ] @@ -132,55 +179,56 @@ class BaseWikiMarkup(object): 'CTAG': WikiTagNode, 'TAG': WikiTagNode, 'DELIM': WikiDelimNode, 'TEXT': WikiTextNode, 'PRE': WikiContentNode, 'PARA': WikiSeqNode, - 'BAR': WikiNode, + 'BAR': WikiNode, 'SEQ': WikiSeqNode, 'IND': WikiIndNode, 'REF': WikiRefNode, 'TMPL': WikiSeqNode, 'IT': WikiSeqNode, 'BOLD': WikiSeqNode, 'ELT': WikiEltNode, 'ENV': WikiEnvNode, 'LINK': WikiSeqNode, 'HDR': WikiHdrNode } - def __createWikiNode(self,**kwarg): + def _new_node(self,**kwarg): return self.token_class[kwarg['type']](self, **kwarg) - + def tokread(self): + """Read next token from the input. 
Return it as a subclass of WikiNode.""" line = None pos = 0 while 1: if (not line or pos == len(line)): try: line = self.input() pos = 0 except StopIteration: line = u'' - + if not line or line == "": - yield(self.__createWikiNode(type='NIL')) + yield(self._new_node(type='NIL')) break if line == '\n': - yield(self.__createWikiNode(type='NL')) + yield(self._new_node(type='NL')) line = None continue self.dprint(100, "LINE: %s", line[pos:]) m = self.delim.search(line, pos) - + if m: if (pos < m.start(0)): - yield(self.__createWikiNode(type='TEXT', - content=line[pos:m.start(0)])) + yield(self._new_node(type='TEXT', + content=line[pos:m.start(0)])) pos = m.start(0) t = None if line[m.start(0)] == '<': m = self.otag.match(line, pos) if m: @@ -188,189 +236,216 @@ class BaseWikiMarkup(object): if m.group('tag') == 'nowiki': if not m.group('closed'): while 1: try: m = self.ctag.search(line, pos) if m and m.group('tag') == 'nowiki': - yield(self.__createWikiNode(type='TEXT', - content=line[pos:m.start(0)] )) + yield(self._new_node(type='TEXT', + content=line[pos:m.start(0)] )) pos = m.end(0) break - yield(self.__createWikiNode(type='TEXT', - content=line[pos:])) + yield(self._new_node(type='TEXT', + content=line[pos:])) line = self.input() pos = 0 except StopIteration: break continue elif m.group('tag') in self.tags: try: - yield(self.__createWikiNode(type='OTAG', + yield(self._new_node(type='OTAG', tag=m.group('tag'), isblock=(line[pos] == '\n'), args=TagAttributes(m.group('args')))) if m.group('closed'): - yield(self.__createWikiNode(type='CTAG', - tag=m.group('tag'))) - except TagAttributeSyntax: - yield(self.__createWikiNode(type='TEXT',content=m.group(0))) + yield(self._new_node(type='CTAG', + tag=m.group('tag'))) + except TagAttributeSyntaxError: + yield(self._new_node(type='TEXT', + content=m.group(0))) continue else: - yield(self.__createWikiNode(type='TEXT',content=m.group(0))) + yield(self._new_node(type='TEXT',content=m.group(0))) continue else: m = self.ctag.match(line, pos) if m: if m.group('tag') in self.tags: - yield(self.__createWikiNode(type='CTAG', - tag=m.group('tag'))) + yield(self._new_node(type='CTAG', + tag=m.group('tag'))) pos = m.end(0) continue else: - yield(self.__createWikiNode(type='TEXT', - content=line[pos:pos+1])) + yield(self._new_node(type='TEXT', + content=line[pos:pos+1])) pos += 1 continue else: pos = m.end(0) content = m.group(0) if content[0] in self.envtypes: - node = self.__createWikiNode(type='DELIM', - content=content, - isblock=True, - continuation=pos < len(line) and line[pos] == ":") + node = self._new_node(type='DELIM', + content=content, + isblock=True, + continuation=pos < len(line) and line[pos] == ":") if node.continuation: node.content += node.content[0] pos += 1 yield(node) while pos < len(line) and line[pos] in [' ', '\t']: - pos += 1 + pos += 1 else: - yield(self.__createWikiNode(type='DELIM', - isblock=(content.strip() not in self.inline_delims), - content=content.strip())) + yield(self._new_node(type='DELIM', + isblock=(content.strip() not in self.inline_delims), + content=content.strip())) continue if line: if line[-1] == '\n': if line[pos:-1] != '': - yield(self.__createWikiNode(type='TEXT',content=line[pos:-1])) - yield(self.__createWikiNode(type='NL')) + yield(self._new_node(type='TEXT',content=line[pos:-1])) + yield(self._new_node(type='NL')) else: - yield(self.__createWikiNode(type='TEXT',content=line[pos:])) + yield(self._new_node(type='TEXT',content=line[pos:])) line = None - + def input(self): + """Return next physical line from the 
input. + + This method must be overridden by the subclass. + """ return None def swaptkn(self, i, j): + """Swap tokens at indices i and j in toklist.""" self.dprint(80, "SWAPPING %s <-> %s", i, j) - x = self.toklist[i] + x = self.toklist[i] self.toklist[i] = self.toklist[j] self.toklist[j] = x def tokenize(self): + """Tokenize the input. + + Read tokens from the input (supplied by the input() method). Place the + obtained tokens in the toklist array. + """ self.toklist = [] for tok in self.tokread(): self.dprint(100, "TOK: %s", tok) self.toklist.append(tok) # Determine and fix up the ordering of bold and italic markers # There are three possible cases: # # 1a. '''a b ''c'' d''' # 1b. ''a b '''c''' d'' # # 2a. '''''a b'' c d''' # 2b. '''''a b''' c d'' - # + # # 3a. '''a b ''c d''''' # 3b. ''a b '''c d''''' stack = [] for i in range(0,len(self.toklist)): - if self.toklist[i].type == 'DELIM' \ - and (self.toklist[i].content == "''" \ - or self.toklist[i].content == "'''"): + if (self.toklist[i].type == 'DELIM' + and (self.toklist[i].content == "''" + or self.toklist[i].content == "'''")): if len(stack) > 0: if self.toklist[stack[-1]].content == self.toklist[i].content: # Case 1: just pop the matching delimiter off the stack stack.pop() elif len(stack) == 2 and stack[-2] + 1 == stack[-1]: # Case 2: swap delimiters saved on stack ... self.swaptkn(stack[-2], stack[-1]) # and pop off the matching one stack.pop() - elif i < len(self.toklist) \ - and self.toklist[i+1].type == 'DELIM' \ - and self.toklist[stack[-1]].content == self.toklist[i+1].content: + elif (i < len(self.toklist) + and self.toklist[i+1].type == 'DELIM' + and self.toklist[stack[-1]].content + == self.toklist[i+1].content): # Case 3: swap current and next tokens self.swaptkn(i, i+1) - # and pop off the matching one + # and pop off the matching one stack.pop() else: # Push the token on stack stack.append(i) else: # Push the token on stack stack.append(i) # Redefine all non-matched tokens as TEXT for i in stack: self.toklist[i].type = 'TEXT' # FIXME mark = [] - + def push_mark(self): + """Save the current token index on stack.""" self.mark.append(self.tokind) def pop_mark(self): + """Restore the token index from top of stack.""" self.tokind = self.mark.pop() def clear_mark(self): + """Forget the last mark.""" self.mark.pop() - + def lookahead(self, off=0): + """Peek a token at index (tokind+off).""" tok = self.toklist[self.tokind+off] self.dprint(20, "lookahead(%s): %s", off, tok) return tok def setkn(self,val): + """Store token val at the current token index.""" self.toklist[self.tokind] = val - + def getkn(self): + """Get next token from the toklist. Advance tokind.""" self.newline = self.tokind == 0 or self.toklist[self.tokind-1].type == 'NL' if self.tokind == len(self.toklist): - return self.__createWikiNode(type='NIL') + return self._new_node(type='NIL') tok = self.toklist[self.tokind] self.tokind = self.tokind + 1 self.dprint(20, "getkn: %s", tok) return tok - + def ungetkn(self, tok=None): + """Unget the last read token. + + Decrease the tokind by one, so the last read token will be read again. + If optional argument is supplied and is not None, store it in the toklist + in place of the current token. 
+ """ self.tokind = self.tokind - 1 self.newline = self.tokind == 0 or self.toklist[self.tokind-1].type == 'NL' if tok: self.toklist[self.tokind] = tok self.dprint(20, "ungetkn: %s", tok) return self.toklist[self.tokind] def fixuptkn(self, tok): + """Replace the recently read token by tok.""" if self.tokind == 0: - raise IndexError('wikimarkup.fixuptkn called at start of input') + raise IndexError('WikiMarkupParser.fixuptkn called at start of input') self.toklist[self.tokind-1] = tok return tok - - def dump(self, tree, file=sys.stdout): + + def dump(self, tree, file=sys.stdout): + """Dump the tree to file, node by node.""" for node in tree: file.write(str(node)) file.write('\n') def is_block_end(self, tok): + """Return True if tok ends a block environment.""" if tok.type == 'NIL': return True elif tok.type == 'NL': if self.lookahead().type == 'NIL': return True elif self.lookahead().type == 'NL': @@ -380,26 +455,27 @@ class BaseWikiMarkup(object): if tok.isblock: self.ungetkn(tok) return True return False def parse_para(self, tok): + """Read paragraph starting at tok.""" self.dprint(80, "ENTER parse_para: %s", tok) acc = { 'seq': [], 'textlist': [] } - + def flush(): if acc['textlist']: - acc['seq'].append(self.__createWikiNode(type='TEXT', - content=''.join(acc['textlist']))) + acc['seq'].append(self._new_node(type='TEXT', + content=''.join(acc['textlist']))) acc['textlist'] = [] - if isinstance(tok, WikiContentNode) \ - and isinstance(tok.content,str) \ - and re.match("^[ \t]", tok.content): + if (isinstance(tok, WikiContentNode) + and isinstance(tok.content,str) + and re.match("^[ \t]", tok.content)): type = 'PRE' rx = re.compile("^\S") else: type = 'PARA' rx = re.compile("^[ \t]") @@ -415,32 +491,33 @@ class BaseWikiMarkup(object): flush() acc['seq'].append(self.parse_tag(tok)) elif tok.type == 'DELIM': flush() acc['seq'].append(self.parse_inline_delim(tok)) else: - raise UnexpectedToken(tok) + raise UnexpectedTokenError(tok) tok = self.getkn() flush() if acc['seq']: - tok = self.__createWikiNode(type=type, content=acc['seq']) + tok = self._new_node(type=type, content=acc['seq']) else: tok = None self.dprint(80, "LEAVE parse_para=%s", tok) return tok def parse_block_delim(self, tok): + """Parse block environment starting at tok.""" self.dprint(80, "ENTER parse_block_delim") assert(tok.type == 'DELIM') if tok.content == "----": - node = self.__createWikiNode(type = 'BAR') + node = self._new_node(type = 'BAR') elif tok.content[0:2] == "==": node = self.parse_header(tok) if not node: - tok = self.ungetkn(self.__createWikiNode(type='TEXT', - content=tok.content)) + tok = self.ungetkn(self._new_node(type='TEXT', + content=tok.content)) elif tok.content[0] in self.envtypes: node = None if tok.content[0] == ':': t = self.lookahead(-2) if not (t.type == 'DELIM' and t.content == ';'): node = self.parse_indent(tok) @@ -448,57 +525,66 @@ class BaseWikiMarkup(object): node = self.parse_env(tok) else: self.ungetkn(tok) node = None self.dprint(80, "LEAVE parse_block_delim=%s", node) return node - + def parse_line(self): + """Parse the input line.""" self.dprint(80, "ENTER parse_line") list = [] while True: tok = self.getkn() if tok.type == 'NL' or tok.type == 'NIL': break elif tok.type == 'TEXT': list.append(tok) elif tok.type == 'DELIM': if tok.isblock: - tok = self.__createWikiNode(type = 'TEXT', - content = tok.content) + tok = self._new_node(type = 'TEXT', content = tok.content) self.fixuptkn(tok) list.append(tok) elif tok.content[0] == ":": # FIXME list.append(self.parse_indent(tok)) break 
else: x = self.parse_inline_delim(tok) if x: list.append(x) else: - list.append(self.fixuptkn(self.__createWikiNode(type = 'TEXT', content = tok.content))) + list.append(self.fixuptkn(self._new_node(type = 'TEXT', + content = tok.content))) elif tok.type == 'OTAG': if tok.isblock: self.ungetkn() break list.append(self.parse_tag(tok)) else: list.append(tok) - ret = self.__createWikiNode(type='SEQ', content=list) + ret = self._new_node(type='SEQ', content=list) self.dprint(80, "LEAVE parse_line=%s", ret) return ret - + def parse_indent(self, tok): + """Parse indented block starting at tok.""" lev = len(tok.content) self.dprint(80, "ENTER parse_indent(%s)", lev) - x = self.__createWikiNode(type='IND', level=lev, content=self.parse_line()) + x = self._new_node(type='IND', level=lev, content=self.parse_line()) self.dprint(80, "LEAVE parse_indent=%s", x) return x - + def parse_fontmod(self,delim,what): + """Parse font modification directive (bold or italics). + + Arguments: + + delim -- starting delimiter ("''" or "'''") + what -- 'IT' or 'BOLD' + """ self.dprint(80, "ENTER parse_fontmod(%s,%s), tok %s", delim, what, self.lookahead()) seq = [] text = '' while True: tok = self.getkn() @@ -510,42 +596,43 @@ class BaseWikiMarkup(object): elif tok.type == 'DELIM': # self.dprint(80, "got %s, want %s", tok.content, delim) if tok.content == delim: break else: if text: - seq.append(self.__createWikiNode(type='TEXT', content=text)) + seq.append(self._new_node(type='TEXT', content=text)) text = '' x = self.parse_inline_delim(tok) if x: seq.append(x) else: self.dprint(80, "LEAVE parse_fontmod=%s", "None") return None elif tok.type == 'NL': - seq.append(self.__createWikiNode(type='TEXT', content='\n')) + seq.append(self._new_node(type='TEXT', content='\n')) else: self.dprint(80, "LEAVE parse_fontmod=None") return None if text: - seq.append(self.__createWikiNode(type='TEXT', content=text)) - res = self.__createWikiNode(type=what, content=seq) - self.dprint(80, "LEAVE parse_fontmod=%s", res) + seq.append(self._new_node(type='TEXT', content=text)) + res = self._new_node(type=what, content=seq) + self.dprint(80, "LEAVE parse_fontmod=%s", res) return res def parse_ref(self): + """Parse a reference block ([...])""" self.dprint(80, "ENTER parse_ref") tok = self.getkn() if not (tok.type == 'TEXT' and self.refstart.match(tok.content)): self.dprint(80, "LEAVE parse_ref=None") return None seq = [] (ref,sep,text) = tok.content.partition(' ') if text: - seq.insert(0, self.__createWikiNode(type='TEXT', content=text)) + seq.insert(0, self._new_node(type='TEXT', content=text)) while True: tok = self.getkn() if tok.type == 'NIL': self.dprint(80, "LEAVE parse_ref=None") return None @@ -564,37 +651,46 @@ class BaseWikiMarkup(object): return None elif tok.type == 'OTAG': list.append(self.parse_tag(tok)) else: seq.append(tok) - ret = self.__createWikiNode(type='REF', - ref=ref, - content=self.__createWikiNode(type='SEQ', content=seq)) + ret = self._new_node(type='REF', ref=ref, + content=self._new_node(type='SEQ', content=seq)) self.dprint(80, "LEAVE parse_ref= %s", ret) return ret def parse_link(self, type, delim): + """Parse an external link ([[...]]). + + In this implementation, it is also used to parse template + references ({{...}}). + + Arguments: + + type -- 'LINK' or 'TMPL' + delim -- expected closing delimiter. 
+ """ self.dprint(80, "ENTER parse_link(%s,%s)", type, delim) subtree = [] list = [] while True: tok = self.getkn() if tok.type == 'NIL': self.dprint(80, "LEAVE parse_link=None [EOF]") return None if tok.type == 'DELIM': if tok.content == delim: if list: - subtree.append(self.__createWikiNode(type='SEQ', - content=list)) + subtree.append(self._new_node(type='SEQ', + content=list)) break elif tok.content == "|": if len(list) > 1: - subtree.append(self.__createWikiNode(type='SEQ', - content=list)) + subtree.append(self._new_node(type='SEQ', + content=list)) elif list: subtree.append(list[0]) list = [] else: x = self.parse_inline_delim(tok) if x: @@ -604,17 +700,18 @@ class BaseWikiMarkup(object): return None elif tok.type == 'TEXT': list.append(tok) else: self.dprint(80, "LEAVE parse_link=None [unexpected token]") return None - ret = self.__createWikiNode(type=type, content=subtree) + ret = self._new_node(type=type, content=subtree) self.dprint(80, "LEAVE parse_link=%s", ret) return ret - + def parse_inline_delim(self, tok): + """Parse an inline block.""" self.dprint(80, "ENTER parse_inline_delim") assert(tok.type == 'DELIM') self.push_mark() if tok.content == "''": x = self.parse_fontmod(tok.content, 'IT') elif tok.content == "'''": @@ -630,49 +727,49 @@ class BaseWikiMarkup(object): if x: self.clear_mark() else: self.dprint(80, "BEGIN DELIMITER RECOVERY: %s", tok) self.pop_mark() - x = self.fixuptkn(self.__createWikiNode(type='TEXT', - content=tok.content)) + x = self.fixuptkn(self._new_node(type='TEXT', content=tok.content)) od = tok.content if od in self.close_delim: cd = self.close_delim[od] lev = 0 for i,tok in enumerate(self.toklist[self.tokind+1:]): if tok.type == 'NIL': break elif tok.type == 'DELIM': if tok.content == od: lev += 1 elif tok.content == cd: if lev == 0: - tok = self.__createWikiNode(type='TEXT', - content=tok.content) + tok = self._new_node(type='TEXT', + content=tok.content) self.toklist[self.tokind+1+i] = tok lev -= 1 break self.dprint(80, "END DELIMITER RECOVERY: %s", tok) self.dprint(80, "LEAVE parse_inline_delim=%s", x) return x - + def parse_tag(self, tag): + """Parse an xml-like tag (such as, e.g. 
"<tt>...</tt>").""" self.dprint(80, "ENTER parse_tag") list = [] self.push_mark() while True: tok = self.getkn() if tok.type == 'NIL': self.pop_mark() s = '<' + tag.tag if tag.args: s += ' ' + str(tag.args) s += '>' - node = self.__createWikiNode(type='TEXT',content=s) + node = self._new_node(type='TEXT',content=s) if tag.content: self.tree[self.tokind:self.tokind] = tag.content self.dprint(80, "LEAVE parse_tag = %s (tree modified)", node) return node elif tok.type == 'DELIM': if tok.isblock: @@ -682,77 +779,77 @@ class BaseWikiMarkup(object): if not tok: tok = self.getkn() elif tok.type == 'CTAG': if tag.tag == tok.tag: break s = '</' + tag.tag + '>' - tok = self.fixuptkn(self.__createWikiNode(type='TEXT', - content=s)) + tok = self.fixuptkn(self._new_node(type='TEXT', content=s)) elif tok.type == 'NL': - tok = self.__createWikiNode(type = 'TEXT', content = '\n') + tok = self._new_node(type = 'TEXT', content = '\n') list.append(tok) self.clear_mark() - ret = self.__createWikiNode(type = 'TAG', - tag = tag.tag, - args = tag.args, - isblock = tag.isblock, - content = self.__createWikiNode(type = 'SEQ', content = list)) + ret = self._new_node(type = 'TAG', + tag = tag.tag, + args = tag.args, + isblock = tag.isblock, + content = self._new_node(type = 'SEQ', content = list)) self.dprint(80, "LEAVE parse_tag = %s", ret) return ret - + def parse_env(self, tok): + """Parse a block environment (numbered, unnumbered, or definition list).""" type = self.envtypes[tok.content[0]][0] lev = len(tok.content) self.dprint(80, "ENTER parse_env(%s,%s)",type,lev) list = [] while True: - if tok.type == 'DELIM' \ - and tok.content[0] in self.envtypes \ - and type == self.envtypes[tok.content[0]][0]: + if (tok.type == 'DELIM' + and tok.content[0] in self.envtypes + and type == self.envtypes[tok.content[0]][0]): if len(tok.content) < lev: self.ungetkn() break elif len(tok.content) > lev: elt = self.parse_env(tok) else: elt = self.parse_line() if not tok.continuation: - list.append(self.__createWikiNode(type='ELT', - subtype=self.envtypes[tok.content[0]][1], - content=elt)) + list.append(self._new_node(type='ELT', + subtype=self.envtypes[tok.content[0]][1], + content=elt)) tok = self.getkn() continue if list: if list[-1].content.type != 'SEQ': x = list[-1].content.content # FIXME: - list[-1].content = self.__createWikiNode(type='SEQ', content=[x]) + list[-1].content = self._new_node(type='SEQ', content=[x]) list[-1].content.content.append(elt) else: self.ungetkn() break tok = self.getkn() - ret = self.__createWikiNode(type='ENV', - envtype=type, - level=lev, - content=list) + ret = self._new_node(type='ENV', + envtype=type, + level=lev, + content=list) self.dprint(80, "LEAVE parse_env=%s", ret) return ret - + def parse_header(self, tok): + """Parse a Wiki header.""" self.dprint(80, "ENTER parse_header") self.push_mark() list = [] delim = tok.content while True: tok = self.getkn() - if tok.type == 'NL': self.pop_mark() self.dprint(80, "LEAVE parse_header=None") return None elif tok.type == 'TEXT': list.append(tok) @@ -776,23 +873,21 @@ class BaseWikiMarkup(object): elif tok.type == 'OTAG': if tok.isblock: self.pop_mark() self.dprint(80, "LEAVE parse_header=None") return None list.append(self.parse_tag(tok)) - - self.clear_mark() - ret = self.__createWikiNode(type='HDR', - level = len(delim), - content = self.__createWikiNode(type='SEQ', - content=list)) + ret = self._new_node(type='HDR', + level=len(delim), + content=self._new_node(type='SEQ', content=list)) self.dprint(80, "LEAVE parse_header=%s", ret) return ret 
- + def parse_block(self): + """Parse next block: newline, delimiter, tag, or paragraph.""" tok = self.getkn() while tok.type == 'NL': tok = self.getkn() if tok == None or tok.type == 'NIL': return None elif tok.type == 'DELIM': @@ -802,14 +897,18 @@ class BaseWikiMarkup(object): else: tok = self.getkn() elif tok.type == 'OTAG' and tok.isblock: return self.parse_tag(tok) return self.parse_para(tok) - + def parse(self): + """Parse Wiki material supplied by the input() method. + + Store the resulting abstract parsing tree in the tree attribute. + """ if not self.toklist: self.tokenize() if self.debug_level >= 90: print("TOKEN DUMP BEGIN") self.dump(self.toklist) print("TOKEN DUMP END") @@ -826,16 +925,16 @@ class BaseWikiMarkup(object): print("TREE DUMP END") def __str__(self): return str(self.tree) -class WikiMarkup (BaseWikiMarkup): +class WikiMarkup(WikiMarkupParser): """ - A derived class, that supplies a basic input method. - + A derived parser class that supplies a basic input method. + Three types of inputs are available: 1. filename=<file> The file <file> is opened and used for input. 2. file=<file> The already opened file <file> is used for input. @@ -846,19 +945,20 @@ class WikiMarkup (BaseWikiMarkup): obj = WikiMarkup(arg=val) obj.parse ... Do whatever you need with obj.tree ... """ + file = None text = None lang = 'en' html_base = 'http://%(lang)s.wiktionary.org/wiki/' image_base = 'http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf' media_base = 'http://www.mediawiki.org/xml/export-0.3' - + def __init__(self, *args, **keywords): for kw in keywords: if kw == 'file': self.file = keywords[kw] elif kw == 'filename': self.file = open(keywords[kw]) @@ -882,342 +982,304 @@ class WikiMarkup (BaseWikiMarkup): return self.file.readline() elif self.text: return self.text.pop(0) + '\n' else: return None - def is_lang_link(self, elt): - if elt.type == 'LINK' \ - and isinstance(elt.content, list) \ - and len(elt.content) == 1: - if elt.content[0].type == TEXT: - m = re.match('([\w-]+):', elt.content[0].content) - if m: # and m.group(1) in self.langtab: - return True - elif elt.content[0].type == 'SEQ' \ - and len(elt.content[0].content) == 1 and\ - elt.content[0].content[0].type == TEXT: - m = re.match('([\w-]+):',elt.content[0].content[0].content) - if m: # and m.group(1) in self.langtab: - return True - return False - - def is_empty_text(self, elt): - if elt.type == 'TEXT': - if re.search('\w', elt.content): - return False - return True - return False - - def is_empty_para(self, seq): - for x in seq: - if not (self.is_lang_link(x) or self.is_empty_text(x)): - return False - return True - - # ISO 639 + # ISO 639 langtab = { "aa": "Afar", # Afar - "ab": "Аҧсуа", # Abkhazian - "ae": None, # Avestan - "af": "Afrikaans", # Afrikaans - "ak": "Akana", # Akan + "ab": "Аҧсуа", # Abkhazian + "ae": None, # Avestan + "af": "Afrikaans", # Afrikaans + "ak": "Akana", # Akan "als": "Alemannisch", - "am": "አማርኛ", # Amharic - "an": "Aragonés", # Aragonese + "am": "አማርኛ", # Amharic + "an": "Aragonés", # Aragonese "ang": "Englisc", - "ar": "العربية" , # Arabic + "ar": "العربية" , # Arabic "arc": "ܐܪܡܝܐ", - "as": "অসমীয়া", # Assamese - "ast": "Asturian", - "av": "Авар", # Avaric - "ay": "Aymara", # Aymara - "az": "Azərbaycan" , # Azerbaijani - - "ba": "Башҡорт", # Bashkir - "bar": "Boarisch", + "as": "অসমীয়া", # Assamese + "ast": "Asturian", + "av": "Авар", # Avaric + "ay": "Aymara", # Aymara + "az": "Azərbaycan" , # Azerbaijani + + "ba": "Башҡорт", # Bashkir + "bar": "Boarisch", "bat-smg": 
"Žemaitėška", "bcl": "Bikol", - "be": "Беларуская", # Byelorussian; Belarusian + "be": "Беларуская", # Byelorussian; Belarusian "be-x-old": "Беларуская (тарашкевіца)", - "bg": "Български", # Bulgarian - "bh": "भोजपुरी", # Bihari - "bi": "Bislama", # Bislama - "bm": "Bamanankan", # Bambara - "bn": "বাংলা" , # Bengali; Bangla - "bo": "བོད་སྐད", # Tibetan + "bg": "Български", # Bulgarian + "bh": "भोजपुरी", # Bihari + "bi": "Bislama", # Bislama + "bm": "Bamanankan", # Bambara + "bn": "বাংলা" , # Bengali; Bangla + "bo": "བོད་སྐད", # Tibetan "bpy": "ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী" , - "br": "Brezhoneg" , # Breton - "bs": "Bosanski" , # Bosnian + "br": "Brezhoneg" , # Breton + "bs": "Bosanski" , # Bosnian "bug": "Basa Ugi", "bxr": "Буряад", - "ca": "Català" , # Catalan + "ca": "Català" , # Catalan "cbk-zam": "Chavacano de Zamboanga", "cdo": "Mìng-dĕ̤ng-ngṳ̄", "cho": "Choctaw", - "ce": "Нохчийн", # Chechen + "ce": "Нохчийн", # Chechen "ceb": "Sinugboanong Binisaya" , # Cebuano - "ch": "Chamor", # Chamorro + "ch": "Chamor", # Chamorro "chr": "ᏣᎳᎩ", "chy": "Tsetsêhestâhese", - "co": "Cors", # Corsican - "cr": "Nehiyaw", # Cree + "co": "Cors", # Corsican + "cr": "Nehiyaw", # Cree "crh": "Qırımtatarca", - "cs": "Česky" , # Czech + "cs": "Česky" , # Czech "csb": "Kaszëbsczi", - "c": "Словѣньскъ", # Church Slavic - "cv": "Чăваш", # Chuvash - "cy": "Cymraeg" , # Welsh + "c": "Словѣньскъ", # Church Slavic + "cv": "Чăваш", # Chuvash + "cy": "Cymraeg" , # Welsh - "da": "Dansk" , # Danish - "de": "Deutsch" , # German + "da": "Dansk" , # Danish + "de": "Deutsch" , # German "diq": "Zazaki", # Dimli (Southern Zazaki) "dsb": "Dolnoserbski", - "dv": "ދިވެހިބަސް", # Divehi - "dz": "ཇོང་ཁ", # Dzongkha; Bhutani + "dv": "ދިވެހިބަސް", # Divehi + "dz": "ཇོང་ཁ", # Dzongkha; Bhutani - "ee": "Eʋegbe", # Ewe - "el": "Ελληνικά" , # Greek + "ee": "Eʋegbe", # Ewe + "el": "Ελληνικά" , # Greek "eml": "Emiliàn e rumagnòl", - "en": "English" , # English + "en": "English" , # English "eo": "Esperanto" , - "es": "Español" , # Spanish - "et": "Eesti" , # Estonian - "eu": "Euskara" , # Basque + "es": "Español" , # Spanish + "et": "Eesti" , # Estonian + "eu": "Euskara" , # Basque "ext": "Estremeñ", - "fa": "فارسی" , # Persian - "ff": "Fulfulde", # Fulah - "fi": "Suomi" , # Finnish + "fa": "فارسی" , # Persian + "ff": "Fulfulde", # Fulah + "fi": "Suomi" , # Finnish "fiu-vro": "Võro", - "fj": "Na Vosa Vakaviti",# Fijian; Fiji - "fo": "Føroyskt" , # Faroese - "fr": "Français" , # French + "fj": "Na Vosa Vakaviti",# Fijian; Fiji + "fo": "Føroyskt" , # Faroese + "fr": "Français" , # French "frp": "Arpitan", "fur": "Furlan", - "fy": "Frysk", # Frisian + "fy": "Frysk", # Frisian - "ga": "Gaeilge", # Irish + "ga": "Gaeilge", # Irish "gan": "贛語 (Gànyŭ)", - "gd": "Gàidhlig", # Scots; Gaelic - "gl": "Gallego" , # Gallegan; Galician + "gd": "Gàidhlig", # Scots; Gaelic + "gl": "Gallego" , # Gallegan; Galician "glk": "گیلکی", "got": "𐌲𐌿𐍄𐌹𐍃𐌺𐍉𐍂𐌰𐌶𐌳𐌰", - "gn": "Avañe'ẽ", # Guarani - "g": "ગુજરાતી", # Gujarati - "gv": "Gaelg", # Manx + "gn": "Avañe'ẽ", # Guarani + "g": "ગુજરાતી", # Gujarati + "gv": "Gaelg", # Manx - "ha": "هَوُسَ", # Hausa + "ha": "هَوُسَ", # Hausa "hak": "Hak-kâ-fa / 客家話", "haw": "Hawai`i", - "he": "עברית" , # Hebrew (formerly iw) - "hi": "हिन्दी" , # Hindi + "he": "עברית" , # Hebrew (formerly iw) + "hi": "हिन्दी" , # Hindi "hif": "Fiji Hindi", - "ho": "Hiri Mot", # Hiri Motu - "hr": "Hrvatski" , # Croatian + "ho": "Hiri Mot", # Hiri Motu + "hr": "Hrvatski" , # Croatian "hsb": "Hornjoserbsce", - "ht": "Krèyol ayisyen" , # Haitian; 
Haitian Creole - "hu": "Magyar" , # Hungarian - "hy": "Հայերեն", # Armenian - "hz": "Otsiherero", # Herero + "ht": "Krèyol ayisyen" , # Haitian; Haitian Creole + "hu": "Magyar" , # Hungarian + "hy": "Հայերեն", # Armenian + "hz": "Otsiherero", # Herero "ia": "Interlingua", "ie": "Interlingue", - "id": "Bahasa Indonesia",# Indonesian (formerly in) - "ig": "Igbo", # Igbo - "ii": "ꆇꉙ ", # Sichuan Yi - "ik": "Iñupiak", # Inupiak + "id": "Bahasa Indonesia",# Indonesian (formerly in) + "ig": "Igbo", # Igbo + "ii": "ꆇꉙ ", # Sichuan Yi + "ik": "Iñupiak", # Inupiak "ilo": "Ilokano", "io": "Ido" , - "is": "Íslenska" , # Icelandic - "it": "Italiano" , # Italian - "i": "ᐃᓄᒃᑎᑐᑦ", # Inuktitut + "is": "Íslenska" , # Icelandic + "it": "Italiano" , # Italian + "i": "ᐃᓄᒃᑎᑐᑦ", # Inuktitut - "ja": "日本語", # Japanese + "ja": "日本語", # Japanese "jbo": "Lojban", - "jv": "Basa Jawa", # Javanese + "jv": "Basa Jawa", # Javanese - "ka": "ქართული" , # Georgian + "ka": "ქართული" , # Georgian "kaa": "Qaraqalpaqsha", "kab": "Taqbaylit", - "kg": "KiKongo", # Kongo - "ki": "Gĩkũyũ", # Kikuyu - "kj": "Kuanyama", # Kuanyama - "kk": "Қазақша", # Kazakh - "kl": "Kalaallisut", # Kalaallisut; Greenlandic - "km": "ភាសាខ្មែរ", # Khmer; Cambodian - "kn": "ಕನ್ನಡ", # Kannada - "ko": "한국어" , # Korean - "kr": "Kanuri", # Kanuri - "ks": "कश्मीरी / كشميري", # Kashmiri + "kg": "KiKongo", # Kongo + "ki": "Gĩkũyũ", # Kikuyu + "kj": "Kuanyama", # Kuanyama + "kk": "Қазақша", # Kazakh + "kl": "Kalaallisut", # Kalaallisut; Greenlandic + "km": "ភាសាខ្មែរ", # Khmer; Cambodian + "kn": "ಕನ್ನಡ", # Kannada + "ko": "한국어" , # Korean + "kr": "Kanuri", # Kanuri + "ks": "कश्मीरी / كشميري", # Kashmiri "ksh": "Ripoarisch", - "ku": "Kurdî / كوردی", # Kurdish - "kv": "Коми", # Komi - "kw": "Kernewek/Karnuack", # Cornish - "ky": "Кыргызча", # Kirghiz + "ku": "Kurdî / كوردی", # Kurdish + "kv": "Коми", # Komi + "kw": "Kernewek/Karnuack", # Cornish + "ky": "Кыргызча", # Kirghiz - "la": "Latina" , # Latin + "la": "Latina" , # Latin "lad": "Dzhudezmo", - "lb": "Lëtzebuergesch" , # Letzeburgesch + "lb": "Lëtzebuergesch" , # Letzeburgesch "lbe": "Лакку", - "lg": "Luganda", # Ganda - "li": "Limburgs", # Limburgish; Limburger; Limburgan - "lij": "Lígur", - "ln": "Lingala", # Lingala + "lg": "Luganda", # Ganda + "li": "Limburgs", # Limburgish; Limburger; Limburgan + "lij": "Lígur", + "ln": "Lingala", # Lingala "lmo": "Lumbaart", - "lo": "ລາວ", # Lao; Laotian - "lt": "Lietuvių" , # Lithuanian - "lua": "Luba", # Luba - "lv": "Latvieš" , # Latvian; Lettish + "lo": "ລາວ", # Lao; Laotian + "lt": "Lietuvių" , # Lithuanian + "lua": "Luba", # Luba + "lv": "Latvieš" , # Latvian; Lettish "map-bms": "Basa Banyumasan", "mdf": "Мокшень (Mokshanj Kälj)", - "mg": "Malagasy", # Malagasy - "mh": "Ebon", # Marshall - "mi": "Māori", # Maori - "mk": "Македонски" , # Macedonian - "ml": None, # Malayalam - "mn": "Монгол", # Mongolian - "mo": "Молдовеняскэ", # Moldavian - "mr": "मराठी" , # Marathi - "ms": "Bahasa Melay" , # Malay - "mt": "Malti", # Maltese + "mg": "Malagasy", # Malagasy + "mh": "Ebon", # Marshall + "mi": "Māori", # Maori + "mk": "Македонски" , # Macedonian + "ml": None, # Malayalam + "mn": "Монгол", # Mongolian + "mo": "Молдовеняскэ", # Moldavian + "mr": "मराठी" , # Marathi + "ms": "Bahasa Melay" , # Malay + "mt": "Malti", # Maltese "mus": "Muskogee", - "my": "မ္ရန္မာစာ", # Burmese + "my": "မ္ရန္မာစာ", # Burmese "myv": "Эрзянь (Erzjanj Kelj)", "mzn": "مَزِروني", - "na": "dorerin Naoero", # Nauru + "na": "dorerin Naoero", # Nauru "nah": "Nāhuatl", "nap": "Nnapulitano", - "nb": 
"Norsk (Bokmål)", # Norwegian Bokm@aa{}l - "nd": None, # Ndebele, North + "nb": "Norsk (Bokmål)", # Norwegian Bokm@aa{}l + "nd": None, # Ndebele, North "nds": "Plattdüütsch", "nds-nl": "Nedersaksisch", - "ne": "नेपाली", # Nepali + "ne": "नेपाली", # Nepali "new": "नेपाल भाषा" , # Nepal Bhasa - "ng": "Oshiwambo", # Ndonga - "nl": "Nederlands" , # Dutch - "nn": "Nynorsk", # Norwegian Nynorsk - "no": "Norsk (Bokmål)" , # Norwegian + "ng": "Oshiwambo", # Ndonga + "nl": "Nederlands" , # Dutch + "nn": "Nynorsk", # Norwegian Nynorsk + "no": "Norsk (Bokmål)" , # Norwegian "nov": "Novial", - "nr": None, # Ndebele, South + "nr": None, # Ndebele, South "nrm": "Nouormand/Normaund", - "nv": "Diné bizaad", # Navajo - "ny": "Chi-Chewa", # Chichewa; Nyanja + "nv": "Diné bizaad", # Navajo + "ny": "Chi-Chewa", # Chichewa; Nyanja - "oc": "Occitan", # Occitan; Proven@,{c}al - "oj": None, # Ojibwa - "om": "Oromoo", # (Afan) Oromo - "or": "ଓଡ଼ିଆ", # Oriya - "os": "Иронау", # Ossetian; Ossetic + "oc": "Occitan", # Occitan; Proven@,{c}al + "oj": None, # Ojibwa + "om": "Oromoo", # (Afan) Oromo + "or": "ଓଡ଼ିଆ", # Oriya + "os": "Иронау", # Ossetian; Ossetic - "pa": "ਪੰਜਾਬੀ" , # Panjabi; Punjabi + "pa": "ਪੰਜਾਬੀ" , # Panjabi; Punjabi "pag": "Pangasinan", "pam": "Kapampangan", "pap": "Papiament", "pdc": "Deitsch", - "pi": "पाऴि", # Pali + "pi": "पाऴि", # Pali "pih": "Norfuk", - "pl": "Polski" , # Polish + "pl": "Polski" , # Polish "pms": "Piemontèis" , - "ps": "پښتو", # Pashto, Pushto - "pt": "Português" , # Portuguese + "ps": "پښتو", # Pashto, Pushto + "pt": "Português" , # Portuguese - "q": "Runa Simi" , # Quechua + "q": "Runa Simi" , # Quechua - "rm": "Rumantsch", # Rhaeto-Romance + "rm": "Rumantsch", # Rhaeto-Romance "rmy": "romani - रोमानी", - "rn": "Kirundi", # Rundi; Kirundi - "ro": "Română" , # Romanian + "rn": "Kirundi", # Rundi; Kirundi + "ro": "Română" , # Romanian "roa-rup": "Armãneashce", "roa-tara": "Tarandíne", - "ru": "Русский" , # Russian - "rw": "Ikinyarwanda", # Kinyarwanda + "ru": "Русский" , # Russian + "rw": "Ikinyarwanda", # Kinyarwanda - "sa": "संस्कृतम्", # Sanskrit + "sa": "संस्कृतम्", # Sanskrit "sah": "Саха тыла (Saxa Tyla)", - "sc": "Sardu", # Sardinian - "scn": "Sicilian", + "sc": "Sardu", # Sardinian + "scn": "Sicilian", "sco": "Scots", - "sd": "سنڌي، سندھی ، सिन्ध", # Sindhi - "se": "Sámegiella", # Northern Sami - "sg": "Sängö", # Sango; Sangro + "sd": "سنڌي، سندھی ، सिन्ध", # Sindhi + "se": "Sámegiella", # Northern Sami + "sg": "Sängö", # Sango; Sangro "sh": "Srpskohrvatski / Српскохрватски" , "si": "සිංහල", "simple": "Simple English" , - "sk": "Slovenčina" , # Slovak - "sl": "Slovenščina" , # Slovenian - "sm": "Gagana Samoa", # Samoan - "sn": "chiShona", # Shona - "so": "Soomaaliga", # Somali - "sr": "Српски / Srpski", # Serbian + "sk": "Slovenčina" , # Slovak + "sl": "Slovenščina" , # Slovenian + "sm": "Gagana Samoa", # Samoan + "sn": "chiShona", # Shona + "so": "Soomaaliga", # Somali + "sr": "Српски / Srpski", # Serbian "srn": "Sranantongo", - "ss": "SiSwati", # Swati; Siswati - "st": "Sesotho", # Sesotho; Sotho, Southern + "ss": "SiSwati", # Swati; Siswati + "st": "Sesotho", # Sesotho; Sotho, Southern "stk": "Seeltersk", - "s": "Basa Sunda", # Sundanese + "s": "Basa Sunda", # Sundanese "sq": "Shqip" , # Albanian "szl": "Ślůnski", - "sv": "Svenska" , # Swedish - "sw": "Kiswahili", # Swahili + "sv": "Svenska" , # Swedish + "sw": "Kiswahili", # Swahili - "ta": "தமிழ்" , # Tamil - "te": "తెలుగు" , # Telugu + "ta": "தமிழ்" , # Tamil + "te": "తెలుగు" , # Telugu "tet": "Tetun", - "tg": 
"Тоҷикӣ", # Tajik - "th": "ไทย" , # Thai - "ti": "ትግርኛ", # Tigrinya - "tk": "تركمن / Туркмен", # Turkmen - "tl": "Tagalog" , # Tagalog - "tn": "Setswana", # Tswana; Setswana - "to": "faka Tonga", # Tonga (?) # Also ZW ; MW + "tg": "Тоҷикӣ", # Tajik + "th": "ไทย" , # Thai + "ti": "ትግርኛ", # Tigrinya + "tk": "تركمن / Туркмен", # Turkmen + "tl": "Tagalog" , # Tagalog + "tn": "Setswana", # Tswana; Setswana + "to": "faka Tonga", # Tonga (?) # Also ZW ; MW "tokipona": "Tokipona", "tpi": "Tok Pisin", - "tr": "Türkçe" , # Turkish - "ts": "Xitsonga", # Tsonga - "tt": "Tatarça / Татарча", # Tatar + "tr": "Türkçe" , # Turkish + "ts": "Xitsonga", # Tsonga + "tt": "Tatarça / Татарча", # Tatar "tum": "chiTumbuka", - "tw": "Twi", # Twi - "ty": "Reo Mā`ohi", # Tahitian + "tw": "Twi", # Twi + "ty": "Reo Mā`ohi", # Tahitian "udm": "Удмурт кыл", - "ug": "Oyghurque", # Uighur - "uk": "Українська" , # Ukrainian - "ur": "اردو", # Urdu - "uz": "O‘zbek", # Uzbek + "ug": "Oyghurque", # Uighur + "uk": "Українська" , # Ukrainian + "ur": "اردو", # Urdu + "uz": "O‘zbek", # Uzbek - "ve": "Tshivenda", # Venda + "ve": "Tshivenda", # Venda "vec": "Vèneto", - "vi": "Tiếng Việt" , # Vietnamese + "vi": "Tiếng Việt" , # Vietnamese "vls": "West-Vlams", "vo": "Volapük" , - - "wa": "Walon", # Walloon + + "wa": "Walon", # Walloon "war": "Winaray", - "wo": "Wolof", # Wolof + "wo": "Wolof", # Wolof "w": "吴语", "xal": "Хальмг", - "xh": "isiXhosa", # Xhosa + "xh": "isiXhosa", # Xhosa - "yi": "ייִדיש", # Yiddish - "yo": "Yorùbá", # Yoruba + "yi": "ייִדיש", # Yiddish + "yo": "Yorùbá", # Yoruba - "za": "Cuengh", # Zhuang + "za": "Cuengh", # Zhuang "zea": "Zeêuws", - "zh": "中文" , # Chinese + "zh": "中文" , # Chinese "zh-classical": "古文 / 文言文", "zm-min-nan": "Bân-lâm-gú", "zh-yue": "粵語", - "zu": "isiZulu" # Zulu + "zu": "isiZulu" # Zulu } - - - - - - - - - diff --git a/WikiTrans/wikins.py b/wikitrans/wikins.py index 4fb5315..4fb5315 100644 --- a/WikiTrans/wikins.py +++ b/wikitrans/wikins.py diff --git a/wikitrans/wikitoken.py b/wikitrans/wikitoken.py new file mode 100644 index 0000000..49c6c68 --- a/dev/null +++ b/wikitrans/wikitoken.py @@ -0,0 +1,318 @@ +# Wiki tokens. -*- coding: utf-8 -*- +# Copyright (C) 2015-2018 Sergey Poznyakoff +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +""" +Wiki markup tokens and associated classes. + +This module defines classes for the basic nodes of the Wiki markup parse tree: + +WikiNode -- Abstract parse tree node. +WikiContentNode -- A node associated with some content. +WikiSeqNode -- A sequence of nodes. +WikiTextNode -- Textual content. +WikiDelimNode -- Delimiter. +WikiTagNode -- Tag (e.g. <tt>, </tt>, <tt />, etc.) +WikiRefNode -- Wiki reference (e.g. [target|name]) +WikiHdrNode -- Heading (e.g. == Section ==) +WikiEltNode -- Environment element. +WikiEnvNode -- Environment (numbered or unnumbered list, definition, etc.) +WikiIndNode -- Indent node. 
+
+Auxiliary classes:
+
+WikiNodeEncoder -- Custom JSONEncoder subclass for serializing objects of the
+    above classes.
+"""
+
+from __future__ import print_function
+import re
+import json
+
+class WikiNodeEncoder(json.JSONEncoder):
+    """Custom JSONEncoder subclass for serializing WikiNode and its subclasses."""
+
+    def default(self, obj):
+        if isinstance(obj, WikiNode):
+            return obj.jsonEncode()
+        return json.JSONEncoder.default(self, obj)
+
+def jsonencoder(func):
+    """Add the 'wikinode' and 'type' keys to the dictionary returned by func."""
+    def _mkencoder(self):
+        ret = func(self)
+        ret['wikinode'] = self.__class__.__name__
+        ret['type'] = self.type
+        return ret
+    return _mkencoder
+
+class WikiNode(object):
+    """Generic parse tree node.
+
+    Attributes:
+
+    type -- actual type of this object (string)
+    parser -- parser instance that owns this node
+    """
+
+    type = 'UNDEF'
+    parser = None
+
+    def __init__(self, parser, **kwargs):
+        self.parser = parser
+        for key in kwargs:
+            if hasattr(self, key):
+                self.__dict__[key] = kwargs[key]
+            else:
+                raise AttributeError("'%s' has no attribute '%s'"
+                                     % (self.__class__.__name__, key))
+
+    def __str__(self):
+        return json.dumps(self, cls=WikiNodeEncoder, sort_keys=True)
+
+    @jsonencoder
+    def jsonEncode(self):
+        ret = {}
+        for x in dir(self):
+            # Skip the parser backlink, private attributes and methods.
+            if x == 'parser' or x.startswith('_') or callable(getattr(self, x)):
+                continue
+            if x in self.__dict__:
+                ret[x] = self.__dict__[x]
+        return ret
+
+    def format(self):
+        """Abstract formatting function.
+
+        Derived classes must override it.
+        """
+        pass
+
+class WikiContentNode(WikiNode):
+    """Generic content node.
+
+    Attributes:
+
+    content -- Actual content
+    """
+
+    content = None
+
+    def format(self):
+        pass
+
+    @jsonencoder
+    def jsonEncode(self):
+        ret = {}
+        if self.content:
+            if self.type == 'TEXT':
+                ret['content'] = self.content
+            elif isinstance(self.content, list):
+                ret['content'] = [x.jsonEncode() for x in self.content]
+            elif isinstance(self.content, WikiNode):
+                ret['content'] = self.content.jsonEncode()
+            else:
+                ret['content'] = self.content
+        else:
+            ret['content'] = None
+        return ret
+
+class WikiSeqNode(WikiContentNode):
+    """Generic sequence of nodes.
+
+    Attributes:
+
+    content -- list of nodes.
+    """
+
+    def format(self):
+        for x in self.content:
+            x.format()
+
+    @jsonencoder
+    def jsonEncode(self):
+        ret = {}
+        if not self.content:
+            ret['content'] = None
+        elif isinstance(self.content, list):
+            ret['content'] = [x.jsonEncode() for x in self.content]
+        elif isinstance(self.content, WikiNode):
+            ret['content'] = self.content.jsonEncode()
+        else:
+            ret['content'] = self.content
+        return ret
+
+
+# ##############
+
+class WikiTextNode(WikiContentNode):
+    """Text node.
+
+    Attributes:
+
+    type -- 'TEXT'
+    content -- string
+    """
+
+    type = 'TEXT'
+
+    @jsonencoder
+    def jsonEncode(self):
+        return {
+            'content': self.content
+        }
+
+class WikiDelimNode(WikiContentNode):
+    """Delimiter node.
+
+    Attributes:
+
+    type -- 'DELIM'
+    content -- actual delimiter string
+    isblock -- boolean indicating whether it is a block delimiter
+    continuation -- True if continuation is expected
+    """
+
+    type = 'DELIM'
+    isblock = False
+    continuation = False
+
+class WikiTagNode(WikiContentNode):
+    """A Wiki tag.
+
+    Attributes:
+
+    tag -- actual tag name (with '<', '>', and eventual '/' stripped)
+    isblock -- True if this is a block tag
+    args -- List of tag arguments
+    idx -- If this is a "see also" reference, index of this ref in the
+        list of references.
+        FIXME: Perhaps this merits a subclass?
+ """ + + tag = None + isblock = False + args = None + idx = None + + def __init__(self, *args, **keywords): + super(WikiTagNode, self).__init__(*args, **keywords) + if self.type == 'TAG' and self.tag == 'ref' and hasattr(self.parser,'references'): + self.idx = len(self.parser.references) + self.parser.references.append(self) + + @jsonencoder + def jsonEncode(self): + return { + 'tag': self.tag, + 'isblock': self.isblock, + 'args': self.args.tab if self.args else None, + 'content': self.content.jsonEncode() if self.content else None, + 'idx': self.idx + } + +class WikiRefNode(WikiContentNode): + """Reference node. + + This class represents a wiki reference, such as "[ref|content]". + + Attributes: + + ref -- actual reference + content -- content string + """ + + type = 'REF' + ref = None + @jsonencoder + def jsonEncode(self): + return { + 'ref': self.ref, + 'content': self.content.jsonEncode() + } + +class WikiHdrNode(WikiContentNode): + """A wiki markup header class. + + Attributes: + + level -- header level + content -- header content (WikiNode subclass object) + """ + + type = 'HDR' + level = None + + @jsonencoder + def jsonEncode(self): + return { + 'level': self.level, + 'content': self.content.jsonEncode() + } + +class WikiEltNode(WikiContentNode): + """Environment element node. + + Attributes: + + subtype -- type of the environment (numbered, unnumbered, defn) + content -- content of the element (WikiNode subclass object) + """ + + type = 'ELT' + subtype = None + + @jsonencoder + def jsonEncode(self): + return { + 'subtype': self.subtype, + 'content': self.content.jsonEncode() + } + +class WikiEnvNode(WikiContentNode): + """Wiki Environment Node + + Attributes: + + envtype -- type of the environment (numbered, unnumbered, defn) + level -- nesting level of the environment + """ + + type = 'ENV' + envtype = None + level = None + + @jsonencoder + def jsonEncode(self): + return { + 'envtype': self.envtype, + 'level': self.level, + 'content': map(lambda x: x.jsonEncode(), self.content) + } + +class WikiIndNode(WikiContentNode): + """Indented block node. + + Attributes: + + level -- Indentation level. + content -- Indented content (WikiNode subclass object). + """ + + type = 'IND' + level = None + + @jsonencoder + def jsonEncode(self): + return { + 'level': self.level, + 'content': self.content.jsonEncode() + } |