summary refs log tree commit diff about
author Sergey Poznyakoff <gray@Pirx.gnu.org.ua> 2008-11-26 06:06:06 (GMT)
committer Sergey Poznyakoff <gray@Pirx.gnu.org.ua> 2008-11-26 06:06:06 (GMT)
commit 5dc93e466efaaa243e6490961b6e545eaa65f06c (patch) (side-by-side diff)
tree 844b75613cabb2c0394828492038546c1f9806d8
download wikitrans-5dc93e466efaaa243e6490961b6e545eaa65f06c.tar.gz
wikitrans-5dc93e466efaaa243e6490961b6e545eaa65f06c.tar.bz2
Initial commit
Diffstat (more/less context) (ignore whitespace changes)
-rw-r--r-- __init__.py 18
-rw-r--r-- test.py 69
-rw-r--r-- testdata/colon.html 9
-rw-r--r-- testdata/colon.wiki 8
-rw-r--r-- testdata/dom.wiki 137
-rw-r--r-- testdata/door.html 200
-rw-r--r-- testdata/door.wiki 217
-rw-r--r-- testdata/drzwi.html 44
-rw-r--r-- testdata/drzwi.wiki 45
-rw-r--r-- testdata/headings.html 15
-rw-r--r-- testdata/headings.wiki 17
-rw-r--r-- testdata/hz.html 6
-rw-r--r-- testdata/hz.wiki 6
-rw-r--r-- testdata/numlist.html 7
-rw-r--r-- testdata/numlist.wiki 6
-rw-r--r-- testdata/unlist.html 10
-rw-r--r-- testdata/unlist.wiki 9
-rw-r--r-- wiki2html.py 503
-rw-r--r-- wiki2plain.py 452
-rw-r--r-- wikicvt.py 52
-rw-r--r-- wikimarkup.py 362
21 files changed, 2192 insertions, 0 deletions
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..f887a4c
--- a/dev/null
+++ b/__init__.py
@@ -0,0 +1,18 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2008 Sergey Poznyakoff
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+__all__ = [ "wiki2html", "wiki2plain" ]
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..22e5393
--- a/dev/null
+++ b/test.py
@@ -0,0 +1,69 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2008 Sergey Poznyakoff
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import unittest
+import wiki2html
+
+class TestMarkupParserBasic (unittest.TestCase):
+    """Compare wiki2html output with the expected HTML kept in testdata/."""
+
+    def test_colon(self):
+        self.assertTrue(self.__test('colon'))
+
+    def test_headings(self):
+        self.assertTrue(self.__test('headings'))
+
+    def test_hz(self):
+        self.assertTrue(self.__test('hz'))
+
+    def test_numlist(self):
+        self.assertTrue(self.__test('numlist'))
+
+    def test_unlist(self):
+        self.assertTrue(self.__test('unlist'))
+
+    def test_door(self):
+        self.assertTrue(self.__test('door'))
+
+    def test_drzwi(self):
+        self.assertTrue(self.__test('drzwi'))
+
+    def __test(self, filename):
+        """Tell whether testdata/FILENAME.wiki converts to FILENAME.html."""
+        name_in = 'testdata/' + filename + '.wiki'
+        name_out = 'testdata/' + filename + '.html'
+        fh = open(name_out)
+        # Release the fixture's file handle even if reading it raises.
+        try:
+            buf = fh.read().strip()
+        finally:
+            fh.close()
+        hwm = wiki2html.HtmlWiktionaryMarkup(filename=name_in, lang="pl")
+        hwm.parse()
+        result = str(hwm).strip()
+
+        if result == buf:
+            return True
+
+        # fail: show the expected text, then what the converter produced
+        print("\n>>>%s<<<" % buf)
+        print(">>>%s<<<" % result)
+        return False
+
+if __name__ == '__main__':  # run the test suite when executed as a script
+ unittest.main()
+
diff --git a/testdata/colon.html b/testdata/colon.html
new file mode 100644
index 0000000..9721b93
--- a/dev/null
+++ b/testdata/colon.html
@@ -0,0 +1,9 @@
+<dl><dd> A colon (:) indents a line or paragraph.
+</dd></dl>A newline starts a new paragraph.
+Should only be used on talk pages.
+For articles, you probably want the blockquote tag.
+<dl><dd> We use 1 colon to indent once.
+</dd><dl><dd> We use 2 colons to indent twice.
+</dd><dl><dd> 3 colons to indent 3 times, and so on.
+</dd></dl></dl></dl>
+
diff --git a/testdata/colon.wiki b/testdata/colon.wiki
new file mode 100644
index 0000000..2a00eee
--- a/dev/null
+++ b/testdata/colon.wiki
@@ -0,0 +1,8 @@
+: A colon (:) indents a line or paragraph.
+A newline starts a new paragraph.
+Should only be used on talk pages.
+For articles, you probably want the blockquote tag.
+: We use 1 colon to indent once.
+:: We use 2 colons to indent twice.
+::: 3 colons to indent 3 times, and so on.
+
diff --git a/testdata/dom.wiki b/testdata/dom.wiki
new file mode 100644
index 0000000..30803c6
--- a/dev/null
+++ b/testdata/dom.wiki
@@ -0,0 +1,137 @@
+[[cs:dom]] [[de:dom]] [[el:dom]] [[en:dom]] [[es:dom]] [[fr:dom]] [[ko:dom]] [[hr:dom]] [[io:dom]] [[id:dom]] [[is:dom]] [[it:dom]] [[ky:dom]] [[ku:dom]] [[lt:dom]] [[li:dom]] [[hu:dom]] [[nl:dom]] [[oc:dom]] [[om:dom]] [[pt:dom]] [[sl:dom]] [[fi:dom]] [[sv:dom]] [[vi:dom]] [[tr:dom]] [[uk:dom]] [[vo:dom]] [[zh:dom]]
+{{zobteż|DOM|Dom}}
+== dom ({{język polski}}) ==
+[[Grafika:BrunnHeiligenstadtBauernhaus.jpg|thumb|right|200px|dom (1.1)]]
+{{wymowa}} {{lp}} {{IPA|dɔm}} {{audio|Pl-dom.ogg}} {{audio|dom.ogg}} {{lm}} {{IPA2|ˈdɔmɨ}}
+{{znaczenia}}
+''rzeczownik, rodzaj męski''
+: (1.1) [[budynek]] [[mieszkalny]]
+: (1.2) [[pomieszczenie]], [[miejsce]] [[stały|stałego]] [[zamieszkanie|zamieszkania]] ([[pobyt]]u)
+: (1.3) [[placówka]] [[społeczny|społeczna]] [[lub]] [[handlowy|handlowa]]
+: (1.4) [[ród]], [[rodzina]], [[dynastia]]
+{{odmiana}} {{lp}} dom, ~u, ~owi, ~, ~em, ~u, ~u; {{lm}} dom|y, ~ów, ~om, ~y, ~ami, ~ach, ~y
+{{przykłady}}
+: (1.1) ''[[w|W]] [[miasto|mieście]] [[wyrosnąć|wyrosło]] [[wiele]] [[nowy]]ch '''domów.'''''
+: (1.2) ''[[szkoła|Szkoła]] [[być|jest]] [[drugi]]m '''domem''' [[uczeń|ucznia]].''
+: (1.3) ''[[za|Za]] [[kradzież]] [[dokonywać|dokonaną]] [[w]] '''domu''' [[handlowy]]m [[trafiać|trafił]] [[do]] '''domu''' [[poprawczy|poprawczego]].''
+: (1.4) ''...[[Joanna]] Kowalska, [[z]] '''domu''' Nowak''.
+{{składnia}}
+{{kolokacje}} (1.1) [[budować]]/[[burzyć]]/[[remontować]] '''~''', [[stary]]/[[nawiedzony]] '''~'''; (1.2) [[uciec]] [[z]] '''~u''', [[nie]] [[mieć]] '''~u'''; (1.3) '''~''' [[poprawczy]]/[[handlowy]]/[[towarowy]]/[[studencki]]; (1.4) [[dziecko]] [[z]] [[dobry|dobrego]]/[[porządny|porządnego]]/[[biedny|biednego]] '''~u'''
+{{synonimy}} (1.1) [[blok]], [[budynek]], [[chałupa]], [[chata]], [[dach nad głową]], [[dwór]], [[gniazdo rodzinne]], [[kamienica]], [[mieszkanie]], [[ojcowska strzecha]], [[ognisko domowe]], [[pałac]], [[pielesze]], [[przybytek]], [[rezydencja]], [[siedlisko]], [[wieżowiec]], [[własny kąt]], [[zamek]]
+{{antonimy}}
+{{pokrewne}} {{rzecz}} [[bezdomność]], [[domator]]/[[domatorka]], [[domownik]], [[podomka]]; {{przym}} [[domowy]], [[przydomowy]]
+{{frazeologia}} (1.1) [[szklane domy]]; (1.2) [[dom boży]]; (1.3) [[dom dziecka]], [[dom publiczny]]
+{{etymologia}}
+{{uwagi}}
+{{tłumaczenia}}
+* angielski: (1.1) [[house]]; (1.2) [[home]]
+* arabski: (1.1) [[آلمنزل]]; (1.2) [[آلدار]]
+* białoruski: (1.1) [[дом]] {{m}}
+* bułgarski: (1.1) [[къща]] {{f}}
+* chorwacki: (1.1) [[kuća]] {{f}}
+* czeski: (1.1) [[dům]] {{m}}
+* dolnołużycki: (1.1) [[#dom (język dolnołużycki)|dom]] {{m}}
+* duński: (1.1) [[hus]] {{n}}; (1.2) [[hjem]] {{n}}
+* esperanto: (1.1) [[domo]]; (1.4) [[hejmo]]
+* fiński: (1.1) [[talo]]; (1.2) [[koti]]
+* francuski: (1.1-2,4) [[maison]] {{f}}
+* górnołużycki: (1.1) [[#dom (język górnołużycki)|dom]] {{m}}
+* grecki: (1.1-2,4) [[σπίτι]] {{n}}; (1.1-4) [[οίκος]] {{m}}; (1.3) [[κατάστημα]] {{n}}
+* hawajski: (1.1) [[hale]]
+* hiszpański: (1.1-4) [[casa]] {{f}}
+* hebrajski: (1.1-4) [[בית]] {{m}} (bajit)
+* interlingua: (1.1) [[casa]]
+* irlandzki: (1.1) [[teach]] {{m}}
+* islandzki: (1.1) [[hús]] {{n}}; (1.2) [[heimili]] {{n}}; (1.3) [[hús]] {{n}}; (1.4) [[ætt]]
+* japoński: (1.1) ([[うち]], uchi)
+* jidysz: (1.1) [[הויז]] {{n}} (hojz); (1.2) [[היים]] {{f}} (hejm); (1.3) ...[[בית]]־ {{n}} (bejs-...)
+* kaszubski: (1.1-2) [[dóm]] {{m}}, [[chëcz]] {{f}}
+* kataloński: (1.1-4) [[casa]] {{f}}
+* klingoński: (1.1) [[juh|juH]]
+* krymskotatarski: (1.1) [[üy]]
+* litewski: (1.1) [[namas]] {{m}}
+* macedoński: (1.1) [[куќа]] {{f}}
+* niemiecki: (1.1,3) [[Haus]] {{n}} (1.2) [[Heim]] {{n}}
+* norweski (bokmål): (1.1) [[hus#hus (język norweski)|hus]] {{n}}
+* perski: (1.1) [[خانِه|خانه]] (khāneh)
+* portugalski: (1.1,3) [[casa]] {{f}}; (1.2) [[casa]] {{f}}, [[lar]] {{m}}, [[residência]] {{f}}
+* rosyjski: (1.1) [[дом]] {{m}}
+* rumuński: (1.1) [[casă]] {{f}}
+* serbski: (1.) [[дом]] {{m}}
+* slovio: (1.1) [[domo]]
+* słowacki: (1.1) [[#dom (język słowacki)|dom]]
+* szwedzki: (1.1) [[hus#hus (język szwedzki)|hus]] {{n}} (1.2) [[hem]] {{n}}
+* turecki: (1.1) [[ev]]
+* ukraiński: (1.1-2) [[дім]] {{m}}; (1.3) [[будинок]] {{m}}
+* węgierski: (1.1) [[ház]]
+* włoski (1.1) [[casa]] {{f}}
+
+== dom ({{język dolnołużycki}}) ==
+{{wymowa}}
+{{znaczenia}}
+''rzeczownik, rodzaj męski''
+: (1.1) [[#dom (język polski)|dom]]
+: (1.2) [[katedra]]
+{{odmiana}} {{lp}} dom, ~a, ~oju, ~, ~om, ~je; {{du}} ~a, ~owo, ~owa, ~a, ~oma, ~oma; {{lm}} ~y, ~ow, ~am, ~y, ~ami, ~ach
+{{przykłady}}
+: (1.1) ''[[wón|Wón]] [[byś|jo]] [[wóna|jej]] '''dom''' [[wugotowaś|wugótował]]''. → [[on#on (język polski)|On]] [[zapisać|zapisał]] [[ona|jej]] '''[[#dom (język polski)|dom]]''' [[w]] [[testament|testamencie]].
+{{składnia}}
+{{kolokacje}} '''dom''' [[za]] [[stary#stary (język dolnołużycki)|starych]] [[luź]]i → [[dom]] [[starzec|starców]]
+{{synonimy}}
+{{antonimy}}
+{{pokrewne}} {{przysł}} [[doma]], [[domoj]]
+{{frazeologia}}
+{{etymologia}} (1.2) {{etym|niem|Dom}}
+{{uwagi}}
+
+== dom ({{esperanto}}) ==
+{{wymowa}}
+{{znaczenia}}
+''morfem''
+: (1.1) [[#dom (język polski)|dom]] ''(budynek)''
+{{odmiana}}
+{{przykłady}}
+: (1.1)
+{{składnia}}
+{{kolokacje}}
+{{synonimy}}
+{{antonimy}}
+{{pochodne}} {{rzecz}} [[domo]]
+{{frazeologia}}
+{{etymologia}}
+{{uwagi}} {{por|hejm}}
+
+== dom ({{język górnołużycki}}) ==
+{{wymowa}}
+{{znaczenia}}
+''rzeczownik, rodzaj męski''
+: (1.1) [[#dom (język polski)|dom]]
+: (1.2) [[katedra]]
+{{odmiana}}
+{{przykłady}}
+: (1.1)
+{{składnia}}
+{{kolokacje}}
+{{synonimy}}
+{{antonimy}}
+{{pokrewne}}
+{{frazeologia}}
+{{etymologia}}
+{{uwagi}}
+
+== dom ({{slovio}}) ==
+{{wymowa}}
+{{znaczenia}}
+''rzeczownik''
+: (1.1) [[#dom (język polski)|dom]]
+{{odmiana}}
+{{przykłady}}
+: (1.1) ''[[oni|Oni]] [[kupit|kupili]] [[starju]] '''dom''' [[vo]] [[malgrod]]''. → Oni [[kupić|kupili]] [[stary]] '''[[#dom (język polski)|dom]]''' [[na]] [[wieś|wsi]].
+{{składnia}}
+{{kolokacje}}
+{{synonimy}} (1.1) [[domo]]
+{{antonimy}}
+{{pokrewne}} {{czas}} [[domovit]]
+{{frazeologia}}
+{{etymologia}}
+{{uwagi}} ''zapis cyrylicą'' [[дом]]
diff --git a/testdata/door.html b/testdata/door.html
new file mode 100644
index 0000000..c4bb0a9
--- a/dev/null
+++ b/testdata/door.html
@@ -0,0 +1,200 @@
+<br/><b>wikipedia:</b><br/>
+<h1>English</h1>
+<br/><b>rank:</b><br/>
+
+<h2>Pronunciation</h2><a href="http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf/Doorway%20La%20Ronce%20National%20Trust%20for%20Jersey.jpg/250px-Doorway%20La%20Ronce%20National%20Trust%20for%20Jersey.jpg">thumb|right|A door.</a>
+<ul><li> <b>a</b> <b>enPR</b>, <b>IPA</b>, <b>SAMPA</b>
+</li><ul><li> <b>rhymes</b>
+</li></ul><li> <b>a</b> <b>enPR</b>, <b>IPA</b>, <b>SAMPA</b>
+</li><ul><li> <b>audio</b>
+</li></ul><li> <b>homophones</b> <b>qualifier</b>
+</li></ul>
+<h2>Etymology</h2>From <b>etyl</b> <b>term</b> &lt; <b>etyl</b> <b>term</b>, <b>term</b> &lt; <b>proto</b> &lt; <b>proto</b>. Cognates include Gothic <b>term</b>, Danish <b>term</b>, German <b>term</b> ( &lt; Old High German <b>term</b>), Icelandic <b>term</b> ( &lt; Old Norse <b>term</b>), Latin <b>term</b>, Modern Greek <b>term</b> ( &lt; Ancient Greek <b>term</b>), Persian <b>term</b>, and Russian <b>term</b>.
+
+<h2>Noun</h2><br/><b>wikipedia:</b><br/>
+<br/><b>en-noun:</b><br/>
+
+<ol><li> A <a href="http://pl.wiktionary.org/portal">portal</a> of entry into a building or room, consisting of a rigid plane movable on a <a href="http://pl.wiktionary.org/hinge">hinge</a>. Doors are frequently made of <a href="http://pl.wiktionary.org/wood">wood</a> or <a href="http://pl.wiktionary.org/metal">metal</a>. May have a <a href="http://pl.wiktionary.org/handle">handle</a> to help open and close, a <a href="http://pl.wiktionary.org/latch">latch</a> to hold the door closed <b>,</b> and a <a href="http://pl.wiktionary.org/lock">lock</a> that ensures the door cannot be opened without the key.
+</li><ol><li> <i>I knocked on the vice president's <b>door<i></i></b>
+</i></li></ol><li> An non-physical entry into the next world, a particular feeling, a company, etc.
+</li><ol><li> <i>Keep a <b>door</b>on your anger.</i>
+</li></ol></ol>
+<h3>Translations</h3><br/><b>trans-top:</b><br/>
+<ul><li> Albanian: <a href="http://pl.wiktionary.org/der%C3%AB">derë</a>
+</li><li> Arabic: <b>Arab</b> <b>IPAchar</b> <b>m</b>, <b>Arab</b> <b>IPAchar</b> <b>p</b>
+</li><li> Aramaic:
+</li><ul><li> Syriac: <a href="http://pl.wiktionary.org/%DC%AC%DC%AA%DC%A5%DC%90">ܬܪܥܐ</a> (tar‘ā’, tar‘o’) <b>m</b>
+</li><li> Hebrew: <a href="http://pl.wiktionary.org/%D7%AA%D7%A8%D7%A2%D7%90">תרעא</a> (tar‘ā’, tar‘o’) <b>m</b>
+</li></ul><li> Armenian: <b>t+</b> (duṙ)
+</li><li> <a href="http://pl.wiktionary.org/Basque">Basque</a>: <a href="http://pl.wiktionary.org/ate">ate</a>
+</li><li> Bengali: <a href="http://pl.wiktionary.org/%E0%A6%A6%E0%A6%B0%E0%A6%9C%E0%A6%BE">দরজা</a>
+</li><li> Bosnian: <a href="http://pl.wiktionary.org/vrata">vrata</a> <b>f</b>/ <b>p</b>
+</li><li> <a href="http://pl.wiktionary.org/Breton">Breton</a>: <a href="http://pl.wiktionary.org/dor">dor</a> <b>f</b>, dorioù <b>p</b> <i>(note:</i> nor <i>after article in the singular)</i>
+</li><li> Bulgarian: <b>t+</b>
+</li><li> <a href="http://pl.wiktionary.org/Catalan">Catalan</a>: <a href="http://pl.wiktionary.org/porta">porta</a> <b>f</b>
+</li><li> Chinese: <a href="http://pl.wiktionary.org/%E9%96%80">門</a>, <a href="http://pl.wiktionary.org/%E9%97%A8">门</a> (mén)
+</li><li> Croatian: <b>t-</b>
+</li><li> Czech: <b>t-</b>
+</li><li> Danish: <b>t-</b>
+</li><li> Dutch: <b>t+</b>
+</li><li> Esperanto: <b>t+</b>
+</li><li> Estonian: <b>t+</b>
+</li><li> <a href="http://pl.wiktionary.org/Ewe">Ewe</a>: <a href="http://pl.wiktionary.org/%CA%8B%C9%94tru">ʋɔtru</a> <b>n</b>
+</li><li> Faroese: <b>t-</b>, <b>t-</b>
+</li><li> Finnish: <b>t+</b>
+</li><li> French: <b>t+</b>
+</li><li> Georgian: <a href="http://pl.wiktionary.org/%E1%83%99%E1%83%90%E1%83%A0%E1%83%98">კარი</a> (kari)
+</li><li> German: <a href="http://pl.wiktionary.org/T%C3%BCr">Tür</a> <b>f</b>, <a href="http://pl.wiktionary.org/T%C3%BCren">Türen</a> <b>p</b>, <a href="http://pl.wiktionary.org/T%C3%BCre">Türe</a> <b>f</b>
+</li><li> Greek: <b>t+</b>, <b>t+</b>
+</li><ul><li> Ancient Greek: <b>t+</b>
+</li></ul><li> Hebrew: <a href="http://pl.wiktionary.org/%D7%93%D7%9C%D7%AA">דלת</a> (délet) <b>f</b>
+</li><li> Hindi: <a href="http://pl.wiktionary.org/%E0%A4%A6%E0%A4%B0%E0%A4%B5%E0%A4%BE%E0%A4%9C%E0%A4%BC%E0%A4%BE">दरवाज़ा</a> (darvāzā) <b>m</b>
+</li><li> Hungarian: <a href="http://pl.wiktionary.org/ajt%C3%B3">ajtó</a>, <a href="http://pl.wiktionary.org/kapu">kapu</a>, <a href="http://pl.wiktionary.org/bej%C3%A1rat">bejárat</a>; <a href="http://pl.wiktionary.org/ajt%C3%B3ny%C3%ADl%C3%A1s">ajtónyílás</a>
+</li><li> Indonesian: <b>t-</b>
+</li><li> Irish: <b>t-</b>
+</li><li> Italian: <b>t+</b>, <b>t-</b>, <b>t+</b>
+</li><li> Japanese: <a href="http://pl.wiktionary.org/%E6%88%B8">戸</a> (<a href="http://pl.wiktionary.org/%E3%81%A8">と</a>, to), <a href="http://pl.wiktionary.org/%E6%89%89">扉</a> (<a href="http://pl.wiktionary.org/%E3%81%A8%E3%81%B3%E3%82%89">とびら</a>, tobira), <a href="http://pl.wiktionary.org/%E3%83%89%E3%82%A2">ドア</a> (dóa)
+</li><li> <b>trreq</b>
+</li><li> Korean: <a href="http://pl.wiktionary.org/%EB%AC%B8">문</a> (mun)
+</li><li> Kurdish: <b>t+</b>, <b>t+</b>, <b>t+</b>, <b>t+</b>
+</li><li> Lao: <b>t</b>
+</li><li> Latin: <b>t-</b>, <b>t+</b>
+</li></ul><br/><b>trans-mid:</b><br/>
+<ul><li> Latvian: <a href="http://pl.wiktionary.org/durvis">durvis</a> <b>m</b>
+</li><li> Lithuanian: <b>t-</b>
+</li><li> <a href="http://pl.wiktionary.org/Lower%20Sorbian">Lower Sorbian</a>: <a href="http://pl.wiktionary.org/%C5%BAurja">źurja</a> <b>p</b>
+</li><li> Malay: <b>t+</b>
+</li><li> <a href="http://pl.wiktionary.org/Malayalam">Malayalam</a>: <a href="http://pl.wiktionary.org/%E0%B4%B5%E0%B4%BE%E0%B4%A4%E0%B4%BF%E0%B4%B2%E0%B5%8D%E2%80%8D">വാതില്‍</a>, <a href="http://pl.wiktionary.org/%E0%B4%95%E0%B4%A4%E0%B4%95%E0%B5%8D">കതക്</a>, <a href="http://pl.wiktionary.org/%E0%B4%95%E0%B4%B5%E0%B4%BE%E0%B4%9F%E0%B4%82">കവാടം</a>, <a href="http://pl.wiktionary.org/%E0%B4%AA%E0%B5%8D%E0%B4%B0%E0%B4%B5%E0%B5%87%E0%B4%B6%E0%B4%A8%E0%B4%AE%E0%B5%81%E0%B4%96%E0%B4%82">പ്രവേശനമുഖം</a>
+</li><li> Maltese: <a href="http://pl.wiktionary.org/bieb">bieb</a> <b>m</b>
+</li><li> <a href="http://pl.wiktionary.org/Marathi">Marathi</a>: <a href="http://pl.wiktionary.org/%E0%A4%A6%E0%A4%B0%E0%A4%B5%E0%A4%BE%E0%A4%9C%E0%A4%BE">दरवाजा</a> (darvājā), <a href="http://pl.wiktionary.org/%E0%A4%A6%E0%A4%BE%E0%A4%B0">दार</a> (dār)
+</li><li> Mongolian: <a href="http://pl.wiktionary.org/%D2%AF%D2%AF%D0%B4">үүд</a> (üüd)
+</li><li> Norwegian: <b>t-</b>
+</li><li> Old English: <b>t+</b>, <b>t-</b>
+</li><li> <b>trreq</b>
+</li><li> Persian: <b>fa-Arab</b> (dar)
+</li><li> Polish: <b>t+</b>
+</li><li> Portuguese: <b>t+</b>
+</li><li> <a href="http://pl.wiktionary.org/Punjabi">Punjabi</a>: <a href="http://pl.wiktionary.org/%E0%A8%AC%E0%A9%82%E0%A8%B9%E0%A8%BE">ਬੂਹਾ</a> (būhā), <a href="http://pl.wiktionary.org/%E0%A8%A6%E0%A8%B0%E0%A8%B5%E0%A8%BE%E0%A8%9C%E0%A8%BC%E0%A8%BE">ਦਰਵਾਜ਼ਾ</a> (darvāzā)
+</li><li> Romanian: <b>t+</b>
+</li><li> Russian: <a href="http://pl.wiktionary.org/%D0%B4%D0%B2%D0%B5%D1%80%D1%8C">дверь</a> (dver’) <b>f</b>
+</li><li> Sanskrit: <a href="http://pl.wiktionary.org/%E0%A4%A6%E0%A5%8D%E0%A4%B5%E0%A4%BE%E0%A4%B0%E0%A4%82">द्वारं</a>
+</li><li> <a href="http://pl.wiktionary.org/Scottish%20Gaelic">Scottish Gaelic</a>: <a href="http://pl.wiktionary.org/dorus">dorus</a> <b>m</b>
+</li><li> Serbian: <b>t-</b>, <b>t-</b>
+</li><li> Slovak: <b>t-</b>
+</li><li> Slovene: <b>t+</b>
+</li><li> Spanish: <b>t+</b>
+</li><li> Swahili: <a href="http://pl.wiktionary.org/mlango">mlango</a>
+</li><li> Swedish: <b>t+</b>
+</li><li> <a href="http://pl.wiktionary.org/Tagalog">Tagalog</a>: <a href="http://pl.wiktionary.org/pinto">pinto</a>
+</li><li> <a href="http://pl.wiktionary.org/Tamil">Tamil</a>: <a href="http://pl.wiktionary.org/%E0%AE%95%E0%AE%A4%E0%AE%B5%E0%AF%81">கதவு</a> (kathavu)
+</li><li> <a href="http://pl.wiktionary.org/Taos">Taos</a>: <a href="http://pl.wiktionary.org/k%C9%99%CC%8Fd%C3%A9nem%C4%85">kə̏dénemą</a>
+</li><li> <a href="http://pl.wiktionary.org/Telugu">Telugu</a>: <a href="http://pl.wiktionary.org/%E0%B0%A4%E0%B0%B2%E0%B1%81%E0%B0%AA%E0%B1%81">తలుపు</a>, <a href="http://pl.wiktionary.org/%E0%B0%A6%E0%B1%8D%E0%B0%B5%E0%B0%BE%E0%B0%B0%E0%B0%AE%E0%B1%81">ద్వారము</a>
+</li><li> Thai: <b>Thai</b> (bprà-dtoo)
+</li><li> Turkish: <b>t+</b>
+</li><li> Ukrainian: <b>t+</b>
+</li><li> <a href="http://pl.wiktionary.org/Upper%20Sorbian">Upper Sorbian</a>: <a href="http://pl.wiktionary.org/durje">durje</a> <b>p</b>
+</li><li> Urdu: <b>ur-Arab</b> (darvāza) <b>m</b>
+</li><li> Vietnamese: <b>t+</b>
+</li><li> Welsh: <b>t-</b>
+</li><li> <a href="http://pl.wiktionary.org/West%20Frisian">West Frisian</a>: <a href="http://pl.wiktionary.org/doar">doar</a>
+</li><li> Yiddish: <a href="http://pl.wiktionary.org/%D7%98%D7%99%D7%A8">טיר</a> (tir) <b>f</b>
+</li><li> <b>trreq</b>
+</li></ul><br/><b>trans-bottom:</b><br/>
+
+<h3>Derived terms</h3><ul><li> <a href="http://pl.wiktionary.org/door%20brake">door brake</a>
+</li><li> <a href="http://pl.wiktionary.org/sliding%20door">sliding door</a>
+</li><li> <a href="http://pl.wiktionary.org/up%20and%20over%20door">up and over door</a>
+</li><li> <a href="http://pl.wiktionary.org/show%20somebody%20the%20door">show somebody the door</a>
+</li></ul>
+<h2>Verb</h2><br/><b>en-verb:</b><br/>
+
+<ol><li> <b>transitive</b> To cause a collision by opening the door of a vehicle in the front of (an oncoming cyclist or pedestrian).
+</li></ol>
+<a href="http://pl.wiktionary.org/Category%3A1000%20English%20basic%20words">Category:1000 English basic words</a>
+
+<hr/>
+<h1>Dutch</h1>
+<h2>Pronunciation</h2><ul><li> <b>audio</b>
+</li><li> <b>IPA</b>
+</li></ul>
+<h2>Preposition</h2><br/><b>infl:</b><br/>
+
+<ol><li> <a href="http://pl.wiktionary.org/through">through</a>
+</li><ol><li> <i>Hij schoot de bal <b>door</b>het raam.</i> &amp;mdash; He kicked the ball <b>through</b>the window.
+</li></ol><li> <a href="http://pl.wiktionary.org/around">around</a> within an enclosed space
+</li><ol><li> <i>Dolenthousiast rende het hondje <b>door</b>de kamer.</i> &amp;mdash; Very enthusiastically the puppy ran <b>around</b>the room.
+</li></ol><li> <a href="http://pl.wiktionary.org/because%20of">because of</a>
+</li><ol><li> <b><i>Door</i></b>files kan ik niet op tijd komen.<i> &amp;mdash; <b>Because of</b>traffic jams I'm unable to arrive on time.
+</i></li></ol></ol>
+<h3>Synonyms</h3><ul><li> <b>sense</b> <a href="http://pl.wiktionary.org/vanwege">vanwege</a>
+</li></ul>
+<h2>Postposition</h2><br/><b>infl:</b><br/>
+
+<ol><li> <a href="http://pl.wiktionary.org/through">through</a> (implying motion)
+</li><ol><li> <i>Ik rijd nu de stad <b>door</b></i> &amp;mdash; I'm now driving <b>through</b>the city.
+</li></ol><li> <a href="http://pl.wiktionary.org/around">around</a> within an enclosed space
+</li><ol><li> <i>Dolenthousiast rende het hondje de kamer <b>door</b></i> &amp;mdash; Very enthusiastically the puppy ran <b>around</b>the room.
+</li></ol></ol>
+<h2>Adverb</h2><br/><b>infl:</b><br/>
+
+<ol><li> <a href="http://pl.wiktionary.org/through">through</a>, <a href="http://pl.wiktionary.org/forward">forward</a>, <a href="http://pl.wiktionary.org/on">on</a>
+</li><ol><li> <i>Ondanks slecht weer ging het feest ging toch <b>door.<i></i></b>&amp;mdash; Despite bad weather, the party went <b>on</b>anyway.
+</i></li></ol></ol>
+<h3>Derived terms</h3><br/><b>top4:</b><br/>
+<ul><li> <a href="http://pl.wiktionary.org/doorgaan">doorgaan</a>
+</li><li> <a href="http://pl.wiktionary.org/doorgang">doorgang</a>
+</li><li> <a href="http://pl.wiktionary.org/doorgeven">doorgeven</a>
+</li><li> <a href="http://pl.wiktionary.org/doorstaan">doorstaan</a>
+</li><li> <a href="http://pl.wiktionary.org/doorstart">doorstart</a>
+</li><li> <a href="http://pl.wiktionary.org/doortocht">doortocht</a>
+</li><li> <a href="http://pl.wiktionary.org/doorwerken">doorwerken</a>
+</li><li> <a href="http://pl.wiktionary.org/doorzichtig">doorzichtig</a>
+</li></ul><br/><b>bottom:</b><br/>
+
+<h2>Conjunction</h2><br/><b>infl:</b><br/>
+
+<ol><li> <a href="http://pl.wiktionary.org/by">by</a>
+</li><ol><li> <i>Hij vermeed een confrontatie <b>door</b>de andere kant op te lopen.</i> &amp;mdash; He avoided a confrontation <b>by</b>walking the other way.
+</li></ol></ol>
+<h3>Derived terms</h3><ul><li> <a href="http://pl.wiktionary.org/door%20middel%20van">door middel van</a>
+</li></ul>
+<a href="http://am.wiktionary.org/door">አማርኛ</a>
+<a href="http://ang.wiktionary.org/door">Englisc</a>
+<a href="http://ar.wiktionary.org/door">العربية</a>
+<a href="http://cs.wiktionary.org/door">Česky</a>
+<a href="http://de.wiktionary.org/door">Deutsch</a>
+<a href="http://et.wiktionary.org/door">Eesti</a>
+<a href="http://el.wiktionary.org/door">Ελληνικά</a>
+<a href="http://es.wiktionary.org/door">Español</a>
+<a href="http://fa.wiktionary.org/door">فارسی</a>
+<a href="http://fr.wiktionary.org/door">Français</a>
+<a href="http://ko.wiktionary.org/door">한국어</a>
+<a href="http://hy.wiktionary.org/door">Հայերեն</a>
+<a href="http://io.wiktionary.org/door">Ido</a>
+<a href="http://id.wiktionary.org/door">Bahasa Indonesia</a>
+<a href="http://it.wiktionary.org/door">Italiano</a>
+<a href="http://kk.wiktionary.org/door">Қазақша</a>
+<a href="http://ku.wiktionary.org/door">Kurdî / كوردی</a>
+<a href="http://lo.wiktionary.org/door">ລາວ</a>
+<a href="http://lt.wiktionary.org/door">Lietuvių</a>
+<a href="http://li.wiktionary.org/door">Limburgs</a>
+<a href="http://hu.wiktionary.org/door">Magyar</a>
+<a href="http://nl.wiktionary.org/door">Nederlands</a>
+<a href="http://ja.wiktionary.org/door">日本語</a>
+<a href="http://no.wiktionary.org/door">Norsk (Bokmål)</a>
+<a href="http://oc.wiktionary.org/door">Occitan</a>
+<a href="http://ug.wiktionary.org/door">Oyghurque</a>
+<a href="http://km.wiktionary.org/door">ភាសាខ្មែរ</a>
+<a href="http://pl.wiktionary.org/door">Polski</a>
+<a href="http://pt.wiktionary.org/door">Português</a>
+<a href="http://simple.wiktionary.org/door">Simple English</a>
+<a href="http://sr.wiktionary.org/door">Српски / Srpski</a>
+<a href="http://fi.wiktionary.org/door">Suomi</a>
+<a href="http://sv.wiktionary.org/door">Svenska</a>
+<a href="http://ta.wiktionary.org/door">தமிழ்</a>
+<a href="http://te.wiktionary.org/door">తెలుగు</a>
+<a href="http://th.wiktionary.org/door">ไทย</a>
+<a href="http://vi.wiktionary.org/door">Tiếng Việt</a>
+<a href="http://tr.wiktionary.org/door">Türkçe</a>
+<a href="http://uk.wiktionary.org/door">Українська</a>
+<a href="http://zh.wiktionary.org/door">中文</a>
diff --git a/testdata/door.wiki b/testdata/door.wiki
new file mode 100644
index 0000000..0be6131
--- a/dev/null
+++ b/testdata/door.wiki
@@ -0,0 +1,217 @@
+{{wikipedia|Door (disambiguation)}}
+==English==
+
+{{rank|myself|morning|money|275|door|round|kind|form}}
+
+===Pronunciation===
+[[Image:Doorway La Ronce National Trust for Jersey.jpg|thumb|right|A door.]]
+* {{a|RP}} {{enPR|dô(r)}}, {{IPA|/dɔː(ɹ)/}}, {{SAMPA|/dO:(r)/}}
+*: {{rhymes|ɔː(r)}}
+* {{a|US}} {{enPR|dôr}}, {{IPA|/dɔːɹ/|/doʊɹ/}}, {{SAMPA|/dO:r/|/doUr/}}
+*: {{audio|en-us-door.ogg|Audio (US)}}
+* {{homophones|daw}} {{qualifier|in [[non-rhotic]] accents}}
+
+===Etymology===
+From {{etyl|enm|en}} {{term|dor|lang=enm}} &lt; {{etyl|ang|en}} {{term|duru||door|lang=ang}}, {{term|dor||gate|lang=ang}} &lt; {{proto|Germanic|dur-|lang=en}} &lt; {{proto|Indo-European|dʰwer-||dʰwor-|doorway, door, gate|lang=en}}. Cognates include Gothic {{term|𐌳𐌰𐌿𐍂|sc=Goth|tr=daúr|lang=got}}, Danish {{term|dør}}, German {{term|Tür|lang=de}} ( &lt; Old High German {{term|turi|lang=goh}}), Icelandic {{term|dyr|lang=is}} ( &lt; Old Norse {{term|dyrr|lang=non}}), Latin {{term|foris|lang=la}}, Modern Greek {{term|sc=Grek|θύρα|tr=thýra}} ( &lt; Ancient Greek {{term|sc=polytonic|θύρα|tr=thura|lang=grc}}), Persian {{term|sc=fa-Arab|در|tr=dar|lang=fa}}, and Russian {{term|sc=Cyrl|дверь|tr=dver’|lang=ru}}.
+
+===Noun===
+{{wikipedia}}
+{{en-noun}}
+
+# A [[portal]] of entry into a building or room, consisting of a rigid plane movable on a [[hinge]]. Doors are frequently made of [[wood]] or [[metal]]. May have a [[handle]] to help open and close, a [[latch]] to hold the door closed{{,}} and a [[lock]] that ensures the door cannot be opened without the key.
+#: ''I knocked on the vice president's '''door'''''
+# An non-physical entry into the next world, a particular feeling, a company, etc.
+#: ''Keep a '''door''' on your anger.''
+
+====Translations====
+{{trans-top|portal of entry into a building or room}}
+* Albanian: [[derë]]
+* Arabic: {{Arab|[[باب|بَابٌ]]}} {{IPAchar|(bāb)}} {{m}}, {{Arab|[[باب|أبْوَاب]]}} {{IPAchar|(’abwāb)}} {{p}}
+* Aramaic:
+*: Syriac: [[ܬܪܥܐ]] (tar‘ā’, tar‘o’) {{m}}
+*: Hebrew: [[תרעא]] (tar‘ā’, tar‘o’) {{m}}
+* Armenian: {{t+|hy|դուռ|sc=Hayeren|xs=Armenian}} (duṙ)
+* [[Basque]]: [[ate]]
+* Bengali: [[দরজা]]
+* Bosnian: [[vrata]] {{f|s}}/{{p}}
+* [[Breton]]: [[dor]] {{f}}, dorioù {{p}} ''(note:'' nor ''after article in the singular)''
+* Bulgarian: {{t+|bg|врата|n|p|tr=vrata|sc=Cyrl}}
+* [[Catalan]]: [[porta]] {{f}}
+* Chinese: [[門]], [[门]] (mén)
+* Croatian: {{t-|hr|vrata|n|p}}
+* Czech: {{t-|cs|dveře|f|p}}
+* Danish: {{t-|da|dør}}
+* Dutch: {{t+|nl|deur|f}}
+* Esperanto: {{t+|eo|pordo|xs=Esperanto}}
+* Estonian: {{t+|et|uks}}
+* [[Ewe]]: [[ʋɔtru]] {{n}}
+* Faroese: {{t-|fo|dyr|xs=Faroese}}, {{t-|fo|hurð|xs=Faroese}}
+* Finnish: {{t+|fi|ovi}}
+* French: {{t+|fr|porte|f}}
+* Georgian: [[კარი]] (kari)
+* German: [[Tür]] {{f}}, [[Türen]] {{p}}, [[Türe]] {{f}}
+* Greek: {{t+|el|πόρτα|f|tr=pórta|sc=Grek}}, {{t+|el|θύρα|f|tr=thýra|sc=Grek}}
+** Ancient Greek: {{t+|el|θύρα|f|tr=thýra|sc=Grek}}
+* Hebrew: [[דלת]] (délet) {{f}}
+* Hindi: [[दरवाज़ा]] (darvāzā) {{m}}
+* Hungarian: [[ajtó]], [[kapu]], [[bejárat]]; [[ajtónyílás]]
+* Indonesian: {{t-|id|pintu|xs=Indonesian}}
+* Irish: {{t-|ga|doras|m|xs=Irish}}
+* Italian: {{t+|it|porta|f}}, {{t-|it|portiera|f}}, {{t+|it|sportello|m}}
+* Japanese: [[戸]] ([[と]], to), [[扉]] ([[とびら]], tobira), [[ドア]] (dóa)
+* {{trreq|Kannada}}
+* Korean: [[문]] (mun)
+* Kurdish: {{t+|ku|derî|m}}, {{t+|ku|dergeh|m}}, {{t+|ku|ده‌رگا|sc=KUchar}}, {{t+|ku|قاپی|sc=KUchar}}
+* Lao: {{t|lo|ປະຕູ|tr=pa-tuu|sc=Laoo}}
+* Latin: {{t-|la|ostium|n}}, {{t+|la|ianua|f}}
+{{trans-mid}}
+* Latvian: [[durvis]] {{m}}
+* Lithuanian: {{t-|lt|durys|xs=Lithuanian}}
+* [[Lower Sorbian]]: [[źurja]] {{p}}
+* Malay: {{t+|ms|pintu|xs=Malay}}
+* [[Malayalam]]: [[വാതില്‍]], [[കതക്]], [[കവാടം]], [[പ്രവേശനമുഖം]]
+* Maltese: [[bieb]] {{m}}
+* [[Marathi]]: [[दरवाजा]] (darvājā), [[दार]] (dār)
+* Mongolian: [[үүд]] (üüd)
+* Norwegian: {{t-|no|dør|m}}
+* Old English: {{t+|ang|duru|xs=Old English}}, {{t-|ang|dor|xs=Old English}}
+* {{trreq|Oriya}}
+* Persian: {{fa-Arab|[[در]]}} (dar)
+* Polish: {{t+|pl|drzwi|n|p}}
+* Portuguese: {{t+|pt|porta|f}}
+* [[Punjabi]]: [[ਬੂਹਾ]] (būhā), [[ਦਰਵਾਜ਼ਾ]] (darvāzā)
+* Romanian: {{t+|ro|uşă|f}}
+* Russian: [[дверь]] (dver’) {{f}}
+* Sanskrit: [[द्वारं]]
+* [[Scottish Gaelic]]: [[dorus]] {{m}}
+* Serbian: {{t-|sr|врата|f|sc=Cyrl}}, {{t-|sr|vrata|f}}
+* Slovak: {{t-|sk|dvere|f|p}}
+* Slovene: {{t+|sl|vrata|n|p}}
+* Spanish: {{t+|es|puerta|f}}
+* Swahili: [[mlango]]
+* Swedish: {{t+|sv|dörr|c}}
+* [[Tagalog]]: [[pinto]]
+* [[Tamil]]: [[கதவு]] (kathavu)
+* [[Taos]]: [[kə̏dénemą]]
+* [[Telugu]]: [[తలుపు]], [[ద్వారము]]
+* Thai: {{Thai|[[ประตู]]}} (bprà-dtoo)
+* Turkish: {{t+|tr|kapı}}
+* Ukrainian: {{t+|uk|двері|f|p|tr=dveri|sc=Cyrl|xs=Ukrainian}}
+* [[Upper Sorbian]]: [[durje]] {{p}}
+* Urdu: {{ur-Arab|[[دروازہ]]}} (darvāza) {{m}}
+* Vietnamese: {{t+|vi|cửa|xs=Vietnamese}}
+* Welsh: {{t-|cy|drws|xs=Welsh}}
+* [[West Frisian]]: [[doar]]
+* Yiddish: [[טיר]] (tir) {{f}}
+* {{trreq|Zulu}}
+{{trans-bottom}}
+
+====Derived terms====
+* [[door brake]]
+* [[sliding door]]
+* [[up and over door]]
+* [[show somebody the door]]
+
+===Verb===
+{{en-verb}}
+
+# {{transitive|cycling}} To cause a collision by opening the door of a vehicle in the front of (an oncoming cyclist or pedestrian).
+
+[[Category:1000 English basic words]]
+
+----
+
+==Dutch==
+
+===Pronunciation===
+* {{audio|Nl-door.ogg|audio}}
+* {{IPA|lang=nl|[dʊːr]}}
+
+===Preposition===
+{{infl|nl|preposition}}
+
+# [[through]]
+#: ''Hij schoot de bal '''door''' het raam.'' &amp;mdash; He kicked the ball '''through''' the window.
+# [[around]] within an enclosed space
+#: ''Dolenthousiast rende het hondje '''door''' de kamer.'' &amp;mdash; Very enthusiastically the puppy ran '''around''' the room.
+# [[because of]]
+#: '''''Door''' files kan ik niet op tijd komen.'' &amp;mdash; '''Because of''' traffic jams I'm unable to arrive on time.
+
+====Synonyms====
+* {{sense|because of}} [[vanwege]]
+
+===Postposition===
+{{infl|nl|postposition}}
+
+# [[through]] (implying motion)
+#: ''Ik rijd nu de stad '''door'''.'' &amp;mdash; I'm now driving '''through''' the city.
+# [[around]] within an enclosed space
+#: ''Dolenthousiast rende het hondje de kamer '''door'''.'' &amp;mdash; Very enthusiastically the puppy ran '''around''' the room.
+
+===Adverb===
+{{infl|nl|adverb}}
+
+# [[through]], [[forward]], [[on]]
+#: ''Ondanks slecht weer ging het feest ging toch '''door.''''' &amp;mdash; Despite bad weather, the party went '''on''' anyway.
+
+====Derived terms====
+{{top4}}
+* [[doorgaan]]
+* [[doorgang]]
+* [[doorgeven]]
+* [[doorstaan]]
+* [[doorstart]]
+* [[doortocht]]
+* [[doorwerken]]
+* [[doorzichtig]]
+{{bottom}}
+
+===Conjunction===
+{{infl|nl|conjunction}}
+
+# [[by]]
+#: ''Hij vermeed een confrontatie '''door''' de andere kant op te lopen.'' &amp;mdash; He avoided a confrontation '''by''' walking the other way.
+
+====Derived terms====
+* [[door middel van]]
+
+[[am:door]]
+[[ang:door]]
+[[ar:door]]
+[[cs:door]]
+[[de:door]]
+[[et:door]]
+[[el:door]]
+[[es:door]]
+[[fa:door]]
+[[fr:door]]
+[[ko:door]]
+[[hy:door]]
+[[io:door]]
+[[id:door]]
+[[it:door]]
+[[kk:door]]
+[[ku:door]]
+[[lo:door]]
+[[lt:door]]
+[[li:door]]
+[[hu:door]]
+[[nl:door]]
+[[ja:door]]
+[[no:door]]
+[[oc:door]]
+[[ug:door]]
+[[km:door]]
+[[pl:door]]
+[[pt:door]]
+[[simple:door]]
+[[sr:door]]
+[[fi:door]]
+[[sv:door]]
+[[ta:door]]
+[[te:door]]
+[[th:door]]
+[[vi:door]]
+[[tr:door]]
+[[uk:door]]
+[[zh:door]]
\ No newline at end of file
diff --git a/testdata/drzwi.html b/testdata/drzwi.html
new file mode 100644
index 0000000..904edcc
--- a/dev/null
+++ b/testdata/drzwi.html
@@ -0,0 +1,44 @@
+<a href="http://de.wiktionary.org/drzwi">Deutsch</a> <a href="http://el.wiktionary.org/drzwi">Ελληνικά</a> <a href="http://en.wiktionary.org/drzwi">English</a> <a href="http://eo.wiktionary.org/drzwi">Esperanto</a> <a href="http://fr.wiktionary.org/drzwi">Français</a> <a href="http://ko.wiktionary.org/drzwi">한국어</a> <a href="http://hr.wiktionary.org/drzwi">Hrvatski</a> <a href="http://ug.wiktionary.org/drzwi">Oyghurque</a> <a href="http://fi.wiktionary.org/drzwi">Suomi</a> <a href="http://tr.wiktionary.org/drzwi">Türkçe</a>
+<h1> drzwi ( <b>język polski</b>) </h1><a href="http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf/Doorway%20Hamptonne%20in%20Jersey.jpg/250px-Doorway%20Hamptonne%20in%20Jersey.jpg">thumb|right|200px|drzwi (1.1)</a>
+<br/><b>wymowa:</b><br/> <b>IPA</b> <b>audio</b>
+<br/><b>znaczenia:</b><br/>
+<i>rzeczownik, rodzaj niemęskoosobowy</i>
+<dl><dd>(1.1) <a href="http://pl.wiktionary.org/ruchomy">ruchome</a> <a href="http://pl.wiktionary.org/zakrycie">zakrycie</a> <a href="http://pl.wiktionary.org/otw%C3%B3r">otworu</a> <a href="http://pl.wiktionary.org/w">w</a> <a href="http://pl.wiktionary.org/%C5%9Bciana">ścianie</a> <a href="http://pl.wiktionary.org/umo%C5%BCliwia%C4%87">umożliwiającego</a> <a href="http://pl.wiktionary.org/przej%C5%9Bcie">przejście</a>; <a href="http://pl.wiktionary.org/sam">sam</a> <a href="http://pl.wiktionary.org/taki">taki</a> <a href="http://pl.wiktionary.org/otw%C3%B3r">otwór</a>
+</dd><dd>(1.2) <a href="http://pl.wiktionary.org/zakrycie">zakrycie</a> <a href="http://pl.wiktionary.org/ka%C5%BCdy">każdego</a> <a href="http://pl.wiktionary.org/otw%C3%B3r">otworu</a> <a href="http://pl.wiktionary.org/na">na</a> <a href="http://pl.wiktionary.org/podobie%C5%84stwo">podobieństwo</a> drzwi (1.1)
+</dd></dl><br/><b>odmiana:</b><br/> (1) <i>bez</i> <b>lp</b>; <b>lm</b> drzwi, ~, ~om, ~, ~ami, ~ach, ~
+<br/><b>przykłady:</b><br/>
+<dl><dd> (1.1) <i><a href="http://pl.wiktionary.org/stary">Stare</a> <b>drzwi</b><a href="http://pl.wiktionary.org/otwiera%C4%87">otwierają</a> <a href="http://pl.wiktionary.org/si%C4%99">się</a> <a href="http://pl.wiktionary.org/z">z</a> <a href="http://pl.wiktionary.org/g%C5%82o%C5%9Bny">głośny</a>m <a href="http://pl.wiktionary.org/pisk">pisk</a>iem.</i>
+</dd></dl><br/><b>składnia:</b><br/>
+<br/><b>kolokacje:</b><br/> (1.1) <a href="http://pl.wiktionary.org/d%C4%99bowy">dębowe</a>/<a href="http://pl.wiktionary.org/metalowy">metalowe</a>/<a href="http://pl.wiktionary.org/pancerny">pancerne</a>/<a href="http://pl.wiktionary.org/antyw%C5%82amaniowy">antywłamaniowe</a>/... drzwi; <a href="http://pl.wiktionary.org/otwiera%C4%87">otwierać</a>/<a href="http://pl.wiktionary.org/zamyka%C4%87">zamykać</a> drzwi; <a href="http://pl.wiktionary.org/otwiera%C4%87">otwierać</a> drzwi <a href="http://pl.wiktionary.org/na%20o%C5%9Bcie%C5%BC">na oścież</a>; <a href="http://pl.wiktionary.org/trzaska%C4%87">trzaskać</a> drzwiami; drzwi <a href="http://pl.wiktionary.org/do">do</a> <a href="http://pl.wiktionary.org/pok%C3%B3j">pokoju</a>/<a href="http://pl.wiktionary.org/toaleta">toalety</a>/<a href="http://pl.wiktionary.org/gara%C5%BC">garaż</a>u/<a href="http://pl.wiktionary.org/stajnia">stajni</a>/<a href="http://pl.wiktionary.org/samoch%C3%B3d">samochodu</a>...; <a href="http://pl.wiktionary.org/skrzyd%C5%82o">skrzydło</a> drzwi; <a href="http://pl.wiktionary.org/framuga">framuga</a>/<a href="http://pl.wiktionary.org/zamek">zamek</a>/<a href="http://pl.wiktionary.org/klamka">klamka</a>/... drzwi; <a href="http://pl.wiktionary.org/sta%C4%87">stać</a>/<a href="http://pl.wiktionary.org/oczekiwa%C4%87">oczekiwać</a> <a href="http://pl.wiktionary.org/pod">pod</a> drzwiami; <a href="http://pl.wiktionary.org/drzwi%20obrotowe">drzwi obrotowe</a>; (1.2) drzwi <a href="http://pl.wiktionary.org/do">do</a> <a href="http://pl.wiktionary.org/szafa">szafy</a>/<a href="http://pl.wiktionary.org/piec">piec</a>a/...
+<br/><b>synonimy:</b><br/>
+<br/><b>antonimy:</b><br/>
+<br/><b>pokrewne:</b><br/> (1) <b>zdrobn</b> <a href="http://pl.wiktionary.org/drzwiczki">drzwiczki</a>; <b>przym</b> <a href="http://pl.wiktionary.org/drzwiowy">drzwiowy</a>
+<br/><b>frazeologia:</b><br/> (1.1) <a href="http://pl.wiktionary.org/pokazywa%C4%87%20komu%C5%9B%20drzwi">pokazywać komuś drzwi</a>, <a href="http://pl.wiktionary.org/pi%20razy%20drzwi">pi razy drzwi</a>
+<br/><b>etymologia:</b><br/>
+<br/><b>uwagi:</b><br/>
+<br/><b>tłumaczenia:</b><br/>
+<ul><li> angielski: (1.1) <a href="http://pl.wiktionary.org/door">door</a>
+</li><li> arabski: (1.1) <a href="http://pl.wiktionary.org/%D8%A8%D9%8E%D8%A7%D8%A8%D9%8C">بَابٌ</a> (bāb) <b>m</b>; <a href="http://pl.wiktionary.org/%D8%A3%D8%A8%D9%92%D9%88%D9%8E%D8%A7%D8%A8">أبْوَاب</a> (’abwāb) <b>lm</b>
+</li><li> aramejski: (1.1) <a href="http://pl.wiktionary.org/%DC%AC%DC%B2%DC%AA%DC%A5%DC%B5%DC%90">ܬܲܪܥܵܐ</a>
+</li><li> chorwacki: (1.1) <a href="http://pl.wiktionary.org/vrata">vrata</a> <b>lm</b>
+</li><li> czeski: (1.1) <a href="http://pl.wiktionary.org/dve%C5%99e">dveře</a> <b>lm</b>
+</li><li> dolnołużycki: (1.1) <a href="http://pl.wiktionary.org/%C5%BAurja">źurja</a> <b>lm</b>
+</li><li> duński: (1.1) <a href="http://pl.wiktionary.org/d%C3%B8r">dør</a> <i>w</i>
+</li><li> esperanto: (1.1) <a href="http://pl.wiktionary.org/pordo">pordo</a>
+</li><li> estoński: (1.1) <a href="http://pl.wiktionary.org/uks">uks</a>
+</li><li> fiński: (1.1) <a href="http://pl.wiktionary.org/ovi">ovi</a>
+</li><li> francuski: (1.1) <a href="http://pl.wiktionary.org/porte">porte</a> <b>f</b>
+</li><li> górnołużycki: (1.1) <a href="http://pl.wiktionary.org/durje">durje</a> <b>lm</b>
+</li><li> grecki: (1.1) <a href="http://pl.wiktionary.org/%CF%80%CF%8C%CF%81%CF%84%CE%B1">πόρτα</a> <b>f</b>
+</li><li> hebrajski: (1.1) <a href="http://pl.wiktionary.org/%D7%93%D6%B5%D7%9C%D6%B6%D7%AA">דֵלֶת</a>
+</li><li> hiszpański: (1.1) <a href="http://pl.wiktionary.org/puerta">puerta</a> <i>f</i>
+</li><li> islandzki: (1.1) <a href="http://pl.wiktionary.org/hur%C3%B0">hurð</a> <b>f</b>
+</li><li> niemiecki: (1.1) <a href="http://pl.wiktionary.org/T%C3%BCr">Tür</a> <b>f</b>
+</li><li> norweski (bokmål): (1.1) <a href="http://pl.wiktionary.org/d%C3%B8r">dør</a> <i>m/f</i>
+</li><li> rosyjski: (1.1) <a href="http://pl.wiktionary.org/%D0%B4%D0%B2%D0%B5%D1%80%D1%8C">дверь</a> <b>f</b>
+</li><li> serbski: (1.1) <a href="http://pl.wiktionary.org/%D0%B2%D1%80%D0%B0%D1%82%D0%B0">врата</a>/<a href="http://pl.wiktionary.org/vrata">vrata</a>
+</li><li> slovio: (1.1) <a href="http://pl.wiktionary.org/%D0%B4%D0%B2%D0%B5%D1%80">двер</a>/<a href="http://pl.wiktionary.org/dver">dver</a>
+</li><li> słowacki: (1.1) <a href="http://pl.wiktionary.org/dvere">dvere</a>
+</li><li> słoweński: (1.1) <a href="http://pl.wiktionary.org/vrata">vrata</a>
+</li><li> staropolski: (1.1) <a href="http://pl.wiktionary.org/d%C5%BAwirze">dźwirze</a>
+</li><li> szwedzki: (1.1) <a href="http://pl.wiktionary.org/d%C3%B6rr">dörr</a> <i>w</i></li></ul>
diff --git a/testdata/drzwi.wiki b/testdata/drzwi.wiki
new file mode 100644
index 0000000..53117c1
--- a/dev/null
+++ b/testdata/drzwi.wiki
@@ -0,0 +1,45 @@
+[[de:drzwi]] [[el:drzwi]] [[en:drzwi]] [[eo:drzwi]] [[fr:drzwi]] [[ko:drzwi]] [[hr:drzwi]] [[ug:drzwi]] [[fi:drzwi]] [[tr:drzwi]]
+== drzwi ({{język polski}}) ==
+[[Grafika:Doorway Hamptonne in Jersey.jpg|thumb|right|200px|drzwi (1.1)]]
+{{wymowa}} {{IPA|dʐvi}} {{audio|pl-drzwi.ogg}}
+{{znaczenia}}
+''rzeczownik, rodzaj niemęskoosobowy''
+:(1.1) [[ruchomy|ruchome]] [[zakrycie]] [[otwór|otworu]] [[w]] [[ściana|ścianie]] [[umożliwiać|umożliwiającego]] [[przejście]]; [[sam]] [[taki]] [[otwór]]
+:(1.2) [[zakrycie]] [[każdy|każdego]] [[otwór|otworu]] [[na]] [[podobieństwo]] drzwi (1.1)
+{{odmiana}} (1) ''bez'' {{lp}}; {{lm}} drzwi, ~, ~om, ~, ~ami, ~ach, ~
+{{przykłady}}
+: (1.1) ''[[stary|Stare]] '''drzwi''' [[otwierać|otwierają]] [[się]] [[z]] [[głośny]]m [[pisk]]iem.''
+{{składnia}}
+{{kolokacje}} (1.1) [[dębowy|dębowe]]/[[metalowy|metalowe]]/[[pancerny|pancerne]]/[[antywłamaniowy|antywłamaniowe]]/... drzwi; [[otwierać]]/[[zamykać]] drzwi; [[otwierać]] drzwi [[na oścież]]; [[trzaskać]] drzwiami; drzwi [[do]] [[pokój|pokoju]]/[[toaleta|toalety]]/[[garaż]]u/[[stajnia|stajni]]/[[samochód|samochodu]]...; [[skrzydło]] drzwi; [[framuga]]/[[zamek]]/[[klamka]]/... drzwi; [[stać]]/[[oczekiwać]] [[pod]] drzwiami; [[drzwi obrotowe]]; (1.2) drzwi [[do]] [[szafa|szafy]]/[[piec]]a/...
+{{synonimy}}
+{{antonimy}}
+{{pokrewne}} (1) {{zdrobn}} [[drzwiczki]]; {{przym}} [[drzwiowy]]
+{{frazeologia}} (1.1) [[pokazywać komuś drzwi]], [[pi razy drzwi]]
+{{etymologia}}
+{{uwagi}}
+{{tłumaczenia}}
+* angielski: (1.1) [[door]]
+* arabski: (1.1) [[بَابٌ]] (bāb) {{m}}; [[أبْوَاب]] (’abwāb) {{lm}}
+* aramejski: (1.1) [[ܬܲܪܥܵܐ]]
+* chorwacki: (1.1) [[vrata]] {{lm}}
+* czeski: (1.1) [[dveře]] {{lm}}
+* dolnołużycki: (1.1) [[źurja]] {{lm}}
+* duński: (1.1) [[dør]] ''w''
+* esperanto: (1.1) [[pordo]]
+* estoński: (1.1) [[uks]]
+* fiński: (1.1) [[ovi]]
+* francuski: (1.1) [[porte]] {{f}}
+* górnołużycki: (1.1) [[durje]] {{lm}}
+* grecki: (1.1) [[πόρτα]] {{f}}
+* hebrajski: (1.1) [[דֵלֶת]]
+* hiszpański: (1.1) [[puerta]] ''f''
+* islandzki: (1.1) [[hurð]] {{f}}
+* niemiecki: (1.1) [[Tür]] {{f}}
+* norweski (bokmål): (1.1) [[dør]] ''m/f''
+* rosyjski: (1.1) [[дверь]] {{f}}
+* serbski: (1.1) [[врата]]/[[vrata]]
+* slovio: (1.1) [[двер]]/[[dver]]
+* słowacki: (1.1) [[dvere]]
+* słoweński: (1.1) [[vrata]]
+* staropolski: (1.1) [[dźwirze]]
+* szwedzki: (1.1) [[dörr]] ''w'' \ No newline at end of file
diff --git a/testdata/headings.html b/testdata/headings.html
new file mode 100644
index 0000000..8b247a3
--- a/dev/null
+++ b/testdata/headings.html
@@ -0,0 +1,15 @@
+<h1> Section headings </h1>
+<i>Headings</i> organize your writing into
+sections. The Wiki software can automatically
+generate a <a href="http://pl.wiktionary.org/table%20of%20contents">table of contents</a> from them.
+
+<h2> Subsection </h2>Using more "equals" (=) signs creates a subsection.
+
+<h3> A smaller subsection </h3>
+Don't skip levels,
+like from two to four equals signs.
+
+Start with 2 equals signs not 1
+because 1 creates H1 tags
+which should be reserved for page title.
+
diff --git a/testdata/headings.wiki b/testdata/headings.wiki
new file mode 100644
index 0000000..3406c77
--- a/dev/null
+++ b/testdata/headings.wiki
@@ -0,0 +1,17 @@
+== Section headings ==
+
+''Headings'' organize your writing into
+sections. The Wiki software can automatically
+generate a [[table of contents]] from them.
+
+=== Subsection ===
+Using more "equals" (=) signs creates a subsection.
+
+==== A smaller subsection ====
+
+Don't skip levels,
+like from two to four equals signs.
+
+Start with 2 equals signs not 1
+because 1 creates H1 tags
+which should be reserved for page title.
diff --git a/testdata/hz.html b/testdata/hz.html
new file mode 100644
index 0000000..569d1fe
--- a/dev/null
+++ b/testdata/hz.html
@@ -0,0 +1,6 @@
+You can make horizontal dividing lines (----)
+to separate text.
+<hr/>But you should usually use sections instead,
+so that they go in the table of contents.
+
+
diff --git a/testdata/hz.wiki b/testdata/hz.wiki
new file mode 100644
index 0000000..b381016
--- a/dev/null
+++ b/testdata/hz.wiki
@@ -0,0 +1,6 @@
+You can make horizontal dividing lines (----)
+to separate text.
+----
+But you should usually use sections instead,
+so that they go in the table of contents.
+
diff --git a/testdata/numlist.html b/testdata/numlist.html
new file mode 100644
index 0000000..c65d0a2
--- a/dev/null
+++ b/testdata/numlist.html
@@ -0,0 +1,7 @@
+<ol><li> <i>Numbered lists</i> are:
+</li><ol><li> Very organized
+</li><li> Easy to follow
+</li></ol></ol>A newline marks the end of the list.
+<ol><li> New numbering starts with 1.
+</li></ol>
+
diff --git a/testdata/numlist.wiki b/testdata/numlist.wiki
new file mode 100644
index 0000000..ff12377
--- a/dev/null
+++ b/testdata/numlist.wiki
@@ -0,0 +1,6 @@
+# ''Numbered lists'' are:
+## Very organized
+## Easy to follow
+A newline marks the end of the list.
+# New numbering starts with 1.
+
diff --git a/testdata/unlist.html b/testdata/unlist.html
new file mode 100644
index 0000000..8d3eab7
--- a/dev/null
+++ b/testdata/unlist.html
@@ -0,0 +1,10 @@
+<ul><li> <i>Unordered lists</i> are easy to do:
+</li><ul><li> Start every line with a star.
+</li><ul><li> More stars indicate a deeper level.
+</li></ul><li> Previous item continues.
+</li><li> A newline
+</li></ul><li> in a list
+</li></ul>marks the end of the list.
+<ul><li>Of course you can start again.
+</li></ul>
+
diff --git a/testdata/unlist.wiki b/testdata/unlist.wiki
new file mode 100644
index 0000000..6318a26
--- a/dev/null
+++ b/testdata/unlist.wiki
@@ -0,0 +1,9 @@
+* ''Unordered lists'' are easy to do:
+** Start every line with a star.
+*** More stars indicate a deeper level.
+*: Previous item continues.
+** A newline
+* in a list
+marks the end of the list.
+*Of course you can start again.
+
diff --git a/wiki2html.py b/wiki2html.py
new file mode 100644
index 0000000..7441b97
--- a/dev/null
+++ b/wiki2html.py
@@ -0,0 +1,503 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2008 Sergey Poznyakoff
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from wikimarkup import *
+from types import TupleType
+import urllib
+
+class HtmlWikiMarkup (WikiMarkup):
+ """
+ A (hopefully) general-purpose Wiki->HTML translator class.
+ FIXME: 1. See WikiMarkup for a list
+ 2. [[official position]]s: the final 's' is emitted after the closing
+ </a> tag. It should be emitted before it, as part of the link text.
+ """
+ lang = 'en'
+ html_base = 'http://%(lang)s.wiktionary.org'
+ image_base = 'http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf/'
+ media_base = 'http://www.mediawiki.org/xml/export-0.3'
+
+ def __init__(self, *args, **keywords):
+ WikiMarkup.__init__(self, *args, **keywords)
+ if 'lang' in keywords:
+ self.lang = keywords['lang']
+ elif 'html_base' in keywords:
+ self.html_base = keywords['html_base']
+ elif 'image_base' in keywords:
+ self.image_base = keywords['image_base']
+ elif 'media_base' in keywords:
+ self.media_base = keywords['media_base']
+
+ # ISO 639
+ langtab = {
+ "aa": "Afar", # Afar
+ "ab": "Аҧсуа", # Abkhazian
+ "ae": None, # Avestan
+ "af": "Afrikaans", # Afrikaans
+ "ak": "Akana", # Akan # or ak_CI
+ "als": "Alemannisch",
+ "am": "አማርኛ", # Amharic
+ "an": "Aragonés", # Aragonese
+ "ang": "Englisc",
+ "ar": "العربية" , # Arabic
+ "arc": "ܐܪܡܝܐ",
+ "as": "অসমীয়া", # Assamese
+ "ast": "Asturian",
+ "av": "Авар", # Avaric # Spoken mainly in Dagestan
+ "ay": "Aymar", # Aymara
+ "az": "Azərbaycan" , # Azerbaijani
+
+ "ba": "Башҡорт", # Bashkir
+ "bar": "Boarisch",
+ "bat-smg": "Žemaitėška",
+ "bcl": "Bikol",
+ "be": "Беларуская", # Byelorussian; Belarusian
+ "be-x-old": "Беларуская (тарашкевіца)",
+ "bg": "Български", # Bulgarian
+ "bh": "भोजपुरी", # Bihari
+ "bi": "Bislama", # Bislama
+ "bm": "Bamanankan", # Bambara
+ "bn": "বাংলা" , # Bengali; Bangla
+ "bo": "བོད་སྐད", # Tibetan
+ "bpy": "ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী" ,
+ "br": "Brezhoneg" , # Breton
+ "bs": "Bosanski" , # Bosnian
+ "bug": "Basa Ugi",
+ "bxr": "Буряад",
+
+ "ca": "Català" , # Catalan
+ "cbk-zam": "Chavacano de Zamboanga",
+ "cdo": "Mìng-dĕ̤ng-ngṳ̄",
+ "cho": "Choctaw",
+ "ce": "Нохчийн", # Chechen
+ "ceb": "Sinugboanong Binisaya" , # Cebuano
+ "ch": "Chamor", # Chamorro
+ "chr": "ᏣᎳᎩ",
+ "chy": "Tsetsêhestâhese",
+ "co": "Cors", # Corsican
+ "cr": "Nehiyaw", # Cree
+ "crh": "Qırımtatarca",
+ "cs": "Česky" , # Czech
+ "csb": "Kaszëbsczi",
+ "c": "Словѣньскъ", # Church Slavic
+ "cv": "Чăваш", # Chuvash
+ "cy": "Cymraeg" , # Welsh
+
+ "da": "Dansk" , # Danish
+ "de": "Deutsch" , # German
+ "diq": "Zazaki", # Dimli (Southern Zazaki)
+ "dsb": "Dolnoserbski",
+ "dv": "ދިވެހިބަސް", # Divehi
+ "dz": "ཇོང་ཁ", # Dzongkha; Bhutani
+
+ "ee": "Eʋegbe", # Ewe
+ "el": "Ελληνικά" , # Greek
+ "eml": "Emiliàn e rumagnòl",
+ "en": "English" , # English
+ "eo": "Esperanto" ,
+ "es": "Español" , # Spanish
+ "et": "Eesti" , # Estonian
+ "e": "Euskara" , # Basque
+ "ext": "Estremeñ",
+
+ "fa": "فارسی" , # Persian
+ "ff": "Fulfulde", # Fulah # Also NG, MR, and many others
+ "fi": "Suomi" , # Finnish
+ "fiu-vro": "Võro",
+ "fj": "Na Vosa Vakaviti", # Fijian; Fiji
+ "fo": "Føroyskt" , # Faroese
+ "fr": "Français" , # French
+ "frp": "Arpitan",
+ "fur": "Furlan",
+ "fy": "Frysk", # Frisian
+
+ "ga": "Gaeilge", # Irish
+ "gan": "贛語 (Gànyŭ)",
+ "gd": "Gàidhlig", # Scots; Gaelic
+ "gl": "Gallego" , # Gallegan; Galician
+ "glk": "گیلکی",
+ "got": "𐌲Œ„𐌹𐌺 ",
+ "gn": "Avañe'ẽ", # Guarani
+ "g": "ગુજરાતી", # Gujarati
+ "gv": "Gaelg", # Manx
+
+ "ha": "هَوُسَ", # Hausa
+ "hak": "Hak-kâ-fa / 客家話",
+ "haw": "Hawai`i",
+ "he": "עברית" , # Hebrew (formerly iw)
+ "hi": "हिन्दी" , # Hindi
+ "hif": "Fiji Hindi",
+ "ho": "Hiri Mot", # Hiri Motu
+ "hr": "Hrvatski" , # Croatian
+ "hsb": "Hornjoserbsce",
+ "ht": "Krèyol ayisyen" , # Haitian; Haitian Creole
+ "hu": "Magyar" , # Hungarian
+ "hy": "Հայերեն", # Armenian
+ "hz": "Otsiherero", # Herero
+
+ "ia": "Interlingua",
+ "ie": "Interlingue",
+ "id": "Bahasa Indonesia", # Indonesian (formerly in)
+ "ig": "Igbo", # Igbo
+ "ii": "ꆇꉙ ", # Sichuan Yi
+ "ik": "Iñupiak", # Inupiak
+ "ilo": "Ilokano",
+ "io": "Ido" ,
+ "is": "Íslenska" , # Icelandic
+ "it": "Italiano" , # Italian
+ "i": "ᐃᓄᒃᑎᑐᑦ", # Inuktitut
+
+ "ja": "日本語", # Japanese
+ "jbo": "Lojban",
+ "jv": "Basa Jawa", # Javanese
+
+ "ka": "ქართული" , # Georgian
+ "kaa": "Qaraqalpaqsha",
+ "kab": "Taqbaylit",
+ "kg": "KiKongo", # Kongo # also CD and AO
+ "ki": "Gĩkũyũ", # Kikuyu
+ "kj": "Kuanyama", # Kuanyama
+ "kk": "Қазақша", # Kazakh
+ "kl": "Kalaallisut", # Kalaallisut; Greenlandic
+ "km": "ភាសាខ្មែរ", # Khmer; Cambodian
+ "kn": "ಕನ್ನಡ", # Kannada
+ "ko": "한국어" , # Korean
+ "kr": "Kanuri", # Kanuri
+ "ks": "कश्मीरी / كشميري", # Kashmiri
+ "ksh": "Ripoarisch",
+ "ku": "Kurdî / كوردی", # Kurdish
+ "kv": "Коми", # Komi
+ "kw": "Kernewek/Karnuack", # Cornish
+ "ky": "Кыргызча", # Kirghiz
+
+ "la": "Latina" , # Latin
+ "lad": "Dzhudezmo",
+ "lb": "Lëtzebuergesch" , # Letzeburgesch
+ "lbe": "Лакку",
+ "lg": "Luganda", # Ganda
+ "li": "Limburgs", # Limburgish; Limburger; Limburgan
+ "lij": "Lígur",
+ "ln": "Lingala", # Lingala
+ "lmo": "Lumbaart",
+ "lo": "ລາວ", # Lao; Laotian
+ "lt": "Lietuvių" , # Lithuanian
+ "l": None, # Luba-Katanga
+ "lv": "Latvieš" , # Latvian; Lettish
+
+ "map-bms": "Basa Banyumasan",
+ "mdf": "Мокшень (Mokshanj Kälj)",
+ "mg": "Malagasy", # Malagasy
+ "mh": "Ebon", # Marshall
+ "mi": "Māori", # Maori
+ "mk": "Македонски" , # Macedonian
+ "ml": None, # Malayalam
+ "mn": "Монгол", # Mongolian
+ "mo": "Молдовеняскэ", # Moldavian
+ "mr": "मराठी" , # Marathi
+ "ms": "Bahasa Melay" , # Malay
+ "mt": "Malti", # Maltese
+ "mus": "Muskogee",
+ "my": "မ္ရန္‌မာစာ", # Burmese
+ "myv": "Эрзянь (Erzjanj Kelj)",
+ "mzn": "مَزِروني",
+
+ "na": "dorerin Naoero", # Nauru
+ "nah": "Nāhuatl",
+ "nap": "Nnapulitano",
+ "nb": "Norsk (Bokmål)", # Norwegian Bokm@aa{}l
+ "nd": None,# Ndebele, North
+ "nds": "Plattdüütsch",
+ "nds-nl": "Nedersaksisch",
+ "ne": "नेपाली", # Nepali
+ "new": "नेपाल भाषा" , # Nepal Bhasa
+ "ng": "Oshiwambo", # Ndonga
+ "nl": "Nederlands" , # Dutch
+ "nn": "Nynorsk", # Norwegian Nynorsk
+ "no": "Norsk (Bokmål)" , # Norwegian
+ "nov": "Novial",
+ "nr": None, # Ndebele, South
+ "nrm": "Nouormand/Normaund",
+ "nv": "Diné bizaad", # Navajo
+ "ny": "Chi-Chewa", # Chichewa; Nyanja
+
+ "oc": "Occitan", # Occitan; Proven@,{c}al
+ "oj": None, # Ojibwa
+ "om": "Oromoo", # (Afan) Oromo
+ "or": "ଓଡ଼ିଆ", # Oriya
+ "os": "Иронау", # Ossetian; Ossetic
+
+ "pa": "ਪੰਜਾਬੀ" , # Panjabi; Punjabi
+ "pag": "Pangasinan",
+ "pam": "Kapampangan",
+ "pap": "Papiament",
+ "pdc": "Deitsch",
+ "pi": "पाऴि", # Pali
+ "pih": "Norfuk",
+ "pl": "Polski" , # Polish
+ "pms": "Piemontèis" ,
+ "ps": "پښتو", # Pashto, Pushto
+ "pt": "Português" , # Portuguese
+
+ "q": "Runa Simi" , # Quechua
+
+ "rm": "Rumantsch", # Rhaeto-Romance
+ "rmy": "romani - रोमानी",
+ "rn": "Kirundi", # Rundi; Kirundi
+ "ro": "Română" , # Romanian
+ "roa-rup": "Armãneashce",
+ "roa-tara": "Tarandíne",
+ "ru": "Русский" , # Russian
+ "rw": "Ikinyarwanda", # Kinyarwanda
+
+ "sa": "संस्कृतम्", # Sanskrit
+ "sah": "Саха тыла (Saxa Tyla)",
+ "sc": "Sard", # Sardinian
+ "scn": "Sicilian",
+ "sco": "Scots",
+ "sd": "سنڌي، سندھی ، सिन्ध", # Sindhi
+ "se": "Sámegiella", # Northern Sami
+ "sg": "Sängö", # Sango; Sangro
+ "sh": "Srpskohrvatski / Српскохрватски" ,
+ "si": "සිංහල",
+ "simple": "Simple English" ,
+ "sk": "Slovenčina" , # Slovak
+ "sl": "Slovenščina" , # Slovenian
+ "sm": "Gagana Samoa", # Samoan
+ "sn": "chiShona", # Shona
+ "so": "Soomaaliga", # Somali
+ "sr": "Српски / Srpski" , # Serbian
+ "srn": "Sranantongo",
+ "ss": "SiSwati", # Swati; Siswati
+ "st": "Sesotho", # Sesotho; Sotho, Southern
+ "stk": "Seeltersk",
+ "s": "Basa Sunda", # Sundanese
+ "sq": "Shqip" , # Albanian
+ "szl": "Ślůnski",
+ "sv": "Svenska" , # Swedish
+ "sw": "Kiswahili", # Swahili # Also KE
+
+ "ta": "தமிழ்" , # Tamil
+ "te": "తెలుగు" , # Telugu
+ "tet": "Tetun",
+ "tg": "Тоҷикӣ", # Tajik
+ "th": "ไทย" , # Thai
+ "ti": "ትግርኛ", # Tigrinya
+ "tk": "تركمن / Туркмен", # Turkmen
+ "tl": "Tagalog" , # Tagalog
+ "tn": "Setswana", # Tswana; Setswana
+ "to": "faka Tonga", # Tonga (?) # Also ZW ; MW
+ "tokipona": "Tokipona",
+ "tpi": "Tok Pisin",
+ "tr": "Türkçe" , # Turkish
+ "ts": "Xitsonga", # Tsonga # ZA SZ XW
+ "tt": "Tatarça / Татарча", # Tatar
+ "tum": "chiTumbuka",
+ "tw": "Twi", # Twi
+ "ty": "Reo Mā`ohi", # Tahitian
+
+ "udm": "Удмурт кыл",
+ "ug": "Oyghurque", # Uighur
+ "uk": "Українська" , # Ukrainian
+ "ur": "اردو", # Urdu
+ "uz": "O‘zbek", # Uzbek
+
+ "ve": "Tshivenda", # Venda
+ "vec": "Vèneto",
+ "vi": "Tiếng Việt" , # Vietnamese
+ "vls": "West-Vlams",
+ "vo": "Volapük" ,
+
+ "wa": "Walon", # Walloon
+ "war": "Winaray",
+ "wo": "Wolof", # Wolof
+ "w": "吴语",
+
+ "xal": "Хальмг",
+ "xh": "isiXhosa", # Xhosa
+
+ "yi": "ייִדיש", # Yiddish (formerly ji)
+ "yo": "Yorùbá", # Yoruba
+
+ "za": "Cuengh", # Zhuang
+ "zea": "Zeêuws",
+ "zh": "中文" , # Chinese
+ "zh-classical": "古文 / 文言文",
+ "zm-min-nan": "Bân-lâm-gú",
+ "zh-yue": "粵語",
+ "z": "isiZul" # Zulu
+ }
+
+ def target(self, t):
+ (qual,sep,tgt) = t.partition(':')
+ r = None
+ if tgt != '':
+ if qual in ('Image', 'Grafika'):
+ t = self.image_base + urllib.quote(tgt) + '/250px-' + urllib.quote(tgt)
+ elif qual == "Media":
+ t = self.media_base + '/' + tgt
+ elif qual in self.langtab:
+ t = self.html_base % { 'lang' : qual } + '/' + urllib.quote(tgt)
+ r = self.langtab[qual]
+ else:
+ t = self.html_base % { 'lang' : self.lang } + '/' + urllib.quote(t)
+ else:
+ t = self.html_base % { 'lang' : self.lang } + '/' + urllib.quote(t)
+ return t, r
+
+ # HTML tag names for list-like environments.  Both lists are indexed by
+ # the environment type kept in element 1 of an ENV token: str_env takes
+ # the enclosing tag from envhdr, str_item takes the item tag from envel.
+ envhdr = [ "ul", "ol", "dl" ]
+ envel = [ "li", "li", "dd" ]
+
+ def str_nil(self, tok, env):
+ return ""
+
+ def str_text(self, tok, env):
+ return tok[1]
+
+ def str_link(self, tok, env):
+ arg = self.fmtok(tok[1], env)
+ (target, r) = self.target(arg)
+ text = self.fmtok(tok[2], env)
+ if not text and r:
+ text = r
+ return "<a href=\"%s\">%s</a>" % (target,
+ text if (text and text != '') \
+ else \
+ r if r else arg)
+
+ def str_tmpl(self, tok, env):
+ arg = self.fmtok(tok[1], env)
+ (target, r) = self.target(arg)
+ text = self.fmtok(tok[2], env)
+ return "<a href=\"%s\">%s</a>" % (target,
+ text if (text and text != '') \
+ else arg)
+
+ def str_ref(self, tok, env):
+ target = self.fmtok(tok[1], env)
+ text = self.fmtok(tok[2], env)
+ return "<a href=\"%s\">%s</a>" % (target,
+ text if (text and text != '') \
+ else target)
+ def str_it(self, tok, env):
+ return "<i>" + self.fmtok(tok[1], env) + "</i>"
+
+ def str_bold(self, tok, env):
+ return "<b>" + self.fmtok(tok[1], env) + "</b>"
+
+ def str_hdr(self, tok, env):
+ level = tok[1]
+ if level > 4:
+ level = 4
+ return "<h%s>%s</h%s>" % (level, self.fmtok(tok[2], env), level)
+
+ def str_bar(self, tok, env):
+ return "<hr/>"
+
+ def str_env(self, tok, env):
+ t = tok[1]
+ return "<" + self.envhdr[t] + ">" + \
+ self.fmtok(tok[3], tok) + \
+ "</" + self.envhdr[t] + ">"
+
+ def str_item(self, tok, env):
+ return "<%s>%s</%s>" % (self.envel[env[1]],
+ self.fmtok(tok[1], env),
+ self.envel[env[1]])
+
+ def str_seq(self, tok, env):
+ s = ""
+ for t in tok[1:]:
+ s += self.fmtok(t, env)
+ return s
+
+ def fmtok(self, tok, env):
+ if type(tok) != TupleType:
+ return ""
+ toktype = tok[0]
+ if toktype == self.NIL:
+ return self.str_nil(tok, env)
+ if toktype == self.TEXT:
+ return self.str_text(tok, env)
+ elif toktype == self.LINK:
+ return self.str_link(tok, env)
+ elif toktype == self.TMPL:
+ return self.str_tmpl(tok, env)
+ elif toktype == self.REF:
+ return self.str_ref(tok, env)
+ elif toktype == self.IT:
+ return self.str_it(tok, env)
+ elif toktype == self.BOLD:
+ return self.str_bold(tok, env)
+ elif toktype == self.HDR:
+ return self.str_hdr(tok, env)
+ elif toktype == self.BAR:
+ return self.str_bar(tok, env)
+ elif toktype == self.ENV:
+ return self.str_env(tok, env)
+ elif toktype == self.ITEM:
+ return self.str_item(tok, env)
+ elif toktype == self.SEQ:
+ return self.str_seq(tok, env)
+
+ def __str__(self):
+ return self.fmtok(self.tree, None)
+
+
+class HtmlWiktionaryMarkup (HtmlWikiMarkup):
+ """
+ A class for translating Wiktionary articles into HTML.
+ This version does not do much, except that it tries to correctly
+ format templates. But "tries" does not mean "does". The heuristics
+ used here is clearly not enogh to cope with it.
+
+ 1. FIXME:
+ The right solution would be to have a database of templates with their
+ semantics and to decide on their rendering depending on that. E.g.
+ {{term}} in en.wiktionary means "replace this with the search term".
+ This, however, does not work in other wiktionaries. There are
+ also more complex templates, e.g.: {{t+|bg|врата|n|p|tr=vrata|sc=Cyrl}}
+ I don't know what it means. Couldn't find any documentation either.
+ Again, this template does not work in other dictionaries.
+
+ 2. Capitulation notice:
+ Given the:
+ 1. waste amount of wiktionaries available,
+ 2. abundance of various templates for each wictionary,
+ 3. apparent lack of documentation thereof,
+ 4. the lack of standardized language-independent templates,
+ I dont see any way to cope with the template-rendering task within a
+ reasonable amount of time.
+
+ Faeci quod potui, faciant meliora potentes.
+ """
+ seq_pos = 0
+
+ def str_seq(self, tok, env):
+ s = ""
+ self.seq_pos=0
+ for t in tok[1:]:
+ s += self.fmtok(t, env)
+ self.seq_pos += 1
+ return s
+
+ def str_tmpl(self, tok, env):
+ arg = self.fmtok(tok[1], env)
+ if self.seq_pos > 0:
+ return " <b>" + arg + "</b>"
+ else:
+ return "<br/><b>" + arg + ":</b><br/>"
+
diff --git a/wiki2plain.py b/wiki2plain.py
new file mode 100644
index 0000000..5080298
--- a/dev/null
+++ b/wiki2plain.py
@@ -0,0 +1,452 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2008 Sergey Poznyakoff
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from wikimarkup import *
+from types import TupleType
+import urllib
+
+class PlainMarkup (WikiMarkup):
+ """
+ A (general-purpose Wiki->Text translator class.
+ """
+ lang = 'en'
+ html_base = 'http://%(lang)s.wiktionary.org'
+ image_base = 'http://nie.wiem.gdzie'
+ media_base = 'http://www.mediawiki.org/xml/export-0.3'
+
+ def __init__(self, *args, **keywords):
+ WikiMarkup.__init__(self, *args, **keywords)
+ if 'lang' in keywords:
+ self.lang = keywords['lang']
+ elif 'html_base' in keywords:
+ self.html_base = keywords['html_base']
+ elif 'image_base' in keywords:
+ self.image_base = keywords['image_base']
+ elif 'media_base' in keywords:
+ self.media_base = keywords['media_base']
+
+ # ISO 639
+ langtab = {
+ "aa": "Afar", # Afar
+ "ab": "Аҧсуа", # Abkhazian
+ "ae": None, # Avestan
+ "af": "Afrikaans", # Afrikaans
+ "ak": "Akana", # Akan # or ak_CI
+ "als": "Alemannisch",
+ "am": "አማርኛ", # Amharic
+ "an": "Aragonés", # Aragonese
+ "ang": "Englisc",
+ "ar": "العربية" , # Arabic
+ "arc": "ܐܪܡܝܐ",
+ "as": "অসমীয়া", # Assamese
+ "ast": "Asturian",
+ "av": "Авар", # Avaric # Spoken mainly in Dagestan
+ "ay": "Aymar", # Aymara
+ "az": "Azərbaycan" , # Azerbaijani
+
+ "ba": "Башҡорт", # Bashkir
+ "bar": "Boarisch",
+ "bat-smg": "Žemaitėška",
+ "bcl": "Bikol",
+ "be": "Беларуская", # Byelorussian; Belarusian
+ "be-x-old": "Беларуская (тарашкевіца)",
+ "bg": "Български", # Bulgarian
+ "bh": "भोजपुरी", # Bihari
+ "bi": "Bislama", # Bislama
+ "bm": "Bamanankan", # Bambara
+ "bn": "বাংলা" , # Bengali; Bangla
+ "bo": "བོད་སྐད", # Tibetan
+ "bpy": "ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী" ,
+ "br": "Brezhoneg" , # Breton
+ "bs": "Bosanski" , # Bosnian
+ "bug": "Basa Ugi",
+ "bxr": "Буряад",
+
+ "ca": "Català" , # Catalan
+ "cbk-zam": "Chavacano de Zamboanga",
+ "cdo": "Mìng-dĕ̤ng-ngṳ̄",
+ "cho": "Choctaw",
+ "ce": "Нохчийн", # Chechen
+ "ceb": "Sinugboanong Binisaya" , # Cebuano
+ "ch": "Chamor", # Chamorro
+ "chr": "ᏣᎳᎩ",
+ "chy": "Tsetsêhestâhese",
+ "co": "Cors", # Corsican
+ "cr": "Nehiyaw", # Cree
+ "crh": "Qırımtatarca",
+ "cs": "Česky" , # Czech
+ "csb": "Kaszëbsczi",
+ "c": "Словѣньскъ", # Church Slavic
+ "cv": "Чăваш", # Chuvash
+ "cy": "Cymraeg" , # Welsh
+
+ "da": "Dansk" , # Danish
+ "de": "Deutsch" , # German
+ "diq": "Zazaki", # Dimli (Southern Zazaki)
+ "dsb": "Dolnoserbski",
+ "dv": "ދިވެހިބަސް", # Divehi
+ "dz": "ཇོང་ཁ", # Dzongkha; Bhutani
+
+ "ee": "Eʋegbe", # Ewe
+ "el": "Ελληνικά" , # Greek
+ "eml": "Emiliàn e rumagnòl",
+ "en": "English" , # English
+ "eo": "Esperanto" ,
+ "es": "Español" , # Spanish
+ "et": "Eesti" , # Estonian
+ "e": "Euskara" , # Basque
+ "ext": "Estremeñ",
+
+ "fa": "فارسی" , # Persian
+ "ff": "Fulfulde", # Fulah # Also NG, MR, and many others
+ "fi": "Suomi" , # Finnish
+ "fiu-vro": "Võro",
+ "fj": "Na Vosa Vakaviti", # Fijian; Fiji
+ "fo": "Føroyskt" , # Faroese
+ "fr": "Français" , # French
+ "frp": "Arpitan",
+ "fur": "Furlan",
+ "fy": "Frysk", # Frisian
+
+ "ga": "Gaeilge", # Irish
+ "gan": "贛語 (Gànyŭ)",
+ "gd": "Gàidhlig", # Scots; Gaelic
+ "gl": "Gallego" , # Gallegan; Galician
+ "glk": "گیلکی",
+ "got": "𐌲Œ„𐌹𐌺 ",
+ "gn": "Avañe'ẽ", # Guarani
+ "g": "ગુજરાતી", # Gujarati
+ "gv": "Gaelg", # Manx
+
+ "ha": "هَوُسَ", # Hausa
+ "hak": "Hak-kâ-fa / 客家話",
+ "haw": "Hawai`i",
+ "he": "עברית" , # Hebrew (formerly iw)
+ "hi": "हिन्दी" , # Hindi
+ "hif": "Fiji Hindi",
+ "ho": "Hiri Mot", # Hiri Motu
+ "hr": "Hrvatski" , # Croatian
+ "hsb": "Hornjoserbsce",
+ "ht": "Krèyol ayisyen" , # Haitian; Haitian Creole
+ "hu": "Magyar" , # Hungarian
+ "hy": "Հայերեն", # Armenian
+ "hz": "Otsiherero", # Herero
+
+ "ia": "Interlingua",
+ "ie": "Interlingue",
+ "id": "Bahasa Indonesia", # Indonesian (formerly in)
+ "ig": "Igbo", # Igbo
+ "ii": "ꆇꉙ ", # Sichuan Yi
+ "ik": "Iñupiak", # Inupiak
+ "ilo": "Ilokano",
+ "io": "Ido" ,
+ "is": "Íslenska" , # Icelandic
+ "it": "Italiano" , # Italian
+ "i": "ᐃᓄᒃᑎᑐᑦ", # Inuktitut
+
+ "ja": "日本語", # Japanese
+ "jbo": "Lojban",
+ "jv": "Basa Jawa", # Javanese
+
+ "ka": "ქართული" , # Georgian
+ "kaa": "Qaraqalpaqsha",
+ "kab": "Taqbaylit",
+ "kg": "KiKongo", # Kongo # also CD and AO
+ "ki": "Gĩkũyũ", # Kikuyu
+ "kj": "Kuanyama", # Kuanyama
+ "kk": "Қазақша", # Kazakh
+ "kl": "Kalaallisut", # Kalaallisut; Greenlandic
+ "km": "ភាសាខ្មែរ", # Khmer; Cambodian
+ "kn": "ಕನ್ನಡ", # Kannada
+ "ko": "한국어" , # Korean
+ "kr": "Kanuri", # Kanuri
+ "ks": "कश्मीरी / كشميري", # Kashmiri
+ "ksh": "Ripoarisch",
+ "ku": "Kurdî / كوردی", # Kurdish
+ "kv": "Коми", # Komi
+ "kw": "Kernewek/Karnuack", # Cornish
+ "ky": "Кыргызча", # Kirghiz
+
+ "la": "Latina" , # Latin
+ "lad": "Dzhudezmo",
+ "lb": "Lëtzebuergesch" , # Letzeburgesch
+ "lbe": "Лакку",
+ "lg": "Luganda", # Ganda
+ "li": "Limburgs", # Limburgish; Limburger; Limburgan
+ "lij": "Lígur",
+ "ln": "Lingala", # Lingala
+ "lmo": "Lumbaart",
+ "lo": "ລາວ", # Lao; Laotian
+ "lt": "Lietuvių" , # Lithuanian
+ "l": None, # Luba-Katanga
+ "lv": "Latvieš" , # Latvian; Lettish
+
+ "map-bms": "Basa Banyumasan",
+ "mdf": "Мокшень (Mokshanj Kälj)",
+ "mg": "Malagasy", # Malagasy
+ "mh": "Ebon", # Marshall
+ "mi": "Māori", # Maori
+ "mk": "Македонски" , # Macedonian
+ "ml": None, # Malayalam
+ "mn": "Монгол", # Mongolian
+ "mo": "Молдовеняскэ", # Moldavian
+ "mr": "मराठी" , # Marathi
+ "ms": "Bahasa Melay" , # Malay
+ "mt": "Malti", # Maltese
+ "mus": "Muskogee",
+ "my": "မ္ရန္‌မာစာ", # Burmese
+ "myv": "Эрзянь (Erzjanj Kelj)",
+ "mzn": "مَزِروني",
+
+ "na": "dorerin Naoero", # Nauru
+ "nah": "Nāhuatl",
+ "nap": "Nnapulitano",
+ "nb": "Norsk (Bokmål)", # Norwegian Bokm@aa{}l
+ "nd": None,# Ndebele, North
+ "nds": "Plattdüütsch",
+ "nds-nl": "Nedersaksisch",
+ "ne": "नेपाली", # Nepali
+ "new": "नेपाल भाषा" , # Nepal Bhasa
+ "ng": "Oshiwambo", # Ndonga
+ "nl": "Nederlands" , # Dutch
+ "nn": "Nynorsk", # Norwegian Nynorsk
+ "no": "Norsk (Bokmål)" , # Norwegian
+ "nov": "Novial",
+ "nr": None, # Ndebele, South
+ "nrm": "Nouormand/Normaund",
+ "nv": "Diné bizaad", # Navajo
+ "ny": "Chi-Chewa", # Chichewa; Nyanja
+
+ "oc": "Occitan", # Occitan; Proven@,{c}al
+ "oj": None, # Ojibwa
+ "om": "Oromoo", # (Afan) Oromo
+ "or": "ଓଡ଼ିଆ", # Oriya
+ "os": "Иронау", # Ossetian; Ossetic
+
+ "pa": "ਪੰਜਾਬੀ" , # Panjabi; Punjabi
+ "pag": "Pangasinan",
+ "pam": "Kapampangan",
+ "pap": "Papiament",
+ "pdc": "Deitsch",
+ "pi": "पाऴि", # Pali
+ "pih": "Norfuk",
+ "pl": "Polski" , # Polish
+ "pms": "Piemontèis" ,
+ "ps": "پښتو", # Pashto, Pushto
+ "pt": "Português" , # Portuguese
+
+ "q": "Runa Simi" , # Quechua
+
+ "rm": "Rumantsch", # Rhaeto-Romance
+ "rmy": "romani - रोमानी",
+ "rn": "Kirundi", # Rundi; Kirundi
+ "ro": "Română" , # Romanian
+ "roa-rup": "Armãneashce",
+ "roa-tara": "Tarandíne",
+ "ru": "Русский" , # Russian
+ "rw": "Ikinyarwanda", # Kinyarwanda
+
+ "sa": "संस्कृतम्", # Sanskrit
+ "sah": "Саха тыла (Saxa Tyla)",
+ "sc": "Sard", # Sardinian
+ "scn": "Sicilian",
+ "sco": "Scots",
+ "sd": "سنڌي، سندھی ، सिन्ध", # Sindhi
+ "se": "Sámegiella", # Northern Sami
+ "sg": "Sängö", # Sango; Sangro
+ "sh": "Srpskohrvatski / Српскохрватски" ,
+ "si": "සිංහල",
+ "simple": "Simple English" ,
+ "sk": "Slovenčina" , # Slovak
+ "sl": "Slovenščina" , # Slovenian
+ "sm": "Gagana Samoa", # Samoan
+ "sn": "chiShona", # Shona
+ "so": "Soomaaliga", # Somali
+ "sr": "Српски / Srpski" , # Serbian
+ "srn": "Sranantongo",
+ "ss": "SiSwati", # Swati; Siswati
+ "st": "Sesotho", # Sesotho; Sotho, Southern
+ "stk": "Seeltersk",
+ "s": "Basa Sunda", # Sundanese
+ "sq": "Shqip" , # Albanian
+ "szl": "Ślůnski",
+ "sv": "Svenska" , # Swedish
+ "sw": "Kiswahili", # Swahili # Also KE
+
+ "ta": "தமிழ்" , # Tamil
+ "te": "తెలుగు" , # Telugu
+ "tet": "Tetun",
+ "tg": "Тоҷикӣ", # Tajik
+ "th": "ไทย" , # Thai
+ "ti": "ትግርኛ", # Tigrinya
+ "tk": "تركمن / Туркмен", # Turkmen
+ "tl": "Tagalog" , # Tagalog
+ "tn": "Setswana", # Tswana; Setswana
+ "to": "faka Tonga", # Tonga (?) # Also ZW ; MW
+ "tokipona": "Tokipona",
+ "tpi": "Tok Pisin",
+ "tr": "Türkçe" , # Turkish
+ "ts": "Xitsonga", # Tsonga # ZA SZ XW
+ "tt": "Tatarça / Татарча", # Tatar
+ "tum": "chiTumbuka",
+ "tw": "Twi", # Twi
+ "ty": "Reo Mā`ohi", # Tahitian
+
+ "udm": "Удмурт кыл",
+ "ug": "Oyghurque", # Uighur
+ "uk": "Українська" , # Ukrainian
+ "ur": "اردو", # Urdu
+ "uz": "O‘zbek", # Uzbek
+
+ "ve": "Tshivenda", # Venda
+ "vec": "Vèneto",
+ "vi": "Tiếng Việt" , # Vietnamese
+ "vls": "West-Vlams",
+ "vo": "Volapük" ,
+
+ "wa": "Walon", # Walloon
+ "war": "Winaray",
+ "wo": "Wolof", # Wolof
+ "w": "吴语",
+
+ "xal": "Хальмг",
+ "xh": "isiXhosa", # Xhosa
+
+ "yi": "ייִדיש", # Yiddish (formerly ji)
+ "yo": "Yorùbá", # Yoruba
+
+ "za": "Cuengh", # Zhuang
+ "zea": "Zeêuws",
+ "zh": "中文" , # Chinese
+ "zh-classical": "古文 / 文言文",
+ "zm-min-nan": "Bân-lâm-gú",
+ "zh-yue": "粵語",
+ "z": "isiZul" # Zulu
+ }
+
+ def target(self, t):
+ (qual,sep,tgt) = t.partition(':')
+ r = None
+ if tgt != '':
+ if qual == "Image":
+ t = self.image_base + '/' + urllib.quote(tgt)
+ elif qual == "Media":
+ t = self.media_base + '/' + tgt
+ elif qual in self.langtab:
+ t = self.html_base % { 'lang' : qual } + '/' + urllib.quote(tgt)
+ r = self.langtab[qual]
+ else:
+ t = self.html_base % { 'lang' : self.lang } + '/' + urllib.quote(t)
+ else:
+ t = self.html_base % { 'lang' : self.lang } + '/' + urllib.quote(t)
+ return t, r
+
+ # Element names indexed by environment type (field 1 of an ENV token):
+ # envhdr names the element str_env wraps the whole environment in,
+ # envel the element str_item wraps each item in.
+ envhdr = [ "ul", "ol", "dl" ]
+ envel = [ "li", "li", "dd" ]
+
+ def str_nil(self, tok, env):
+ """Format a NIL token: it produces no output."""
+ return ""
+
+ def str_text(self, tok, env):
+ """Format a TEXT token: return its text (tok[1]) unchanged."""
+ return tok[1]
+
+ def str_link(self, tok, env):
+ arg = self.fmtok(tok[1], env)
+ (target, r) = self.target(arg)
+ text = self.fmtok(tok[2], env)
+ if not text and r:
+ text = r
+ return "%s" % (text if (text and text != '') \
+ else \
+ r if r else arg)
+
+ def str_tmpl(self, tok, env):
+ arg = self.fmtok(tok[1], env)
+ (target, r) = self.target(arg)
+ text = self.fmtok(tok[2], env)
+ return "%s" % (text if (text and text != '') \
+ else arg)
+
+ def str_ref(self, tok, env):
+ target = self.fmtok(tok[1], env)
+ text = self.fmtok(tok[2], env)
+ return "%s" % (text if (text and text != '') \
+ else target)
+
+ def str_it(self, tok, env):
+ return "<i>" + self.fmtok(tok[1], env) + "</i>"
+
+ def str_bold(self, tok, env):
+ return "<b>" + self.fmtok(tok[1], env) + "</b>"
+
+ def str_hdr(self, tok, env):
+ level = tok[1]
+ if level > 4:
+ level = 4
+ return "<h%s>%s</h%s>" % (level, self.fmtok(tok[2], env), level)
+
+ def str_bar(self, tok, env):
+ """Format a BAR (horizontal bar) token as a fixed row of dashes."""
+ return "-----------------"
+
+ def str_env(self, tok, env):
+ t = tok[1]
+ return "<" + self.envhdr[t] + ">" + \
+ self.fmtok(tok[3], tok) + \
+ "</" + self.envhdr[t] + ">"
+
+ def str_item(self, tok, env):
+ return "<%s>%s</%s>" % (self.envel[env[1]],
+ self.fmtok(tok[1], env),
+ self.envel[env[1]])
+
+ def str_seq(self, tok, env):
+ s = ""
+ for t in tok[1:]:
+ s += self.fmtok(t, env)
+ return s
+
+ def fmtok(self, tok, env):
+ if type(tok) != TupleType:
+ return ""
+ toktype = tok[0]
+ if toktype == self.NIL:
+ return self.str_nil(tok, env)
+ if toktype == self.TEXT:
+ return self.str_text(tok, env)
+ elif toktype == self.LINK:
+ return self.str_link(tok, env)
+ elif toktype == self.TMPL:
+ return self.str_tmpl(tok, env)
+ elif toktype == self.REF:
+ return self.str_ref(tok, env)
+ elif toktype == self.IT:
+ return self.str_it(tok, env)
+ elif toktype == self.BOLD:
+ return self.str_bold(tok, env)
+ elif toktype == self.HDR:
+ return self.str_hdr(tok, env)
+ elif toktype == self.BAR:
+ return self.str_bar(tok, env)
+ elif toktype == self.ENV:
+ return self.str_env(tok, env)
+ elif toktype == self.ITEM:
+ return self.str_item(tok, env)
+ elif toktype == self.SEQ:
+ return self.str_seq(tok, env)
+
+ def __str__(self):
+ """Format the whole parse tree (self.tree) with fmtok, outside any environment."""
+ return self.fmtok(self.tree, None)
diff --git a/wikicvt.py b/wikicvt.py
new file mode 100644
index 0000000..7d22c2e
--- a/dev/null
+++ b/wikicvt.py
@@ -0,0 +1,52 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2008 Sergey Poznyakoff
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import sys
+import getopt
+from wiki2html import *
+
+def usage(code=0):
+ print "usage: " + sys.argv[0] + "[-hv] [--help] [--verbose] file\n"
+ sys.exit(code)
+
+def main():
+ verbose_flag = 0
+ try:
+ opts, args = getopt.getopt(sys.argv[1:], "hv",
+ ["help", "verbose" ])
+ except getopt.GetoptError:
+ usage(1)
+
+ for o, a in opts:
+ if o in ("-h", "--help"):
+ usage()
+ if o in ("-v", "--verbose"):
+ verbose_flag = verbose_flag + 1
+
+ if len(args) == 1:
+ inputfilename = args[0]
+ else:
+ usage(1)
+
+ markup = HtmlWiktionaryMarkup(filename=inputfilename, lang="pl")
+ markup.parse()
+ print str(markup)
+ if verbose_flag > 0:
+ markup.output()
+
+# Run the converter only when this file is executed as a script.
+if __name__ == '__main__':
+ main()
diff --git a/wikimarkup.py b/wikimarkup.py
new file mode 100644
index 0000000..3308da2
--- a/dev/null
+++ b/wikimarkup.py
@@ -0,0 +1,362 @@
+#!/usr/bin/python
+# Copyright (C) 2008 Sergey Poznyakoff
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import sys
+import re
+from types import *
+
+__all__ = [ "BaseWikiMarkup", "WikiMarkup" ]
+
+# Block markup at the beginning of a line: a run of `=' characters, a
+# line consisting of `----', or a run of `*', `#' or `:' characters.
+eltbeg = re.compile("=+|(^----$)|^[\\*#:]+")
+# Openers of inline markup: `[[', `{{', `[', and two or three apostrophes.
+eltre = re.compile("(\\[\\[)|(\\{\\{)|\\[|(\\'\\'\\'?)")
+# For each double opener: the next `|' or the matching closing pair.
+delims = { "[[" : re.compile("\\||(\\]\\])"),
+ "{{" : re.compile("\\||(\\}\\})") }
+# Closing pair of each double opener, as a literal string ...
+term = { "[[" : "]]" , "{{" : "}}" }
+# ... and as a compiled pattern.
+ends = { "[[" : re.compile("\\]\\]"),
+ "{{" : re.compile("\\}\\}") }
+# Two (itend) or three (boend) apostrophes followed by the end of the
+# line or by a character other than an apostrophe.
+itend = re.compile("\\'\\'($|[^\\'])")
+boend = re.compile("\\'\\'\\'($|[^\\'])")
+
+class BaseWikiMarkup:
+ """
+A base class for handling Wiki markups.
+It handles:
+ 1. basic block markup (headers, numbered and unnumbered lists,
+ indentations);
+ 2. basic inline markup (bold, italic);
+ 3. basic reference markup (links, templates, external links).
+ It does NOT handle:
+ 1. pseudo-html markup (<nowiki></nowiki>, and similar);
+ 2. leading spaces meaning ``preserve formatting'';
+ 3. tables and math.
+ The above remains to be done (FIXME).
+
+ This class relies on its derived classes for providing input. They must
+ overload method `input', which must return one physical line of input for
+ each call.
+
+ Variables:
+
+ 1. tree
+ The parse tree. Valid after parse() finishes (see below).
+
+ Methods:
+
+ 1. parse()
+ Parse the input and build parse tree
+
+ 2. input()
+ Virtual function. Return next line of input or None on EOF.
+
+ 3. output()
+ Print the tree in internal representation.
+ """
+ ## Token classes
+ # A token is a tuple whose first element is one of the classes below;
+ # the remaining elements are listed after each class name.
+ # NIL: nothing
+ NIL = 0
+ # TEXT: text
+ TEXT = 1
+ # LINK: target, text
+ LINK = 2
+ # Template: target, text
+ TMPL = 3
+ # External ref: target, text
+ REF = 4
+ # Italics: text
+ IT = 5
+ # Bold: text
+ BOLD = 6
+ # Header: level, text
+ HDR = 7
+ # Horizontal bar:
+ BAR = 8
+ # Environment: type, level, content
+ ENV = 9
+ # Item: text
+ ITEM = 10
+ # Sequence: seq
+ SEQ = 11
+
+ # Environment types:
+ # Unnumbered list
+ ENVUNNUM = 0
+ # Numbered list
+ ENVNUM = 1
+ # Indent
+ INDENT = 2
+ # Line-start marker of each environment type, indexed by the type.
+ envtypes = [ "*", "#", ":" ]
+
+ # The parse tree; set by parse().
+ tree = None
+
+ def itend(self, line, pos):
+ while 1:
+ d = itend.search(line, pos)
+ if not d:
+ return -1
+ elif d.start(0) == pos or line[d.start(0)-1] != "'":
+ return d.start(0)
+ else:
+ pos = d.start(0) + 1
+
+ # One-line lookahead buffer: when set to a non-empty line, nextkn()
+ # takes that line instead of calling input().
+ la = None
+ def putback(self, line):
+ """Store `line' in the lookahead buffer (None clears it)."""
+ self.la = line
+
+ def nextkn(self, curlev=0, type = -1):
+ while 1:
+ if self.la:
+ line = self.la
+ self.putback(None)
+ else:
+ try:
+ line = self.input()
+ except StopIteration:
+ line = u''
+ if not line or line == "":
+ self.putback(line)
+ break
+
+ m = eltbeg.match(line)
+ if m:
+ if m.group(0)[0] in self.envtypes:
+ btype = self.envtypes.index(m.group(0)[0])
+ lev = len(m.group(0))
+ if btype == type:
+ if lev == curlev:
+ yield(self.ITEM,
+ (self.SEQ, self.getkn(line[m.end(0):])))
+ elif lev > curlev:
+ self.putback(line)
+ yield(self.ENV, btype, curlev + 1,
+ (self.SEQ, self.nextkn(curlev + 1, btype)))
+ else:
+ self.putback(line)
+ break
+ else:
+ self.putback(line)
+ yield(self.ENV, btype, 1, self.nextkn(1, btype))
+
+ else:
+ if curlev > 0:
+ self.putback(line)
+ break
+ elif m.group(0)[0:2] == "==" \
+ and line.rstrip('\n').endswith(m.group(0)):
+ yield(self.HDR, len(m.group(0))-1,
+ self.getkn(line[m.end(0):-(1+len(m.group(0)))]))
+ elif m.group(0) == "----":
+ yield(self.BAR,)
+ else:
+ if curlev > 0:
+ self.putback(line)
+ break
+ yield(self.getkn(line))
+
+
+ def getkn(self, line):
+ pos = 0
+ while 1:
+ if pos == len(line):
+ break;
+ m = eltre.search(line, pos)
+ if not m:
+ yield(self.TEXT, line[pos:])
+ pos = len(line)
+ else:
+ yield(self.TEXT, line[pos:m.start(0)])
+ pos = m.end(0)
+ if m.group(0) == "[[" or m.group(0) == "{{":
+ d = delims[m.group(0)].search(line, pos)
+ if d.group(0) == "|":
+ e = ends[m.group(0)].search(line, d.end(0))
+ target = (self.TEXT, line[pos:d.start(0)])
+ text = (self.SEQ, self.getkn(line[d.end(0):e.start(0)]))
+ pos = e.end(0)
+ elif d.group(0) == term[m.group(0)]:
+ target = (self.TEXT, line[pos:d.start(0)])
+ text = (self.NIL,)
+ pos = d.end(0)
+ if m.group(0) == "[[":
+ yield(self.LINK, target, text)
+ else:
+ yield(self.TMPL, target, text)
+ elif m.group(0) == "[":
+ i = line.find("]", m.end(0))
+ if i == -1:
+ i = len(line)
+ (target,sep,text) = line[m.end(0):i].partition(' ')
+ yield(self.REF,
+ (self.TEXT, target),
+ (self.SEQ, self.getkn(text)))
+ pos = i + 1
+ elif m.group(0) == "'''":
+ e = boend.search(line, m.end(0))
+ if e:
+ i = e.start(0)
+ pos = e.end(0)
+ else:
+ pos = len(line)
+ i = pos
+ yield(self.BOLD,
+ (self.SEQ, self.getkn(line[m.end(0):i])))
+ pos = e.end(0)
+ elif m.group(0) == "''":
+ i = self.itend(line, m.end(0))
+ if i == -1:
+ i = len(line)
+ yield(self.IT,
+ (self.SEQ, self.getkn(line[m.end(0):i])))
+ pos = i + 2
+
+ def input(self):
+ """
+ Return the next line of input, or None at end of input.
+ This base implementation has no input source and always returns None;
+ derived classes override it.
+ """
+ return None
+
+ def expandtok(self, tok):
+ if type(tok) == GeneratorType:
+ subtree = [self.SEQ]
+ for t in tok:
+ x = self.expandtok(t)
+ if x:
+ subtree.append(x)
+ return tuple(subtree) if len(subtree) > 2 else \
+ subtree[1] if len(subtree) == 2 else None
+ toktype = tok[0]
+ if toktype == self.NIL:
+ return None
+ if toktype == self.TEXT:
+ return tok if tok[1] != '' else None
+ elif toktype == self.LINK or toktype == self.TMPL \
+ or toktype == self.REF:
+ return toktype, self.expandtok(tok[1]), self.expandtok(tok[2])
+ elif toktype == self.IT or toktype == self.BOLD \
+ or toktype == self.ITEM:
+ return toktype, self.expandtok(tok[1])
+ elif toktype == self.HDR:
+ return toktype, tok[1], self.expandtok(tok[2])
+ elif toktype == self.BAR:
+ return tok
+ elif toktype == self.ENV:
+ return toktype,tok[1],tok[2],self.expandtok(tok[3])
+ elif toktype == self.SEQ:
+ if len(tok) == 2:
+ return self.expandtok(tok[1])
+ elif len(tok) == 1:
+ return None
+ else:
+ subtree = [self.SEQ]
+ for t in tok[1:]:
+ x = self.expandtok(t)
+ if x:
+ subtree.append(x)
+ return tuple(subtree) if len(subtree) > 2 else \
+ subtree[1] if len(subtree) == 2 else None
+
+ def parse(self):
+ tree = [self.SEQ]
+ for tok in self.nextkn():
+ tree.append(self.expandtok(tok))
+ self.tree = tuple(tree)
+
+ def prtok(self, tok, indent):
+ if not tok:
+ print " " * indent, "None"
+ return
+ toktype = tok[0]
+ if toktype == self.SEQ:
+ for t in tok[1:]:
+ self.prtok(t, indent)
+ else:
+ print " " * indent,
+ if toktype == self.NIL:
+ print "NIL"
+ if toktype == self.TEXT:
+ print "TEXT \"%s\"" % (tok[1].encode('string_escape'))
+ elif toktype == self.LINK:
+ print "LINK "
+ self.prtok(tok[1], indent+1) # target
+ self.prtok(tok[2], indent+1) # text
+ elif toktype == self.TMPL:
+ print "TMPL"
+ self.prtok(tok[1], indent+1) # target
+ self.prtok(tok[2], indent+1) # text
+ elif toktype == self.REF:
+ print "REF"
+ self.prtok(tok[1], indent+1) # target
+ self.prtok(tok[2], indent+1) # text
+ elif toktype == self.IT:
+ print "IT"
+ self.prtok(tok[1], indent+1)
+ elif toktype == self.BOLD:
+ print "BOLD"
+ self.prtok(tok[1], indent+1)
+ elif toktype == self.HDR:
+ print "HDR", tok[1]
+ self.prtok(tok[2], indent+1)
+ elif toktype == self.BAR:
+ print "BAR"
+ elif toktype == self.ENV:
+ print "ENV ",self.envtypes[tok[1]],tok[2]
+ self.prtok(tok[3], indent+1)
+ elif toktype == self.ITEM:
+ print "ITEM"
+ self.prtok(tok[1], indent+1)
+
+ def output(self):
+ """Print the parse tree (self.tree) in internal representation, using prtok."""
+ self.prtok(self.tree, 0)
+
+
+class WikiMarkup (BaseWikiMarkup):
+ """
+ A derived class, that supplies a basic input method.
+
+ Three types of inputs are available:
+
+ 1. filename=<file>
+ The file <file> is opened and used for input.
+ 2. file=<file>
+ The already opened file <file> is used for input.
+ 3. text=<string>
+ Input is taken from <string>, line by line.
+
+ Usage:
+
+ obj = WikiMarkup(arg=val)
+ obj.parse
+ ... Do whatever you need with obj.tree ...
+
+ """
+ file = None
+ text = None
+ def __init__(self, *args, **keywords):
+ if 'file' in keywords:
+ self.file = keywords['file']
+ elif 'filename' in keywords:
+ self.file = open(keywords['filename'])
+ elif 'text' in keywords:
+ self.text = keywords['text'].split("\n")
+
+ def __del__(self):
+ if self.file:
+ self.file.close()
+
+ def input(self):
+ if self.file:
+ return self.file.readline()
+ elif self.text:
+ return self.text.pop(0)
+ else:
+ return None
+

Return to:

Send suggestions and report system problems to the System administrator.