about | summary | refs | log | tree | commit | diff
path: root/wiki2html.py
diff options
context:
space:
mode:
author: Sergey Poznyakoff <gray@Pirx.gnu.org.ua> 2008-11-26 08:06:06 +0200
committer: Sergey Poznyakoff <gray@Pirx.gnu.org.ua> 2008-11-26 08:06:06 +0200
commit: 5dc93e466efaaa243e6490961b6e545eaa65f06c (patch)
tree: 844b75613cabb2c0394828492038546c1f9806d8 /wiki2html.py
download: wit-5dc93e466efaaa243e6490961b6e545eaa65f06c.tar.gz
          wit-5dc93e466efaaa243e6490961b6e545eaa65f06c.tar.bz2
Initial commit
Diffstat (limited to 'wiki2html.py')
-rw-r--r-- wiki2html.py 503
1 files changed, 503 insertions, 0 deletions
diff --git a/wiki2html.py b/wiki2html.py
new file mode 100644
index 0000000..7441b97
--- /dev/null
+++ b/wiki2html.py
@@ -0,0 +1,503 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2008 Sergey Poznyakoff
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from wikimarkup import *
+from types import TupleType
+import urllib
+
+class HtmlWikiMarkup (WikiMarkup):
+ """
+ A (hopefully) general-purpose Wiki->HTML translator class.
+ FIXME: 1. See WikiMarkup for a list
+ 2. [[official position]]s : final 's' gets after closing </a> tag.
+ Should be before.
+ """
+ lang = 'en'
+ html_base = 'http://%(lang)s.wiktionary.org'
+ image_base = 'http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf/'
+ media_base = 'http://www.mediawiki.org/xml/export-0.3'
+
+ def __init__(self, *args, **keywords):
+ WikiMarkup.__init__(self, *args, **keywords)
+ if 'lang' in keywords:
+ self.lang = keywords['lang']
+ elif 'html_base' in keywords:
+ self.html_base = keywords['html_base']
+ elif 'image_base' in keywords:
+ self.image_base = keywords['image_base']
+ elif 'media_base' in keywords:
+ self.media_base = keywords['media_base']
+
+ # ISO 639
+ langtab = {
+ "aa": "Afar", # Afar
+ "ab": "Аҧсуа", # Abkhazian
+ "ae": None, # Avestan
+ "af": "Afrikaans", # Afrikaans
+ "ak": "Akana", # Akan # or ak_CI
+ "als": "Alemannisch",
+ "am": "አማርኛ", # Amharic
+ "an": "Aragonés", # Aragonese
+ "ang": "Englisc",
+ "ar": "العربية" , # Arabic
+ "arc": "ܐܪܡܝܐ",
+ "as": "অসমীয়া", # Assamese
+ "ast": "Asturian",
+ "av": "Авар", # Avaric # Spoken mainly in Dagestan
+ "ay": "Aymar", # Aymara
+ "az": "Azərbaycan" , # Azerbaijani
+
+ "ba": "Башҡорт", # Bashkir
+ "bar": "Boarisch",
+ "bat-smg": "Žemaitėška",
+ "bcl": "Bikol",
+ "be": "Беларуская", # Byelorussian; Belarusian
+ "be-x-old": "Беларуская (тарашкевіца)",
+ "bg": "Български", # Bulgarian
+ "bh": "भोजपुरी", # Bihari
+ "bi": "Bislama", # Bislama
+ "bm": "Bamanankan", # Bambara
+ "bn": "বাংলা" , # Bengali; Bangla
+ "bo": "བོད་སྐད", # Tibetan
+ "bpy": "ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী" ,
+ "br": "Brezhoneg" , # Breton
+ "bs": "Bosanski" , # Bosnian
+ "bug": "Basa Ugi",
+ "bxr": "Буряад",
+
+ "ca": "Català" , # Catalan
+ "cbk-zam": "Chavacano de Zamboanga",
+ "cdo": "Mìng-dĕ̤ng-ngṳ̄",
+ "cho": "Choctaw",
+ "ce": "Нохчийн", # Chechen
+ "ceb": "Sinugboanong Binisaya" , # Cebuano
+ "ch": "Chamor", # Chamorro
+ "chr": "ᏣᎳᎩ",
+ "chy": "Tsetsêhestâhese",
+ "co": "Cors", # Corsican
+ "cr": "Nehiyaw", # Cree
+ "crh": "Qırımtatarca",
+ "cs": "Česky" , # Czech
+ "csb": "Kaszëbsczi",
+ "c": "Словѣньскъ", # Church Slavic
+ "cv": "Чăваш", # Chuvash
+ "cy": "Cymraeg" , # Welsh
+
+ "da": "Dansk" , # Danish
+ "de": "Deutsch" , # German
+ "diq": "Zazaki", # Dimli (Southern Zazaki)
+ "dsb": "Dolnoserbski",
+ "dv": "ދިވެހިބަސް", # Divehi
+ "dz": "ཇོང་ཁ", # Dzongkha; Bhutani
+
+ "ee": "Eʋegbe", # Ewe
+ "el": "Ελληνικά" , # Greek
+ "eml": "Emiliàn e rumagnòl",
+ "en": "English" , # English
+ "eo": "Esperanto" ,
+ "es": "Español" , # Spanish
+ "et": "Eesti" , # Estonian
+ "e": "Euskara" , # Basque
+ "ext": "Estremeñ",
+
+ "fa": "فارسی" , # Persian
+ "ff": "Fulfulde", # Fulah # Also NG, MR, and many others
+ "fi": "Suomi" , # Finnish
+ "fiu-vro": "Võro",
+ "fj": "Na Vosa Vakaviti", # Fijian; Fiji
+ "fo": "Føroyskt" , # Faroese
+ "fr": "Français" , # French
+ "frp": "Arpitan",
+ "fur": "Furlan",
+ "fy": "Frysk", # Frisian
+
+ "ga": "Gaeilge", # Irish
+ "gan": "贛語 (Gànyŭ)",
+ "gd": "Gàidhlig", # Scots; Gaelic
+ "gl": "Gallego" , # Gallegan; Galician
+ "glk": "گیلکی",
+ "got": "𐌲Œ„𐌹𐌺 ",
+ "gn": "Avañe'ẽ", # Guarani
+ "g": "ગુજરાતી", # Gujarati
+ "gv": "Gaelg", # Manx
+
+ "ha": "هَوُسَ", # Hausa
+ "hak": "Hak-kâ-fa / 客家話",
+ "haw": "Hawai`i",
+ "he": "עברית" , # Hebrew (formerly iw)
+ "hi": "हिन्दी" , # Hindi
+ "hif": "Fiji Hindi",
+ "ho": "Hiri Mot", # Hiri Motu
+ "hr": "Hrvatski" , # Croatian
+ "hsb": "Hornjoserbsce",
+ "ht": "Krèyol ayisyen" , # Haitian; Haitian Creole
+ "hu": "Magyar" , # Hungarian
+ "hy": "Հայերեն", # Armenian
+ "hz": "Otsiherero", # Herero
+
+ "ia": "Interlingua",
+ "ie": "Interlingue",
+ "id": "Bahasa Indonesia", # Indonesian (formerly in)
+ "ig": "Igbo", # Igbo
+ "ii": "ꆇꉙ ", # Sichuan Yi
+ "ik": "Iñupiak", # Inupiak
+ "ilo": "Ilokano",
+ "io": "Ido" ,
+ "is": "Íslenska" , # Icelandic
+ "it": "Italiano" , # Italian
+ "i": "ᐃᓄᒃᑎᑐᑦ", # Inuktitut
+
+ "ja": "日本語", # Japanese
+ "jbo": "Lojban",
+ "jv": "Basa Jawa", # Javanese
+
+ "ka": "ქართული" , # Georgian
+ "kaa": "Qaraqalpaqsha",
+ "kab": "Taqbaylit",
+ "kg": "KiKongo", # Kongo # also CD and AO
+ "ki": "Gĩkũyũ", # Kikuyu
+ "kj": "Kuanyama", # Kuanyama
+ "kk": "Қазақша", # Kazakh
+ "kl": "Kalaallisut", # Kalaallisut; Greenlandic
+ "km": "ភាសាខ្មែរ", # Khmer; Cambodian
+ "kn": "ಕನ್ನಡ", # Kannada
+ "ko": "한국어" , # Korean
+ "kr": "Kanuri", # Kanuri
+ "ks": "कश्मीरी / كشميري", # Kashmiri
+ "ksh": "Ripoarisch",
+ "ku": "Kurdî / كوردی", # Kurdish
+ "kv": "Коми", # Komi
+ "kw": "Kernewek/Karnuack", # Cornish
+ "ky": "Кыргызча", # Kirghiz
+
+ "la": "Latina" , # Latin
+ "lad": "Dzhudezmo",
+ "lb": "Lëtzebuergesch" , # Letzeburgesch
+ "lbe": "Лакку",
+ "lg": "Luganda", # Ganda
+ "li": "Limburgs", # Limburgish; Limburger; Limburgan
+ "lij": "Lígur",
+ "ln": "Lingala", # Lingala
+ "lmo": "Lumbaart",
+ "lo": "ລາວ", # Lao; Laotian
+ "lt": "Lietuvių" , # Lithuanian
+ "l": None, # Luba-Katanga
+ "lv": "Latvieš" , # Latvian; Lettish
+
+ "map-bms": "Basa Banyumasan",
+ "mdf": "Мокшень (Mokshanj Kälj)",
+ "mg": "Malagasy", # Malagasy
+ "mh": "Ebon", # Marshall
+ "mi": "Māori", # Maori
+ "mk": "Македонски" , # Macedonian
+ "ml": None, # Malayalam
+ "mn": "Монгол", # Mongolian
+ "mo": "Молдовеняскэ", # Moldavian
+ "mr": "मराठी" , # Marathi
+ "ms": "Bahasa Melay" , # Malay
+ "mt": "Malti", # Maltese
+ "mus": "Muskogee",
+ "my": "မ္ရန္‌မာစာ", # Burmese
+ "myv": "Эрзянь (Erzjanj Kelj)",
+ "mzn": "مَزِروني",
+
+ "na": "dorerin Naoero", # Nauru
+ "nah": "Nāhuatl",
+ "nap": "Nnapulitano",
+ "nb": "Norsk (Bokmål)", # Norwegian Bokm@aa{}l
+ "nd": None,# Ndebele, North
+ "nds": "Plattdüütsch",
+ "nds-nl": "Nedersaksisch",
+ "ne": "नेपाली", # Nepali
+ "new": "नेपाल भाषा" , # Nepal Bhasa
+ "ng": "Oshiwambo", # Ndonga
+ "nl": "Nederlands" , # Dutch
+ "nn": "Nynorsk", # Norwegian Nynorsk
+ "no": "Norsk (Bokmål)" , # Norwegian
+ "nov": "Novial",
+ "nr": None, # Ndebele, South
+ "nrm": "Nouormand/Normaund",
+ "nv": "Diné bizaad", # Navajo
+ "ny": "Chi-Chewa", # Chichewa; Nyanja
+
+ "oc": "Occitan", # Occitan; Proven@,{c}al
+ "oj": None, # Ojibwa
+ "om": "Oromoo", # (Afan) Oromo
+ "or": "ଓଡ଼ିଆ", # Oriya
+ "os": "Иронау", # Ossetian; Ossetic
+
+ "pa": "ਪੰਜਾਬੀ" , # Panjabi; Punjabi
+ "pag": "Pangasinan",
+ "pam": "Kapampangan",
+ "pap": "Papiament",
+ "pdc": "Deitsch",
+ "pi": "पाऴि", # Pali
+ "pih": "Norfuk",
+ "pl": "Polski" , # Polish
+ "pms": "Piemontèis" ,
+ "ps": "پښتو", # Pashto, Pushto
+ "pt": "Português" , # Portuguese
+
+ "q": "Runa Simi" , # Quechua
+
+ "rm": "Rumantsch", # Rhaeto-Romance
+ "rmy": "romani - रोमानी",
+ "rn": "Kirundi", # Rundi; Kirundi
+ "ro": "Română" , # Romanian
+ "roa-rup": "Armãneashce",
+ "roa-tara": "Tarandíne",
+ "ru": "Русский" , # Russian
+ "rw": "Ikinyarwanda", # Kinyarwanda
+
+ "sa": "संस्कृतम्", # Sanskrit
+ "sah": "Саха тыла (Saxa Tyla)",
+ "sc": "Sard", # Sardinian
+ "scn": "Sicilian",
+ "sco": "Scots",
+ "sd": "سنڌي، سندھی ، सिन्ध", # Sindhi
+ "se": "Sámegiella", # Northern Sami
+ "sg": "Sängö", # Sango; Sangro
+ "sh": "Srpskohrvatski / Српскохрватски" ,
+ "si": "සිංහල",
+ "simple": "Simple English" ,
+ "sk": "Slovenčina" , # Slovak
+ "sl": "Slovenščina" , # Slovenian
+ "sm": "Gagana Samoa", # Samoan
+ "sn": "chiShona", # Shona
+ "so": "Soomaaliga", # Somali
+ "sr": "Српски / Srpski" , # Serbian
+ "srn": "Sranantongo",
+ "ss": "SiSwati", # Swati; Siswati
+ "st": "Sesotho", # Sesotho; Sotho, Southern
+ "stk": "Seeltersk",
+ "s": "Basa Sunda", # Sundanese
+ "sq": "Shqip" , # Albanian
+ "szl": "Ślůnski",
+ "sv": "Svenska" , # Swedish
+ "sw": "Kiswahili", # Swahili # Also KE
+
+ "ta": "தமிழ்" , # Tamil
+ "te": "తెలుగు" , # Telugu
+ "tet": "Tetun",
+ "tg": "Тоҷикӣ", # Tajik
+ "th": "ไทย" , # Thai
+ "ti": "ትግርኛ", # Tigrinya
+ "tk": "تركمن / Туркмен", # Turkmen
+ "tl": "Tagalog" , # Tagalog
+ "tn": "Setswana", # Tswana; Setswana
+ "to": "faka Tonga", # Tonga (?) # Also ZW ; MW
+ "tokipona": "Tokipona",
+ "tpi": "Tok Pisin",
+ "tr": "Türkçe" , # Turkish
+ "ts": "Xitsonga", # Tsonga # ZA SZ XW
+ "tt": "Tatarça / Татарча", # Tatar
+ "tum": "chiTumbuka",
+ "tw": "Twi", # Twi
+ "ty": "Reo Mā`ohi", # Tahitian
+
+ "udm": "Удмурт кыл",
+ "ug": "Oyghurque", # Uighur
+ "uk": "Українська" , # Ukrainian
+ "ur": "اردو", # Urdu
+ "uz": "O‘zbek", # Uzbek
+
+ "ve": "Tshivenda", # Venda
+ "vec": "Vèneto",
+ "vi": "Tiếng Việt" , # Vietnamese
+ "vls": "West-Vlams",
+ "vo": "Volapük" ,
+
+ "wa": "Walon", # Walloon
+ "war": "Winaray",
+ "wo": "Wolof", # Wolof
+ "w": "吴语",
+
+ "xal": "Хальмг",
+ "xh": "isiXhosa", # Xhosa
+
+ "yi": "ייִדיש", # Yiddish (formerly ji)
+ "yo": "Yorùbá", # Yoruba
+
+ "za": "Cuengh", # Zhuang
+ "zea": "Zeêuws",
+ "zh": "中文" , # Chinese
+ "zh-classical": "古文 / 文言文",
+ "zm-min-nan": "Bân-lâm-gú",
+ "zh-yue": "粵語",
+ "z": "isiZul" # Zulu
+ }
+
+ def target(self, t):
+ (qual,sep,tgt) = t.partition(':')
+ r = None
+ if tgt != '':
+ if qual in ('Image', 'Grafika'):
+ t = self.image_base + urllib.quote(tgt) + '/250px-' + urllib.quote(tgt)
+ elif qual == "Media":
+ t = self.media_base + '/' + tgt
+ elif qual in self.langtab:
+ t = self.html_base % { 'lang' : qual } + '/' + urllib.quote(tgt)
+ r = self.langtab[qual]
+ else:
+ t = self.html_base % { 'lang' : self.lang } + '/' + urllib.quote(t)
+ else:
+ t = self.html_base % { 'lang' : self.lang } + '/' + urllib.quote(t)
+ return t, r
+
+ envhdr = [ "ul", "ol", "dl" ]
+ envel = [ "li", "li", "dd" ]
+
+ def str_nil(self, tok, env):
+ return ""
+
+ def str_text(self, tok, env):
+ return tok[1]
+
+ def str_link(self, tok, env):
+ arg = self.fmtok(tok[1], env)
+ (target, r) = self.target(arg)
+ text = self.fmtok(tok[2], env)
+ if not text and r:
+ text = r
+ return "<a href=\"%s\">%s</a>" % (target,
+ text if (text and text != '') \
+ else \
+ r if r else arg)
+
+ def str_tmpl(self, tok, env):
+ arg = self.fmtok(tok[1], env)
+ (target, r) = self.target(arg)
+ text = self.fmtok(tok[2], env)
+ return "<a href=\"%s\">%s</a>" % (target,
+ text if (text and text != '') \
+ else arg)
+
+ def str_ref(self, tok, env):
+ target = self.fmtok(tok[1], env)
+ text = self.fmtok(tok[2], env)
+ return "<a href=\"%s\">%s</a>" % (target,
+ text if (text and text != '') \
+ else target)
+ def str_it(self, tok, env):
+ return "<i>" + self.fmtok(tok[1], env) + "</i>"
+
+ def str_bold(self, tok, env):
+ return "<b>" + self.fmtok(tok[1], env) + "</b>"
+
+ def str_hdr(self, tok, env):
+ level = tok[1]
+ if level > 4:
+ level = 4
+ return "<h%s>%s</h%s>" % (level, self.fmtok(tok[2], env), level)
+
+ def str_bar(self, tok, env):
+ return "<hr/>"
+
+ def str_env(self, tok, env):
+ t = tok[1]
+ return "<" + self.envhdr[t] + ">" + \
+ self.fmtok(tok[3], tok) + \
+ "</" + self.envhdr[t] + ">"
+
+ def str_item(self, tok, env):
+ return "<%s>%s</%s>" % (self.envel[env[1]],
+ self.fmtok(tok[1], env),
+ self.envel[env[1]])
+
+ def str_seq(self, tok, env):
+ s = ""
+ for t in tok[1:]:
+ s += self.fmtok(t, env)
+ return s
+
+ def fmtok(self, tok, env):
+ if type(tok) != TupleType:
+ return ""
+ toktype = tok[0]
+ if toktype == self.NIL:
+ return self.str_nil(tok, env)
+ if toktype == self.TEXT:
+ return self.str_text(tok, env)
+ elif toktype == self.LINK:
+ return self.str_link(tok, env)
+ elif toktype == self.TMPL:
+ return self.str_tmpl(tok, env)
+ elif toktype == self.REF:
+ return self.str_ref(tok, env)
+ elif toktype == self.IT:
+ return self.str_it(tok, env)
+ elif toktype == self.BOLD:
+ return self.str_bold(tok, env)
+ elif toktype == self.HDR:
+ return self.str_hdr(tok, env)
+ elif toktype == self.BAR:
+ return self.str_bar(tok, env)
+ elif toktype == self.ENV:
+ return self.str_env(tok, env)
+ elif toktype == self.ITEM:
+ return self.str_item(tok, env)
+ elif toktype == self.SEQ:
+ return self.str_seq(tok, env)
+
+ def __str__(self):
+ return self.fmtok(self.tree, None)
+
+
+class HtmlWiktionaryMarkup (HtmlWikiMarkup):
+ """
+ A class for translating Wiktionary articles into HTML.
+ This version does not do much, except that it tries to correctly
+ format templates. But "tries" does not mean "does". The heuristics
+ used here is clearly not enogh to cope with it.
+
+ 1. FIXME:
+ The right solution would be to have a database of templates with their
+ semantics and to decide on their rendering depending on that. E.g.
+ {{term}} in en.wiktionary means "replace this with the search term".
+ This, however, does not work in other wiktionaries. There are
+ also more complex templates, e.g.: {{t+|bg|врата|n|p|tr=vrata|sc=Cyrl}}
+ I don't know what it means. Couldn't find any documentation either.
+ Again, this template does not work in other dictionaries.
+
+ 2. Capitulation notice:
+ Given the:
+ 1. waste amount of wiktionaries available,
+ 2. abundance of various templates for each wictionary,
+ 3. apparent lack of documentation thereof,
+ 4. the lack of standardized language-independent templates,
+ I dont see any way to cope with the template-rendering task within a
+ reasonable amount of time.
+
+ Faeci quod potui, faciant meliora potentes.
+ """
+ seq_pos = 0
+
+ def str_seq(self, tok, env):
+ s = ""
+ self.seq_pos=0
+ for t in tok[1:]:
+ s += self.fmtok(t, env)
+ self.seq_pos += 1
+ return s
+
+ def str_tmpl(self, tok, env):
+ arg = self.fmtok(tok[1], env)
+ if self.seq_pos > 0:
+ return " <b>" + arg + "</b>"
+ else:
+ return "<br/><b>" + arg + ":</b><br/>"
+

Return to:

Send suggestions and report system problems to the System administrator.