diff options
author | Sergey Poznyakoff <gray@Pirx.gnu.org.ua> | 2008-11-26 08:06:06 +0200 |
---|---|---|
committer | Sergey Poznyakoff <gray@Pirx.gnu.org.ua> | 2008-11-26 08:06:06 +0200 |
commit | 5dc93e466efaaa243e6490961b6e545eaa65f06c (patch) | |
tree | 844b75613cabb2c0394828492038546c1f9806d8 /wiki2html.py | |
download | wit-5dc93e466efaaa243e6490961b6e545eaa65f06c.tar.gz wit-5dc93e466efaaa243e6490961b6e545eaa65f06c.tar.bz2 |
Initial commit
Diffstat (limited to 'wiki2html.py')
-rw-r--r-- | wiki2html.py | 503 |
1 files changed, 503 insertions, 0 deletions
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (C) 2008 Sergey Poznyakoff
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from wikimarkup import *
from types import TupleType
import urllib


class HtmlWikiMarkup (WikiMarkup):
    """
    A (hopefully) general-purpose Wiki->HTML translator class.

    Tokens are tuples whose first element is the token type; fmtok()
    dispatches each token to the matching str_* method and the results
    are concatenated into an HTML string.

    FIXME: 1. See WikiMarkup for a list
           2. [[official position]]s : final 's' gets after closing </a> tag.
              Should be before.
    """

    # Defaults; each may be overridden per instance through a constructor
    # keyword argument of the same name.
    lang = 'en'
    html_base = 'http://%(lang)s.wiktionary.org'
    image_base = 'http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf/'
    media_base = 'http://www.mediawiki.org/xml/export-0.3'

    def __init__(self, *args, **keywords):
        """
        Initialize the translator.

        All arguments are forwarded unchanged to WikiMarkup.__init__.
        The keywords 'lang', 'html_base', 'image_base' and 'media_base',
        when present, additionally override the class-level defaults.
        """
        WikiMarkup.__init__(self, *args, **keywords)
        # Apply every supplied override independently: an if/elif chain
        # would honour only the first keyword found.
        for name in ('lang', 'html_base', 'image_base', 'media_base'):
            if name in keywords:
                setattr(self, name, keywords[name])

    # ISO 639
    # Maps a language qualifier (as used in [[xx:Title]] links) to the
    # text displayed for such a link; None means no display name is known.
    # NOTE(review): several display names look truncated (e.g. "Asturian",
    # "Chamor", "Cors", "Estremeñ", "Hiri Mot", "Latvieš", "Bahasa Melay",
    # "Papiament", "Sard", "Sicilian", "isiZul") -- verify against the
    # MediaWiki language name list before relying on them.
    langtab = {
        "aa": "Afar",                 # Afar
        "ab": "Аҧсуа",                # Abkhazian
        "ae": None,                   # Avestan
        "af": "Afrikaans",            # Afrikaans
        "ak": "Akana",                # Akan # or ak_CI
        "als": "Alemannisch",
        "am": "አማርኛ",                 # Amharic
        "an": "Aragonés",             # Aragonese
        "ang": "Englisc",
        "ar": "العربية",              # Arabic
        "arc": "ܐܪܡܝܐ",
        "as": "অসমীয়া",               # Assamese
        "ast": "Asturian",
        "av": "Авар",                 # Avaric # Spoken mainly in Dagestan
        "ay": "Aymar",                # Aymara
        "az": "Azərbaycan",           # Azerbaijani

        "ba": "Башҡорт",              # Bashkir
        "bar": "Boarisch",
        "bat-smg": "Žemaitėška",
        "bcl": "Bikol",
        "be": "Беларуская",           # Byelorussian; Belarusian
        "be-x-old": "Беларуская (тарашкевіца)",
        "bg": "Български",            # Bulgarian
        "bh": "भोजपुरी",               # Bihari
        "bi": "Bislama",              # Bislama
        "bm": "Bamanankan",           # Bambara
        "bn": "বাংলা",                # Bengali; Bangla
        "bo": "བོད་སྐད",                # Tibetan
        "bpy": "ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী",
        "br": "Brezhoneg",            # Breton
        "bs": "Bosanski",             # Bosnian
        "bug": "Basa Ugi",
        "bxr": "Буряад",

        "ca": "Català",               # Catalan
        "cbk-zam": "Chavacano de Zamboanga",
        "cdo": "Mìng-dĕ̤ng-ngṳ̄",
        "cho": "Choctaw",
        "ce": "Нохчийн",              # Chechen
        "ceb": "Sinugboanong Binisaya",  # Cebuano
        "ch": "Chamor",               # Chamorro
        "chr": "ᏣᎳᎩ",
        "chy": "Tsetsêhestâhese",
        "co": "Cors",                 # Corsican
        "cr": "Nehiyaw",              # Cree
        "crh": "Qırımtatarca",
        "cs": "Česky",                # Czech
        "csb": "Kaszëbsczi",
        "cu": "Словѣньскъ",           # Church Slavic
        "cv": "Чăваш",                # Chuvash
        "cy": "Cymraeg",              # Welsh

        "da": "Dansk",                # Danish
        "de": "Deutsch",              # German
        "diq": "Zazaki",              # Dimli (Southern Zazaki)
        "dsb": "Dolnoserbski",
        "dv": "ދިވެހިބަސް",               # Divehi
        "dz": "ཇོང་ཁ",                  # Dzongkha; Bhutani

        "ee": "Eʋegbe",               # Ewe
        "el": "Ελληνικά",             # Greek
        "eml": "Emiliàn e rumagnòl",
        "en": "English",              # English
        "eo": "Esperanto",
        "es": "Español",              # Spanish
        "et": "Eesti",                # Estonian
        "eu": "Euskara",              # Basque
        "ext": "Estremeñ",

        "fa": "فارسی",                # Persian
        "ff": "Fulfulde",             # Fulah # Also NG, MR, and many others
        "fi": "Suomi",                # Finnish
        "fiu-vro": "Võro",
        "fj": "Na Vosa Vakaviti",     # Fijian; Fiji
        "fo": "Føroyskt",             # Faroese
        "fr": "Français",             # French
        "frp": "Arpitan",
        "fur": "Furlan",
        "fy": "Frysk",                # Frisian

        "ga": "Gaeilge",              # Irish
        "gan": "贛語 (Gànyŭ)",
        "gd": "Gàidhlig",             # Scots; Gaelic
        "gl": "Gallego",              # Gallegan; Galician
        "glk": "گیلکی",
        "got": "𐌲𐌹𐌺 ",
        "gn": "Avañe'ẽ",              # Guarani
        "gu": "ગુજરાતી",               # Gujarati
        "gv": "Gaelg",                # Manx

        "ha": "هَوُسَ",                 # Hausa
        "hak": "Hak-kâ-fa / 客家話",
        "haw": "Hawai`i",
        "he": "עברית",                # Hebrew (formerly iw)
        "hi": "हिन्दी",                 # Hindi
        "hif": "Fiji Hindi",
        "ho": "Hiri Mot",             # Hiri Motu
        "hr": "Hrvatski",             # Croatian
        "hsb": "Hornjoserbsce",
        "ht": "Krèyol ayisyen",       # Haitian; Haitian Creole
        "hu": "Magyar",               # Hungarian
        "hy": "Հայերեն",              # Armenian
        "hz": "Otsiherero",           # Herero

        "ia": "Interlingua",
        "ie": "Interlingue",
        "id": "Bahasa Indonesia",     # Indonesian (formerly in)
        "ig": "Igbo",                 # Igbo
        "ii": "ꆇꉙ ",                  # Sichuan Yi
        "ik": "Iñupiak",              # Inupiak
        "ilo": "Ilokano",
        "io": "Ido",
        "is": "Íslenska",             # Icelandic
        "it": "Italiano",             # Italian
        "iu": "ᐃᓄᒃᑎᑐᑦ",               # Inuktitut

        "ja": "日本語",                # Japanese
        "jbo": "Lojban",
        "jv": "Basa Jawa",            # Javanese

        "ka": "ქართული",              # Georgian
        "kaa": "Qaraqalpaqsha",
        "kab": "Taqbaylit",
        "kg": "KiKongo",              # Kongo # also CD and AO
        "ki": "Gĩkũyũ",               # Kikuyu
        "kj": "Kuanyama",             # Kuanyama
        "kk": "Қазақша",              # Kazakh
        "kl": "Kalaallisut",          # Kalaallisut; Greenlandic
        "km": "ភាសាខ្មែរ",               # Khmer; Cambodian
        "kn": "ಕನ್ನಡ",                 # Kannada
        "ko": "한국어",                # Korean
        "kr": "Kanuri",               # Kanuri
        "ks": "कश्मीरी / كشميري",       # Kashmiri
        "ksh": "Ripoarisch",
        "ku": "Kurdî / كوردی",        # Kurdish
        "kv": "Коми",                 # Komi
        "kw": "Kernewek/Karnuack",    # Cornish
        "ky": "Кыргызча",             # Kirghiz

        "la": "Latina",               # Latin
        "lad": "Dzhudezmo",
        "lb": "Lëtzebuergesch",       # Letzeburgesch
        "lbe": "Лакку",
        "lg": "Luganda",              # Ganda
        "li": "Limburgs",             # Limburgish; Limburger; Limburgan
        "lij": "Lígur",
        "ln": "Lingala",              # Lingala
        "lmo": "Lumbaart",
        "lo": "ລາວ",                  # Lao; Laotian
        "lt": "Lietuvių",             # Lithuanian
        "lu": None,                   # Luba-Katanga
        "lv": "Latvieš",              # Latvian; Lettish

        "map-bms": "Basa Banyumasan",
        "mdf": "Мокшень (Mokshanj Kälj)",
        "mg": "Malagasy",             # Malagasy
        "mh": "Ebon",                 # Marshall
        "mi": "Māori",                # Maori
        "mk": "Македонски",           # Macedonian
        "ml": None,                   # Malayalam
        "mn": "Монгол",               # Mongolian
        "mo": "Молдовеняскэ",         # Moldavian
        "mr": "मराठी",                 # Marathi
        "ms": "Bahasa Melay",         # Malay
        "mt": "Malti",                # Maltese
        "mus": "Muskogee",
        "my": "မ္ရန္မာစာ",                # Burmese
        "myv": "Эрзянь (Erzjanj Kelj)",
        "mzn": "مَزِروني",

        "na": "dorerin Naoero",       # Nauru
        "nah": "Nāhuatl",
        "nap": "Nnapulitano",
        "nb": "Norsk (Bokmål)",       # Norwegian Bokmål
        "nd": None,                   # Ndebele, North
        "nds": "Plattdüütsch",
        "nds-nl": "Nedersaksisch",
        "ne": "नेपाली",                 # Nepali
        "new": "नेपाल भाषा",            # Nepal Bhasa
        "ng": "Oshiwambo",            # Ndonga
        "nl": "Nederlands",           # Dutch
        "nn": "Nynorsk",              # Norwegian Nynorsk
        "no": "Norsk (Bokmål)",       # Norwegian
        "nov": "Novial",
        "nr": None,                   # Ndebele, South
        "nrm": "Nouormand/Normaund",
        "nv": "Diné bizaad",          # Navajo
        "ny": "Chi-Chewa",            # Chichewa; Nyanja

        "oc": "Occitan",              # Occitan; Provençal
        "oj": None,                   # Ojibwa
        "om": "Oromoo",               # (Afan) Oromo
        "or": "ଓଡ଼ିଆ",                  # Oriya
        "os": "Иронау",               # Ossetian; Ossetic

        "pa": "ਪੰਜਾਬੀ",                 # Panjabi; Punjabi
        "pag": "Pangasinan",
        "pam": "Kapampangan",
        "pap": "Papiament",
        "pdc": "Deitsch",
        "pi": "पाऴि",                  # Pali
        "pih": "Norfuk",
        "pl": "Polski",               # Polish
        "pms": "Piemontèis",
        "ps": "پښتو",                 # Pashto, Pushto
        "pt": "Português",            # Portuguese

        "qu": "Runa Simi",            # Quechua

        "rm": "Rumantsch",            # Rhaeto-Romance
        "rmy": "romani - रोमानी",
        "rn": "Kirundi",              # Rundi; Kirundi
        "ro": "Română",               # Romanian
        "roa-rup": "Armãneashce",
        "roa-tara": "Tarandíne",
        "ru": "Русский",              # Russian
        "rw": "Ikinyarwanda",         # Kinyarwanda

        "sa": "संस्कृतम्",                # Sanskrit
        "sah": "Саха тыла (Saxa Tyla)",
        "sc": "Sard",                 # Sardinian
        "scn": "Sicilian",
        "sco": "Scots",
        "sd": "سنڌي، سندھی ، सिन्ध",    # Sindhi
        "se": "Sámegiella",           # Northern Sami
        "sg": "Sängö",                # Sango; Sangro
        "sh": "Srpskohrvatski / Српскохрватски",
        "si": "සිංහල",
        "simple": "Simple English",
        "sk": "Slovenčina",           # Slovak
        "sl": "Slovenščina",          # Slovenian
        "sm": "Gagana Samoa",         # Samoan
        "sn": "chiShona",             # Shona
        "so": "Soomaaliga",           # Somali
        "sr": "Српски / Srpski",      # Serbian
        "srn": "Sranantongo",
        "ss": "SiSwati",              # Swati; Siswati
        "st": "Sesotho",              # Sesotho; Sotho, Southern
        "stk": "Seeltersk",
        "su": "Basa Sunda",           # Sundanese
        "sq": "Shqip",                # Albanian
        "szl": "Ślůnski",
        "sv": "Svenska",              # Swedish
        "sw": "Kiswahili",            # Swahili # Also KE

        "ta": "தமிழ்",                 # Tamil
        "te": "తెలుగు",                # Telugu
        "tet": "Tetun",
        "tg": "Тоҷикӣ",               # Tajik
        "th": "ไทย",                  # Thai
        "ti": "ትግርኛ",                 # Tigrinya
        "tk": "تركمن / Туркмен",      # Turkmen
        "tl": "Tagalog",              # Tagalog
        "tn": "Setswana",             # Tswana; Setswana
        "to": "faka Tonga",           # Tonga (?) # Also ZW ; MW
        "tokipona": "Tokipona",
        "tpi": "Tok Pisin",
        "tr": "Türkçe",               # Turkish
        "ts": "Xitsonga",             # Tsonga # ZA SZ XW
        "tt": "Tatarça / Татарча",    # Tatar
        "tum": "chiTumbuka",
        "tw": "Twi",                  # Twi
        "ty": "Reo Mā`ohi",           # Tahitian

        "udm": "Удмурт кыл",
        "ug": "Oyghurque",            # Uighur
        "uk": "Українська",           # Ukrainian
        "ur": "اردو",                 # Urdu
        "uz": "O‘zbek",               # Uzbek

        "ve": "Tshivenda",            # Venda
        "vec": "Vèneto",
        "vi": "Tiếng Việt",           # Vietnamese
        "vls": "West-Vlams",
        "vo": "Volapük",

        "wa": "Walon",                # Walloon
        "war": "Winaray",
        "wo": "Wolof",                # Wolof
        "wuu": "吴语",

        "xal": "Хальмг",
        "xh": "isiXhosa",             # Xhosa

        "yi": "ייִדיש",                # Yiddish (formerly ji)
        "yo": "Yorùbá",               # Yoruba

        "za": "Cuengh",               # Zhuang
        "zea": "Zeêuws",
        "zh": "中文",                  # Chinese
        "zh-classical": "古文 / 文言文",
        "zh-min-nan": "Bân-lâm-gú",
        "zh-yue": "粵語",
        "zu": "isiZul",               # Zulu
    }

    def target(self, t):
        """
        Translate the wiki link target T into a URL.

        T is split at its first colon into a qualifier and a remainder.
        Returns a pair (url, name): NAME is the langtab entry when the
        qualifier is a known language code, and None in all other cases
        (it may also be None for a language without a display name).
        """
        (qual, sep, tgt) = t.partition(':')
        r = None
        local_base = self.html_base % {'lang': self.lang}
        if tgt == '':
            # No qualifier (or nothing after it): a page of this wiki.
            return local_base + '/' + urllib.quote(t), r
        if qual in ('Image', 'Grafika'):
            quoted = urllib.quote(tgt)
            t = self.image_base + quoted + '/250px-' + quoted
        elif qual == "Media":
            t = self.media_base + '/' + tgt
        elif qual in self.langtab:
            # Inter-language link: point at the other language's site.
            t = self.html_base % {'lang': qual} + '/' + urllib.quote(tgt)
            r = self.langtab[qual]
        else:
            # Unknown qualifier: treat the whole string as a page name.
            t = local_base + '/' + urllib.quote(t)
        return t, r

    # HTML element names indexed by environment type (tok[1] of an ENV
    # token): envhdr is the enclosing element, envel the per-item element.
    envhdr = ["ul", "ol", "dl"]
    envel = ["li", "li", "dd"]

    # (token type attribute, handler method) pairs consulted in order by
    # fmtok().  Names rather than objects are stored so that both are
    # looked up on the instance at call time and subclass overrides of
    # the str_* methods are honoured.
    _dispatch = (
        ('NIL', 'str_nil'),
        ('TEXT', 'str_text'),
        ('LINK', 'str_link'),
        ('TMPL', 'str_tmpl'),
        ('REF', 'str_ref'),
        ('IT', 'str_it'),
        ('BOLD', 'str_bold'),
        ('HDR', 'str_hdr'),
        ('BAR', 'str_bar'),
        ('ENV', 'str_env'),
        ('ITEM', 'str_item'),
        ('SEQ', 'str_seq'),
    )

    def str_nil(self, tok, env):
        """Format a NIL token: always the empty string."""
        return ""

    def str_text(self, tok, env):
        """Format a TEXT token: its text, tok[1], is returned verbatim."""
        # NOTE(review): the text is not HTML-escaped here -- confirm that
        # the tokenizer or the caller takes care of that.
        return tok[1]

    def str_link(self, tok, env):
        """
        Format a LINK token as an <a> element.

        tok[1] is the link target and tok[2] the optional link text.
        The displayed text is the link text if non-empty, else the
        language name returned by target(), else the raw target.
        """
        arg = self.fmtok(tok[1], env)
        (target, r) = self.target(arg)
        text = self.fmtok(tok[2], env)
        return "<a href=\"%s\">%s</a>" % (target, text or r or arg)

    def str_tmpl(self, tok, env):
        """
        Format a TMPL token as an <a> element.

        tok[1] is the template name and tok[2] its optional text; the
        name is displayed when the text is empty.
        """
        arg = self.fmtok(tok[1], env)
        (target, r) = self.target(arg)
        text = self.fmtok(tok[2], env)
        return "<a href=\"%s\">%s</a>" % (target, text or arg)

    def str_ref(self, tok, env):
        """
        Format a REF token as an <a> element.

        tok[1] is used as the href as is; tok[2] is the optional text,
        with the href displayed when the text is empty.
        """
        target = self.fmtok(tok[1], env)
        text = self.fmtok(tok[2], env)
        return "<a href=\"%s\">%s</a>" % (target, text or target)

    def str_it(self, tok, env):
        """Format an IT token: tok[1] wrapped in <i>...</i>."""
        return "<i>" + self.fmtok(tok[1], env) + "</i>"

    def str_bold(self, tok, env):
        """Format a BOLD token: tok[1] wrapped in <b>...</b>."""
        return "<b>" + self.fmtok(tok[1], env) + "</b>"

    def str_hdr(self, tok, env):
        """
        Format a HDR token as an <hN> element.

        tok[1] is the header level, clamped to at most 4; tok[2] is the
        header content.
        """
        level = min(tok[1], 4)
        return "<h%s>%s</h%s>" % (level, self.fmtok(tok[2], env), level)

    def str_bar(self, tok, env):
        """Format a BAR token as a horizontal rule."""
        return "<hr/>"

    def str_env(self, tok, env):
        """
        Format an ENV token as a list element.

        tok[1] indexes envhdr to select the element; tok[3] is the
        content.  The ENV token itself is passed down as the environment
        so that nested ITEM tokens can tell which list they belong to.
        """
        tag = self.envhdr[tok[1]]
        return "<" + tag + ">" + self.fmtok(tok[3], tok) + "</" + tag + ">"

    def str_item(self, tok, env):
        """
        Format an ITEM token as a list item.

        ENV must be the enclosing ENV token: env[1] indexes envel to
        select the item element.  tok[1] is the item content.
        """
        tag = self.envel[env[1]]
        return "<%s>%s</%s>" % (tag, self.fmtok(tok[1], env), tag)

    def str_seq(self, tok, env):
        """Format a SEQ token: the concatenation of tok[1:] formatted."""
        return "".join([self.fmtok(t, env) for t in tok[1:]])

    def fmtok(self, tok, env):
        """
        Format the token TOK within the environment ENV.

        TOK is dispatched on its type, tok[0], to the matching str_*
        method.  Anything that is not a non-empty tuple, and any token
        of an unrecognized type, formats as the empty string so that the
        callers, which concatenate results, always get a string.
        """
        if not isinstance(tok, TupleType) or not tok:
            return ""
        toktype = tok[0]
        for kind, handler in self._dispatch:
            if toktype == getattr(self, kind):
                return getattr(self, handler)(tok, env)
        return ""

    def __str__(self):
        """Return the HTML rendering of self.tree."""
        return self.fmtok(self.tree, None)


class HtmlWiktionaryMarkup (HtmlWikiMarkup):
    """
    A class for translating Wiktionary articles into HTML.
    This version does not do much, except that it tries to correctly
    format templates. But "tries" does not mean "does". The heuristics
    used here are clearly not enough to cope with it.

    1. FIXME:
    The right solution would be to have a database of templates with their
    semantics and to decide on their rendering depending on that. E.g.
    {{term}} in en.wiktionary means "replace this with the search term".
    This, however, does not work in other wiktionaries. There are
    also more complex templates, e.g.: {{t+|bg|врата|n|p|tr=vrata|sc=Cyrl}}
    I don't know what it means. Couldn't find any documentation either.
    Again, this template does not work in other dictionaries.

    2. Capitulation notice:
    Given the:
      1. vast amount of wiktionaries available,
      2. abundance of various templates for each wiktionary,
      3. apparent lack of documentation thereof,
      4. the lack of standardized language-independent templates,
    I don't see any way to cope with the template-rendering task within a
    reasonable amount of time.

    Feci quod potui, faciant meliora potentes.
    """

    # Index of the element currently being formatted within the innermost
    # sequence entered by str_seq(); read by str_tmpl().
    seq_pos = 0

    def str_seq(self, tok, env):
        """
        Format a SEQ token, tracking the element position in seq_pos.

        seq_pos is reset to 0 on entry and incremented after each
        element, so a nested sequence overwrites the outer counter.
        """
        parts = []
        self.seq_pos = 0
        for t in tok[1:]:
            parts.append(self.fmtok(t, env))
            self.seq_pos += 1
        return "".join(parts)

    def str_tmpl(self, tok, env):
        """
        Format a TMPL token as bold text rather than as a link.

        A template met while seq_pos is 0 is rendered as a heading line
        of its own; any other template is rendered as an inline bold
        word.  Only the template name, tok[1], is used.
        """
        arg = self.fmtok(tok[1], env)
        if self.seq_pos > 0:
            return " <b>" + arg + "</b>"
        return "<br/><b>" + arg + ":</b><br/>"