diff options
author | Sergey Poznyakoff <gray@gnu.org.ua> | 2008-11-26 09:52:15 +0200 |
---|---|---|
committer | Sergey Poznyakoff <gray@gnu.org.ua> | 2008-11-26 09:52:27 +0200 |
commit | bd79a17ca5082789d4cf82f62a6afc0baaca90e8 (patch) | |
tree | 40c002caaab88586c2c8649bc9cd3ffe2b18bd69 /wiki2html.py | |
parent | 5dc93e466efaaa243e6490961b6e545eaa65f06c (diff) | |
download | wit-bd79a17ca5082789d4cf82f62a6afc0baaca90e8.tar.gz wit-bd79a17ca5082789d4cf82f62a6afc0baaca90e8.tar.bz2 |
Implement plain text conversion.
* wiki2html.py (HtmlWikiMarkup): Move lang, html_base, image_base,
media_base, langtab, str_nil, str_text, fmtok, __str__ to WikiMarkup.
* wikimarkup.py: See above.
* wiki2plain.py: Remove.
* wiki2text.py: New file (replaces wiki2plain.py).
* wikicvt.py: Implement new options.
Diffstat (limited to 'wiki2html.py')
-rw-r--r-- | wiki2html.py | 353 |
1 files changed, 0 insertions, 353 deletions
diff --git a/wiki2html.py b/wiki2html.py index 7441b97..907e3b1 100644 --- a/wiki2html.py +++ b/wiki2html.py @@ -26,321 +26,7 @@ class HtmlWikiMarkup (WikiMarkup): 2. [[official position]]s : final 's' gets after closing </a> tag. Should be before. """ - lang = 'en' - html_base = 'http://%(lang)s.wiktionary.org' - image_base = 'http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf/' - media_base = 'http://www.mediawiki.org/xml/export-0.3' - - def __init__(self, *args, **keywords): - WikiMarkup.__init__(self, *args, **keywords) - if 'lang' in keywords: - self.lang = keywords['lang'] - elif 'html_base' in keywords: - self.html_base = keywords['html_base'] - elif 'image_base' in keywords: - self.image_base = keywords['image_base'] - elif 'media_base' in keywords: - self.media_base = keywords['media_base'] - - # ISO 639 - langtab = { - "aa": "Afar", # Afar - "ab": "Аҧсуа", # Abkhazian - "ae": None, # Avestan - "af": "Afrikaans", # Afrikaans - "ak": "Akana", # Akan # or ak_CI - "als": "Alemannisch", - "am": "አማርኛ", # Amharic - "an": "Aragonés", # Aragonese - "ang": "Englisc", - "ar": "العربية" , # Arabic - "arc": "ܐܪܡܝܐ", - "as": "অসমীয়া", # Assamese - "ast": "Asturian", - "av": "Авар", # Avaric # Spoken mainly in Dagestan - "ay": "Aymar", # Aymara - "az": "Azərbaycan" , # Azerbaijani - - "ba": "Башҡорт", # Bashkir - "bar": "Boarisch", - "bat-smg": "Žemaitėška", - "bcl": "Bikol", - "be": "Беларуская", # Byelorussian; Belarusian - "be-x-old": "Беларуская (тарашкевіца)", - "bg": "Български", # Bulgarian - "bh": "भोजपुरी", # Bihari - "bi": "Bislama", # Bislama - "bm": "Bamanankan", # Bambara - "bn": "বাংলা" , # Bengali; Bangla - "bo": "བོད་སྐད", # Tibetan - "bpy": "ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী" , - "br": "Brezhoneg" , # Breton - "bs": "Bosanski" , # Bosnian - "bug": "Basa Ugi", - "bxr": "Буряад", - - "ca": "Català" , # Catalan - "cbk-zam": "Chavacano de Zamboanga", - "cdo": "Mìng-dĕ̤ng-ngṳ̄", - "cho": "Choctaw", - "ce": "Нохчийн", # Chechen - "ceb": "Sinugboanong 
Binisaya" , # Cebuano - "ch": "Chamor", # Chamorro - "chr": "ᏣᎳᎩ", - "chy": "Tsetsêhestâhese", - "co": "Cors", # Corsican - "cr": "Nehiyaw", # Cree - "crh": "Qırımtatarca", - "cs": "Česky" , # Czech - "csb": "Kaszëbsczi", - "c": "Словѣньскъ", # Church Slavic - "cv": "Чăваш", # Chuvash - "cy": "Cymraeg" , # Welsh - - "da": "Dansk" , # Danish - "de": "Deutsch" , # German - "diq": "Zazaki", # Dimli (Southern Zazaki) - "dsb": "Dolnoserbski", - "dv": "ދިވެހިބަސް", # Divehi - "dz": "ཇོང་ཁ", # Dzongkha; Bhutani - - "ee": "Eʋegbe", # Ewe - "el": "Ελληνικά" , # Greek - "eml": "Emiliàn e rumagnòl", - "en": "English" , # English - "eo": "Esperanto" , - "es": "Español" , # Spanish - "et": "Eesti" , # Estonian - "e": "Euskara" , # Basque - "ext": "Estremeñ", - - "fa": "فارسی" , # Persian - "ff": "Fulfulde", # Fulah # Also NG, MR, and many others - "fi": "Suomi" , # Finnish - "fiu-vro": "Võro", - "fj": "Na Vosa Vakaviti", # Fijian; Fiji - "fo": "Føroyskt" , # Faroese - "fr": "Français" , # French - "frp": "Arpitan", - "fur": "Furlan", - "fy": "Frysk", # Frisian - - "ga": "Gaeilge", # Irish - "gan": "贛語 (Gànyŭ)", - "gd": "Gàidhlig", # Scots; Gaelic - "gl": "Gallego" , # Gallegan; Galician - "glk": "گیلکی", - "got": "𐌲𐌹𐌺 ", - "gn": "Avañe'ẽ", # Guarani - "g": "ગુજરાતી", # Gujarati - "gv": "Gaelg", # Manx - - "ha": "هَوُسَ", # Hausa - "hak": "Hak-kâ-fa / 客家話", - "haw": "Hawai`i", - "he": "עברית" , # Hebrew (formerly iw) - "hi": "हिन्दी" , # Hindi - "hif": "Fiji Hindi", - "ho": "Hiri Mot", # Hiri Motu - "hr": "Hrvatski" , # Croatian - "hsb": "Hornjoserbsce", - "ht": "Krèyol ayisyen" , # Haitian; Haitian Creole - "hu": "Magyar" , # Hungarian - "hy": "Հայերեն", # Armenian - "hz": "Otsiherero", # Herero - - "ia": "Interlingua", - "ie": "Interlingue", - "id": "Bahasa Indonesia", # Indonesian (formerly in) - "ig": "Igbo", # Igbo - "ii": "ꆇꉙ ", # Sichuan Yi - "ik": "Iñupiak", # Inupiak - "ilo": "Ilokano", - "io": "Ido" , - "is": "Íslenska" , # Icelandic - "it": "Italiano" , # Italian - 
"i": "ᐃᓄᒃᑎᑐᑦ", # Inuktitut - - "ja": "日本語", # Japanese - "jbo": "Lojban", - "jv": "Basa Jawa", # Javanese - - "ka": "ქართული" , # Georgian - "kaa": "Qaraqalpaqsha", - "kab": "Taqbaylit", - "kg": "KiKongo", # Kongo # also CD and AO - "ki": "Gĩkũyũ", # Kikuyu - "kj": "Kuanyama", # Kuanyama - "kk": "Қазақша", # Kazakh - "kl": "Kalaallisut", # Kalaallisut; Greenlandic - "km": "ភាសាខ្មែរ", # Khmer; Cambodian - "kn": "ಕನ್ನಡ", # Kannada - "ko": "한국어" , # Korean - "kr": "Kanuri", # Kanuri - "ks": "कश्मीरी / كشميري", # Kashmiri - "ksh": "Ripoarisch", - "ku": "Kurdî / كوردی", # Kurdish - "kv": "Коми", # Komi - "kw": "Kernewek/Karnuack", # Cornish - "ky": "Кыргызча", # Kirghiz - - "la": "Latina" , # Latin - "lad": "Dzhudezmo", - "lb": "Lëtzebuergesch" , # Letzeburgesch - "lbe": "Лакку", - "lg": "Luganda", # Ganda - "li": "Limburgs", # Limburgish; Limburger; Limburgan - "lij": "Lígur", - "ln": "Lingala", # Lingala - "lmo": "Lumbaart", - "lo": "ລາວ", # Lao; Laotian - "lt": "Lietuvių" , # Lithuanian - "l": None, # Luba-Katanga - "lv": "Latvieš" , # Latvian; Lettish - - "map-bms": "Basa Banyumasan", - "mdf": "Мокшень (Mokshanj Kälj)", - "mg": "Malagasy", # Malagasy - "mh": "Ebon", # Marshall - "mi": "Māori", # Maori - "mk": "Македонски" , # Macedonian - "ml": None, # Malayalam - "mn": "Монгол", # Mongolian - "mo": "Молдовеняскэ", # Moldavian - "mr": "मराठी" , # Marathi - "ms": "Bahasa Melay" , # Malay - "mt": "Malti", # Maltese - "mus": "Muskogee", - "my": "မ္ရန္မာစာ", # Burmese - "myv": "Эрзянь (Erzjanj Kelj)", - "mzn": "مَزِروني", - - "na": "dorerin Naoero", # Nauru - "nah": "Nāhuatl", - "nap": "Nnapulitano", - "nb": "Norsk (Bokmål)", # Norwegian Bokm@aa{}l - "nd": None,# Ndebele, North - "nds": "Plattdüütsch", - "nds-nl": "Nedersaksisch", - "ne": "नेपाली", # Nepali - "new": "नेपाल भाषा" , # Nepal Bhasa - "ng": "Oshiwambo", # Ndonga - "nl": "Nederlands" , # Dutch - "nn": "Nynorsk", # Norwegian Nynorsk - "no": "Norsk (Bokmål)" , # Norwegian - "nov": "Novial", - "nr": None, # 
Ndebele, South - "nrm": "Nouormand/Normaund", - "nv": "Diné bizaad", # Navajo - "ny": "Chi-Chewa", # Chichewa; Nyanja - - "oc": "Occitan", # Occitan; Proven@,{c}al - "oj": None, # Ojibwa - "om": "Oromoo", # (Afan) Oromo - "or": "ଓଡ଼ିଆ", # Oriya - "os": "Иронау", # Ossetian; Ossetic - - "pa": "ਪੰਜਾਬੀ" , # Panjabi; Punjabi - "pag": "Pangasinan", - "pam": "Kapampangan", - "pap": "Papiament", - "pdc": "Deitsch", - "pi": "पाऴि", # Pali - "pih": "Norfuk", - "pl": "Polski" , # Polish - "pms": "Piemontèis" , - "ps": "پښتو", # Pashto, Pushto - "pt": "Português" , # Portuguese - - "q": "Runa Simi" , # Quechua - - "rm": "Rumantsch", # Rhaeto-Romance - "rmy": "romani - रोमानी", - "rn": "Kirundi", # Rundi; Kirundi - "ro": "Română" , # Romanian - "roa-rup": "Armãneashce", - "roa-tara": "Tarandíne", - "ru": "Русский" , # Russian - "rw": "Ikinyarwanda", # Kinyarwanda - - "sa": "संस्कृतम्", # Sanskrit - "sah": "Саха тыла (Saxa Tyla)", - "sc": "Sard", # Sardinian - "scn": "Sicilian", - "sco": "Scots", - "sd": "سنڌي، سندھی ، सिन्ध", # Sindhi - "se": "Sámegiella", # Northern Sami - "sg": "Sängö", # Sango; Sangro - "sh": "Srpskohrvatski / Српскохрватски" , - "si": "සිංහල", - "simple": "Simple English" , - "sk": "Slovenčina" , # Slovak - "sl": "Slovenščina" , # Slovenian - "sm": "Gagana Samoa", # Samoan - "sn": "chiShona", # Shona - "so": "Soomaaliga", # Somali - "sr": "Српски / Srpski" , # Serbian - "srn": "Sranantongo", - "ss": "SiSwati", # Swati; Siswati - "st": "Sesotho", # Sesotho; Sotho, Southern - "stk": "Seeltersk", - "s": "Basa Sunda", # Sundanese - "sq": "Shqip" , # Albanian - "szl": "Ślůnski", - "sv": "Svenska" , # Swedish - "sw": "Kiswahili", # Swahili # Also KE - - "ta": "தமிழ்" , # Tamil - "te": "తెలుగు" , # Telugu - "tet": "Tetun", - "tg": "Тоҷикӣ", # Tajik - "th": "ไทย" , # Thai - "ti": "ትግርኛ", # Tigrinya - "tk": "تركمن / Туркмен", # Turkmen - "tl": "Tagalog" , # Tagalog - "tn": "Setswana", # Tswana; Setswana - "to": "faka Tonga", # Tonga (?) 
# Also ZW ; MW - "tokipona": "Tokipona", - "tpi": "Tok Pisin", - "tr": "Türkçe" , # Turkish - "ts": "Xitsonga", # Tsonga # ZA SZ XW - "tt": "Tatarça / Татарча", # Tatar - "tum": "chiTumbuka", - "tw": "Twi", # Twi - "ty": "Reo Mā`ohi", # Tahitian - - "udm": "Удмурт кыл", - "ug": "Oyghurque", # Uighur - "uk": "Українська" , # Ukrainian - "ur": "اردو", # Urdu - "uz": "O‘zbek", # Uzbek - - "ve": "Tshivenda", # Venda - "vec": "Vèneto", - "vi": "Tiếng Việt" , # Vietnamese - "vls": "West-Vlams", - "vo": "Volapük" , - "wa": "Walon", # Walloon - "war": "Winaray", - "wo": "Wolof", # Wolof - "w": "吴语", - - "xal": "Хальмг", - "xh": "isiXhosa", # Xhosa - - "yi": "ייִדיש", # Yiddish (formerly ji) - "yo": "Yorùbá", # Yoruba - - "za": "Cuengh", # Zhuang - "zea": "Zeêuws", - "zh": "中文" , # Chinese - "zh-classical": "古文 / 文言文", - "zm-min-nan": "Bân-lâm-gú", - "zh-yue": "粵語", - "z": "isiZul" # Zulu - } - def target(self, t): (qual,sep,tgt) = t.partition(':') r = None @@ -361,18 +47,10 @@ class HtmlWikiMarkup (WikiMarkup): envhdr = [ "ul", "ol", "dl" ] envel = [ "li", "li", "dd" ] - def str_nil(self, tok, env): - return "" - - def str_text(self, tok, env): - return tok[1] - def str_link(self, tok, env): arg = self.fmtok(tok[1], env) (target, r) = self.target(arg) text = self.fmtok(tok[2], env) - if not text and r: - text = r return "<a href=\"%s\">%s</a>" % (target, text if (text and text != '') \ else \ @@ -424,37 +102,6 @@ class HtmlWikiMarkup (WikiMarkup): s += self.fmtok(t, env) return s - def fmtok(self, tok, env): - if type(tok) != TupleType: - return "" - toktype = tok[0] - if toktype == self.NIL: - return self.str_nil(tok, env) - if toktype == self.TEXT: - return self.str_text(tok, env) - elif toktype == self.LINK: - return self.str_link(tok, env) - elif toktype == self.TMPL: - return self.str_tmpl(tok, env) - elif toktype == self.REF: - return self.str_ref(tok, env) - elif toktype == self.IT: - return self.str_it(tok, env) - elif toktype == self.BOLD: - return 
self.str_bold(tok, env) - elif toktype == self.HDR: - return self.str_hdr(tok, env) - elif toktype == self.BAR: - return self.str_bar(tok, env) - elif toktype == self.ENV: - return self.str_env(tok, env) - elif toktype == self.ITEM: - return self.str_item(tok, env) - elif toktype == self.SEQ: - return self.str_seq(tok, env) - - def __str__(self): - return self.fmtok(self.tree, None) class HtmlWiktionaryMarkup (HtmlWikiMarkup): |