diff options
author | Sergey Poznyakoff <gray@gnu.org.ua> | 2008-11-26 09:52:15 +0200 |
---|---|---|
committer | Sergey Poznyakoff <gray@gnu.org.ua> | 2008-11-26 09:52:27 +0200 |
commit | bd79a17ca5082789d4cf82f62a6afc0baaca90e8 (patch) | |
tree | 40c002caaab88586c2c8649bc9cd3ffe2b18bd69 | |
parent | 5dc93e466efaaa243e6490961b6e545eaa65f06c (diff) | |
download | wikitrans-bd79a17ca5082789d4cf82f62a6afc0baaca90e8.tar.gz wikitrans-bd79a17ca5082789d4cf82f62a6afc0baaca90e8.tar.bz2 |
Implement plain text conversion.
* wiki2html.py (HtmlWikiMarkup): Move lang, html_base, image_base,
media_base, langtab, str_nil, str_text, fmtok, and __str__ to WikiMarkup.
* wikimarkup.py: See above.
* wiki2plain.py: Remove.
* wiki2text.py: New file (replaces wiki2plain.py).
* wikicvt.py: Implement new options (-l/--lang, -t/--text).
-rw-r--r-- | .gitignore | 3 | ||||
-rw-r--r-- | wiki2html.py | 353 | ||||
-rw-r--r-- | wiki2plain.py | 452 | ||||
-rw-r--r-- | wiki2text.py | 163 | ||||
-rw-r--r-- | wikicvt.py | 19 | ||||
-rw-r--r-- | wikimarkup.py | 359 |
6 files changed, 540 insertions, 809 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..aff6316 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +*~ +*.pyc +.emacs.desktop diff --git a/wiki2html.py b/wiki2html.py index 7441b97..907e3b1 100644 --- a/wiki2html.py +++ b/wiki2html.py @@ -26,321 +26,7 @@ class HtmlWikiMarkup (WikiMarkup): 2. [[official position]]s : final 's' gets after closing </a> tag. Should be before. """ - lang = 'en' - html_base = 'http://%(lang)s.wiktionary.org' - image_base = 'http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf/' - media_base = 'http://www.mediawiki.org/xml/export-0.3' - - def __init__(self, *args, **keywords): - WikiMarkup.__init__(self, *args, **keywords) - if 'lang' in keywords: - self.lang = keywords['lang'] - elif 'html_base' in keywords: - self.html_base = keywords['html_base'] - elif 'image_base' in keywords: - self.image_base = keywords['image_base'] - elif 'media_base' in keywords: - self.media_base = keywords['media_base'] - - # ISO 639 - langtab = { - "aa": "Afar", # Afar - "ab": "Аҧсуа", # Abkhazian - "ae": None, # Avestan - "af": "Afrikaans", # Afrikaans - "ak": "Akana", # Akan # or ak_CI - "als": "Alemannisch", - "am": "አማርኛ", # Amharic - "an": "Aragonés", # Aragonese - "ang": "Englisc", - "ar": "العربية" , # Arabic - "arc": "ܐܪܡܝܐ", - "as": "অসমীয়া", # Assamese - "ast": "Asturian", - "av": "Авар", # Avaric # Spoken mainly in Dagestan - "ay": "Aymar", # Aymara - "az": "Azərbaycan" , # Azerbaijani - - "ba": "Башҡорт", # Bashkir - "bar": "Boarisch", - "bat-smg": "Žemaitėška", - "bcl": "Bikol", - "be": "Беларуская", # Byelorussian; Belarusian - "be-x-old": "Беларуская (тарашкевіца)", - "bg": "Български", # Bulgarian - "bh": "भोजपुरी", # Bihari - "bi": "Bislama", # Bislama - "bm": "Bamanankan", # Bambara - "bn": "বাংলা" , # Bengali; Bangla - "bo": "བོད་སྐད", # Tibetan - "bpy": "ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী" , - "br": "Brezhoneg" , # Breton - "bs": "Bosanski" , # Bosnian - "bug": "Basa Ugi", - "bxr": "Буряад", - - "ca": "Català" 
, # Catalan - "cbk-zam": "Chavacano de Zamboanga", - "cdo": "Mìng-dĕ̤ng-ngṳ̄", - "cho": "Choctaw", - "ce": "Нохчийн", # Chechen - "ceb": "Sinugboanong Binisaya" , # Cebuano - "ch": "Chamor", # Chamorro - "chr": "ᏣᎳᎩ", - "chy": "Tsetsêhestâhese", - "co": "Cors", # Corsican - "cr": "Nehiyaw", # Cree - "crh": "Qırımtatarca", - "cs": "Česky" , # Czech - "csb": "Kaszëbsczi", - "c": "Словѣньскъ", # Church Slavic - "cv": "Чăваш", # Chuvash - "cy": "Cymraeg" , # Welsh - - "da": "Dansk" , # Danish - "de": "Deutsch" , # German - "diq": "Zazaki", # Dimli (Southern Zazaki) - "dsb": "Dolnoserbski", - "dv": "ދިވެހިބަސް", # Divehi - "dz": "ཇོང་ཁ", # Dzongkha; Bhutani - - "ee": "Eʋegbe", # Ewe - "el": "Ελληνικά" , # Greek - "eml": "Emiliàn e rumagnòl", - "en": "English" , # English - "eo": "Esperanto" , - "es": "Español" , # Spanish - "et": "Eesti" , # Estonian - "e": "Euskara" , # Basque - "ext": "Estremeñ", - - "fa": "فارسی" , # Persian - "ff": "Fulfulde", # Fulah # Also NG, MR, and many others - "fi": "Suomi" , # Finnish - "fiu-vro": "Võro", - "fj": "Na Vosa Vakaviti", # Fijian; Fiji - "fo": "Føroyskt" , # Faroese - "fr": "Français" , # French - "frp": "Arpitan", - "fur": "Furlan", - "fy": "Frysk", # Frisian - - "ga": "Gaeilge", # Irish - "gan": "贛語 (Gànyŭ)", - "gd": "Gàidhlig", # Scots; Gaelic - "gl": "Gallego" , # Gallegan; Galician - "glk": "گیلکی", - "got": "𐌲𐌹𐌺 ", - "gn": "Avañe'ẽ", # Guarani - "g": "ગુજરાતી", # Gujarati - "gv": "Gaelg", # Manx - - "ha": "هَوُسَ", # Hausa - "hak": "Hak-kâ-fa / 客家話", - "haw": "Hawai`i", - "he": "עברית" , # Hebrew (formerly iw) - "hi": "हिन्दी" , # Hindi - "hif": "Fiji Hindi", - "ho": "Hiri Mot", # Hiri Motu - "hr": "Hrvatski" , # Croatian - "hsb": "Hornjoserbsce", - "ht": "Krèyol ayisyen" , # Haitian; Haitian Creole - "hu": "Magyar" , # Hungarian - "hy": "Հայերեն", # Armenian - "hz": "Otsiherero", # Herero - - "ia": "Interlingua", - "ie": "Interlingue", - "id": "Bahasa Indonesia", # Indonesian (formerly in) - "ig": "Igbo", # Igbo - "ii": 
"ꆇꉙ ", # Sichuan Yi - "ik": "Iñupiak", # Inupiak - "ilo": "Ilokano", - "io": "Ido" , - "is": "Íslenska" , # Icelandic - "it": "Italiano" , # Italian - "i": "ᐃᓄᒃᑎᑐᑦ", # Inuktitut - - "ja": "日本語", # Japanese - "jbo": "Lojban", - "jv": "Basa Jawa", # Javanese - - "ka": "ქართული" , # Georgian - "kaa": "Qaraqalpaqsha", - "kab": "Taqbaylit", - "kg": "KiKongo", # Kongo # also CD and AO - "ki": "Gĩkũyũ", # Kikuyu - "kj": "Kuanyama", # Kuanyama - "kk": "Қазақша", # Kazakh - "kl": "Kalaallisut", # Kalaallisut; Greenlandic - "km": "ភាសាខ្មែរ", # Khmer; Cambodian - "kn": "ಕನ್ನಡ", # Kannada - "ko": "한국어" , # Korean - "kr": "Kanuri", # Kanuri - "ks": "कश्मीरी / كشميري", # Kashmiri - "ksh": "Ripoarisch", - "ku": "Kurdî / كوردی", # Kurdish - "kv": "Коми", # Komi - "kw": "Kernewek/Karnuack", # Cornish - "ky": "Кыргызча", # Kirghiz - - "la": "Latina" , # Latin - "lad": "Dzhudezmo", - "lb": "Lëtzebuergesch" , # Letzeburgesch - "lbe": "Лакку", - "lg": "Luganda", # Ganda - "li": "Limburgs", # Limburgish; Limburger; Limburgan - "lij": "Lígur", - "ln": "Lingala", # Lingala - "lmo": "Lumbaart", - "lo": "ລາວ", # Lao; Laotian - "lt": "Lietuvių" , # Lithuanian - "l": None, # Luba-Katanga - "lv": "Latvieš" , # Latvian; Lettish - - "map-bms": "Basa Banyumasan", - "mdf": "Мокшень (Mokshanj Kälj)", - "mg": "Malagasy", # Malagasy - "mh": "Ebon", # Marshall - "mi": "Māori", # Maori - "mk": "Македонски" , # Macedonian - "ml": None, # Malayalam - "mn": "Монгол", # Mongolian - "mo": "Молдовеняскэ", # Moldavian - "mr": "मराठी" , # Marathi - "ms": "Bahasa Melay" , # Malay - "mt": "Malti", # Maltese - "mus": "Muskogee", - "my": "မ္ရန္မာစာ", # Burmese - "myv": "Эрзянь (Erzjanj Kelj)", - "mzn": "مَزِروني", - - "na": "dorerin Naoero", # Nauru - "nah": "Nāhuatl", - "nap": "Nnapulitano", - "nb": "Norsk (Bokmål)", # Norwegian Bokm@aa{}l - "nd": None,# Ndebele, North - "nds": "Plattdüütsch", - "nds-nl": "Nedersaksisch", - "ne": "नेपाली", # Nepali - "new": "नेपाल भाषा" , # Nepal Bhasa - "ng": "Oshiwambo", # 
Ndonga - "nl": "Nederlands" , # Dutch - "nn": "Nynorsk", # Norwegian Nynorsk - "no": "Norsk (Bokmål)" , # Norwegian - "nov": "Novial", - "nr": None, # Ndebele, South - "nrm": "Nouormand/Normaund", - "nv": "Diné bizaad", # Navajo - "ny": "Chi-Chewa", # Chichewa; Nyanja - - "oc": "Occitan", # Occitan; Proven@,{c}al - "oj": None, # Ojibwa - "om": "Oromoo", # (Afan) Oromo - "or": "ଓଡ଼ିଆ", # Oriya - "os": "Иронау", # Ossetian; Ossetic - - "pa": "ਪੰਜਾਬੀ" , # Panjabi; Punjabi - "pag": "Pangasinan", - "pam": "Kapampangan", - "pap": "Papiament", - "pdc": "Deitsch", - "pi": "पाऴि", # Pali - "pih": "Norfuk", - "pl": "Polski" , # Polish - "pms": "Piemontèis" , - "ps": "پښتو", # Pashto, Pushto - "pt": "Português" , # Portuguese - - "q": "Runa Simi" , # Quechua - - "rm": "Rumantsch", # Rhaeto-Romance - "rmy": "romani - रोमानी", - "rn": "Kirundi", # Rundi; Kirundi - "ro": "Română" , # Romanian - "roa-rup": "Armãneashce", - "roa-tara": "Tarandíne", - "ru": "Русский" , # Russian - "rw": "Ikinyarwanda", # Kinyarwanda - - "sa": "संस्कृतम्", # Sanskrit - "sah": "Саха тыла (Saxa Tyla)", - "sc": "Sard", # Sardinian - "scn": "Sicilian", - "sco": "Scots", - "sd": "سنڌي، سندھی ، सिन्ध", # Sindhi - "se": "Sámegiella", # Northern Sami - "sg": "Sängö", # Sango; Sangro - "sh": "Srpskohrvatski / Српскохрватски" , - "si": "සිංහල", - "simple": "Simple English" , - "sk": "Slovenčina" , # Slovak - "sl": "Slovenščina" , # Slovenian - "sm": "Gagana Samoa", # Samoan - "sn": "chiShona", # Shona - "so": "Soomaaliga", # Somali - "sr": "Српски / Srpski" , # Serbian - "srn": "Sranantongo", - "ss": "SiSwati", # Swati; Siswati - "st": "Sesotho", # Sesotho; Sotho, Southern - "stk": "Seeltersk", - "s": "Basa Sunda", # Sundanese - "sq": "Shqip" , # Albanian - "szl": "Ślůnski", - "sv": "Svenska" , # Swedish - "sw": "Kiswahili", # Swahili # Also KE - - "ta": "தமிழ்" , # Tamil - "te": "తెలుగు" , # Telugu - "tet": "Tetun", - "tg": "Тоҷикӣ", # Tajik - "th": "ไทย" , # Thai - "ti": "ትግርኛ", # Tigrinya - "tk": "تركمن / 
Туркмен", # Turkmen - "tl": "Tagalog" , # Tagalog - "tn": "Setswana", # Tswana; Setswana - "to": "faka Tonga", # Tonga (?) # Also ZW ; MW - "tokipona": "Tokipona", - "tpi": "Tok Pisin", - "tr": "Türkçe" , # Turkish - "ts": "Xitsonga", # Tsonga # ZA SZ XW - "tt": "Tatarça / Татарча", # Tatar - "tum": "chiTumbuka", - "tw": "Twi", # Twi - "ty": "Reo Mā`ohi", # Tahitian - - "udm": "Удмурт кыл", - "ug": "Oyghurque", # Uighur - "uk": "Українська" , # Ukrainian - "ur": "اردو", # Urdu - "uz": "O‘zbek", # Uzbek - - "ve": "Tshivenda", # Venda - "vec": "Vèneto", - "vi": "Tiếng Việt" , # Vietnamese - "vls": "West-Vlams", - "vo": "Volapük" , - "wa": "Walon", # Walloon - "war": "Winaray", - "wo": "Wolof", # Wolof - "w": "吴语", - - "xal": "Хальмг", - "xh": "isiXhosa", # Xhosa - - "yi": "ייִדיש", # Yiddish (formerly ji) - "yo": "Yorùbá", # Yoruba - - "za": "Cuengh", # Zhuang - "zea": "Zeêuws", - "zh": "中文" , # Chinese - "zh-classical": "古文 / 文言文", - "zm-min-nan": "Bân-lâm-gú", - "zh-yue": "粵語", - "z": "isiZul" # Zulu - } - def target(self, t): (qual,sep,tgt) = t.partition(':') r = None @@ -361,18 +47,10 @@ class HtmlWikiMarkup (WikiMarkup): envhdr = [ "ul", "ol", "dl" ] envel = [ "li", "li", "dd" ] - def str_nil(self, tok, env): - return "" - - def str_text(self, tok, env): - return tok[1] - def str_link(self, tok, env): arg = self.fmtok(tok[1], env) (target, r) = self.target(arg) text = self.fmtok(tok[2], env) - if not text and r: - text = r return "<a href=\"%s\">%s</a>" % (target, text if (text and text != '') \ else \ @@ -424,37 +102,6 @@ class HtmlWikiMarkup (WikiMarkup): s += self.fmtok(t, env) return s - def fmtok(self, tok, env): - if type(tok) != TupleType: - return "" - toktype = tok[0] - if toktype == self.NIL: - return self.str_nil(tok, env) - if toktype == self.TEXT: - return self.str_text(tok, env) - elif toktype == self.LINK: - return self.str_link(tok, env) - elif toktype == self.TMPL: - return self.str_tmpl(tok, env) - elif toktype == self.REF: - return 
self.str_ref(tok, env) - elif toktype == self.IT: - return self.str_it(tok, env) - elif toktype == self.BOLD: - return self.str_bold(tok, env) - elif toktype == self.HDR: - return self.str_hdr(tok, env) - elif toktype == self.BAR: - return self.str_bar(tok, env) - elif toktype == self.ENV: - return self.str_env(tok, env) - elif toktype == self.ITEM: - return self.str_item(tok, env) - elif toktype == self.SEQ: - return self.str_seq(tok, env) - - def __str__(self): - return self.fmtok(self.tree, None) class HtmlWiktionaryMarkup (HtmlWikiMarkup): diff --git a/wiki2plain.py b/wiki2plain.py deleted file mode 100644 index 5080298..0000000 --- a/wiki2plain.py +++ /dev/null @@ -1,452 +0,0 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- -# Copyright (C) 2008 Sergey Poznyakoff -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3, or (at your option) -# any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see <http://www.gnu.org/licenses/>. - -from wikimarkup import * -from types import TupleType -import urllib - -class PlainMarkup (WikiMarkup): - """ - A (general-purpose Wiki->Text translator class. 
- """ - lang = 'en' - html_base = 'http://%(lang)s.wiktionary.org' - image_base = 'http://nie.wiem.gdzie' - media_base = 'http://www.mediawiki.org/xml/export-0.3' - - def __init__(self, *args, **keywords): - WikiMarkup.__init__(self, *args, **keywords) - if 'lang' in keywords: - self.lang = keywords['lang'] - elif 'html_base' in keywords: - self.html_base = keywords['html_base'] - elif 'image_base' in keywords: - self.image_base = keywords['image_base'] - elif 'media_base' in keywords: - self.media_base = keywords['media_base'] - - # ISO 639 - langtab = { - "aa": "Afar", # Afar - "ab": "Аҧсуа", # Abkhazian - "ae": None, # Avestan - "af": "Afrikaans", # Afrikaans - "ak": "Akana", # Akan # or ak_CI - "als": "Alemannisch", - "am": "አማርኛ", # Amharic - "an": "Aragonés", # Aragonese - "ang": "Englisc", - "ar": "العربية" , # Arabic - "arc": "ܐܪܡܝܐ", - "as": "অসমীয়া", # Assamese - "ast": "Asturian", - "av": "Авар", # Avaric # Spoken mainly in Dagestan - "ay": "Aymar", # Aymara - "az": "Azərbaycan" , # Azerbaijani - - "ba": "Башҡорт", # Bashkir - "bar": "Boarisch", - "bat-smg": "Žemaitėška", - "bcl": "Bikol", - "be": "Беларуская", # Byelorussian; Belarusian - "be-x-old": "Беларуская (тарашкевіца)", - "bg": "Български", # Bulgarian - "bh": "भोजपुरी", # Bihari - "bi": "Bislama", # Bislama - "bm": "Bamanankan", # Bambara - "bn": "বাংলা" , # Bengali; Bangla - "bo": "བོད་སྐད", # Tibetan - "bpy": "ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী" , - "br": "Brezhoneg" , # Breton - "bs": "Bosanski" , # Bosnian - "bug": "Basa Ugi", - "bxr": "Буряад", - - "ca": "Català" , # Catalan - "cbk-zam": "Chavacano de Zamboanga", - "cdo": "Mìng-dĕ̤ng-ngṳ̄", - "cho": "Choctaw", - "ce": "Нохчийн", # Chechen - "ceb": "Sinugboanong Binisaya" , # Cebuano - "ch": "Chamor", # Chamorro - "chr": "ᏣᎳᎩ", - "chy": "Tsetsêhestâhese", - "co": "Cors", # Corsican - "cr": "Nehiyaw", # Cree - "crh": "Qırımtatarca", - "cs": "Česky" , # Czech - "csb": "Kaszëbsczi", - "c": "Словѣньскъ", # Church Slavic - "cv": "Чăваш", # Chuvash 
- "cy": "Cymraeg" , # Welsh - - "da": "Dansk" , # Danish - "de": "Deutsch" , # German - "diq": "Zazaki", # Dimli (Southern Zazaki) - "dsb": "Dolnoserbski", - "dv": "ދިވެހިބަސް", # Divehi - "dz": "ཇོང་ཁ", # Dzongkha; Bhutani - - "ee": "Eʋegbe", # Ewe - "el": "Ελληνικά" , # Greek - "eml": "Emiliàn e rumagnòl", - "en": "English" , # English - "eo": "Esperanto" , - "es": "Español" , # Spanish - "et": "Eesti" , # Estonian - "e": "Euskara" , # Basque - "ext": "Estremeñ", - - "fa": "فارسی" , # Persian - "ff": "Fulfulde", # Fulah # Also NG, MR, and many others - "fi": "Suomi" , # Finnish - "fiu-vro": "Võro", - "fj": "Na Vosa Vakaviti", # Fijian; Fiji - "fo": "Føroyskt" , # Faroese - "fr": "Français" , # French - "frp": "Arpitan", - "fur": "Furlan", - "fy": "Frysk", # Frisian - - "ga": "Gaeilge", # Irish - "gan": "贛語 (Gànyŭ)", - "gd": "Gàidhlig", # Scots; Gaelic - "gl": "Gallego" , # Gallegan; Galician - "glk": "گیلکی", - "got": "𐌲𐌹𐌺 ", - "gn": "Avañe'ẽ", # Guarani - "g": "ગુજરાતી", # Gujarati - "gv": "Gaelg", # Manx - - "ha": "هَوُسَ", # Hausa - "hak": "Hak-kâ-fa / 客家話", - "haw": "Hawai`i", - "he": "עברית" , # Hebrew (formerly iw) - "hi": "हिन्दी" , # Hindi - "hif": "Fiji Hindi", - "ho": "Hiri Mot", # Hiri Motu - "hr": "Hrvatski" , # Croatian - "hsb": "Hornjoserbsce", - "ht": "Krèyol ayisyen" , # Haitian; Haitian Creole - "hu": "Magyar" , # Hungarian - "hy": "Հայերեն", # Armenian - "hz": "Otsiherero", # Herero - - "ia": "Interlingua", - "ie": "Interlingue", - "id": "Bahasa Indonesia", # Indonesian (formerly in) - "ig": "Igbo", # Igbo - "ii": "ꆇꉙ ", # Sichuan Yi - "ik": "Iñupiak", # Inupiak - "ilo": "Ilokano", - "io": "Ido" , - "is": "Íslenska" , # Icelandic - "it": "Italiano" , # Italian - "i": "ᐃᓄᒃᑎᑐᑦ", # Inuktitut - - "ja": "日本語", # Japanese - "jbo": "Lojban", - "jv": "Basa Jawa", # Javanese - - "ka": "ქართული" , # Georgian - "kaa": "Qaraqalpaqsha", - "kab": "Taqbaylit", - "kg": "KiKongo", # Kongo # also CD and AO - "ki": "Gĩkũyũ", # Kikuyu - "kj": "Kuanyama", # Kuanyama 
- "kk": "Қазақша", # Kazakh - "kl": "Kalaallisut", # Kalaallisut; Greenlandic - "km": "ភាសាខ្មែរ", # Khmer; Cambodian - "kn": "ಕನ್ನಡ", # Kannada - "ko": "한국어" , # Korean - "kr": "Kanuri", # Kanuri - "ks": "कश्मीरी / كشميري", # Kashmiri - "ksh": "Ripoarisch", - "ku": "Kurdî / كوردی", # Kurdish - "kv": "Коми", # Komi - "kw": "Kernewek/Karnuack", # Cornish - "ky": "Кыргызча", # Kirghiz - - "la": "Latina" , # Latin - "lad": "Dzhudezmo", - "lb": "Lëtzebuergesch" , # Letzeburgesch - "lbe": "Лакку", - "lg": "Luganda", # Ganda - "li": "Limburgs", # Limburgish; Limburger; Limburgan - "lij": "Lígur", - "ln": "Lingala", # Lingala - "lmo": "Lumbaart", - "lo": "ລາວ", # Lao; Laotian - "lt": "Lietuvių" , # Lithuanian - "l": None, # Luba-Katanga - "lv": "Latvieš" , # Latvian; Lettish - - "map-bms": "Basa Banyumasan", - "mdf": "Мокшень (Mokshanj Kälj)", - "mg": "Malagasy", # Malagasy - "mh": "Ebon", # Marshall - "mi": "Māori", # Maori - "mk": "Македонски" , # Macedonian - "ml": None, # Malayalam - "mn": "Монгол", # Mongolian - "mo": "Молдовеняскэ", # Moldavian - "mr": "मराठी" , # Marathi - "ms": "Bahasa Melay" , # Malay - "mt": "Malti", # Maltese - "mus": "Muskogee", - "my": "မ္ရန္မာစာ", # Burmese - "myv": "Эрзянь (Erzjanj Kelj)", - "mzn": "مَزِروني", - - "na": "dorerin Naoero", # Nauru - "nah": "Nāhuatl", - "nap": "Nnapulitano", - "nb": "Norsk (Bokmål)", # Norwegian Bokm@aa{}l - "nd": None,# Ndebele, North - "nds": "Plattdüütsch", - "nds-nl": "Nedersaksisch", - "ne": "नेपाली", # Nepali - "new": "नेपाल भाषा" , # Nepal Bhasa - "ng": "Oshiwambo", # Ndonga - "nl": "Nederlands" , # Dutch - "nn": "Nynorsk", # Norwegian Nynorsk - "no": "Norsk (Bokmål)" , # Norwegian - "nov": "Novial", - "nr": None, # Ndebele, South - "nrm": "Nouormand/Normaund", - "nv": "Diné bizaad", # Navajo - "ny": "Chi-Chewa", # Chichewa; Nyanja - - "oc": "Occitan", # Occitan; Proven@,{c}al - "oj": None, # Ojibwa - "om": "Oromoo", # (Afan) Oromo - "or": "ଓଡ଼ିଆ", # Oriya - "os": "Иронау", # Ossetian; Ossetic - - "pa": 
"ਪੰਜਾਬੀ" , # Panjabi; Punjabi - "pag": "Pangasinan", - "pam": "Kapampangan", - "pap": "Papiament", - "pdc": "Deitsch", - "pi": "पाऴि", # Pali - "pih": "Norfuk", - "pl": "Polski" , # Polish - "pms": "Piemontèis" , - "ps": "پښتو", # Pashto, Pushto - "pt": "Português" , # Portuguese - - "q": "Runa Simi" , # Quechua - - "rm": "Rumantsch", # Rhaeto-Romance - "rmy": "romani - रोमानी", - "rn": "Kirundi", # Rundi; Kirundi - "ro": "Română" , # Romanian - "roa-rup": "Armãneashce", - "roa-tara": "Tarandíne", - "ru": "Русский" , # Russian - "rw": "Ikinyarwanda", # Kinyarwanda - - "sa": "संस्कृतम्", # Sanskrit - "sah": "Саха тыла (Saxa Tyla)", - "sc": "Sard", # Sardinian - "scn": "Sicilian", - "sco": "Scots", - "sd": "سنڌي، سندھی ، सिन्ध", # Sindhi - "se": "Sámegiella", # Northern Sami - "sg": "Sängö", # Sango; Sangro - "sh": "Srpskohrvatski / Српскохрватски" , - "si": "සිංහල", - "simple": "Simple English" , - "sk": "Slovenčina" , # Slovak - "sl": "Slovenščina" , # Slovenian - "sm": "Gagana Samoa", # Samoan - "sn": "chiShona", # Shona - "so": "Soomaaliga", # Somali - "sr": "Српски / Srpski" , # Serbian - "srn": "Sranantongo", - "ss": "SiSwati", # Swati; Siswati - "st": "Sesotho", # Sesotho; Sotho, Southern - "stk": "Seeltersk", - "s": "Basa Sunda", # Sundanese - "sq": "Shqip" , # Albanian - "szl": "Ślůnski", - "sv": "Svenska" , # Swedish - "sw": "Kiswahili", # Swahili # Also KE - - "ta": "தமிழ்" , # Tamil - "te": "తెలుగు" , # Telugu - "tet": "Tetun", - "tg": "Тоҷикӣ", # Tajik - "th": "ไทย" , # Thai - "ti": "ትግርኛ", # Tigrinya - "tk": "تركمن / Туркмен", # Turkmen - "tl": "Tagalog" , # Tagalog - "tn": "Setswana", # Tswana; Setswana - "to": "faka Tonga", # Tonga (?) 
# Also ZW ; MW - "tokipona": "Tokipona", - "tpi": "Tok Pisin", - "tr": "Türkçe" , # Turkish - "ts": "Xitsonga", # Tsonga # ZA SZ XW - "tt": "Tatarça / Татарча", # Tatar - "tum": "chiTumbuka", - "tw": "Twi", # Twi - "ty": "Reo Mā`ohi", # Tahitian - - "udm": "Удмурт кыл", - "ug": "Oyghurque", # Uighur - "uk": "Українська" , # Ukrainian - "ur": "اردو", # Urdu - "uz": "O‘zbek", # Uzbek - - "ve": "Tshivenda", # Venda - "vec": "Vèneto", - "vi": "Tiếng Việt" , # Vietnamese - "vls": "West-Vlams", - "vo": "Volapük" , - - "wa": "Walon", # Walloon - "war": "Winaray", - "wo": "Wolof", # Wolof - "w": "吴语", - - "xal": "Хальмг", - "xh": "isiXhosa", # Xhosa - - "yi": "ייִדיש", # Yiddish (formerly ji) - "yo": "Yorùbá", # Yoruba - - "za": "Cuengh", # Zhuang - "zea": "Zeêuws", - "zh": "中文" , # Chinese - "zh-classical": "古文 / 文言文", - "zm-min-nan": "Bân-lâm-gú", - "zh-yue": "粵語", - "z": "isiZul" # Zulu - } - - def target(self, t): - (qual,sep,tgt) = t.partition(':') - r = None - if tgt != '': - if qual == "Image": - t = self.image_base + '/' + urllib.quote(tgt) - elif qual == "Media": - t = self.media_base + '/' + tgt - elif qual in self.langtab: - t = self.html_base % { 'lang' : qual } + '/' + urllib.quote(tgt) - r = self.langtab[qual] - else: - t = self.html_base % { 'lang' : self.lang } + '/' + urllib.quote(t) - else: - t = self.html_base % { 'lang' : self.lang } + '/' + urllib.quote(t) - return t, r - - envhdr = [ "ul", "ol", "dl" ] - envel = [ "li", "li", "dd" ] - - def str_nil(self, tok, env): - return "" - - def str_text(self, tok, env): - return tok[1] - - def str_link(self, tok, env): - arg = self.fmtok(tok[1], env) - (target, r) = self.target(arg) - text = self.fmtok(tok[2], env) - if not text and r: - text = r - return "%s" % (text if (text and text != '') \ - else \ - r if r else arg) - - def str_tmpl(self, tok, env): - arg = self.fmtok(tok[1], env) - (target, r) = self.target(arg) - text = self.fmtok(tok[2], env) - return "%s" % (text if (text and text != '') \ - else arg) 
- - def str_ref(self, tok, env): - target = self.fmtok(tok[1], env) - text = self.fmtok(tok[2], env) - return "%s" % (text if (text and text != '') \ - else target) - - def str_it(self, tok, env): - return "<i>" + self.fmtok(tok[1], env) + "</i>" - - def str_bold(self, tok, env): - return "<b>" + self.fmtok(tok[1], env) + "</b>" - - def str_hdr(self, tok, env): - level = tok[1] - if level > 4: - level = 4 - return "<h%s>%s</h%s>" % (level, self.fmtok(tok[2], env), level) - - def str_bar(self, tok, env): - return "-----------------" - - def str_env(self, tok, env): - t = tok[1] - return "<" + self.envhdr[t] + ">" + \ - self.fmtok(tok[3], tok) + \ - "</" + self.envhdr[t] + ">" - - def str_item(self, tok, env): - return "<%s>%s</%s>" % (self.envel[env[1]], - self.fmtok(tok[1], env), - self.envel[env[1]]) - - def str_seq(self, tok, env): - s = "" - for t in tok[1:]: - s += self.fmtok(t, env) - return s - - def fmtok(self, tok, env): - if type(tok) != TupleType: - return "" - toktype = tok[0] - if toktype == self.NIL: - return self.str_nil(tok, env) - if toktype == self.TEXT: - return self.str_text(tok, env) - elif toktype == self.LINK: - return self.str_link(tok, env) - elif toktype == self.TMPL: - return self.str_tmpl(tok, env) - elif toktype == self.REF: - return self.str_ref(tok, env) - elif toktype == self.IT: - return self.str_it(tok, env) - elif toktype == self.BOLD: - return self.str_bold(tok, env) - elif toktype == self.HDR: - return self.str_hdr(tok, env) - elif toktype == self.BAR: - return self.str_bar(tok, env) - elif toktype == self.ENV: - return self.str_env(tok, env) - elif toktype == self.ITEM: - return self.str_item(tok, env) - elif toktype == self.SEQ: - return self.str_seq(tok, env) - - def __str__(self): - return self.fmtok(self.tree, None) diff --git a/wiki2text.py b/wiki2text.py new file mode 100644 index 0000000..e943f32 --- /dev/null +++ b/wiki2text.py @@ -0,0 +1,163 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +# Copyright (C) 2008 Sergey 
Poznyakoff +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +from wikimarkup import * +from types import TupleType +import urllib + +class TextWikiMarkup (WikiMarkup): + """ + A (general-purpose Wiki->Text translator class. + """ + + width = 80 + num = 0 + references = False + + def __init__(self, *args, **keywords): + WikiMarkup.__init__(self, *args, **keywords) + if 'width' in keywords: + self.width = keywords['width'] + elif 'refs' in keywords: + self.references = keywords['refs'] + + def target(self, t): + (qual,sep,tgt) = t.partition(':') + r = None + if tgt != '': + if qual == "Image": + t = self.image_base + '/' + urllib.quote(tgt) + elif qual == "Media": + t = self.media_base + '/' + tgt + elif qual in self.langtab: + t = self.html_base % { 'lang' : qual } + '/' + urllib.quote(tgt) + r = self.langtab[qual] + else: + t = self.html_base % { 'lang' : self.lang } + '/' + urllib.quote(t) + else: + t = self.html_base % { 'lang' : self.lang } + '/' + urllib.quote(t) + return t, r + + def xref(self, text, target): + if text: + return "%s (see %s) " % (text, target) + else: + return "see " + target + + def str_link(self, tok, env): + arg = self.fmtok(tok[1], env) + text = self.fmtok(tok[2], env) + if self.references: + (target, r) = self.target(arg) + return self.xref(text if text else r, target) + else: + (qual,sep,tgt) = arg.partition(':') + if sep != '': + return "" + elif text: + return 
text + return arg + + def str_tmpl(self, tok, env): + arg = self.fmtok(tok[1], env) + (target, r) = self.target(arg) + text = self.fmtok(tok[2], env) + if not text and r: + text = r + if self.references: + return self.xref(text, target) + return text + + def str_ref(self, tok, env): + return self.xref(self.fmtok(tok[2], env), self.fmtok(tok[1], env)) + + def str_it(self, tok, env): + return "_" + self.fmtok(tok[1], env) + "_" + + def str_bold(self, tok, env): + return self.fmtok(tok[1], env).upper() + + def str_hdr(self, tok, env): + level = tok[1] + return "\n\n" + ("*" * level) + " " + self.fmtok(tok[2], env) + "\n\n" + + def str_bar(self, tok, env): + w = self.width + if w < 5: + w = 5 + return "\n" + ("-" * (w - 5)).center(w - 1) + "\n" + + def str_env(self, tok, env): + self.num = 1 + return "\n" + self.fmtok(tok[3], tok) + + def indent (self, lev, text): + w = self.width + self.width = w - lev + if text.find('\n') == -1: + s = (" " * lev) + text + else: + s = "" + for elt in text.split('\n'): + s += (" " * lev) + elt + if elt == '': + s += "\n" + + self.width = w + return s + + def str_item(self, tok, env): + t = env[1] + lev = env[2] + if lev > self.width - 4: + lev = 1 + if t == self.INDENT: + return self.indent(lev, self.fmtok(tok[1], env)) + elif t == self.ENVNUM: + n = self.num + self.num += 1 + return "" + self.indent(lev, + "%d. 
%s" % (n, self.fmtok(tok[1], env))) + elif t == self.ENVUNNUM: + return "" + self.indent(lev, + "- " + self.fmtok(tok[1], env)) + + def __str__(self): + return self.fmtok(self.tree, None) + +class TextWiktionaryMarkup (TextWikiMarkup): + """ + See documentation for HtmlWiktionaryMarkup + """ + + seq_pos = 0 + + def str_seq(self, tok, env): + s = "" + self.seq_pos=0 + for t in tok[1:]: + s += self.fmtok(t, env) + self.seq_pos += 1 + return s + + def str_tmpl(self, tok, env): + arg = self.fmtok(tok[1], env) + if self.seq_pos > 0: + return arg + else: + return "\n" + arg + ":\n" + @@ -18,16 +18,19 @@ import sys import getopt from wiki2html import * +from wiki2text import * def usage(code=0): - print "usage: " + sys.argv[0] + "[-hv] [--help] [--verbose] file\n" + print "usage: " + sys.argv[0] + "[-hvt] [-l lang] [--lang=lang] [--text] [--help] [--verbose] file\n" sys.exit(code) def main(): verbose_flag = 0 + html = 1 + lang = "pl" try: - opts, args = getopt.getopt(sys.argv[1:], "hv", - ["help", "verbose" ]) + opts, args = getopt.getopt(sys.argv[1:], "hl:tv" |