about summary refs log tree commit diff
diff options
context:
space:
mode:
author Sergey Poznyakoff <gray@gnu.org.ua> 2008-11-26 09:52:15 +0200
committer Sergey Poznyakoff <gray@gnu.org.ua> 2008-11-26 09:52:27 +0200
commit bd79a17ca5082789d4cf82f62a6afc0baaca90e8 (patch)
tree 40c002caaab88586c2c8649bc9cd3ffe2b18bd69
parent 5dc93e466efaaa243e6490961b6e545eaa65f06c (diff)
download wit-bd79a17ca5082789d4cf82f62a6afc0baaca90e8.tar.gz
wit-bd79a17ca5082789d4cf82f62a6afc0baaca90e8.tar.bz2
Implement plain text conversion.
* wiki2html.py (HtmlWikiMarkup): Move lang, html_base, image_base, media_base, langtab, str_nil, str_text, fmtok, __str__ to WikiMarkup.
* wikimarkup.py: See above.
* wiki2plain.py: Remove.
* wiki2text.py: New file (instead of the above).
* wikicvt.py: Implement new options.
-rw-r--r-- .gitignore 3
-rw-r--r-- wiki2html.py 353
-rw-r--r-- wiki2plain.py 452
-rw-r--r-- wiki2text.py 163
-rw-r--r-- wikicvt.py 19
-rw-r--r-- wikimarkup.py 359
6 files changed, 540 insertions, 809 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..aff6316
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+*~
+*.pyc
+.emacs.desktop
diff --git a/wiki2html.py b/wiki2html.py
index 7441b97..907e3b1 100644
--- a/wiki2html.py
+++ b/wiki2html.py
@@ -26,321 +26,7 @@ class HtmlWikiMarkup (WikiMarkup):
2. [[official position]]s : final 's' gets after closing </a> tag.
Should be before.
"""
- lang = 'en'
- html_base = 'http://%(lang)s.wiktionary.org'
- image_base = 'http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf/'
- media_base = 'http://www.mediawiki.org/xml/export-0.3'
-
- def __init__(self, *args, **keywords):
- WikiMarkup.__init__(self, *args, **keywords)
- if 'lang' in keywords:
- self.lang = keywords['lang']
- elif 'html_base' in keywords:
- self.html_base = keywords['html_base']
- elif 'image_base' in keywords:
- self.image_base = keywords['image_base']
- elif 'media_base' in keywords:
- self.media_base = keywords['media_base']
-
- # ISO 639
- langtab = {
- "aa": "Afar", # Afar
- "ab": "Аҧсуа", # Abkhazian
- "ae": None, # Avestan
- "af": "Afrikaans", # Afrikaans
- "ak": "Akana", # Akan # or ak_CI
- "als": "Alemannisch",
- "am": "አማርኛ", # Amharic
- "an": "Aragonés", # Aragonese
- "ang": "Englisc",
- "ar": "العربية" , # Arabic
- "arc": "ܐܪܡܝܐ",
- "as": "অসমীয়া", # Assamese
- "ast": "Asturian",
- "av": "Авар", # Avaric # Spoken mainly in Dagestan
- "ay": "Aymar", # Aymara
- "az": "Azərbaycan" , # Azerbaijani
-
- "ba": "Башҡорт", # Bashkir
- "bar": "Boarisch",
- "bat-smg": "Žemaitėška",
- "bcl": "Bikol",
- "be": "Беларуская", # Byelorussian; Belarusian
- "be-x-old": "Беларуская (тарашкевіца)",
- "bg": "Български", # Bulgarian
- "bh": "भोजपुरी", # Bihari
- "bi": "Bislama", # Bislama
- "bm": "Bamanankan", # Bambara
- "bn": "বাংলা" , # Bengali; Bangla
- "bo": "བོད་སྐད", # Tibetan
- "bpy": "ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী" ,
- "br": "Brezhoneg" , # Breton
- "bs": "Bosanski" , # Bosnian
- "bug": "Basa Ugi",
- "bxr": "Буряад",
-
- "ca": "Català" , # Catalan
- "cbk-zam": "Chavacano de Zamboanga",
- "cdo": "Mìng-dĕ̤ng-ngṳ̄",
- "cho": "Choctaw",
- "ce": "Нохчийн", # Chechen
- "ceb": "Sinugboanong Binisaya" , # Cebuano
- "ch": "Chamor", # Chamorro
- "chr": "ᏣᎳᎩ",
- "chy": "Tsetsêhestâhese",
- "co": "Cors", # Corsican
- "cr": "Nehiyaw", # Cree
- "crh": "Qırımtatarca",
- "cs": "Česky" , # Czech
- "csb": "Kaszëbsczi",
- "c": "Словѣньскъ", # Church Slavic
- "cv": "Чăваш", # Chuvash
- "cy": "Cymraeg" , # Welsh
-
- "da": "Dansk" , # Danish
- "de": "Deutsch" , # German
- "diq": "Zazaki", # Dimli (Southern Zazaki)
- "dsb": "Dolnoserbski",
- "dv": "ދިވެހިބަސް", # Divehi
- "dz": "ཇོང་ཁ", # Dzongkha; Bhutani
-
- "ee": "Eʋegbe", # Ewe
- "el": "Ελληνικά" , # Greek
- "eml": "Emiliàn e rumagnòl",
- "en": "English" , # English
- "eo": "Esperanto" ,
- "es": "Español" , # Spanish
- "et": "Eesti" , # Estonian
- "e": "Euskara" , # Basque
- "ext": "Estremeñ",
-
- "fa": "فارسی" , # Persian
- "ff": "Fulfulde", # Fulah # Also NG, MR, and many others
- "fi": "Suomi" , # Finnish
- "fiu-vro": "Võro",
- "fj": "Na Vosa Vakaviti", # Fijian; Fiji
- "fo": "Føroyskt" , # Faroese
- "fr": "Français" , # French
- "frp": "Arpitan",
- "fur": "Furlan",
- "fy": "Frysk", # Frisian
-
- "ga": "Gaeilge", # Irish
- "gan": "贛語 (Gànyŭ)",
- "gd": "Gàidhlig", # Scots; Gaelic
- "gl": "Gallego" , # Gallegan; Galician
- "glk": "گیلکی",
- "got": "𐌲Œ„𐌹𐌺 ",
- "gn": "Avañe'ẽ", # Guarani
- "g": "ગુજરાતી", # Gujarati
- "gv": "Gaelg", # Manx
-
- "ha": "هَوُسَ", # Hausa
- "hak": "Hak-kâ-fa / 客家話",
- "haw": "Hawai`i",
- "he": "עברית" , # Hebrew (formerly iw)
- "hi": "हिन्दी" , # Hindi
- "hif": "Fiji Hindi",
- "ho": "Hiri Mot", # Hiri Motu
- "hr": "Hrvatski" , # Croatian
- "hsb": "Hornjoserbsce",
- "ht": "Krèyol ayisyen" , # Haitian; Haitian Creole
- "hu": "Magyar" , # Hungarian
- "hy": "Հայերեն", # Armenian
- "hz": "Otsiherero", # Herero
-
- "ia": "Interlingua",
- "ie": "Interlingue",
- "id": "Bahasa Indonesia", # Indonesian (formerly in)
- "ig": "Igbo", # Igbo
- "ii": "ꆇꉙ ", # Sichuan Yi
- "ik": "Iñupiak", # Inupiak
- "ilo": "Ilokano",
- "io": "Ido" ,
- "is": "Íslenska" , # Icelandic
- "it": "Italiano" , # Italian
- "i": "ᐃᓄᒃᑎᑐᑦ", # Inuktitut
-
- "ja": "日本語", # Japanese
- "jbo": "Lojban",
- "jv": "Basa Jawa", # Javanese
-
- "ka": "ქართული" , # Georgian
- "kaa": "Qaraqalpaqsha",
- "kab": "Taqbaylit",
- "kg": "KiKongo", # Kongo # also CD and AO
- "ki": "Gĩkũyũ", # Kikuyu
- "kj": "Kuanyama", # Kuanyama
- "kk": "Қазақша", # Kazakh
- "kl": "Kalaallisut", # Kalaallisut; Greenlandic
- "km": "ភាសាខ្មែរ", # Khmer; Cambodian
- "kn": "ಕನ್ನಡ", # Kannada
- "ko": "한국어" , # Korean
- "kr": "Kanuri", # Kanuri
- "ks": "कश्मीरी / كشميري", # Kashmiri
- "ksh": "Ripoarisch",
- "ku": "Kurdî / كوردی", # Kurdish
- "kv": "Коми", # Komi
- "kw": "Kernewek/Karnuack", # Cornish
- "ky": "Кыргызча", # Kirghiz
-
- "la": "Latina" , # Latin
- "lad": "Dzhudezmo",
- "lb": "Lëtzebuergesch" , # Letzeburgesch
- "lbe": "Лакку",
- "lg": "Luganda", # Ganda
- "li": "Limburgs", # Limburgish; Limburger; Limburgan
- "lij": "Lígur",
- "ln": "Lingala", # Lingala
- "lmo": "Lumbaart",
- "lo": "ລາວ", # Lao; Laotian
- "lt": "Lietuvių" , # Lithuanian
- "l": None, # Luba-Katanga
- "lv": "Latvieš" , # Latvian; Lettish
-
- "map-bms": "Basa Banyumasan",
- "mdf": "Мокшень (Mokshanj Kälj)",
- "mg": "Malagasy", # Malagasy
- "mh": "Ebon", # Marshall
- "mi": "Māori", # Maori
- "mk": "Македонски" , # Macedonian
- "ml": None, # Malayalam
- "mn": "Монгол", # Mongolian
- "mo": "Молдовеняскэ", # Moldavian
- "mr": "मराठी" , # Marathi
- "ms": "Bahasa Melay" , # Malay
- "mt": "Malti", # Maltese
- "mus": "Muskogee",
- "my": "မ္ရန္‌မာစာ", # Burmese
- "myv": "Эрзянь (Erzjanj Kelj)",
- "mzn": "مَزِروني",
-
- "na": "dorerin Naoero", # Nauru
- "nah": "Nāhuatl",
- "nap": "Nnapulitano",
- "nb": "Norsk (Bokmål)", # Norwegian Bokm@aa{}l
- "nd": None,# Ndebele, North
- "nds": "Plattdüütsch",
- "nds-nl": "Nedersaksisch",
- "ne": "नेपाली", # Nepali
- "new": "नेपाल भाषा" , # Nepal Bhasa
- "ng": "Oshiwambo", # Ndonga
- "nl": "Nederlands" , # Dutch
- "nn": "Nynorsk", # Norwegian Nynorsk
- "no": "Norsk (Bokmål)" , # Norwegian
- "nov": "Novial",
- "nr": None, # Ndebele, South
- "nrm": "Nouormand/Normaund",
- "nv": "Diné bizaad", # Navajo
- "ny": "Chi-Chewa", # Chichewa; Nyanja
-
- "oc": "Occitan", # Occitan; Proven@,{c}al
- "oj": None, # Ojibwa
- "om": "Oromoo", # (Afan) Oromo
- "or": "ଓଡ଼ିଆ", # Oriya
- "os": "Иронау", # Ossetian; Ossetic
-
- "pa": "ਪੰਜਾਬੀ" , # Panjabi; Punjabi
- "pag": "Pangasinan",
- "pam": "Kapampangan",
- "pap": "Papiament",
- "pdc": "Deitsch",
- "pi": "पाऴि", # Pali
- "pih": "Norfuk",
- "pl": "Polski" , # Polish
- "pms": "Piemontèis" ,
- "ps": "پښتو", # Pashto, Pushto
- "pt": "Português" , # Portuguese
-
- "q": "Runa Simi" , # Quechua
-
- "rm": "Rumantsch", # Rhaeto-Romance
- "rmy": "romani - रोमानी",
- "rn": "Kirundi", # Rundi; Kirundi
- "ro": "Română" , # Romanian
- "roa-rup": "Armãneashce",
- "roa-tara": "Tarandíne",
- "ru": "Русский" , # Russian
- "rw": "Ikinyarwanda", # Kinyarwanda
-
- "sa": "संस्कृतम्", # Sanskrit
- "sah": "Саха тыла (Saxa Tyla)",
- "sc": "Sard", # Sardinian
- "scn": "Sicilian",
- "sco": "Scots",
- "sd": "سنڌي، سندھی ، सिन्ध", # Sindhi
- "se": "Sámegiella", # Northern Sami
- "sg": "Sängö", # Sango; Sangro
- "sh": "Srpskohrvatski / Српскохрватски" ,
- "si": "සිංහල",
- "simple": "Simple English" ,
- "sk": "Slovenčina" , # Slovak
- "sl": "Slovenščina" , # Slovenian
- "sm": "Gagana Samoa", # Samoan
- "sn": "chiShona", # Shona
- "so": "Soomaaliga", # Somali
- "sr": "Српски / Srpski" , # Serbian
- "srn": "Sranantongo",
- "ss": "SiSwati", # Swati; Siswati
- "st": "Sesotho", # Sesotho; Sotho, Southern
- "stk": "Seeltersk",
- "s": "Basa Sunda", # Sundanese
- "sq": "Shqip" , # Albanian
- "szl": "Ślůnski",
- "sv": "Svenska" , # Swedish
- "sw": "Kiswahili", # Swahili # Also KE
-
- "ta": "தமிழ்" , # Tamil
- "te": "తెలుగు" , # Telugu
- "tet": "Tetun",
- "tg": "Тоҷикӣ", # Tajik
- "th": "ไทย" , # Thai
- "ti": "ትግርኛ", # Tigrinya
- "tk": "تركمن / Туркмен", # Turkmen
- "tl": "Tagalog" , # Tagalog
- "tn": "Setswana", # Tswana; Setswana
- "to": "faka Tonga", # Tonga (?) # Also ZW ; MW
- "tokipona": "Tokipona",
- "tpi": "Tok Pisin",
- "tr": "Türkçe" , # Turkish
- "ts": "Xitsonga", # Tsonga # ZA SZ XW
- "tt": "Tatarça / Татарча", # Tatar
- "tum": "chiTumbuka",
- "tw": "Twi", # Twi
- "ty": "Reo Mā`ohi", # Tahitian
-
- "udm": "Удмурт кыл",
- "ug": "Oyghurque", # Uighur
- "uk": "Українська" , # Ukrainian
- "ur": "اردو", # Urdu
- "uz": "O‘zbek", # Uzbek
-
- "ve": "Tshivenda", # Venda
- "vec": "Vèneto",
- "vi": "Tiếng Việt" , # Vietnamese
- "vls": "West-Vlams",
- "vo": "Volapük" ,
- "wa": "Walon", # Walloon
- "war": "Winaray",
- "wo": "Wolof", # Wolof
- "w": "吴语",
-
- "xal": "Хальмг",
- "xh": "isiXhosa", # Xhosa
-
- "yi": "ייִדיש", # Yiddish (formerly ji)
- "yo": "Yorùbá", # Yoruba
-
- "za": "Cuengh", # Zhuang
- "zea": "Zeêuws",
- "zh": "中文" , # Chinese
- "zh-classical": "古文 / 文言文",
- "zm-min-nan": "Bân-lâm-gú",
- "zh-yue": "粵語",
- "z": "isiZul" # Zulu
- }
-
def target(self, t):
(qual,sep,tgt) = t.partition(':')
r = None
@@ -361,18 +47,10 @@ class HtmlWikiMarkup (WikiMarkup):
envhdr = [ "ul", "ol", "dl" ]
envel = [ "li", "li", "dd" ]
- def str_nil(self, tok, env):
- return ""
-
- def str_text(self, tok, env):
- return tok[1]
-
def str_link(self, tok, env):
arg = self.fmtok(tok[1], env)
(target, r) = self.target(arg)
text = self.fmtok(tok[2], env)
- if not text and r:
- text = r
return "<a href=\"%s\">%s</a>" % (target,
text if (text and text != '') \
else \
@@ -424,37 +102,6 @@ class HtmlWikiMarkup (WikiMarkup):
s += self.fmtok(t, env)
return s
- def fmtok(self, tok, env):
- if type(tok) != TupleType:
- return ""
- toktype = tok[0]
- if toktype == self.NIL:
- return self.str_nil(tok, env)
- if toktype == self.TEXT:
- return self.str_text(tok, env)
- elif toktype == self.LINK:
- return self.str_link(tok, env)
- elif toktype == self.TMPL:
- return self.str_tmpl(tok, env)
- elif toktype == self.REF:
- return self.str_ref(tok, env)
- elif toktype == self.IT:
- return self.str_it(tok, env)
- elif toktype == self.BOLD:
- return self.str_bold(tok, env)
- elif toktype == self.HDR:
- return self.str_hdr(tok, env)
- elif toktype == self.BAR:
- return self.str_bar(tok, env)
- elif toktype == self.ENV:
- return self.str_env(tok, env)
- elif toktype == self.ITEM:
- return self.str_item(tok, env)
- elif toktype == self.SEQ:
- return self.str_seq(tok, env)
-
- def __str__(self):
- return self.fmtok(self.tree, None)
class HtmlWiktionaryMarkup (HtmlWikiMarkup):
diff --git a/wiki2plain.py b/wiki2plain.py
deleted file mode 100644
index 5080298..0000000
--- a/wiki2plain.py
+++ /dev/null
@@ -1,452 +0,0 @@
-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-# Copyright (C) 2008 Sergey Poznyakoff
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 3, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-
-from wikimarkup import *
-from types import TupleType
-import urllib
-
-class PlainMarkup (WikiMarkup):
- """
- A (general-purpose Wiki->Text translator class.
- """
- lang = 'en'
- html_base = 'http://%(lang)s.wiktionary.org'
- image_base = 'http://nie.wiem.gdzie'
- media_base = 'http://www.mediawiki.org/xml/export-0.3'
-
- def __init__(self, *args, **keywords):
- WikiMarkup.__init__(self, *args, **keywords)
- if 'lang' in keywords:
- self.lang = keywords['lang']
- elif 'html_base' in keywords:
- self.html_base = keywords['html_base']
- elif 'image_base' in keywords:
- self.image_base = keywords['image_base']
- elif 'media_base' in keywords:
- self.media_base = keywords['media_base']
-
- # ISO 639
- langtab = {
- "aa": "Afar", # Afar
- "ab": "Аҧсуа", # Abkhazian
- "ae": None, # Avestan
- "af": "Afrikaans", # Afrikaans
- "ak": "Akana", # Akan # or ak_CI
- "als": "Alemannisch",
- "am": "አማርኛ", # Amharic
- "an": "Aragonés", # Aragonese
- "ang": "Englisc",
- "ar": "العربية" , # Arabic
- "arc": "ܐܪܡܝܐ",
- "as": "অসমীয়া", # Assamese
- "ast": "Asturian",
- "av": "Авар", # Avaric # Spoken mainly in Dagestan
- "ay": "Aymar", # Aymara
- "az": "Azərbaycan" , # Azerbaijani
-
- "ba": "Башҡорт", # Bashkir
- "bar": "Boarisch",
- "bat-smg": "Žemaitėška",
- "bcl": "Bikol",
- "be": "Беларуская", # Byelorussian; Belarusian
- "be-x-old": "Беларуская (тарашкевіца)",
- "bg": "Български", # Bulgarian
- "bh": "भोजपुरी", # Bihari
- "bi": "Bislama", # Bislama
- "bm": "Bamanankan", # Bambara
- "bn": "বাংলা" , # Bengali; Bangla
- "bo": "བོད་སྐད", # Tibetan
- "bpy": "ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী" ,
- "br": "Brezhoneg" , # Breton
- "bs": "Bosanski" , # Bosnian
- "bug": "Basa Ugi",
- "bxr": "Буряад",
-
- "ca": "Català" , # Catalan
- "cbk-zam": "Chavacano de Zamboanga",
- "cdo": "Mìng-dĕ̤ng-ngṳ̄",
- "cho": "Choctaw",
- "ce": "Нохчийн", # Chechen
- "ceb": "Sinugboanong Binisaya" , # Cebuano
- "ch": "Chamor", # Chamorro
- "chr": "ᏣᎳᎩ",
- "chy": "Tsetsêhestâhese",
- "co": "Cors", # Corsican
- "cr": "Nehiyaw", # Cree
- "crh": "Qırımtatarca",
- "cs": "Česky" , # Czech
- "csb": "Kaszëbsczi",
- "c": "Словѣньскъ", # Church Slavic
- "cv": "Чăваш", # Chuvash
- "cy": "Cymraeg" , # Welsh
-
- "da": "Dansk" , # Danish
- "de": "Deutsch" , # German
- "diq": "Zazaki", # Dimli (Southern Zazaki)
- "dsb": "Dolnoserbski",
- "dv": "ދިވެހިބަސް", # Divehi
- "dz": "ཇོང་ཁ", # Dzongkha; Bhutani
-
- "ee": "Eʋegbe", # Ewe
- "el": "Ελληνικά" , # Greek
- "eml": "Emiliàn e rumagnòl",
- "en": "English" , # English
- "eo": "Esperanto" ,
- "es": "Español" , # Spanish
- "et": "Eesti" , # Estonian
- "e": "Euskara" , # Basque
- "ext": "Estremeñ",
-
- "fa": "فارسی" , # Persian
- "ff": "Fulfulde", # Fulah # Also NG, MR, and many others
- "fi": "Suomi" , # Finnish
- "fiu-vro": "Võro",
- "fj": "Na Vosa Vakaviti", # Fijian; Fiji
- "fo": "Føroyskt" , # Faroese
- "fr": "Français" , # French
- "frp": "Arpitan",
- "fur": "Furlan",
- "fy": "Frysk", # Frisian
-
- "ga": "Gaeilge", # Irish
- "gan": "贛語 (Gànyŭ)",
- "gd": "Gàidhlig", # Scots; Gaelic
- "gl": "Gallego" , # Gallegan; Galician
- "glk": "گیلکی",
- "got": "𐌲Œ„𐌹𐌺 ",
- "gn": "Avañe'ẽ", # Guarani
- "g": "ગુજરાતી", # Gujarati
- "gv": "Gaelg", # Manx
-
- "ha": "هَوُسَ", # Hausa
- "hak": "Hak-kâ-fa / 客家話",
- "haw": "Hawai`i",
- "he": "עברית" , # Hebrew (formerly iw)
- "hi": "हिन्दी" , # Hindi
- "hif": "Fiji Hindi",
- "ho": "Hiri Mot", # Hiri Motu
- "hr": "Hrvatski" , # Croatian
- "hsb": "Hornjoserbsce",
- "ht": "Krèyol ayisyen" , # Haitian; Haitian Creole
- "hu": "Magyar" , # Hungarian
- "hy": "Հայերեն", # Armenian
- "hz": "Otsiherero", # Herero
-
- "ia": "Interlingua",
- "ie": "Interlingue",
- "id": "Bahasa Indonesia", # Indonesian (formerly in)
- "ig": "Igbo", # Igbo
- "ii": "ꆇꉙ ", # Sichuan Yi
- "ik": "Iñupiak", # Inupiak
- "ilo": "Ilokano",
- "io": "Ido" ,
- "is": "Íslenska" , # Icelandic
- "it": "Italiano" , # Italian
- "i": "ᐃᓄᒃᑎᑐᑦ", # Inuktitut
-
- "ja": "日本語", # Japanese
- "jbo": "Lojban",
- "jv": "Basa Jawa", # Javanese
-
- "ka": "ქართული" , # Georgian
- "kaa": "Qaraqalpaqsha",
- "kab": "Taqbaylit",
- "kg": "KiKongo", # Kongo # also CD and AO
- "ki": "Gĩkũyũ", # Kikuyu
- "kj": "Kuanyama", # Kuanyama
- "kk": "Қазақша", # Kazakh
- "kl": "Kalaallisut", # Kalaallisut; Greenlandic
- "km": "ភាសាខ្មែរ", # Khmer; Cambodian
- "kn": "ಕನ್ನಡ", # Kannada
- "ko": "한국어" , # Korean
- "kr": "Kanuri", # Kanuri
- "ks": "कश्मीरी / كشميري", # Kashmiri
- "ksh": "Ripoarisch",
- "ku": "Kurdî / كوردی", # Kurdish
- "kv": "Коми", # Komi
- "kw": "Kernewek/Karnuack", # Cornish
- "ky": "Кыргызча", # Kirghiz
-
- "la": "Latina" , # Latin
- "lad": "Dzhudezmo",
- "lb": "Lëtzebuergesch" , # Letzeburgesch
- "lbe": "Лакку",
- "lg": "Luganda", # Ganda
- "li": "Limburgs", # Limburgish; Limburger; Limburgan
- "lij": "Lígur",
- "ln": "Lingala", # Lingala
- "lmo": "Lumbaart",
- "lo": "ລາວ", # Lao; Laotian
- "lt": "Lietuvių" , # Lithuanian
- "l": None, # Luba-Katanga
- "lv": "Latvieš" , # Latvian; Lettish
-
- "map-bms": "Basa Banyumasan",
- "mdf": "Мокшень (Mokshanj Kälj)",
- "mg": "Malagasy", # Malagasy
- "mh": "Ebon", # Marshall
- "mi": "Māori", # Maori
- "mk": "Македонски" , # Macedonian
- "ml": None, # Malayalam
- "mn": "Монгол", # Mongolian
- "mo": "Молдовеняскэ", # Moldavian
- "mr": "मराठी" , # Marathi
- "ms": "Bahasa Melay" , # Malay
- "mt": "Malti", # Maltese
- "mus": "Muskogee",
- "my": "မ္ရန္‌မာစာ", # Burmese
- "myv": "Эрзянь (Erzjanj Kelj)",
- "mzn": "مَزِروني",
-
- "na": "dorerin Naoero", # Nauru
- "nah": "Nāhuatl",
- "nap": "Nnapulitano",
- "nb": "Norsk (Bokmål)", # Norwegian Bokm@aa{}l
- "nd": None,# Ndebele, North
- "nds": "Plattdüütsch",
- "nds-nl": "Nedersaksisch",
- "ne": "नेपाली", # Nepali
- "new": "नेपाल भाषा" , # Nepal Bhasa
- "ng": "Oshiwambo", # Ndonga
- "nl": "Nederlands" , # Dutch
- "nn": "Nynorsk", # Norwegian Nynorsk
- "no": "Norsk (Bokmål)" , # Norwegian
- "nov": "Novial",
- "nr": None, # Ndebele, South
- "nrm": "Nouormand/Normaund",
- "nv": "Diné bizaad", # Navajo
- "ny": "Chi-Chewa", # Chichewa; Nyanja
-
- "oc": "Occitan", # Occitan; Proven@,{c}al
- "oj": None, # Ojibwa
- "om": "Oromoo", # (Afan) Oromo
- "or": "ଓଡ଼ିଆ", # Oriya
- "os": "Иронау", # Ossetian; Ossetic
-
- "pa": "ਪੰਜਾਬੀ" , # Panjabi; Punjabi
- "pag": "Pangasinan",
- "pam": "Kapampangan",
- "pap": "Papiament",
- "pdc": "Deitsch",
- "pi": "पाऴि", # Pali
- "pih": "Norfuk",
- "pl": "Polski" , # Polish
- "pms": "Piemontèis" ,
- "ps": "پښتو", # Pashto, Pushto
- "pt": "Português" , # Portuguese
-
- "q": "Runa Simi" , # Quechua
-
- "rm": "Rumantsch", # Rhaeto-Romance
- "rmy": "romani - रोमानी",
- "rn": "Kirundi", # Rundi; Kirundi
- "ro": "Română" , # Romanian
- "roa-rup": "Armãneashce",
- "roa-tara": "Tarandíne",
- "ru": "Русский" , # Russian
- "rw": "Ikinyarwanda", # Kinyarwanda
-
- "sa": "संस्कृतम्", # Sanskrit
- "sah": "Саха тыла (Saxa Tyla)",
- "sc": "Sard", # Sardinian
- "scn": "Sicilian",
- "sco": "Scots",
- "sd": "سنڌي، سندھی ، सिन्ध", # Sindhi
- "se": "Sámegiella", # Northern Sami
- "sg": "Sängö", # Sango; Sangro
- "sh": "Srpskohrvatski / Српскохрватски" ,
- "si": "සිංහල",
- "simple": "Simple English" ,
- "sk": "Slovenčina" , # Slovak
- "sl": "Slovenščina" , # Slovenian
- "sm": "Gagana Samoa", # Samoan
- "sn": "chiShona", # Shona
- "so": "Soomaaliga", # Somali
- "sr": "Српски / Srpski" , # Serbian
- "srn": "Sranantongo",
- "ss": "SiSwati", # Swati; Siswati
- "st": "Sesotho", # Sesotho; Sotho, Southern
- "stk": "Seeltersk",
- "s": "Basa Sunda", # Sundanese
- "sq": "Shqip" , # Albanian
- "szl": "Ślůnski",
- "sv": "Svenska" , # Swedish
- "sw": "Kiswahili", # Swahili # Also KE
-
- "ta": "தமிழ்" , # Tamil
- "te": "తెలుగు" , # Telugu
- "tet": "Tetun",
- "tg": "Тоҷикӣ", # Tajik
- "th": "ไทย" , # Thai
- "ti": "ትግርኛ", # Tigrinya
- "tk": "تركمن / Туркмен", # Turkmen
- "tl": "Tagalog" , # Tagalog
- "tn": "Setswana", # Tswana; Setswana
- "to": "faka Tonga", # Tonga (?) # Also ZW ; MW
- "tokipona": "Tokipona",
- "tpi": "Tok Pisin",
- "tr": "Türkçe" , # Turkish
- "ts": "Xitsonga", # Tsonga # ZA SZ XW
- "tt": "Tatarça / Татарча", # Tatar
- "tum": "chiTumbuka",
- "tw": "Twi", # Twi
- "ty": "Reo Mā`ohi", # Tahitian
-
- "udm": "Удмурт кыл",
- "ug": "Oyghurque", # Uighur
- "uk": "Українська" , # Ukrainian
- "ur": "اردو", # Urdu
- "uz": "O‘zbek", # Uzbek
-
- "ve": "Tshivenda", # Venda
- "vec": "Vèneto",
- "vi": "Tiếng Việt" , # Vietnamese
- "vls": "West-Vlams",
- "vo": "Volapük" ,
-
- "wa": "Walon", # Walloon
- "war": "Winaray",
- "wo": "Wolof", # Wolof
- "w": "吴语",
-
- "xal": "Хальмг",
- "xh": "isiXhosa", # Xhosa
-
- "yi": "ייִדיש", # Yiddish (formerly ji)
- "yo": "Yorùbá", # Yoruba
-
- "za": "Cuengh", # Zhuang
- "zea": "Zeêuws",
- "zh": "中文" , # Chinese
- "zh-classical": "古文 / 文言文",
- "zm-min-nan": "Bân-lâm-gú",
- "zh-yue": "粵語",
- "z": "isiZul" # Zulu
- }
-
- def target(self, t):
- (qual,sep,tgt) = t.partition(':')
- r = None
- if tgt != '':
- if qual == "Image":
- t = self.image_base + '/' + urllib.quote(tgt)
- elif qual == "Media":
- t = self.media_base + '/' + tgt
- elif qual in self.langtab:
- t = self.html_base % { 'lang' : qual } + '/' + urllib.quote(tgt)
- r = self.langtab[qual]
- else:
- t = self.html_base % { 'lang' : self.lang } + '/' + urllib.quote(t)
- else:
- t = self.html_base % { 'lang' : self.lang } + '/' + urllib.quote(t)
- return t, r
-
- envhdr = [ "ul", "ol", "dl" ]
- envel = [ "li", "li", "dd" ]
-
- def str_nil(self, tok, env):
- return ""
-
- def str_text(self, tok, env):
- return tok[1]
-
- def str_link(self, tok, env):
- arg = self.fmtok(tok[1], env)
- (target, r) = self.target(arg)
- text = self.fmtok(tok[2], env)
- if not text and r:
- text = r
- return "%s" % (text if (text and text != '') \
- else \
- r if r else arg)
-
- def str_tmpl(self, tok, env):
- arg = self.fmtok(tok[1], env)
- (target, r) = self.target(arg)
- text = self.fmtok(tok[2], env)
- return "%s" % (text if (text and text != '') \
- else arg)
-
- def str_ref(self, tok, env):
- target = self.fmtok(tok[1], env)
- text = self.fmtok(tok[2], env)
- return "%s" % (text if (text and text != '') \
- else target)
-
- def str_it(self, tok, env):
- return "<i>" + self.fmtok(tok[1], env) + "</i>"
-
- def str_bold(self, tok, env):
- return "<b>" + self.fmtok(tok[1], env) + "</b>"
-
- def str_hdr(self, tok, env):
- level = tok[1]
- if level > 4:
- level = 4
- return "<h%s>%s</h%s>" % (level, self.fmtok(tok[2], env), level)
-
- def str_bar(self, tok, env):
- return "-----------------"
-
- def str_env(self, tok, env):
- t = tok[1]
- return "<" + self.envhdr[t] + ">" + \
- self.fmtok(tok[3], tok) + \
- "</" + self.envhdr[t] + ">"
-
- def str_item(self, tok, env):
- return "<%s>%s</%s>" % (self.envel[env[1]],
- self.fmtok(tok[1], env),
- self.envel[env[1]])
-
- def str_seq(self, tok, env):
- s = ""
- for t in tok[1:]:
- s += self.fmtok(t, env)
- return s
-
- def fmtok(self, tok, env):
- if type(tok) != TupleType:
- return ""
- toktype = tok[0]
- if toktype == self.NIL:
- return self.str_nil(tok, env)
- if toktype == self.TEXT:
- return self.str_text(tok, env)
- elif toktype == self.LINK:
- return self.str_link(tok, env)
- elif toktype == self.TMPL:
- return self.str_tmpl(tok, env)
- elif toktype == self.REF:
- return self.str_ref(tok, env)
- elif toktype == self.IT:
- return self.str_it(tok, env)
- elif toktype == self.BOLD:
- return self.str_bold(tok, env)
- elif toktype == self.HDR:
- return self.str_hdr(tok, env)
- elif toktype == self.BAR:
- return self.str_bar(tok, env)
- elif toktype == self.ENV:
- return self.str_env(tok, env)
- elif toktype == self.ITEM:
- return self.str_item(tok, env)
- elif toktype == self.SEQ:
- return self.str_seq(tok, env)
-
- def __str__(self):
- return self.fmtok(self.tree, None)
diff --git a/wiki2text.py b/wiki2text.py
new file mode 100644
index 0000000..e943f32
--- /dev/null
+++ b/wiki2text.py
@@ -0,0 +1,163 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2008 Sergey Poznyakoff
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from wikimarkup import *
+from types import TupleType
+import urllib
+
+class TextWikiMarkup (WikiMarkup):
+ """
+ A (general-purpose Wiki->Text translator class.
+ """
+
+ width = 80
+ num = 0
+ references = False
+
+ def __init__(self, *args, **keywords):
+ WikiMarkup.__init__(self, *args, **keywords)
+ if 'width' in keywords:
+ self.width = keywords['width']
+ elif 'refs' in keywords:
+ self.references = keywords['refs']
+
+ def target(self, t):
+ (qual,sep,tgt) = t.partition(':')
+ r = None
+ if tgt != '':
+ if qual == "Image":
+ t = self.image_base + '/' + urllib.quote(tgt)
+ elif qual == "Media":
+ t = self.media_base + '/' + tgt
+ elif qual in self.langtab:
+ t = self.html_base % { 'lang' : qual } + '/' + urllib.quote(tgt)
+ r = self.langtab[qual]
+ else:
+ t = self.html_base % { 'lang' : self.lang } + '/' + urllib.quote(t)
+ else:
+ t = self.html_base % { 'lang' : self.lang } + '/' + urllib.quote(t)
+ return t, r
+
+ def xref(self, text, target):
+ if text:
+ return "%s (see %s) " % (text, target)
+ else:
+ return "see " + target
+
+ def str_link(self, tok, env):
+ arg = self.fmtok(tok[1], env)
+ text = self.fmtok(tok[2], env)
+ if self.references:
+ (target, r) = self.target(arg)
+ return self.xref(text if text else r, target)
+ else:
+ (qual,sep,tgt) = arg.partition(':')
+ if sep != '':
+ return ""
+ elif text:
+ return text
+ return arg
+
+ def str_tmpl(self, tok, env):
+ arg = self.fmtok(tok[1], env)
+ (target, r) = self.target(arg)
+ text = self.fmtok(tok[2], env)
+ if not text and r:
+ text = r
+ if self.references:
+ return self.xref(text, target)
+ return text
+
+ def str_ref(self, tok, env):
+ return self.xref(self.fmtok(tok[2], env), self.fmtok(tok[1], env))
+
+ def str_it(self, tok, env):
+ return "_" + self.fmtok(tok[1], env) + "_"
+
+ def str_bold(self, tok, env):
+ return self.fmtok(tok[1], env).upper()
+
+ def str_hdr(self, tok, env):
+ level = tok[1]
+ return "\n\n" + ("*" * level) + " " + self.fmtok(tok[2], env) + "\n\n"
+
+ def str_bar(self, tok, env):
+ w = self.width
+ if w < 5:
+ w = 5
+ return "\n" + ("-" * (w - 5)).center(w - 1) + "\n"
+
+ def str_env(self, tok, env):
+ self.num = 1
+ return "\n" + self.fmtok(tok[3], tok)
+
+ def indent (self, lev, text):
+ w = self.width
+ self.width = w - lev
+ if text.find('\n') == -1:
+ s = (" " * lev) + text
+ else:
+ s = ""
+ for elt in text.split('\n'):
+ s += (" " * lev) + elt
+ if elt == '':
+ s += "\n"
+
+ self.width = w
+ return s
+
+ def str_item(self, tok, env):
+ t = env[1]
+ lev = env[2]
+ if lev > self.width - 4:
+ lev = 1
+ if t == self.INDENT:
+ return self.indent(lev, self.fmtok(tok[1], env))
+ elif t == self.ENVNUM:
+ n = self.num
+ self.num += 1
+ return "" + self.indent(lev,
+ "%d. %s" % (n, self.fmtok(tok[1], env)))
+ elif t == self.ENVUNNUM:
+ return "" + self.indent(lev,
+ "- " + self.fmtok(tok[1], env))
+
+ def __str__(self):
+ return self.fmtok(self.tree, None)
+
+class TextWiktionaryMarkup (TextWikiMarkup):
+ """
+ See documentation for HtmlWiktionaryMarkup
+ """
+
+ seq_pos = 0
+
+ def str_seq(self, tok, env):
+ s = ""
+ self.seq_pos=0
+ for t in tok[1:]:
+ s += self.fmtok(t, env)
+ self.seq_pos += 1
+ return s
+
+ def str_tmpl(self, tok, env):
+ arg = self.fmtok(tok[1], env)
+ if self.seq_pos > 0:
+ return arg
+ else:
+ return "\n" + arg + ":\n"
+
diff --git a/wikicvt.py b/wikicvt.py
index 7d22c2e..5b8e5a0 100644
--- a/wikicvt.py
+++ b/wikicvt.py
@@ -18,16 +18,19 @@
import sys
import getopt
from wiki2html import *
+from wiki2text import *
def usage(code=0):
- print "usage: " + sys.argv[0] + "[-hv] [--help] [--verbose] file\n"
+ print "usage: " + sys.argv[0] + "[-hvt] [-l lang] [--lang=lang] [--text] [--help] [--verbose] file\n"
sys.exit(code)
def main():
verbose_flag = 0
+ html = 1
+ lang = "pl"
try:
- opts, args = getopt.getopt(sys.argv[1:], "hv",
- ["help", "verbose" ])
+ opts, args = getopt.getopt(sys.argv[1:], "hl:tv",
+ ["help", "lang", "text", "verbose" ])
except getopt.GetoptError: