#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (C) 2008 Sergey Poznyakoff
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import sys
import re
from types import *

__all__ = [ "BaseWikiMarkup", "WikiMarkup" ]

# Start-of-line block markup: header ("=..."), horizontal bar ("----")
# or a run of list/indent markers ("*", "#", ":").
eltbeg = re.compile("=+|(^----$)|^[\\*#:]+")
# Inline markup openers: "[[", "{{", "[", "'''" and "''".
eltre = re.compile("(\\[\\[)|(\\{\\{)|\\[|(\\'\\'\\'?)")
# For each paired opener: the next "|" separator or the closing pair.
delims = { "[[" : re.compile("\\||(\\]\\])"),
           "{{" : re.compile("\\||(\\}\\})") }
# Closing pair for each paired opener.
term = { "[[" : "]]" , "{{" : "}}" }
# For each paired opener: either the opener or the closer (used to
# balance nested constructs).
ends = { "[[" : re.compile("(\\[\\[)|(\\]\\])"),
         "{{" : re.compile("(\\{\\{)|(\\}\\})") }
# End of italics / bold: two (three) quotes not followed by another quote.
itend = re.compile("\\'\\'($|[^\\'])")
boend = re.compile("\\'\\'\\'($|[^\\'])")


class BaseWikiMarkup:
    """
    A base class for handling Wiki markups.

    It handles:
    1. paragraphs;
    2. basic block markup (headers, numbered and unnumbered lists,
       indentations);
    3. basic inline markup (bold, italic);
    4. basic reference markup (links, templates, external links).

    It does NOT handle:
    1. pseudo-html markup (angle-bracket tags and similar);
    2. leading spaces meaning ``preserve formatting'';
    3. tables and math.

    The above rests for FIXME.

    This class relies on its derived classes for providing input.  They
    must overload method `input', which must return one physical line of
    input for each call.

    Variables:

    1. tree
       The parse tree.  Valid after parse() finishes (see below).

    Methods:

    1. parse()
       Parse the input and build parse tree
    2. input()
       Virtual function.  Return next line of input or None on EOF.
    3. output()
       Print the tree in internal representation.
    """

    ## Token classes
    # NIL: nothing
    NIL = 0
    # TEXT: text
    TEXT = 1
    # LINK: target, text
    LINK = 2
    # Template: target, text
    TMPL = 3
    # External ref: target, text
    REF = 4
    # Italics: text
    IT = 5
    # Bold: text
    BOLD = 6
    # Header: level, text
    HDR = 7
    # Horizontal bar:
    BAR = 8
    # Environment: type, level
    ENV = 9
    # Item: text
    ITEM = 10
    # Sequence: seq
    SEQ = 11
    # Paragraph
    PARA = 12

    # Environment types:
    # Unnumbered list
    ENVUNNUM = 0
    # Numbered list
    ENVNUM = 1
    # Indent
    INDENT = 2

    # Marker characters, indexed by environment type.
    envtypes = [ "*", "#", ":" ]

    tree = None

    def itend(self, line, pos):
        """Return the index in LINE, at or after POS, of the two quotes
        that close an italic span, or -1 if there are none.

        A candidate that is immediately preceded by another quote is the
        tail of a longer quote run and is skipped.
        """
        # Inside a method the bare name `itend' resolves to the
        # module-level regex, not to this method.
        while True:
            found = itend.search(line, pos)
            if not found:
                return -1
            at = found.start(0)
            if at == pos or line[at - 1] != "'":
                return at
            pos = at + 1

    def linkend(self, paren, line, pos):
        """Locate the closer balancing opener PAREN ("[[" or "{{").

        Scanning starts at POS with one opener already counted; nested
        openers are balanced against closers.  Return (start, end) of
        the matching closer, or (len(LINE), len(LINE)) if it is missing.
        """
        matcher = ends[paren]
        depth = 1
        while depth > 0:
            m = matcher.search(line, pos)
            if not m:
                return len(line), len(line)
            pos = m.end(0)
            if m.group(0) == paren:
                depth += 1
            else:
                depth -= 1
        return m.start(0), m.end(0)

    # One-line lookahead buffer; see putback().
    la = None

    def putback(self, line):
        """Store LINE so that the next read in nextkn() returns it."""
        self.la = line

    def nextkn(self, curlev=0, type = -1):
        """Generate block-level tokens from the input lines.

        CURLEV is the nesting level of the environment being read (0 at
        top level) and TYPE its index in `envtypes' (-1 at top level).
        Yielded values are token tuples (PARA, ENV, ITEM, HDR, BAR) or,
        for a plain text line, the getkn() generator for that line.
        The generator ends on end of input (a false value from input())
        or, inside an environment, on the first line that does not
        belong to it; that line is put back for the caller.
        """
        while True:
            if self.la:
                line = self.la
                self.putback(None)
            else:
                try:
                    line = self.input()
                except StopIteration:
                    line = u''

            if not line:
                self.putback(line)
                break

            if line == '\n':
                yield (self.PARA,)
                continue

            m = eltbeg.match(line)
            if m and m.group(0)[0] in self.envtypes:
                mark = m.group(0)
                btype = self.envtypes.index(mark[0])
                lev = len(mark)
                if btype != type:
                    # A different kind of environment starts here.
                    self.putback(line)
                    yield (self.ENV, btype, 1, self.nextkn(1, btype))
                elif lev == curlev:
                    yield (self.ITEM,
                           (self.SEQ, self.getkn(line[m.end(0):])))
                elif lev > curlev:
                    # Deeper nesting: re-read the line one level down.
                    self.putback(line)
                    yield (self.ENV, btype, curlev + 1,
                           (self.SEQ, self.nextkn(curlev + 1, btype)))
                else:
                    # Shallower nesting: the line belongs to a parent.
                    self.putback(line)
                    break
                continue

            if curlev > 0:
                # Any non-list line terminates the current environment.
                self.putback(line)
                break

            if m:
                mark = m.group(0)
                # Work on the line without its newline so that a final
                # line lacking one is sliced correctly too.
                body = line.rstrip('\n')
                if mark[0:2] == "==" and body.endswith(mark):
                    yield (self.HDR, len(mark) - 1,
                           self.getkn(body[m.end(0):-len(mark)]))
                    continue
                if mark == "----":
                    yield (self.BAR,)
                    continue
                # A malformed header ("=foo", "==foo") is ordinary text;
                # fall through instead of silently dropping the line.

            yield (self.getkn(line))

    def getkn(self, line):
        """Generate inline tokens (TEXT, LINK, TMPL, REF, BOLD, IT)
        for LINE.

        Text between markup is yielded as TEXT tokens (possibly empty).
        Unterminated constructs extend to the end of LINE.
        """
        pos = 0
        end = len(line)
        while pos < end:
            m = eltre.search(line, pos)
            if not m:
                yield (self.TEXT, line[pos:])
                break

            yield (self.TEXT, line[pos:m.start(0)])
            opener = m.group(0)
            pos = m.end(0)

            if opener == "[[" or opener == "{{":
                d = delims[opener].search(line, pos)
                if d is None:
                    # Neither separator nor closer: everything up to the
                    # end of the line is the target.
                    target = (self.TEXT, line[pos:])
                    text = (self.NIL,)
                    pos = end
                elif d.group(0) == "|":
                    target = (self.TEXT, line[pos:d.start(0)])
                    (start, pos) = self.linkend(opener, line, m.end(0))
                    text = (self.SEQ, self.getkn(line[d.end(0):start]))
                else:
                    target = (self.TEXT, line[pos:d.start(0)])
                    text = (self.NIL,)
                    pos = d.end(0)
                if opener == "[[":
                    yield (self.LINK, target, text)
                else:
                    yield (self.TMPL, target, text)
            elif opener == "[":
                i = line.find("]", m.end(0))
                if i == -1:
                    i = end
                (target, sep, text) = line[m.end(0):i].partition(' ')
                yield (self.REF, (self.TEXT, target),
                       (self.SEQ, self.getkn(text)))
                # Do not step past the end when "]" is missing.
                pos = min(i + 1, end)
            elif opener == "'''":
                e = boend.search(line, m.end(0))
                if e:
                    i = e.start(0)
                    pos = i + 3
                else:
                    i = pos = end
                yield (self.BOLD,
                       (self.SEQ, self.getkn(line[m.end(0):i])))
            elif opener == "''":
                i = self.itend(line, m.end(0))
                if i == -1:
                    i = pos = end
                else:
                    pos = i + 2
                yield (self.IT,
                       (self.SEQ, self.getkn(line[m.end(0):i])))

    def input(self):
        """Return the next line of input; a false value means EOF.

        The base implementation has no input and always returns None.
        """
        return None

    def _expand_all(self, toks):
        """Expand each token of TOKS in order, dropping empty results."""
        expanded = []
        for t in toks:
            x = self.expandtok(t)
            if x:
                expanded.append(x)
        return expanded

    def _collapse(self, parts):
        """Return None for no parts, the part itself for a single one,
        and a SEQ tuple otherwise."""
        if not parts:
            return None
        if len(parts) == 1:
            return parts[0]
        return (self.SEQ,) + tuple(parts)

    def expandtok(self, tok):
        """Recursively turn TOK into a tree of plain tuples.

        Generators are consumed, NIL tokens and empty TEXT tokens become
        None, and sequences with fewer than two surviving members are
        collapsed.
        """
        if isinstance(tok, GeneratorType):
            return self._collapse(self._expand_all(tok))

        toktype = tok[0]
        if toktype == self.NIL:
            return None
        if toktype == self.TEXT:
            if tok[1] != '':
                return tok
            return None
        if toktype in (self.LINK, self.TMPL, self.REF):
            return toktype, self.expandtok(tok[1]), self.expandtok(tok[2])
        if toktype in (self.IT, self.BOLD, self.ITEM):
            return toktype, self.expandtok(tok[1])
        if toktype == self.HDR:
            return toktype, tok[1], self.expandtok(tok[2])
        if toktype == self.ENV:
            return toktype, tok[1], tok[2], self.expandtok(tok[3])
        if toktype == self.SEQ:
            if len(tok) == 1:
                return None
            if len(tok) == 2:
                return self.expandtok(tok[1])
            return self._collapse(self._expand_all(tok[1:]))
        return tok

    def parse(self):
        """Read all input and store the resulting tree in self.tree.

        The tree is a SEQ tuple with one member per top-level token; a
        member is None when its token expands to nothing.
        """
        tree = [self.SEQ]
        for tok in self.nextkn():
            tree.append(self.expandtok(tok))
        self.tree = tuple(tree)

    def _escape(self, text):
        """Return TEXT with special characters backslash-escaped."""
        try:
            return text.encode('string_escape')
        except (LookupError, UnicodeError):
            # The string_escape codec is not available on Python 3.
            return text.encode('unicode_escape').decode('ascii')

    def prtok(self, tok, indent):
        """Write TOK and its subtree to standard output, one token per
        line, preceded by INDENT + 1 spaces."""
        out = sys.stdout.write
        pad = " " * indent
        if not tok:
            out(pad + " None\n")
            return

        toktype = tok[0]
        if toktype == self.SEQ:
            for t in tok[1:]:
                self.prtok(t, indent)
            return

        out(pad + " ")
        if toktype == self.NIL:
            out("NIL\n")
        elif toktype == self.TEXT:
            out("TEXT \"%s\"\n" % (self._escape(tok[1])))
        elif toktype == self.LINK:
            out("LINK \n")
            self.prtok(tok[1], indent+1) # target
            self.prtok(tok[2], indent+1) # text
        elif toktype == self.TMPL:
            out("TMPL\n")
            self.prtok(tok[1], indent+1) # target
            self.prtok(tok[2], indent+1) # text
        elif toktype == self.REF:
            out("REF\n")
            self.prtok(tok[1], indent+1) # target
            self.prtok(tok[2], indent+1) # text
        elif toktype == self.IT:
            out("IT\n")
            self.prtok(tok[1], indent+1)
        elif toktype == self.BOLD:
            out("BOLD\n")
            self.prtok(tok[1], indent+1)
        elif toktype == self.HDR:
            out("HDR %s\n" % (tok[1],))
            self.prtok(tok[2], indent+1)
        elif toktype == self.BAR:
            out("BAR\n")
        elif toktype == self.ENV:
            out("ENV  %s %s\n" % (self.envtypes[tok[1]], tok[2]))
            self.prtok(tok[3], indent+1)
        elif toktype == self.ITEM:
            out("ITEM\n")
            self.prtok(tok[1], indent+1)
        elif toktype == self.PARA:
            out("PARA\n")

    def output(self):
        """Print the parse tree in internal representation."""
        self.prtok(self.tree, 0)


class WikiMarkup (BaseWikiMarkup):
    """
    A derived class, that supplies a basic input method.

    Three types of inputs are available:

    1. filename=NAME
       The file NAME is opened and used for input.
    2. file=FILE
       The already opened file FILE is used for input.
    3. text=STRING
       Input is taken from STRING, line by line.

    Usage:

       obj = WikiMarkup(arg=val)
       obj.parse
       ... Do whatever you need with obj.tree ...
    """

    file = None
    text = None
    lang = 'en'
    html_base = 'http://%(lang)s.wiktionary.org/wiki/'
    image_base = 'http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf'
    media_base = 'http://www.mediawiki.org/xml/export-0.3'

    def __init__(self, *args, **keywords):
        """Configure the input source and the base URLs from keywords.

        Recognized keywords: file, filename, text, lang, html_base,
        image_base and media_base.  Others are ignored.
        """
        for kw in keywords:
            value = keywords[kw]
            if kw == 'file':
                self.file = value
            elif kw == 'filename':
                self.file = open(value)
            elif kw == 'text':
                # Keep the newline on every line, as file input does:
                # the parser treats an empty string as end of input and
                # a lone "\n" as a paragraph break.
                pieces = value.split("\n")
                self.text = [piece + "\n" for piece in pieces[:-1]]
                if pieces[-1]:
                    self.text.append(pieces[-1])
            elif kw == 'lang':
                self.lang = value
            elif kw == 'html_base':
                self.html_base = value
            elif kw == 'image_base':
                self.image_base = value
            elif kw == 'media_base':
                self.media_base = value

    def __del__(self):
        if self.file:
            self.file.close()

    def input(self):
        """Return the next input line from the file or the text list,
        or None when no (more) input is available."""
        if self.file:
            return self.file.readline()
        elif self.text:
            return self.text.pop(0)
        else:
            return None

    # ISO 639
    langtab = {
        "aa": "Afar",                  # Afar
        "ab": "Аҧсуа",                 # Abkhazian
        "ae": None,                    # Avestan
        "af": "Afrikaans",             # Afrikaans
        "ak": "Akana",                 # Akan
        "als": "Alemannisch",
        "am": "አማርኛ",                  # Amharic
        "an": "Aragonés",              # Aragonese
        "ang": "Englisc",
        "ar": "العربية" ,              # Arabic
        "arc": "ܐܪܡܝܐ",
        "as": "অসমীয়া",                 # Assamese
        "ast": "Asturian",
        "av": "Авар",                  # Avaric
        "ay": "Aymar",                 # Aymara
        "az": "Azərbaycan" ,           # Azerbaijani
        "ba": "Башҡорт",               # Bashkir
        "bar": "Boarisch",
        "bat-smg": "Žemaitėška",
        "bcl": "Bikol",
        "be": "Беларуская",            # Byelorussian; Belarusian
        "be-x-old": "Беларуская (тарашкевіца)",
        "bg": "Български",             # Bulgarian
        "bh": "भोजपुरी",                 # Bihari
        "bi": "Bislama",               # Bislama
        "bm": "Bamanankan",            # Bambara
        "bn": "বাংলা" ,                 # Bengali; Bangla
        "bo": "བོད་སྐད",                  # Tibetan
        "bpy": "ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী" ,
        "br": "Brezhoneg" ,            # Breton
        "bs": "Bosanski" ,             # Bosnian
        "bug": "Basa Ugi",
        "bxr": "Буряад",
        "ca": "Català" ,               # Catalan
        "cbk-zam": "Chavacano de Zamboanga",
        "cdo": "Mìng-dĕ̤ng-ngṳ̄",
        "cho": "Choctaw",
        "ce": "Нохчийн",               # Chechen
        "ceb": "Sinugboanong Binisaya" , # Cebuano
        "ch": "Chamor",                # Chamorro
        "chr": "ᏣᎳᎩ",
        "chy": "Tsetsêhestâhese",
        "co": "Cors",                  # Corsican
        "cr": "Nehiyaw",               # Cree
        "crh": "Qırımtatarca",
        "cs": "Česky" ,                # Czech
        "csb": "Kaszëbsczi",
        "cu": "Словѣньскъ",            # Church Slavic
        "cv": "Чăваш",                 # Chuvash
        "cy": "Cymraeg" ,              # Welsh
        "da": "Dansk" ,                # Danish
        "de": "Deutsch" ,              # German
        "diq": "Zazaki",               # Dimli (Southern Zazaki)
        "dsb": "Dolnoserbski",
        "dv": "ދިވެހިބަސް",                # Divehi
        "dz": "ཇོང་ཁ",                   # Dzongkha; Bhutani
        "ee": "Eʋegbe",                # Ewe
        "el": "Ελληνικά" ,             # Greek
        "eml": "Emiliàn e rumagnòl",
        "en": "English" ,              # English
        "eo": "Esperanto" ,
        "es": "Español" ,              # Spanish
        "et": "Eesti" ,                # Estonian
        "eu": "Euskara" ,              # Basque
        "ext": "Estremeñ",
        "fa": "فارسی" ,                # Persian
        "ff": "Fulfulde",              # Fulah
        "fi": "Suomi" ,                # Finnish
        "fiu-vro": "Võro",
        "fj": "Na Vosa Vakaviti",      # Fijian; Fiji
        "fo": "Føroyskt" ,             # Faroese
        "fr": "Français" ,             # French
        "frp": "Arpitan",
        "fur": "Furlan",
        "fy": "Frysk",                 # Frisian
        "ga": "Gaeilge",               # Irish
        "gan": "贛語 (Gànyŭ)",
        "gd": "Gàidhlig",              # Scots; Gaelic
        "gl": "Gallego" ,              # Gallegan; Galician
        "glk": "گیلکی",
        "got": "𐌲Œ„𐌹𐌺 ",
        "gn": "Avañe'ẽ",               # Guarani
        "gu": "ગુજરાતી",                # Gujarati
        "gv": "Gaelg",                 # Manx
        "ha": "هَوُسَ",                  # Hausa
        "hak": "Hak-kâ-fa / 客家話",
        "haw": "Hawai`i",
        "he": "עברית" ,                # Hebrew (formerly iw)
        "hi": "हिन्दी" ,                  # Hindi
        "hif": "Fiji Hindi",
        "ho": "Hiri Mot",              # Hiri Motu
        "hr": "Hrvatski" ,             # Croatian
        "hsb": "Hornjoserbsce",
        "ht": "Krèyol ayisyen" ,       # Haitian; Haitian Creole
        "hu": "Magyar" ,               # Hungarian
        "hy": "Հայերեն",               # Armenian
        "hz": "Otsiherero",            # Herero
        "ia": "Interlingua",
        "ie": "Interlingue",
        "id": "Bahasa Indonesia",      # Indonesian (formerly in)
        "ig": "Igbo",                  # Igbo
        "ii": "ꆇꉙ ",                   # Sichuan Yi
        "ik": "Iñupiak",               # Inupiak
        "ilo": "Ilokano",
        "io": "Ido" ,
        "is": "Íslenska" ,             # Icelandic
        "it": "Italiano" ,             # Italian
        "iu": "ᐃᓄᒃᑎᑐᑦ",                # Inuktitut
        "ja": "日本語",                 # Japanese
        "jbo": "Lojban",
        "jv": "Basa Jawa",             # Javanese
        "ka": "ქართული" ,              # Georgian
        "kaa": "Qaraqalpaqsha",
        "kab": "Taqbaylit",
        "kg": "KiKongo",               # Kongo
        "ki": "Gĩkũyũ",                # Kikuyu
        "kj": "Kuanyama",              # Kuanyama
        "kk": "Қазақша",               # Kazakh
        "kl": "Kalaallisut",           # Kalaallisut; Greenlandic
        "km": "ភាសាខ្មែរ",                # Khmer; Cambodian
        "kn": "ಕನ್ನಡ",                  # Kannada
        "ko": "한국어" ,                # Korean
        "kr": "Kanuri",                # Kanuri
        "ks": "कश्मीरी / كشميري",        # Kashmiri
        "ksh": "Ripoarisch",
        "ku": "Kurdî / كوردی",         # Kurdish
        "kv": "Коми",                  # Komi
        "kw": "Kernewek/Karnuack",     # Cornish
        "ky": "Кыргызча",              # Kirghiz
        "la": "Latina" ,               # Latin
        "lad": "Dzhudezmo",
        "lb": "Lëtzebuergesch" ,       # Letzeburgesch
        "lbe": "Лакку",
        "lg": "Luganda",               # Ganda
        "li": "Limburgs",              # Limburgish; Limburger; Limburgan
        "lij": "Lígur",
        "ln": "Lingala",               # Lingala
        "lmo": "Lumbaart",
        "lo": "ລາວ",                   # Lao; Laotian
        "lt": "Lietuvių" ,             # Lithuanian
        "lua": "Luba",                 # Luba
        "lv": "Latvieš" ,              # Latvian; Lettish
        "map-bms": "Basa Banyumasan",
        "mdf": "Мокшень (Mokshanj Kälj)",
        "mg": "Malagasy",              # Malagasy
        "mh": "Ebon",                  # Marshall
        "mi": "Māori",                 # Maori
        "mk": "Македонски" ,           # Macedonian
        "ml": None,                    # Malayalam
        "mn": "Монгол",                # Mongolian
        "mo": "Молдовеняскэ",          # Moldavian
        "mr": "मराठी" ,                 # Marathi
        "ms": "Bahasa Melay" ,         # Malay
        "mt": "Malti",                 # Maltese
        "mus": "Muskogee",
        "my": "မ္ရန္‌မာစာ",                # Burmese
        "myv": "Эрзянь (Erzjanj Kelj)",
        "mzn": "مَزِروني",
        "na": "dorerin Naoero",        # Nauru
        "nah": "Nāhuatl",
        "nap": "Nnapulitano",
        "nb": "Norsk (Bokmål)",        # Norwegian Bokm@aa{}l
        "nd": None,                    # Ndebele, North
        "nds": "Plattdüütsch",
        "nds-nl": "Nedersaksisch",
        "ne": "नेपाली",                  # Nepali
        "new": "नेपाल भाषा" ,            # Nepal Bhasa
        "ng": "Oshiwambo",             # Ndonga
        "nl": "Nederlands" ,           # Dutch
        "nn": "Nynorsk",               # Norwegian Nynorsk
        "no": "Norsk (Bokmål)" ,       # Norwegian
        "nov": "Novial",
        "nr": None,                    # Ndebele, South
        "nrm": "Nouormand/Normaund",
        "nv": "Diné bizaad",           # Navajo
        "ny": "Chi-Chewa",             # Chichewa; Nyanja
        "oc": "Occitan",               # Occitan; Proven@,{c}al
        "oj": None,                    # Ojibwa
        "om": "Oromoo",                # (Afan) Oromo
        "or": "ଓଡ଼ିଆ",                  # Oriya
        "os": "Иронау",                # Ossetian; Ossetic
        "pa": "ਪੰਜਾਬੀ" ,                # Panjabi; Punjabi
        "pag": "Pangasinan",
        "pam": "Kapampangan",
        "pap": "Papiament",
        "pdc": "Deitsch",
        "pi": "पाऴि",                   # Pali
        "pih": "Norfuk",
        "pl": "Polski" ,               # Polish
        "pms": "Piemontèis" ,
        "ps": "پښتو",                  # Pashto, Pushto
        "pt": "Português" ,            # Portuguese
        "qu": "Runa Simi" ,            # Quechua
        "rm": "Rumantsch",             # Rhaeto-Romance
        "rmy": "romani - रोमानी",
        "rn": "Kirundi",               # Rundi; Kirundi
        "ro": "Română" ,               # Romanian
        "roa-rup": "Armãneashce",
        "roa-tara": "Tarandíne",
        "ru": "Русский" ,              # Russian
        "rw": "Ikinyarwanda",          # Kinyarwanda
        "sa": "संस्कृतम्",                 # Sanskrit
        "sah": "Саха тыла (Saxa Tyla)",
        "sc": "Sardu",                 # Sardinian
        "scn": "Sicilian",
        "sco": "Scots",
        "sd": "سنڌي، سندھی ، सिन्ध",     # Sindhi
        "se": "Sámegiella",            # Northern Sami
        "sg": "Sängö",                 # Sango; Sangro
        "sh": "Srpskohrvatski / Српскохрватски" ,
        "si": "සිංහල",
        "simple": "Simple English" ,
        "sk": "Slovenčina" ,           # Slovak
        "sl": "Slovenščina" ,          # Slovenian
        "sm": "Gagana Samoa",          # Samoan
        "sn": "chiShona",              # Shona
        "so": "Soomaaliga",            # Somali
        "sr": "Српски / Srpski",       # Serbian
        "srn": "Sranantongo",
        "ss": "SiSwati",               # Swati; Siswati
        "st": "Sesotho",               # Sesotho; Sotho, Southern
        "stk": "Seeltersk",
        "su": "Basa Sunda",            # Sundanese
        "sq": "Shqip" ,                # Albanian
        "szl": "Ślůnski",
        "sv": "Svenska" ,              # Swedish
        "sw": "Kiswahili",             # Swahili
        "ta": "தமிழ்" ,                 # Tamil
        "te": "తెలుగు" ,                # Telugu
        "tet": "Tetun",
        "tg": "Тоҷикӣ",                # Tajik
        "th": "ไทย" ,                  # Thai
        "ti": "ትግርኛ",                  # Tigrinya
        "tk": "تركمن / Туркмен",       # Turkmen
        "tl": "Tagalog" ,              # Tagalog
        "tn": "Setswana",              # Tswana; Setswana
        "to": "faka Tonga",            # Tonga (?) # Also ZW ; MW
        "tokipona": "Tokipona",
        "tpi": "Tok Pisin",
        "tr": "Türkçe" ,               # Turkish
        "ts": "Xitsonga",              # Tsonga
        "tt": "Tatarça / Татарча",     # Tatar
        "tum": "chiTumbuka",
        "tw": "Twi",                   # Twi
        "ty": "Reo Mā`ohi",            # Tahitian
        "udm": "Удмурт кыл",
        "ug": "Oyghurque",             # Uighur
        "uk": "Українська" ,           # Ukrainian
        "ur": "اردو",                  # Urdu
        "uz": "O‘zbek",                # Uzbek
        "ve": "Tshivenda",             # Venda
        "vec": "Vèneto",
        "vi": "Tiếng Việt" ,           # Vietnamese
        "vls": "West-Vlams",
        "vo": "Volapük" ,
        "wa": "Walon",                 # Walloon
        "war": "Winaray",
        "wo": "Wolof",                 # Wolof
        "wuu": "吴语",
        "xal": "Хальмг",
        "xh": "isiXhosa",              # Xhosa
        "yi": "ייִדיש",                 # Yiddish
        "yo": "Yorùbá",                # Yoruba
        "za": "Cuengh",                # Zhuang
        "zea": "Zeêuws",
        "zh": "中文" ,                  # Chinese
        "zh-classical": "古文 / 文言文",
        "zh-min-nan": "Bân-lâm-gú",
        "zh-yue": "粵語",
        "zu": "isiZulu"                # Zulu
    }

    def str_nil(self, tok, env):
        """Format a NIL token: always None."""
        return None

    def str_text(self, tok, env):
        """Format a TEXT token: its text, unchanged."""
        return tok[1]

    def str_seq(self, tok, env):
        """Format a SEQ token: the concatenation of its formatted
        members, skipping those that format to nothing."""
        pieces = []
        for t in tok[1:]:
            s = self.fmtok(t, env)
            if s:
                pieces.append(s)
        return "".join(pieces)

    def fmtok(self, tok, env):
        """Format TOK by dispatching on its type to a str_* method.

        Anything that is not a tuple formats to "".  Only str_nil,
        str_text and str_seq are defined in this class; the remaining
        str_* methods are looked up on the instance when needed.
        """
        if not isinstance(tok, tuple):
            return ""
        toktype = tok[0]
        handlers = (
            (self.NIL, "str_nil"),
            (self.TEXT, "str_text"),
            (self.LINK, "str_link"),
            (self.TMPL, "str_tmpl"),
            (self.REF, "str_ref"),
            (self.IT, "str_it"),
            (self.BOLD, "str_bold"),
            (self.HDR, "str_hdr"),
            (self.BAR, "str_bar"),
            (self.ENV, "str_env"),
            (self.ITEM, "str_item"),
            (self.SEQ, "str_seq"),
            (self.PARA, "str_para"),
        )
        for kind, name in handlers:
            if toktype == kind:
                return getattr(self, name)(tok, env)
        return None

    def __str__(self):
        return self.fmtok(self.tree, None)