#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (C) 2008 Sergey Poznyakoff
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import sys
import re
from types import *
# Public names exported by a star-import of this module.
__all__ = [ "BaseWikiMarkup", "WikiMarkup" ]
# Block-level markup recognized at the very beginning of a line: a run of
# "=" (header marker), "----" alone on the line (horizontal bar), or a run
# of list/indentation markers ("*", "#", ":").
eltbeg = re.compile("=+|(^----$)|^[\\*#:]+")
# Start of an inline element: "[[" (link), "{{" (template), "[" (external
# reference), "'''" (bold) or "''" (italics).
eltre = re.compile("(\\[\\[)|(\\{\\{)|\\[|(\\'\\'\\'?)")
# For each opening bracket: the next argument separator ("|") or terminator.
delims = { "[[" : re.compile("\\||(\\]\\])"),
           "{{" : re.compile("\\||(\\}\\})") }
# Terminator string for each opening bracket.
term = { "[[" : "]]" , "{{" : "}}" }
# Terminator regexp for each opening bracket.
ends = { "[[" : re.compile("\\]\\]"),
         "{{" : re.compile("\\}\\}") }
# Candidate end of an italic ("''") or bold ("'''") span: the quotes followed
# by end of line or by a character that is not a quote.
itend = re.compile("\\'\\'($|[^\\'])")
boend = re.compile("\\'\\'\\'($|[^\\'])")


class BaseWikiMarkup:
    """
    A base class for handling Wiki markups.

    It handles:
    1. paragraphs;
    2. basic block markup (headers, numbered and unnumbered lists,
       indentations);
    3. basic inline markup (bold, italic);
    4. basic reference markup (links, templates, external links).

    It does NOT handle:
    1. pseudo-html markup (HTML-like tags and similar);
    2. leading spaces meaning ``preserve formatting'';
    3. tables and math.

    The above rests for FIXME.

    This class relies on its derived classes for providing input.  They must
    overload method `input', which must return one physical line of input
    (including its trailing newline) for each call.

    Variables:

    1. tree
       The parse tree.  Valid after parse() finishes (see below).

    Methods:

    1. parse()
       Parse the input and build parse tree
    2. input()
       Virtual function.  Return next line of input or None on EOF.
    3. output()
       Print the tree in internal representation.

    Tokens are tuples whose first element is one of the token classes
    below.  nextkn() and getkn() are lazy generators that may yield nested
    generators; these must be expanded in the order they are produced (as
    parse() does via expandtok()), because nested block generators read from
    the same input stream.
    """

    ## Token classes
    # NIL: nothing
    NIL = 0
    # TEXT: text
    TEXT = 1
    # LINK: target, text
    LINK = 2
    # Template: target, text
    TMPL = 3
    # External ref: target, text
    REF = 4
    # Italics: text
    IT = 5
    # Bold: text
    BOLD = 6
    # Header: level, text
    HDR = 7
    # Horizontal bar:
    BAR = 8
    # Environment: type, level
    ENV = 9
    # Item: text
    ITEM = 10
    # Sequence: seq
    SEQ = 11
    # Paragraph
    PARA = 12

    # Environment types:
    # Unnumbered list
    ENVUNNUM = 0
    # Numbered list
    ENVNUM = 1
    # Indent
    INDENT = 2

    # Marker character of each environment type, indexed by the type number.
    envtypes = [ "*", "#", ":" ]

    tree = None

    # One-line lookahead buffer: a line pushed back by putback() is returned
    # by the next read in nextkn() instead of calling input().
    la = None

    def itend(self, line, pos):
        """Return the index of the "''" that closes an italic span.

        The search starts at `pos'.  A candidate that is the tail of a
        longer run of quotes (i.e. is preceded by "'") is skipped, unless it
        starts exactly at `pos'.  Return -1 if no terminator is found.
        """
        while 1:
            # NB: `itend' here is the module-level regexp, not this method.
            d = itend.search(line, pos)
            if not d:
                return -1
            start = d.start(0)
            if start == pos or line[start - 1] != "'":
                return start
            pos = start + 1

    def putback(self, line):
        """Store `line' so that the next read in nextkn() returns it."""
        self.la = line

    def nextkn(self, curlev=0, type = -1):
        """Generate block-level tokens from the input.

        curlev -- nesting level of the environment being collected
                  (0 at top level);
        type   -- index in `envtypes' of that environment (-1 at top level).

        Yields PARA, HDR, BAR, ENV and ITEM tokens, and a getkn() generator
        for each line of plain text.  When called for a nested environment
        (curlev > 0), the generator stops at the first line that does not
        belong to it and pushes that line back for the caller.
        """
        while 1:
            if self.la:
                line = self.la
                self.putback(None)
            else:
                try:
                    line = self.input()
                except StopIteration:
                    line = u''
            if not line:
                # End of input.
                self.putback(line)
                break
            if line == '\n':
                yield (self.PARA,)
                continue

            m = eltbeg.match(line)
            marker = m.group(0) if m else None

            if marker and marker[0] in self.envtypes:
                btype = self.envtypes.index(marker[0])
                lev = len(marker)
                if btype != type:
                    # A different kind of environment starts here.
                    self.putback(line)
                    yield (self.ENV, btype, 1, self.nextkn(1, btype))
                elif lev == curlev:
                    yield (self.ITEM,
                           (self.SEQ, self.getkn(line[m.end(0):])))
                elif lev > curlev:
                    # Deeper nesting of the same environment.
                    self.putback(line)
                    yield (self.ENV, btype, curlev + 1,
                           (self.SEQ, self.nextkn(curlev + 1, btype)))
                else:
                    # Line belongs to an enclosing level.
                    self.putback(line)
                    break
                continue

            if curlev > 0:
                # Anything that is not a list line terminates the environment.
                self.putback(line)
                break

            stripped = line.rstrip('\n')
            if marker and marker[0:2] == "==" and stripped.endswith(marker):
                # Slice relative to the stripped line so that a header
                # without a trailing newline does not lose a character.
                title = stripped[m.end(0):len(stripped) - len(marker)]
                yield (self.HDR, len(marker) - 1, self.getkn(title))
            elif marker == "----":
                yield (self.BAR,)
            else:
                # Plain text, including lines that merely start with "="
                # without being a well-formed header.
                yield self.getkn(line)

    def getkn(self, line):
        """Generate inline tokens (TEXT, LINK, TMPL, REF, BOLD, IT) for `line'.

        An element left unterminated extends to the end of the line.  TEXT
        tokens may carry an empty string; expandtok() drops those.
        """
        end = len(line)
        pos = 0
        while pos < end:
            m = eltre.search(line, pos)
            if not m:
                yield (self.TEXT, line[pos:])
                break

            yield (self.TEXT, line[pos:m.start(0)])
            opener = m.group(0)
            start = pos = m.end(0)

            if opener == "[[" or opener == "{{":
                d = delims[opener].search(line, start)
                if d is None:
                    # Neither "|" nor a terminator: the rest is the target.
                    target = (self.TEXT, line[start:])
                    text = (self.NIL,)
                    pos = end
                elif d.group(0) == "|":
                    e = ends[opener].search(line, d.end(0))
                    stop = e.start(0) if e else end
                    target = (self.TEXT, line[start:d.start(0)])
                    text = (self.SEQ, self.getkn(line[d.end(0):stop]))
                    pos = e.end(0) if e else end
                else:
                    # d matched the terminator, term[opener].
                    target = (self.TEXT, line[start:d.start(0)])
                    text = (self.NIL,)
                    pos = d.end(0)
                if opener == "[[":
                    yield (self.LINK, target, text)
                else:
                    yield (self.TMPL, target, text)
            elif opener == "[":
                i = line.find("]", start)
                if i == -1:
                    i = end
                (target, sep, text) = line[start:i].partition(' ')
                yield (self.REF,
                       (self.TEXT, target),
                       (self.SEQ, self.getkn(text)))
                pos = min(i + 1, end)
            elif opener == "'''":
                e = boend.search(line, start)
                if e:
                    i = e.start(0)
                    # Skip only the three closing quotes; boend also matched
                    # the character after them, which must not be lost.
                    pos = i + 3
                else:
                    i = pos = end
                yield (self.BOLD,
                       (self.SEQ, self.getkn(line[start:i])))
            elif opener == "''":
                i = self.itend(line, start)
                if i == -1:
                    i = end
                yield (self.IT,
                       (self.SEQ, self.getkn(line[start:i])))
                pos = min(i + 2, end)

    def input(self):
        """Return the next line of input, or None on EOF.

        Virtual function: this base implementation always reports EOF.
        """
        return None

    def _expandseq(self, toks):
        """Expand each token of the iterable `toks' and collapse the result.

        Return a SEQ tuple if more than one non-empty subtree remains, the
        subtree itself if exactly one remains, and None otherwise.
        """
        subtree = [self.SEQ]
        for t in toks:
            x = self.expandtok(t)
            if x:
                subtree.append(x)
        if len(subtree) > 2:
            return tuple(subtree)
        if len(subtree) == 2:
            return subtree[1]
        return None

    def expandtok(self, tok):
        """Recursively turn token `tok' into a tree of plain tuples.

        Generators are consumed, NIL tokens and empty TEXT tokens become
        None, and sequences are collapsed (see _expandseq).
        """
        if isinstance(tok, GeneratorType):
            return self._expandseq(tok)

        toktype = tok[0]
        if toktype == self.NIL:
            return None
        if toktype == self.TEXT:
            return tok if tok[1] != '' else None
        if toktype in (self.LINK, self.TMPL, self.REF):
            return toktype, self.expandtok(tok[1]), self.expandtok(tok[2])
        if toktype in (self.IT, self.BOLD, self.ITEM):
            return toktype, self.expandtok(tok[1])
        if toktype == self.HDR:
            return toktype, tok[1], self.expandtok(tok[2])
        if toktype == self.ENV:
            return toktype, tok[1], tok[2], self.expandtok(tok[3])
        if toktype == self.SEQ:
            return self._expandseq(tok[1:])
        # BAR, PARA: no children to expand.
        return tok

    def parse(self):
        """Parse the whole input and store the result in self.tree."""
        tree = [self.SEQ]
        for tok in self.nextkn():
            # Expand right away: nested generators share the input stream.
            tree.append(self.expandtok(tok))
        self.tree = tuple(tree)

    def _emit(self, indent, label):
        """Write `label' to stdout preceded by indent + 1 spaces.

        This is exactly what the statement pair ``print " " * indent,''
        followed by ``print label'' produces in Python 2.
        """
        sys.stdout.write(" " * (indent + 1) + label + "\n")

    @staticmethod
    def _escape(text):
        """Return `text' with non-printable characters backslash-escaped."""
        try:
            return text.encode('string_escape')
        except (LookupError, UnicodeError):
            # The codec is unknown (Python 3) or cannot handle this text.
            escaped = text.encode('unicode_escape')
            if isinstance(escaped, str):
                return escaped
            return escaped.decode('ascii')

    def prtok(self, tok, indent):
        """Print subtree `tok' on stdout, indented by `indent' levels."""
        if not tok:
            self._emit(indent, "None")
            return

        toktype = tok[0]
        if toktype == self.SEQ:
            for t in tok[1:]:
                self.prtok(t, indent)
        elif toktype == self.NIL:
            self._emit(indent, "NIL")
        elif toktype == self.TEXT:
            self._emit(indent, "TEXT \"%s\"" % self._escape(tok[1]))
        elif toktype == self.LINK:
            self._emit(indent, "LINK ")
            self.prtok(tok[1], indent+1) # target
            self.prtok(tok[2], indent+1) # text
        elif toktype == self.TMPL:
            self._emit(indent, "TMPL")
            self.prtok(tok[1], indent+1) # target
            self.prtok(tok[2], indent+1) # text
        elif toktype == self.REF:
            self._emit(indent, "REF")
            self.prtok(tok[1], indent+1) # target
            self.prtok(tok[2], indent+1) # text
        elif toktype == self.IT:
            self._emit(indent, "IT")
            self.prtok(tok[1], indent+1)
        elif toktype == self.BOLD:
            self._emit(indent, "BOLD")
            self.prtok(tok[1], indent+1)
        elif toktype == self.HDR:
            self._emit(indent, "HDR %s" % (tok[1],))
            self.prtok(tok[2], indent+1)
        elif toktype == self.BAR:
            self._emit(indent, "BAR")
        elif toktype == self.ENV:
            self._emit(indent,
                       "ENV  %s %s" % (self.envtypes[tok[1]], tok[2]))
            self.prtok(tok[3], indent+1)
        elif toktype == self.ITEM:
            self._emit(indent, "ITEM")
            self.prtok(tok[1], indent+1)
        elif toktype == self.PARA:
            self._emit(indent, "PARA")

    def output(self):
        """Print the parse tree in internal representation."""
        self.prtok(self.tree, 0)
class WikiMarkup (BaseWikiMarkup):
    """
    A derived class, that supplies a basic input method.

    Three types of inputs are available:

    1. filename=<name>
       The file <name> is opened and used for input.
    2. file=<file>
       The already opened file <file> is used for input.
    3. text=<string>
       Input is taken from <string>, line by line.

    The keywords lang, html_base, image_base and media_base override the
    class attributes of the same names.  Other keywords and all positional
    arguments are ignored.

    Usage:

       obj = WikiMarkup(arg=val)
       obj.parse()
       ... Do whatever you need with obj.tree ...

    Formatting: fmtok() (and hence str(obj)) dispatches each token to a
    str_* method.  Only str_nil, str_text and str_seq are defined in this
    class; the other formatters it calls (str_link, str_tmpl, str_ref,
    str_it, str_bold, str_hdr, str_bar, str_env, str_item, str_para) are
    not defined here and have to be supplied by a derived class.
    """

    file = None
    text = None
    lang = 'en'
    html_base = 'http://%(lang)s.wiktionary.org/wiki'
    image_base = 'http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf'
    media_base = 'http://www.mediawiki.org/xml/export-0.3'

    def __init__(self, *args, **keywords):
        """Set up the input source and options from keyword arguments."""
        for kw, value in keywords.items():
            if kw == 'filename':
                self.file = open(value)
            elif kw == 'text':
                # Keep the newline on every line, exactly as readline()
                # does for files: the parser recognizes paragraph breaks
                # as "\n" and treats an empty string as end of input.
                lines = value.split("\n")
                self.text = [ln + "\n" for ln in lines[:-1]]
                if lines[-1]:
                    self.text.append(lines[-1])
            elif kw in ('file', 'lang', 'html_base',
                        'image_base', 'media_base'):
                setattr(self, kw, value)

    def __del__(self):
        """Close the input file, if any (also one passed in via file=)."""
        if self.file:
            self.file.close()

    def input(self):
        """Return the next input line, or None (or '') at end of input."""
        if self.file:
            return self.file.readline()
        elif self.text:
            return self.text.pop(0)
        else:
            return None

    # ISO 639
    langtab = {
        "aa": "Afar", # Afar
        "ab": "Аҧсуа", # Abkhazian
        "ae": None, # Avestan
        "af": "Afrikaans", # Afrikaans
        "ak": "Akana", # Akan
        "als": "Alemannisch",
        "am": "አማርኛ", # Amharic
        "an": "Aragonés", # Aragonese
        "ang": "Englisc",
        "ar": "العربية" , # Arabic
        "arc": "ܐܪܡܝܐ",
        "as": "অসমীয়া", # Assamese
        "ast": "Asturianu",
        "av": "Авар", # Avaric
        "ay": "Aymar", # Aymara
        "az": "Azərbaycan" , # Azerbaijani
        "ba": "Башҡорт", # Bashkir
        "bar": "Boarisch",
        "bat-smg": "Žemaitėška",
        "bcl": "Bikol",
        "be": "Беларуская", # Byelorussian; Belarusian
        "be-x-old": "Беларуская (тарашкевіца)",
        "bg": "Български", # Bulgarian
        "bh": "भोजपुरी", # Bihari
        "bi": "Bislama", # Bislama
        "bm": "Bamanankan", # Bambara
        "bn": "বাংলা" , # Bengali; Bangla
        "bo": "བོད་སྐད", # Tibetan
        "bpy": "ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী" ,
        "br": "Brezhoneg" , # Breton
        "bs": "Bosanski" , # Bosnian
        "bug": "Basa Ugi",
        "bxr": "Буряад",
        "ca": "Català" , # Catalan
        "cbk-zam": "Chavacano de Zamboanga",
        "cdo": "Mìng-dĕ̤ng-ngṳ̄",
        "cho": "Choctaw",
        "ce": "Нохчийн", # Chechen
        "ceb": "Sinugboanong Binisaya" , # Cebuano
        "ch": "Chamoru", # Chamorro
        "chr": "ᏣᎳᎩ",
        "chy": "Tsetsêhestâhese",
        "co": "Corsu", # Corsican
        "cr": "Nehiyaw", # Cree
        "crh": "Qırımtatarca",
        "cs": "Česky" , # Czech
        "csb": "Kaszëbsczi",
        "cu": "Словѣньскъ", # Church Slavic
        "cv": "Чăваш", # Chuvash
        "cy": "Cymraeg" , # Welsh
        "da": "Dansk" , # Danish
        "de": "Deutsch" , # German
        "diq": "Zazaki", # Dimli (Southern Zazaki)
        "dsb": "Dolnoserbski",
        "dv": "ދިވެހިބަސް", # Divehi
        "dz": "ཇོང་ཁ", # Dzongkha; Bhutani
        "ee": "Eʋegbe", # Ewe
        "el": "Ελληνικά" , # Greek
        "eml": "Emiliàn e rumagnòl",
        "en": "English" , # English
        "eo": "Esperanto" ,
        "es": "Español" , # Spanish
        "et": "Eesti" , # Estonian
        "eu": "Euskara" , # Basque
        "ext": "Estremeñu",
        "fa": "فارسی" , # Persian
        "ff": "Fulfulde", # Fulah
        "fi": "Suomi" , # Finnish
        "fiu-vro": "Võro",
        "fj": "Na Vosa Vakaviti",# Fijian; Fiji
        "fo": "Føroyskt" , # Faroese
        "fr": "Français" , # French
        "frp": "Arpitan",
        "fur": "Furlan",
        "fy": "Frysk", # Frisian
        "ga": "Gaeilge", # Irish
        "gan": "贛語 (Gànyŭ)",
        "gd": "Gàidhlig", # Scots; Gaelic
        "gl": "Gallego" , # Gallegan; Galician
        "glk": "گیلکی",
        "got": "𐌲𐌹𐌺 ",
        "gn": "Avañe'ẽ", # Guarani
        "gu": "ગુજરાતી", # Gujarati
        "gv": "Gaelg", # Manx
        "ha": "هَوُسَ", # Hausa
        "hak": "Hak-kâ-fa / 客家話",
        "haw": "Hawai`i",
        "he": "עברית" , # Hebrew (formerly iw)
        "hi": "हिन्दी" , # Hindi
        "hif": "Fiji Hindi",
        "ho": "Hiri Motu", # Hiri Motu
        "hr": "Hrvatski" , # Croatian
        "hsb": "Hornjoserbsce",
        "ht": "Krèyol ayisyen" , # Haitian; Haitian Creole
        "hu": "Magyar" , # Hungarian
        "hy": "Հայերեն", # Armenian
        "hz": "Otsiherero", # Herero
        "ia": "Interlingua",
        "ie": "Interlingue",
        "id": "Bahasa Indonesia",# Indonesian (formerly in)
        "ig": "Igbo", # Igbo
        "ii": "ꆇꉙ ", # Sichuan Yi
        "ik": "Iñupiak", # Inupiak
        "ilo": "Ilokano",
        "io": "Ido" ,
        "is": "Íslenska" , # Icelandic
        "it": "Italiano" , # Italian
        "iu": "ᐃᓄᒃᑎᑐᑦ", # Inuktitut
        "ja": "日本語", # Japanese
        "jbo": "Lojban",
        "jv": "Basa Jawa", # Javanese
        "ka": "ქართული" , # Georgian
        "kaa": "Qaraqalpaqsha",
        "kab": "Taqbaylit",
        "kg": "KiKongo", # Kongo
        "ki": "Gĩkũyũ", # Kikuyu
        "kj": "Kuanyama", # Kuanyama
        "kk": "Қазақша", # Kazakh
        "kl": "Kalaallisut", # Kalaallisut; Greenlandic
        "km": "ភាសាខ្មែរ", # Khmer; Cambodian
        "kn": "ಕನ್ನಡ", # Kannada
        "ko": "한국어" , # Korean
        "kr": "Kanuri", # Kanuri
        "ks": "कश्मीरी / كشميري", # Kashmiri
        "ksh": "Ripoarisch",
        "ku": "Kurdî / كوردی", # Kurdish
        "kv": "Коми", # Komi
        "kw": "Kernewek/Karnuack", # Cornish
        "ky": "Кыргызча", # Kirghiz
        "la": "Latina" , # Latin
        "lad": "Dzhudezmo",
        "lb": "Lëtzebuergesch" , # Letzeburgesch
        "lbe": "Лакку",
        "lg": "Luganda", # Ganda
        "li": "Limburgs", # Limburgish; Limburger; Limburgan
        "lij": "Lígur",
        "ln": "Lingala", # Lingala
        "lmo": "Lumbaart",
        "lo": "ລາວ", # Lao; Laotian
        "lt": "Lietuvių" , # Lithuanian
        "lua": "Luba", # Luba
        "lv": "Latviešu" , # Latvian; Lettish
        "map-bms": "Basa Banyumasan",
        "mdf": "Мокшень (Mokshanj Kälj)",
        "mg": "Malagasy", # Malagasy
        "mh": "Ebon", # Marshall
        "mi": "Māori", # Maori
        "mk": "Македонски" , # Macedonian
        "ml": None, # Malayalam
        "mn": "Монгол", # Mongolian
        "mo": "Молдовеняскэ", # Moldavian
        "mr": "मराठी" , # Marathi
        "ms": "Bahasa Melayu" , # Malay
        "mt": "Malti", # Maltese
        "mus": "Muskogee",
        "my": "မ္ရန္မာစာ", # Burmese
        "myv": "Эрзянь (Erzjanj Kelj)",
        "mzn": "مَزِروني",
        "na": "dorerin Naoero", # Nauru
        "nah": "Nāhuatl",
        "nap": "Nnapulitano",
        "nb": "Norsk (Bokmål)", # Norwegian Bokmål
        "nd": None, # Ndebele, North
        "nds": "Plattdüütsch",
        "nds-nl": "Nedersaksisch",
        "ne": "नेपाली", # Nepali
        "new": "नेपाल भाषा" , # Nepal Bhasa
        "ng": "Oshiwambo", # Ndonga
        "nl": "Nederlands" , # Dutch
        "nn": "Nynorsk", # Norwegian Nynorsk
        "no": "Norsk (Bokmål)" , # Norwegian
        "nov": "Novial",
        "nr": None, # Ndebele, South
        "nrm": "Nouormand/Normaund",
        "nv": "Diné bizaad", # Navajo
        "ny": "Chi-Chewa", # Chichewa; Nyanja
        "oc": "Occitan", # Occitan; Provençal
        "oj": None, # Ojibwa
        "om": "Oromoo", # (Afan) Oromo
        "or": "ଓଡ଼ିଆ", # Oriya
        "os": "Иронау", # Ossetian; Ossetic
        "pa": "ਪੰਜਾਬੀ" , # Panjabi; Punjabi
        "pag": "Pangasinan",
        "pam": "Kapampangan",
        "pap": "Papiamentu",
        "pdc": "Deitsch",
        "pi": "पाऴि", # Pali
        "pih": "Norfuk",
        "pl": "Polski" , # Polish
        "pms": "Piemontèis" ,
        "ps": "پښتو", # Pashto, Pushto
        "pt": "Português" , # Portuguese
        "qu": "Runa Simi" , # Quechua
        "rm": "Rumantsch", # Rhaeto-Romance
        "rmy": "romani - रोमानी",
        "rn": "Kirundi", # Rundi; Kirundi
        "ro": "Română" , # Romanian
        "roa-rup": "Armãneashce",
        "roa-tara": "Tarandíne",
        "ru": "Русский" , # Russian
        "rw": "Ikinyarwanda", # Kinyarwanda
        "sa": "संस्कृतम्", # Sanskrit
        "sah": "Саха тыла (Saxa Tyla)",
        "sc": "Sardu", # Sardinian
        "scn": "Sicilianu",
        "sco": "Scots",
        "sd": "سنڌي، سندھی ، सिन्ध", # Sindhi
        "se": "Sámegiella", # Northern Sami
        "sg": "Sängö", # Sango; Sangro
        "sh": "Srpskohrvatski / Српскохрватски" ,
        "si": "සිංහල",
        "simple": "Simple English" ,
        "sk": "Slovenčina" , # Slovak
        "sl": "Slovenščina" , # Slovenian
        "sm": "Gagana Samoa", # Samoan
        "sn": "chiShona", # Shona
        "so": "Soomaaliga", # Somali
        "sr": "Српски / Srpski", # Serbian
        "srn": "Sranantongo",
        "ss": "SiSwati", # Swati; Siswati
        "st": "Sesotho", # Sesotho; Sotho, Southern
        "stk": "Seeltersk",
        "su": "Basa Sunda", # Sundanese
        "sq": "Shqip" , # Albanian
        "szl": "Ślůnski",
        "sv": "Svenska" , # Swedish
        "sw": "Kiswahili", # Swahili
        "ta": "தமிழ்" , # Tamil
        "te": "తెలుగు" , # Telugu
        "tet": "Tetun",
        "tg": "Тоҷикӣ", # Tajik
        "th": "ไทย" , # Thai
        "ti": "ትግርኛ", # Tigrinya
        "tk": "تركمن / Туркмен", # Turkmen
        "tl": "Tagalog" , # Tagalog
        "tn": "Setswana", # Tswana; Setswana
        "to": "faka Tonga", # Tonga (?) # Also ZW ; MW
        "tokipona": "Tokipona",
        "tpi": "Tok Pisin",
        "tr": "Türkçe" , # Turkish
        "ts": "Xitsonga", # Tsonga
        "tt": "Tatarça / Татарча", # Tatar
        "tum": "chiTumbuka",
        "tw": "Twi", # Twi
        "ty": "Reo Mā`ohi", # Tahitian
        "udm": "Удмурт кыл",
        "ug": "Oyghurque", # Uighur
        "uk": "Українська" , # Ukrainian
        "ur": "اردو", # Urdu
        "uz": "O‘zbek", # Uzbek
        "ve": "Tshivenda", # Venda
        "vec": "Vèneto",
        "vi": "Tiếng Việt" , # Vietnamese
        "vls": "West-Vlams",
        "vo": "Volapük" ,
        "wa": "Walon", # Walloon
        "war": "Winaray",
        "wo": "Wolof", # Wolof
        "wuu": "吴语",
        "xal": "Хальмг",
        "xh": "isiXhosa", # Xhosa
        "yi": "ייִדיש", # Yiddish
        "yo": "Yorùbá", # Yoruba
        "za": "Cuengh", # Zhuang
        "zea": "Zeêuws",
        "zh": "中文" , # Chinese
        "zh-classical": "古文 / 文言文",
        "zh-min-nan": "Bân-lâm-gú",
        "zh-yue": "粵語",
        "zu": "isiZulu" # Zulu
    }

    # Name of the formatter method for each token class (see fmtok).
    _formatters = {
        BaseWikiMarkup.NIL: 'str_nil',
        BaseWikiMarkup.TEXT: 'str_text',
        BaseWikiMarkup.LINK: 'str_link',
        BaseWikiMarkup.TMPL: 'str_tmpl',
        BaseWikiMarkup.REF: 'str_ref',
        BaseWikiMarkup.IT: 'str_it',
        BaseWikiMarkup.BOLD: 'str_bold',
        BaseWikiMarkup.HDR: 'str_hdr',
        BaseWikiMarkup.BAR: 'str_bar',
        BaseWikiMarkup.ENV: 'str_env',
        BaseWikiMarkup.ITEM: 'str_item',
        BaseWikiMarkup.SEQ: 'str_seq',
        BaseWikiMarkup.PARA: 'str_para',
    }

    def str_nil(self, tok, env):
        """Format a NIL token: there is nothing to output."""
        return None

    def str_text(self, tok, env):
        """Format a TEXT token: return its text unchanged."""
        return tok[1]

    def str_seq(self, tok, env):
        """Format a SEQ token: concatenate its formatted members."""
        parts = []
        for t in tok[1:]:
            s = self.fmtok(t, env)
            if s:
                parts.append(s)
        return "".join(parts)

    def fmtok(self, tok, env):
        """Format token `tok' by calling the str_* method for its class.

        `env' is passed through to the formatter untouched.  Return "" if
        `tok' is not a tuple (e.g. None) and None for an unknown token
        class.  Formatters missing from the actual class raise
        AttributeError when their token class is encountered.
        """
        if not isinstance(tok, tuple):
            return ""
        name = self._formatters.get(tok[0])
        if name is None:
            return None
        return getattr(self, name)(tok, env)

    def __str__(self):
        """Return the formatted parse tree (valid after parse())."""
        return self.fmtok(self.tree, None)