diff options
-rw-r--r-- | wiki2html.py | 69 | ||||
-rw-r--r-- | wiki2text.py | 3 | ||||
-rw-r--r-- | wikimarkup.py | 49 |
3 files changed, 95 insertions, 26 deletions
diff --git a/wiki2html.py b/wiki2html.py index 907e3b1..7fa97b7 100644 --- a/wiki2html.py +++ b/wiki2html.py @@ -27,11 +27,43 @@ class HtmlWikiMarkup (WikiMarkup): Should be before. """ + # FIXME: Awful kludge + image_kw = [ 'Image', + 'Grafika', + 'Bild', + 'Εικόνα', + 'Dosiero', + 'Slika', + 'Resim' + ] + + ST_INIT = 0 + ST_PARA = 1 + ST_OPEN = 2 + + state = [] + + def opara(self): + if self.state[-1] == self.ST_PARA: + self.state[-1] = self.ST_OPEN + return "<p>" + else: + return "" + + def cpara(self): + state = self.state.pop(); + self.state.append(self.ST_INIT) + if state == self.ST_OPEN: + return "</p>" + else: + return "" + + def target(self, t): (qual,sep,tgt) = t.partition(':') r = None if tgt != '': - if qual in ('Image', 'Grafika'): + if qual in self.image_kw: t = self.image_base + urllib.quote(tgt) + '/250px-' + urllib.quote(tgt) elif qual == "Media": t = self.media_base + '/' + tgt @@ -96,12 +128,31 @@ class HtmlWikiMarkup (WikiMarkup): self.fmtok(tok[1], env), self.envel[env[1]]) - def str_seq(self, tok, env): + def str_para(self, tok, env): + s = self.cpara() + self.state.append(self.ST_PARA) + return s + + def fmtok(self, tok, env): + if type(tok) != TupleType: + return "" + if tok[0] in [ self.ENV, self.HDR ]: + s = self.cpara() + elif tok[0] == self.BAR: + s = self.str_para(tok, env) + elif tok[0] in [ self.NIL, self.SEQ ]: s = "" - for t in tok[1:]: - s += self.fmtok(t, env) + else: + s = self.opara() + s1 = WikiMarkup.fmtok(self, tok, env) + if s1: + s += s1 return s + def __str__(self): + self.state = [ self.ST_PARA ] + return WikiMarkup.__str__(self) + self.cpara() + class HtmlWiktionaryMarkup (HtmlWikiMarkup): @@ -109,7 +160,7 @@ class HtmlWiktionaryMarkup (HtmlWikiMarkup): A class for translating Wiktionary articles into HTML. This version does not do much, except that it tries to correctly format templates. But "tries" does not mean "does". The heuristics - used here is clearly not enogh to cope with it. + used here is clearly not enough to cope with it. 1. FIXME: The right solution would be to have a database of templates with their @@ -134,12 +185,14 @@ class HtmlWiktionaryMarkup (HtmlWikiMarkup): seq_pos = 0 def str_seq(self, tok, env): - s = "" + str = "" self.seq_pos=0 for t in tok[1:]: - s += self.fmtok(t, env) + s = self.fmtok(t, env) + if s: + str += s self.seq_pos += 1 - return s + return str def str_tmpl(self, tok, env): arg = self.fmtok(tok[1], env) diff --git a/wiki2text.py b/wiki2text.py index e943f32..3669bd7 100644 --- a/wiki2text.py +++ b/wiki2text.py @@ -136,6 +136,9 @@ class TextWikiMarkup (WikiMarkup): return "" + self.indent(lev, "- " + self.fmtok(tok[1], env)) + def str_para(self, tok, env): + return "\n" + def __str__(self): return self.fmtok(self.tree, None) diff --git a/wikimarkup.py b/wikimarkup.py index d9ae7cc..e2a1cab 100644 --- a/wikimarkup.py +++ b/wikimarkup.py @@ -35,10 +35,11 @@ class BaseWikiMarkup: """ A base class for handling Wiki markups. It handles: - 1. basic block markup (headers, numbered and unnumbered lists, + 1. paragraphs; + 2. basic block markup (headers, numbered and unnumbered lists, indentations); - 2. basic inline markup (bold, italic); - 3. basic reference markup (links, templates, external links). + 3. basic inline markup (bold, italic); + 4. basic reference markup (links, templates, external links). It does NOT handle: 1. pseudo-html markup (<nowiki></nowiki>, and similar); 2. leading spaces meaning ``preserve formatting''; @@ -90,6 +91,8 @@ It handles: ITEM = 10 # Sequence: seq SEQ = 11 + # Paragraph + PARA = 12 # Environment types: # Unnumbered list @@ -130,6 +133,10 @@ It handles: self.putback(line) break + if line == '\n': + yield(self.PARA,) + continue + m = eltbeg.match(line) if m: if m.group(0)[0] in self.envtypes: @@ -247,8 +254,6 @@ It handles: return toktype, self.expandtok(tok[1]) elif toktype == self.HDR: return toktype, tok[1], self.expandtok(tok[2]) - elif toktype == self.BAR: - return tok elif toktype == self.ENV: return toktype,tok[1],tok[2],self.expandtok(tok[3]) elif toktype == self.SEQ: @@ -264,6 +269,8 @@ It handles: subtree.append(x) return tuple(subtree) if len(subtree) > 2 else \ subtree[1] if len(subtree) == 2 else None + else: + return tok def parse(self): tree = [self.SEQ] @@ -314,6 +321,8 @@ It handles: elif toktype == self.ITEM: print "ITEM" self.prtok(tok[1], indent+1) + elif toktype == self.PARA: + print "PARA" def output(self): self.prtok(self.tree, 0) @@ -381,7 +390,7 @@ class WikiMarkup (BaseWikiMarkup): "ab": "Аҧсуа", # Abkhazian "ae": None, # Avestan "af": "Afrikaans", # Afrikaans - "ak": "Akana", # Akan # or ak_CI + "ak": "Akana", # Akan "als": "Alemannisch", "am": "አማርኛ", # Amharic "an": "Aragonés", # Aragonese @@ -390,7 +399,7 @@ class WikiMarkup (BaseWikiMarkup): "arc": "ܐܪܡܝܐ", "as": "অসমীয়া", # Assamese "ast": "Asturian", - "av": "Авар", # Avaric # Spoken mainly in Dagestan + "av": "Авар", # Avaric "ay": "Aymar", # Aymara "az": "Azərbaycan" , # Azerbaijani @@ -448,7 +457,7 @@ class WikiMarkup (BaseWikiMarkup): "ext": "Estremeñ", "fa": "فارسی" , # Persian - "ff": "Fulfulde", # Fulah # Also NG, MR, and many others + "ff": "Fulfulde", # Fulah "fi": "Suomi" , # Finnish "fiu-vro": "Võro", "fj": "Na Vosa Vakaviti",# Fijian; Fiji @@ -501,7 +510,7 @@ class WikiMarkup (BaseWikiMarkup): "ka": "ქართული" , # Georgian "kaa": "Qaraqalpaqsha", "kab": "Taqbaylit", - "kg": "KiKongo", # Kongo # also CD and AO + "kg": "KiKongo", # Kongo "ki": "Gĩkũyũ", # Kikuyu "kj": "Kuanyama", # Kuanyama "kk": "Қазақша", # Kazakh @@ -528,7 +537,7 @@ class WikiMarkup (BaseWikiMarkup): "lmo": "Lumbaart", "lo": "ລາວ", # Lao; Laotian "lt": "Lietuvių" , # Lithuanian - "l": None, # Luba-Katanga + "lua": "Luba", # Luba "lv": "Latvieš" , # Latvian; Lettish "map-bms": "Basa Banyumasan", @@ -598,7 +607,7 @@ class WikiMarkup (BaseWikiMarkup): "sa": "संस्कृतम्", # Sanskrit "sah": "Саха тыла (Saxa Tyla)", - "sc": "Sard", # Sardinian + "sc": "Sardu", # Sardinian "scn": "Sicilian", "sco": "Scots", "sd": "سنڌي، سندھی ، सिन्ध", # Sindhi @@ -621,7 +630,7 @@ class WikiMarkup (BaseWikiMarkup): "sq": "Shqip" , # Albanian "szl": "Ślůnski", "sv": "Svenska" , # Swedish - "sw": "Kiswahili", # Swahili # Also KE + "sw": "Kiswahili", # Swahili "ta": "தமிழ்" , # Tamil "te": "తెలుగు" , # Telugu @@ -636,7 +645,7 @@ class WikiMarkup (BaseWikiMarkup): "tokipona": "Tokipona", "tpi": "Tok Pisin", "tr": "Türkçe" , # Turkish - "ts": "Xitsonga", # Tsonga # ZA SZ XW + "ts": "Xitsonga", # Tsonga "tt": "Tatarça / Татарча", # Tatar "tum": "chiTumbuka", "tw": "Twi", # Twi @@ -662,7 +671,7 @@ class WikiMarkup (BaseWikiMarkup): "xal": "Хальмг", "xh": "isiXhosa", # Xhosa - "yi": "ייִדיש", # Yiddish (formerly ji) + "yi": "ייִדיש", # Yiddish "yo": "Yorùbá", # Yoruba "za": "Cuengh", # Zhuang @@ -675,16 +684,18 @@ class WikiMarkup (BaseWikiMarkup): } def str_nil(self, tok, env): - return "" + return None def str_text(self, tok, env): return tok[1] def str_seq(self, tok, env): - s = "" + str = "" for t in tok[1:]: - s += self.fmtok(t, env) - return s + s = self.fmtok(t, env) + if s: + str += s + return str def fmtok(self, tok, env): if type(tok) != TupleType: @@ -714,6 +725,8 @@ class WikiMarkup (BaseWikiMarkup): return self.str_item(tok, env) elif toktype == self.SEQ: return self.str_seq(tok, env) + elif toktype == self.PARA: + return self.str_para(tok, env) def __str__(self): return self.fmtok(self.tree, None) |