summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- wiki2html.py 69
-rw-r--r-- wiki2text.py 3
-rw-r--r-- wikimarkup.py 49
3 files changed, 95 insertions, 26 deletions
diff --git a/wiki2html.py b/wiki2html.py
index 907e3b1..7fa97b7 100644
--- a/wiki2html.py
+++ b/wiki2html.py
@@ -27,11 +27,43 @@ class HtmlWikiMarkup (WikiMarkup):
Should be before.
"""
+ # FIXME: Awful kludge
+ image_kw = [ 'Image',
+ 'Grafika',
+ 'Bild',
+ 'Εικόνα',
+ 'Dosiero',
+ 'Slika',
+ 'Resim'
+ ]
+
+ ST_INIT = 0
+ ST_PARA = 1
+ ST_OPEN = 2
+
+ state = []
+
+ def opara(self):
+ if self.state[-1] == self.ST_PARA:
+ self.state[-1] = self.ST_OPEN
+ return "<p>"
+ else:
+ return ""
+
+ def cpara(self):
+ state = self.state.pop();
+ self.state.append(self.ST_INIT)
+ if state == self.ST_OPEN:
+ return "</p>"
+ else:
+ return ""
+
+
def target(self, t):
(qual,sep,tgt) = t.partition(':')
r = None
if tgt != '':
- if qual in ('Image', 'Grafika'):
+ if qual in self.image_kw:
t = self.image_base + urllib.quote(tgt) + '/250px-' + urllib.quote(tgt)
elif qual == "Media":
t = self.media_base + '/' + tgt
@@ -96,12 +128,31 @@ class HtmlWikiMarkup (WikiMarkup):
self.fmtok(tok[1], env),
self.envel[env[1]])
- def str_seq(self, tok, env):
+ def str_para(self, tok, env):
+ s = self.cpara()
+ self.state.append(self.ST_PARA)
+ return s
+
+ def fmtok(self, tok, env):
+ if type(tok) != TupleType:
+ return ""
+ if tok[0] in [ self.ENV, self.HDR ]:
+ s = self.cpara()
+ elif tok[0] == self.BAR:
+ s = self.str_para(tok, env)
+ elif tok[0] in [ self.NIL, self.SEQ ]:
s = ""
- for t in tok[1:]:
- s += self.fmtok(t, env)
+ else:
+ s = self.opara()
+ s1 = WikiMarkup.fmtok(self, tok, env)
+ if s1:
+ s += s1
return s
+ def __str__(self):
+ self.state = [ self.ST_PARA ]
+ return WikiMarkup.__str__(self) + self.cpara()
+
class HtmlWiktionaryMarkup (HtmlWikiMarkup):
@@ -109,7 +160,7 @@ class HtmlWiktionaryMarkup (HtmlWikiMarkup):
A class for translating Wiktionary articles into HTML.
This version does not do much, except that it tries to correctly
format templates. But "tries" does not mean "does". The heuristics
- used here is clearly not enogh to cope with it.
+ used here is clearly not enough to cope with it.
1. FIXME:
The right solution would be to have a database of templates with their
@@ -134,12 +185,14 @@ class HtmlWiktionaryMarkup (HtmlWikiMarkup):
seq_pos = 0
def str_seq(self, tok, env):
- s = ""
+ str = ""
self.seq_pos=0
for t in tok[1:]:
- s += self.fmtok(t, env)
+ s = self.fmtok(t, env)
+ if s:
+ str += s
self.seq_pos += 1
- return s
+ return str
def str_tmpl(self, tok, env):
arg = self.fmtok(tok[1], env)
diff --git a/wiki2text.py b/wiki2text.py
index e943f32..3669bd7 100644
--- a/wiki2text.py
+++ b/wiki2text.py
@@ -136,6 +136,9 @@ class TextWikiMarkup (WikiMarkup):
return "" + self.indent(lev,
"- " + self.fmtok(tok[1], env))
+ def str_para(self, tok, env):
+ return "\n"
+
def __str__(self):
return self.fmtok(self.tree, None)
diff --git a/wikimarkup.py b/wikimarkup.py
index d9ae7cc..e2a1cab 100644
--- a/wikimarkup.py
+++ b/wikimarkup.py
@@ -35,10 +35,11 @@ class BaseWikiMarkup:
"""
A base class for handling Wiki markups.
It handles:
- 1. basic block markup (headers, numbered and unnumbered lists,
+ 1. paragraphs;
+ 2. basic block markup (headers, numbered and unnumbered lists,
indentations);
- 2. basic inline markup (bold, italic);
- 3. basic reference markup (links, templates, external links).
+ 3. basic inline markup (bold, italic);
+ 4. basic reference markup (links, templates, external links).
It does NOT handle:
1. pseudo-html markup (<nowiki></nowiki>, and similar);
2. leading spaces meaning ``preserve formatting'';
@@ -90,6 +91,8 @@ It handles:
ITEM = 10
# Sequence: seq
SEQ = 11
+ # Paragraph
+ PARA = 12
# Environment types:
# Unnumbered list
@@ -130,6 +133,10 @@ It handles:
self.putback(line)
break
+ if line == '\n':
+ yield(self.PARA,)
+ continue
+
m = eltbeg.match(line)
if m:
if m.group(0)[0] in self.envtypes:
@@ -247,8 +254,6 @@ It handles:
return toktype, self.expandtok(tok[1])
elif toktype == self.HDR:
return toktype, tok[1], self.expandtok(tok[2])
- elif toktype == self.BAR:
- return tok
elif toktype == self.ENV:
return toktype,tok[1],tok[2],self.expandtok(tok[3])
elif toktype == self.SEQ:
@@ -264,6 +269,8 @@ It handles:
subtree.append(x)
return tuple(subtree) if len(subtree) > 2 else \
subtree[1] if len(subtree) == 2 else None
+ else:
+ return tok
def parse(self):
tree = [self.SEQ]
@@ -314,6 +321,8 @@ It handles:
elif toktype == self.ITEM:
print "ITEM"
self.prtok(tok[1], indent+1)
+ elif toktype == self.PARA:
+ print "PARA"
def output(self):
self.prtok(self.tree, 0)
@@ -381,7 +390,7 @@ class WikiMarkup (BaseWikiMarkup):
"ab": "Аҧсуа", # Abkhazian
"ae": None, # Avestan
"af": "Afrikaans", # Afrikaans
- "ak": "Akana", # Akan # or ak_CI
+ "ak": "Akana", # Akan
"als": "Alemannisch",
"am": "አማርኛ", # Amharic
"an": "Aragonés", # Aragonese
@@ -390,7 +399,7 @@ class WikiMarkup (BaseWikiMarkup):
"arc": "ܐܪܡܝܐ",
"as": "অসমীয়া", # Assamese
"ast": "Asturian",
- "av": "Авар", # Avaric # Spoken mainly in Dagestan
+ "av": "Авар", # Avaric
"ay": "Aymar", # Aymara
"az": "Azərbaycan" , # Azerbaijani
@@ -448,7 +457,7 @@ class WikiMarkup (BaseWikiMarkup):
"ext": "Estremeñ",
"fa": "فارسی" , # Persian
- "ff": "Fulfulde", # Fulah # Also NG, MR, and many others
+ "ff": "Fulfulde", # Fulah
"fi": "Suomi" , # Finnish
"fiu-vro": "Võro",
"fj": "Na Vosa Vakaviti",# Fijian; Fiji
@@ -501,7 +510,7 @@ class WikiMarkup (BaseWikiMarkup):
"ka": "ქართული" , # Georgian
"kaa": "Qaraqalpaqsha",
"kab": "Taqbaylit",
- "kg": "KiKongo", # Kongo # also CD and AO
+ "kg": "KiKongo", # Kongo
"ki": "Gĩkũyũ", # Kikuyu
"kj": "Kuanyama", # Kuanyama
"kk": "Қазақша", # Kazakh
@@ -528,7 +537,7 @@ class WikiMarkup (BaseWikiMarkup):
"lmo": "Lumbaart",
"lo": "ລາວ", # Lao; Laotian
"lt": "Lietuvių" , # Lithuanian
- "l": None, # Luba-Katanga
+ "lua": "Luba", # Luba
"lv": "Latvieš" , # Latvian; Lettish
"map-bms": "Basa Banyumasan",
@@ -598,7 +607,7 @@ class WikiMarkup (BaseWikiMarkup):
"sa": "संस्कृतम्", # Sanskrit
"sah": "Саха тыла (Saxa Tyla)",
- "sc": "Sard", # Sardinian
+ "sc": "Sardu", # Sardinian
"scn": "Sicilian",
"sco": "Scots",
"sd": "سنڌي، سندھی ، सिन्ध", # Sindhi
@@ -621,7 +630,7 @@ class WikiMarkup (BaseWikiMarkup):
"sq": "Shqip" , # Albanian
"szl": "Ślůnski",
"sv": "Svenska" , # Swedish
- "sw": "Kiswahili", # Swahili # Also KE
+ "sw": "Kiswahili", # Swahili
"ta": "தமிழ்" , # Tamil
"te": "తెలుగు" , # Telugu
@@ -636,7 +645,7 @@ class WikiMarkup (BaseWikiMarkup):
"tokipona": "Tokipona",
"tpi": "Tok Pisin",
"tr": "Türkçe" , # Turkish
- "ts": "Xitsonga", # Tsonga # ZA SZ XW
+ "ts": "Xitsonga", # Tsonga
"tt": "Tatarça / Татарча", # Tatar
"tum": "chiTumbuka",
"tw": "Twi", # Twi
@@ -662,7 +671,7 @@ class WikiMarkup (BaseWikiMarkup):
"xal": "Хальмг",
"xh": "isiXhosa", # Xhosa
- "yi": "ייִדיש", # Yiddish (formerly ji)
+ "yi": "ייִדיש", # Yiddish
"yo": "Yorùbá", # Yoruba
"za": "Cuengh", # Zhuang
@@ -675,16 +684,18 @@ class WikiMarkup (BaseWikiMarkup):
}
def str_nil(self, tok, env):
- return ""
+ return None
def str_text(self, tok, env):
return tok[1]
def str_seq(self, tok, env):
- s = ""
+ str = ""
for t in tok[1:]:
- s += self.fmtok(t, env)
- return s
+ s = self.fmtok(t, env)
+ if s:
+ str += s
+ return str
def fmtok(self, tok, env):
if type(tok) != TupleType:
@@ -714,6 +725,8 @@ class WikiMarkup (BaseWikiMarkup):
return self.str_item(tok, env)
elif toktype == self.SEQ:
return self.str_seq(tok, env)
+ elif toktype == self.PARA:
+ return self.str_para(tok, env)
def __str__(self):
return self.fmtok(self.tree, None)

Return to:

Send suggestions and report system problems to the System administrator.