summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- wiki2html.py 73
-rw-r--r-- wiki2text.py 3
-rw-r--r-- wikimarkup.py 371
3 files changed, 258 insertions, 189 deletions
diff --git a/wiki2html.py b/wiki2html.py
index 907e3b1..7fa97b7 100644
--- a/wiki2html.py
+++ b/wiki2html.py
@@ -26,12 +26,44 @@ class HtmlWikiMarkup (WikiMarkup):
26 2. [[official position]]s : final 's' gets after closing </a> tag. 26 2. [[official position]]s : final 's' gets after closing </a> tag.
27 Should be before. 27 Should be before.
28 """ 28 """
29
30 # FIXME: Awful kludge
31 image_kw = [ 'Image',
32 'Grafika',
33 'Bild',
34 'Εικόνα',
35 'Dosiero',
36 'Slika',
37 'Resim'
38 ]
39
40 ST_INIT = 0
41 ST_PARA = 1
42 ST_OPEN = 2
43
44 state = []
45
46 def opara(self):
47 if self.state[-1] == self.ST_PARA:
48 self.state[-1] = self.ST_OPEN
49 return "<p>"
50 else:
51 return ""
29 52
53 def cpara(self):
54 state = self.state.pop();
55 self.state.append(self.ST_INIT)
56 if state == self.ST_OPEN:
57 return "</p>"
58 else:
59 return ""
60
61
30 def target(self, t): 62 def target(self, t):
31 (qual,sep,tgt) = t.partition(':') 63 (qual,sep,tgt) = t.partition(':')
32 r = None 64 r = None
33 if tgt != '': 65 if tgt != '':
34 if qual in ('Image', 'Grafika'): 66 if qual in self.image_kw:
35 t = self.image_base + urllib.quote(tgt) + '/250px-' + urllib.quote(tgt) 67 t = self.image_base + urllib.quote(tgt) + '/250px-' + urllib.quote(tgt)
36 elif qual == "Media": 68 elif qual == "Media":
37 t = self.media_base + '/' + tgt 69 t = self.media_base + '/' + tgt
@@ -96,11 +128,30 @@ class HtmlWikiMarkup (WikiMarkup):
96 self.fmtok(tok[1], env), 128 self.fmtok(tok[1], env),
97 self.envel[env[1]]) 129 self.envel[env[1]])
98 130
99 def str_seq(self, tok, env): 131 def str_para(self, tok, env):
132 s = self.cpara()
133 self.state.append(self.ST_PARA)
134 return s
135
136 def fmtok(self, tok, env):
137 if type(tok) != TupleType:
138 return ""
139 if tok[0] in [ self.ENV, self.HDR ]:
140 s = self.cpara()
141 elif tok[0] == self.BAR:
142 s = self.str_para(tok, env)
143 elif tok[0] in [ self.NIL, self.SEQ ]:
100 s = "" 144 s = ""
101 for t in tok[1:]: 145 else:
102 s += self.fmtok(t, env) 146 s = self.opara()
103 return s 147 s1 = WikiMarkup.fmtok(self, tok, env)
148 if s1:
149 s += s1
150 return s
151
152 def __str__(self):
153 self.state = [ self.ST_PARA ]
154 return WikiMarkup.__str__(self) + self.cpara()
104 155
105 156
106 157
@@ -109,7 +160,7 @@ class HtmlWiktionaryMarkup (HtmlWikiMarkup):
109 A class for translating Wiktionary articles into HTML. 160 A class for translating Wiktionary articles into HTML.
110 This version does not do much, except that it tries to correctly 161 This version does not do much, except that it tries to correctly
111 format templates. But "tries" does not mean "does". The heuristics 162 format templates. But "tries" does not mean "does". The heuristics
112 used here is clearly not enogh to cope with it. 163 used here is clearly not enough to cope with it.
113 164
114 1. FIXME: 165 1. FIXME:
115 The right solution would be to have a database of templates with their 166 The right solution would be to have a database of templates with their
@@ -134,12 +185,14 @@ class HtmlWiktionaryMarkup (HtmlWikiMarkup):
134 seq_pos = 0 185 seq_pos = 0
135 186
136 def str_seq(self, tok, env): 187 def str_seq(self, tok, env):
137 s = "" 188 str = ""
138 self.seq_pos=0 189 self.seq_pos=0
139 for t in tok[1:]: 190 for t in tok[1:]:
140 s += self.fmtok(t, env) 191 s = self.fmtok(t, env)
141 self.seq_pos += 1 192 if s:
142 return s 193 str += s
194 self.seq_pos += 1
195 return str
143 196
144 def str_tmpl(self, tok, env): 197 def str_tmpl(self, tok, env):
145 arg = self.fmtok(tok[1], env) 198 arg = self.fmtok(tok[1], env)
diff --git a/wiki2text.py b/wiki2text.py
index e943f32..3669bd7 100644
--- a/wiki2text.py
+++ b/wiki2text.py
@@ -136,6 +136,9 @@ class TextWikiMarkup (WikiMarkup):
136 return "" + self.indent(lev, 136 return "" + self.indent(lev,
137 "- " + self.fmtok(tok[1], env)) 137 "- " + self.fmtok(tok[1], env))
138 138
139 def str_para(self, tok, env):
140 return "\n"
141
139 def __str__(self): 142 def __str__(self):
140 return self.fmtok(self.tree, None) 143 return self.fmtok(self.tree, None)
141 144
diff --git a/wikimarkup.py b/wikimarkup.py
index d9ae7cc..e2a1cab 100644
--- a/wikimarkup.py
+++ b/wikimarkup.py
@@ -35,10 +35,11 @@ class BaseWikiMarkup:
35 """ 35 """
36 A base class for handling Wiki markups. 36 A base class for handling Wiki markups.
37 It handles: 37 It handles:
38 1. basic block markup (headers, numbered and unnumbered lists, 38 1. paragraphs;
39 2. basic block markup (headers, numbered and unnumbered lists,
39 indentations); 40 indentations);
40 2. basic inline markup (bold, italic); 41 3. basic inline markup (bold, italic);
41 3. basic reference markup (links, templates, external links). 42 4. basic reference markup (links, templates, external links).
42 It does NOT handle: 43 It does NOT handle:
43 1. pseudo-html markup (<nowiki></nowiki>, and similar); 44 1. pseudo-html markup (<nowiki></nowiki>, and similar);
44 2. leading spaces meaning ``preserve formatting''; 45 2. leading spaces meaning ``preserve formatting'';
@@ -90,6 +91,8 @@ It handles:
90 ITEM = 10 91 ITEM = 10
91 # Sequence: seq 92 # Sequence: seq
92 SEQ = 11 93 SEQ = 11
94 # Paragraph
95 PARA = 12
93 96
94 # Environment types: 97 # Environment types:
95 # Unnumbered list 98 # Unnumbered list
@@ -130,6 +133,10 @@ It handles:
130 self.putback(line) 133 self.putback(line)
131 break 134 break
132 135
136 if line == '\n':
137 yield(self.PARA,)
138 continue
139
133 m = eltbeg.match(line) 140 m = eltbeg.match(line)
134 if m: 141 if m:
135 if m.group(0)[0] in self.envtypes: 142 if m.group(0)[0] in self.envtypes:
@@ -247,8 +254,6 @@ It handles:
247 return toktype, self.expandtok(tok[1]) 254 return toktype, self.expandtok(tok[1])
248 elif toktype == self.HDR: 255 elif toktype == self.HDR:
249 return toktype, tok[1], self.expandtok(tok[2]) 256 return toktype, tok[1], self.expandtok(tok[2])
250 elif toktype == self.BAR:
251 return tok
252 elif toktype == self.ENV: 257 elif toktype == self.ENV:
253 return toktype,tok[1],tok[2],self.expandtok(tok[3]) 258 return toktype,tok[1],tok[2],self.expandtok(tok[3])
254 elif toktype == self.SEQ: 259 elif toktype == self.SEQ:
@@ -264,6 +269,8 @@ It handles:
264 subtree.append(x) 269 subtree.append(x)
265 return tuple(subtree) if len(subtree) > 2 else \ 270 return tuple(subtree) if len(subtree) > 2 else \
266 subtree[1] if len(subtree) == 2 else None 271 subtree[1] if len(subtree) == 2 else None
272 else:
273 return tok
267 274
268 def parse(self): 275 def parse(self):
269 tree = [self.SEQ] 276 tree = [self.SEQ]
@@ -314,6 +321,8 @@ It handles:
314 elif toktype == self.ITEM: 321 elif toktype == self.ITEM:
315 print "ITEM" 322 print "ITEM"
316 self.prtok(tok[1], indent+1) 323 self.prtok(tok[1], indent+1)
324 elif toktype == self.PARA:
325 print "PARA"
317 326
318 def output(self): 327 def output(self):
319 self.prtok(self.tree, 0) 328 self.prtok(self.tree, 0)
@@ -377,314 +386,316 @@ class WikiMarkup (BaseWikiMarkup):
377 386
378 # ISO 639 387 # ISO 639
379 langtab = { 388 langtab = {
380 "aa": "Afar", # Afar 389 "aa": "Afar", # Afar
381 "ab": "Аҧсуа", # Abkhazian 390 "ab": "Аҧсуа", # Abkhazian
382 "ae": None, # Avestan 391 "ae": None, # Avestan
383 "af": "Afrikaans", # Afrikaans 392 "af": "Afrikaans", # Afrikaans
384 "ak": "Akana", # Akan # or ak_CI 393 "ak": "Akana", # Akan
385 "als": "Alemannisch", 394 "als": "Alemannisch",
386 "am": "አማርኛ", # Amharic 395 "am": "አማርኛ", # Amharic
387 "an": "Aragonés", # Aragonese 396 "an": "Aragonés", # Aragonese
388 "ang": "Englisc", 397 "ang": "Englisc",
389 "ar": "العربية" , # Arabic 398 "ar": "العربية" , # Arabic
390 "arc": "ܐܪܡܝܐ", 399 "arc": "ܐܪܡܝܐ",
391 "as": "অসমীয়া", # Assamese 400 "as": "অসমীয়া", # Assamese
392 "ast": "Asturian", 401 "ast": "Asturian",
393 "av": "Авар", # Avaric # Spoken mainly in Dagestan 402