diff options
-rw-r--r-- | wiki2html.py | 73 | ||||
-rw-r--r-- | wiki2text.py | 3 | ||||
-rw-r--r-- | wikimarkup.py | 371 |
3 files changed, 258 insertions, 189 deletions
diff --git a/wiki2html.py b/wiki2html.py index 907e3b1..7fa97b7 100644 --- a/wiki2html.py +++ b/wiki2html.py | |||
@@ -26,12 +26,44 @@ class HtmlWikiMarkup (WikiMarkup): | |||
26 | 2. [[official position]]s : final 's' gets after closing </a> tag. | 26 | 2. [[official position]]s : final 's' gets after closing </a> tag. |
27 | Should be before. | 27 | Should be before. |
28 | """ | 28 | """ |
29 | |||
30 | # FIXME: Awful kludge | ||
31 | image_kw = [ 'Image', | ||
32 | 'Grafika', | ||
33 | 'Bild', | ||
34 | 'Εικόνα', | ||
35 | 'Dosiero', | ||
36 | 'Slika', | ||
37 | 'Resim' | ||
38 | ] | ||
39 | |||
40 | ST_INIT = 0 | ||
41 | ST_PARA = 1 | ||
42 | ST_OPEN = 2 | ||
43 | |||
44 | state = [] | ||
45 | |||
46 | def opara(self): | ||
47 | if self.state[-1] == self.ST_PARA: | ||
48 | self.state[-1] = self.ST_OPEN | ||
49 | return "<p>" | ||
50 | else: | ||
51 | return "" | ||
29 | 52 | ||
53 | def cpara(self): | ||
54 | state = self.state.pop(); | ||
55 | self.state.append(self.ST_INIT) | ||
56 | if state == self.ST_OPEN: | ||
57 | return "</p>" | ||
58 | else: | ||
59 | return "" | ||
60 | |||
61 | |||
30 | def target(self, t): | 62 | def target(self, t): |
31 | (qual,sep,tgt) = t.partition(':') | 63 | (qual,sep,tgt) = t.partition(':') |
32 | r = None | 64 | r = None |
33 | if tgt != '': | 65 | if tgt != '': |
34 | if qual in ('Image', 'Grafika'): | 66 | if qual in self.image_kw: |
35 | t = self.image_base + urllib.quote(tgt) + '/250px-' + urllib.quote(tgt) | 67 | t = self.image_base + urllib.quote(tgt) + '/250px-' + urllib.quote(tgt) |
36 | elif qual == "Media": | 68 | elif qual == "Media": |
37 | t = self.media_base + '/' + tgt | 69 | t = self.media_base + '/' + tgt |
@@ -96,11 +128,30 @@ class HtmlWikiMarkup (WikiMarkup): | |||
96 | self.fmtok(tok[1], env), | 128 | self.fmtok(tok[1], env), |
97 | self.envel[env[1]]) | 129 | self.envel[env[1]]) |
98 | 130 | ||
99 | def str_seq(self, tok, env): | 131 | def str_para(self, tok, env): |
132 | s = self.cpara() | ||
133 | self.state.append(self.ST_PARA) | ||
134 | return s | ||
135 | |||
136 | def fmtok(self, tok, env): | ||
137 | if type(tok) != TupleType: | ||
138 | return "" | ||
139 | if tok[0] in [ self.ENV, self.HDR ]: | ||
140 | s = self.cpara() | ||
141 | elif tok[0] == self.BAR: | ||
142 | s = self.str_para(tok, env) | ||
143 | elif tok[0] in [ self.NIL, self.SEQ ]: | ||
100 | s = "" | 144 | s = "" |
101 | for t in tok[1:]: | 145 | else: |
102 | s += self.fmtok(t, env) | 146 | s = self.opara() |
103 | return s | 147 | s1 = WikiMarkup.fmtok(self, tok, env) |
148 | if s1: | ||
149 | s += s1 | ||
150 | return s | ||
151 | |||
152 | def __str__(self): | ||
153 | self.state = [ self.ST_PARA ] | ||
154 | return WikiMarkup.__str__(self) + self.cpara() | ||
104 | 155 | ||
105 | 156 | ||
106 | 157 | ||
@@ -109,7 +160,7 @@ class HtmlWiktionaryMarkup (HtmlWikiMarkup): | |||
109 | A class for translating Wiktionary articles into HTML. | 160 | A class for translating Wiktionary articles into HTML. |
110 | This version does not do much, except that it tries to correctly | 161 | This version does not do much, except that it tries to correctly |
111 | format templates. But "tries" does not mean "does". The heuristics | 162 | format templates. But "tries" does not mean "does". The heuristics |
112 | used here is clearly not enogh to cope with it. | 163 | used here is clearly not enough to cope with it. |
113 | 164 | ||
114 | 1. FIXME: | 165 | 1. FIXME: |
115 | The right solution would be to have a database of templates with their | 166 | The right solution would be to have a database of templates with their |
@@ -134,12 +185,14 @@ class HtmlWiktionaryMarkup (HtmlWikiMarkup): | |||
134 | seq_pos = 0 | 185 | seq_pos = 0 |
135 | 186 | ||
136 | def str_seq(self, tok, env): | 187 | def str_seq(self, tok, env): |
137 | s = "" | 188 | str = "" |
138 | self.seq_pos=0 | 189 | self.seq_pos=0 |
139 | for t in tok[1:]: | 190 | for t in tok[1:]: |
140 | s += self.fmtok(t, env) | 191 | s = self.fmtok(t, env) |
141 | self.seq_pos += 1 | 192 | if s: |
142 | return s | 193 | str += s |
194 | self.seq_pos += 1 | ||
195 | return str | ||
143 | 196 | ||
144 | def str_tmpl(self, tok, env): | 197 | def str_tmpl(self, tok, env): |
145 | arg = self.fmtok(tok[1], env) | 198 | arg = self.fmtok(tok[1], env) |
diff --git a/wiki2text.py b/wiki2text.py index e943f32..3669bd7 100644 --- a/wiki2text.py +++ b/wiki2text.py | |||
@@ -136,6 +136,9 @@ class TextWikiMarkup (WikiMarkup): | |||
136 | return "" + self.indent(lev, | 136 | return "" + self.indent(lev, |
137 | "- " + self.fmtok(tok[1], env)) | 137 | "- " + self.fmtok(tok[1], env)) |
138 | 138 | ||
139 | def str_para(self, tok, env): | ||
140 | return "\n" | ||
141 | |||
139 | def __str__(self): | 142 | def __str__(self): |
140 | return self.fmtok(self.tree, None) | 143 | return self.fmtok(self.tree, None) |
141 | 144 | ||
diff --git a/wikimarkup.py b/wikimarkup.py index d9ae7cc..e2a1cab 100644 --- a/wikimarkup.py +++ b/wikimarkup.py | |||
@@ -35,10 +35,11 @@ class BaseWikiMarkup: | |||
35 | """ | 35 | """ |
36 | A base class for handling Wiki markups. | 36 | A base class for handling Wiki markups. |
37 | It handles: | 37 | It handles: |
38 | 1. basic block markup (headers, numbered and unnumbered lists, | 38 | 1. paragraphs; |
39 | 2. basic block markup (headers, numbered and unnumbered lists, | ||
39 | indentations); | 40 | indentations); |
40 | 2. basic inline markup (bold, italic); | 41 | 3. basic inline markup (bold, italic); |
41 | 3. basic reference markup (links, templates, external links). | 42 | 4. basic reference markup (links, templates, external links). |
42 | It does NOT handle: | 43 | It does NOT handle: |
43 | 1. pseudo-html markup (<nowiki></nowiki>, and similar); | 44 | 1. pseudo-html markup (<nowiki></nowiki>, and similar); |
44 | 2. leading spaces meaning ``preserve formatting''; | 45 | 2. leading spaces meaning ``preserve formatting''; |
@@ -90,6 +91,8 @@ It handles: | |||
90 | ITEM = 10 | 91 | ITEM = 10 |
91 | # Sequence: seq | 92 | # Sequence: seq |
92 | SEQ = 11 | 93 | SEQ = 11 |
94 | # Paragraph | ||
95 | PARA = 12 | ||
93 | 96 | ||
94 | # Environment types: | 97 | # Environment types: |
95 | # Unnumbered list | 98 | # Unnumbered list |
@@ -130,6 +133,10 @@ It handles: | |||
130 | self.putback(line) | 133 | self.putback(line) |
131 | break | 134 | break |
132 | 135 | ||
136 | if line == '\n': | ||
137 | yield(self.PARA,) | ||
138 | continue | ||
139 | |||
133 | m = eltbeg.match(line) | 140 | m = eltbeg.match(line) |
134 | if m: | 141 | if m: |
135 | if m.group(0)[0] in self.envtypes: | 142 | if m.group(0)[0] in self.envtypes: |
@@ -247,8 +254,6 @@ It handles: | |||
247 | return toktype, self.expandtok(tok[1]) | 254 | return toktype, self.expandtok(tok[1]) |
248 | elif toktype == self.HDR: | 255 | elif toktype == self.HDR: |
249 | return toktype, tok[1], self.expandtok(tok[2]) | 256 | return toktype, tok[1], self.expandtok(tok[2]) |
250 | elif toktype == self.BAR: | ||
251 | return tok | ||
252 | elif toktype == self.ENV: | 257 | elif toktype == self.ENV: |
253 | return toktype,tok[1],tok[2],self.expandtok(tok[3]) | 258 | return toktype,tok[1],tok[2],self.expandtok(tok[3]) |
254 | elif toktype == self.SEQ: | 259 | elif toktype == self.SEQ: |
@@ -264,6 +269,8 @@ It handles: | |||
264 | subtree.append(x) | 269 | subtree.append(x) |
265 | return tuple(subtree) if len(subtree) > 2 else \ | 270 | return tuple(subtree) if len(subtree) > 2 else \ |
266 | subtree[1] if len(subtree) == 2 else None | 271 | subtree[1] if len(subtree) == 2 else None |
272 | else: | ||
273 | return tok | ||
267 | 274 | ||
268 | def parse(self): | 275 | def parse(self): |
269 | tree = [self.SEQ] | 276 | tree = [self.SEQ] |
@@ -314,6 +321,8 @@ It handles: | |||
314 | elif toktype == self.ITEM: | 321 | elif toktype == self.ITEM: |
315 | print "ITEM" | 322 | print "ITEM" |
316 | self.prtok(tok[1], indent+1) | 323 | self.prtok(tok[1], indent+1) |
324 | elif toktype == self.PARA: | ||
325 | print "PARA" | ||
317 | 326 | ||
318 | def output(self): | 327 | def output(self): |
319 | self.prtok(self.tree, 0) | 328 | self.prtok(self.tree, 0) |
@@ -377,314 +386,316 @@ class WikiMarkup (BaseWikiMarkup): | |||
377 | 386 | ||
378 | # ISO 639 | 387 | # ISO 639 |
379 | langtab = { | 388 | langtab = { |
380 | "aa": "Afar", # Afar | 389 | "aa": "Afar", # Afar |
381 | "ab": "Аҧсуа", # Abkhazian | 390 | "ab": "Аҧсуа", # Abkhazian |
382 | "ae": None, # Avestan | 391 | "ae": None, # Avestan |
383 | "af": "Afrikaans", # Afrikaans | 392 | "af": "Afrikaans", # Afrikaans |
384 | "ak": "Akana", # Akan # or ak_CI | 393 | "ak": "Akana", # Akan |
385 | "als": "Alemannisch", | 394 | "als": "Alemannisch", |
386 | "am": "አማርኛ", # Amharic | 395 | "am": "አማርኛ", # Amharic |
387 | "an": "Aragonés", # Aragonese | 396 | "an": "Aragonés", # Aragonese |
388 | "ang": "Englisc", | 397 | "ang": "Englisc", |
389 | "ar": "العربية" , # Arabic | 398 | "ar": "العربية" , # Arabic |
390 | "arc": "ܐܪܡܝܐ", | 399 | "arc": "ܐܪܡܝܐ", |
391 | "as": "অসমীয়া", # Assamese | 400 | "as": "অসমীয়া", # Assamese |
392 | "ast": "Asturian", | 401 | "ast": "Asturian", |
393 | "av": "Авар", # Avaric # Spoken mainly in Dagestan | 402 |