diff options
author | Sergey Poznyakoff <gray@gnu.org.ua> | 2008-11-26 12:25:45 +0200 |
---|---|---|
committer | Sergey Poznyakoff <gray@gnu.org.ua> | 2008-11-26 12:25:45 +0200 |
commit | 67cd79c3c64d6dfe73ff5dd80ba51d0bf9267b04 (patch) | |
tree | 6a211974fc0a7c99720fe1b4af52f54bfe6370ce | |
parent | bd79a17ca5082789d4cf82f62a6afc0baaca90e8 (diff) | |
download | wikitrans-67cd79c3c64d6dfe73ff5dd80ba51d0bf9267b04.tar.gz wikitrans-67cd79c3c64d6dfe73ff5dd80ba51d0bf9267b04.tar.bz2 |
Implement paragraphs
-rw-r--r-- | wiki2html.py | 73 | ||||
-rw-r--r-- | wiki2text.py | 3 | ||||
-rw-r--r-- | wikimarkup.py | 371 |
3 files changed, 258 insertions, 189 deletions
diff --git a/wiki2html.py b/wiki2html.py index 907e3b1..7fa97b7 100644 --- a/wiki2html.py +++ b/wiki2html.py | |||
@@ -28,3 +28,35 @@ class HtmlWikiMarkup (WikiMarkup): | |||
28 | """ | 28 | """ |
29 | |||
30 | # FIXME: Awful kludge | ||
31 | image_kw = [ 'Image', | ||
32 | 'Grafika', | ||
33 | 'Bild', | ||
34 | 'Εικόνα', | ||
35 | 'Dosiero', | ||
36 | 'Slika', | ||
37 | 'Resim' | ||
38 | ] | ||
39 | |||
40 | ST_INIT = 0 | ||
41 | ST_PARA = 1 | ||
42 | ST_OPEN = 2 | ||
43 | |||
44 | state = [] | ||
45 | |||
46 | def opara(self): | ||
47 | if self.state[-1] == self.ST_PARA: | ||
48 | self.state[-1] = self.ST_OPEN | ||
49 | return "<p>" | ||
50 | else: | ||
51 | return "" | ||
29 | 52 | ||
53 | def cpara(self): | ||
54 | state = self.state.pop(); | ||
55 | self.state.append(self.ST_INIT) | ||
56 | if state == self.ST_OPEN: | ||
57 | return "</p>" | ||
58 | else: | ||
59 | return "" | ||
60 | |||
61 | |||
30 | def target(self, t): | 62 | def target(self, t): |
@@ -33,3 +65,3 @@ class HtmlWikiMarkup (WikiMarkup): | |||
33 | if tgt != '': | 65 | if tgt != '': |
34 | if qual in ('Image', 'Grafika'): | 66 | if qual in self.image_kw: |
35 | t = self.image_base + urllib.quote(tgt) + '/250px-' + urllib.quote(tgt) | 67 | t = self.image_base + urllib.quote(tgt) + '/250px-' + urllib.quote(tgt) |
@@ -98,7 +130,26 @@ class HtmlWikiMarkup (WikiMarkup): | |||
98 | 130 | ||
99 | def str_seq(self, tok, env): | 131 | def str_para(self, tok, env): |
132 | s = self.cpara() | ||
133 | self.state.append(self.ST_PARA) | ||
134 | return s | ||
135 | |||
136 | def fmtok(self, tok, env): | ||
137 | if type(tok) != TupleType: | ||
138 | return "" | ||
139 | if tok[0] in [ self.ENV, self.HDR ]: | ||
140 | s = self.cpara() | ||
141 | elif tok[0] == self.BAR: | ||
142 | s = self.str_para(tok, env) | ||
143 | elif tok[0] in [ self.NIL, self.SEQ ]: | ||
100 | s = "" | 144 | s = "" |
101 | for t in tok[1:]: | 145 | else: |
102 | s += self.fmtok(t, env) | 146 | s = self.opara() |
103 | return s | 147 | s1 = WikiMarkup.fmtok(self, tok, env) |
148 | if s1: | ||
149 | s += s1 | ||
150 | return s | ||
151 | |||
152 | def __str__(self): | ||
153 | self.state = [ self.ST_PARA ] | ||
154 | return WikiMarkup.__str__(self) + self.cpara() | ||
104 | 155 | ||
@@ -111,3 +162,3 @@ class HtmlWiktionaryMarkup (HtmlWikiMarkup): | |||
111 | format templates. But "tries" does not mean "does". The heuristics | 162 | format templates. But "tries" does not mean "does". The heuristics |
112 | used here is clearly not enogh to cope with it. | 163 | used here is clearly not enough to cope with it. |
113 | 164 | ||
@@ -136,8 +187,10 @@ class HtmlWiktionaryMarkup (HtmlWikiMarkup): | |||
136 | def str_seq(self, tok, env): | 187 | def str_seq(self, tok, env): |
137 | s = "" | 188 | str = "" |
138 | self.seq_pos=0 | 189 | self.seq_pos=0 |
139 | for t in tok[1:]: | 190 | for t in tok[1:]: |
140 | s += self.fmtok(t, env) | 191 | s = self.fmtok(t, env) |
141 | self.seq_pos += 1 | 192 | if s: |
142 | return s | 193 | str += s |
194 | self.seq_pos += 1 | ||
195 | return str | ||
143 | 196 | ||
diff --git a/wiki2text.py b/wiki2text.py index e943f32..3669bd7 100644 --- a/wiki2text.py +++ b/wiki2text.py | |||
@@ -138,2 +138,5 @@ class TextWikiMarkup (WikiMarkup): | |||
138 | 138 | ||
139 | def str_para(self, tok, env): | ||
140 | return "\n" | ||
141 | |||
139 | def __str__(self): | 142 | def __str__(self): |
diff --git a/wikimarkup.py b/wikimarkup.py index d9ae7cc..e2a1cab 100644 --- a/wikimarkup.py +++ b/wikimarkup.py | |||
@@ -37,6 +37,7 @@ A base class for handling Wiki markups. | |||
37 | It handles: | 37 | It handles: |
38 | 1. basic block markup (headers, numbered and unnumbered lists, | 38 | 1. paragraphs; |
39 | 2. basic block markup (headers, numbered and unnumbered lists, | ||
39 | indentations); | 40 | indentations); |
40 | 2. basic inline markup (bold, italic); | 41 | 3. basic inline markup (bold, italic); |
41 | 3. basic reference markup (links, templates, external links). | 42 | 4. basic reference markup (links, templates, external links). |
42 | It does NOT handle: | 43 | It does NOT handle: |
@@ -92,2 +93,4 @@ It handles: | |||
92 | SEQ = 11 | 93 | SEQ = 11 |
94 | # Paragraph | ||
95 | PARA = 12 | ||
93 | 96 | ||
@@ -132,2 +135,6 @@ It handles: | |||
132 | 135 | ||
136 | if line == '\n': | ||
137 | yield(self.PARA,) | ||
138 | continue | ||
139 | |||
133 | m = eltbeg.match(line) | 140 | m = eltbeg.match(line) |
@@ -249,4 +256,2 @@ It handles: | |||
249 | return toktype, tok[1], self.expandtok(tok[2]) | 256 | return toktype, tok[1], self.expandtok(tok[2]) |
250 | elif toktype == self.BAR: | ||
251 | return tok | ||
252 | elif toktype == self.ENV: | 257 | elif toktype == self.ENV: |
@@ -266,2 +271,4 @@ It handles: | |||
266 | subtree[1] if len(subtree) == 2 else None | 271 | subtree[1] if len(subtree) == 2 else None |
272 | else: | ||
273 | return tok | ||
267 | 274 | ||
@@ -316,2 +323,4 @@ It handles: | |||
316 | self.prtok(tok[1], indent+1) | 323 | self.prtok(tok[1], indent+1) |
324 | elif toktype == self.PARA: | ||
325 | print "PARA" | ||
317 | 326 | ||
@@ -379,20 +388,20 @@ class WikiMarkup (BaseWikiMarkup): | |||
379 | langtab = { | 388 | langtab = { |
380 | "aa": "Afar", # Afar | 389 | "aa": "Afar", # Afar |
381 | "ab": "Аҧсуа", # Abkhazian | 390 | "ab": "Аҧсуа", # Abkhazian |
382 | "ae": None, # Avestan | 391 | "ae": None, # Avestan |
383 | "af": "Afrikaans", # Afrikaans | 392 | "af": "Afrikaans", # Afrikaans |
384 | "ak": "Akana", # Akan # or ak_CI | 393 | "ak": "Akana", # Akan |
385 | "als": "Alemannisch", | 394 | "als": "Alemannisch", |
386 | "am": "አማርኛ", # Amharic | 395 | "am": "አማርኛ", # Amharic |
387 | "an": "Aragonés", # Aragonese | 396 | "an": "Aragonés", # Aragonese |
388 | "ang": "Englisc", | 397 | "ang": "Englisc", |
389 | "ar": "العربية" , # Arabic | 398 | "ar": "العربية" , # Arabic |
390 | "arc": "ܐܪܡܝܐ", | 399 | "arc": "ܐܪܡܝܐ", |
391 | "as": "অসমীয়া", # Assamese | 400 | "as": "অসমীয়া", # Assamese |
392 | "ast": "Asturian", | 401 | "ast": "Asturian", |
393 | "av": "Авар", # Avaric # Spoken mainly in Dagestan | 402 | "av": "Авар", # Avaric |
394 | "ay": "Aymar", # Aymara | 403 | "ay": "Aymar", # Aymara |
395 | "az": "Azərbaycan" , # Azerbaijani | 404 | "az": "Azərbaycan" , # Azerbaijani |
396 | 405 | ||
397 | "ba": "Башҡорт", # Bashkir | 406 | "ba": "Башҡорт", # Bashkir |
398 | "bar": "Boarisch", | 407 | "bar": "Boarisch", |
@@ -400,13 +409,13 @@ class WikiMarkup (BaseWikiMarkup): | |||
400 | "bcl": "Bikol", | 409 | "bcl": "Bikol", |
401 | "be": "Беларуская", # Byelorussian; Belarusian | 410 | "be": "Беларуская", # Byelorussian; Belarusian |
402 | "be-x-old": "Беларуская (тарашкевіца)", | 411 | "be-x-old": "Беларуская (тарашкевіца)", |
403 | "bg": "Български", # Bulgarian | 412 | "bg": "Български", # Bulgarian |
404 | "bh": "भोजपुरी", # Bihari | 413 | "bh": "भोजपुरी", # Bihari |
405 | "bi": "Bislama", # Bislama | 414 | "bi": "Bislama", # Bislama |
406 | "bm": "Bamanankan", # Bambara | 415 | "bm": "Bamanankan", # Bambara |
407 | "bn": "বাংলা" , # Bengali; Bangla | 416 | "bn": "বাংলা" , # Bengali; Bangla |
408 | "bo": "བོད་སྐད", # Tibetan | 417 | "bo": "བོད་སྐད", # Tibetan |
409 | "bpy": "ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী" , | 418 | "bpy": "ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী" , |
410 | "br": "Brezhoneg" , # Breton | 419 | "br": "Brezhoneg" , # Breton |
411 | "bs": "Bosanski" , # Bosnian | 420 | "bs": "Bosanski" , # Bosnian |
412 | "bug": "Basa Ugi", | 421 | "bug": "Basa Ugi", |
@@ -414,3 +423,3 @@ class WikiMarkup (BaseWikiMarkup): | |||
414 | 423 | ||
415 | "ca": "Català" , # Catalan | 424 | "ca": "Català" , # Catalan |
416 | "cbk-zam": "Chavacano de Zamboanga", | 425 | "cbk-zam": "Chavacano de Zamboanga", |
@@ -418,67 +427,67 @@ class WikiMarkup (BaseWikiMarkup): | |||
418 | "cho": "Choctaw", | 427 | "cho": "Choctaw", |
419 | "ce": "Нохчийн", # Chechen | 428 | "ce": "Нохчийн", # Chechen |
420 | "ceb": "Sinugboanong Binisaya" , # Cebuano | 429 | "ceb": "Sinugboanong Binisaya" , # Cebuano |
421 | "ch": "Chamor", # Chamorro | 430 | "ch": "Chamor", # Chamorro |
422 | "chr": "ᏣᎳᎩ", | 431 | "chr": "ᏣᎳᎩ", |
423 | "chy": "Tsetsêhestâhese", | 432 | "chy": "Tsetsêhestâhese", |
424 | "co": "Cors", # Corsican | 433 | "co": "Cors", # Corsican |
425 | "cr": "Nehiyaw", # Cree | 434 | "cr": "Nehiyaw", # Cree |
426 | "crh": "Qırımtatarca", | 435 | "crh": "Qırımtatarca", |
427 | "cs": "Česky" , # Czech | 436 | "cs": "Česky" , # Czech |
428 | "csb": "Kaszëbsczi", | 437 | "csb": "Kaszëbsczi", |
429 | "c": "Словѣньскъ", # Church Slavic | 438 | "c": "Словѣньскъ", # Church Slavic |
430 | "cv": "Чăваш", # Chuvash | 439 | "cv": "Чăваш", # Chuvash |
431 | "cy": "Cymraeg" , # Welsh | 440 | "cy": "Cymraeg" , # Welsh |
432 | 441 | ||
433 | "da": "Dansk" , # Danish | 442 | "da": "Dansk" , # Danish |
434 | "de": "Deutsch" , # German | 443 | "de": "Deutsch" , # German |
435 | "diq": "Zazaki", # Dimli (Southern Zazaki) | 444 | "diq": "Zazaki", # Dimli (Southern Zazaki) |
436 | "dsb": "Dolnoserbski", | 445 | "dsb": "Dolnoserbski", |