summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Sergey Poznyakoff <gray@gnu.org.ua> 2008-11-26 12:25:45 +0200
committer Sergey Poznyakoff <gray@gnu.org.ua> 2008-11-26 12:25:45 +0200
commit 67cd79c3c64d6dfe73ff5dd80ba51d0bf9267b04 (patch)
tree 6a211974fc0a7c99720fe1b4af52f54bfe6370ce
parent bd79a17ca5082789d4cf82f62a6afc0baaca90e8 (diff)
download wikitrans-67cd79c3c64d6dfe73ff5dd80ba51d0bf9267b04.tar.gz
wikitrans-67cd79c3c64d6dfe73ff5dd80ba51d0bf9267b04.tar.bz2
Implement paragraphs
-rw-r--r-- wiki2html.py 73
-rw-r--r-- wiki2text.py 3
-rw-r--r-- wikimarkup.py 371
3 files changed, 258 insertions, 189 deletions
diff --git a/wiki2html.py b/wiki2html.py
index 907e3b1..7fa97b7 100644
--- a/wiki2html.py
+++ b/wiki2html.py
@@ -28,3 +28,35 @@ class HtmlWikiMarkup (WikiMarkup):
28 """ 28 """
29
30 # FIXME: Awful kludge
31 image_kw = [ 'Image',
32 'Grafika',
33 'Bild',
34 'Εικόνα',
35 'Dosiero',
36 'Slika',
37 'Resim'
38 ]
39
40 ST_INIT = 0
41 ST_PARA = 1
42 ST_OPEN = 2
43
44 state = []
45
46 def opara(self):
47 if self.state[-1] == self.ST_PARA:
48 self.state[-1] = self.ST_OPEN
49 return "<p>"
50 else:
51 return ""
29 52
53 def cpara(self):
54 state = self.state.pop();
55 self.state.append(self.ST_INIT)
56 if state == self.ST_OPEN:
57 return "</p>"
58 else:
59 return ""
60
61
30 def target(self, t): 62 def target(self, t):
@@ -33,3 +65,3 @@ class HtmlWikiMarkup (WikiMarkup):
33 if tgt != '': 65 if tgt != '':
34 if qual in ('Image', 'Grafika'): 66 if qual in self.image_kw:
35 t = self.image_base + urllib.quote(tgt) + '/250px-' + urllib.quote(tgt) 67 t = self.image_base + urllib.quote(tgt) + '/250px-' + urllib.quote(tgt)
@@ -98,7 +130,26 @@ class HtmlWikiMarkup (WikiMarkup):
98 130
99 def str_seq(self, tok, env): 131 def str_para(self, tok, env):
132 s = self.cpara()
133 self.state.append(self.ST_PARA)
134 return s
135
136 def fmtok(self, tok, env):
137 if type(tok) != TupleType:
138 return ""
139 if tok[0] in [ self.ENV, self.HDR ]:
140 s = self.cpara()
141 elif tok[0] == self.BAR:
142 s = self.str_para(tok, env)
143 elif tok[0] in [ self.NIL, self.SEQ ]:
100 s = "" 144 s = ""
101 for t in tok[1:]: 145 else:
102 s += self.fmtok(t, env) 146 s = self.opara()
103 return s 147 s1 = WikiMarkup.fmtok(self, tok, env)
148 if s1:
149 s += s1
150 return s
151
152 def __str__(self):
153 self.state = [ self.ST_PARA ]
154 return WikiMarkup.__str__(self) + self.cpara()
104 155
@@ -111,3 +162,3 @@ class HtmlWiktionaryMarkup (HtmlWikiMarkup):
111 format templates. But "tries" does not mean "does". The heuristics 162 format templates. But "tries" does not mean "does". The heuristics
112 used here is clearly not enogh to cope with it. 163 used here is clearly not enough to cope with it.
113 164
@@ -136,8 +187,10 @@ class HtmlWiktionaryMarkup (HtmlWikiMarkup):
136 def str_seq(self, tok, env): 187 def str_seq(self, tok, env):
137 s = "" 188 str = ""
138 self.seq_pos=0 189 self.seq_pos=0
139 for t in tok[1:]: 190 for t in tok[1:]:
140 s += self.fmtok(t, env) 191 s = self.fmtok(t, env)
141 self.seq_pos += 1 192 if s:
142 return s 193 str += s
194 self.seq_pos += 1
195 return str
143 196
diff --git a/wiki2text.py b/wiki2text.py
index e943f32..3669bd7 100644
--- a/wiki2text.py
+++ b/wiki2text.py
@@ -138,2 +138,5 @@ class TextWikiMarkup (WikiMarkup):
138 138
139 def str_para(self, tok, env):
140 return "\n"
141
139 def __str__(self): 142 def __str__(self):
diff --git a/wikimarkup.py b/wikimarkup.py
index d9ae7cc..e2a1cab 100644
--- a/wikimarkup.py
+++ b/wikimarkup.py
@@ -37,6 +37,7 @@ A base class for handling Wiki markups.
37 It handles: 37 It handles:
38 1. basic block markup (headers, numbered and unnumbered lists, 38 1. paragraphs;
39 2. basic block markup (headers, numbered and unnumbered lists,
39 indentations); 40 indentations);
40 2. basic inline markup (bold, italic); 41 3. basic inline markup (bold, italic);
41 3. basic reference markup (links, templates, external links). 42 4. basic reference markup (links, templates, external links).
42 It does NOT handle: 43 It does NOT handle:
@@ -92,2 +93,4 @@ It handles:
92 SEQ = 11 93 SEQ = 11
94 # Paragraph
95 PARA = 12
93 96
@@ -132,2 +135,6 @@ It handles:
132 135
136 if line == '\n':
137 yield(self.PARA,)
138 continue
139
133 m = eltbeg.match(line) 140 m = eltbeg.match(line)
@@ -249,4 +256,2 @@ It handles:
249 return toktype, tok[1], self.expandtok(tok[2]) 256 return toktype, tok[1], self.expandtok(tok[2])
250 elif toktype == self.BAR:
251 return tok
252 elif toktype == self.ENV: 257 elif toktype == self.ENV:
@@ -266,2 +271,4 @@ It handles:
266 subtree[1] if len(subtree) == 2 else None 271 subtree[1] if len(subtree) == 2 else None
272 else:
273 return tok
267 274
@@ -316,2 +323,4 @@ It handles:
316 self.prtok(tok[1], indent+1) 323 self.prtok(tok[1], indent+1)
324 elif toktype == self.PARA:
325 print "PARA"
317 326
@@ -379,20 +388,20 @@ class WikiMarkup (BaseWikiMarkup):
379 langtab = { 388 langtab = {
380 "aa": "Afar", # Afar 389 "aa": "Afar", # Afar
381 "ab": "Аҧсуа", # Abkhazian 390 "ab": "Аҧсуа", # Abkhazian
382 "ae": None, # Avestan 391 "ae": None, # Avestan
383 "af": "Afrikaans", # Afrikaans 392 "af": "Afrikaans", # Afrikaans
384 "ak": "Akana", # Akan # or ak_CI 393 "ak": "Akana", # Akan
385 "als": "Alemannisch", 394 "als": "Alemannisch",
386 "am": "አማርኛ", # Amharic 395 "am": "አማርኛ", # Amharic
387 "an": "Aragonés", # Aragonese 396 "an": "Aragonés", # Aragonese
388 "ang": "Englisc", 397 "ang": "Englisc",
389 "ar": "العربية" , # Arabic 398 "ar": "العربية" , # Arabic
390 "arc": "ܐܪܡܝܐ", 399 "arc": "ܐܪܡܝܐ",
391 "as": "অসমীয়া", # Assamese 400 "as": "অসমীয়া", # Assamese
392 "ast": "Asturian", 401 "ast": "Asturian",
393 "av": "Авар", # Avaric # Spoken mainly in Dagestan 402 "av": "Авар", # Avaric
394 "ay": "Aymar", # Aymara 403 "ay": "Aymar", # Aymara
395 "az": "Azərbaycan" , # Azerbaijani 404 "az": "Azərbaycan" , # Azerbaijani
396 405
397 "ba": "Башҡорт", # Bashkir 406 "ba": "Башҡорт", # Bashkir
398 "bar": "Boarisch", 407 "bar": "Boarisch",
@@ -400,13 +409,13 @@ class WikiMarkup (BaseWikiMarkup):
400 "bcl": "Bikol", 409 "bcl": "Bikol",
401 "be": "Беларуская", # Byelorussian; Belarusian 410 "be": "Беларуская", # Byelorussian; Belarusian
402 "be-x-old": "Беларуская (тарашкевіца)", 411 "be-x-old": "Беларуская (тарашкевіца)",
403 "bg": "Български", # Bulgarian 412 "bg": "Български", # Bulgarian
404 "bh": "भोजपुरी", # Bihari 413 "bh": "भोजपुरी", # Bihari
405 "bi": "Bislama", # Bislama 414 "bi": "Bislama", # Bislama
406 "bm": "Bamanankan", # Bambara 415 "bm": "Bamanankan", # Bambara
407 "bn": "বাংলা" , # Bengali; Bangla 416 "bn": "বাংলা" , # Bengali; Bangla
408 "bo": "བོད་སྐད", # Tibetan 417 "bo": "བོད་སྐད", # Tibetan
409 "bpy": "ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী" , 418 "bpy": "ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী" ,
410 "br": "Brezhoneg" , # Breton 419 "br": "Brezhoneg" , # Breton
411 "bs": "Bosanski" , # Bosnian 420 "bs": "Bosanski" , # Bosnian
412 "bug": "Basa Ugi", 421 "bug": "Basa Ugi",
@@ -414,3 +423,3 @@ class WikiMarkup (BaseWikiMarkup):
414 423
415 "ca": "Català" , # Catalan 424 "ca": "Català" , # Catalan
416 "cbk-zam": "Chavacano de Zamboanga", 425 "cbk-zam": "Chavacano de Zamboanga",
@@ -418,67 +427,67 @@ class WikiMarkup (BaseWikiMarkup):
418 "cho": "Choctaw", 427 "cho": "Choctaw",
419 "ce": "Нохчийн", # Chechen 428 "ce": "Нохчийн", # Chechen
420 "ceb": "Sinugboanong Binisaya" , # Cebuano 429 "ceb": "Sinugboanong Binisaya" , # Cebuano
421 "ch": "Chamor", # Chamorro 430 "ch": "Chamor", # Chamorro
422 "chr": "ᏣᎳᎩ", 431 "chr": "ᏣᎳᎩ",
423 "chy": "Tsetsêhestâhese", 432 "chy": "Tsetsêhestâhese",
424 "co": "Cors", # Corsican 433 "co": "Cors", # Corsican
425 "cr": "Nehiyaw", # Cree 434 "cr": "Nehiyaw", # Cree
426 "crh": "Qırımtatarca", 435 "crh": "Qırımtatarca",
427 "cs": "Česky" , # Czech 436 "cs": "Česky" , # Czech
428 "csb": "Kaszëbsczi", 437 "csb": "Kaszëbsczi",
429 "c": "Словѣньскъ", # Church Slavic 438 "c": "Словѣньскъ", # Church Slavic
430 "cv": "Чăваш", # Chuvash 439 "cv": "Чăваш", # Chuvash
431 "cy": "Cymraeg" , # Welsh 440 "cy": "Cymraeg" , # Welsh
432 441
433 "da": "Dansk" , # Danish 442 "da": "Dansk" , # Danish
434 "de": "Deutsch" , # German 443 "de": "Deutsch" , # German
435 "diq": "Zazaki", # Dimli (Southern Zazaki) 444 "diq": "Zazaki", # Dimli (Southern Zazaki)
436 "dsb": "Dolnoserbski", 445 "dsb": "Dolnoserbski",