summaryrefslogtreecommitdiff
path: root/wiki2text.py
diff options
context:
space:
mode:
Diffstat (limited to 'wiki2text.py')
-rw-r--r--wiki2text.py177
1 files changed, 116 insertions, 61 deletions
diff --git a/wiki2text.py b/wiki2text.py
index f28c343..c41c4e0 100644
--- a/wiki2text.py
+++ b/wiki2text.py
@@ -23,13 +23,13 @@ import urllib
23class TextWikiMarkup (WikiMarkup): 23class TextWikiMarkup (WikiMarkup):
24 """ 24 """
25 A (general-purpose Wiki->Text translator class. 25 A (general-purpose Wiki->Text translator class.
26 """ 26 """
27 27
28 # Output width 28 # Output width
29 width = 80 29 width = 78
30 # Do not show references. 30 # Do not show references.
31 references = False 31 references = False
32 # Provide a minimum markup 32 # Provide a minimum markup
33 markup = True 33 markup = True
34 34
35 # Number of current element in the environment 35 # Number of current element in the environment
@@ -54,28 +54,32 @@ class TextWikiMarkup (WikiMarkup):
54 if str in wiki_ns[self.lang]: 54 if str in wiki_ns[self.lang]:
55 return wiki_ns[self.lang][str] 55 return wiki_ns[self.lang][str]
56 elif str in wiki_ns_re[self.lang]: 56 elif str in wiki_ns_re[self.lang]:
57 for elt in wiki_ns_re[self.lang][str]: 57 for elt in wiki_ns_re[self.lang][str]:
58 if str.beginswith(elt[0]) and str.endswith(elt[1]): 58 if str.beginswith(elt[0]) and str.endswith(elt[1]):
59 return elt[2] 59 return elt[2]
60 return None 60 return None
61
61 def mktgt(self, tgt, lang = None): 62 def mktgt(self, tgt, lang = None):
62 if not lang: 63 if not lang:
63 lang = self.lang 64 lang = self.lang
64 return self.html_base % { 'lang' : lang } + urllib.quote(tgt) 65 return self.html_base % { 'lang' : lang } + urllib.quote(tgt)
65 66
66 def link(self, tok, env, istmpl): 67 def fmtlink(self, elt, istmpl):
67 arg = self.fmtok(tok[1], env) 68 arg = self.format(elt[1][0])
68 text = self.fmtok(tok[2], env) 69 if len(elt[1]) > 1:
70 text = self.format(elt[1][1])
71 else:
72 text = None
69 (qual,sep,tgt) = arg.partition(':') 73 (qual,sep,tgt) = arg.partition(':')
70 if tgt != '': 74 if tgt != '':
71 ns = self.wiki_ns_name(qual) 75 ns = self.wiki_ns_name(qual)
72 if ns: 76 if ns:
73 if ns == 'NS_IMAGE': 77 if ns == 'NS_IMAGE':
74 if not self.references: 78 if not self.references:
75 return None 79 return ""
76 text = "[%s: %s]" % (qual, text if text else arg) 80 text = "[%s: %s]" % (qual, text if text else arg)
77 tgt = self.image_base + '/' + \ 81 tgt = self.image_base + '/' + \
78 urllib.quote(tgt) + \ 82 urllib.quote(tgt) + \
79 '/250px-' + urllib.quote(tgt) 83 '/250px-' + urllib.quote(tgt)
80 elif ns == 'NS_MEDIA': 84 elif ns == 'NS_MEDIA':
81 text = "[%s]" % (qual) 85 text = "[%s]" % (qual)
@@ -91,82 +95,133 @@ class TextWikiMarkup (WikiMarkup):
91 if self.references: 95 if self.references:
92 return "%s (see %s) " % (text, tgt) 96 return "%s (see %s) " % (text, tgt)
93 elif not text or text == '': 97 elif not text or text == '':
94 return arg 98 return arg
95 else: 99 else:
96 return text 100 return text
97
98 def str_link(self, tok, env):
99 return self.link(tok, env, False)
100
101 def str_tmpl(self, tok, env):
102 return self.link(tok, env, True)
103
104 def str_ref(self, tok, env):
105 return self.xref(self.fmtok(tok[2], env), self.fmtok(tok[1], env))
106
107 def str_it(self, tok, env):
108 if self.markup:
109 return "_" + self.fmtok(tok[1], env) + "_"
110 return self.fmtok(tok[1], env);
111
112 def str_bold(self, tok, env):
113 if self.markup:
114 return self.fmtok(tok[1], env).upper()
115 return self.fmtok(tok[1], env);
116
117 def str_hdr(self, tok, env):
118 level = tok[1]
119 return "\n\n" + ("*" * level) + " " + self.fmtok(tok[2], env) + "\n\n"
120
121 def str_bar(self, tok, env):
122 w = self.width
123 if w < 5:
124 w = 5
125 return "\n" + ("-" * (w - 5)).center(w - 1) + "\n"
126
127 def str_env(self, tok, env):
128 self.num = 1
129 return "\n" + self.fmtok(tok[3], tok)
130 101
131 def indent (self, lev, text): 102 def indent (self, lev, text):
103 print "T \"",text,"\""
132 w = self.width 104 w = self.width
133 self.width = w - lev 105 self.width = w - lev
134 if text.find('\n') == -1: 106 if text.find('\n') == -1:
135 s = (" " * lev) + text 107 s = (" " * lev) + text
136 else: 108 else:
137 s = "" 109 s = ""
138 for elt in text.split('\n'): 110 for elt in text.split('\n'):
139 s += (" " * lev) + elt 111 s += (" " * lev) + elt + '\n'
140 if elt == '':
141 s += "\n"
142 112
143 self.width = w 113 self.width = w
144 return s 114 return s
115
116 def fmtpara(self, input):
117 output = ""
118 linebuf = ""
119 length = 0
120 for s in input.split():
121 wlen = len(s)
122 if linebuf.endswith("."):
123 wsc = 2
124 else:
125 wsc = 1
126 if length + wsc + wlen > self.width:
127 # FIXME: fill out linebuf
128 output += linebuf + '\n'
129 wsc = 0
130 length = 0
131 linebuf = ""
132 linebuf += " " * wsc + s
133 length += wsc + wlen
134 return output + linebuf
145 135
146 def str_item(self, tok, env): 136 def fmtelt(self, elt, indent=0):
147 t = env[1] 137 if elt[0] == TEXT:
148 lev = env[2] 138 if isinstance(elt[1],list):
149 if lev > self.width - 4: 139 string = ""
150 lev = 1 140 for s in elt[1]:
151 if t == self.INDENT: 141 if string:
152 return self.indent(lev, self.fmtok(tok[1], env)) 142 if string.endswith("."):
153 elif t == self.ENVNUM: 143 string += " "
154 n = self.num 144 else:
155 self.num += 1 145 string += " "
156 return "" + self.indent(lev, 146 string += s.rstrip(" ")
157 "%d. %s" % (n, self.fmtok(tok[1], env))) 147 else:
158 elif t == self.ENVUNNUM: 148 string = elt[1]
159 return "" + self.indent(lev, 149 elif elt[0] == PARA:
160 "- " + self.fmtok(tok[1], env)) 150 string = "";
151 for x in elt[1]:
152 string += self.format(x)
153 string = self.fmtpara(string) + '\n\n'
154 elif elt[0] == IT:
155 string = ""
156 for x in elt[1]:
157 s = self.format(x)
158 if s:
159 string += " " + s.rstrip(" ")
160 string = "_" + string.lstrip(" ") + "_"
161 elif elt[0] == BOLD:
162 string = ""
163 for x in elt[1]:
164 s = self.format(x)
165 if s:
166 if string.endswith("."):
167 string += " "
168 else:
169 string += " "
170 string += s.rstrip(" ")
171 string = string.upper()
172 elif elt[0] == LINK:
173 string = self.fmtlink(elt, False)
174 elif elt[0] == TMPL:
175 string = '\n' + self.fmtlink(elt, True) + '\n'
176 elif elt[0] == BAR:
177 w = self.width
178 if w < 5:
179 w = 5
180 string = "\n" + ("-" * (w - 5)).center(w - 1) + "\n"
181 elif elt[0] == HDR:
182 level = elt[1]
183 string = "\n" + ("*" * level) + " " + \
184 self.format(elt[2]).lstrip(" ") + "\n\n"
185 elif elt[0] == REF:
186 string = self.xref(self.format(elt[2]), elt[1])
187 elif elt[0] == ENV:
188 type = elt[1]
189 lev = elt[2]
190 if lev > self.width - 4:
191 lev = 1
192 string = "\n"
193 n = 1
194 for s in elt[3]:
195 x = self.format(s)
196# print "X",x
197 if type == ENVUNNUM:
198 string += self.indent(lev, "*" + x.lstrip(" ")) + '\n'
199 elif type == ENVNUM:
200 string += self.indent(lev, "%d. %s" % (n, x)) + '\n'
201 n += 1
202 elif elt[0] == IND:
203 string = (" " * elt[1]) + self.format(elt[2]) + '\n'
204 else:
205 string = str(elt)
206 return string
207
208 def format(self, elt, indent=0):
209 string = ""
210 if elt[0] == SEQ:
211