summary | refs | log | tree | commit | diff | about
path: root/wikitrans/wiki2text.py
Unidiff
Diffstat (limited to 'wikitrans/wiki2text.py') (more/less context) (ignore whitespace changes)
-rw-r--r-- wikitrans/wiki2text.py 348
1 files changed, 348 insertions, 0 deletions
diff --git a/wikitrans/wiki2text.py b/wikitrans/wiki2text.py
new file mode 100644
index 0000000..1fbc61b
--- /dev/null
+++ b/wikitrans/wiki2text.py
@@ -0,0 +1,348 @@
1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3# Copyright (C) 2008-2018 Sergey Poznyakoff
4#
5# This program is free software; you can redistribute it and/or modify
6# it under the terms of the GNU General Public License as published by
7# the Free Software Foundation; either version 3, or (at your option)
8# any later version.
9#
10# This program is distributed in the hope that it will be useful,
11# but WITHOUT ANY WARRANTY; without even the implied warranty of
12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13# GNU General Public License for more details.
14#
15# You should have received a copy of the GNU General Public License
16# along with this program. If not, see <http://www.gnu.org/licenses/>.
17
18"""
19Wiki markup to plain text translator.
20
21Classes:
22
23TextWikiMarkup -- Converts Wiki material to plain text.
24TextWiktionaryMarkup -- Reserved for future use. Currently does the same as
25 TextWikiMarkup.
26
27"""
28
29from wikitrans.wikitoken import *
30from wikitrans.wikimarkup import *
31from wikitrans.wikins import wiki_ns_re, wiki_ns
32import re
33try:
34 from urllib import quote as url_quote
35except ImportError:
36 from urllib.parse import quote as url_quote
37
class TextSeqNode(WikiSeqNode):
    """Sequence node rendered as the text of its children."""

    def format(self):
        """Return the concatenated text of all child nodes.

        A single space is inserted before a child when the text collected
        so far is longer than one character and does not already end in
        whitespace.
        """
        result = ""
        for child in self.content:
            needs_gap = len(result) > 1 and not result[-1].isspace()
            if needs_gap:
                result += ' '
            result += child.format()
        return result
46
class TextTextNode(WikiTextNode):
    """Text node whose content is either a string or a list of strings."""

    def format(self):
        """Return the node text.

        Non-list content is returned unchanged.  List content is joined
        piece by piece, with a separator inserted before every piece once
        some text has been collected.
        """
        if not isinstance(self.content, list):
            return self.content

        # NOTE(review): both separators are a single space in this listing;
        # the one used after a full stop may have been wider before the
        # whitespace of the listing was collapsed -- confirm against the
        # repository.
        sentence_gap = " "
        word_gap = " "

        joined = ""
        for piece in self.content:
            if joined:
                joined += sentence_gap if joined.endswith(".") else word_gap
            joined += piece
        return joined
61
class TextPreNode(WikiSeqNode):
    """Preformatted block: children are emitted verbatim."""

    def format(self):
        """Return the children's text followed by a newline."""
        # NOTE(review): the listing does not preserve indentation; the
        # newline is appended once, after all children, here -- confirm
        # that it was not meant to follow every child.
        body = "".join([child.format() for child in self.content])
        return body + '\n'
69
class TextParaNode(WikiSeqNode):
    """Paragraph: children are concatenated and re-filled by the parser."""

    def format(self):
        """Return the paragraph filled by parser.fmtpara plus a blank line."""
        raw = "".join([child.format() for child in self.content])
        return self.parser.fmtpara(raw) + '\n\n'
77
class TextItNode(WikiSeqNode):
    """Italic text, rendered between underscores."""

    def format(self):
        """Return the non-empty children joined by spaces, as _text_."""
        pieces = [child.format() for child in self.content]
        body = " ".join([piece for piece in pieces if piece])
        # Leading blanks are dropped so the opening underscore touches
        # the first word.
        return "_" + body.lstrip(" ") + "_"
86
class TextBoldNode(WikiSeqNode):
    """Bold text, rendered in upper case."""

    def format(self):
        """Return the children's text, separator-prefixed and upper-cased.

        A separator is added before every child, including the first one,
        so the result starts with a separator whenever there is at least
        one child.
        """
        # NOTE(review): both separators are a single space in this listing;
        # the one used after a full stop may have been wider before the
        # whitespace of the listing was collapsed -- confirm against the
        # repository.
        sentence_gap = " "
        word_gap = " "

        collected = ""
        for child in self.content:
            collected += sentence_gap if collected.endswith(".") else word_gap
            collected += child.format()
        return collected.upper()
97
class TextLinkNode(WikiSeqNode):
    """Wiki link rendered as plain text, optionally followed by its URL."""

    def format(self):
        """Return the textual form of the link.

        The first child is the link argument, possibly of the form
        "QUAL:TARGET"; the second child, when present, is the display
        text.  Depending on the qualifier the link is treated as an image,
        a media reference, an inter-language link or an ordinary page
        link.

        Returns an empty string for links that produce no output
        ('disambigR', 'wikiquote', thumbnails, and images when URLs are
        not shown).  Otherwise returns the display text, falling back to
        the raw link argument when there is none; when the parser's
        show_urls flag is set the result is "TEXT (see URL) ".
        """
        arg = self.content[0].format()
        text = None
        if len(self.content) > 1:
            s = [x.format() for x in self.content]
            text = s[1]
            # Links that carry nothing worth printing.
            if s[0] in ('disambigR', 'wikiquote') or s[1] == 'thumb':
                return ""

        qual, _, tgt = arg.partition(':')
        if tgt == '':
            # No qualifier: the whole argument is the page name.
            tgt = self.parser.mktgt(arg)
        else:
            ns = self.parser.wiki_ns_name(qual)
            if ns == 'NS_IMAGE':
                if not self.parser.show_urls:
                    return ""
                text = "[%s: %s]" % (qual, text if text else arg)
                # NOTE(review): every other setting in this class is read
                # from self.parser; image_base is read from the node
                # itself -- confirm that the node really provides it.
                tgt = "%s/%s/250px-%s" % (self.image_base,
                                          url_quote(tgt),
                                          url_quote(tgt))
            elif ns == 'NS_MEDIA':
                text = "[%s]" % (qual)
            elif ns:
                tgt = self.parser.mktgt(tgt)
            elif self.type == 'LINK' and qual in self.parser.langtab:
                # Inter-language link: show the language name and point
                # the URL at that language's wiki.
                text = self.parser.langtab[qual] + ": " + tgt
                tgt = self.parser.mktgt(tgt, qual)
            else:
                tgt = self.parser.mktgt(tgt)

        # A link without display text falls back to the raw argument; this
        # also keeps the show_urls form from printing the literal "None".
        shown = text if text else arg
        if self.parser.show_urls:
            return "%s (see %s) " % (shown, tgt)
        return shown
141
class TextTmplNode(TextLinkNode):
    """Template reference: formatted like a link, wrapped in brackets."""

    def format(self):
        """Return the link rendering of the template enclosed in []."""
        inner = super(TextTmplNode, self).format()
        return "".join(('[', inner, ']'))
145
class TextBarNode(WikiNode):
    """Horizontal rule."""

    def format(self):
        """Return a centered line of dashes sized from the parser width.

        Widths below 5 are treated as 5, which yields an empty rule.
        """
        width = max(self.parser.width, 5)
        rule = "-" * (width - 5)
        return "\n" + rule.center(width - 1) + "\n"
152
class TextHdrNode(WikiHdrNode):
    """Section header, marked by one asterisk per nesting level."""

    def format(self):
        """Return the header as a blank-line-delimited "*** Title" line."""
        stars = "*" * self.level
        title = self.content.format().lstrip(" ")
        return "".join(("\n", stars, " ", title, "\n\n"))
160
class TextRefNode(WikiRefNode):
    """External reference: display text plus the referenced location."""

    def format(self):
        """Return "TEXT (see REF) ", or "see REF" when there is no text."""
        label = self.content.format()
        if not label:
            return "see " + self.ref
        return "%s (see %s) " % (label, self.ref)
168
class TextEnvNode(WikiEnvNode):
    """List environment: unnumbered, numbered or definition list."""

    def format(self):
        """Return the environment's items, one per line, indented.

        Unnumbered items are prefixed with "- ", numbered items with a
        running "N. " counter.  In a definition list, items whose subtype
        is 0 are indented one level less than the environment and all
        other items three levels more.  The result always ends with a
        newline.
        """
        kind = self.envtype
        depth = self.level
        # Fall back to the first level when the nesting would leave
        # fewer than four columns of the output width.
        if depth > self.parser.width - 4:
            depth = 1

        out = ""
        counter = 1
        for item in self.content:
            # Every item starts on a fresh line.
            if not out.endswith("\n"):
                out += "\n"
            body = item.content.format()
            if kind == "unnumbered":
                out += self.parser.indent(depth, "- " + body.lstrip(" "))
            elif kind == "numbered":
                out += self.parser.indent(depth, "%d. %s" % (counter, body))
                counter += 1
            elif kind == "defn":
                shift = depth - 1 if item.subtype == 0 else depth + 3
                out += self.parser.indent(shift, body)

        # NOTE(review): the listing does not preserve indentation; this
        # final check is taken to follow the loop -- confirm.
        if not out.endswith("\n"):
            out += "\n"
        return out
196
class TextIndNode(WikiIndNode):
    """Indented line."""

    def format(self):
        """Return the content preceded by one space per indent level."""
        padding = " " * self.level
        return "".join((padding, self.content.format(), '\n'))
200
class TextTagNode(WikiTagNode):
    """HTML-like tag embedded in the Wiki text."""

    def format(self):
        """Return the plain-text rendering of the tag.

        'code' yields its content, formatted while the parser's nested
        counter is raised by one; 'ref' yields a "[N]" marker;
        'references' yields the list of collected references; any other
        tag is reproduced literally around its formatted content.
        """
        tag = self.tag
        if tag == 'code':
            self.parser.nested += 1
            body = self.content.format()
            self.parser.nested -= 1
            return body

        if tag == 'ref':
            return '[%d]' % (self.idx + 1)

        if tag == 'references':
            chunks = ['\nReferences:\n']
            for ref in self.parser.references:
                chunks.append(('[%d]. ' % (ref.idx + 1))
                              + ref.content.format() + '\n')
            return "".join(chunks)

        opening = '<' + tag
        if self.args:
            opening += ' ' + str(self.args)
        return opening + '>' + self.content.format() + '</' + tag + '>'
219
220
class TextWikiMarkup(WikiMarkup):
    """A Wiki markup to plain text translator.

    Usage:

      x = TextWikiMarkup(filename="input.wiki")
      # Parse the input:
      x.parse()
      # Print it as plain text:
      print(str(x))

    """

    # Output width
    width = 78
    # Do not show link URLs next to the link text.
    show_urls = False
    # Provide a minimum markup
    markup = True

    # Number of current element in the environment
    num = 0

    # Array of footnote references
    # NOTE(review): this list is created once on the class, so it is shared
    # by every instance unless it is rebound per instance elsewhere --
    # confirm against the base class.
    references = []

    def __init__(self, *args, **keywords):
        """Create a TextWikiMarkup object.

        Arguments:

        filename=FILE
          Read Wiki material from the file named FILE.
        file=FD
          Read Wiki material from file object FD.
        text=STRING
          Read Wiki material from STRING.

        width=N
          Limit output width to N columns. Default is 78.
        show_urls=BOOL
          If true, each link is followed by its URL in parentheses.
          Default is False: only the link text is displayed.
        """

        super(TextWikiMarkup, self).__init__(*args, **keywords)
        if 'width' in keywords:
            self.width = keywords['width']
        if 'show_urls' in keywords:
            self.show_urls = keywords['show_urls']

        # Register the plain-text node classes for every token type.
        for token_type, node_class in (
                ('SEQ', TextSeqNode),
                ('TEXT', TextTextNode),
                ('PRE', TextPreNode),
                ('PARA', TextParaNode),
                ('IT', TextItNode),
                ('BOLD', TextBoldNode),
                ('LINK', TextLinkNode),
                ('TMPL', TextTmplNode),
                ('BAR', TextBarNode),
                ('HDR', TextHdrNode),
                ('REF', TextRefNode),
                ('ENV', TextEnvNode),
                ('IND', TextIndNode),
                ('TAG', TextTagNode)):
            self.token_class[token_type] = node_class

    def wiki_ns_name(self, str):
        """Return the namespace identifier for qualifier STR, or None.

        The qualifier is looked up in the wiki_ns table of the current
        language first, then in the wiki_ns_re table.
        """
        if str in wiki_ns[self.lang]:
            return wiki_ns[self.lang][str]
        elif str in wiki_ns_re[self.lang]:
            # NOTE(review): each entry is used as (prefix, suffix, name);
            # the layout of wiki_ns_re is not visible here -- confirm.
            for elt in wiki_ns_re[self.lang][str]:
                if str.startswith(elt[0]) and str.endswith(elt[1]):
                    return elt[2]
        return None

    def mktgt(self, tgt, lang = None):
        """Return the URL of page TGT in the wiki for LANG.

        LANG defaults to the language of this object.  The URL is built
        from the html_base template and the URL-quoted page name.
        """
        if not lang:
            lang = self.lang
        return self.html_base % { 'lang' : lang } + url_quote(tgt)

    def indent(self, lev, text):
        """Return TEXT with every non-empty line indented by LEV spaces.

        Empty lines of a multi-line TEXT are dropped.  The result ends
        with a newline only if TEXT does.
        """
        pad = " " * lev
        if '\n' not in text:
            return pad + text
        s = "".join([pad + elt + '\n' for elt in text.split('\n') if elt])
        if not text.endswith('\n'):
            s = s.rstrip('\n')
        return s

    def fmtpara(self, input):
        """Return INPUT re-filled so that lines fit into self.width columns.

        Words are separated by one space, or by two spaces when the
        preceding word ends with a full stop.  A word longer than the
        width is placed on a line of its own and not split.
        """
        output = ""
        linebuf = ""
        length = 0
        for s in input.split():
            wlen = len(s)
            # Number of blanks to put before this word.
            if len(linebuf) == 0:
                wsc = 0
            elif linebuf.endswith("."):
                wsc = 2
            else:
                wsc = 1
            if length + wsc + wlen > self.width:
                # FIXME: fill out linebuf
                output += linebuf + '\n'
                wsc = 0
                length = 0
                linebuf = ""
            linebuf += " " * wsc + s
            length += wsc + wlen
        return output + linebuf

    def __str__(self):
        """Return the whole parsed tree rendered as plain text."""
        return "".join([elt.format() for elt in self.tree])
341
class TextWiktionaryMarkup(TextWikiMarkup):
    """Plain text translator for Wiktionary articles.

    Placeholder for future use: it adds nothing to TextWikiMarkup and
    currently behaves exactly like it.
    """
347
348

Return to:

Send suggestions and report system problems to the System administrator.