summaryrefslogtreecommitdiff
path: root/wikitrans/wiki2text.py
diff options
context:
space:
mode:
Diffstat (limited to 'wikitrans/wiki2text.py')
-rw-r--r--wikitrans/wiki2text.py348
1 files changed, 348 insertions, 0 deletions
diff --git a/wikitrans/wiki2text.py b/wikitrans/wiki2text.py
new file mode 100644
index 0000000..1fbc61b
--- /dev/null
+++ b/wikitrans/wiki2text.py
@@ -0,0 +1,348 @@
1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3# Copyright (C) 2008-2018 Sergey Poznyakoff
4#
5# This program is free software; you can redistribute it and/or modify
6# it under the terms of the GNU General Public License as published by
7# the Free Software Foundation; either version 3, or (at your option)
8# any later version.
9#
10# This program is distributed in the hope that it will be useful,
11# but WITHOUT ANY WARRANTY; without even the implied warranty of
12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13# GNU General Public License for more details.
14#
15# You should have received a copy of the GNU General Public License
16# along with this program. If not, see <http://www.gnu.org/licenses/>.
17
18"""
19Wiki markup to plain text translator.
20
21Classes:
22
23TextWikiMarkup -- Converts Wiki material to plain text.
24TextWiktionaryMarkup -- Reserved for future use. Currently does the same as
25 TextWikiMarkup.
26
27"""
28
29from wikitrans.wikitoken import *
30from wikitrans.wikimarkup import *
31from wikitrans.wikins import wiki_ns_re, wiki_ns
32import re
33try:
34 from urllib import quote as url_quote
35except ImportError:
36 from urllib.parse import quote as url_quote
37
class TextSeqNode(WikiSeqNode):
    """Plain-text rendering of a sequence of child nodes."""

    def format(self):
        """Return the children's text, separated by single spaces.

        A space is inserted before a child only when the text gathered so
        far is longer than one character and does not already end in
        whitespace.
        """
        out = ""
        for node in self.content:
            needs_gap = len(out) > 1 and not out[-1].isspace()
            if needs_gap:
                out = out + ' '
            out = out + node.format()
        return out
46
class TextTextNode(WikiTextNode):
    """Plain-text rendering of a text node."""

    def format(self):
        """Return the node content as a single string.

        Content that is not a list is returned unchanged.  List content
        is concatenated, with a separator placed before every element
        once some text has already been gathered.
        """
        if not isinstance(self.content, list):
            return self.content

        text = ""
        for piece in self.content:
            if text:
                # NOTE(review): both branches append the same separator as
                # written; the sentence-end branch presumably meant a wider
                # gap -- confirm against the upstream file.
                if text.endswith("."):
                    text += " "
                else:
                    text += " "
            text += piece
        return text
61
class TextPreNode(WikiSeqNode):
    """Plain-text rendering of a preformatted block."""

    def format(self):
        """Return the children's text back to back, ending in a newline."""
        body = "".join(node.format() for node in self.content)
        return body + '\n'
69
class TextParaNode(WikiSeqNode):
    """Plain-text rendering of a paragraph."""

    def format(self):
        """Return the paragraph text followed by a blank line.

        The children's text is concatenated, passed through the parser's
        ``fmtpara`` method, and terminated with two newlines.
        """
        body = "".join(node.format() for node in self.content)
        return self.parser.fmtpara(body) + '\n\n'
77
class TextItNode(WikiSeqNode):
    """Plain-text rendering of italic text."""

    def format(self):
        """Return the non-empty children joined by spaces, inside underscores.

        Leading spaces of the joined text are stripped before wrapping.
        """
        rendered = (node.format() for node in self.content)
        words = [chunk for chunk in rendered if chunk]
        return "_" + " ".join(words).lstrip(" ") + "_"
86
class TextBoldNode(WikiSeqNode):
    """Plain-text rendering of bold text."""

    def format(self):
        """Return the children's text in upper case.

        A separator is placed before every child, the first one included.
        """
        out = ""
        for node in self.content:
            # NOTE(review): both alternatives are the same separator as
            # written; the sentence-end case presumably meant a wider gap
            # -- confirm against the upstream file.
            gap = " " if out.endswith(".") else " "
            out = out + gap + node.format()
        return out.upper()
97
class TextLinkNode(WikiSeqNode):
    """Plain-text rendering of a wiki link."""

    def format(self):
        """Return the plain-text form of the link.

        The first content element is the link argument.  When there are
        further elements, the second one is the display text.

        Returns an empty string for links whose argument is 'disambigR'
        or 'wikiquote', for links whose second element is 'thumb', and
        for image links when the parser's ``show_urls`` is off.
        Otherwise returns the display text (falling back to the link
        argument when there is none), followed by " (see TARGET) " when
        ``show_urls`` is on.

        Raises IndexError if the node has no content.
        """
        parts = [x.format() for x in self.content]
        arg = parts[0]
        text = parts[1] if len(parts) > 1 else None

        if len(parts) > 1:
            if arg in ('disambigR', 'wikiquote'):
                return ""
            if text == 'thumb':
                return ""

        (qual, sep, tgt) = arg.partition(':')
        if tgt != '':
            ns = self.parser.wiki_ns_name(qual)
            if ns:
                if ns == 'NS_IMAGE':
                    if not self.parser.show_urls:
                        return ""
                    text = "[%s: %s]" % (qual, text if text else arg)
                    # NOTE(review): image_base is read from the node, while
                    # every other setting here comes from self.parser --
                    # confirm the node really carries this attribute.
                    tgt = "%s/%s/250px-%s" % (self.image_base,
                                              url_quote(tgt),
                                              url_quote(tgt))
                elif ns == 'NS_MEDIA':
                    text = "[%s]" % (qual)
                else:
                    tgt = self.parser.mktgt(tgt)
            elif self.type == 'LINK' and qual in self.parser.langtab:
                text = self.parser.langtab[qual] + ": " + tgt
                tgt = self.parser.mktgt(tgt, qual)
            else:
                tgt = self.parser.mktgt(tgt)
        else:
            tgt = self.parser.mktgt(arg)

        # A link without display text is shown by its argument.  This also
        # applies when URLs are shown; formatting None there would put the
        # literal word "None" into the output.
        label = text or arg
        if self.parser.show_urls:
            return "%s (see %s) " % (label, tgt)
        return label
141
class TextTmplNode(TextLinkNode):
    """Plain-text rendering of a template reference."""

    def format(self):
        """Return the link rendering of this node in square brackets."""
        inner = super(TextTmplNode, self).format()
        return '[' + inner + ']'
145
class TextBarNode(WikiNode):
    """Plain-text rendering of a horizontal rule."""

    def format(self):
        """Return a centered line of dashes on a line of its own.

        The parser's width is used, but never less than 5.  The rule is
        five characters shorter than that width and is centered in a
        field one character narrower than it.
        """
        width = max(self.parser.width, 5)
        rule = "-" * (width - 5)
        return "\n" + rule.center(width - 1) + "\n"
152
class TextHdrNode(WikiHdrNode):
    """Plain-text rendering of a section heading."""

    def format(self):
        """Return the heading prefixed by one asterisk per level.

        The heading is preceded by a newline and followed by a blank
        line; leading spaces of the heading text are removed.
        """
        stars = "*" * self.level
        title = self.content.format().lstrip(" ")
        return "\n" + stars + " " + title + "\n\n"
160
class TextRefNode(WikiRefNode):
    """Plain-text rendering of an external reference."""

    def format(self):
        """Return the reference text followed by its target.

        When the content renders to an empty string, only "see" and the
        target are returned.
        """
        label = self.content.format()
        if not label:
            return "see " + self.ref
        return "%s (see %s) " % (label, self.ref)
168
class TextEnvNode(WikiEnvNode):
    """Plain-text rendering of a list environment."""

    def format(self):
        """Return the list items, one per line, indented by list level.

        Unnumbered items are prefixed with "- ", numbered items with a
        running number starting at 1.  In a definition list, items with
        subtype 0 are indented one step less than the list level and all
        other items three steps more.  A level greater than the parser
        width minus 4 is replaced by 1.
        """
        kind = self.envtype
        depth = self.level
        if depth > self.parser.width - 4:
            depth = 1

        out = ""
        counter = 1
        for item in self.content:
            # Every item starts on a fresh line.
            if not out.endswith("\n"):
                out += "\n"
            body = item.content.format()
            if kind == "unnumbered":
                out += self.parser.indent(depth, "- " + body.lstrip(" "))
            elif kind == "numbered":
                out += self.parser.indent(depth, "%d. %s" % (counter, body))
                counter += 1
            elif kind == "defn":
                shift = -1 if item.subtype == 0 else 3
                out += self.parser.indent(depth + shift, body)

            # ... and ends with a newline.
            if not out.endswith("\n"):
                out += "\n"

        return out
196
class TextIndNode(WikiIndNode):
    """Plain-text rendering of an indented line."""

    def format(self):
        """Return the content prefixed by one space per level, plus newline."""
        margin = " " * self.level
        body = self.content.format()
        return margin + body + '\n'
200
class TextTagNode(WikiTagNode):
    """Plain-text rendering of an HTML-like tag."""

    def format(self):
        """Return the plain-text form of the tag.

        'code'        -- the content, formatted while the parser's
                         ``nested`` counter is raised by one;
        'ref'         -- the one-based reference number in brackets;
        'references'  -- a "References:" heading followed by one line
                         per entry in the parser's ``references``;
        anything else -- the tag written out literally around its
                         formatted content.
        """
        tag = self.tag
        if tag == 'code':
            self.parser.nested += 1
            try:
                return self.content.format()
            finally:
                # Restore the counter even if formatting the content
                # raises, so one failure does not skew later output.
                self.parser.nested -= 1

        if tag == 'ref':
            return '[%d]' % (self.idx + 1)

        if tag == 'references':
            lines = ['\nReferences:\n']
            for ref in self.parser.references:
                lines.append(('[%d]. ' % (ref.idx + 1))
                             + ref.content.format() + '\n')
            return ''.join(lines)

        s = '<' + tag
        if self.args:
            s += ' ' + str(self.args)
        s += '>' + self.content.format() + '</' + tag + '>'
        return s
219
220
221class TextWikiMarkup(WikiMarkup):
222 """A Wiki markup to plain text translator.
223
224 Usage:
225
226 x = TextWikiMarkup(file="input.wiki")
227 # Parse the input:
228 x.parse()
229 # Print it as plain text:
230 print(str(x))
231
232 """
233
234 # Output width
235 width = 78
236 # Do not show references.