diff options
Diffstat (limited to 'wikitrans/wiki2text.py')
-rw-r--r-- | wikitrans/wiki2text.py | 348 |
1 files changed, 348 insertions, 0 deletions
diff --git a/wikitrans/wiki2text.py b/wikitrans/wiki2text.py new file mode 100644 index 0000000..1fbc61b --- /dev/null +++ b/wikitrans/wiki2text.py | |||
@@ -0,0 +1,348 @@ | |||
1 | #!/usr/bin/python | ||
2 | # -*- coding: utf-8 -*- | ||
3 | # Copyright (C) 2008-2018 Sergey Poznyakoff | ||
4 | # | ||
5 | # This program is free software; you can redistribute it and/or modify | ||
6 | # it under the terms of the GNU General Public License as published by | ||
7 | # the Free Software Foundation; either version 3, or (at your option) | ||
8 | # any later version. | ||
9 | # | ||
10 | # This program is distributed in the hope that it will be useful, | ||
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | # GNU General Public License for more details. | ||
14 | # | ||
15 | # You should have received a copy of the GNU General Public License | ||
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
17 | |||
18 | """ | ||
19 | Wiki markup to plain text translator. | ||
20 | |||
21 | Classes: | ||
22 | |||
23 | TextWikiMarkup -- Converts Wiki material to plain text. | ||
24 | TextWiktionaryMarkup -- Reserved for future use. Currently does the same as | ||
25 | TextWikiMarkup. | ||
26 | |||
27 | """ | ||
28 | |||
29 | from wikitrans.wikitoken import * | ||
30 | from wikitrans.wikimarkup import * | ||
31 | from wikitrans.wikins import wiki_ns_re, wiki_ns | ||
32 | import re | ||
33 | try: | ||
34 | from urllib import quote as url_quote | ||
35 | except ImportError: | ||
36 | from urllib.parse import quote as url_quote | ||
37 | |||
class TextSeqNode(WikiSeqNode):
    """Sequence node rendered as the concatenation of its children."""

    def format(self):
        """Return the formatted children joined into one string.

        A single blank is inserted before a child whenever the text
        collected so far is longer than one character and does not
        already end in whitespace.
        """
        out = ""
        for child in self.content:
            needs_gap = len(out) > 1 and not out[-1].isspace()
            if needs_gap:
                out = out + ' '
            out = out + child.format()
        return out
46 | |||
class TextTextNode(WikiTextNode):
    """Text node whose content is either one string or a list of strings."""

    def format(self):
        """Return the node text.

        Non-list content is returned unchanged.  List content is joined
        piece by piece with a separator between consecutive pieces.
        """
        if not isinstance(self.content, list):
            return self.content

        out = ""
        for piece in self.content:
            if out:
                # NOTE(review): both arms yield the same single blank here;
                # the sentence-end arm may have been meant to differ --
                # confirm against the pristine file before merging them.
                out += " " if out.endswith(".") else " "
            out += piece
        return out
61 | |||
class TextPreNode(WikiSeqNode):
    """Preformatted block: children are emitted verbatim."""

    def format(self):
        """Return the children's text concatenated, plus one newline."""
        pieces = [child.format() for child in self.content]
        return "".join(pieces) + '\n'
69 | |||
class TextParaNode(WikiSeqNode):
    """Paragraph node."""

    def format(self):
        """Return the paragraph text followed by an empty line.

        The children's text is concatenated, passed through the
        parser's fmtpara(), and terminated with two newlines.
        """
        raw = "".join([child.format() for child in self.content])
        return self.parser.fmtpara(raw) + '\n\n'
77 | |||
class TextItNode(WikiSeqNode):
    """Italic text, rendered between underscores."""

    def format(self):
        """Return the non-empty children separated by blanks, as _text_.

        Leading blanks of the joined text are stripped before the
        underscores are added.
        """
        rendered = (child.format() for child in self.content)
        words = [w for w in rendered if w]
        return "_" + " ".join(words).lstrip(" ") + "_"
86 | |||
class TextBoldNode(WikiSeqNode):
    """Bold text, rendered in upper case."""

    def format(self):
        """Return the children's text upper-cased.

        Every child, including the first one, is preceded by a
        separator, so the result starts with a blank when there is at
        least one child.
        """
        out = ""
        for child in self.content:
            # NOTE(review): both arms yield the same single blank here;
            # the sentence-end arm may have been meant to differ --
            # confirm against the pristine file before merging them.
            gap = " " if out.endswith(".") else " "
            out = out + gap + child.format()
        return out.upper()
97 | |||
class TextLinkNode(WikiSeqNode):
    """Link node rendered as plain text, optionally followed by its URL."""

    def format(self):
        """Return the textual form of the link.

        The first child supplies the link argument (a target, possibly
        prefixed with ``qualifier:``); the second child, when present,
        supplies the display text.

        Returns:
            An empty string for 'disambigR' and 'wikiquote' entries, for
            entries whose second element is 'thumb', and for image links
            when the parser's show_urls is off.  Otherwise
            "<label> (see <target>) " when show_urls is on, or just the
            label when it is off.  The label is the display text, or the
            link argument when there is no display text.
        """
        arg = self.content[0].format()
        text = None
        if len(self.content) > 1:
            s = [x.format() for x in self.content]
            if s[0] in ('disambigR', 'wikiquote'):
                return ""
            if s[1] == 'thumb':
                return ""
            text = s[1]

        qual, sep, tgt = arg.partition(':')
        if tgt == '':
            # No "qualifier:" prefix: the whole argument is the target.
            tgt = self.parser.mktgt(arg)
        else:
            ns = self.parser.wiki_ns_name(qual)
            if ns == 'NS_IMAGE':
                if not self.parser.show_urls:
                    return ""
                text = "[%s: %s]" % (qual, text if text else arg)
                # NOTE(review): image_base is read from the node itself,
                # whereas every other setting used here comes from
                # self.parser -- confirm the node carries this attribute.
                tgt = "%s/%s/250px-%s" % (self.image_base,
                                          url_quote(tgt),
                                          url_quote(tgt))
            elif ns == 'NS_MEDIA':
                # The target is left as the raw text after the colon.
                text = "[%s]" % (qual)
            elif not ns and self.type == 'LINK' and qual in self.parser.langtab:
                text = self.parser.langtab[qual] + ": " + tgt
                tgt = self.parser.mktgt(tgt, qual)
            else:
                tgt = self.parser.mktgt(tgt)

        # Fall back to the link argument when there is no display text, so
        # that the show_urls form never prints the literal string "None".
        label = text if text else arg
        if self.parser.show_urls:
            return "%s (see %s) " % (label, tgt)
        return label
141 | |||
class TextTmplNode(TextLinkNode):
    """Template node: formatted like a link, wrapped in square brackets."""

    def format(self):
        """Return the inherited link rendering enclosed in '[' and ']'."""
        inner = super(TextTmplNode, self).format()
        return '[' + inner + ']'
145 | |||
class TextBarNode(WikiNode):
    """Horizontal rule drawn with dashes."""

    def format(self):
        """Return a centred line of dashes surrounded by newlines.

        The parser's width is used, but never less than 5; the rule is
        width - 5 dashes centred in a field of width - 1 characters.
        """
        span = max(self.parser.width, 5)
        rule = "-" * (span - 5)
        return "\n" + rule.center(span - 1) + "\n"
152 | |||
class TextHdrNode(WikiHdrNode):
    """Section header, marked with one asterisk per header level."""

    def format(self):
        """Return the header line, preceded by one and followed by two newlines."""
        stars = "*" * self.level
        title = self.content.format().lstrip(" ")
        return "\n" + stars + " " + title + "\n\n"
160 | |||
class TextRefNode(WikiRefNode):
    """Reference node rendered as text plus its reference target."""

    def format(self):
        """Return "<text> (see <ref>) ", or "see <ref>" when the text is empty."""
        label = self.content.format()
        if not label:
            return "see " + self.ref
        return "%s (see %s) " % (label, self.ref)
168 | |||
class TextEnvNode(WikiEnvNode):
    """List environment: unnumbered, numbered or definition list."""

    def format(self):
        """Return the list items, one per line, indented by the parser.

        The nesting level is reset to 1 when it exceeds the parser's
        width minus 4.  Unnumbered items get a "- " prefix, numbered
        items an "N. " prefix counted from 1.  Definition items are
        indented at level - 1 when their subtype is 0 and at level + 3
        otherwise.  Items of any other environment type contribute
        nothing.  The result always ends with a newline.
        """
        kind = self.envtype
        depth = self.level
        if depth > self.parser.width - 4:
            depth = 1

        out = ""
        counter = 1
        for item in self.content:
            # Start every item on a fresh line.
            if not out.endswith("\n"):
                out += "\n"
            body = item.content.format()
            if kind == "unnumbered":
                out += self.parser.indent(depth, "- " + body.lstrip(" "))
            elif kind == "numbered":
                out += self.parser.indent(depth, "%d. %s" % (counter, body))
                counter += 1
            elif kind == "defn":
                shift = depth - 1 if item.subtype == 0 else depth + 3
                out += self.parser.indent(shift, body)

        if not out.endswith("\n"):
            out += "\n"
        return out
196 | |||
class TextIndNode(WikiIndNode):
    """Indented line."""

    def format(self):
        """Return the content preceded by one blank per level, plus a newline."""
        pad = " " * self.level
        return pad + self.content.format() + '\n'
200 | |||
class TextTagNode(WikiTagNode):
    """Markup tag node."""

    def format(self):
        """Return the plain-text rendering of the tag.

        'code' yields the formatted content, with the parser's nested
        counter raised while the content is being formatted.  'ref'
        yields a bracketed one-based index.  'references' yields a
        "References:" heading followed by one line per entry of the
        parser's references.  Any other tag is written back out as
        <tag args>content</tag>.
        """
        tag = self.tag
        if tag == 'code':
            self.parser.nested += 1
            text = self.content.format()
            self.parser.nested -= 1
            return text

        if tag == 'ref':
            return '[%d]' % (self.idx+1)

        if tag == 'references':
            lines = ['\nReferences:\n']
            for ref in self.parser.references:
                lines.append(('[%d]. ' % (ref.idx+1))
                             + ref.content.format() + '\n')
            return "".join(lines)

        opening = '<' + tag
        if self.args:
            opening += ' ' + str(self.args)
        return opening + '>' + self.content.format() + '</' + tag + '>'
219 | |||
220 | |||
221 | class TextWikiMarkup(WikiMarkup): | ||
222 | """A Wiki markup to plain text translator. | ||
223 | |||
224 | Usage: | ||
225 | |||
226 | x = TextWikiMarkup(file="input.wiki") | ||
227 | # Parse the input: | ||
228 | x.parse() | ||
229 | # Print it as plain text: | ||
230 | print(str(x)) | ||
231 | |||
232 | """ | ||
233 | |||
234 | # Output width | ||
235 | width = 78 | ||
236 | # Do not show references. | ||