summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Sergey Poznyakoff <gray@gnu.org> 2018-09-01 22:10:01 +0300
committer Sergey Poznyakoff <gray@gnu.org> 2018-09-01 22:24:02 +0300
commit 5320bea15e388200e613e6a2bdac3c1449030986 (patch)
tree 7d8571c63f72cc690cea8323e43be09c18527c45
parent 0aae19835045bac0be0f22ecd0e84527cdaee21c (diff)
download wikitrans-5320bea15e388200e613e6a2bdac3c1449030986.tar.gz
download wikitrans-5320bea15e388200e613e6a2bdac3c1449030986.tar.bz2
Bugfixes
* README.rst: Describe new options.
* bin/wikitrans: Change handling of the --debug option.
* wikitrans/wikimarkup.py (WikiMarkupParser): New attribute - strict.
(parse_para): Don't throw UnexpectedTokenError if self.strict is False, instead ignore invalid token.
(WikiMarkup): Fix Python 3 compatibility
* wikitrans/wikitoken.py: Fix Python 3 compatibility
-rw-r--r-- README.rst 6
-rwxr-xr-x bin/wikitrans 13
-rw-r--r-- wikitrans/wikimarkup.py 27
-rw-r--r-- wikitrans/wikitoken.py 2
4 files changed, 42 insertions, 6 deletions
diff --git a/README.rst b/README.rst
index 7838fa1..7edeb04 100644
--- a/README.rst
+++ b/README.rst
@@ -34,12 +34,18 @@ image_base = *url*
34 Base URL for images. Default is 34 Base URL for images. Default is
35 ``http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf`` 35 ``http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf``
36media_base = *url* 36media_base = *url*
37 Base URL for media files. Default is 37 Base URL for media files. Default is
38 ``http://www.mediawiki.org/xml/export-0.3`` 38 ``http://www.mediawiki.org/xml/export-0.3``
39 39
40debug_level = *int*
41 Debug verbosity level (0 - no debug info, 100 - excessively
42 copious debug messages). Default is 0.
43
44strict = *bool*
45 Strict parsing mode. Throw exceptions on syntax errors. Default is False.
40 46
41class ``TextWikiMarkup`` 47class ``TextWikiMarkup``
42------------------------ 48------------------------
43Translates material in Wiki markup language to plain text. Usage:: 49Translates material in Wiki markup language to plain text. Usage::
44 50
45 from WikiTrans.wiki2text import TextWikiMarkup 51 from WikiTrans.wiki2text import TextWikiMarkup
diff --git a/bin/wikitrans b/bin/wikitrans
index 4a0fc06..09ba0b3 100755
--- a/bin/wikitrans
+++ b/bin/wikitrans
@@ -68,12 +68,17 @@ def setkw(option, opt, value, parser):
68 if not parser.values.kwdict: 68 if not parser.values.kwdict:
69 parser.values.kwdict = {} 69 parser.values.kwdict = {}
70 (kw,sep,val) = value.partition('=') 70 (kw,sep,val) = value.partition('=')
71 if val: 71 if val:
72 parser.values.kwdict[kw] = val 72 parser.values.kwdict[kw] = val
73 73
74def setdebug(option, opt, value, parser):
75 if not parser.values.kwdict:
76 parser.values.kwdict = {}
77 parser.values.kwdict['debug_level'] = value
78
74def getwiki(url, options): 79def getwiki(url, options):
75 tmp = tempfile.NamedTemporaryFile() 80 tmp = tempfile.NamedTemporaryFile()
76 if sys.version_info[0] > 2: 81 if sys.version_info[0] > 2:
77 import urllib.request 82 import urllib.request
78 with urllib.request.urlopen(url) as u: 83 with urllib.request.urlopen(url) as u:
79 root = etree.fromstring(u.read()) 84 root = etree.fromstring(u.read())
@@ -132,14 +137,14 @@ Otherwise, ARG is name of the file to read wiki material from.
132 parser.add_option('-o', '--option', 137 parser.add_option('-o', '--option',
133 action='callback', callback=setkw, 138 action='callback', callback=setkw,
134 type='string', dest='kwdict', 139 type='string', dest='kwdict',
135 default={}, 140 default={},
136 help='set keyword option for the parser class constructor') 141 help='set keyword option for the parser class constructor')
137 parser.add_option('-d', '--debug', 142 parser.add_option('-d', '--debug',
138 action='store', type='int', dest='debug', 143 action='callback', callback=setdebug,
139 default=0, 144 type='int', dest='kwdict',
140 help='set debug level (0..100)') 145 help='set debug level (0..100)')
141 parser.add_option('-D', '--dump', 146 parser.add_option('-D', '--dump',
142 action='store_const', const='dump', 147 action='store_const', const='dump',
143 dest='otype', 148 dest='otype',
144 help='dump parse tree and exit; similar to --type=dump') 149 help='dump parse tree and exit; similar to --type=dump')
145 parser.add_option('-b', '--base-url', 150 parser.add_option('-b', '--base-url',
@@ -147,13 +152,14 @@ Otherwise, ARG is name of the file to read wiki material from.
147 help='set base url') 152 help='set base url')
148 153
149 154
150 (options, args) = parser.parse_args() 155 (options, args) = parser.parse_args()
151 if len(args) == 1: 156 if len(args) == 1:
152 if options.base_url: 157 if options.base_url:
153 getwiki(options.base_url + '/wiki/Special:Export/' + args[0], options) 158 getwiki(options.base_url + '/wiki/Special:Export/' + args[0],
159 options)
154 elif args[0] == '-': 160 elif args[0] == '-':
155 options.kwdict['file'] = sys.stdin 161 options.kwdict['file'] = sys.stdin
156 elif re.match('^(http|ftp)s?://',args[0]): 162 elif re.match('^(http|ftp)s?://',args[0]):
157 getwiki(args[0], options) 163 getwiki(args[0], options)
158 else: 164 else:
159 options.kwdict['filename'] = args[0] 165 options.kwdict['filename'] = args[0]
@@ -164,13 +170,12 @@ Otherwise, ARG is name of the file to read wiki material from.
164 170
165 if options.otype == 'dump' and not 'indent' in options.kwdict: 171 if options.otype == 'dump' and not 'indent' in options.kwdict:
166 options.kwdict['indent'] = 2 172 options.kwdict['indent'] = 2
167 if options.otype in handlers: 173 if options.otype in handlers:
168 if options.itype in handlers[options.otype]: 174 if options.itype in handlers[options.otype]:
169 markup = handlers[options.otype][options.itype](**options.kwdict) 175 markup = handlers[options.otype][options.itype](**options.kwdict)
170 markup.debug_level = options.debug
171 markup.parse() 176 markup.parse()
172 print("%s" % str(markup)) 177 print("%s" % str(markup))
173 exit(0) 178 exit(0)
174 else: 179 else:
175 print("input type %s is not supported for %s output" % (options.itype, options.otype)) 180 print("input type %s is not supported for %s output" % (options.itype, options.otype))
176 else: 181 else:
diff --git a/wikitrans/wikimarkup.py b/wikitrans/wikimarkup.py
index 19f69e6..1e2429f 100644
--- a/wikitrans/wikimarkup.py
+++ b/wikitrans/wikimarkup.py
@@ -142,12 +142,19 @@ class WikiMarkupParser(object):
142 Abstract methods (must be overridden by the subclass): 142 Abstract methods (must be overridden by the subclass):
143 143
144 input() -- returns next physical line from the input material. 144 input() -- returns next physical line from the input material.
145 145
146 Public attributes: 146 Public attributes:
147 147
148 Input:
149 debug_level -- debug verbosity level (0 - no debug info, 100 - excessively
150 copious debug messages). Default is 0.
151 strict -- if True, parser will throw exception upon encountering
152 invalid markup tag (mostly for future use)
153
154 Output:
148 tree -- constructed parse tree (a subclass of WikiNode) 155 tree -- constructed parse tree (a subclass of WikiNode)
149 156
150 """ 157 """
151 158
152 delim = re.compile("^==+[ \t]*|[ \t]*==+[ \t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)|<") 159 delim = re.compile("^==+[ \t]*|[ \t]*==+[ \t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)|<")
153 otag = re.compile("<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>[^/][^>]+))?\s*(?P<closed>/)?>") 160 otag = re.compile("<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>[^/][^>]+))?\s*(?P<closed>/)?>")
@@ -172,12 +179,13 @@ class WikiMarkupParser(object):
172 newline = 0 179 newline = 0
173 tree = None 180 tree = None
174 181
175 tags = [ 'code', 'nowiki', 'tt', 'div', 'ref', 'references' ] 182 tags = [ 'code', 'nowiki', 'tt', 'div', 'ref', 'references' ]
176 183
177 debug_level = 0 184 debug_level = 0
185 strict = False
178 186
179 def dprint(self, lev, fmt, *argv): 187 def dprint(self, lev, fmt, *argv):
180 """If current debug level is greater than or equal to lev, print *argv 188 """If current debug level is greater than or equal to lev, print *argv
181 according to format. 189 according to format.
182 """ 190 """
183 if self.debug_level >= lev: 191 if self.debug_level >= lev:
@@ -371,13 +379,13 @@ class WikiMarkupParser(object):
371 stack.append(i) 379 stack.append(i)
372 else: 380 else:
373 # Push the token on stack 381 # Push the token on stack
374 stack.append(i) 382 stack.append(i)
375 # Redefine all non-matched tokens as TEXT 383 # Redefine all non-matched tokens as TEXT
376 for i in stack: 384 for i in stack:
377 # FIXME 385 # FIXME: How to convert node to TEXT?
378 self.toklist[i] = self._new_node(type='TEXT', 386 self.toklist[i] = self._new_node(type='TEXT',
379 content=str(self.toklist[i])) 387 content=str(self.toklist[i]))
380 388
381 mark = [] 389 mark = []
382 390
383 def push_mark(self): 391 def push_mark(self):
@@ -489,13 +497,16 @@ class WikiMarkupParser(object):
489 flush() 497 flush()
490 acc['seq'].append(self.parse_tag(tok)) 498 acc['seq'].append(self.parse_tag(tok))
491 elif tok.type == 'DELIM': 499 elif tok.type == 'DELIM':
492 flush() 500 flush()
493 acc['seq'].append(self.parse_inline_delim(tok)) 501 acc['seq'].append(self.parse_inline_delim(tok))
494 else: 502 else:
503 if self.strict:
495 raise UnexpectedTokenError(tok) 504 raise UnexpectedTokenError(tok)
505 # FIXME: Another possible variant of handling this case is to
506 # convert tok to TEXT node and append it to acc['seq']
496 tok = self.getkn() 507 tok = self.getkn()
497 flush() 508 flush()
498 if acc['seq']: 509 if acc['seq']:
499 tok = self._new_node(type=type, content=acc['seq']) 510 tok = self._new_node(type=type, content=acc['seq'])
500 else: 511 else:
501 tok = None 512 tok = None
@@ -973,12 +984,19 @@ class WikiMarkup(WikiMarkupParser):
973 image_base=URL 984 image_base=URL
974 Base URL for images. Default is 985 Base URL for images. Default is
975 'http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf' 986 'http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf'
976 media_base=URL 987 media_base=URL
977 Base URL for media files. Default is 988 Base URL for media files. Default is
978 'http://www.mediawiki.org/xml/export-0.3' 989 'http://www.mediawiki.org/xml/export-0.3'
990
991 debug_level=INT
992 debug verbosity level (0 - no debug info, 100 - excessively
993