diff options
author | Sergey Poznyakoff <gray@gnu.org> | 2018-09-01 22:10:01 +0300 |
---|---|---|
committer | Sergey Poznyakoff <gray@gnu.org> | 2018-09-01 22:24:02 +0300 |
commit | 5320bea15e388200e613e6a2bdac3c1449030986 (patch) | |
tree | 7d8571c63f72cc690cea8323e43be09c18527c45 | |
parent | 0aae19835045bac0be0f22ecd0e84527cdaee21c (diff) | |
download | wikitrans-5320bea15e388200e613e6a2bdac3c1449030986.tar.gz wikitrans-5320bea15e388200e613e6a2bdac3c1449030986.tar.bz2 |
Bugfixes
* README.rst: Describe new options.
* bin/wikitrans: Change handling of the --debug option.
* wikitrans/wikimarkup.py (WikiMarkupParser): New attribute - strict.
(parse_para): Don't throw UnexpectedTokenError if self.strict is False;
ignore the invalid token instead.
(WikiMarkup): Fix Python 3 compatibility.
* wikitrans/wikitoken.py: Fix Python 3 compatibility.
-rw-r--r-- | README.rst | 6 | ||||
-rwxr-xr-x | bin/wikitrans | 13 | ||||
-rw-r--r-- | wikitrans/wikimarkup.py | 27 | ||||
-rw-r--r-- | wikitrans/wikitoken.py | 2 |
4 files changed, 42 insertions, 6 deletions
@@ -34,12 +34,18 @@ image_base = *url* | |||
34 | Base URL for images. Default is | 34 | Base URL for images. Default is |
35 | ``http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf`` | 35 | ``http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf`` |
36 | media_base = *url* | 36 | media_base = *url* |
37 | Base URL for media files. Default is | 37 | Base URL for media files. Default is |
38 | ``http://www.mediawiki.org/xml/export-0.3`` | 38 | ``http://www.mediawiki.org/xml/export-0.3`` |
39 | 39 | ||
40 | debug_level = *int* | ||
41 | Debug verbosity level (0 - no debug info, 100 - excessively | ||
42 | copious debug messages). Default is 0. | ||
43 | |||
44 | strict = *bool* | ||
45 | Strict parsing mode. Throw exceptions on syntax errors. Default is False. | ||
40 | 46 | ||
41 | class ``TextWikiMarkup`` | 47 | class ``TextWikiMarkup`` |
42 | ------------------------ | 48 | ------------------------ |
43 | Translates material in Wiki markup language to plain text. Usage:: | 49 | Translates material in Wiki markup language to plain text. Usage:: |
44 | 50 | ||
45 | from WikiTrans.wiki2text import TextWikiMarkup | 51 | from WikiTrans.wiki2text import TextWikiMarkup |
diff --git a/bin/wikitrans b/bin/wikitrans index 4a0fc06..09ba0b3 100755 --- a/bin/wikitrans +++ b/bin/wikitrans | |||
@@ -68,12 +68,17 @@ def setkw(option, opt, value, parser): | |||
68 | if not parser.values.kwdict: | 68 | if not parser.values.kwdict: |
69 | parser.values.kwdict = {} | 69 | parser.values.kwdict = {} |
70 | (kw,sep,val) = value.partition('=') | 70 | (kw,sep,val) = value.partition('=') |
71 | if val: | 71 | if val: |
72 | parser.values.kwdict[kw] = val | 72 | parser.values.kwdict[kw] = val |
73 | 73 | ||
74 | def setdebug(option, opt, value, parser): | ||
75 | if not parser.values.kwdict: | ||
76 | parser.values.kwdict = {} | ||
77 | parser.values.kwdict['debug_level'] = value | ||
78 | |||
74 | def getwiki(url, options): | 79 | def getwiki(url, options): |
75 | tmp = tempfile.NamedTemporaryFile() | 80 | tmp = tempfile.NamedTemporaryFile() |
76 | if sys.version_info[0] > 2: | 81 | if sys.version_info[0] > 2: |
77 | import urllib.request | 82 | import urllib.request |
78 | with urllib.request.urlopen(url) as u: | 83 | with urllib.request.urlopen(url) as u: |
79 | root = etree.fromstring(u.read()) | 84 | root = etree.fromstring(u.read()) |
@@ -132,14 +137,14 @@ Otherwise, ARG is name of the file to read wiki material from. | |||
132 | parser.add_option('-o', '--option', | 137 | parser.add_option('-o', '--option', |
133 | action='callback', callback=setkw, | 138 | action='callback', callback=setkw, |
134 | type='string', dest='kwdict', | 139 | type='string', dest='kwdict', |
135 | default={}, | 140 | default={}, |
136 | help='set keyword option for the parser class constructor') | 141 | help='set keyword option for the parser class constructor') |
137 | parser.add_option('-d', '--debug', | 142 | parser.add_option('-d', '--debug', |
138 | action='store', type='int', dest='debug', | 143 | action='callback', callback=setdebug, |
139 | default=0, | 144 | type='int', dest='kwdict', |
140 | help='set debug level (0..100)') | 145 | help='set debug level (0..100)') |
141 | parser.add_option('-D', '--dump', | 146 | parser.add_option('-D', '--dump', |
142 | action='store_const', const='dump', | 147 | action='store_const', const='dump', |
143 | dest='otype', | 148 | dest='otype', |
144 | help='dump parse tree and exit; similar to --type=dump') | 149 | help='dump parse tree and exit; similar to --type=dump') |
145 | parser.add_option('-b', '--base-url', | 150 | parser.add_option('-b', '--base-url', |
@@ -147,13 +152,14 @@ Otherwise, ARG is name of the file to read wiki material from. | |||
147 | help='set base url') | 152 | help='set base url') |
148 | 153 | ||
149 | 154 | ||
150 | (options, args) = parser.parse_args() | 155 | (options, args) = parser.parse_args() |
151 | if len(args) == 1: | 156 | if len(args) == 1: |
152 | if options.base_url: | 157 | if options.base_url: |
153 | getwiki(options.base_url + '/wiki/Special:Export/' + args[0], options) | 158 | getwiki(options.base_url + '/wiki/Special:Export/' + args[0], |
159 | options) | ||
154 | elif args[0] == '-': | 160 | elif args[0] == '-': |
155 | options.kwdict['file'] = sys.stdin | 161 | options.kwdict['file'] = sys.stdin |
156 | elif re.match('^(http|ftp)s?://',args[0]): | 162 | elif re.match('^(http|ftp)s?://',args[0]): |
157 | getwiki(args[0], options) | 163 | getwiki(args[0], options) |
158 | else: | 164 | else: |
159 | options.kwdict['filename'] = args[0] | 165 | options.kwdict['filename'] = args[0] |
@@ -164,13 +170,12 @@ Otherwise, ARG is name of the file to read wiki material from. | |||
164 | 170 | ||
165 | if options.otype == 'dump' and not 'indent' in options.kwdict: | 171 | if options.otype == 'dump' and not 'indent' in options.kwdict: |
166 | options.kwdict['indent'] = 2 | 172 | options.kwdict['indent'] = 2 |
167 | if options.otype in handlers: | 173 | if options.otype in handlers: |
168 | if options.itype in handlers[options.otype]: | 174 | if options.itype in handlers[options.otype]: |
169 | markup = handlers[options.otype][options.itype](**options.kwdict) | 175 | markup = handlers[options.otype][options.itype](**options.kwdict) |
170 | markup.debug_level = options.debug | ||
171 | markup.parse() | 176 | markup.parse() |
172 | print("%s" % str(markup)) | 177 | print("%s" % str(markup)) |
173 | exit(0) | 178 | exit(0) |
174 | else: | 179 | else: |
175 | print("input type %s is not supported for %s output" % (options.itype, options.otype)) | 180 | print("input type %s is not supported for %s output" % (options.itype, options.otype)) |
176 | else: | 181 | else: |
diff --git a/wikitrans/wikimarkup.py b/wikitrans/wikimarkup.py index 19f69e6..1e2429f 100644 --- a/wikitrans/wikimarkup.py +++ b/wikitrans/wikimarkup.py | |||
@@ -142,12 +142,19 @@ class WikiMarkupParser(object): | |||
142 | Abstract methods (must be overridden by the subclass): | 142 | Abstract methods (must be overridden by the subclass): |
143 | 143 | ||
144 | input() -- returns next physical line from the input material. | 144 | input() -- returns next physical line from the input material. |
145 | 145 | ||
146 | Public attributes: | 146 | Public attributes: |
147 | 147 | ||
148 | Input: | ||
149 | debug_level -- debug verbosity level (0 - no debug info, 100 - excessively | ||
150 | copious debug messages). Default is 0. | ||
151 | strict -- if True, parser will throw exception upon encountering | ||
152 | invalid markup tag (mostly for future use) | ||
153 | |||
154 | Output: | ||
148 | tree -- constructed parse tree (a subclass of WikiNode) | 155 | tree -- constructed parse tree (a subclass of WikiNode) |
149 | 156 | ||
150 | """ | 157 | """ |
151 | 158 | ||
152 | delim = re.compile("^==+[ \t]*|[ \t]*==+[ \t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)|<") | 159 | delim = re.compile("^==+[ \t]*|[ \t]*==+[ \t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)|<") |
153 | otag = re.compile("<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>[^/][^>]+))?\s*(?P<closed>/)?>") | 160 | otag = re.compile("<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>[^/][^>]+))?\s*(?P<closed>/)?>") |
@@ -172,12 +179,13 @@ class WikiMarkupParser(object): | |||
172 | newline = 0 | 179 | newline = 0 |
173 | tree = None | 180 | tree = None |
174 | 181 | ||
175 | tags = [ 'code', 'nowiki', 'tt', 'div', 'ref', 'references' ] | 182 | tags = [ 'code', 'nowiki', 'tt', 'div', 'ref', 'references' ] |
176 | 183 | ||
177 | debug_level = 0 | 184 | debug_level = 0 |
185 | strict = False | ||
178 | 186 | ||
179 | def dprint(self, lev, fmt, *argv): | 187 | def dprint(self, lev, fmt, *argv): |
180 | """If current debug level is greater than or equal to lev, print *argv | 188 | """If current debug level is greater than or equal to lev, print *argv |
181 | according to format. | 189 | according to format. |
182 | """ | 190 | """ |
183 | if self.debug_level >= lev: | 191 | if self.debug_level >= lev: |
@@ -371,13 +379,13 @@ class WikiMarkupParser(object): | |||
371 | stack.append(i) | 379 | stack.append(i) |
372 | else: | 380 | else: |
373 | # Push the token on stack | 381 | # Push the token on stack |
374 | stack.append(i) | 382 | stack.append(i) |
375 | # Redefine all non-matched tokens as TEXT | 383 | # Redefine all non-matched tokens as TEXT |
376 | for i in stack: | 384 | for i in stack: |
377 | # FIXME | 385 | # FIXME: How to convert node to TEXT? |
378 | self.toklist[i] = self._new_node(type='TEXT', | 386 | self.toklist[i] = self._new_node(type='TEXT', |
379 | content=str(self.toklist[i])) | 387 | content=str(self.toklist[i])) |
380 | 388 | ||
381 | mark = [] | 389 | mark = [] |
382 | 390 | ||
383 | def push_mark(self): | 391 | def push_mark(self): |
@@ -489,13 +497,16 @@ class WikiMarkupParser(object): | |||
489 | flush() | 497 | flush() |
490 | acc['seq'].append(self.parse_tag(tok)) | 498 | acc['seq'].append(self.parse_tag(tok)) |
491 | elif tok.type == 'DELIM': | 499 | elif tok.type == 'DELIM': |
492 | flush() | 500 | flush() |
493 | acc['seq'].append(self.parse_inline_delim(tok)) | 501 | acc['seq'].append(self.parse_inline_delim(tok)) |
494 | else: | 502 | else: |
503 | if self.strict: | ||
495 | raise UnexpectedTokenError(tok) | 504 | raise UnexpectedTokenError(tok) |
505 | # FIXME: Another possible variant of handling this case is to | ||
506 | # convert tok to TEXT node and append it to acc['seq'] | ||
496 | tok = self.getkn() | 507 | tok = self.getkn() |
497 | flush() | 508 | flush() |
498 | if acc['seq']: | 509 | if acc['seq']: |
499 | tok = self._new_node(type=type, content=acc['seq']) | 510 | tok = self._new_node(type=type, content=acc['seq']) |
500 | else: | 511 | else: |
501 | tok = None | 512 | tok = None |
@@ -973,12 +984,19 @@ class WikiMarkup(WikiMarkupParser): | |||
973 | image_base=URL | 984 | image_base=URL |
974 | Base URL for images. Default is | 985 | Base URL for images. Default is |
975 | 'http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf' | 986 | 'http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf' |
976 | media_base=URL | 987 | media_base=URL |
977 | Base URL for media files. Default is | 988 | Base URL for media files. Default is |
978 | 'http://www.mediawiki.org/xml/export-0.3' | 989 | 'http://www.mediawiki.org/xml/export-0.3' |
990 | |||
991 | debug_level=INT | ||
992 | debug verbosity level (0 - no debug info, 100 - excessively | ||
993 |