summary | refs | log | tree | commit | diff | about
author Sergey Poznyakoff <gray@gnu.org> 2018-09-01 19:10:01 (GMT)
committer Sergey Poznyakoff <gray@gnu.org> 2018-09-01 19:24:02 (GMT)
commit 5320bea15e388200e613e6a2bdac3c1449030986 (patch) (side-by-side diff)
tree 7d8571c63f72cc690cea8323e43be09c18527c45
parent 0aae19835045bac0be0f22ecd0e84527cdaee21c (diff)
download wikitrans-5320bea15e388200e613e6a2bdac3c1449030986.tar.gz
wikitrans-5320bea15e388200e613e6a2bdac3c1449030986.tar.bz2
Bugfixes
* README.rst: Describe new options.
* bin/wikitrans: Change handling of the --debug option.
* wikitrans/wikimarkup.py (WikiMarkupParser): New attribute - strict.
(parse_para): Don't throw UnexpectedTokenError if self.strict is
False, instead ignore invalid token.
(WikiMarkup): Fix Python 3 compatibility
* wikitrans/wikitoken.py: Fix Python 3 compatibility
Diffstat (more/less context) (show whitespace changes)
-rw-r--r-- README.rst | 6
-rwxr-xr-x bin/wikitrans | 13
-rw-r--r-- wikitrans/wikimarkup.py | 27
-rw-r--r-- wikitrans/wikitoken.py | 2
4 files changed, 42 insertions, 6 deletions
diff --git a/README.rst b/README.rst
index 7838fa1..7edeb04 100644
--- a/README.rst
+++ b/README.rst
@@ -34,12 +34,18 @@ image_base = *url*
Base URL for images. Default is
``http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf``
media_base = *url*
Base URL for media files. Default is
``http://www.mediawiki.org/xml/export-0.3``
+debug_level = *int*
+ Debug verbosity level (0 - no debug info, 100 - excessively
+ copious debug messages). Default is 0.
+
+strict = *bool*
+ Strict parsing mode. Throw exceptions on syntax errors. Default is False.
class ``TextWikiMarkup``
------------------------
Translates material in Wiki markup language to plain text. Usage::
from WikiTrans.wiki2text import TextWikiMarkup
diff --git a/bin/wikitrans b/bin/wikitrans
index 4a0fc06..09ba0b3 100755
--- a/bin/wikitrans
+++ b/bin/wikitrans
@@ -68,12 +68,17 @@ def setkw(option, opt, value, parser):
if not parser.values.kwdict:
parser.values.kwdict = {}
(kw,sep,val) = value.partition('=')
if val:
parser.values.kwdict[kw] = val
+def setdebug(option, opt, value, parser):
+ if not parser.values.kwdict:
+ parser.values.kwdict = {}
+ parser.values.kwdict['debug_level'] = value
+
def getwiki(url, options):
tmp = tempfile.NamedTemporaryFile()
if sys.version_info[0] > 2:
import urllib.request
with urllib.request.urlopen(url) as u:
root = etree.fromstring(u.read())
@@ -132,14 +137,14 @@ Otherwise, ARG is name of the file to read wiki material from.
parser.add_option('-o', '--option',
action='callback', callback=setkw,
type='string', dest='kwdict',
default={},
help='set keyword option for the parser class constructor')
parser.add_option('-d', '--debug',
- action='store', type='int', dest='debug',
- default=0,
+ action='callback', callback=setdebug,
+ type='int', dest='kwdict',
help='set debug level (0..100)')
parser.add_option('-D', '--dump',
action='store_const', const='dump',
dest='otype',
help='dump parse tree and exit; similar to --type=dump')
parser.add_option('-b', '--base-url',
@@ -147,13 +152,14 @@ Otherwise, ARG is name of the file to read wiki material from.
help='set base url')
(options, args) = parser.parse_args()
if len(args) == 1:
if options.base_url:
- getwiki(options.base_url + '/wiki/Special:Export/' + args[0], options)
+ getwiki(options.base_url + '/wiki/Special:Export/' + args[0],
+ options)
elif args[0] == '-':
options.kwdict['file'] = sys.stdin
elif re.match('^(http|ftp)s?://',args[0]):
getwiki(args[0], options)
else:
options.kwdict['filename'] = args[0]
@@ -164,13 +170,12 @@ Otherwise, ARG is name of the file to read wiki material from.
if options.otype == 'dump' and not 'indent' in options.kwdict:
options.kwdict['indent'] = 2
if options.otype in handlers:
if options.itype in handlers[options.otype]:
markup = handlers[options.otype][options.itype](**options.kwdict)
- markup.debug_level = options.debug
markup.parse()
print("%s" % str(markup))
exit(0)
else:
print("input type %s is not supported for %s output" % (options.itype, options.otype))
else:
diff --git a/wikitrans/wikimarkup.py b/wikitrans/wikimarkup.py
index 19f69e6..1e2429f 100644
--- a/wikitrans/wikimarkup.py
+++ b/wikitrans/wikimarkup.py
@@ -142,12 +142,19 @@ class WikiMarkupParser(object):
Abstract methods (must be overridden by the subclass):
input() -- returns next physical line from the input material.
Public attributes:
+ Input:
+ debug_level -- debug verbosity level (0 - no debug info, 100 - excessively
+ copious debug messages). Default is 0.
+ strict -- if True, parser will throw exception upon encountering
+ invalid markup tag (mostly for future use)
+
+ Output:
tree -- constructed parse tree (a subclass of WikiNode)
"""
delim = re.compile("^==+[ \t]*|[ \t]*==+[ \t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)|<")
otag = re.compile("<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>[^/][^>]+))?\s*(?P<closed>/)?>")
@@ -172,12 +179,13 @@ class WikiMarkupParser(object):
newline = 0
tree = None
tags = [ 'code', 'nowiki', 'tt', 'div', 'ref', 'references' ]
debug_level = 0
+ strict = False
def dprint(self, lev, fmt, *argv):
"""If current debug level is greater than or equal to lev, print *argv
according to format.
"""
if self.debug_level >= lev:
@@ -371,13 +379,13 @@ class WikiMarkupParser(object):
stack.append(i)
else:
# Push the token on stack
stack.append(i)
# Redefine all non-matched tokens as TEXT
for i in stack:
- # FIXME
+ # FIXME: How to convert node to TEXT?
self.toklist[i] = self._new_node(type='TEXT',
content=str(self.toklist[i]))
mark = []
def push_mark(self):
@@ -489,13 +497,16 @@ class WikiMarkupParser(object):
flush()
acc['seq'].append(self.parse_tag(tok))
elif tok.type == 'DELIM':
flush()
acc['seq'].append(self.parse_inline_delim(tok))
else:
+ if self.strict:
raise UnexpectedTokenError(tok)
+ # FIXME: Another possible variant of handling this case is to
+ # convert tok to TEXT node and append it to acc['seq']
tok = self.getkn()
flush()
if acc['seq']:
tok = self._new_node(type=type, content=acc['seq'])
else:
tok = None
@@ -973,12 +984,19 @@ class WikiMarkup(WikiMarkupParser):
image_base=URL
Base URL for images. Default is
'http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf'
media_base=URL
Base URL for media files. Default is
'http://www.mediawiki.org/xml/export-0.3'
+
+ debug_level=INT
+ debug verbosity level (0 - no debug info, 100 - excessively
+ copious debug messages). Default is 0.
+ strict=BOOL
+ Strict parsing mode. Throw exceptions on syntax errors. Default
+ is False.
"""
self.token_class = {
'NIL': WikiNode,
'NL': WikiNode,
'OTAG': WikiTagNode,
'CTAG': WikiTagNode,
@@ -1003,21 +1021,28 @@ class WikiMarkup(WikiMarkupParser):
for kw in keywords:
if kw == 'file':
self.file = keywords[kw]
elif kw == 'filename':
self.file = open(keywords[kw])
elif kw == 'text':
+ if sys.version_info[0] > 2:
+ self.text = keywords[kw].decode('utf-8').split("\n")
+ else:
self.text = keywords[kw].split("\n")
elif kw == 'lang':
self.lang = keywords[kw]
elif kw == 'html_base':
self.html_base = keywords[kw]
elif kw == 'image_base':
self.image_base = keywords[kw]
elif kw == 'media_base':
self.media_base = keywords[kw]
+ elif kw == 'strict':
+ self.strict = keywords[kw]
+ elif kw == 'debug_level':
+ self.debug_level = keywords[kw]
def __del__(self):
if self.file:
self.file.close()
def input(self):
diff --git a/wikitrans/wikitoken.py b/wikitrans/wikitoken.py
index deedea8..1f81092 100644
--- a/wikitrans/wikitoken.py
+++ b/wikitrans/wikitoken.py
@@ -308,13 +308,13 @@ class WikiEnvNode(WikiContentNode):
@jsonencoder
def json_encode(self):
return {
'envtype': self.envtype,
'level': self.level,
- 'content': map(lambda x: x.json_encode(), self.content)
+ 'content': [x for x in map(lambda x: x.json_encode(), self.content)]
}
class WikiIndNode(WikiContentNode):
"""Indented block node.

Return to:

Send suggestions and report system problems to the System administrator.