summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- README.rst 6
-rwxr-xr-x bin/wikitrans 45
-rw-r--r-- wikitrans/wikimarkup.py 35
-rw-r--r-- wikitrans/wikitoken.py 2
4 files changed, 62 insertions, 26 deletions
diff --git a/README.rst b/README.rst
index 7838fa1..7edeb04 100644
--- a/README.rst
+++ b/README.rst
@@ -37,6 +37,12 @@ media_base = *url*
Base URL for media files. Default is
``http://www.mediawiki.org/xml/export-0.3``
+debug_level = *int*
+ Debug verbosity level (0 - no debug info, 100 - excessively
+ copious debug messages). Default is 0.
+
+strict = *bool*
+ Strict parsing mode. Throw exceptions on syntax errors. Default is False.
class ``TextWikiMarkup``
------------------------
diff --git a/bin/wikitrans b/bin/wikitrans
index 4a0fc06..09ba0b3 100755
--- a/bin/wikitrans
+++ b/bin/wikitrans
@@ -1,17 +1,17 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (C) 2008-2018 Sergey Poznyakoff
-#
+#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3, or (at your option)
# any later version.
-#
+#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
-#
+#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
@@ -32,13 +32,13 @@ from wikitrans.wiki2texi import TexiWikiMarkup
from wikitrans.wikimarkup import WikiMarkup
from wikitrans.wikidump import DumpWikiMarkup
-# Set utf-8 as the default encoding for Python 2.7.
-# Trying to do so using encode('utf_8')/unicode, which is
+# Set utf-8 as the default encoding for Python 2.7.
+# Trying to do so using encode('utf_8')/unicode, which is
# supposed to be the right way, does not work in Python 2.7
# Simply calling sys.setdefaultencoding is not possible,
-# because, for some obscure reason, Python chooses to delete
-# this symbol from the namespace after setting its default
-# encoding in site.py. That's why reload is needed.
+# because, for some obscure reason, Python chooses to delete
+# this symbol from the namespace after setting its default
+# encoding in site.py. That's why reload is needed.
try:
reload(sys)
sys.setdefaultencoding('utf-8')
@@ -71,8 +71,13 @@ def setkw(option, opt, value, parser):
if val:
parser.values.kwdict[kw] = val
+def setdebug(option, opt, value, parser):
+ if not parser.values.kwdict:
+ parser.values.kwdict = {}
+ parser.values.kwdict['debug_level'] = value
+
def getwiki(url, options):
- tmp = tempfile.NamedTemporaryFile()
+ tmp = tempfile.NamedTemporaryFile()
if sys.version_info[0] > 2:
import urllib.request
with urllib.request.urlopen(url) as u:
@@ -95,9 +100,9 @@ def getwiki(url, options):
options.kwdict['html_base'] = m.group('url') + '/wiki/'
if m.group('root') == 'wiktionary':
options.itype = 'wiktionary'
-
+
options.kwdict['text'] = text.text.encode()
-
+
def main():
usage = '%prog [OPTIONS] ARG'
version = '%prog 1.2'
@@ -106,10 +111,10 @@ If ARG looks like a URL, the wiki text to be converted will be downloaded
from that URL.
Otherwise, if --base-url is given, ARG is treated as the name of the page to
get from the WikiMedia istallation at that URL.
-Otherwise, ARG is name of the file to read wiki material from.
+Otherwise, ARG is name of the file to read wiki material from.
"""
epilog = "Report bugs to: <gray+wikitrans@gnu.org.ua>"
-
+
parser = OptionParser(usage=usage,
version=version,
description=description,
@@ -135,8 +140,8 @@ Otherwise, ARG is name of the file to read wiki material from.
default={},
help='set keyword option for the parser class constructor')
parser.add_option('-d', '--debug',
- action='store', type='int', dest='debug',
- default=0,
+ action='callback', callback=setdebug,
+ type='int', dest='kwdict',
help='set debug level (0..100)')
parser.add_option('-D', '--dump',
action='store_const', const='dump',
@@ -145,12 +150,13 @@ Otherwise, ARG is name of the file to read wiki material from.
parser.add_option('-b', '--base-url',
action='store', type='string', dest='base_url',
help='set base url')
-
+
(options, args) = parser.parse_args()
if len(args) == 1:
if options.base_url:
- getwiki(options.base_url + '/wiki/Special:Export/' + args[0], options)
+ getwiki(options.base_url + '/wiki/Special:Export/' + args[0],
+ options)
elif args[0] == '-':
options.kwdict['file'] = sys.stdin
elif re.match('^(http|ftp)s?://',args[0]):
@@ -159,7 +165,7 @@ Otherwise, ARG is name of the file to read wiki material from.
options.kwdict['filename'] = args[0]
else:
parser.error("bad number of arguments")
-
+
options.kwdict['lang'] = options.lang # FIXME
if options.otype == 'dump' and not 'indent' in options.kwdict:
@@ -167,7 +173,6 @@ Otherwise, ARG is name of the file to read wiki material from.
if options.otype in handlers:
if options.itype in handlers[options.otype]:
markup = handlers[options.otype][options.itype](**options.kwdict)
- markup.debug_level = options.debug
markup.parse()
print("%s" % str(markup))
exit(0)
@@ -178,4 +183,4 @@ Otherwise, ARG is name of the file to read wiki material from.
exit(1)
if __name__ == '__main__':
- main()
+ main()
diff --git a/wikitrans/wikimarkup.py b/wikitrans/wikimarkup.py
index 19f69e6..1e2429f 100644
--- a/wikitrans/wikimarkup.py
+++ b/wikitrans/wikimarkup.py
@@ -145,6 +145,13 @@ class WikiMarkupParser(object):
Public attributes:
+ Input:
+ debug_level -- debug verbosity level (0 - no debug info, 100 - excessively
+ copious debug messages). Default is 0.
+ strict -- if True, parser will throw exception upon encountering
+ invalid markup tag (mostly for future use)
+
+ Output:
tree -- constructed parse tree (a subclass of WikiNode)
"""
@@ -175,6 +182,7 @@ class WikiMarkupParser(object):
tags = [ 'code', 'nowiki', 'tt', 'div', 'ref', 'references' ]
debug_level = 0
+ strict = False
def dprint(self, lev, fmt, *argv):
"""If current debug level is greater than or equal to lev, print *argv
@@ -187,7 +195,7 @@ class WikiMarkupParser(object):
inline_delims = [ "''", "'''", "[", "]", "[[", "]]", "{{", "}}", "|" ]
token_class = {}
-
+
def _new_node(self, **kwarg):
return self.token_class[kwarg['type']](self, **kwarg)
@@ -374,7 +382,7 @@ class WikiMarkupParser(object):
stack.append(i)
# Redefine all non-matched tokens as TEXT
for i in stack:
- # FIXME
+ # FIXME: How to convert node to TEXT?
self.toklist[i] = self._new_node(type='TEXT',
content=str(self.toklist[i]))
@@ -492,7 +500,10 @@ class WikiMarkupParser(object):
flush()
acc['seq'].append(self.parse_inline_delim(tok))
else:
- raise UnexpectedTokenError(tok)
+ if self.strict:
+ raise UnexpectedTokenError(tok)
+ # FIXME: Another possible variant of handling this case is to
+ # convert tok to TEXT node and append it to acc['seq']
tok = self.getkn()
flush()
if acc['seq']:
@@ -976,6 +987,13 @@ class WikiMarkup(WikiMarkupParser):
media_base=URL
Base URL for media files. Default is
'http://www.mediawiki.org/xml/export-0.3'
+
+ debug_level=INT
+ debug verbosity level (0 - no debug info, 100 - excessively
+ copious debug messages). Default is 0.
+ strict=BOOL
+ Strict parsing mode. Throw exceptions on syntax errors. Default
+ is False.
"""
self.token_class = {
'NIL': WikiNode,
@@ -999,14 +1017,17 @@ class WikiMarkup(WikiMarkupParser):
'LINK': WikiSeqNode,
'HDR': WikiHdrNode
}
-
+
for kw in keywords:
if kw == 'file':
self.file = keywords[kw]
elif kw == 'filename':
self.file = open(keywords[kw])
elif kw == 'text':
- self.text = keywords[kw].split("\n")
+ if sys.version_info[0] > 2:
+ self.text = keywords[kw].decode('utf-8').split("\n")
+ else:
+ self.text = keywords[kw].split("\n")
elif kw == 'lang':
self.lang = keywords[kw]
elif kw == 'html_base':
@@ -1015,6 +1036,10 @@ class WikiMarkup(WikiMarkupParser):
self.image_base = keywords[kw]
elif kw == 'media_base':
self.media_base = keywords[kw]
+ elif kw == 'strict':
+ self.strict = keywords[kw]
+ elif kw == 'debug_level':
+ self.debug_level = keywords[kw]
def __del__(self):
if self.file:
diff --git a/wikitrans/wikitoken.py b/wikitrans/wikitoken.py
index deedea8..1f81092 100644
--- a/wikitrans/wikitoken.py
+++ b/wikitrans/wikitoken.py
@@ -311,7 +311,7 @@ class WikiEnvNode(WikiContentNode):
return {
'envtype': self.envtype,
'level': self.level,
- 'content': map(lambda x: x.json_encode(), self.content)
+ 'content': [x for x in map(lambda x: x.json_encode(), self.content)]
}

Return to:

Send suggestions and report system problems to the System administrator.