#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (C) 2008-2018 Sergey Poznyakoff
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import print_function
from __future__ import unicode_literals
import sys
import re
import tempfile
import xml.etree.ElementTree as etree
from optparse import OptionParser
try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO
from WikiTrans.wiki2html import HtmlWikiMarkup, HtmlWiktionaryMarkup
from WikiTrans.wiki2text import TextWikiMarkup, TextWiktionaryMarkup
from WikiTrans.wiki2texi import TexiWikiMarkup
from WikiTrans.wikimarkup import WikiMarkup
from WikiTrans.wikidump import DumpWikiMarkup

# Set utf-8 as the default encoding for Python 2.7.
# Trying to do so using encode('utf_8')/unicode, which is
# supposed to be the right way, does not work in Python 2.7.
# Simply calling sys.setdefaultencoding is not possible,
# because, for some obscure reason, Python chooses to delete
# this symbol from the namespace after setting its default
# encoding in site.py.  That's why reload is needed.
# Python 2 only: re-import sys to regain access to setdefaultencoding()
# and switch the default encoding to UTF-8.  On Python 3 reload() is not
# a builtin (NameError) and the call is simply skipped.
try:
    reload(sys)
    sys.setdefaultencoding('utf-8')
except (NameError, AttributeError):
    pass

# Translator classes, indexed first by output type (-t/--to, -D) and
# then by input type (-I/--input-type).
handlers = {
    'dump': {
        'default': DumpWikiMarkup,
        'wiktionary': DumpWikiMarkup
    },
    'html': {
        'default': HtmlWikiMarkup,
        'wiktionary': HtmlWiktionaryMarkup
    },
    'text': {
        'default': TextWikiMarkup,
        'wiktionary': TextWiktionaryMarkup
    },
    'texi': {
        'default': TexiWikiMarkup
    }
}


def setkw(option, opt, value, parser):
    """optparse callback for -o/--option: record a KEYWORD=VALUE pair.

    The pair is stored in ``parser.values.kwdict``.  An argument without
    '=' or with an empty value part is silently ignored.

    Parameters follow the optparse callback convention; only `value`
    (the option argument) and `parser` are used.
    """
    # Replace a missing/empty dictionary with a fresh one before storing.
    if not parser.values.kwdict:
        parser.values.kwdict = {}
    (kw, sep, val) = value.partition('=')
    if val:
        parser.values.kwdict[kw] = val


def getwiki(url):
    """Download a MediaWiki XML export from `url` and return its wiki text.

    The text of the page/revision/text element is returned encoded with
    the default encoding (the result of ``.encode()``).  If the element
    is absent, a diagnostic is printed on stderr and the program exits
    with status 1.  An element that is present but empty yields empty
    text.
    """
    if sys.version_info[0] > 2:
        import urllib.request
        with urllib.request.urlopen(url) as u:
            root = etree.fromstring(u.read())
    else:
        import urllib
        # The temporary file is needed only by urlretrieve; the context
        # manager closes (and thereby removes) it as soon as it is parsed.
        with tempfile.NamedTemporaryFile() as tmp:
            urllib.urlretrieve(url, tmp.name)
            root = etree.parse(tmp.name).getroot()

    # The export namespace URI is derived from the root element's
    # "version" attribute, when there is one.
    ns = {'wiki': ''}
    if 'version' in root.attrib:
        ns['wiki'] = ('http://www.mediawiki.org/xml/export-%s/'
                      % root.attrib['version'])

    text = root.find('wiki:page/wiki:revision/wiki:text', ns)
    if text is None:
        print("no page/revision/text element in the downloaded page",
              file=sys.stderr)
        sys.exit(1)
    # NOTE(review): on Python 3 this returns bytes, not str -- confirm
    # that the markup classes accept bytes for their 'text' keyword.
    return (text.text or '').encode()


def main():
    """Parse the command line, run the selected translator, print the result.

    Exits with status 0 on success and 1 if the requested output/input
    type combination is not available in `handlers`.  A wrong number of
    arguments is reported through ``parser.error``.
    """
    usage = '%prog [OPTIONS] ARG'
    version = '%prog 1.0'
    description = """Translates MediaWiki documents markup to various other formats.
If ARG looks like a URL, the wiki text to be converted will be downloaded
from that URL.
Otherwise, if --base-url is given, ARG is treated as the name of the page to
get from the WikiMedia istallation at that URL.
Otherwise, ARG is name of the file to read wiki material from.
"""
    epilog = "Report bugs to: "

    parser = OptionParser(usage=usage,
                          version=version,
                          description=description,
                          epilog=epilog)
    parser.add_option('-v', '--verbose',
                      action="count", dest="verbose",
                      help="verbose operation")
    parser.add_option('-I', '--input-type',
                      action='store', type='string', dest='itype',
                      default='default',
                      help='set input document type ("default" or "wiktionary")')
    parser.add_option('-t', '--to', '--type',
                      action='store', type='string', dest='otype',
                      default='html',
                      help='set output document type ("html" (default), "texi" or "text")')
    parser.add_option('-l', '--lang',
                      action='store', type='string', dest='lang',
                      default='pl',
                      help='set input document language')
    parser.add_option('-o', '--option',
                      action='callback', callback=setkw,
                      type='string', dest='kwdict',
                      default={},
                      help='set keyword option for the parser class')
    parser.add_option('-d', '--debug',
                      action='store', type='int', dest='debug',
                      default=0,
                      help='set debug level (0..100)')
    parser.add_option('-D', '--dump',
                      action='store_const', const='dump', dest='otype',
                      help='dump parse tree and exit; similar to --type=dump')
    parser.add_option('-b', '--base-url',
                      action='store', type='string', dest='base_url',
                      help='set base url')

    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("bad number of arguments")

    # Decide where the wiki text comes from.  --base-url takes
    # precedence over the "-" (stdin) and URL forms of ARG.
    arg = args[0]
    if options.base_url:
        options.kwdict['text'] = getwiki(options.base_url
                                         + '/wiki/Special:Export/'
                                         + arg)
    elif arg == '-':
        options.kwdict['file'] = sys.stdin
    elif re.match(r'^(http|ftp)s?://', arg):
        options.kwdict['text'] = getwiki(arg)
    else:
        options.kwdict['filename'] = arg

    options.kwdict['lang'] = options.lang
    # FIXME
    if options.otype == 'dump' and 'indent' not in options.kwdict:
        options.kwdict['indent'] = 2

    # Diagnostics go to stderr so they never mix with converted output.
    by_input = handlers.get(options.otype)
    if by_input is None:
        print("unsupported output type: %s" % options.otype,
              file=sys.stderr)
        sys.exit(1)

    markup_class = by_input.get(options.itype)
    if markup_class is None:
        print("input type %s is not supported for %s output"
              % (options.itype, options.otype),
              file=sys.stderr)
        sys.exit(1)

    markup = markup_class(**options.kwdict)
    markup.debug_level = options.debug
    markup.parse()
    print("%s" % str(markup))
    sys.exit(0)


if __name__ == '__main__':
    main()