diff options
author | Sergey Poznyakoff <gray@gnu.org> | 2018-08-17 17:05:32 +0300 |
---|---|---|
committer | Sergey Poznyakoff <gray@gnu.org> | 2018-08-17 17:05:32 +0300 |
commit | 0c03a5a7b40b598b88f22f46b9e9086af6c59877 (patch) | |
tree | 4c894eef1dc0d998330683d2ecab12dfca99b803 | |
parent | ad4a97d83528b00e76435d3d2674ff05a44bd398 (diff) | |
download | wikitrans-0c03a5a7b40b598b88f22f46b9e9086af6c59877.tar.gz wikitrans-0c03a5a7b40b598b88f22f46b9e9086af6c59877.tar.bz2 |
Improve bin/wikitrans
* bin/wikitrans (getwiki): Take options as second argument and
modify it directly. Deduce options.lang, options.itype, and
options.kwdict['html_base'] from the URL, when possible.
* README.rst: Update description of bin/wikitrans.
-rw-r--r-- | README.rst | 4 | ||||
-rwxr-xr-x | bin/wikitrans | 24 |
2 files changed, 15 insertions, 13 deletions
@@ -167,8 +167,10 @@ Options are: Set debug level (0..100) ``-D``, ``--dump`` Dump parse tree and exit; same as ``--type=dump``. ``-b URL``, ``--base-url=URL`` Set base url. - +Note: when using ``--base-url`` or passing URL as an argument (2nd and 3rd +use cases above), if the URL is in the 'wikipedia.org' or 'wiktionary.org' +domain, the options ``--input-type`` and ``--lang`` are set automatically. diff --git a/bin/wikitrans b/bin/wikitrans index e9ab81f..caaa885 100755 --- a/bin/wikitrans +++ b/bin/wikitrans @@ -67,13 +67,13 @@ def setkw(option, opt, value, parser): if not parser.values.kwdict: parser.values.kwdict = {} (kw,sep,val) = value.partition('=') if val: parser.values.kwdict[kw] = val -def getwiki(url): +def getwiki(url, options): tmp = tempfile.NamedTemporaryFile() if sys.version_info[0] > 2: import urllib.request with urllib.request.urlopen(url) as u: root = etree.fromstring(u.read()) else: @@ -84,13 +84,21 @@ def getwiki(url): if 'version' in root.attrib: ns['wiki'] = 'http://www.mediawiki.org/xml/export-%s/' % root.attrib['version'] text = root.find('wiki:page/wiki:revision/wiki:text',ns) if text is None: print("no page/revision/text element in the downloaded page") exit(0) - return text.text.encode() + + m = re.match('(?P<url>(?:.+://)(?P<lang>.+?)\.(?P<root>wik(?:ipedia|tionary))\.org)', url) + if m: + options.lang = m.group('lang') + options.kwdict['html_base'] = m.group('url') + '/wiki/' + if m.group('root') == 'wiktionary': + options.itype = 'wiktionary' + + options.kwdict['text'] = text.text.encode() def main(): usage = '%prog [OPTIONS] ARG' version = '%prog 1.0' description = """Translates MediaWiki documents markup to various other formats. If ARG looks like a URL, the wiki text to be converted will be downloaded @@ -138,25 +146,17 @@ Otherwise, ARG is name of the file to read wiki material from. 
help='set base url') (options, args) = parser.parse_args() if len(args) == 1: if options.base_url: - options.kwdict['text'] = getwiki(options.base_url - + '/wiki/Special:Export/' + args[0]) - m = re.match('(?:.+://)(.+?)\.(wik(?:ipedia|tionary))\.org', - options.base_url) - if m: - options.lang = m.group(1) - options.kwdict['html_base'] = options.base_url + '/wiki/' - if m.group(2) == 'wiktionary': - options.itype = 'wiktionary' + getwiki(options.base_url + '/wiki/Special:Export/' + args[0], options) elif args[0] == '-': options.kwdict['file'] = sys.stdin elif re.match('^(http|ftp)s?://',args[0]): - options.kwdict['text'] = getwiki(args[0]) + getwiki(args[0], options) else: options.kwdict['filename'] = args[0] else: parser.error("bad number of arguments") options.kwdict['lang'] = options.lang # FIXME |