From 0c03a5a7b40b598b88f22f46b9e9086af6c59877 Mon Sep 17 00:00:00 2001 From: Sergey Poznyakoff Date: Fri, 17 Aug 2018 17:05:32 +0300 Subject: Improve bin/wikitrans * bin/wikitrans (getwiki): Take options as second argument and modify it directly. Deduce options.lang, options.itype, and options.kwdict['html_base'] from the URL, when possible. * README.rst: Update description of bin/wikitrans --- README.rst | 4 +++- bin/wikitrans | 24 ++++++++++++------------ 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/README.rst b/README.rst index b6863bc..d189c98 100644 --- a/README.rst +++ b/README.rst @@ -170,5 +170,7 @@ Options are: ``-b URL``, ``--base-url=URL`` Set base url. - +Note: when using ``--base-url`` or passing URL as an argument (2nd and 3rd +use cases above), if the URL is in 'wikipedia.org' or 'wiktionary.org' +domain, the options ``--input-type``, and ``--lang`` are set automatically. diff --git a/bin/wikitrans b/bin/wikitrans index e9ab81f..caaa885 100755 --- a/bin/wikitrans +++ b/bin/wikitrans @@ -70,7 +70,7 @@ def setkw(option, opt, value, parser): if val: parser.values.kwdict[kw] = val -def getwiki(url): +def getwiki(url, options): tmp = tempfile.NamedTemporaryFile() if sys.version_info[0] > 2: import urllib.request @@ -87,7 +87,15 @@ def getwiki(url): if text is None: print("no page/revision/text element in the downloaded page") exit(0) - return text.text.encode() + + m = re.match('(?P<url>(?:.+://)(?P<lang>.+?)\.(?P<root>wik(?:ipedia|tionary))\.org)', url) + if m: + options.lang = m.group('lang') + options.kwdict['html_base'] = m.group('url') + '/wiki/' + if m.group('root') == 'wiktionary': + options.itype = 'wiktionary' + + options.kwdict['text'] = text.text.encode() def main(): usage = '%prog [OPTIONS] ARG' @@ -141,19 +149,11 @@ Otherwise, ARG is name of the file to read wiki material from. 
(options, args) = parser.parse_args() if len(args) == 1: if options.base_url: - options.kwdict['text'] = getwiki(options.base_url - + '/wiki/Special:Export/' + args[0]) - m = re.match('(?:.+://)(.+?)\.(wik(?:ipedia|tionary))\.org', - options.base_url) - if m: - options.lang = m.group(1) - options.kwdict['html_base'] = options.base_url + '/wiki/' - if m.group(2) == 'wiktionary': - options.itype = 'wiktionary' + getwiki(options.base_url + '/wiki/Special:Export/' + args[0], options) elif args[0] == '-': options.kwdict['file'] = sys.stdin elif re.match('^(http|ftp)s?://',args[0]): - options.kwdict['text'] = getwiki(args[0]) + getwiki(args[0], options) else: options.kwdict['filename'] = args[0] else: -- cgit v1.2.1