diff options
author | Sergey Poznyakoff <gray@gnu.org> | 2018-08-26 23:16:31 +0300 |
---|---|---|
committer | Sergey Poznyakoff <gray@gnu.org> | 2018-08-26 23:16:31 +0300 |
commit | 5d25aa3815a470ae497ff313d15ba5b9270d6d71 (patch) | |
tree | ecae8b41be4b43877fa0f6f3a79162e7eb0451ed | |
parent | 06fcb57554c4bd401b255ec887bd1970156ec3fa (diff) | |
download | dico-5d25aa3815a470ae497ff313d15ba5b9270d6d71.tar.gz dico-5d25aa3815a470ae497ff313d15ba5b9270d6d71.tar.bz2 |
Rewrite mediawiki.py to work with both Python versions
* app/python/mediawiki.py: Rewritten to work with Python 2.7 and 3.x
* app/python/mediawiki2.py: Remove
-rw-r--r-- | app/python/mediawiki.py | 46 | ||||
-rw-r--r-- | app/python/mediawiki2.py | 154 |
2 files changed, 32 insertions, 168 deletions
diff --git a/app/python/mediawiki.py b/app/python/mediawiki.py index 6c8fb52..b9586c7 100644 --- a/app/python/mediawiki.py +++ b/app/python/mediawiki.py @@ -17,14 +17,30 @@ # You should have received a copy of the GNU General Public License # along with GNU Dico. If not, see <http://www.gnu.org/licenses/>. +from __future__ import print_function import sys import re import socket -import urllib.request, urllib.error, urllib.parse -from html.entities import name2codepoint from xml.dom import minidom from wikitrans.wiki2text import TextWiktionaryMarkup -import imp + +if sys.version_info[0] > 2: + from urllib.request import urlopen, Request + from urllib.error import URLError + from urllib.parse import quote as url_quote + from html.entities import name2codepoint +else: + from urllib2 import urlopen, Request, quote as url_quote, URLError + from htmlentitydefs import name2codepoint + # Set utf-8 as the default encoding. + # Trying to do so using encode('utf_8')/unicode, which is + # supposed to be the right way, does not work. + # Simply calling sys.setdefaultencoding is not possible, + # because, for some obscure reason, Python chooses to delete + # this symbol from the namespace after setting its default + # encoding in site.py. That's why reload is needed. + reload(sys) + sys.setdefaultencoding('utf-8') try: import json @@ -60,12 +76,12 @@ class DicoModule: def define_word (self, word): url = 'http://%s%s%s' % (self.wikihost, self.endpoint_define, - urllib.parse.quote (word)) - req = urllib.request.Request (url) + url_quote (word)) + req = Request (url) req.add_header ('User-Agent', self.user_agent) try: - xml = urllib.request.urlopen (req).read () - except urllib.error.URLError: + xml = urlopen (req).read () + except URLError: return False dom = minidom.parseString (xml) el = dom.getElementsByTagName ('text') @@ -73,6 +89,8 @@ class DicoModule: data = el[0].firstChild.data if dico.current_markup () != 'wiki': data = self.__htmlentitydecode (data) + if sys.version_info[0] == 2: + data = data.encode ('utf-8') wikiparser = TextWiktionaryMarkup (text=data) wikiparser.parse () data = str (wikiparser) @@ -82,11 +100,11 @@ class DicoModule: def match_word (self, strat, key): url = 'http://%s%s%s' % (self.wikihost, self.endpoint_match, - urllib.parse.quote (key.word)) - req = urllib.request.Request (url) + url_quote (key.word)) + req = Request (url) req.add_header ('User-Agent', self.user_agent) try: - result = json.load (urllib.request.urlopen (req)) + result = json.load (urlopen (req)) if result: if strat.has_selector: fltres = [] @@ -94,12 +112,12 @@ class DicoModule: if strat.select (k, key): fltres.append (k) if len(fltres) > 0: - return ['match', sorted(fltres, key=str.lower)] + return ['match', sorted(fltres, key=unicode.lower)] else: result[1].sort () - return ['match', sorted(result[1], key=str.lower)] + return ['match', sorted(result[1], key=unicode.lower)] return False - except urllib.error.URLError: + except URLError: return False def output (self, rh, n): @@ -136,4 +154,4 @@ class DicoModule: def __htmlentitydecode (self, s): return re.sub ('&(%s);' % '|'.join (name2codepoint), - lambda m: chr (name2codepoint[m.group (1)]), s) + lambda m: unichr (name2codepoint[m.group (1)]), s) diff --git a/app/python/mediawiki2.py b/app/python/mediawiki2.py deleted file mode 100644 index 4d2a227..0000000 --- a/app/python/mediawiki2.py +++ /dev/null @@ -1,154 +0,0 @@ -# -*- coding: utf-8 -*- -# Mediawiki module for Python 2 -# This file is part of GNU Dico. -# Copyright (C) 2008-2010, 2012 Wojciech Polak -# -# GNU Dico is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3, or (at your option) -# any later version. -# -# GNU Dico is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with GNU Dico. If not, see <http://www.gnu.org/licenses/>. - -import sys -import re -import socket -import urllib2 -from htmlentitydefs import name2codepoint -from xml.dom import minidom -from wikitrans.wiki2text import TextWiktionaryMarkup - -# Set utf-8 as the default encoding. -# Trying to do so using encode('utf_8')/unicode, which is -# supposed to be the right way, does not work. -# Simply calling sys.setdefaultencoding is not possible, -# because, for some obscure reason, Python chooses to delete -# this symbol from the namespace after setting its default -# encoding in site.py. That's why reload is needed. - -reload(sys) -sys.setdefaultencoding('utf-8') - -try: - import json -except ImportError: - import simplejson as json - -import dico - -__version__ = '1.03' - -class DicoModule: - user_agent = 'Mozilla/1.0' - endpoint_match = '/w/api.php?action=opensearch&format=json&search=' - endpoint_define = '/wiki/Special:Export/' - - def __init__ (self, *argv): - self.wikihost = argv[0] - socket.setdefaulttimeout (4) - dico.register_markup ('wiki') - - def open (self, dbname): - self.dbname = dbname - return True - - def close (self): - return True - - def descr (self): - return self.wikihost - - def info (self): - return False - - def define_word (self, word): - url = 'http://%s%s%s' % (self.wikihost, self.endpoint_define, - urllib2.quote (word)) - req = urllib2.Request (url) - req.add_header ('User-Agent', self.user_agent) - try: - xml = urllib2.urlopen (req).read () - except urllib2.URLError: - return False - dom = minidom.parseString (xml) - el = dom.getElementsByTagName ('text') - if len (el): - data = el[0].firstChild.data - if dico.current_markup () != 'wiki': - data = self.__htmlentitydecode (data).encode ('utf_8') - wikiparser = TextWiktionaryMarkup (text=data) - wikiparser.parse () - data = str (wikiparser) - return ['define', data] - else: - return False - - def match_word (self, strat, key): - url = 'http://%s%s%s' % (self.wikihost, self.endpoint_match, - urllib2.quote (key.word)) - req = urllib2.Request (url) - req.add_header ('User-Agent', self.user_agent) - try: - result = json.load (urllib2.urlopen (req)) - if result: - if strat.has_selector: - fltres = [] - for k in result[1]: - if strat.select (k, key): - fltres.append (k) - if fltres.count > 0: - return ['match', sorted(fltres, key=unicode.lower)] - else: - result[1].sort () - return ['match', sorted(result[1], key=unicode.lower)] - return False - except urllib2.URLError: - return False - - def output (self, rh, n): - if rh[0] == 'define': - try: - print rh[1].encode ('utf_8'), - except UnicodeDecodeError: - print rh[1], - else: - list = rh[1] - sys.stdout.softspace = 0 - try: - print list[n].encode ('utf_8'), - except UnicodeDecodeError: - print list[n], - return True - - def result_count (self, rh): - if rh[0] == 'define': - return 1 - else: - return len (rh[1]) - - def compare_count (self, rh): - return 1 - - def result_headers (self, rh, hdr): - if dico.current_markup () != 'wiki': - hdr['Content-Type'] = 'text/plain'; - elif '.wikipedia.org' in self.wikihost: - hdr['Content-Type'] = 'text/x-wiki-wikipedia'; - elif '.wiktionary.org' in self.wikihost: - hdr['Content-Type'] = 'text/x-wiki-wiktionary'; - else: - hdr['Content-Type'] = 'text/x-wiki'; - return hdr - - def free_result (self, rh): - pass - - def __htmlentitydecode (self, s): - return re.sub ('&(%s);' % '|'.join (name2codepoint), - lambda m: unichr (name2codepoint[m.group (1)]), s) |