#!/usr/bin/python # -*- coding: utf-8 -*- # Copyright (C) 2008, 2009, 2015 Sergey Poznyakoff # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . from __future__ import print_function import sys import re from types import * __all__ = [ "BaseWikiMarkup", "WikiMarkup", "TagAttributes", "TagAttributeSyntax" ] class TagAttributeSyntax(Exception): def __init__(self, value): self.value = value def __str__(self): return repr(self.value) class TagAttributes(object): attrstart = re.compile("^(?P[a-zA-Z0-9_-]+)(?P=\")?") valseg = re.compile("^[^\\\"]+") tab = {} printable = None def __init__(self, string): if not string: self.printable = '' return self.printable = string s = string self.tab = {} while s != '': s = s.strip() m = self.attrstart.match(s) if m: name = m.group('attr') val = '' s = s[m.end(0):] if m.group('eq'): while 1: m = self.valseg.match(s) val += m.group(0) s = s[m.end(0):] if s[0] == '\\': val += s[1] s += 2 elif s[0] == '"': s = s[1:] break else: val = 1 self.tab[name] = val else: raise TagAttributeSyntax(s) def __len__(self): return len(self.tab) def __getitem__(self, key): return self.tab[key] def __contains__(self, key): return key in self.tab def __iter__(self): for key in self.tab: yield(key) def has_key(self, key): return self.__contains__(key) def __setitem__(self, key, value): self.tab[key] = value def __delitem__(self, key): del self.tab[key] def __str__(self): return self.printable def __repr__(self): return self.printable class BaseWikiMarkup(object): delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)|<") otag = re.compile("<(?P[a-zA-Z0-9_]+)(?:\s+(?P[^>]+))?\s*(?P/)?>") ctag = re.compile("[a-zA-Z0-9_]+)\s*>") refstart = re.compile("^https?://") close_delim = { '[': ']', '[[': ']]', '{{': '}}' } # Environment types: envtypes = { "*": [ "unnumbered", 0 ], "#": [ "numbered", 0 ], ";": [ "defn", 0 ], ":": [ "defn", 1 ] } toklist = None tokind = 0 newline = 0 tree = None tags = [ 'code', 'nowiki', 'tt', 'div' ] nested = 0 debug_level = 0 def dprint(self, lev, fmt, *argv): if self.debug_level >= lev: print("[DEBUG]", fmt % argv) def print_dump_prefix(self, level, file): file.write("[DUMP]" + ' ' * (2*level + 1)) def dump_nil(self, node, level, file): pass def dump_text(self, node, level, file): self.print_dump_prefix(level, file) file.write("CONTENT: \"%s\"\n" % node['content']) def dump_delim(self, node, level, file): file.write("'%s'" % node['content']) if 'continuation' in node and node['continuation']: file.write(" (cont)") file.write("\n") def dump_tag(self, node, level, file): self.print_dump_prefix(level, file) file.write("TAG: %s\n" % node['tag']) if 'args' in node: self.print_dump_prefix(level, file) file.write("ARGS: %s\n" % node['args']) if 'content' in node: self.dump_node(node['content'], level + 1, file) def dump_seq(self, node, level, file): self.dump(node['content'], level + 1, file) def dump_ref(self, node, level, file): self.print_dump_prefix(level, file) file.write("REF: %s\n" % node['ref']) self.dump_node(node['content'], level + 1, file) def dump_hdr(self, node, level, file): self.print_dump_prefix(level, file) file.write("LEVEL: %s\n" % node['level']) self.dump_node(node['content'], level + 1, file) def dump_elt(self, node, level, file): self.print_dump_prefix(level, file) file.write("SUBTYPE: %s\n" % node['subtype']) self.dump_node(node['content'], level + 1, file) def dump_env(self, node, level, file): self.print_dump_prefix(level, file) file.write("ENVTYPE: %s\n" % node['envtype']) self.print_dump_prefix(level, file) file.write("LEVEL: %s\n" % node['level']) self.dump(node['content'], level + 1, file) def dump_ind(self, node, level, file): self.print_dump_prefix(level, file) file.write("LEVEL: %s\n" % node['level']) self.dump_node(node['content'], level + 1, file) def dump_link(self, node, level, file): self.dump(node['content'], level + 1, file) dump_type = { 'NIL': dump_nil, 'NL': dump_nil, 'TEXT': dump_text, 'DELIM': dump_delim, 'OTAG': dump_tag, 'CTAG': dump_tag, 'TAG': dump_tag, 'SEQ': dump_seq, 'REF': dump_ref, 'HDR': dump_hdr, 'ELT': dump_elt, 'ENV': dump_env, 'IND': dump_ind, 'BAR': dump_nil, 'PARA': dump_seq, 'PRE': dump_text, 'BOLD': dump_seq, 'IT': dump_seq, 'LINK': dump_link, } def dump_node(self, node, level, file): if type(node) != dict: file.write("UNHANDLED NODE: %s, %s\n" % (type(node),node)) return self.print_dump_prefix(level, file) file.write("NODE " + node['type'] + ":\n") if node['type'] in self.dump_type: self.dump_type[node['type']](self, node, level, file) else: self.print_dump_prefix(level, file) file.write("(UNHANDLED) ") file.write("%s\n" % node) self.print_dump_prefix(level, file) file.write("END NODE " + node['type'] + "\n") def dump(self, tree, level=0, file=sys.stdout): for node in tree: self.dump_node(node, level, file) def tokread(self): line = None pos = 0 while 1: if (not line or pos == len(line)): try: line = self.input() pos = 0 except StopIteration: line = u'' if not line or line == "": yield({ 'type': 'NIL' }) break if line == '\n': yield({ 'type': 'NL', 'content': line }) line = None continue self.dprint(100, "LINE: %s", line[pos:]) m = self.delim.search(line, pos) if m: if (pos < m.start(0)): yield({'type': 'TEXT', 'content': line[pos:m.start(0)]}) pos = m.start(0) t = None if line[m.start(0)] == '<': m = self.otag.match(line, pos) if m: pos = m.end(0) if m.group('tag') == 'nowiki': if not m.group('closed'): while 1: try: m = self.ctag.match(line) if m and m.group('tag') == 'nowiki': yield({ 'type': 'TEXT', 'content': line[pos:m.start(0)] }) pos = m.end(0) break yield({ 'type': 'TEXT', 'content': line[pos:] }) line = self.input() pos = 0 except StopIteration: break continue elif m.group('tag') in self.tags: try: t = { 'type': 'OTAG', 'tag': m.group('tag'), 'args': TagAttributes(m.group('args')) } yield(t) if m.group('closed'): t['type'] = 'CTAG' yield(t) except TagAttributeSyntax: yield({'type': 'TEXT', 'content': m.group(0)}) continue else: m = self.ctag.match(line, pos) if m: if m.group('tag') in self.tags: yield( { 'type': 'CTAG', 'tag': m.group('tag') } ) pos = m.end(0) continue else: yield( { 'type': 'TEXT', 'content': line[pos:pos+1] }) pos += 1 continue else: pos = m.end(0) content = m.group(0) if content[0] in self.envtypes: t = { 'type': 'DELIM', 'content': content, 'continuation': pos < len(line) and line[pos] == ":" } if t['continuation']: t['content'] += t['content'][0] pos += 1 yield(t) while pos < len(line) and line[pos] in [' ', '\t']: pos += 1 else: yield({ 'type': 'DELIM', 'content': content, 'continuation': False}) continue if line: if line[-1] == '\n': if line[pos:-1] != '': yield({ 'type': 'TEXT', 'content': line[pos:-1] }) yield({ 'type': 'NL', 'content': '\n' }) else: yield({ 'type': 'TEXT', 'content': line[pos:] }) line = None def input(self): return None def swaptkn(self, i, j): self.dprint(80, "SWAPPING %s <-> %s", i, j) x = self.toklist[i] self.toklist[i] = self.toklist[j] self.toklist[j] = x def tokenize(self): self.toklist = [] for tok in self.tokread(): self.dprint(100, "TOK: %s", tok) self.toklist.append(tok) # Determine and fix up the ordering of bold and italic markers # There are three possible cases: # # 1a. '''a b ''c'' d''' # 1b. ''a b '''c''' d'' # # 2a. '''''a b'' c d''' # 2b. '''''a b''' c d'' # # 3a. '''a b ''c d''''' # 3b. ''a b '''c d''''' stack = [] for i in range(0,len(self.toklist)): if self.toklist[i]['type'] == 'DELIM' \ and (self.toklist[i]['content'] == "''" \ or self.toklist[i]['content'] == "'''"): if len(stack) > 0: if self.toklist[stack[-1]]['content'] == self.toklist[i]['content']: # Case 1: just pop the matching delimiter off the stack stack.pop() elif len(stack) == 2 and stack[-2] + 1 == stack[-1]: # Case 2: swap delimiters saved on stack ... self.swaptkn(stack[-2], stack[-1]) # and pop off the matching one stack.pop() elif i < len(self.toklist) \ and self.toklist[i+1]['type'] == 'DELIM' \ and self.toklist[stack[-1]]['content'] == self.toklist[i+1]['content']: # Case 3: swap current and next tokens self.swaptkn(i, i+1) # and pop off the matching one stack.pop() else: # Push the token on stack stack.append(i) else: # Push the token on stack stack.append(i) # Redefine all non-matched tokens as TEXT for i in stack: self.toklist[i]['type'] = 'TEXT' def peektkn(self, off=0): return self.toklist[self.tokind-off] def setkn(self,val): self.toklist[self.tokind] = val def getkn(self): self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL' if self.tokind == len(self.toklist): return { 'type': 'NIL' } tok = self.toklist[self.tokind] self.tokind = self.tokind + 1 return tok def ungetkn(self): self.tokind = self.tokind - 1 self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL' return self.toklist[self.tokind] def parse_fontmod(self,delim,what): self.dprint(80, "ENTER parse_fontmod(%s,%s), tok %s", delim, what, self.peektkn()) seq = [] text = '' while 1: tok = self.getkn() if tok['type'] == 'TEXT': text += tok['content'] elif tok['type'] == 'DELIM': if tok['content'] == delim: break elif self.is_inline_delim(tok): if text: seq.append({ 'type': 'TEXT', 'content': text }) text = '' x = self.parse_inline(tok) if x: seq.append(x) else: self.dprint(80, "LEAVE parse_fontmod=%s", "None") return None else: self.dprint(80, "LEAVE parse_fontmod=None") return None elif tok['type'] == 'NL': if self.peektkn()['type'] == 'NL': self.dprint(80, "LEAVE parse_fontmod=None") return None seq.append({ 'type': 'TEXT', 'content': '\n' }) else: self.dprint(80, "LEAVE parse_fontmod=None") return None if text: seq.append({ 'type': 'TEXT', 'content': text }) res = { 'type': what, 'content': seq } self.dprint(80, "LEAVE parse_fontmod=%s", res) return res def parse_link(self, type, delim): self.dprint(80, "ENTER parse_link(%s,%s), tok %s", type, delim, self.peektkn()) subtree = [] list = [] while 1: tok = self.getkn() if tok['type'] == 'DELIM': if tok['content'] == delim: if list: subtree.append({ 'type': 'SEQ', 'content': list }) break elif tok['content'] == "|": if len(list) > 1: subtree.append({ 'type': 'SEQ', 'content': list }) elif list: subtree.append(list[0]) list = [] else: x = self.parse_inline(tok) if x: list.append(x) else: self.dprint(80, "LEAVE parse_link=%s", "None") return None elif tok['type'] == 'TEXT': list.append(tok) else: self.dprint(80, "LEAVE parse_link=%s", "None") return None self.dprint(80, "LEAVE parse_link=(%s,%s)", type, subtree) return { 'type': type, 'content': subtree } def parse_ref(self): tok = self.getkn() self.dprint(80, "ENTER parse_ref, tok %s", tok) if not (tok['type'] == 'TEXT' and self.refstart.match(tok['content'])): self.dprint(80, "LEAVE parse_ref=None") return None seq = [] (ref,sep,text) = tok['content'].partition(' ') if text: seq.insert(0, {'type': 'TEXT', 'content': text }) while 1: tok = self.getkn() if tok == None or tok['type'] == 'NIL': self.dprint(80, "LEAVE parse_ref=None") return None if tok['type'] == 'DELIM': if tok['content'] == ']': break else: tok = self.parse_inline(tok) if tok: seq.append(tok) else: self.dprint(80, "LEAVE parse_ref=None") return None elif tok['type'] == 'OTAG': list.append(self.parse_til(tok)) else: seq.append(tok) ret = { 'type': 'REF', 'ref': ref, 'content': { 'type': 'SEQ', 'content': seq } } self.dprint(80, "LEAVE parse_ref= %s", ret) return ret inline_delims = [ "''", "'''", "[", "[[", "{{", "|" ] def is_inline_delim(self, tok): return tok['type'] == 'DELIM' and tok['content'] in self.inline_delims def is_block_delim(self, tok): return tok['type'] == 'DELIM' and tok['content'] not in self.inline_delims def parse_inline(self, tok): self.dprint(80, "ENTER parse_inline(%s), tok %s", tok, self.peektkn()) tokind = self.tokind if tok['content'] == "''": x = self.parse_fontmod(tok['content'], 'IT') elif tok['content'] == "'''": x = self.parse_fontmod(tok['content'], 'BOLD') elif tok['content'] == "[": x = self.parse_ref() elif tok['content'] == "[[": x = self.parse_link('LINK', "]]") elif tok['content'] == "{{": x = self.parse_link('TMPL', "}}") else: self.dprint(80, "LEAVE parse_inline=%s (unhandled delimiter)", "None") x = None if not x: self.tokind = tokind tok['type'] = 'TEXT' self.dprint(80, "BEGIN DELIMITER RECOVERY: %s", tok) od = tok['content'] if od in self.close_delim: cd = self.close_delim[od] lev = 0 for tok in self.toklist[self.tokind+1:]: if tok['type'] == 'NIL': break elif tok['type'] == 'DELIM': if tok['content'] == od: lev += 1 elif tok['content'] == cd: if lev == 0: tok['type'] = 'TEXT' lev -= 1 break self.dprint(80, "END DELIMITER RECOVERY: %s", tok) self.dprint(80, "LEAVE parse_inline=%s", x) return x def parse_para(self): self.dprint(80, "ENTER parse_para, tok %s", self.peektkn()) seq = [] textlist = [] tok = self.peektkn() if self.newline: if re.match("^\s", tok['content']): type = 'PRE' rx = re.compile("^\S") else: type = 'PARA' rx = re.compile("^\s") else: type = 'SEQ' rx = None while 1: tok = self.getkn() if tok['type'] == 'TEXT': if rx and self.newline and rx.match(tok['content']): self.ungetkn() break textlist.append(tok['content']) elif tok['type'] == 'NL': tok = self.getkn() if tok['type'] == 'NL' or tok['type'] == 'NIL': break else: self.ungetkn() if self.is_block_delim(tok): break textlist.append('\n') elif tok['type'] == 'NIL': break elif tok['type'] == 'OTAG' or tok['type'] == 'CTAG' or tok['type'] == 'TAG': self.ungetkn() break elif tok['type'] == 'DELIM': if self.is_inline_delim(tok): if textlist: seq.append({ 'type': 'TEXT', 'content': ''.join(textlist) }) textlist = [] x = self.parse_inline(tok) if x: seq.append(x) else: self.ungetkn() # restart else: seq.append({ 'type': 'TEXT', 'content': tok['content'] }) # self.ungetkn() break if textlist: seq.append({ 'type': 'TEXT', 'content': ''.join(textlist) }) self.dprint(80, "LEAVE parse_para=%s", seq) return { 'type': type, 'content': seq } def parse_header(self, delim): self.dprint(80, "ENTER parse_header(%s), tok %s", delim, self.peektkn()) list = [] while 1: tok = self.getkn() if tok['type'] == 'NIL': self.dprint(80, "LEAVE parse_header=%s", "None") return None elif tok['type'] == 'TEXT': list.append(tok) elif tok['type'] == 'DELIM': if tok['content'] == delim: if self.peektkn()['type'] == 'NL': break else: self.dprint(80, "LEAVE parse_header=%s", "None") return None else: x = self.parse_inline(tok) if x: list.append(x) else: self.ungetkn() self.dprint(80, "LEAVE parse_header=%s", "None") return None #FIXME? else: self.dprint(80, "LEAVE parse_header=%s", "None") return None self.dprint(80, "LEAVE parse_header=(HDR, %s, (SEQ,%s))",len(delim)-1,list) return { 'type': 'HDR', 'level': len(delim)-1, 'content': { 'type': 'SEQ', 'content': list } } def parse_line(self): self.dprint(80, "ENTER parse_line, tok %s", self.peektkn()) list = [] while 1: tok = self.getkn() if tok['type'] == 'NL' or tok['type'] == 'NIL': break elif tok['type'] == 'TEXT': list.append(tok) elif tok['type'] == 'DELIM': if tok['content'][0] == ":": list.append(self.parse_indent(len(tok['content']))) break else: x = self.parse_inline(tok) if x: list.append(x) else: list.append(tok) elif tok['type'] == 'OTAG': list.append(self.parse_til(tok)) else: list.append(tok) self.dprint(80, "LEAVE parse_line=(SEQ, %s)", list) return { 'type': 'SEQ', 'content': list } def parse_env(self, type, lev): self.dprint(80, "ENTER parse_env(%s,%s), tok %s",type,lev,self.peektkn()) list = [] while 1: tok = self.getkn() if tok['type'] == 'DELIM' \ and tok['content'][0] in self.envtypes \ and type == self.envtypes[tok['content'][0]][0]: if len(tok['content']) < lev: self.ungetkn() break elif len(tok['content']) > lev: self.ungetkn() elt = self.parse_env(type, len(tok['content'])) else: elt = self.parse_line() if not tok['continuation']: list.append({ 'type': 'ELT', 'subtype': self.envtypes[tok['content'][0]][1], 'content': elt }) continue if list: if list[-1]['content']['type'] != 'SEQ': x = list[-1]['content']['content'] # FIXME: list[-1]['content'] = { 'type': 'SEQ', 'content': [x] } list[-1]['content']['content'].append(elt) else: self.ungetkn() break self.dprint(80, "LEAVE parse_env=(ENV, %s, %s, %s)", type, lev, list) return { 'type': 'ENV', 'envtype': type, 'level': lev, 'content': list } def parse_indent(self, lev): self.dprint(80, "ENTER parse_indent(%s), tok %s", lev, self.peektkn()) x = { 'type': 'IND', 'level': lev, 'content': self.parse_line() } self.dprint(80, "LEAVE parse_indent=%s", x) return x def parse_til(self, tag): self.dprint(80, "ENTER parse_til(%s)", tag) seq = [] save = self.tokind while 1: t = self.parse0() if t == None or t['type'] == 'NIL': self.tokind = save s = '<' + tag['tag'] if 'args' in tag and tag['args']: s += ' ' + str(tag['args']) del tag['args'] s += '>' if 'content' in tag: subtree = tag['content'] else: subtree = None tag['type'] = 'TEXT' tag['content'] = s if subtree: self.tree[self.tokind:self.tokind] = subtree self.dprint(80, "LEAVE parse_til = %s (tree modified)", tag) self.ungetkn() return self.parse0() if t['type'] == 'CTAG' and tag['tag'] == t['tag']: break seq.append(t) ret = { 'type': 'TAG', 'tag': tag['tag'], 'args': tag['args'], 'content': { 'type': 'SEQ', 'content': seq } } self.dprint(80, "LEAVE parse_til = %s", ret) return ret def parse0(self): tok = self.getkn() self.dprint(80, "ENTER parse0(%s)", tok) toktype = tok['type'] if toktype == 'NIL': return None elif toktype == 'TEXT': self.ungetkn() return self.parse_para() elif toktype == 'DELIM': if tok['content'] == "----": return { 'type': 'BAR' } elif tok['content'][0:2] == "==": return self.parse_header(tok['content']) elif tok['content'][0] in self.envtypes: type = self.envtypes[tok['content'][0]][0] lev = len(tok['content']) if tok['content'][0] == ':': t = self.peektkn(2) if not (t['type'] == 'DELIM' and t['content'] == ';'): return self.parse_indent(lev) self.ungetkn() return self.parse_env(type, lev) else: self.ungetkn() return self.parse_para() elif toktype == 'NL': return { 'type': 'TEXT', 'content': '\n' } elif toktype == 'OTAG': return self.parse_til(tok) else: return tok def parse(self): if not self.toklist: self.tokenize() if self.debug_level >= 90: print("TOKEN DUMP BEGIN") self.dump(self.toklist) print("TOKEN DUMP END") self.tokind = 0 self.tree = [] while 1: subtree = self.parse0() if subtree == None: break self.tree.append(subtree) if self.nested: if self.tree[0]['type'] == 'PARA': self.tree[0]['type'] = 'SEQ' if self.debug_level >= 70: print("TREE DUMP BEGIN") self.dump(self.tree) print("TREE DUMP END") def __str__(self): return str(self.tree) class WikiMarkup (BaseWikiMarkup): """ A derived class, that supplies a basic input method. Three types of inputs are available: 1. filename= The file is opened and used for input. 2. file= The already opened file is used for input. 3. text= Input is taken from , line by line. Usage: obj = WikiMarkup(arg=val) obj.parse ... Do whatever you need with obj.tree ... """ file = None text = None lang = 'en' html_base = 'http://%(lang)s.wiktionary.org/wiki/' image_base = 'http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf' media_base = 'http://www.mediawiki.org/xml/export-0.3' def __init__(self, *args, **keywords): for kw in keywords: if kw == 'file': self.file = keywords[kw] elif kw == 'filename': self.file = open(keywords[kw]) elif kw == 'text': self.text = keywords[kw].decode("utf-8").split("\n") elif kw == 'lang': self.lang = keywords[kw] elif kw == 'html_base': self.html_base = keywords[kw] elif kw == 'image_base': self.image_base = keywords[kw] elif kw == 'media_base': self.media_base = keywords[kw] elif kw == 'nested': self.nested = keywords[kw] def __del__(self): if self.file: self.file.close() def input(self): if self.file: return self.file.readline() elif self.text: return self.text.pop(0) + '\n' else: return None def is_lang_link(self, elt): if elt['type'] == 'LINK' \ and isinstance(elt['content'], list) \ and len(elt['content']) == 1: if elt['content'][0]['type'] == TEXT: m = re.match('([\w-]+):', elt['content'][0]['content']) if m: # and m.group(1) in self.langtab: return True elif elt['content'][0]['type'] == 'SEQ' \ and len(elt['content'][0]['content']) == 1 and\ elt['content'][0]['content'][0]['type'] == TEXT: m = re.match('([\w-]+):',elt['content'][0]['content'][0]['content']) if m: # and m.group(1) in self.langtab: return True return False def is_empty_text(self, elt): if elt['type'] == 'TEXT': if re.search('\w', elt['content']): return False return True return False def is_empty_para(self, seq): for x in seq: if not (self.is_lang_link(x) or self.is_empty_text(x)): return False return True # ISO 639 langtab = { "aa": "Afar", # Afar "ab": "Аҧсуа", # Abkhazian "ae": None, # Avestan "af": "Afrikaans", # Afrikaans "ak": "Akana", # Akan "als": "Alemannisch", "am": "አማርኛ", # Amharic "an": "Aragonés", # Aragonese "ang": "Englisc", "ar": "العربية" , # Arabic "arc": "ܐܪܡܝܐ", "as": "অসমীয়া", # Assamese "ast": "Asturian", "av": "Авар", # Avaric "ay": "Aymara", # Aymara "az": "Azərbaycan" , # Azerbaijani "ba": "Башҡорт", # Bashkir "bar": "Boarisch", "bat-smg": "Žemaitėška", "bcl": "Bikol", "be": "Беларуская", # Byelorussian; Belarusian "be-x-old": "Беларуская (тарашкевіца)", "bg": "Български", # Bulgarian "bh": "भोजपुरी", # Bihari "bi": "Bislama", # Bislama "bm": "Bamanankan", # Bambara "bn": "বাংলা" , # Bengali; Bangla "bo": "བོད་སྐད", # Tibetan "bpy": "ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী" , "br": "Brezhoneg" , # Breton "bs": "Bosanski" , # Bosnian "bug": "Basa Ugi", "bxr": "Буряад", "ca": "Català" , # Catalan "cbk-zam": "Chavacano de Zamboanga", "cdo": "Mìng-dĕ̤ng-ngṳ̄", "cho": "Choctaw", "ce": "Нохчийн", # Chechen "ceb": "Sinugboanong Binisaya" , # Cebuano "ch": "Chamor", # Chamorro "chr": "ᏣᎳᎩ", "chy": "Tsetsêhestâhese", "co": "Cors", # Corsican "cr": "Nehiyaw", # Cree "crh": "Qırımtatarca", "cs": "Česky" , # Czech "csb": "Kaszëbsczi", "c": "Словѣньскъ", # Church Slavic "cv": "Чăваш", # Chuvash "cy": "Cymraeg" , # Welsh "da": "Dansk" , # Danish "de": "Deutsch" , # German "diq": "Zazaki", # Dimli (Southern Zazaki) "dsb": "Dolnoserbski", "dv": "ދިވެހިބަސް", # Divehi "dz": "ཇོང་ཁ", # Dzongkha; Bhutani "ee": "Eʋegbe", # Ewe "el": "Ελληνικά" , # Greek "eml": "Emiliàn e rumagnòl", "en": "English" , # English "eo": "Esperanto" , "es": "Español" , # Spanish "et": "Eesti" , # Estonian "eu": "Euskara" , # Basque "ext": "Estremeñ", "fa": "فارسی" , # Persian "ff": "Fulfulde", # Fulah "fi": "Suomi" , # Finnish "fiu-vro": "Võro", "fj": "Na Vosa Vakaviti",# Fijian; Fiji "fo": "Føroyskt" , # Faroese "fr": "Français" , # French "frp": "Arpitan", "fur": "Furlan", "fy": "Frysk", # Frisian "ga": "Gaeilge", # Irish "gan": "贛語 (Gànyŭ)", "gd": "Gàidhlig", # Scots; Gaelic "gl": "Gallego" , # Gallegan; Galician "glk": "گیلکی", "got": "𐌲𐌿𐍄𐌹𐍃𐌺𐍉𐍂𐌰𐌶𐌳𐌰", "gn": "Avañe'ẽ", # Guarani "g": "ગુજરાતી", # Gujarati "gv": "Gaelg", # Manx "ha": "هَوُسَ", # Hausa "hak": "Hak-kâ-fa / 客家話", "haw": "Hawai`i", "he": "עברית" , # Hebrew (formerly iw) "hi": "हिन्दी" , # Hindi "hif": "Fiji Hindi", "ho": "Hiri Mot", # Hiri Motu "hr": "Hrvatski" , # Croatian "hsb": "Hornjoserbsce", "ht": "Krèyol ayisyen" , # Haitian; Haitian Creole "hu": "Magyar" , # Hungarian "hy": "Հայերեն", # Armenian "hz": "Otsiherero", # Herero "ia": "Interlingua", "ie": "Interlingue", "id": "Bahasa Indonesia",# Indonesian (formerly in) "ig": "Igbo", # Igbo "ii": "ꆇꉙ ", # Sichuan Yi "ik": "Iñupiak", # Inupiak "ilo": "Ilokano", "io": "Ido" , "is": "Íslenska" , # Icelandic "it": "Italiano" , # Italian "i": "ᐃᓄᒃᑎᑐᑦ", # Inuktitut "ja": "日本語", # Japanese "jbo": "Lojban", "jv": "Basa Jawa", # Javanese "ka": "ქართული" , # Georgian "kaa": "Qaraqalpaqsha", "kab": "Taqbaylit", "kg": "KiKongo", # Kongo "ki": "Gĩkũyũ", # Kikuyu "kj": "Kuanyama", # Kuanyama "kk": "Қазақша", # Kazakh "kl": "Kalaallisut", # Kalaallisut; Greenlandic "km": "ភាសាខ្មែរ", # Khmer; Cambodian "kn": "ಕನ್ನಡ", # Kannada "ko": "한국어" , # Korean "kr": "Kanuri", # Kanuri "ks": "कश्मीरी / كشميري", # Kashmiri "ksh": "Ripoarisch", "ku": "Kurdî / كوردی", # Kurdish "kv": "Коми", # Komi "kw": "Kernewek/Karnuack", # Cornish "ky": "Кыргызча", # Kirghiz "la": "Latina" , # Latin "lad": "Dzhudezmo", "lb": "Lëtzebuergesch" , # Letzeburgesch "lbe": "Лакку", "lg": "Luganda", # Ganda "li": "Limburgs", # Limburgish; Limburger; Limburgan "lij": "Lígur", "ln": "Lingala", # Lingala "lmo": "Lumbaart", "lo": "ລາວ", # Lao; Laotian "lt": "Lietuvių" , # Lithuanian "lua": "Luba", # Luba "lv": "Latvieš" , # Latvian; Lettish "map-bms": "Basa Banyumasan", "mdf": "Мокшень (Mokshanj Kälj)", "mg": "Malagasy", # Malagasy "mh": "Ebon", # Marshall "mi": "Māori", # Maori "mk": "Македонски" , # Macedonian "ml": None, # Malayalam "mn": "Монгол", # Mongolian "mo": "Молдовеняскэ", # Moldavian "mr": "मराठी" , # Marathi "ms": "Bahasa Melay" , # Malay "mt": "Malti", # Maltese "mus": "Muskogee", "my": "မ္ရန္‌မာစာ", # Burmese "myv": "Эрзянь (Erzjanj Kelj)", "mzn": "مَزِروني", "na": "dorerin Naoero", # Nauru "nah": "Nāhuatl", "nap": "Nnapulitano", "nb": "Norsk (Bokmål)", # Norwegian Bokm@aa{}l "nd": None, # Ndebele, North "nds": "Plattdüütsch", "nds-nl": "Nedersaksisch", "ne": "नेपाली", # Nepali "new": "नेपाल भाषा" , # Nepal Bhasa "ng": "Oshiwambo", # Ndonga "nl": "Nederlands" , # Dutch "nn": "Nynorsk", # Norwegian Nynorsk "no": "Norsk (Bokmål)" , # Norwegian "nov": "Novial", "nr": None, # Ndebele, South "nrm": "Nouormand/Normaund", "nv": "Diné bizaad", # Navajo "ny": "Chi-Chewa", # Chichewa; Nyanja "oc": "Occitan", # Occitan; Proven@,{c}al "oj": None, # Ojibwa "om": "Oromoo", # (Afan) Oromo "or": "ଓଡ଼ିଆ", # Oriya "os": "Иронау", # Ossetian; Ossetic "pa": "ਪੰਜਾਬੀ" , # Panjabi; Punjabi "pag": "Pangasinan", "pam": "Kapampangan", "pap": "Papiament", "pdc": "Deitsch", "pi": "पाऴि", # Pali "pih": "Norfuk", "pl": "Polski" , # Polish "pms": "Piemontèis" , "ps": "پښتو", # Pashto, Pushto "pt": "Português" , # Portuguese "q": "Runa Simi" , # Quechua "rm": "Rumantsch", # Rhaeto-Romance "rmy": "romani - रोमानी", "rn": "Kirundi", # Rundi; Kirundi "ro": "Română" , # Romanian "roa-rup": "Armãneashce", "roa-tara": "Tarandíne", "ru": "Русский" , # Russian "rw": "Ikinyarwanda", # Kinyarwanda "sa": "संस्कृतम्", # Sanskrit "sah": "Саха тыла (Saxa Tyla)", "sc": "Sardu", # Sardinian "scn": "Sicilian", "sco": "Scots", "sd": "سنڌي، سندھی ، सिन्ध", # Sindhi "se": "Sámegiella", # Northern Sami "sg": "Sängö", # Sango; Sangro "sh": "Srpskohrvatski / Српскохрватски" , "si": "සිංහල", "simple": "Simple English" , "sk": "Slovenčina" , # Slovak "sl": "Slovenščina" , # Slovenian "sm": "Gagana Samoa", # Samoan "sn": "chiShona", # Shona "so": "Soomaaliga", # Somali "sr": "Српски / Srpski", # Serbian "srn": "Sranantongo", "ss": "SiSwati", # Swati; Siswati "st": "Sesotho", # Sesotho; Sotho, Southern "stk": "Seeltersk", "s": "Basa Sunda", # Sundanese "sq": "Shqip" , # Albanian "szl": "Ślůnski", "sv": "Svenska" , # Swedish "sw": "Kiswahili", # Swahili "ta": "தமிழ்" , # Tamil "te": "తెలుగు" , # Telugu "tet": "Tetun", "tg": "Тоҷикӣ", # Tajik "th": "ไทย" , # Thai "ti": "ትግርኛ", # Tigrinya "tk": "تركمن / Туркмен", # Turkmen "tl": "Tagalog" , # Tagalog "tn": "Setswana", # Tswana; Setswana "to": "faka Tonga", # Tonga (?) # Also ZW ; MW "tokipona": "Tokipona", "tpi": "Tok Pisin", "tr": "Türkçe" , # Turkish "ts": "Xitsonga", # Tsonga "tt": "Tatarça / Татарча", # Tatar "tum": "chiTumbuka", "tw": "Twi", # Twi "ty": "Reo Mā`ohi", # Tahitian "udm": "Удмурт кыл", "ug": "Oyghurque", # Uighur "uk": "Українська" , # Ukrainian "ur": "اردو", # Urdu "uz": "O‘zbek", # Uzbek "ve": "Tshivenda", # Venda "vec": "Vèneto", "vi": "Tiếng Việt" , # Vietnamese "vls": "West-Vlams", "vo": "Volapük" , "wa": "Walon", # Walloon "war": "Winaray", "wo": "Wolof", # Wolof "w": "吴语", "xal": "Хальмг", "xh": "isiXhosa", # Xhosa "yi": "ייִדיש", # Yiddish "yo": "Yorùbá", # Yoruba "za": "Cuengh", # Zhuang "zea": "Zeêuws", "zh": "中文" , # Chinese "zh-classical": "古文 / 文言文", "zm-min-nan": "Bân-lâm-gú", "zh-yue": "粵語", "zu": "isiZulu" # Zulu }