#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (C) 2008-2018 Sergey Poznyakoff
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
Wiki markup parser.
This module provides two classes:
WikiMarkupParser:
An abstract parser class, which serves as a base class for all markup
classes in this package.
WikiMarkup:
    A subclass of the above, providing a basic input method.
"""
from __future__ import print_function
import sys
import re
from types import *
from wikitrans.wikitoken import *
__all__ = [ "WikiMarkupParser", "WikiMarkup",
"TagAttributes", "TagAttributeSyntaxError" ]
class UnexpectedTokenError(Exception):
def __init__(self, value):
self.value = value
class TagAttributeSyntaxError(Exception):
def __init__(self, value):
self.value = value
def __str__(self):
return repr(self.value)
class TagAttributes(object):
"""A dictionary-like collection of tag attributes.
Example:
      attr = TagAttributes('href="foo" length="2"')
      if 'href' in attr:
          print(attr['href'])   # prints "foo"
for a in attr:
...
"""
    attrstart = re.compile("^(?P<attr>[a-zA-Z0-9_-]+)(?P<eq>=\")?")
valseg = re.compile("^[^\\\"]+")
tab = {}
printable = None
    def __init__(self, string):
        self.tab = {}
        if not string:
            self.printable = ''
            return
        self.printable = string
        s = string
while s != '':
s = s.strip()
m = self.attrstart.match(s)
if m:
name = m.group('attr')
val = ''
s = s[m.end(0):]
if m.group('eq'):
                    while 1:
                        m = self.valseg.match(s)
                        if m:
                            val += m.group(0)
                            s = s[m.end(0):]
                        if not s:
                            # Unterminated quoted value: keep what we have.
                            break
                        if s[0] == '\\':
                            # A backslash escapes the following character.
                            val += s[1:2]
                            s = s[2:]
                        elif s[0] == '"':
                            s = s[1:]
                            break
else:
val = 1
self.tab[name] = val
else:
raise TagAttributeSyntaxError(s)
def __len__(self):
return len(self.tab)
def __getitem__(self, key):
return self.tab[key]
def __contains__(self, key):
return key in self.tab
def __iter__(self):
for key in self.tab:
yield(key)
def has_key(self, key):
return self.__contains__(key)
def __setitem__(self, key, value):
self.tab[key] = value
def __delitem__(self, key):
del self.tab[key]
def __str__(self):
return self.printable
def __repr__(self):
return self.printable
class WikiMarkupParser(object):
"""Parser for Wiki markup language.
    Given input in the Wiki markup language, builds an abstract parse tree for it.
This is a base class for actual parsers. The subclasses must provide the
input method.
Public methods:
parse() -- parse the input.
Abstract methods (must be overridden by the subclass):
input() -- returns next physical line from the input material.
Public attributes:
tree -- constructed parse tree (a subclass of WikiNode)
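
    Example (a sketch of a minimal subclass; the ListMarkup name and its
    'lines' argument are illustrative, and a concrete parser must also
    populate token_class, as WikiMarkup.__init__ does):

        class ListMarkup(WikiMarkupParser):
            def __init__(self, lines):
                self.lines = iter(lines)

            def input(self):
                # Return the next physical line; None signals end of input.
                return next(self.lines, None)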
"""
delim = re.compile("^==+[ \t]*|[ \t]*==+[ \t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)|<")
    otag = re.compile("<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>[^/][^>]+))?\s*(?P<closed>/)?>")
    ctag = re.compile("</(?P<tag>[a-zA-Z0-9_]+)\s*>")
refstart = re.compile("^https?://")
close_delim = {
'[': ']',
'[[': ']]',
'{{': '}}'
}
# Environment types:
envtypes = { "*": [ "unnumbered", 0 ],
"#": [ "numbered", 0 ],
";": [ "defn", 0 ],
":": [ "defn", 1 ]
}
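    # For example, "*" at the start of a line opens a level-1 unnumbered
    # list and "##" a level-2 numbered list; ";term" and ":definition"
    # both map to the "defn" environment and are told apart by the
    # subtype values 0 and 1 (see parse_env below).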
toklist = None
tokind = 0
newline = 0
tree = None
tags = [ 'code', 'nowiki', 'tt', 'div', 'ref', 'references' ]
debug_level = 0
def dprint(self, lev, fmt, *argv):
"""If current debug level is greater than or equal to lev, print *argv
according to format.
"""
if self.debug_level >= lev:
for l in (fmt % argv).split('\n'):
print("[DEBUG] %s" % l)
inline_delims = [ "''", "'''", "[", "]", "[[", "]]", "{{", "}}", "|" ]
token_class = {}
def _new_node(self, **kwarg):
return self.token_class[kwarg['type']](self, **kwarg)
def tokread(self):
"""Read next token from the input. Return it as a subclass of WikiNode."""
line = None
pos = 0
while 1:
if (not line or pos == len(line)):
try:
line = self.input()
pos = 0
except StopIteration:
line = u''
if not line or line == "":
yield(self._new_node(type='NIL'))
break
if line == '\n':
yield(self._new_node(type='NL'))
line = None
continue
self.dprint(100, "LINE: %s", line[pos:])
m = self.delim.search(line, pos)
if m:
if (pos < m.start(0)):
yield(self._new_node(type='TEXT',
content=line[pos:m.start(0)]))
pos = m.start(0)
t = None
if line[m.start(0)] == '<':
m = self.otag.match(line, pos)
if m:
pos = m.end(0)
if m.group('tag') == 'nowiki':
if not m.group('closed'):
while 1:
try:
m = self.ctag.search(line, pos)
if m and m.group('tag') == 'nowiki':
yield(self._new_node(type='TEXT',
content=line[pos:m.start(0)] ))
pos = m.end(0)
break
yield(self._new_node(type='TEXT',
content=line[pos:]))
line = self.input()
pos = 0
except StopIteration:
break
continue
elif m.group('tag') in self.tags:
try:
yield(self._new_node(type='OTAG',
tag=m.group('tag'),
isblock=(line[pos] == '\n'),
args=TagAttributes(m.group('args'))))
if m.group('closed'):
yield(self._new_node(type='CTAG',
tag=m.group('tag')))
except TagAttributeSyntaxError:
yield(self._new_node(type='TEXT',
content=m.group(0)))
continue
else:
yield(self._new_node(type='TEXT', content=m.group(0)))
continue
else:
m = self.ctag.match(line, pos)
if m:
if m.group('tag') in self.tags:
yield(self._new_node(type='CTAG',
tag=m.group('tag')))
pos = m.end(0)
continue
else:
yield(self._new_node(type='TEXT',
content=line[pos:pos+1]))
pos += 1
continue
else:
pos = m.end(0)
content = m.group(0)
if content[0] in self.envtypes:
node = self._new_node(type='DELIM',
content=content,
isblock=True,
continuation=pos < len(line) and line[pos] == ":")
if node.continuation:
node.content += node.content[0]
pos += 1
yield(node)
while pos < len(line) and line[pos] in [' ', '\t']:
pos += 1
else:
yield(self._new_node(type='DELIM',
isblock=(content.strip() not in self.inline_delims),
content=content.strip()))
continue
if line:
if line[-1] == '\n':
if line[pos:-1] != '':
yield(self._new_node(type='TEXT', content=line[pos:-1]))
yield(self._new_node(type='NL'))
else:
yield(self._new_node(type='TEXT', content=line[pos:]))
line = None
def input(self):
"""Return next physical line from the input.
This method must be overridden by the subclass.
"""
return None
def swaptkn(self, i, j):
"""Swap tokens at indices i and j in toklist."""
self.dprint(80, "SWAPPING %s <-> %s", i, j)
x = self.toklist[i]
self.toklist[i] = self.toklist[j]
self.toklist[j] = x
def tokenize(self):
"""Tokenize the input.
Read tokens from the input (supplied by the input() method). Place the
obtained tokens in the toklist array.
"""
self.toklist = []
for tok in self.tokread():
self.dprint(100, "TOK: %s", tok)
self.toklist.append(tok)
# Determine and fix up the ordering of bold and italic markers
# There are three possible cases:
#
# 1a. '''a b ''c'' d'''
# 1b. ''a b '''c''' d''
#
# 2a. '''''a b'' c d'''
# 2b. '''''a b''' c d''
#
# 3a. '''a b ''c d'''''
# 3b. ''a b '''c d'''''
stack = []
for i in range(0, len(self.toklist)):
if (self.toklist[i].type == 'DELIM'
and (self.toklist[i].content == "''"
or self.toklist[i].content == "'''")):
if len(stack) > 0:
if self.toklist[stack[-1]].content == self.toklist[i].content:
# Case 1: just pop the matching delimiter off the stack
stack.pop()
elif len(stack) == 2 and stack[-2] + 1 == stack[-1]:
# Case 2: swap delimiters saved on stack ...
self.swaptkn(stack[-2], stack[-1])
# and pop off the matching one
stack.pop()
                    elif (i + 1 < len(self.toklist)
and self.toklist[i+1].type == 'DELIM'
and self.toklist[stack[-1]].content
== self.toklist[i+1].content):
# Case 3: swap current and next tokens
self.swaptkn(i, i+1)
# and pop off the matching one
stack.pop()
else:
# Push the token on stack
stack.append(i)
else:
# Push the token on stack
stack.append(i)
# Redefine all non-matched tokens as TEXT
for i in stack:
self.toklist[i].type = 'TEXT' # FIXME
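    # The mark stack implements backtracking: a parse_* method calls
    # push_mark() before attempting a construct, then clear_mark() on
    # success or pop_mark() to rewind tokind and try something else.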
mark = []
def push_mark(self):
"""Save the current token index on stack."""
self.mark.append(self.tokind)
def pop_mark(self):
"""Restore the token index from top of stack."""
self.tokind = self.mark.pop()
def clear_mark(self):
"""Forget the last mark."""
self.mark.pop()
def lookahead(self, off=0):
"""Peek a token at index (tokind+off)."""
tok = self.toklist[self.tokind+off]
self.dprint(20, "lookahead(%s): %s", off, tok)
return tok
def setkn(self, val):
"""Store token val at the current token index."""
self.toklist[self.tokind] = val
def getkn(self):
"""Get next token from the toklist. Advance tokind."""
self.newline = self.tokind == 0 or self.toklist[self.tokind-1].type == 'NL'
if self.tokind == len(self.toklist):
return self._new_node(type='NIL')
tok = self.toklist[self.tokind]
self.tokind = self.tokind + 1
self.dprint(20, "getkn: %s", tok)
return tok
def ungetkn(self, tok=None):
"""Unget the last read token.
Decrease the tokind by one, so the last read token will be read again.
If optional argument is supplied and is not None, store it in the toklist
in place of the current token.
"""
self.tokind = self.tokind - 1
self.newline = self.tokind == 0 or self.toklist[self.tokind-1].type == 'NL'
if tok:
self.toklist[self.tokind] = tok
self.dprint(20, "ungetkn: %s", tok)
return self.toklist[self.tokind]
def fixuptkn(self, tok):
"""Replace the recently read token by tok."""
if self.tokind == 0:
raise IndexError('WikiMarkupParser.fixuptkn called at start of input')
self.toklist[self.tokind-1] = tok
return tok
def dump(self, tree, file=sys.stdout):
"""Dump the tree to file, node by node."""
for node in tree:
file.write(str(node))
file.write('\n')
def is_block_end(self, tok):
"""Return True if tok ends a block environment."""
if tok.type == 'NIL':
return True
elif tok.type == 'NL':
if self.lookahead().type == 'NIL':
return True
elif self.lookahead().type == 'NL':
self.getkn()
return True
elif tok.type in ['DELIM', 'CTAG', 'TAG']:
if tok.isblock:
self.ungetkn(tok)
return True
return False
def parse_para(self, tok):
"""Read paragraph starting at tok."""
self.dprint(80, "ENTER parse_para: %s", tok)
acc = { 'seq': [],
'textlist': [] }
def flush():
if acc['textlist']:
acc['seq'].append(self._new_node(type='TEXT',
content=''.join(acc['textlist'])))
acc['textlist'] = []
if (isinstance(tok, WikiContentNode)
and isinstance(tok.content, str)
and re.match("^[ \t]", tok.content)):
type = 'PRE'
rx = re.compile("^\S")
else:
type = 'PARA'
rx = re.compile("^[ \t]")
while not self.is_block_end(tok):
if tok.type == 'TEXT':
if rx and self.newline and rx.match(tok.content):
self.ungetkn()
break
acc['textlist'].append(tok.content)
elif tok.type == 'NL':
acc['textlist'].append('\n')
elif tok.type == 'OTAG':
flush()
acc['seq'].append(self.parse_tag(tok))
elif tok.type == 'DELIM':
flush()
acc['seq'].append(self.parse_inline_delim(tok))
else:
raise UnexpectedTokenError(tok)
tok = self.getkn()
flush()
if acc['seq']:
tok = self._new_node(type=type, content=acc['seq'])
else:
tok = None
self.dprint(80, "LEAVE parse_para=%s", tok)
return tok
def parse_block_delim(self, tok):
"""Parse block environment starting at tok."""
self.dprint(80, "ENTER parse_block_delim")
assert(tok.type == 'DELIM')
if tok.content == "----":
node = self._new_node(type = 'BAR')
elif tok.content[0:2] == "==":
node = self.parse_header(tok)
if not node:
tok = self.ungetkn(self._new_node(type='TEXT',
content=tok.content))
elif tok.content[0] in self.envtypes:
node = None
if tok.content[0] == ':':
t = self.lookahead(-2)
if not (t.type == 'DELIM' and t.content == ';'):
node = self.parse_indent(tok)
if not node:
node = self.parse_env(tok)
else:
self.ungetkn(tok)
node = None
self.dprint(80, "LEAVE parse_block_delim=%s", node)
return node
def parse_line(self):
"""Parse the input line."""
self.dprint(80, "ENTER parse_line")
list = []
while True:
tok = self.getkn()
if tok.type == 'NL' or tok.type == 'NIL':
break
elif tok.type == 'TEXT':
list.append(tok)
elif tok.type == 'DELIM':
if tok.isblock:
tok = self._new_node(type = 'TEXT', content = tok.content)
self.fixuptkn(tok)
list.append(tok)
elif tok.content[0] == ":":
# FIXME
list.append(self.parse_indent(tok))
break
else:
x = self.parse_inline_delim(tok)
if x:
list.append(x)
else:
list.append(self.fixuptkn(self._new_node(type = 'TEXT',
content = tok.content)))
elif tok.type == 'OTAG':
if tok.isblock:
self.ungetkn()
break
list.append(self.parse_tag(tok))
else:
list.append(tok)
ret = self._new_node(type='SEQ', content=list)
self.dprint(80, "LEAVE parse_line=%s", ret)
return ret
def parse_indent(self, tok):
"""Parse indented block starting at tok."""
lev = len(tok.content)
self.dprint(80, "ENTER parse_indent(%s)", lev)
x = self._new_node(type='IND', level=lev, content=self.parse_line())
self.dprint(80, "LEAVE parse_indent=%s", x)
return x
def parse_fontmod(self, delim, what):
"""Parse font modification directive (bold or italics).
Arguments:
delim -- starting delimiter ("''" or "'''")
what -- 'IT' or 'BOLD'
"""
self.dprint(80, "ENTER parse_fontmod(%s,%s), tok %s",
delim, what, self.lookahead())
seq = []
text = ''
while True:
tok = self.getkn()
if tok.type == 'TEXT':
text += tok.content
elif self.is_block_end(tok):
self.dprint(80, "LEAVE parse_fontmod=%s", "None")
return None
elif tok.type == 'DELIM':
# self.dprint(80, "got %s, want %s", tok.content, delim)
if tok.content == delim:
break
else:
if text:
seq.append(self._new_node(type='TEXT', content=text))
text = ''
x = self.parse_inline_delim(tok)
if x:
seq.append(x)
else:
self.dprint(80, "LEAVE parse_fontmod=%s", "None")
return None
elif tok.type == 'NL':
seq.append(self._new_node(type='TEXT', content='\n'))
else:
self.dprint(80, "LEAVE parse_fontmod=None")
return None
if text:
seq.append(self._new_node(type='TEXT', content=text))
res = self._new_node(type=what, content=seq)
self.dprint(80, "LEAVE parse_fontmod=%s", res)
return res
def parse_ref(self):
"""Parse a reference block ([...])"""
self.dprint(80, "ENTER parse_ref")
tok = self.getkn()
if not (tok.type == 'TEXT' and self.refstart.match(tok.content)):
self.dprint(80, "LEAVE parse_ref=None")
return None
seq = []
(ref, sep, text) = tok.content.partition(' ')
if text:
seq.insert(0, self._new_node(type='TEXT', content=text))
while True:
tok = self.getkn()
if tok.type == 'NIL':
self.dprint(80, "LEAVE parse_ref=None")
return None
elif self.is_block_end(tok):
self.dprint(80, "LEAVE parse_ref=None")
return None
elif tok.type == 'DELIM':
if tok.content == ']':
break
else:
tok = self.parse_inline_delim(tok)
if tok:
seq.append(tok)
else:
self.dprint(80, "LEAVE parse_ref=None")
return None
elif tok.type == 'OTAG':
                seq.append(self.parse_tag(tok))
else:
seq.append(tok)
ret = self._new_node(type='REF', ref=ref,
content=self._new_node(type='SEQ', content=seq))
self.dprint(80, "LEAVE parse_ref= %s", ret)
return ret
def parse_link(self, type, delim):
"""Parse an external link ([[...]]).
In this implementation, it is also used to parse template
references ({{...}}).
Arguments:
type -- 'LINK' or 'TMPL'
delim -- expected closing delimiter.
"""
self.dprint(80, "ENTER parse_link(%s,%s)", type, delim)
subtree = []
list = []
while True:
tok = self.getkn()
if tok.type == 'NIL':
self.dprint(80, "LEAVE parse_link=None [EOF]")
return None
if tok.type == 'DELIM':
if tok.content == delim:
if list:
subtree.append(self._new_node(type='SEQ',
content=list))
break
elif tok.content == "|":
if len(list) > 1:
subtree.append(self._new_node(type='SEQ',
content=list))
elif list:
subtree.append(list[0])
list = []
else:
x = self.parse_inline_delim(tok)
if x:
list.append(x)
else:
self.dprint(80, "LEAVE parse_link=None [bad inline]")
return None
elif tok.type == 'TEXT':
list.append(tok)
else:
self.dprint(80, "LEAVE parse_link=None [unexpected token]")
return None
ret = self._new_node(type=type, content=subtree)
self.dprint(80, "LEAVE parse_link=%s", ret)
return ret
def parse_inline_delim(self, tok):
"""Parse an inline block."""
self.dprint(80, "ENTER parse_inline_delim")
assert(tok.type == 'DELIM')
self.push_mark()
if tok.content == "''":
x = self.parse_fontmod(tok.content, 'IT')
elif tok.content == "'''":
x = self.parse_fontmod(tok.content, 'BOLD')
elif tok.content == "[":
x = self.parse_ref()
elif tok.content == "[[":
x = self.parse_link('LINK', "]]")
elif tok.content == "{{":
x = self.parse_link('TMPL', "}}")
else:
x = None
if x:
self.clear_mark()
else:
self.dprint(80, "BEGIN DELIMITER RECOVERY: %s", tok)
self.pop_mark()
x = self.fixuptkn(self._new_node(type='TEXT', content=tok.content))
od = tok.content
if od in self.close_delim:
cd = self.close_delim[od]
lev = 0
for i, tok in enumerate(self.toklist[self.tokind+1:]):
if tok.type == 'NIL':
break
elif tok.type == 'DELIM':
if tok.content == od:
lev += 1
elif tok.content == cd:
if lev == 0:
tok = self._new_node(type='TEXT',
content=tok.content)
self.toklist[self.tokind+1+i] = tok
lev -= 1
break
self.dprint(80, "END DELIMITER RECOVERY: %s", tok)
self.dprint(80, "LEAVE parse_inline_delim=%s", x)
return x
def parse_tag(self, tag):
"""Parse an xml-like tag (such as, e.g. "...")."""
self.dprint(80, "ENTER parse_tag")
list = []
self.push_mark()
while True:
tok = self.getkn()
if tok.type == 'NIL':
self.pop_mark()
s = '<' + tag.tag
if tag.args:
s += ' ' + str(tag.args)
s += '>'
node = self._new_node(type='TEXT', content=s)
if tag.content:
self.tree[self.tokind:self.tokind] = tag.content
self.dprint(80, "LEAVE parse_tag = %s (tree modified)", node)
return node
elif tok.type == 'DELIM':
if tok.isblock:
tok = self.parse_block_delim(tok)
else:
tok = self.parse_inline_delim(tok)
if not tok:
tok = self.getkn()
elif tok.type == 'CTAG':
if tag.tag == tok.tag:
break
                s = '</' + tag.tag + '>'
tok = self.fixuptkn(self._new_node(type='TEXT', content=s))
elif tok.type == 'NL':
tok = self._new_node(type = 'TEXT', content = '\n')
list.append(tok)
self.clear_mark()
ret = self._new_node(type = 'TAG',
tag = tag.tag,
args = tag.args,
isblock = tag.isblock,
content = self._new_node(type = 'SEQ',
content = list))
self.dprint(80, "LEAVE parse_tag = %s", ret)
return ret
def parse_env(self, tok):
"""Parse a block environment (numbered, unnumbered, or definition list)."""
type = self.envtypes[tok.content[0]][0]
lev = len(tok.content)
self.dprint(80, "ENTER parse_env(%s,%s)", type, lev)
list = []
while True:
if (tok.type == 'DELIM'
and tok.content[0] in self.envtypes
and type == self.envtypes[tok.content[0]][0]):
if len(tok.content) < lev:
self.ungetkn()
break
elif len(tok.content) > lev:
elt = self.parse_env(tok)
else:
elt = self.parse_line()
if not tok.continuation:
list.append(self._new_node(type='ELT',
subtype=self.envtypes[tok.content[0]][1],
content=elt))
tok = self.getkn()
continue
if list:
if list[-1].content.type != 'SEQ':
x = list[-1].content.content
# FIXME:
list[-1].content = self._new_node(type='SEQ', content=[x])
list[-1].content.content.append(elt)
else:
self.ungetkn()
break
tok = self.getkn()
ret = self._new_node(type='ENV',
envtype=type,
level=lev,
content=list)
self.dprint(80, "LEAVE parse_env=%s", ret)
return ret
def parse_header(self, tok):
"""Parse a Wiki header."""
self.dprint(80, "ENTER parse_header")
self.push_mark()
list = []
delim = tok.content
while True:
tok = self.getkn()
if tok.type == 'NL':
self.pop_mark()
self.dprint(80, "LEAVE parse_header=None")
return None
elif tok.type == 'TEXT':
list.append(tok)
elif tok.type == 'DELIM':
if tok.content == delim:
if self.lookahead().type == 'NL':
self.getkn()
if self.lookahead().type == 'NL':
self.getkn()
break
else:
self.pop_mark()
self.dprint(80, "LEAVE parse_header=None")
return None
elif tok.isblock:
self.pop_mark()
self.dprint(80, "LEAVE parse_header=None")
return None
else:
list.append(self.parse_inline_delim(tok))
elif tok.type == 'OTAG':
if tok.isblock:
self.pop_mark()
self.dprint(80, "LEAVE parse_header=None")
return None
list.append(self.parse_tag(tok))
self.clear_mark()
ret = self._new_node(type='HDR',
level=len(delim),
content=self._new_node(type='SEQ', content=list))
self.dprint(80, "LEAVE parse_header=%s", ret)
return ret
def parse_block(self):
"""Parse next block: newline, delimiter, tag, or paragraph."""
tok = self.getkn()
while tok.type == 'NL':
tok = self.getkn()
        if tok is None or tok.type == 'NIL':
return None
elif tok.type == 'DELIM':
tok = self.parse_block_delim(tok)
if tok:
return tok
else:
tok = self.getkn()
elif tok.type == 'OTAG' and tok.isblock:
return self.parse_tag(tok)
return self.parse_para(tok)
def parse(self):
"""Parse Wiki material supplied by the input() method.
Store the resulting abstract parsing tree in the tree attribute.
"""
if not self.toklist:
self.tokenize()
if self.debug_level >= 90:
print("TOKEN DUMP BEGIN")
self.dump(self.toklist)
print("TOKEN DUMP END")
self.tokind = 0
self.tree = []
while 1:
subtree = self.parse_block()
            if subtree is None:
break
self.tree.append(subtree)
if self.debug_level >= 70:
print("TREE DUMP BEGIN")
self.dump(self.tree)
print("TREE DUMP END")
def __str__(self):
return str(self.tree)
class WikiMarkup(WikiMarkupParser):
"""
A derived parser class that supplies a basic input method.
    Three types of input are available:

    1. filename=<name>
       The file <name> is opened and used for input.
    2. file=<file>
       The already opened file object <file> is used for input.
    3. text=<string>
       Input is taken from <string>, line by line.

    Usage:

        obj = WikiMarkup(arg=val)
        obj.parse()
        ... Do whatever you need with obj.tree ...
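
    A concrete example (a sketch; the printed form of each node depends on
    the node classes defined in wikitrans.wikitoken):

        wiki = WikiMarkup(text="''Hello'' world")
        wiki.parse()
        for node in wiki.tree:
            print(node)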
"""
file = None
text = None
lang = 'en'
html_base = 'http://%(lang)s.wikipedia.org/wiki/'
image_base = 'http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf'
media_base = 'http://www.mediawiki.org/xml/export-0.3'
def __init__(self, *args, **keywords):
"""Create a WikiMarkup object.
Arguments:
filename=FILE
Read Wiki material from the file named FILE.
file=FD
Read Wiki material from file object FD.
text=STRING
Read Wiki material from STRING.
lang=CODE
Specifies source language. Default is 'en'. This variable can be
referred to as '%(lang)s' in the keyword arguments below.
html_base=URL
Base URL for cross-references. Default is
'http://%(lang)s.wikipedia.org/wiki/'
image_base=URL
Base URL for images. Default is
'http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf'
media_base=URL
Base URL for media files. Default is
'http://www.mediawiki.org/xml/export-0.3'
"""
self.token_class = {
'NIL': WikiNode,
'NL': WikiNode,
'OTAG': WikiTagNode,
'CTAG': WikiTagNode,
'TAG': WikiTagNode,
'DELIM': WikiDelimNode,
'TEXT': WikiTextNode,
'PRE': WikiContentNode,
'PARA': WikiSeqNode,
'BAR': WikiNode,
'SEQ': WikiSeqNode,
'IND': WikiIndNode,
'REF': WikiRefNode,
'TMPL': WikiSeqNode,
'IT': WikiSeqNode,
'BOLD': WikiSeqNode,
'ELT': WikiEltNode,
'ENV': WikiEnvNode,
'LINK': WikiSeqNode,
'HDR': WikiHdrNode
}
for kw in keywords:
if kw == 'file':
self.file = keywords[kw]
elif kw == 'filename':
self.file = open(keywords[kw])
elif kw == 'text':
self.text = keywords[kw].split("\n")
elif kw == 'lang':
self.lang = keywords[kw]
elif kw == 'html_base':
self.html_base = keywords[kw]
elif kw == 'image_base':
self.image_base = keywords[kw]
elif kw == 'media_base':
self.media_base = keywords[kw]
def __del__(self):
if self.file:
self.file.close()
def input(self):
if self.file:
return self.file.readline()
elif self.text:
return self.text.pop(0) + '\n'
else:
return None
# ISO 639
langtab = {
"aa": "Afar", # Afar
"ab": "Аҧсуа", # Abkhazian
"ae": None, # Avestan
"af": "Afrikaans", # Afrikaans
"ak": "Akana", # Akan
"als": "Alemannisch",
"am": "አማርኛ", # Amharic
"an": "Aragonés", # Aragonese
"ang": "Englisc",
"ar": "العربية" , # Arabic
"arc": "ܐܪܡܝܐ",
"as": "অসমীয়া", # Assamese
"ast": "Asturian",
"av": "Авар", # Avaric
"ay": "Aymara", # Aymara
"az": "Azərbaycan" , # Azerbaijani
"ba": "Башҡорт", # Bashkir
"bar": "Boarisch",
"bat-smg": "Žemaitėška",
"bcl": "Bikol",
"be": "Беларуская", # Byelorussian; Belarusian
"be-x-old": "Беларуская (тарашкевіца)",
"bg": "Български", # Bulgarian
"bh": "भोजपुरी", # Bihari
"bi": "Bislama", # Bislama
"bm": "Bamanankan", # Bambara
"bn": "বাংলা" , # Bengali; Bangla
"bo": "བོད་སྐད", # Tibetan
"bpy": "ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী" ,
"br": "Brezhoneg" , # Breton
"bs": "Bosanski" , # Bosnian
"bug": "Basa Ugi",
"bxr": "Буряад",
"ca": "Català" , # Catalan
"cbk-zam": "Chavacano de Zamboanga",
"cdo": "Mìng-dĕ̤ng-ngṳ̄",
"cho": "Choctaw",
"ce": "Нохчийн", # Chechen
"ceb": "Sinugboanong Binisaya" , # Cebuano
"ch": "Chamor", # Chamorro
"chr": "ᏣᎳᎩ",
"chy": "Tsetsêhestâhese",
"co": "Cors", # Corsican
"cr": "Nehiyaw", # Cree
"crh": "Qırımtatarca",
"cs": "Česky" , # Czech
"csb": "Kaszëbsczi",
"c": "Словѣньскъ", # Church Slavic
"cv": "Чăваш", # Chuvash
"cy": "Cymraeg" , # Welsh
"da": "Dansk" , # Danish
"de": "Deutsch" , # German
"diq": "Zazaki", # Dimli (Southern Zazaki)
"dsb": "Dolnoserbski",
"dv": "ދިވެހިބަސް", # Divehi
"dz": "ཇོང་ཁ", # Dzongkha; Bhutani
"ee": "Eʋegbe", # Ewe
"el": "Ελληνικά" , # Greek
"eml": "Emiliàn e rumagnòl",
"en": "English" , # English
"eo": "Esperanto" ,
"es": "Español" , # Spanish
"et": "Eesti" , # Estonian
"eu": "Euskara" , # Basque
"ext": "Estremeñ",
"fa": "فارسی" , # Persian
"ff": "Fulfulde", # Fulah
"fi": "Suomi" , # Finnish
"fiu-vro": "Võro",
"fj": "Na Vosa Vakaviti",# Fijian; Fiji
"fo": "Føroyskt" , # Faroese
"fr": "Français" , # French
"frp": "Arpitan",
"fur": "Furlan",
"fy": "Frysk", # Frisian
"ga": "Gaeilge", # Irish
"gan": "贛語 (Gànyŭ)",
"gd": "Gàidhlig", # Scots; Gaelic
"gl": "Gallego" , # Gallegan; Galician
"glk": "گیلکی",
"got": "𐌲𐌿𐍄𐌹𐍃𐌺𐍉𐍂𐌰𐌶𐌳𐌰",
"gn": "Avañe'ẽ", # Guarani
"g": "ગુજરાતી", # Gujarati
"gv": "Gaelg", # Manx
"ha": "هَوُسَ", # Hausa
"hak": "Hak-kâ-fa / 客家話",
"haw": "Hawai`i",
"he": "עברית" , # Hebrew (formerly iw)
"hi": "हिन्दी" , # Hindi
"hif": "Fiji Hindi",
"ho": "Hiri Mot", # Hiri Motu
"hr": "Hrvatski" , # Croatian
"hsb": "Hornjoserbsce",
"ht": "Krèyol ayisyen" , # Haitian; Haitian Creole
"hu": "Magyar" , # Hungarian
"hy": "Հայերեն", # Armenian
"hz": "Otsiherero", # Herero
"ia": "Interlingua",
"ie": "Interlingue",
"id": "Bahasa Indonesia",# Indonesian (formerly in)
"ig": "Igbo", # Igbo
"ii": "ꆇꉙ ", # Sichuan Yi
"ik": "Iñupiak", # Inupiak
"ilo": "Ilokano",
"io": "Ido" ,
"is": "Íslenska" , # Icelandic
"it": "Italiano" , # Italian
"i": "ᐃᓄᒃᑎᑐᑦ", # Inuktitut
"ja": "日本語", # Japanese
"jbo": "Lojban",
"jv": "Basa Jawa", # Javanese
"ka": "ქართული" , # Georgian
"kaa": "Qaraqalpaqsha",
"kab": "Taqbaylit",
"kg": "KiKongo", # Kongo
"ki": "Gĩkũyũ", # Kikuyu
"kj": "Kuanyama", # Kuanyama
"kk": "Қазақша", # Kazakh
"kl": "Kalaallisut", # Kalaallisut; Greenlandic
"km": "ភាសាខ្មែរ", # Khmer; Cambodian
"kn": "ಕನ್ನಡ", # Kannada
"ko": "한국어" , # Korean
"kr": "Kanuri", # Kanuri
"ks": "कश्मीरी / كشميري", # Kashmiri
"ksh": "Ripoarisch",
"ku": "Kurdî / كوردی", # Kurdish
"kv": "Коми", # Komi
"kw": "Kernewek/Karnuack", # Cornish
"ky": "Кыргызча", # Kirghiz
"la": "Latina" , # Latin
"lad": "Dzhudezmo",
"lb": "Lëtzebuergesch" , # Letzeburgesch
"lbe": "Лакку",
"lg": "Luganda", # Ganda
"li": "Limburgs", # Limburgish; Limburger; Limburgan
"lij": "Lígur",
"ln": "Lingala", # Lingala
"lmo": "Lumbaart",
"lo": "ລາວ", # Lao; Laotian
"lt": "Lietuvių" , # Lithuanian
"lua": "Luba", # Luba
"lv": "Latvieš" , # Latvian; Lettish
"map-bms": "Basa Banyumasan",
"mdf": "Мокшень (Mokshanj Kälj)",
"mg": "Malagasy", # Malagasy
"mh": "Ebon", # Marshall
"mi": "Māori", # Maori
"mk": "Македонски" , # Macedonian
"ml": None, # Malayalam
"mn": "Монгол", # Mongolian
"mo": "Молдовеняскэ", # Moldavian
"mr": "मराठी" , # Marathi
"ms": "Bahasa Melay" , # Malay
"mt": "Malti", # Maltese
"mus": "Muskogee",
"my": "မ္ရန္မာစာ", # Burmese
"myv": "Эрзянь (Erzjanj Kelj)",
"mzn": "مَزِروني",
"na": "dorerin Naoero", # Nauru
"nah": "Nāhuatl",
"nap": "Nnapulitano",
"nb": "Norsk (Bokmål)", # Norwegian Bokm@aa{}l
"nd": None, # Ndebele, North
"nds": "Plattdüütsch",
"nds-nl": "Nedersaksisch",
"ne": "नेपाली", # Nepali
"new": "नेपाल भाषा" , # Nepal Bhasa
"ng": "Oshiwambo", # Ndonga
"nl": "Nederlands" , # Dutch
"nn": "Nynorsk", # Norwegian Nynorsk
"no": "Norsk (Bokmål)" , # Norwegian
"nov": "Novial",
"nr": None, # Ndebele, South
"nrm": "Nouormand/Normaund",
"nv": "Diné bizaad", # Navajo
"ny": "Chi-Chewa", # Chichewa; Nyanja
"oc": "Occitan", # Occitan; Proven@,{c}al
"oj": None, # Ojibwa
"om": "Oromoo", # (Afan) Oromo
"or": "ଓଡ଼ିଆ", # Oriya
"os": "Иронау", # Ossetian; Ossetic
"pa": "ਪੰਜਾਬੀ" , # Panjabi; Punjabi
"pag": "Pangasinan",
"pam": "Kapampangan",
"pap": "Papiament",
"pdc": "Deitsch",
"pi": "पाऴि", # Pali
"pih": "Norfuk",
"pl": "Polski" , # Polish
"pms": "Piemontèis" ,
"ps": "پښتو", # Pashto, Pushto
"pt": "Português" , # Portuguese
"q": "Runa Simi" , # Quechua
"rm": "Rumantsch", # Rhaeto-Romance
"rmy": "romani - रोमानी",
"rn": "Kirundi", # Rundi; Kirundi
"ro": "Română" , # Romanian
"roa-rup": "Armãneashce",
"roa-tara": "Tarandíne",
"ru": "Русский" , # Russian
"rw": "Ikinyarwanda", # Kinyarwanda
"sa": "संस्कृतम्", # Sanskrit
"sah": "Саха тыла (Saxa Tyla)",
"sc": "Sardu", # Sardinian
"scn": "Sicilian",
"sco": "Scots",
"sd": "سنڌي، سندھی ، सिन्ध", # Sindhi
"se": "Sámegiella", # Northern Sami
"sg": "Sängö", # Sango; Sangro
"sh": "Srpskohrvatski / Српскохрватски" ,
"si": "සිංහල",
"simple": "Simple English" ,
"sk": "Slovenčina" , # Slovak
"sl": "Slovenščina" , # Slovenian
"sm": "Gagana Samoa", # Samoan
"sn": "chiShona", # Shona
"so": "Soomaaliga", # Somali
"sr": "Српски / Srpski", # Serbian
"srn": "Sranantongo",
"ss": "SiSwati", # Swati; Siswati
"st": "Sesotho", # Sesotho; Sotho, Southern
"stk": "Seeltersk",
"s": "Basa Sunda", # Sundanese
"sq": "Shqip" , # Albanian
"szl": "Ślůnski",
"sv": "Svenska" , # Swedish
"sw": "Kiswahili", # Swahili
"ta": "தமிழ்" , # Tamil
"te": "తెలుగు" , # Telugu
"tet": "Tetun",
"tg": "Тоҷикӣ", # Tajik
"th": "ไทย" , # Thai
"ti": "ትግርኛ", # Tigrinya
"tk": "تركمن / Туркмен", # Turkmen
"tl": "Tagalog" , # Tagalog
"tn": "Setswana", # Tswana; Setswana
"to": "faka Tonga", # Tonga (?) # Also ZW ; MW
"tokipona": "Tokipona",
"tpi": "Tok Pisin",
"tr": "Türkçe" , # Turkish
"ts": "Xitsonga", # Tsonga
"tt": "Tatarça / Татарча", # Tatar
"tum": "chiTumbuka",
"tw": "Twi", # Twi
"ty": "Reo Mā`ohi", # Tahitian
"udm": "Удмурт кыл",
"ug": "Oyghurque", # Uighur
"uk": "Українська" , # Ukrainian
"ur": "اردو", # Urdu
"uz": "O‘zbek", # Uzbek
"ve": "Tshivenda", # Venda
"vec": "Vèneto",
"vi": "Tiếng Việt" , # Vietnamese
"vls": "West-Vlams",
"vo": "Volapük" ,
"wa": "Walon", # Walloon
"war": "Winaray",
"wo": "Wolof", # Wolof
"w": "吴语",
"xal": "Хальмг",
"xh": "isiXhosa", # Xhosa
"yi": "ייִדיש", # Yiddish
"yo": "Yorùbá", # Yoruba
"za": "Cuengh", # Zhuang
"zea": "Zeêuws",
"zh": "中文" , # Chinese
"zh-classical": "古文 / 文言文",
"zm-min-nan": "Bân-lâm-gú",
"zh-yue": "粵語",
"zu": "isiZulu" # Zulu
}