#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (C) 2008-2018 Sergey Poznyakoff
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import print_function
import sys
import re
from types import *
from WikiTrans.wikitoken import *
__all__ = [ "BaseWikiMarkup", "WikiMarkup",
"TagAttributes", "TagAttributeSyntax" ]
class UnexpectedToken(Exception):
    """Raised by the parser when a token of an unexpected type is seen."""

    def __init__(self, value):
        # The offending token.
        self.value = value

    def __str__(self):
        # Mirror TagAttributeSyntax for consistent diagnostics.
        return repr(self.value)
class TagAttributeSyntax(Exception):
    """Signals a malformed attribute string in a wiki markup tag."""

    def __init__(self, value):
        # Keep the unparsable remainder for diagnostics.
        self.value = value

    def __str__(self):
        return '%r' % (self.value,)
class TagAttributes(object):
    """Parse and hold the attributes of a wiki markup tag.

    An attribute string is a whitespace-separated list of
    ``name`` or ``name="value"`` assignments, e.g.::

        style="width: 50%" class="noprint" selected

    Flag attributes (given without ``="value"``) get the value 1.
    The object supports a dict-like protocol for accessing
    individual attributes; str() returns the original string.

    Raises TagAttributeSyntax if the string cannot be parsed.
    """

    # Attribute name, optionally followed by '="' introducing a
    # quoted value.  (Named groups 'attr' and 'eq' are read below.)
    attrstart = re.compile(r'^(?P<attr>[a-zA-Z0-9_-]+)(?P<eq>=")?')
    # Longest run of value characters that are neither backslash
    # nor double quote.
    valseg = re.compile(r'^[^\\"]+')

    printable = None

    def __init__(self, string):
        # Always give this instance its own table, so instances never
        # share the class-level dict.
        self.tab = {}
        if not string:
            self.printable = ''
            return
        self.printable = string
        s = string
        while s != '':
            s = s.strip()
            m = self.attrstart.match(s)
            if not m:
                raise TagAttributeSyntax(s)
            name = m.group('attr')
            val = ''
            s = s[m.end(0):]
            if m.group('eq'):
                # Collect the quoted value, honoring backslash escapes.
                while True:
                    m = self.valseg.match(s)
                    if m:
                        val += m.group(0)
                        s = s[m.end(0):]
                    if not s or (s[0] == '\\' and len(s) < 2):
                        # Unterminated quoted value.
                        raise TagAttributeSyntax(string)
                    if s[0] == '\\':
                        # Escaped character: take it literally.
                        val += s[1]
                        s = s[2:]
                    elif s[0] == '"':
                        s = s[1:]
                        break
            else:
                val = 1          # flag attribute
            self.tab[name] = val

    def __len__(self):
        return len(self.tab)

    def __getitem__(self, key):
        return self.tab[key]

    def __contains__(self, key):
        return key in self.tab

    def __iter__(self):
        for key in self.tab:
            yield(key)

    def has_key(self, key):
        # Python 2 compatibility alias for __contains__.
        return self.__contains__(key)

    def __setitem__(self, key, value):
        self.tab[key] = value

    def __delitem__(self, key):
        del self.tab[key]

    def __str__(self):
        return self.printable

    def __repr__(self):
        return self.printable
class BaseWikiMarkup(object):
    """Parse Wiki markup into a parse tree.

    Input is obtained, line by line, from the input() method, which
    derived classes must override.  Call parse() to build the parse
    tree; the result is left in the 'tree' attribute.  Tree nodes are
    instances of the classes from WikiTrans.wikitoken, selected via
    the token_class dictionary.
    """

    # Any markup delimiter that may start or end a token: headings
    # (==), horizontal bar (----), list/definition markers (*, #, ;,
    # :), links ([ and [[), templates ({{), their closing
    # counterparts, table separator (|), bold/italic ('' and '''),
    # and the start of an HTML-like tag (<).
    delim = re.compile(r"^==+[ \t]*|[ \t]*==+[ \t]*$|(^----$)|^\*+|^#+|^[;:]+|(\[\[)|\[|(\{\{)|(\]\])|\]|(\}\})|\||('''?)|<")
    # Opening tag: <tag>, <tag attr="...">, or self-closing <tag/>.
    # The trailing '/' is captured by the 'closed' group.
    otag = re.compile(r"<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>[^>]+))?\s*(?P<closed>/)?>")
    # Closing tag: </tag>.
    ctag = re.compile(r"</(?P<tag>[a-zA-Z0-9_]+)\s*>")
    # External references start with a URL scheme.
    refstart = re.compile(r"^https?://")

    # Mapping from opening to closing delimiters.
    close_delim = {
        '[': ']',
        '[[': ']]',
        '{{': '}}'
    }

    # Environment types: marker -> [environment type, element subtype]
    envtypes = { "*": [ "unnumbered", 0 ],
                 "#": [ "numbered", 0 ],
                 ";": [ "defn", 0 ],
                 ":": [ "defn", 1 ]
    }

    toklist = None   # Token list produced by tokenize()
    tokind = 0       # Index of the next token in toklist
    newline = 0      # True when the current token starts a line
    tree = None      # Resulting parse tree

    # HTML-like tags recognized by the tokenizer; any other tag is
    # emitted as plain text.
    tags = [ 'code', 'nowiki', 'tt', 'div', 'ref', 'references' ]

    debug_level = 0

    def dprint(self, lev, fmt, *argv):
        """Print a debugging message if debug_level is at least lev."""
        if self.debug_level >= lev:
            for l in (fmt % argv).split('\n'):
                print("[DEBUG] %s" % l)

    inline_delims = [ "''", "'''", "[", "]", "[[", "]]", "{{", "}}", "|" ]

    # Token/node type -> node class (see WikiTrans.wikitoken).
    token_class = {
        'NIL':   WikiNode,
        'NL':    WikiNode,
        'OTAG':  WikiTagNode,
        'CTAG':  WikiTagNode,
        'TAG':   WikiTagNode,
        'DELIM': WikiDelimNode,
        'TEXT':  WikiTextNode,
        'PRE':   WikiContentNode,
        'PARA':  WikiSeqNode,
        'BAR':   WikiNode,
        'SEQ':   WikiSeqNode,
        'IND':   WikiIndNode,
        'REF':   WikiRefNode,
        'TMPL':  WikiSeqNode,
        'IT':    WikiSeqNode,
        'BOLD':  WikiSeqNode,
        'ELT':   WikiEltNode,
        'ENV':   WikiEnvNode,
        'LINK':  WikiSeqNode,
        'HDR':   WikiHdrNode
    }

    def __createWikiNode(self, **kwarg):
        """Create a tree node of the class appropriate for its type."""
        return self.token_class[kwarg['type']](self, **kwarg)

    def tokread(self):
        """Read the input line by line and generate a token stream.

        The stream always ends with a NIL token.
        """
        line = None
        pos = 0
        while 1:
            if (not line or pos == len(line)):
                try:
                    line = self.input()
                    pos = 0
                except StopIteration:
                    line = u''

            if not line or line == "":
                yield(self.__createWikiNode(type='NIL'))
                break

            if line == '\n':
                yield(self.__createWikiNode(type='NL'))
                line = None
                continue

            self.dprint(100, "LINE: %s", line[pos:])
            m = self.delim.search(line, pos)

            if m:
                if (pos < m.start(0)):
                    yield(self.__createWikiNode(type='TEXT',
                                                content=line[pos:m.start(0)]))
                    pos = m.start(0)

                if line[m.start(0)] == '<':
                    m = self.otag.match(line, pos)
                    if m:
                        pos = m.end(0)
                        if m.group('tag') == 'nowiki':
                            if not m.group('closed'):
                                # Copy everything up to the matching
                                # </nowiki> verbatim as text.
                                while 1:
                                    try:
                                        m = self.ctag.search(line, pos)
                                        if m and m.group('tag') == 'nowiki':
                                            yield(self.__createWikiNode(
                                                type='TEXT',
                                                content=line[pos:m.start(0)]))
                                            pos = m.end(0)
                                            break

                                        yield(self.__createWikiNode(
                                            type='TEXT',
                                            content=line[pos:]))

                                        line = self.input()
                                        pos = 0
                                    except StopIteration:
                                        break
                            continue
                        elif m.group('tag') in self.tags:
                            try:
                                yield(self.__createWikiNode(
                                    type='OTAG',
                                    tag=m.group('tag'),
                                    isblock=(line[pos] == '\n'),
                                    args=TagAttributes(m.group('args'))))
                                if m.group('closed'):
                                    yield(self.__createWikiNode(
                                        type='CTAG',
                                        tag=m.group('tag')))
                            except TagAttributeSyntax:
                                # Malformed attributes: emit the whole
                                # tag as plain text.
                                yield(self.__createWikiNode(
                                    type='TEXT',
                                    content=m.group(0)))
                            continue
                        else:
                            yield(self.__createWikiNode(type='TEXT',
                                                        content=m.group(0)))
                            continue
                    else:
                        m = self.ctag.match(line, pos)
                        if m:
                            if m.group('tag') in self.tags:
                                yield(self.__createWikiNode(
                                    type='CTAG',
                                    tag=m.group('tag')))
                                pos = m.end(0)
                                continue
                        else:
                            # A lone '<': emit it as text.
                            yield(self.__createWikiNode(
                                type='TEXT',
                                content=line[pos:pos+1]))
                            pos += 1
                            continue
                else:
                    pos = m.end(0)
                    content = m.group(0)
                    if content[0] in self.envtypes:
                        node = self.__createWikiNode(
                            type='DELIM',
                            content=content,
                            isblock=True,
                            continuation=pos < len(line) and line[pos] == ":")
                        if node.continuation:
                            node.content += node.content[0]
                            pos += 1
                        yield(node)
                        # Skip whitespace following the marker.
                        while pos < len(line) and line[pos] in [' ', '\t']:
                            pos += 1
                    else:
                        yield(self.__createWikiNode(
                            type='DELIM',
                            isblock=(content.strip() not in self.inline_delims),
                            content=content.strip()))
                    continue

            if line:
                if line[-1] == '\n':
                    if line[pos:-1] != '':
                        yield(self.__createWikiNode(type='TEXT',
                                                    content=line[pos:-1]))
                    yield(self.__createWikiNode(type='NL'))
                else:
                    yield(self.__createWikiNode(type='TEXT',
                                                content=line[pos:]))
                line = None

    def input(self):
        """Return the next line of input.  Derived classes override this."""
        return None

    def swaptkn(self, i, j):
        """Swap the tokens at indices i and j in toklist."""
        self.dprint(80, "SWAPPING %s <-> %s", i, j)
        x = self.toklist[i]
        self.toklist[i] = self.toklist[j]
        self.toklist[j] = x

    def tokenize(self):
        """Split the whole input into tokens, storing them in toklist."""
        self.toklist = []
        for tok in self.tokread():
            self.dprint(100, "TOK: %s", tok)
            self.toklist.append(tok)
        # Determine and fix up the ordering of bold and italic markers
        # There are three possible cases:
        #
        # 1a. '''a b ''c'' d'''
        # 1b. ''a b '''c''' d''
        #
        # 2a. '''''a b'' c d'''
        # 2b. '''''a b''' c d''
        #
        # 3a. '''a b ''c d'''''
        # 3b. ''a b '''c d'''''
        stack = []
        for i in range(0, len(self.toklist)):
            if self.toklist[i].type == 'DELIM' \
               and (self.toklist[i].content == "''" \
                    or self.toklist[i].content == "'''"):
                if len(stack) > 0:
                    if self.toklist[stack[-1]].content == self.toklist[i].content:
                        # Case 1: just pop the matching delimiter off
                        # the stack
                        stack.pop()
                    elif len(stack) == 2 and stack[-2] + 1 == stack[-1]:
                        # Case 2: swap delimiters saved on stack ...
                        self.swaptkn(stack[-2], stack[-1])
                        # and pop off the matching one
                        stack.pop()
                    elif i + 1 < len(self.toklist) \
                         and self.toklist[i+1].type == 'DELIM' \
                         and self.toklist[stack[-1]].content == self.toklist[i+1].content:
                        # Case 3: swap current and next tokens (the
                        # guard above was "i < len(self.toklist)",
                        # which is always true and let toklist[i+1]
                        # overrun the list)
                        self.swaptkn(i, i+1)
                        # and pop off the matching one
                        stack.pop()
                    else:
                        # Push the token on stack
                        stack.append(i)
                else:
                    # Push the token on stack
                    stack.append(i)
        # Redefine all non-matched tokens as TEXT
        for i in stack:
            self.toklist[i].type = 'TEXT' # FIXME

    # Stack of saved token positions used for backtracking.
    # NOTE(review): class-level mutable, shared between instances that
    # do not assign their own; push/pop calls are balanced during a
    # normal parse, so this only matters after an exception.
    mark = []

    def push_mark(self):
        """Remember the current token position for backtracking."""
        self.mark.append(self.tokind)

    def pop_mark(self):
        """Return to the most recently remembered token position."""
        self.tokind = self.mark.pop()

    def clear_mark(self):
        """Forget the most recently remembered token position."""
        self.mark.pop()

    def lookahead(self, off=0):
        """Return the token at offset OFF without consuming it."""
        tok = self.toklist[self.tokind+off]
        self.dprint(20, "lookahead(%s): %s", off, tok)
        return tok

    def setkn(self, val):
        """Replace the current token with VAL."""
        self.toklist[self.tokind] = val

    def getkn(self):
        """Return the next token, advancing the token pointer."""
        self.newline = self.tokind == 0 or self.toklist[self.tokind-1].type == 'NL'
        if self.tokind == len(self.toklist):
            return self.__createWikiNode(type='NIL')
        tok = self.toklist[self.tokind]
        self.tokind = self.tokind + 1
        self.dprint(20, "getkn: %s", tok)
        return tok

    def ungetkn(self, tok=None):
        """Push the last token back, optionally replacing it with TOK."""
        self.tokind = self.tokind - 1
        self.newline = self.tokind == 0 or self.toklist[self.tokind-1].type == 'NL'
        if tok:
            self.toklist[self.tokind] = tok
        self.dprint(20, "ungetkn: %s", tok)
        return self.toklist[self.tokind]

    def fixuptkn(self, tok):
        """Replace the most recently returned token with TOK."""
        if self.tokind == 0:
            raise IndexError('wikimarkup.fixuptkn called at start of input')
        self.toklist[self.tokind-1] = tok
        return tok

    def dump(self, tree, file=sys.stdout):
        """Write a textual dump of TREE to FILE (default stdout)."""
        for node in tree:
            file.write(str(node))
            file.write('\n')

    def is_block_end(self, tok):
        """Return True if TOK ends the current block."""
        if tok.type == 'NIL':
            return True
        elif tok.type == 'NL':
            if self.lookahead().type == 'NIL':
                return True
            elif self.lookahead().type == 'NL':
                # An empty line ends the block; consume the second NL.
                self.getkn()
                return True
        elif tok.type in ['DELIM', 'CTAG', 'TAG']:
            if tok.isblock:
                self.ungetkn(tok)
                return True
        return False

    def parse_para(self, tok):
        """Parse a paragraph (PARA) or preformatted block (PRE)."""
        self.dprint(80, "ENTER parse_para: %s", tok)
        acc = { 'seq': [],
                'textlist': [] }

        def flush():
            # Collapse accumulated text fragments into one TEXT node.
            if acc['textlist']:
                acc['seq'].append(self.__createWikiNode(
                    type='TEXT',
                    content=''.join(acc['textlist'])))
                acc['textlist'] = []

        # Leading whitespace selects preformatted mode; rx matches a
        # line that switches back to the other mode.
        if isinstance(tok, WikiContentNode) \
           and isinstance(tok.content, str) \
           and re.match("^[ \t]", tok.content):
            type = 'PRE'
            rx = re.compile(r"^\S")
        else:
            type = 'PARA'
            rx = re.compile("^[ \t]")

        while not self.is_block_end(tok):
            if tok.type == 'TEXT':
                if rx and self.newline and rx.match(tok.content):
                    self.ungetkn()
                    break
                acc['textlist'].append(tok.content)
            elif tok.type == 'NL':
                acc['textlist'].append('\n')
            elif tok.type == 'OTAG':
                flush()
                acc['seq'].append(self.parse_tag(tok))
            elif tok.type == 'DELIM':
                flush()
                acc['seq'].append(self.parse_inline_delim(tok))
            else:
                raise UnexpectedToken(tok)
            tok = self.getkn()

        flush()
        if acc['seq']:
            tok = self.__createWikiNode(type=type, content=acc['seq'])
        else:
            tok = None
        self.dprint(80, "LEAVE parse_para=%s", tok)
        return tok

    def parse_block_delim(self, tok):
        """Parse a block-level construct started by delimiter TOK.

        Return the resulting node, or None if TOK does not actually
        start a block (the token is then pushed back).
        """
        self.dprint(80, "ENTER parse_block_delim")
        assert(tok.type == 'DELIM')
        if tok.content == "----":
            node = self.__createWikiNode(type='BAR')
        elif tok.content[0:2] == "==":
            node = self.parse_header(tok)
            if not node:
                tok = self.ungetkn(self.__createWikiNode(type='TEXT',
                                                         content=tok.content))
        elif tok.content[0] in self.envtypes:
            node = None
            if tok.content[0] == ':':
                # A ':' right after a ';' is part of a definition,
                # not an indent.
                t = self.lookahead(-2)
                if not (t.type == 'DELIM' and t.content == ';'):
                    node = self.parse_indent(tok)
            if not node:
                node = self.parse_env(tok)
        else:
            self.ungetkn(tok)
            node = None
        self.dprint(80, "LEAVE parse_block_delim=%s", node)
        return node

    def parse_line(self):
        """Parse tokens up to the end of line into a SEQ node."""
        self.dprint(80, "ENTER parse_line")
        list = []
        while True:
            tok = self.getkn()
            if tok.type == 'NL' or tok.type == 'NIL':
                break
            elif tok.type == 'TEXT':
                list.append(tok)
            elif tok.type == 'DELIM':
                if tok.isblock:
                    tok = self.__createWikiNode(type='TEXT',
                                                content=tok.content)
                    self.fixuptkn(tok)
                    list.append(tok)
                elif tok.content[0] == ":":
                    # FIXME
                    list.append(self.parse_indent(tok))
                    break
                else:
                    x = self.parse_inline_delim(tok)
                    if x:
                        list.append(x)
                    else:
                        list.append(self.fixuptkn(
                            self.__createWikiNode(type='TEXT',
                                                  content=tok.content)))
            elif tok.type == 'OTAG':
                if tok.isblock:
                    self.ungetkn()
                    break
                list.append(self.parse_tag(tok))
            else:
                list.append(tok)
        ret = self.__createWikiNode(type='SEQ', content=list)
        self.dprint(80, "LEAVE parse_line=%s", ret)
        return ret

    def parse_indent(self, tok):
        """Parse an indented (':') line into an IND node."""
        lev = len(tok.content)
        self.dprint(80, "ENTER parse_indent(%s)", lev)
        x = self.__createWikiNode(type='IND', level=lev,
                                  content=self.parse_line())
        self.dprint(80, "LEAVE parse_indent=%s", x)
        return x

    def parse_fontmod(self, delim, what):
        """Parse an italic ('') or bold (''') span terminated by DELIM.

        WHAT is the resulting node type ('IT' or 'BOLD').  Return None
        on failure, leaving recovery to the caller.
        """
        self.dprint(80, "ENTER parse_fontmod(%s,%s), tok %s",
                    delim, what, self.lookahead())
        seq = []
        text = ''
        while True:
            tok = self.getkn()
            if tok.type == 'TEXT':
                text += tok.content
            elif self.is_block_end(tok):
                self.dprint(80, "LEAVE parse_fontmod=%s", "None")
                return None
            elif tok.type == 'DELIM':
                # self.dprint(80, "got %s, want %s", tok.content, delim)
                if tok.content == delim:
                    break
                else:
                    if text:
                        seq.append(self.__createWikiNode(type='TEXT',
                                                         content=text))
                        text = ''
                    x = self.parse_inline_delim(tok)
                    if x:
                        seq.append(x)
                    else:
                        self.dprint(80, "LEAVE parse_fontmod=%s", "None")
                        return None
            elif tok.type == 'NL':
                seq.append(self.__createWikiNode(type='TEXT', content='\n'))
            else:
                self.dprint(80, "LEAVE parse_fontmod=None")
                return None
        if text:
            seq.append(self.__createWikiNode(type='TEXT', content=text))
        res = self.__createWikiNode(type=what, content=seq)
        self.dprint(80, "LEAVE parse_fontmod=%s", res)
        return res

    def parse_ref(self):
        """Parse an external reference: [URL optional caption]."""
        self.dprint(80, "ENTER parse_ref")
        tok = self.getkn()
        if not (tok.type == 'TEXT' and self.refstart.match(tok.content)):
            self.dprint(80, "LEAVE parse_ref=None")
            return None

        seq = []
        (ref, sep, text) = tok.content.partition(' ')
        if text:
            seq.insert(0, self.__createWikiNode(type='TEXT', content=text))

        while True:
            tok = self.getkn()
            if tok.type == 'NIL':
                self.dprint(80, "LEAVE parse_ref=None")
                return None
            elif self.is_block_end(tok):
                self.dprint(80, "LEAVE parse_ref=None")
                return None
            elif tok.type == 'DELIM':
                if tok.content == ']':
                    break
                else:
                    tok = self.parse_inline_delim(tok)
                    if tok:
                        seq.append(tok)
                    else:
                        self.dprint(80, "LEAVE parse_ref=None")
                        return None
            elif tok.type == 'OTAG':
                # Was "list.append(...)": 'list' is the builtin here;
                # the parsed tag belongs in seq.
                seq.append(self.parse_tag(tok))
            else:
                seq.append(tok)

        ret = self.__createWikiNode(
            type='REF',
            ref=ref,
            content=self.__createWikiNode(type='SEQ', content=seq))
        self.dprint(80, "LEAVE parse_ref= %s", ret)
        return ret

    def parse_link(self, type, delim):
        """Parse a link ([[...]]) or template ({{...}}) up to DELIM.

        TYPE is the resulting node type ('LINK' or 'TMPL').  '|'
        separates the parts.  Return None on failure.
        """
        self.dprint(80, "ENTER parse_link(%s,%s)", type, delim)
        subtree = []
        list = []
        while True:
            tok = self.getkn()
            if tok.type == 'NIL':
                self.dprint(80, "LEAVE parse_link=None [EOF]")
                return None
            if tok.type == 'DELIM':
                if tok.content == delim:
                    if list:
                        subtree.append(self.__createWikiNode(type='SEQ',
                                                             content=list))
                    break
                elif tok.content == "|":
                    if len(list) > 1:
                        subtree.append(self.__createWikiNode(type='SEQ',
                                                             content=list))
                    elif list:
                        subtree.append(list[0])
                    list = []
                else:
                    x = self.parse_inline_delim(tok)
                    if x:
                        list.append(x)
                    else:
                        self.dprint(80, "LEAVE parse_link=None [bad inline]")
                        return None
            elif tok.type == 'TEXT':
                list.append(tok)
            else:
                self.dprint(80, "LEAVE parse_link=None [unexpected token]")
                return None
        ret = self.__createWikiNode(type=type, content=subtree)
        self.dprint(80, "LEAVE parse_link=%s", ret)
        return ret

    def parse_inline_delim(self, tok):
        """Parse an inline construct introduced by delimiter TOK.

        On failure, convert TOK to text and neutralize its matching
        closing delimiter further down the token list, so a later
        parse does not trip over it.
        """
        self.dprint(80, "ENTER parse_inline_delim")
        assert(tok.type == 'DELIM')
        self.push_mark()
        if tok.content == "''":
            x = self.parse_fontmod(tok.content, 'IT')
        elif tok.content == "'''":
            x = self.parse_fontmod(tok.content, 'BOLD')
        elif tok.content == "[":
            x = self.parse_ref()
        elif tok.content == "[[":
            x = self.parse_link('LINK', "]]")
        elif tok.content == "{{":
            x = self.parse_link('TMPL', "}}")
        else:
            x = None

        if x:
            self.clear_mark()
        else:
            self.dprint(80, "BEGIN DELIMITER RECOVERY: %s", tok)
            self.pop_mark()
            x = self.fixuptkn(self.__createWikiNode(type='TEXT',
                                                    content=tok.content))
            od = tok.content
            if od in self.close_delim:
                cd = self.close_delim[od]
                lev = 0
                for i, tok in enumerate(self.toklist[self.tokind+1:]):
                    if tok.type == 'NIL':
                        break
                    elif tok.type == 'DELIM':
                        if tok.content == od:
                            lev += 1
                        elif tok.content == cd:
                            if lev == 0:
                                # Found the mate: neutralize it.
                                tok = self.__createWikiNode(
                                    type='TEXT',
                                    content=tok.content)
                                self.toklist[self.tokind+1+i] = tok
                                break
                            # Closes a nested opening delimiter.
                            lev -= 1
            self.dprint(80, "END DELIMITER RECOVERY: %s", tok)

        self.dprint(80, "LEAVE parse_inline_delim=%s", x)
        return x

    def parse_tag(self, tag):
        """Parse the content of tag TAG up to its closing tag."""
        self.dprint(80, "ENTER parse_tag")
        list = []
        self.push_mark()
        while True:
            tok = self.getkn()
            if tok.type == 'NIL':
                # EOF before the closing tag: re-emit the opening tag
                # as text and retry from the saved position.
                self.pop_mark()
                s = '<' + tag.tag
                if tag.args:
                    s += ' ' + str(tag.args)
                s += '>'
                node = self.__createWikiNode(type='TEXT', content=s)
                if tag.content:
                    self.tree[self.tokind:self.tokind] = tag.content
                self.dprint(80, "LEAVE parse_tag = %s (tree modified)", node)
                return node
            elif tok.type == 'DELIM':
                if tok.isblock:
                    tok = self.parse_block_delim(tok)
                else:
                    tok = self.parse_inline_delim(tok)
                if not tok:
                    tok = self.getkn()
            elif tok.type == 'CTAG':
                if tag.tag == tok.tag:
                    break
                # A stray closing tag: turn it into text (the leading
                # '</' had been lost from this literal).
                s = '</' + tag.tag + '>'
                tok = self.fixuptkn(self.__createWikiNode(type='TEXT',
                                                          content=s))
            elif tok.type == 'NL':
                tok = self.__createWikiNode(type='TEXT', content='\n')
            list.append(tok)
        self.clear_mark()
        ret = self.__createWikiNode(
            type='TAG',
            tag=tag.tag,
            args=tag.args,
            isblock=tag.isblock,
            content=self.__createWikiNode(type='SEQ', content=list))
        self.dprint(80, "LEAVE parse_tag = %s", ret)
        return ret

    def parse_env(self, tok):
        """Parse a list environment (numbered, unnumbered or defn)."""
        type = self.envtypes[tok.content[0]][0]
        lev = len(tok.content)
        self.dprint(80, "ENTER parse_env(%s,%s)", type, lev)
        list = []
        while True:
            if tok.type == 'DELIM' \
               and tok.content[0] in self.envtypes \
               and type == self.envtypes[tok.content[0]][0]:
                if len(tok.content) < lev:
                    # End of this nesting level.
                    self.ungetkn()
                    break
                elif len(tok.content) > lev:
                    # Deeper nesting: parse recursively.
                    elt = self.parse_env(tok)
                else:
                    elt = self.parse_line()
                    if not tok.continuation:
                        list.append(self.__createWikiNode(
                            type='ELT',
                            subtype=self.envtypes[tok.content[0]][1],
                            content=elt))
                        tok = self.getkn()
                        continue
                # Attach a continuation or sub-environment to the
                # previous element.
                if list:
                    if list[-1].content.type != 'SEQ':
                        x = list[-1].content.content
                        # FIXME:
                        list[-1].content = self.__createWikiNode(
                            type='SEQ', content=[x])
                    list[-1].content.content.append(elt)
            else:
                self.ungetkn()
                break
            tok = self.getkn()
        ret = self.__createWikiNode(type='ENV',
                                    envtype=type,
                                    level=lev,
                                    content=list)
        self.dprint(80, "LEAVE parse_env=%s", ret)
        return ret

    def parse_header(self, tok):
        """Parse a == header ==; return None if it is malformed."""
        self.dprint(80, "ENTER parse_header")
        self.push_mark()
        list = []
        delim = tok.content
        while True:
            tok = self.getkn()
            if tok.type == 'NL':
                self.pop_mark()
                self.dprint(80, "LEAVE parse_header=None")
                return None
            elif tok.type == 'TEXT':
                list.append(tok)
            elif tok.type == 'DELIM':
                if tok.content == delim:
                    # The closing delimiter must end the line.
                    if self.lookahead().type == 'NL':
                        self.getkn()
                        if self.lookahead().type == 'NL':
                            self.getkn()
                        break
                    else:
                        self.pop_mark()
                        self.dprint(80, "LEAVE parse_header=None")
                        return None
                elif tok.isblock:
                    self.pop_mark()
                    self.dprint(80, "LEAVE parse_header=None")
                    return None
                else:
                    list.append(self.parse_inline_delim(tok))
            elif tok.type == 'OTAG':
                if tok.isblock:
                    self.pop_mark()
                    self.dprint(80, "LEAVE parse_header=None")
                    return None
                list.append(self.parse_tag(tok))
        self.clear_mark()
        ret = self.__createWikiNode(
            type='HDR',
            level=len(delim),
            content=self.__createWikiNode(type='SEQ', content=list))
        self.dprint(80, "LEAVE parse_header=%s", ret)
        return ret

    def parse_block(self):
        """Parse the next block; return its node or None at EOF."""
        tok = self.getkn()
        while tok.type == 'NL':
            tok = self.getkn()
        if tok is None or tok.type == 'NIL':
            return None
        elif tok.type == 'DELIM':
            tok = self.parse_block_delim(tok)
            if tok:
                return tok
            else:
                tok = self.getkn()
        elif tok.type == 'OTAG' and tok.isblock:
            return self.parse_tag(tok)
        return self.parse_para(tok)

    def parse(self):
        """Parse the whole input, leaving the result in self.tree."""
        if not self.toklist:
            self.tokenize()
        if self.debug_level >= 90:
            print("TOKEN DUMP BEGIN")
            self.dump(self.toklist)
            print("TOKEN DUMP END")
        self.tokind = 0
        self.tree = []
        while 1:
            subtree = self.parse_block()
            if subtree == None:
                break
            self.tree.append(subtree)
        if self.debug_level >= 70:
            print("TREE DUMP BEGIN")
            self.dump(self.tree)
            print("TREE DUMP END")

    def __str__(self):
        return str(self.tree)
class WikiMarkup (BaseWikiMarkup):
    """
    A derived class, that supplies a basic input method.

    Three types of inputs are available:

    1. filename=<name>
       The file <name> is opened and used for input.
    2. file=<file>
       The already opened file <file> is used for input.
    3. text=<string>
       Input is taken from <string>, line by line.

    Usage:

      obj = WikiMarkup(arg=val)
      obj.parse()
      ... Do whatever you need with obj.tree ...
    """
    file = None
    text = None
    lang = 'en'
    html_base = 'http://%(lang)s.wiktionary.org/wiki/'
    image_base = 'http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf'
    media_base = 'http://www.mediawiki.org/xml/export-0.3'

    def __init__(self, *args, **keywords):
        """Initialize the object from the keyword arguments listed
        in the class docstring.  Unknown keywords are ignored."""
        for kw in keywords:
            if kw == 'file':
                self.file = keywords[kw]
            elif kw == 'filename':
                self.file = open(keywords[kw])
            elif kw == 'text':
                self.text = keywords[kw].split("\n")
            elif kw == 'lang':
                self.lang = keywords[kw]
            elif kw == 'html_base':
                self.html_base = keywords[kw]
            elif kw == 'image_base':
                self.image_base = keywords[kw]
            elif kw == 'media_base':
                self.media_base = keywords[kw]

    def __del__(self):
        # NOTE(review): this also closes files passed in via file=,
        # not only ones opened here via filename=.
        if self.file:
            self.file.close()

    def input(self):
        """Return the next line of input, or None when exhausted."""
        if self.file:
            return self.file.readline()
        elif self.text:
            return self.text.pop(0) + '\n'
        else:
            return None

    def is_lang_link(self, elt):
        """Return True if ELT is an interlanguage link ([[xx:...]])."""
        if elt.type == 'LINK' \
           and isinstance(elt.content, list) \
           and len(elt.content) == 1:
            # Was "== TEXT": a NameError; node types are strings.
            if elt.content[0].type == 'TEXT':
                m = re.match(r'([\w-]+):', elt.content[0].content)
                if m: # and m.group(1) in self.langtab:
                    return True
            elif elt.content[0].type == 'SEQ' \
                 and len(elt.content[0].content) == 1 and \
                 elt.content[0].content[0].type == 'TEXT':
                m = re.match(r'([\w-]+):',
                             elt.content[0].content[0].content)
                if m: # and m.group(1) in self.langtab:
                    return True
        return False

    def is_empty_text(self, elt):
        """Return True if ELT is a TEXT node with no word characters."""
        if elt.type == 'TEXT':
            if re.search(r'\w', elt.content):
                return False
            return True
        return False

    def is_empty_para(self, seq):
        """Return True if SEQ contains only language links and empty text."""
        for x in seq:
            if not (self.is_lang_link(x) or self.is_empty_text(x)):
                return False
        return True

    # ISO 639
    langtab = {
        "aa": "Afar",            # Afar
        "ab": "Аҧсуа",           # Abkhazian
        "ae": None,              # Avestan
        "af": "Afrikaans",       # Afrikaans
        "ak": "Akana",           # Akan
        "als": "Alemannisch",
        "am": "አማርኛ",            # Amharic
        "an": "Aragonés",        # Aragonese
        "ang": "Englisc",
        "ar": "العربية" ,        # Arabic
        "arc": "ܐܪܡܝܐ",
        "as": "অসমীয়া",          # Assamese
        "ast": "Asturian",
        "av": "Авар",            # Avaric
        "ay": "Aymara",          # Aymara
        "az": "Azərbaycan" ,     # Azerbaijani
        "ba": "Башҡорт",         # Bashkir
        "bar": "Boarisch",
        "bat-smg": "Žemaitėška",
        "bcl": "Bikol",
        "be": "Беларуская",      # Byelorussian; Belarusian
        "be-x-old": "Беларуская (тарашкевіца)",
        "bg": "Български",       # Bulgarian
        "bh": "भोजपुरी",           # Bihari
        "bi": "Bislama",         # Bislama
        "bm": "Bamanankan",      # Bambara
        "bn": "বাংলা" ,           # Bengali; Bangla
        "bo": "བོད་སྐད",            # Tibetan
        "bpy": "ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী" ,
        "br": "Brezhoneg" ,      # Breton
        "bs": "Bosanski" ,       # Bosnian
        "bug": "Basa Ugi",
        "bxr": "Буряад",
        "ca": "Català" ,         # Catalan
        "cbk-zam": "Chavacano de Zamboanga",
        "cdo": "Mìng-dĕ̤ng-ngṳ̄",
        "cho": "Choctaw",
        "ce": "Нохчийн",         # Chechen
        "ceb": "Sinugboanong Binisaya" , # Cebuano
        "ch": "Chamor",          # Chamorro
        "chr": "ᏣᎳᎩ",
        "chy": "Tsetsêhestâhese",
        "co": "Cors",            # Corsican
        "cr": "Nehiyaw",         # Cree
        "crh": "Qırımtatarca",
        "cs": "Česky" ,          # Czech
        "csb": "Kaszëbsczi",
        "c": "Словѣньскъ",       # Church Slavic
        "cv": "Чăваш",           # Chuvash
        "cy": "Cymraeg" ,        # Welsh
        "da": "Dansk" ,          # Danish
        "de": "Deutsch" ,        # German
        "diq": "Zazaki",         # Dimli (Southern Zazaki)
        "dsb": "Dolnoserbski",
        "dv": "ދިވެހިބަސް",          # Divehi
        "dz": "ཇོང་ཁ",             # Dzongkha; Bhutani
        "ee": "Eʋegbe",          # Ewe
        "el": "Ελληνικά" ,       # Greek
        "eml": "Emiliàn e rumagnòl",
        "en": "English" ,        # English
        "eo": "Esperanto" ,
        "es": "Español" ,        # Spanish
        "et": "Eesti" ,          # Estonian
        "eu": "Euskara" ,        # Basque
        "ext": "Estremeñ",
        "fa": "فارسی" ,          # Persian
        "ff": "Fulfulde",        # Fulah
        "fi": "Suomi" ,          # Finnish
        "fiu-vro": "Võro",
        "fj": "Na Vosa Vakaviti",# Fijian; Fiji
        "fo": "Føroyskt" ,       # Faroese
        "fr": "Français" ,       # French
        "frp": "Arpitan",
        "fur": "Furlan",
        "fy": "Frysk",           # Frisian
        "ga": "Gaeilge",         # Irish
        "gan": "贛語 (Gànyŭ)",
        "gd": "Gàidhlig",        # Scots; Gaelic
        "gl": "Gallego" ,        # Gallegan; Galician
        "glk": "گیلکی",
        "got": "𐌲𐌿𐍄𐌹𐍃𐌺𐍉𐍂𐌰𐌶𐌳𐌰",
        "gn": "Avañe'ẽ",         # Guarani
        "g": "ગુજરાતી",            # Gujarati
        "gv": "Gaelg",           # Manx
        "ha": "هَوُسَ",             # Hausa
        "hak": "Hak-kâ-fa / 客家話",
        "haw": "Hawai`i",
        "he": "עברית" ,          # Hebrew (formerly iw)
        "hi": "हिन्दी" ,           # Hindi
        "hif": "Fiji Hindi",
        "ho": "Hiri Mot",        # Hiri Motu
        "hr": "Hrvatski" ,       # Croatian
        "hsb": "Hornjoserbsce",
        "ht": "Krèyol ayisyen" , # Haitian; Haitian Creole
        "hu": "Magyar" ,         # Hungarian
        "hy": "Հայերեն",         # Armenian
        "hz": "Otsiherero",      # Herero
        "ia": "Interlingua",
        "ie": "Interlingue",
        "id": "Bahasa Indonesia",# Indonesian (formerly in)
        "ig": "Igbo",            # Igbo
        "ii": "ꆇꉙ ",             # Sichuan Yi
        "ik": "Iñupiak",         # Inupiak
        "ilo": "Ilokano",
        "io": "Ido" ,
        "is": "Íslenska" ,       # Icelandic
        "it": "Italiano" ,       # Italian
        "i": "ᐃᓄᒃᑎᑐᑦ",           # Inuktitut
        "ja": "日本語",          # Japanese
        "jbo": "Lojban",
        "jv": "Basa Jawa",       # Javanese
        "ka": "ქართული" ,        # Georgian
        "kaa": "Qaraqalpaqsha",
        "kab": "Taqbaylit",
        "kg": "KiKongo",         # Kongo
        "ki": "Gĩkũyũ",          # Kikuyu
        "kj": "Kuanyama",        # Kuanyama
        "kk": "Қазақша",         # Kazakh
        "kl": "Kalaallisut",     # Kalaallisut; Greenlandic
        "km": "ភាសាខ្មែរ",          # Khmer; Cambodian
        "kn": "ಕನ್ನಡ",            # Kannada
        "ko": "한국어" ,         # Korean
        "kr": "Kanuri",          # Kanuri
        "ks": "कश्मीरी / كشميري",   # Kashmiri
        "ksh": "Ripoarisch",
        "ku": "Kurdî / كوردی",   # Kurdish
        "kv": "Коми",            # Komi
        "kw": "Kernewek/Karnuack", # Cornish
        "ky": "Кыргызча",        # Kirghiz
        "la": "Latina" ,         # Latin
        "lad": "Dzhudezmo",
        "lb": "Lëtzebuergesch" , # Letzeburgesch
        "lbe": "Лакку",
        "lg": "Luganda",         # Ganda
        "li": "Limburgs",        # Limburgish; Limburger; Limburgan
        "lij": "Lígur",
        "ln": "Lingala",         # Lingala
        "lmo": "Lumbaart",
        "lo": "ລາວ",             # Lao; Laotian
        "lt": "Lietuvių" ,       # Lithuanian
        "lua": "Luba",           # Luba
        "lv": "Latvieš" ,        # Latvian; Lettish
        "map-bms": "Basa Banyumasan",
        "mdf": "Мокшень (Mokshanj Kälj)",
        "mg": "Malagasy",        # Malagasy
        "mh": "Ebon",            # Marshall
        "mi": "Māori",           # Maori
        "mk": "Македонски" ,     # Macedonian
        "ml": None,              # Malayalam
        "mn": "Монгол",          # Mongolian
        "mo": "Молдовеняскэ",    # Moldavian
        "mr": "मराठी" ,           # Marathi
        "ms": "Bahasa Melay" ,   # Malay
        "mt": "Malti",           # Maltese
        "mus": "Muskogee",
        "my": "မ္ရန္မာစာ",           # Burmese
        "myv": "Эрзянь (Erzjanj Kelj)",
        "mzn": "مَزِروني",
        "na": "dorerin Naoero",  # Nauru
        "nah": "Nāhuatl",
        "nap": "Nnapulitano",
        "nb": "Norsk (Bokmål)",  # Norwegian Bokm@aa{}l
        "nd": None,              # Ndebele, North
        "nds": "Plattdüütsch",
        "nds-nl": "Nedersaksisch",
        "ne": "नेपाली",            # Nepali
        "new": "नेपाल भाषा" ,      # Nepal Bhasa
        "ng": "Oshiwambo",       # Ndonga
        "nl": "Nederlands" ,     # Dutch
        "nn": "Nynorsk",         # Norwegian Nynorsk
        "no": "Norsk (Bokmål)" , # Norwegian
        "nov": "Novial",
        "nr": None,              # Ndebele, South
        "nrm": "Nouormand/Normaund",
        "nv": "Diné bizaad",     # Navajo
        "ny": "Chi-Chewa",       # Chichewa; Nyanja
        "oc": "Occitan",         # Occitan; Proven@,{c}al
        "oj": None,              # Ojibwa
        "om": "Oromoo",          # (Afan) Oromo
        "or": "ଓଡ଼ିଆ",            # Oriya
        "os": "Иронау",          # Ossetian; Ossetic
        "pa": "ਪੰਜਾਬੀ" ,           # Panjabi; Punjabi
        "pag": "Pangasinan",
        "pam": "Kapampangan",
        "pap": "Papiament",
        "pdc": "Deitsch",
        "pi": "पाऴि",             # Pali
        "pih": "Norfuk",
        "pl": "Polski" ,         # Polish
        "pms": "Piemontèis" ,
        "ps": "پښتو",            # Pashto, Pushto
        "pt": "Português" ,      # Portuguese
        "q": "Runa Simi" ,       # Quechua
        "rm": "Rumantsch",       # Rhaeto-Romance
        "rmy": "romani - रोमानी",
        "rn": "Kirundi",         # Rundi; Kirundi
        "ro": "Română" ,         # Romanian
        "roa-rup": "Armãneashce",
        "roa-tara": "Tarandíne",
        "ru": "Русский" ,        # Russian
        "rw": "Ikinyarwanda",    # Kinyarwanda
        "sa": "संस्कृतम्",          # Sanskrit
        "sah": "Саха тыла (Saxa Tyla)",
        "sc": "Sardu",           # Sardinian
        "scn": "Sicilian",
        "sco": "Scots",
        "sd": "سنڌي، سندھی ، सिन्ध", # Sindhi
        "se": "Sámegiella",      # Northern Sami
        "sg": "Sängö",           # Sango; Sangro
        "sh": "Srpskohrvatski / Српскохрватски" ,
        "si": "සිංහල",
        "simple": "Simple English" ,
        "sk": "Slovenčina" ,     # Slovak
        "sl": "Slovenščina" ,    # Slovenian
        "sm": "Gagana Samoa",    # Samoan
        "sn": "chiShona",        # Shona
        "so": "Soomaaliga",      # Somali
        "sr": "Српски / Srpski", # Serbian
        "srn": "Sranantongo",
        "ss": "SiSwati",         # Swati; Siswati
        "st": "Sesotho",         # Sesotho; Sotho, Southern
        "stk": "Seeltersk",
        "s": "Basa Sunda",       # Sundanese
        "sq": "Shqip" ,          # Albanian
        "szl": "Ślůnski",
        "sv": "Svenska" ,        # Swedish
        "sw": "Kiswahili",       # Swahili
        "ta": "தமிழ்" ,           # Tamil
        "te": "తెలుగు" ,          # Telugu
        "tet": "Tetun",
        "tg": "Тоҷикӣ",          # Tajik
        "th": "ไทย" ,            # Thai
        "ti": "ትግርኛ",            # Tigrinya
        "tk": "تركمن / Туркмен", # Turkmen
        "tl": "Tagalog" ,        # Tagalog
        "tn": "Setswana",        # Tswana; Setswana
        "to": "faka Tonga",      # Tonga (?) # Also ZW ; MW
        "tokipona": "Tokipona",
        "tpi": "Tok Pisin",
        "tr": "Türkçe" ,         # Turkish
        "ts": "Xitsonga",        # Tsonga
        "tt": "Tatarça / Татарча", # Tatar
        "tum": "chiTumbuka",
        "tw": "Twi",             # Twi
        "ty": "Reo Mā`ohi",      # Tahitian
        "udm": "Удмурт кыл",
        "ug": "Oyghurque",       # Uighur
        "uk": "Українська" ,     # Ukrainian
        "ur": "اردو",            # Urdu
        "uz": "O‘zbek",          # Uzbek
        "ve": "Tshivenda",       # Venda
        "vec": "Vèneto",
        "vi": "Tiếng Việt" ,     # Vietnamese
        "vls": "West-Vlams",
        "vo": "Volapük" ,
        "wa": "Walon",           # Walloon
        "war": "Winaray",
        "wo": "Wolof",           # Wolof
        "w": "吴语",
        "xal": "Хальмг",
        "xh": "isiXhosa",        # Xhosa
        "yi": "ייִדיש",          # Yiddish
        "yo": "Yorùbá",          # Yoruba
        "za": "Cuengh",          # Zhuang
        "zea": "Zeêuws",
        "zh": "中文" ,           # Chinese
        "zh-classical": "古文 / 文言文",
        "zm-min-nan": "Bân-lâm-gú",
        "zh-yue": "粵語",
        "zu": "isiZulu"          # Zulu
    }