diff options
author | Sergey Poznyakoff <gray@gnu.org> | 2015-07-06 08:05:31 +0300 |
---|---|---|
committer | Sergey Poznyakoff <gray@gnu.org> | 2015-07-06 08:05:31 +0300 |
commit | f3378aebac7e89000ff097ac51c49b62eb6e9f08 (patch) | |
tree | cdf7a9b58b52cd6e995ddf63ef05526e60a918f1 /wikimarkup.py | |
parent | 7ab9949e2c038ee6a7215d91896f2b47a5e7c06d (diff) | |
download | wikitrans-f3378aebac7e89000ff097ac51c49b62eb6e9f08.tar.gz wikitrans-f3378aebac7e89000ff097ac51c49b62eb6e9f08.tar.bz2 |
Redo parse tree as a sequence of dictionaries, instead of arrays.
Diffstat (limited to 'wikimarkup.py')
-rw-r--r-- | wikimarkup.py | 257 |
1 file changed, 128 insertions, 129 deletions
diff --git a/wikimarkup.py b/wikimarkup.py index 060b7eb..09c48eb 100644 --- a/wikimarkup.py +++ b/wikimarkup.py @@ -20,31 +20,10 @@ import re from types import * __all__ = [ "BaseWikiMarkup", "WikiMarkup", - "NIL", "TEXT", "DELIM", "NL", "PARA", - "IT", "BOLD", "LINK", "TMPL", - "BAR", "HDR", "REF", "ENV", "IND", "SEQ", "envtypes" ] delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)") -NIL = 0 -TEXT = 1 -DELIM = 2 -NL = 3 - -PARA = 4 -IT = 5 -BOLD = 6 -LINK = 7 -TMPL = 8 -BAR = 9 -HDR = 10 -REF = 11 -ENV = 12 -IND = 13 -SEQ = 14 -ELT = 15 - # Environment types: envtypes = { "*": [ "unnumbered", 0 ], "#": [ "numbered", 0 ], @@ -77,12 +56,12 @@ class BaseWikiMarkup: if not line or line == "": self.dprint(100, "YIELD: NIL") - yield(NIL,) + yield({ 'type': 'NIL' }) break if line == '\n': self.dprint(100, "YIELD: NL") - yield(NL,line) + yield({ 'type': 'NL', 'content': line }) line = None continue @@ -92,26 +71,33 @@ class BaseWikiMarkup: if m: if (pos < m.start(0)): self.dprint(100, "YIELD: TEXT %s", line[pos:m.start(0)]) - yield(TEXT, line[pos:m.start(0)]) + yield({'type': 'TEXT', 'content': line[pos:m.start(0)]}) pos = m.end(0) if envtypes.has_key(m.group(0)[0]) and line[pos] == ":": # FIXME? self.dprint(100, "YIELD: DELIM %s, True", m.group(0)) - yield(DELIM, m.group(0), True) + # FIXME: What's "extra"? 
+ yield({ 'type': 'DELIM', + 'content': m.group(0), + 'extra': True }) pos += 1 else: self.dprint(100, "YIELD: DELIM %s", m.group(0)) - yield(DELIM, m.group(0)) + yield({ 'type': 'DELIM', + 'content': m.group(0) }) else: if line[-1] == '\n': self.dprint(100, "YIELD: TEXT %s", line[pos:-1]) if line[pos:-1] != '': - yield(TEXT, line[pos:-1]) + yield({ 'type': 'TEXT', + 'content': line[pos:-1] }) self.dprint(100, "YIELD: NL") - yield(NL,'\n') + yield({ 'type': 'NL', + 'content': '\n' }) else: self.dprint(100, "YIELD: TEXT %s", line[pos:]) - yield(TEXT, line[pos:]) + yield({ 'type': 'TEXT', + 'content': line[pos:] }) line = None def input(self): @@ -126,11 +112,11 @@ class BaseWikiMarkup: # '''''Door''' files kan ik niet op tijd komen.'' stack = [] for i in range(0,len(self.toklist)): - if self.toklist[i][0] == DELIM \ - and (self.toklist[i][1] == "''" \ - or self.toklist[i][1] == "'''"): + if self.toklist[i]['type'] == 'DELIM' \ + and (self.toklist[i]['content'] == "''" \ + or self.toklist[i]['content'] == "'''"): if len(stack) > 0 \ - and self.toklist[stack[-1]][1] == self.toklist[i][1]: + and self.toklist[stack[-1]]['content'] == self.toklist[i]['content']: stack.pop() elif len(stack) > 1: x = self.toklist[stack[-2]] @@ -148,7 +134,7 @@ class BaseWikiMarkup: def getkn(self): tok = self.toklist[self.tokind] - if tok[0] != NIL: + if tok['type'] != 'NIL': self.tokind = self.tokind + 1 return tok @@ -163,14 +149,14 @@ class BaseWikiMarkup: textlist = [] while 1: tok = self.getkn() - if tok[0] == TEXT: - textlist.append(tok[1]) - elif tok[0] == DELIM: - if tok[1] == delim: + if tok['type'] == 'TEXT': + textlist.append(tok['content']) + elif tok['type'] == 'DELIM': + if tok['content'] == delim: break elif self.is_inline_delim(tok): if textlist: - seq.append((TEXT, textlist)) + seq.append({ 'type': 'TEXT', 'content': textlist }) textlist = [] x = self.parse_inline(tok) if x: @@ -181,17 +167,17 @@ class BaseWikiMarkup: else: self.dprint(80, "LEAVE parse_fontmod=None") 
return None - elif tok[0] == NL: - if self.peektkn()[0] == NL: + elif tok['type'] == 'NL': + if self.peektkn()['type'] == 'NL': self.dprint(80, "LEAVE parse_fontmod=None") return None - seq.append((TEXT, '\n')) + seq.append({ 'type': 'TEXT', 'content': '\n' }) else: self.dprint(80, "LEAVE parse_fontmod=None") return None if textlist: - seq.append((TEXT, textlist)) - res = (what, seq) + seq.append({ 'type': 'TEXT', 'content': textlist }) + res = { 'type': what, 'content': seq } self.dprint(80, "LEAVE parse_fontmod=%s", res) return res @@ -202,14 +188,14 @@ class BaseWikiMarkup: list = [] while 1: tok = self.getkn() - if tok[0] == DELIM: - if tok[1] == delim: + if tok['type'] == 'DELIM': + if tok['content'] == delim: if list: - subtree.append((SEQ,list)) + subtree.append({ 'type': 'SEQ', 'content': list }) break - elif tok[1] == "|": + elif tok['content'] == "|": if len(list) > 1: - subtree.append((SEQ,list)) + subtree.append({ 'type': 'SEQ', 'content': list }) elif list: subtree.append(list[0]) list = [] @@ -220,21 +206,21 @@ class BaseWikiMarkup: else: self.dprint(80, "LEAVE parse_link=%s", "None") return None - elif tok[0] == TEXT: + elif tok['type'] == 'TEXT': list.append(tok) else: self.dprint(80, "LEAVE parse_link=%s", "None") return None self.dprint(80, "LEAVE parse_link=(%s,%s)", type, subtree) - return (type, subtree) + return { 'type': type, 'content': subtree } def parse_ref(self): self.dprint(80, "ENTER parse_ref, tok %s", self.peektkn()) list = [] while 1: tok = self.getkn() - if tok[0] == DELIM: - if tok[1] == "]": + if tok['type'] == 'DELIM': + if tok['content'] == "]": break else: x = self.parse_inline(tok) @@ -243,42 +229,45 @@ class BaseWikiMarkup: else: self.dprint(80, "LEAVE parse_ref=%s", "None") return None - elif tok[0] == TEXT: + elif tok['type'] == 'TEXT': list.append(tok) - elif tok[0] == NL: - list.append((TEXT, '\n')) + elif tok['type'] == 'NL': + list.append({ 'type': 'TEXT', 'content': '\n' }) continue else: self.dprint(80, "LEAVE 
parse_ref=%s", "None") return None - if len(list) == 0 or list[0][0] != TEXT: + if len(list) == 0 or list[0]['type'] != 'TEXT': self.dprint(80, "LEAVE parse_ref=%s", "None") return None - (ref,sep,text) = list[0][1].partition(' ') - ret = (REF, ref, (SEQ, [(TEXT, text)] + list[1:])) + (ref,sep,text) = list[0]['content'].partition(' ') + ret = { 'type': 'REF', + 'ref': ref, + 'content': { 'type': 'SEQ', + 'content': [{ 'type': 'TEXT', 'content': text }] + list[1:] } } self.dprint(80, "LEAVE parse_ref= %s", ret) return ret inline_delims = [ "''", "'''", "[", "[[", "{{" ] def is_inline_delim(self, tok): - return tok[0] == DELIM and tok[1] in self.inline_delims + return tok['type'] == 'DELIM' and tok['content'] in self.inline_delims def is_block_delim(self, tok): - return tok[0] == DELIM and tok[1] not in self.inline_delims + return tok['type'] == 'DELIM' and tok['content'] not in self.inline_delims def parse_inline(self, tok): self.dprint(80, "ENTER parse_inline(%s), tok %s", tok, self.peektkn()) tokind = self.tokind - if tok[1] == "''": - x = self.parse_fontmod(tok[1], IT) - elif tok[1] == "'''": - x = self.parse_fontmod(tok[1], BOLD) - elif tok[1] == "[": + if tok['content'] == "''": + x = self.parse_fontmod(tok['content'], 'IT') + elif tok['content'] == "'''": + x = self.parse_fontmod(tok['content'], 'BOLD') + elif tok['content'] == "[": x = self.parse_ref() - elif tok[1] == "[[": - x = self.parse_link(LINK, "]]") - elif tok[1] == "{{": - x = self.parse_link(TMPL, "}}") + elif tok['content'] == "[[": + x = self.parse_link('LINK', "]]") + elif tok['content'] == "{{": + x = self.parse_link('TMPL', "}}") else: # FIXME self.dprint(80, "LEAVE parse_inline=%s", "None") x = None @@ -293,23 +282,23 @@ class BaseWikiMarkup: textlist = [] while 1: tok = self.getkn() - if tok[0] == TEXT: - textlist.append(tok[1]) - elif tok[0] == NL: + if tok['type'] == 'TEXT': + textlist.append(tok['content']) + elif tok['type'] == 'NL': tok = self.getkn() - if tok[0] == NL or tok[0] == NIL: 
+ if tok['type'] == 'NL' or tok['type'] == 'NIL': break else: self.ungetkn() if self.is_block_delim(tok): break textlist.append('\n') - elif tok[0] == NIL: + elif tok['type'] == 'NIL': break - elif tok[0] == DELIM: + elif tok['type'] == 'DELIM': if self.is_inline_delim(tok): if textlist: - seq.append((TEXT, textlist)) + seq.append({ 'type': 'TEXT', 'content': textlist }) textlist = [] x = self.parse_inline(tok) if x: @@ -318,27 +307,27 @@ class BaseWikiMarkup: seq.append(tok) break else: - seq.append((TEXT,tok[1])) + seq.append({ 'type': 'TEXT', 'content': tok['content'] }) # self.ungetkn() break if textlist: - seq.append((TEXT, textlist)) + seq.append({ 'type': 'TEXT', 'content': textlist }) self.dprint(80, "LEAVE parse_para=%s", seq) - return (PARA, seq) + return { 'type': 'PARA', 'content': seq } def parse_header(self, delim): self.dprint(80, "ENTER parse_header(%s), tok %s", delim, self.peektkn()) list = [] while 1: tok = self.getkn() - if tok[0] == NIL: + if tok['type'] == 'NIL': self.dprint(80, "LEAVE parse_header=%s", "None") return None - elif tok[0] == TEXT: + elif tok['type'] == 'TEXT': list.append(tok) - elif tok[0] == DELIM: - if tok[1] == delim: - if self.peektkn()[0] == NL: + elif tok['type'] == 'DELIM': + if tok['content'] == delim: + if self.peektkn()['type'] == 'NL': break else: self.dprint(80, "LEAVE parse_header=%s", "None") @@ -354,7 +343,9 @@ class BaseWikiMarkup: self.dprint(80, "LEAVE parse_header=%s", "None") return None self.dprint(80, "LEAVE parse_header=(HDR, %s, (SEQ,%s))",len(delim)-1,list) - return (HDR,len(delim)-1,(SEQ,list)) + return { 'type': 'HDR', + 'level': len(delim)-1, + 'content': { 'type': 'SEQ', 'content': list } } def parse_line(self): @@ -362,12 +353,12 @@ class BaseWikiMarkup: list = [] while 1: tok = self.getkn() - if tok[0] == NL or tok[0] == NIL: + if tok['type'] == 'NL' or tok['type'] == 'NIL': break - elif tok[0] == TEXT: + elif tok['type'] == 'TEXT': list.append(tok) - elif tok[0] == DELIM and tok[1][0] == ":": - 
list.append(self.parse_indent(len(tok[1]))) + elif tok['type'] == 'DELIM' and tok['content'][0] == ":": + list.append(self.parse_indent(len(tok['content']))) break else: x = self.parse_inline(tok) @@ -376,67 +367,72 @@ class BaseWikiMarkup: else: list.append(tok) self.dprint(80, "LEAVE parse_line=(SEQ, %s)", list) - return (SEQ, list) + return { 'type': 'SEQ', 'content': list } def parse_env(self, type, lev): self.dprint(80, "ENTER parse_env(%s,%s), tok %s",type,lev,self.peektkn()) list = [] while 1: tok = self.getkn() - if tok[0] == DELIM and envtypes.has_key(tok[1][0]) and type == envtypes[tok[1][0]][0]: - if len(tok[1]) < lev: + if tok['type'] == 'DELIM' \ + and envtypes.has_key(tok['content'][0]) \ + and type == envtypes[tok['content'][0]][0]: + if len(tok['content']) < lev: self.ungetkn() break - elif len(tok[1]) > lev: + elif len(tok['content']) > lev: self.ungetkn() - elt = self.parse_env(type, len(tok[1])) + elt = self.parse_env(type, len(tok['content'])) else: elt = self.parse_line() - if len(tok) == 2: - list.append((ELT, envtypes[tok[1][0]][1], elt)) + if len(tok.keys()) == 2: + list.append({ 'type': 'ELT', + 'subtype': envtypes[tok['content'][0]][1], + 'content': elt }) continue - - if list[-1][2][0] != SEQ: - x = list[-1][2][1] - list[-1][2] = (SEQ, [x]) - list[-1][2][1].append(elt) + + if list[-1]['content']['type'] != 'SEQ': + x = list[-1]['content']['content'] + # FIXME: + list[-1]['content'] = { 'type': 'SEQ', 'content': [x] } + list[-1]['content']['content'].append(elt) else: self.ungetkn() break self.dprint(80, "LEAVE parse_env=(ENV, %s, %s, %s)", type, lev, list) - return (ENV, type, lev, list) + return { 'type': 'ENV', 'envtype': type, 'level': lev, 'content': list } def parse_indent(self, lev): self.dprint(80, "ENTER parse_indent(%s), tok %s", lev, self.peektkn()) - x = (IND, lev, self.parse_line()) + x = { 'type': 'IND', 'level': lev, 'content': self.parse_line() } self.dprint(80, "LEAVE parse_indent=%s", x) return x def parse0(self): tok = 
self.getkn() - toktype = tok[0] - if toktype == NIL: + toktype = tok['type'] + if toktype == 'NIL': return None - elif toktype == TEXT: + elif toktype == 'TEXT': self.ungetkn() return self.parse_para() - elif toktype == DELIM: - if tok[1] == "----": - return (BAR,) - elif tok[1][0:2] == "==": - return self.parse_header(tok[1]) - elif envtypes.has_key(tok[1][0]): - type = envtypes[tok[1][0]][0] - lev = len(tok[1]) + elif toktype == 'DELIM': + if tok['content'] == "----": + return { 'type': 'BAR' } + elif tok['content'][0:2] == "==": + return self.parse_header(tok['content']) + elif envtypes.has_key(tok['content'][0]): + type = envtypes[tok['content'][0]][0] + lev = len(tok['content']) self.ungetkn() return self.parse_env(type, lev) - elif tok[1][0] == ":": - return self.parse_indent(len(tok[1])) + elif tok['content'][0] == ":": + return self.parse_indent(len(tok['content'])) else: self.ungetkn() return self.parse_para() - elif toktype == NL: - return (TEXT, '\n') + elif toktype == 'NL': + return { 'type': 'TEXT', 'content': '\n' } # return self.parse0() def parse(self): @@ -513,25 +509,28 @@ class WikiMarkup (BaseWikiMarkup): return None def is_lang_link(self, elt): - if elt[0] == LINK and isinstance(elt[1],list) and len(elt[1]) == 1: - if elt[1][0][0] == TEXT: - m = re.match('([\w-]+):', elt[1][0][1]) + if elt['type'] == 'LINK' \ + and isinstance(elt['content'], list) \ + and len(elt['content']) == 1: + if elt['content'][0]['type'] == TEXT: + m = re.match('([\w-]+):', elt['content'][0]['content']) if m: # and m.group(1) in self.langtab: return True - elif elt[1][0][0] == SEQ and len(elt[1][0][1]) == 1 and\ - elt[1][0][1][0][0] == TEXT: - m = re.match('([\w-]+):',elt[1][0][1][0][1]) + elif elt['content'][0]['type'] == 'SEQ' \ + and len(elt['content'][0]['content']) == 1 and\ + elt['content'][0]['content'][0]['type'] == TEXT: + m = re.match('([\w-]+):',elt['content'][0]['content'][0]['content']) if m: # and m.group(1) in self.langtab: return True return False def 
is_empty_text(self, elt): - if elt[0] == TEXT: - if isinstance(elt[1],list): - for s in elt[1]: + if elt['type'] == 'TEXT': + if isinstance(elt['content'],list): + for s in elt['content']: if re.search('\w', s): return False - elif re.search('\w', elt[1]): + elif re.search('\w', elt['content']): return False return True return False |