summary | refs | log | tree | commit | diff
path: root/wikimarkup.py
diff options
context:
space:
mode:
author: Sergey Poznyakoff <gray@gnu.org> 2015-07-06 08:05:31 +0300
committer: Sergey Poznyakoff <gray@gnu.org> 2015-07-06 08:05:31 +0300
commit: f3378aebac7e89000ff097ac51c49b62eb6e9f08 (patch)
tree: cdf7a9b58b52cd6e995ddf63ef05526e60a918f1 /wikimarkup.py
parent: 7ab9949e2c038ee6a7215d91896f2b47a5e7c06d (diff)
download: wikitrans-f3378aebac7e89000ff097ac51c49b62eb6e9f08.tar.gz
wikitrans-f3378aebac7e89000ff097ac51c49b62eb6e9f08.tar.bz2
Redo parse tree as a sequence of dictionaries, instead of arrays.
Diffstat (limited to 'wikimarkup.py')
-rw-r--r-- wikimarkup.py 257
1 files changed, 128 insertions, 129 deletions
diff --git a/wikimarkup.py b/wikimarkup.py
index 060b7eb..09c48eb 100644
--- a/wikimarkup.py
+++ b/wikimarkup.py
@@ -20,31 +20,10 @@ import re
from types import *
__all__ = [ "BaseWikiMarkup", "WikiMarkup",
- "NIL", "TEXT", "DELIM", "NL", "PARA",
- "IT", "BOLD", "LINK", "TMPL",
- "BAR", "HDR", "REF", "ENV", "IND", "SEQ",
"envtypes" ]
delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)")
-NIL = 0
-TEXT = 1
-DELIM = 2
-NL = 3
-
-PARA = 4
-IT = 5
-BOLD = 6
-LINK = 7
-TMPL = 8
-BAR = 9
-HDR = 10
-REF = 11
-ENV = 12
-IND = 13
-SEQ = 14
-ELT = 15
-
# Environment types:
envtypes = { "*": [ "unnumbered", 0 ],
"#": [ "numbered", 0 ],
@@ -77,12 +56,12 @@ class BaseWikiMarkup:
if not line or line == "":
self.dprint(100, "YIELD: NIL")
- yield(NIL,)
+ yield({ 'type': 'NIL' })
break
if line == '\n':
self.dprint(100, "YIELD: NL")
- yield(NL,line)
+ yield({ 'type': 'NL', 'content': line })
line = None
continue
@@ -92,26 +71,33 @@ class BaseWikiMarkup:
if m:
if (pos < m.start(0)):
self.dprint(100, "YIELD: TEXT %s", line[pos:m.start(0)])
- yield(TEXT, line[pos:m.start(0)])
+ yield({'type': 'TEXT', 'content': line[pos:m.start(0)]})
pos = m.end(0)
if envtypes.has_key(m.group(0)[0]) and line[pos] == ":":
# FIXME?
self.dprint(100, "YIELD: DELIM %s, True", m.group(0))
- yield(DELIM, m.group(0), True)
+ # FIXME: What's "extra"?
+ yield({ 'type': 'DELIM',
+ 'content': m.group(0),
+ 'extra': True })
pos += 1
else:
self.dprint(100, "YIELD: DELIM %s", m.group(0))
- yield(DELIM, m.group(0))
+ yield({ 'type': 'DELIM',
+ 'content': m.group(0) })
else:
if line[-1] == '\n':
self.dprint(100, "YIELD: TEXT %s", line[pos:-1])
if line[pos:-1] != '':
- yield(TEXT, line[pos:-1])
+ yield({ 'type': 'TEXT',
+ 'content': line[pos:-1] })
self.dprint(100, "YIELD: NL")
- yield(NL,'\n')
+ yield({ 'type': 'NL',
+ 'content': '\n' })
else:
self.dprint(100, "YIELD: TEXT %s", line[pos:])
- yield(TEXT, line[pos:])
+ yield({ 'type': 'TEXT',
+ 'content': line[pos:] })
line = None
def input(self):
@@ -126,11 +112,11 @@ class BaseWikiMarkup:
# '''''Door''' files kan ik niet op tijd komen.''
stack = []
for i in range(0,len(self.toklist)):
- if self.toklist[i][0] == DELIM \
- and (self.toklist[i][1] == "''" \
- or self.toklist[i][1] == "'''"):
+ if self.toklist[i]['type'] == 'DELIM' \
+ and (self.toklist[i]['content'] == "''" \
+ or self.toklist[i]['content'] == "'''"):
if len(stack) > 0 \
- and self.toklist[stack[-1]][1] == self.toklist[i][1]:
+ and self.toklist[stack[-1]]['content'] == self.toklist[i]['content']:
stack.pop()
elif len(stack) > 1:
x = self.toklist[stack[-2]]
@@ -148,7 +134,7 @@ class BaseWikiMarkup:
def getkn(self):
tok = self.toklist[self.tokind]
- if tok[0] != NIL:
+ if tok['type'] != 'NIL':
self.tokind = self.tokind + 1
return tok
@@ -163,14 +149,14 @@ class BaseWikiMarkup:
textlist = []
while 1:
tok = self.getkn()
- if tok[0] == TEXT:
- textlist.append(tok[1])
- elif tok[0] == DELIM:
- if tok[1] == delim:
+ if tok['type'] == 'TEXT':
+ textlist.append(tok['content'])
+ elif tok['type'] == 'DELIM':
+ if tok['content'] == delim:
break
elif self.is_inline_delim(tok):
if textlist:
- seq.append((TEXT, textlist))
+ seq.append({ 'type': 'TEXT', 'content': textlist })
textlist = []
x = self.parse_inline(tok)
if x:
@@ -181,17 +167,17 @@ class BaseWikiMarkup:
else:
self.dprint(80, "LEAVE parse_fontmod=None")
return None
- elif tok[0] == NL:
- if self.peektkn()[0] == NL:
+ elif tok['type'] == 'NL':
+ if self.peektkn()['type'] == 'NL':
self.dprint(80, "LEAVE parse_fontmod=None")
return None
- seq.append((TEXT, '\n'))
+ seq.append({ 'type': 'TEXT', 'content': '\n' })
else:
self.dprint(80, "LEAVE parse_fontmod=None")
return None
if textlist:
- seq.append((TEXT, textlist))
- res = (what, seq)
+ seq.append({ 'type': 'TEXT', 'content': textlist })
+ res = { 'type': what, 'content': seq }
self.dprint(80, "LEAVE parse_fontmod=%s", res)
return res
@@ -202,14 +188,14 @@ class BaseWikiMarkup:
list = []
while 1:
tok = self.getkn()
- if tok[0] == DELIM:
- if tok[1] == delim:
+ if tok['type'] == 'DELIM':
+ if tok['content'] == delim:
if list:
- subtree.append((SEQ,list))
+ subtree.append({ 'type': 'SEQ', 'content': list })
break
- elif tok[1] == "|":
+ elif tok['content'] == "|":
if len(list) > 1:
- subtree.append((SEQ,list))
+ subtree.append({ 'type': 'SEQ', 'content': list })
elif list:
subtree.append(list[0])
list = []
@@ -220,21 +206,21 @@ class BaseWikiMarkup:
else:
self.dprint(80, "LEAVE parse_link=%s", "None")
return None
- elif tok[0] == TEXT:
+ elif tok['type'] == 'TEXT':
list.append(tok)
else:
self.dprint(80, "LEAVE parse_link=%s", "None")
return None
self.dprint(80, "LEAVE parse_link=(%s,%s)", type, subtree)
- return (type, subtree)
+ return { 'type': type, 'content': subtree }
def parse_ref(self):
self.dprint(80, "ENTER parse_ref, tok %s", self.peektkn())
list = []
while 1:
tok = self.getkn()
- if tok[0] == DELIM:
- if tok[1] == "]":
+ if tok['type'] == 'DELIM':
+ if tok['content'] == "]":
break
else:
x = self.parse_inline(tok)
@@ -243,42 +229,45 @@ class BaseWikiMarkup:
else:
self.dprint(80, "LEAVE parse_ref=%s", "None")
return None
- elif tok[0] == TEXT:
+ elif tok['type'] == 'TEXT':
list.append(tok)
- elif tok[0] == NL:
- list.append((TEXT, '\n'))
+ elif tok['type'] == 'NL':
+ list.append({ 'type': 'TEXT', 'content': '\n' })
continue
else:
self.dprint(80, "LEAVE parse_ref=%s", "None")
return None
- if len(list) == 0 or list[0][0] != TEXT:
+ if len(list) == 0 or list[0]['type'] != 'TEXT':
self.dprint(80, "LEAVE parse_ref=%s", "None")
return None
- (ref,sep,text) = list[0][1].partition(' ')
- ret = (REF, ref, (SEQ, [(TEXT, text)] + list[1:]))
+ (ref,sep,text) = list[0]['content'].partition(' ')
+ ret = { 'type': 'REF',
+ 'ref': ref,
+ 'content': { 'type': 'SEQ',
+ 'content': [{ 'type': 'TEXT', 'content': text }] + list[1:] } }
self.dprint(80, "LEAVE parse_ref= %s", ret)
return ret
inline_delims = [ "''", "'''", "[", "[[", "{{" ]
def is_inline_delim(self, tok):
- return tok[0] == DELIM and tok[1] in self.inline_delims
+ return tok['type'] == 'DELIM' and tok['content'] in self.inline_delims
def is_block_delim(self, tok):
- return tok[0] == DELIM and tok[1] not in self.inline_delims
+ return tok['type'] == 'DELIM' and tok['content'] not in self.inline_delims
def parse_inline(self, tok):
self.dprint(80, "ENTER parse_inline(%s), tok %s", tok, self.peektkn())
tokind = self.tokind
- if tok[1] == "''":
- x = self.parse_fontmod(tok[1], IT)
- elif tok[1] == "'''":
- x = self.parse_fontmod(tok[1], BOLD)
- elif tok[1] == "[":
+ if tok['content'] == "''":
+ x = self.parse_fontmod(tok['content'], 'IT')
+ elif tok['content'] == "'''":
+ x = self.parse_fontmod(tok['content'], 'BOLD')
+ elif tok['content'] == "[":
x = self.parse_ref()
- elif tok[1] == "[[":
- x = self.parse_link(LINK, "]]")
- elif tok[1] == "{{":
- x = self.parse_link(TMPL, "}}")
+ elif tok['content'] == "[[":
+ x = self.parse_link('LINK', "]]")
+ elif tok['content'] == "{{":
+ x = self.parse_link('TMPL', "}}")
else: # FIXME
self.dprint(80, "LEAVE parse_inline=%s", "None")
x = None
@@ -293,23 +282,23 @@ class BaseWikiMarkup:
textlist = []
while 1:
tok = self.getkn()
- if tok[0] == TEXT:
- textlist.append(tok[1])
- elif tok[0] == NL:
+ if tok['type'] == 'TEXT':
+ textlist.append(tok['content'])
+ elif tok['type'] == 'NL':
tok = self.getkn()
- if tok[0] == NL or tok[0] == NIL:
+ if tok['type'] == 'NL' or tok['type'] == 'NIL':
break
else:
self.ungetkn()
if self.is_block_delim(tok):
break
textlist.append('\n')
- elif tok[0] == NIL:
+ elif tok['type'] == 'NIL':
break
- elif tok[0] == DELIM:
+ elif tok['type'] == 'DELIM':
if self.is_inline_delim(tok):
if textlist:
- seq.append((TEXT, textlist))
+ seq.append({ 'type': 'TEXT', 'content': textlist })
textlist = []
x = self.parse_inline(tok)
if x:
@@ -318,27 +307,27 @@ class BaseWikiMarkup:
seq.append(tok)
break
else:
- seq.append((TEXT,tok[1]))
+ seq.append({ 'type': 'TEXT', 'content': tok['content'] })
# self.ungetkn()
break
if textlist:
- seq.append((TEXT, textlist))
+ seq.append({ 'type': 'TEXT', 'content': textlist })
self.dprint(80, "LEAVE parse_para=%s", seq)
- return (PARA, seq)
+ return { 'type': 'PARA', 'content': seq }
def parse_header(self, delim):
self.dprint(80, "ENTER parse_header(%s), tok %s", delim, self.peektkn())
list = []
while 1:
tok = self.getkn()
- if tok[0] == NIL:
+ if tok['type'] == 'NIL':
self.dprint(80, "LEAVE parse_header=%s", "None")
return None
- elif tok[0] == TEXT:
+ elif tok['type'] == 'TEXT':
list.append(tok)
- elif tok[0] == DELIM:
- if tok[1] == delim:
- if self.peektkn()[0] == NL:
+ elif tok['type'] == 'DELIM':
+ if tok['content'] == delim:
+ if self.peektkn()['type'] == 'NL':
break
else:
self.dprint(80, "LEAVE parse_header=%s", "None")
@@ -354,7 +343,9 @@ class BaseWikiMarkup:
self.dprint(80, "LEAVE parse_header=%s", "None")
return None
self.dprint(80, "LEAVE parse_header=(HDR, %s, (SEQ,%s))",len(delim)-1,list)
- return (HDR,len(delim)-1,(SEQ,list))
+ return { 'type': 'HDR',
+ 'level': len(delim)-1,
+ 'content': { 'type': 'SEQ', 'content': list } }
def parse_line(self):
@@ -362,12 +353,12 @@ class BaseWikiMarkup:
list = []
while 1:
tok = self.getkn()
- if tok[0] == NL or tok[0] == NIL:
+ if tok['type'] == 'NL' or tok['type'] == 'NIL':
break
- elif tok[0] == TEXT:
+ elif tok['type'] == 'TEXT':
list.append(tok)
- elif tok[0] == DELIM and tok[1][0] == ":":
- list.append(self.parse_indent(len(tok[1])))
+ elif tok['type'] == 'DELIM' and tok['content'][0] == ":":
+ list.append(self.parse_indent(len(tok['content'])))
break
else:
x = self.parse_inline(tok)
@@ -376,67 +367,72 @@ class BaseWikiMarkup:
else:
list.append(tok)
self.dprint(80, "LEAVE parse_line=(SEQ, %s)", list)
- return (SEQ, list)
+ return { 'type': 'SEQ', 'content': list }
def parse_env(self, type, lev):
self.dprint(80, "ENTER parse_env(%s,%s), tok %s",type,lev,self.peektkn())
list = []
while 1:
tok = self.getkn()
- if tok[0] == DELIM and envtypes.has_key(tok[1][0]) and type == envtypes[tok[1][0]][0]:
- if len(tok[1]) < lev:
+ if tok['type'] == 'DELIM' \
+ and envtypes.has_key(tok['content'][0]) \
+ and type == envtypes[tok['content'][0]][0]:
+ if len(tok['content']) < lev:
self.ungetkn()
break
- elif len(tok[1]) > lev:
+ elif len(tok['content']) > lev:
self.ungetkn()
- elt = self.parse_env(type, len(tok[1]))
+ elt = self.parse_env(type, len(tok['content']))
else:
elt = self.parse_line()
- if len(tok) == 2:
- list.append((ELT, envtypes[tok[1][0]][1], elt))
+ if len(tok.keys()) == 2:
+ list.append({ 'type': 'ELT',
+ 'subtype': envtypes[tok['content'][0]][1],
+ 'content': elt })
continue
-
- if list[-1][2][0] != SEQ:
- x = list[-1][2][1]
- list[-1][2] = (SEQ, [x])
- list[-1][2][1].append(elt)
+
+ if list[-1]['content']['type'] != 'SEQ':
+ x = list[-1]['content']['content']
+ # FIXME:
+ list[-1]['content'] = { 'type': 'SEQ', 'content': [x] }
+ list[-1]['content']['content'].append(elt)
else:
self.ungetkn()
break
self.dprint(80, "LEAVE parse_env=(ENV, %s, %s, %s)", type, lev, list)
- return (ENV, type, lev, list)
+ return { 'type': 'ENV', 'envtype': type, 'level': lev, 'content': list }
def parse_indent(self, lev):
self.dprint(80, "ENTER parse_indent(%s), tok %s", lev, self.peektkn())
- x = (IND, lev, self.parse_line())
+ x = { 'type': 'IND', 'level': lev, 'content': self.parse_line() }
self.dprint(80, "LEAVE parse_indent=%s", x)
return x
def parse0(self):
tok = self.getkn()
- toktype = tok[0]
- if toktype == NIL:
+ toktype = tok['type']
+ if toktype == 'NIL':
return None
- elif toktype == TEXT:
+ elif toktype == 'TEXT':
self.ungetkn()
return self.parse_para()
- elif toktype == DELIM:
- if tok[1] == "----":
- return (BAR,)
- elif tok[1][0:2] == "==":
- return self.parse_header(tok[1])
- elif envtypes.has_key(tok[1][0]):
- type = envtypes[tok[1][0]][0]
- lev = len(tok[1])
+ elif toktype == 'DELIM':
+ if tok['content'] == "----":
+ return { 'type': 'BAR' }
+ elif tok['content'][0:2] == "==":
+ return self.parse_header(tok['content'])
+ elif envtypes.has_key(tok['content'][0]):
+ type = envtypes[tok['content'][0]][0]
+ lev = len(tok['content'])
self.ungetkn()
return self.parse_env(type, lev)
- elif tok[1][0] == ":":
- return self.parse_indent(len(tok[1]))
+ elif tok['content'][0] == ":":
+ return self.parse_indent(len(tok['content']))
else:
self.ungetkn()
return self.parse_para()
- elif toktype == NL:
- return (TEXT, '\n')
+ elif toktype == 'NL':
+ return { 'type': 'TEXT', 'content': '\n' }
# return self.parse0()
def parse(self):
@@ -513,25 +509,28 @@ class WikiMarkup (BaseWikiMarkup):
return None
def is_lang_link(self, elt):
- if elt[0] == LINK and isinstance(elt[1],list) and len(elt[1]) == 1:
- if elt[1][0][0] == TEXT:
- m = re.match('([\w-]+):', elt[1][0][1])
+ if elt['type'] == 'LINK' \
+ and isinstance(elt['content'], list) \
+ and len(elt['content']) == 1:
+ if elt['content'][0]['type'] == TEXT:
+ m = re.match('([\w-]+):', elt['content'][0]['content'])
if m: # and m.group(1) in self.langtab:
return True
- elif elt[1][0][0] == SEQ and len(elt[1][0][1]) == 1 and\
- elt[1][0][1][0][0] == TEXT:
- m = re.match('([\w-]+):',elt[1][0][1][0][1])
+ elif elt['content'][0]['type'] == 'SEQ' \
+ and len(elt['content'][0]['content']) == 1 and\
+ elt['content'][0]['content'][0]['type'] == TEXT:
+ m = re.match('([\w-]+):',elt['content'][0]['content'][0]['content'])
if m: # and m.group(1) in self.langtab:
return True
return False
def is_empty_text(self, elt):
- if elt[0] == TEXT:
- if isinstance(elt[1],list):
- for s in elt[1]:
+ if elt['type'] == 'TEXT':
+ if isinstance(elt['content'],list):
+ for s in elt['content']:
if re.search('\w', s):
return False
- elif re.search('\w', elt[1]):
+ elif re.search('\w', elt['content']):
return False
return True
return False

Return to:

Send suggestions and report system problems to the System administrator.