diff options
author | Sergey Poznyakoff <gray@gnu.org.ua> | 2015-07-06 17:01:23 +0300 |
---|---|---|
committer | Sergey Poznyakoff <gray@gnu.org.ua> | 2015-07-06 17:36:49 +0300 |
commit | b74b1d5fe2326f56a2e37f57c38b929307c71282 (patch) | |
tree | e6029ae08f00bc7affcd1d7aec75d1288f9184ea /wikimarkup.py | |
parent | f3378aebac7e89000ff097ac51c49b62eb6e9f08 (diff) | |
download | wikitrans-b74b1d5fe2326f56a2e37f57c38b929307c71282.tar.gz wikitrans-b74b1d5fe2326f56a2e37f57c38b929307c71282.tar.bz2 |
Handle <tags> and implicit preformatted blocks
Among <tags>, this commit handles <nowiki> and <code>. General tag handling
mechanism is provided.
* wikimarkup.py (otag, ctag, close_delim): New variables.
(BaseWikiMarkup)<newline,nested>: New attributes.
(input_tag): New abstract method.
(tokread): Remove calls to dprint, now done by the callers.
Handle xml-style tags.
(getkn,ungetkn): Set newline.
(inline_delims): Add '|'
(parse_para): Decide whether it is going to be a PRE or
PARA. Don't mix the two.
Fix recovery in case of unmatched/incorrect inline constructs.
(parse): Eliminate initial PARA, if called as a nested instance.
(WikiMarkup): Remove parse method. Rely on the parent class.
* wiki2html.py (input_tag, str_tag, str_pre): New methods.
(format): Handle PRE and TAG tokens.
* wiki2text.py: Similar changes. Needs some more work.
Diffstat (limited to 'wikimarkup.py')
-rw-r--r-- | wikimarkup.py | 112 |
1 files changed, 81 insertions, 31 deletions
diff --git a/wikimarkup.py b/wikimarkup.py index 09c48eb..636012e 100644 --- a/wikimarkup.py +++ b/wikimarkup.py @@ -23,6 +23,14 @@ __all__ = [ "BaseWikiMarkup", "WikiMarkup", "envtypes" ] delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)") +otag = re.compile("^\s*<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>.+))?\s*(?P<closed>/)?>") +ctag = re.compile("^\s*</(?P<tag>[a-zA-Z0-9_]+)\s*>") + +close_delim = { + '[': ']', + '[[': ']]', + '{{': '}}' +} # Environment types: envtypes = { "*": [ "unnumbered", 0 ], @@ -35,13 +43,18 @@ class BaseWikiMarkup: toklist = None tokind = 0 + newline = 0 tree = None + nested = 0 debug_level = 0 def dprint(self, lev, fmt, *argv): if self.debug_level >= lev: print "[DEBUG]", fmt % argv + + def input_tag(self, tag): + pass def tokread(self): line = None @@ -55,12 +68,10 @@ class BaseWikiMarkup: line = u'' if not line or line == "": - self.dprint(100, "YIELD: NIL") yield({ 'type': 'NIL' }) break if line == '\n': - self.dprint(100, "YIELD: NL") yield({ 'type': 'NL', 'content': line }) line = None continue @@ -70,32 +81,52 @@ class BaseWikiMarkup: if m: if (pos < m.start(0)): - self.dprint(100, "YIELD: TEXT %s", line[pos:m.start(0)]) yield({'type': 'TEXT', 'content': line[pos:m.start(0)]}) pos = m.end(0) if envtypes.has_key(m.group(0)[0]) and line[pos] == ":": # FIXME? - self.dprint(100, "YIELD: DELIM %s, True", m.group(0)) # FIXME: What's "extra"? 
yield({ 'type': 'DELIM', 'content': m.group(0), 'extra': True }) pos += 1 else: - self.dprint(100, "YIELD: DELIM %s", m.group(0)) yield({ 'type': 'DELIM', 'content': m.group(0) }) else: + m = otag.match(line) + if m: + t = { 'type': 'TAG', + 'tag': m.group('tag'), + 'args': m.group('args') } + + if self.input_tag(t): + s = '' + if not m.group('closed'): + while 1: + try: + l = self.input() + m = ctag.match(l) + if m and m.group('tag') == t['tag']: + break + s += l + except StopIteration: + break + yield({ 'type': 'TAG', + 'tag': t['tag'], + 'args': t['args'], + 'content': s + }) + line = None + continue + if line[-1] == '\n': - self.dprint(100, "YIELD: TEXT %s", line[pos:-1]) if line[pos:-1] != '': yield({ 'type': 'TEXT', 'content': line[pos:-1] }) - self.dprint(100, "YIELD: NL") yield({ 'type': 'NL', 'content': '\n' }) else: - self.dprint(100, "YIELD: TEXT %s", line[pos:]) yield({ 'type': 'TEXT', 'content': line[pos:] }) line = None @@ -106,6 +137,7 @@ class BaseWikiMarkup: def tokenize(self): self.toklist = [] for tok in self.tokread(): + self.dprint(100, "TOK: %s", tok) self.toklist.append(tok) # Determine and fix up the ordering of bold and italic markers # This helps correctly parse inputs like: @@ -133,13 +165,15 @@ class BaseWikiMarkup: self.toklist[self.tokind] = val def getkn(self): + self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL' tok = self.toklist[self.tokind] if tok['type'] != 'NIL': self.tokind = self.tokind + 1 return tok - + def ungetkn(self): self.tokind = self.tokind - 1 + self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL' return self.toklist[self.tokind] def parse_fontmod(self,delim,what): @@ -248,7 +282,7 @@ class BaseWikiMarkup: self.dprint(80, "LEAVE parse_ref= %s", ret) return ret - inline_delims = [ "''", "'''", "[", "[[", "{{" ] + inline_delims = [ "''", "'''", "[", "[[", "{{", "|" ] def is_inline_delim(self, tok): return tok['type'] == 'DELIM' and tok['content'] in 
self.inline_delims @@ -280,9 +314,19 @@ class BaseWikiMarkup: self.dprint(80, "ENTER parse_para, tok %s", self.peektkn()) seq = [] textlist = [] + tok = self.peektkn() + if re.match("^\s", tok['content']): + type = 'PRE' + rx = re.compile("^\S") + else: + type = 'PARA' + rx = re.compile("^\s") while 1: tok = self.getkn() if tok['type'] == 'TEXT': + if self.newline and rx.match(tok['content']): + self.ungetkn() + break textlist.append(tok['content']) elif tok['type'] == 'NL': tok = self.getkn() @@ -304,8 +348,22 @@ class BaseWikiMarkup: if x: seq.append(x) else: - seq.append(tok) - break + self.dprint(80, "ROLLBACK parse_para=%s", tok) + od = tok['content'] + textlist.append(od) + if close_delim.has_key(od): + cd = close_delim[od] + lev = 0 + for tok in self.toklist[self.tokind:]: + if tok['type'] == 'NIL': + break + elif tok['type'] == 'DELIM': + if tok['content'] == od: + lev += 1 + elif tok['content'] == cd: + if lev == 0: + tok['type'] = 'TEXT' + break else: seq.append({ 'type': 'TEXT', 'content': tok['content'] }) # self.ungetkn() @@ -313,7 +371,7 @@ class BaseWikiMarkup: if textlist: seq.append({ 'type': 'TEXT', 'content': textlist }) self.dprint(80, "LEAVE parse_para=%s", seq) - return { 'type': 'PARA', 'content': seq } + return { 'type': type, 'content': seq } def parse_header(self, delim): self.dprint(80, "ENTER parse_header(%s), tok %s", delim, self.peektkn()) @@ -407,7 +465,7 @@ class BaseWikiMarkup: x = { 'type': 'IND', 'level': lev, 'content': self.parse_line() } self.dprint(80, "LEAVE parse_indent=%s", x) return x - + def parse0(self): tok = self.getkn() toktype = tok['type'] @@ -434,7 +492,9 @@ class BaseWikiMarkup: elif toktype == 'NL': return { 'type': 'TEXT', 'content': '\n' } # return self.parse0() - + else: + return tok + def parse(self): if not self.toklist: self.tokenize() @@ -446,6 +506,9 @@ class BaseWikiMarkup: if subtree == None: break self.tree.append(subtree) + if self.nested: + if self.tree[0]['type'] == 'PARA': + self.tree[0]['type'] = 
'SEQ' self.dprint(70, "TREE: %s", self.tree) def __str__(self): @@ -495,6 +558,8 @@ class WikiMarkup (BaseWikiMarkup): self.image_base = keywords[kw] elif kw == 'media_base': self.media_base = keywords[kw] + elif kw == 'nested': + self.nested = keywords[kw] def __del__(self): if self.file: @@ -541,21 +606,6 @@ class WikiMarkup (BaseWikiMarkup): return False return True - def parse(self): - BaseWikiMarkup.parse(self) - # # Remove everything before the first header - # for i in range(0, len(self.tree)): - # if self.tree[i][0] == HDR: - # self.tree = self.tree[i:] - # break - # # Remove trailing links - # for i in range(len(self.tree)-1, 0, -1): - # if self.tree[i][0] == PARA \ - # and not self.is_empty_para(self.tree[i][1]): - # self.tree = self.tree[0:i+1] - # break - - # ISO 639 langtab = { "aa": "Afar", # Afar @@ -572,7 +622,7 @@ class WikiMarkup (BaseWikiMarkup): "as": "অসমীয়া", # Assamese "ast": "Asturian", "av": "Авар", # Avaric - "ay": "Aymar", # Aymara + "ay": "Aymara", # Aymara "az": "Azərbaycan" , # Azerbaijani "ba": "Башҡорт", # Bashkir |