diff options
author | Sergey Poznyakoff <gray@gnu.org.ua> | 2015-07-06 17:01:23 +0300 |
---|---|---|
committer | Sergey Poznyakoff <gray@gnu.org.ua> | 2015-07-06 17:36:49 +0300 |
commit | b74b1d5fe2326f56a2e37f57c38b929307c71282 (patch) | |
tree | e6029ae08f00bc7affcd1d7aec75d1288f9184ea | |
parent | f3378aebac7e89000ff097ac51c49b62eb6e9f08 (diff) | |
download | wikitrans-b74b1d5fe2326f56a2e37f57c38b929307c71282.tar.gz wikitrans-b74b1d5fe2326f56a2e37f57c38b929307c71282.tar.bz2 |
Handle <tags> and implicit preformatted blocks
Among <tags>, this commit handles <nowiki> and <code>. A general tag handling
mechanism is provided.
* wikimarkup.py (otag, ctag, close_delim): New variables.
(BaseWikiMarkup)<newline,nested>: New attributes.
(input_tag): New abstract method.
(tokread): Remove calls to dprint, now done by the callers.
Handle xml-style tags.
(getkn,ungetkn): Set newline.
(inline_delims): Add '|'
(parse_para): Decide whether it is going to be a PRE or
PARA. Don't mix the two.
Fix recovery in case of unmatched/incorrect inline constructs.
(parse): Eliminate initial PARA if called as a nested instance.
(WikiMarkup): Remove parse method. Rely on the parent class.
* wiki2html.py (input_tag, str_tag, str_pre): New methods.
(format): Handle PRE and TAG tokens.
* wiki2text.py: Similar changes. Needs some more work.
-rw-r--r-- | wiki2html.py | 30 | ||||
-rw-r--r-- | wiki2text.py | 29 | ||||
-rw-r--r-- | wikimarkup.py | 112 |
3 files changed, 139 insertions, 32 deletions
diff --git a/wiki2html.py b/wiki2html.py index eee592d..061377b 100644 --- a/wiki2html.py +++ b/wiki2html.py @@ -174,2 +174,22 @@ class HtmlWikiMarkup (WikiMarkup): + supported_tags = [ 'nowiki', 'code' ] + def input_tag(self, tag): + return tag['tag'] in self.supported_tags + + def str_tag(self, elt): + if elt['tag'] == 'nowiki': + return '<pre>' + elt['content'] + '</pre>' + elif elt['tag'] == 'code': + kwdict = { + 'nested': self.nested + 1, + 'lang': self.lang, + 'text': elt['content'], + 'html_base': self.html_base, + 'image_base': self.image_base, + 'media_base': self.media_base } + markup = HtmlWiktionaryMarkup(**kwdict) + markup.debug_level = self.debug_level + markup.parse() + return '<pre><code>' + str(markup) + '</code></pre>' #FIXME + def str_para(self, elt): @@ -180,2 +200,8 @@ class HtmlWikiMarkup (WikiMarkup): + def str_pre(self, elt): + string = ""; + for x in elt['content']: + string += self.format(x) + return '<pre>' + string + '</pre>' + def str_ind(self, elt): @@ -192,4 +218,8 @@ class HtmlWikiMarkup (WikiMarkup): return string + elif elt['type'] == 'TAG': + return self.str_tag(elt) elif elt['type'] == 'PARA': return self.str_para(elt) + elif elt['type'] == 'PRE': + return self.str_pre(elt) elif elt['type'] == 'IT': diff --git a/wiki2text.py b/wiki2text.py index c94ae51..3084ee4 100644 --- a/wiki2text.py +++ b/wiki2text.py @@ -144,2 +144,22 @@ class TextWikiMarkup (WikiMarkup): + supported_tags = [ 'nowiki', 'code' ] + def input_tag(self, tag): + return tag['tag'] in self.supported_tags + + def str_tag(self, elt): + if elt['tag'] == 'nowiki': + return elt['content'] + elif elt['tag'] == 'code': + kwdict = { + 'nested': self.nested + 1, + 'lang': self.lang, + 'text': elt['content'], + 'html_base': self.html_base, + 'image_base': self.image_base, + 'media_base': self.media_base } + markup = TextWiktionaryMarkup(**kwdict) + markup.debug_level = self.debug_level + markup.parse() + return str(markup) + def format(self, elt): @@ -157,2 +177,7 @@ class 
TextWikiMarkup (WikiMarkup): string = elt['content'] + elif elt['type'] == 'PRE': + string = "" + for x in elt['content']: + string += self.format(x) + string += '\n' elif elt['type'] == 'PARA': @@ -162,2 +187,4 @@ class TextWikiMarkup (WikiMarkup): string = self.fmtpara(string) + '\n\n' + elif elt['type'] == 'TAG': + string = self.str_tag(elt) elif elt['type'] == 'IT': @@ -216,3 +243,3 @@ class TextWikiMarkup (WikiMarkup): elif type == "defn": - if s[1] == 0: + if s['subtype'] == 0: string += self.indent(lev-1, x) diff --git a/wikimarkup.py b/wikimarkup.py index 09c48eb..636012e 100644 --- a/wikimarkup.py +++ b/wikimarkup.py @@ -25,2 +25,10 @@ __all__ = [ "BaseWikiMarkup", "WikiMarkup", delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)") +otag = re.compile("^\s*<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>.+))?\s*(?P<closed>/)?>") +ctag = re.compile("^\s*</(?P<tag>[a-zA-Z0-9_]+)\s*>") + +close_delim = { + '[': ']', + '[[': ']]', + '{{': '}}' +} @@ -37,4 +45,6 @@ class BaseWikiMarkup: tokind = 0 + newline = 0 tree = None + nested = 0 debug_level = 0 @@ -44,2 +54,5 @@ class BaseWikiMarkup: print "[DEBUG]", fmt % argv + + def input_tag(self, tag): + pass @@ -57,3 +70,2 @@ class BaseWikiMarkup: if not line or line == "": - self.dprint(100, "YIELD: NIL") yield({ 'type': 'NIL' }) @@ -62,3 +74,2 @@ class BaseWikiMarkup: if line == '\n': - self.dprint(100, "YIELD: NL") yield({ 'type': 'NL', 'content': line }) @@ -72,3 +83,2 @@ class BaseWikiMarkup: if (pos < m.start(0)): - self.dprint(100, "YIELD: TEXT %s", line[pos:m.start(0)]) yield({'type': 'TEXT', 'content': line[pos:m.start(0)]}) @@ -77,3 +87,2 @@ class BaseWikiMarkup: # FIXME? - self.dprint(100, "YIELD: DELIM %s, True", m.group(0)) # FIXME: What's "extra"? 
@@ -84,3 +93,2 @@ class BaseWikiMarkup: else: - self.dprint(100, "YIELD: DELIM %s", m.group(0)) yield({ 'type': 'DELIM', @@ -88,4 +96,29 @@ class BaseWikiMarkup: else: + m = otag.match(line) + if m: + t = { 'type': 'TAG', + 'tag': m.group('tag'), + 'args': m.group('args') } + + if self.input_tag(t): + s = '' + if not m.group('closed'): + while 1: + try: + l = self.input() + m = ctag.match(l) + if m and m.group('tag') == t['tag']: + break + s += l + except StopIteration: + break + yield({ 'type': 'TAG', + 'tag': t['tag'], + 'args': t['args'], + 'content': s + }) + line = None + continue + if line[-1] == '\n': - self.dprint(100, "YIELD: TEXT %s", line[pos:-1]) if line[pos:-1] != '': @@ -93,3 +126,2 @@ class BaseWikiMarkup: 'content': line[pos:-1] }) - self.dprint(100, "YIELD: NL") yield({ 'type': 'NL', @@ -97,3 +129,2 @@ class BaseWikiMarkup: else: - self.dprint(100, "YIELD: TEXT %s", line[pos:]) yield({ 'type': 'TEXT', @@ -108,2 +139,3 @@ class BaseWikiMarkup: for tok in self.tokread(): + self.dprint(100, "TOK: %s", tok) self.toklist.append(tok) @@ -135,2 +167,3 @@ class BaseWikiMarkup: def getkn(self): + self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL' tok = self.toklist[self.tokind] @@ -139,5 +172,6 @@ class BaseWikiMarkup: return tok - + def ungetkn(self): self.tokind = self.tokind - 1 + self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL' return self.toklist[self.tokind] @@ -250,3 +284,3 @@ class BaseWikiMarkup: - inline_delims = [ "''", "'''", "[", "[[", "{{" ] + inline_delims = [ "''", "'''", "[", "[[", "{{", "|" ] @@ -282,2 +316,9 @@ class BaseWikiMarkup: textlist = [] + tok = self.peektkn() + if re.match("^\s", tok['content']): + type = 'PRE' + rx = re.compile("^\S") + else: + type = 'PARA' + rx = re.compile("^\s") while 1: @@ -285,2 +326,5 @@ class BaseWikiMarkup: if tok['type'] == 'TEXT': + if self.newline and rx.match(tok['content']): + self.ungetkn() + break textlist.append(tok['content']) @@ -306,4 
+350,18 @@ class BaseWikiMarkup: else: - seq.append(tok) - break + self.dprint(80, "ROLLBACK parse_para=%s", tok) + od = tok['content'] + textlist.append(od) + if close_delim.has_key(od): + cd = close_delim[od] + lev = 0 + for tok in self.toklist[self.tokind:]: + if tok['type'] == 'NIL': + break + elif tok['type'] == 'DELIM': + if tok['content'] == od: + lev += 1 + elif tok['content'] == cd: + if lev == 0: + tok['type'] = 'TEXT' + break else: @@ -315,3 +373,3 @@ class BaseWikiMarkup: self.dprint(80, "LEAVE parse_para=%s", seq) - return { 'type': 'PARA', 'content': seq } + return { 'type': type, 'content': seq } @@ -409,3 +467,3 @@ class BaseWikiMarkup: return x - + def parse0(self): @@ -436,3 +494,5 @@ class BaseWikiMarkup: # return self.parse0() - + else: + return tok + def parse(self): @@ -448,2 +508,5 @@ class BaseWikiMarkup: self.tree.append(subtree) + if self.nested: + if self.tree[0]['type'] == 'PARA': + self.tree[0]['type'] = 'SEQ' self.dprint(70, "TREE: %s", self.tree) @@ -497,2 +560,4 @@ class WikiMarkup (BaseWikiMarkup): self.media_base = keywords[kw] + elif kw == 'nested': + self.nested = keywords[kw] @@ -543,17 +608,2 @@ class WikiMarkup (BaseWikiMarkup): - def parse(self): - BaseWikiMarkup.parse(self) - # # Remove everything before the first header - # for i in range(0, len(self.tree)): - # if self.tree[i][0] == HDR: - # self.tree = self.tree[i:] - # break - # # Remove trailing links - # for i in range(len(self.tree)-1, 0, -1): - # if self.tree[i][0] == PARA \ - # and not self.is_empty_para(self.tree[i][1]): - # self.tree = self.tree[0:i+1] - # break - - # ISO 639 @@ -574,3 +624,3 @@ class WikiMarkup (BaseWikiMarkup): "av": "Авар", # Avaric - "ay": "Aymar", # Aymara + "ay": "Aymara", # Aymara "az": "Azərbaycan" , # Azerbaijani |