diff options
author | Sergey Poznyakoff <gray@gnu.org.ua> | 2015-07-06 17:01:23 +0300 |
---|---|---|
committer | Sergey Poznyakoff <gray@gnu.org.ua> | 2015-07-06 17:36:49 +0300 |
commit | b74b1d5fe2326f56a2e37f57c38b929307c71282 (patch) | |
tree | e6029ae08f00bc7affcd1d7aec75d1288f9184ea | |
parent | f3378aebac7e89000ff097ac51c49b62eb6e9f08 (diff) | |
download | wit-b74b1d5fe2326f56a2e37f57c38b929307c71282.tar.gz wit-b74b1d5fe2326f56a2e37f57c38b929307c71282.tar.bz2 |
Handle <tags> and implicit preformatted blocks
Among <tags>, this commit handles <nowiki> and <code>. General tag handling
mechanism is provided.
* wikimarkup.py (otag, ctag, close_delim): New variables.
(BaseWikiMarkup) <newline,nested>: New attributes.
(input_tag): New abstract method.
(tokread): Remove calls to dprint, now done by the callers.
Handle xml-style tags.
(getkn,ungetkn): Set newline.
(inline_delims): Add '|'
(parse_para): Decide whether it is going to be a PRE or
PARA. Don't mix the two.
Fix recovery in case of unmatched/incorrect inline constructs.
(parse): eliminate initial PARA, if called as a nested instance.
(WikiMarkup): Remove parse method. Rely on the parent class.
* wiki2html.py (input_tag, str_tag, str_pre): New methods.
(format): Handle PRE and TAG tokens
* wiki2text.py: Similar changes. Needs some more work.
-rw-r--r-- | wiki2html.py | 30 | ||||
-rw-r--r-- | wiki2text.py | 29 | ||||
-rw-r--r-- | wikimarkup.py | 104 |
3 files changed, 135 insertions, 28 deletions
diff --git a/wiki2html.py b/wiki2html.py index eee592d..061377b 100644 --- a/wiki2html.py +++ b/wiki2html.py @@ -172,12 +172,38 @@ class HtmlWikiMarkup (WikiMarkup): self.envt[type]["hdr"]) return string + supported_tags = [ 'nowiki', 'code' ] + def input_tag(self, tag): + return tag['tag'] in self.supported_tags + + def str_tag(self, elt): + if elt['tag'] == 'nowiki': + return '<pre>' + elt['content'] + '</pre>' + elif elt['tag'] == 'code': + kwdict = { + 'nested': self.nested + 1, + 'lang': self.lang, + 'text': elt['content'], + 'html_base': self.html_base, + 'image_base': self.image_base, + 'media_base': self.media_base } + markup = HtmlWiktionaryMarkup(**kwdict) + markup.debug_level = self.debug_level + markup.parse() + return '<pre><code>' + str(markup) + '</code></pre>' #FIXME + def str_para(self, elt): string = ""; for x in elt['content']: string += self.format(x) return "<p>" + string + "</p>" + def str_pre(self, elt): + string = ""; + for x in elt['content']: + string += self.format(x) + return '<pre>' + string + '</pre>' + def str_ind(self, elt): return (" " * 2 * elt['level']) + self.format(elt['content']) @@ -190,8 +216,12 @@ class HtmlWikiMarkup (WikiMarkup): else: string = elt['content'] return string + elif elt['type'] == 'TAG': + return self.str_tag(elt) elif elt['type'] == 'PARA': return self.str_para(elt) + elif elt['type'] == 'PRE': + return self.str_pre(elt) elif elt['type'] == 'IT': return self.str_it(elt) elif elt['type'] == 'BOLD': diff --git a/wiki2text.py b/wiki2text.py index c94ae51..3084ee4 100644 --- a/wiki2text.py +++ b/wiki2text.py @@ -142,6 +142,26 @@ class TextWikiMarkup (WikiMarkup): length += wsc + wlen return output + linebuf + supported_tags = [ 'nowiki', 'code' ] + def input_tag(self, tag): + return tag['tag'] in self.supported_tags + + def str_tag(self, elt): + if elt['tag'] == 'nowiki': + return elt['content'] + elif elt['tag'] == 'code': + kwdict = { + 'nested': self.nested + 1, + 'lang': self.lang, + 'text': elt['content'], 
+ 'html_base': self.html_base, + 'image_base': self.image_base, + 'media_base': self.media_base } + markup = TextWiktionaryMarkup(**kwdict) + markup.debug_level = self.debug_level + markup.parse() + return str(markup) + def format(self, elt): if elt['type'] == 'TEXT': if isinstance(elt['content'],list): @@ -155,11 +175,18 @@ class TextWikiMarkup (WikiMarkup): string += s else: string = elt['content'] + elif elt['type'] == 'PRE': + string = "" + for x in elt['content']: + string += self.format(x) + string += '\n' elif elt['type'] == 'PARA': string = ""; for x in elt['content']: string += self.format(x) string = self.fmtpara(string) + '\n\n' + elif elt['type'] == 'TAG': + string = self.str_tag(elt) elif elt['type'] == 'IT': string = "" for x in elt['content']: @@ -214,7 +241,7 @@ class TextWikiMarkup (WikiMarkup): string += self.fmtpara(self.indent(lev, "%d. %s" % (n, x))) n += 1 elif type == "defn": - if s[1] == 0: + if s['subtype'] == 0: string += self.indent(lev-1, x) else: string += self.indent(lev+3, x) diff --git a/wikimarkup.py b/wikimarkup.py index 09c48eb..636012e 100644 --- a/wikimarkup.py +++ b/wikimarkup.py @@ -23,6 +23,14 @@ __all__ = [ "BaseWikiMarkup", "WikiMarkup", "envtypes" ] delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)") +otag = re.compile("^\s*<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>.+))?\s*(?P<closed>/)?>") +ctag = re.compile("^\s*</(?P<tag>[a-zA-Z0-9_]+)\s*>") + +close_delim = { + '[': ']', + '[[': ']]', + '{{': '}}' +} # Environment types: envtypes = { "*": [ "unnumbered", 0 ], @@ -35,14 +43,19 @@ class BaseWikiMarkup: toklist = None tokind = 0 + newline = 0 tree = None + nested = 0 debug_level = 0 def dprint(self, lev, fmt, *argv): if self.debug_level >= lev: print "[DEBUG]", fmt % argv + def input_tag(self, tag): + pass + def tokread(self): line = None pos = 0 @@ -55,12 +68,10 @@ class BaseWikiMarkup: line = u'' if not line or line == "": - self.dprint(100, "YIELD: 
NIL") yield({ 'type': 'NIL' }) break if line == '\n': - self.dprint(100, "YIELD: NL") yield({ 'type': 'NL', 'content': line }) line = None continue @@ -70,32 +81,52 @@ class BaseWikiMarkup: if m: if (pos < m.start(0)): - self.dprint(100, "YIELD: TEXT %s", line[pos:m.start(0)]) yield({'type': 'TEXT', 'content': line[pos:m.start(0)]}) pos = m.end(0) if envtypes.has_key(m.group(0)[0]) and line[pos] == ":": # FIXME? - self.dprint(100, "YIELD: DELIM %s, True", m.group(0)) # FIXME: What's "extra"? yield({ 'type': 'DELIM', 'content': m.group(0), 'extra': True }) pos += 1 else: - self.dprint(100, "YIELD: DELIM %s", m.group(0)) yield({ 'type': 'DELIM', 'content': m.group(0) }) else: + m = otag.match(line) + if m: + t = { 'type': 'TAG', + 'tag': m.group('tag'), + 'args': m.group('args') } + + if self.input_tag(t): + s = '' + if not m.group('closed'): + while 1: + try: + l = self.input() + m = ctag.match(l) + if m and m.group('tag') == t['tag']: + break + s += l + except StopIteration: + break + yield({ 'type': 'TAG', + 'tag': t['tag'], + 'args': t['args'], + 'content': s + }) + line = None + continue + if line[-1] == '\n': - self.dprint(100, "YIELD: TEXT %s", line[pos:-1]) if line[pos:-1] != '': yield({ 'type': 'TEXT', 'content': line[pos:-1] }) - self.dprint(100, "YIELD: NL") yield({ 'type': 'NL', 'content': '\n' }) else: - self.dprint(100, "YIELD: TEXT %s", line[pos:]) yield({ 'type': 'TEXT', 'content': line[pos:] }) line = None @@ -106,6 +137,7 @@ class BaseWikiMarkup: def tokenize(self): self.toklist = [] for tok in self.tokread(): + self.dprint(100, "TOK: %s", tok) self.toklist.append(tok) # Determine and fix up the ordering of bold and italic markers # This helps correctly parse inputs like: @@ -133,6 +165,7 @@ class BaseWikiMarkup: self.toklist[self.tokind] = val def getkn(self): + self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL' tok = self.toklist[self.tokind] if tok['type'] != 'NIL': self.tokind = self.tokind + 1 @@ -140,6 +173,7 @@ 
class BaseWikiMarkup: def ungetkn(self): self.tokind = self.tokind - 1 + self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL' return self.toklist[self.tokind] def parse_fontmod(self,delim,what): @@ -248,7 +282,7 @@ class BaseWikiMarkup: self.dprint(80, "LEAVE parse_ref= %s", ret) return ret - inline_delims = [ "''", "'''", "[", "[[", "{{" ] + inline_delims = [ "''", "'''", "[", "[[", "{{", "|" ] def is_inline_delim(self, tok): return tok['type'] == 'DELIM' and tok['content'] in self.inline_delims @@ -280,9 +314,19 @@ class BaseWikiMarkup: self.dprint(80, "ENTER parse_para, tok %s", self.peektkn()) seq = [] textlist = [] + tok = self.peektkn() + if re.match("^\s", tok['content']): + type = 'PRE' + rx = re.compile("^\S") + else: + type = 'PARA' + rx = re.compile("^\s") while 1: tok = self.getkn() if tok['type'] == 'TEXT': + if self.newline and rx.match(tok['content']): + self.ungetkn() + break textlist.append(tok['content']) elif tok['type'] == 'NL': tok = self.getkn() @@ -304,7 +348,21 @@ class BaseWikiMarkup: if x: seq.append(x) else: - seq.append(tok) + self.dprint(80, "ROLLBACK parse_para=%s", tok) + od = tok['content'] + textlist.append(od) + if close_delim.has_key(od): + cd = close_delim[od] + lev = 0 + for tok in self.toklist[self.tokind:]: + if tok['type'] == 'NIL': + break + elif tok['type'] == 'DELIM': + if tok['content'] == od: + lev += 1 + elif tok['content'] == cd: + if lev == 0: + tok['type'] = 'TEXT' break else: seq.append({ 'type': 'TEXT', 'content': tok['content'] }) @@ -313,7 +371,7 @@ class BaseWikiMarkup: if textlist: seq.append({ 'type': 'TEXT', 'content': textlist }) self.dprint(80, "LEAVE parse_para=%s", seq) - return { 'type': 'PARA', 'content': seq } + return { 'type': type, 'content': seq } def parse_header(self, delim): self.dprint(80, "ENTER parse_header(%s), tok %s", delim, self.peektkn()) @@ -434,6 +492,8 @@ class BaseWikiMarkup: elif toktype == 'NL': return { 'type': 'TEXT', 'content': '\n' } # return 
self.parse0() + else: + return tok def parse(self): if not self.toklist: @@ -446,6 +506,9 @@ class BaseWikiMarkup: if subtree == None: break self.tree.append(subtree) + if self.nested: + if self.tree[0]['type'] == 'PARA': + self.tree[0]['type'] = 'SEQ' self.dprint(70, "TREE: %s", self.tree) def __str__(self): @@ -495,6 +558,8 @@ class WikiMarkup (BaseWikiMarkup): self.image_base = keywords[kw] elif kw == 'media_base': self.media_base = keywords[kw] + elif kw == 'nested': + self.nested = keywords[kw] def __del__(self): if self.file: @@ -541,21 +606,6 @@ class WikiMarkup (BaseWikiMarkup): return False return True - def parse(self): - BaseWikiMarkup.parse(self) - # # Remove everything before the first header - # for i in range(0, len(self.tree)): - # if self.tree[i][0] == HDR: - # self.tree = self.tree[i:] - # break - # # Remove trailing links - # for i in range(len(self.tree)-1, 0, -1): - # if self.tree[i][0] == PARA \ - # and not self.is_empty_para(self.tree[i][1]): - # self.tree = self.tree[0:i+1] - # break - - # ISO 639 langtab = { "aa": "Afar", # Afar @@ -572,7 +622,7 @@ class WikiMarkup (BaseWikiMarkup): "as": "অসমীয়া", # Assamese "ast": "Asturian", "av": "Авар", # Avaric - "ay": "Aymar", # Aymara + "ay": "Aymara", # Aymara "az": "Azərbaycan" , # Azerbaijani "ba": "Башҡорт", # Bashkir |