diff options
author | Sergey Poznyakoff <gray@gnu.org.ua> | 2015-07-12 23:11:40 +0300 |
---|---|---|
committer | Sergey Poznyakoff <gray@gnu.org.ua> | 2015-07-12 23:11:40 +0300 |
commit | 28072898f1bd9a925d73ac187d560198d6345524 (patch) | |
tree | a46d781fb85d9dda61fc8f68e0ba6ec43d60ce55 | |
parent | 75672b57a2d63f01d00795fe8d661d1efe7b6e8d (diff) | |
download | wikitrans-28072898f1bd9a925d73ac187d560198d6345524.tar.gz wikitrans-28072898f1bd9a925d73ac187d560198d6345524.tar.bz2 |
Improve tag handling and debugging
* wikimarkup.py: Rewrite tag recognition.
Implement dump method.
* wikicvt.py: New options -D (--dump), and -t dump
* wiki2html.py (input_tag): Remove method
(str_tag): Change handling of tags
* wiki2texi.py: Likewise.
* wiki2text.py: Likewise.
-rw-r--r-- | wiki2html.py | 28 | ||||
-rw-r--r-- | wiki2texi.py | 37 | ||||
-rw-r--r-- | wiki2text.py | 27 | ||||
-rwxr-xr-x | wikicvt.py | 26 | ||||
-rw-r--r-- | wikimarkup.py | 317 |
5 files changed, 309 insertions, 126 deletions
diff --git a/wiki2html.py b/wiki2html.py index 441bc76..66939c4 100644 --- a/wiki2html.py +++ b/wiki2html.py @@ -174,21 +174,17 @@ class HtmlWikiMarkup (WikiMarkup): - supported_tags = [ 'nowiki', 'code' ] - def input_tag(self, tag): - return tag['tag'] in self.supported_tags - def str_tag(self, elt): if elt['tag'] == 'nowiki': - return '<pre>' + elt['content'] + '</pre>' + return '<pre>' + self.format(elt['content']) + '</pre>' elif elt['tag'] == 'code': - kwdict = { - 'nested': self.nested + 1, - 'lang': self.lang, - 'text': elt['content'], - 'html_base': self.html_base, - 'image_base': self.image_base, - 'media_base': self.media_base } - markup = HtmlWiktionaryMarkup(**kwdict) - markup.debug_level = self.debug_level - markup.parse() - return '<pre><code>' + str(markup) + '</code></pre>' #FIXME + self.nested += 1 + s = self.format(elt['content']) + self.nested -= 1 + return '<pre><code>' + s + '</code></pre>' #FIXME + else: + s = '<' + elt['tag'] + if elt['args']: + s += ' ' + elt['args'] + s += '>' + s += self.format(elt['content']) + return s + '</' + elt['tag'] + '>' diff --git a/wiki2texi.py b/wiki2texi.py index 7cc67bd..0b3eb77 100644 --- a/wiki2texi.py +++ b/wiki2texi.py @@ -121,25 +121,24 @@ class TexiWikiMarkup (WikiMarkup): - supported_tags = [ 'nowiki', 'code' ] - def input_tag(self, tag): - return tag['tag'] in self.supported_tags - def str_tag(self, elt): if elt['tag'] == 'nowiki': - return '@example\n' + elt['content'] + '@end example\n' + return '@example\n' + self.format(elt['content']) + '@end example\n' elif elt['tag'] == 'code': - kwdict = { - 'nested': self.nested + 1, - 'lang': self.lang, - 'text': elt['content'], - 'html_base': self.html_base, - 'image_base': self.image_base, - 'media_base': self.media_base } - markup = TexiWikiMarkup(**kwdict) - markup.debug_level = self.debug_level - markup.parse() - s = str(markup) + self.nested += 1 + s = self.format(elt['content']) + self.nested -= 1 if not s.endswith("\n"): - s += "\n"; + s += "\n" return '@example\n' + s + '@end example\n' - + elif elt['tag'] == 'tt': + self.nested += 1 + s = self.format(elt['content']) + self.nested -= 1 + return "@code{%s}" % s + else: + s = '<' + elt['tag'] + if elt['args']: + s += ' ' + elt['args'] + s += '>' + self.format(elt['content']) + '</' + elt['tag'] + '>' + return s + def str_para(self, elt): @@ -158,3 +157,3 @@ class TexiWikiMarkup (WikiMarkup): string += "\n"; - return '@example\n' + string + '@end example\n' + return '\n@example\n' + string + '@end example\n' diff --git a/wiki2text.py b/wiki2text.py index 27a7051..d4cab81 100644 --- a/wiki2text.py +++ b/wiki2text.py @@ -144,21 +144,16 @@ class TextWikiMarkup (WikiMarkup): - supported_tags = [ 'nowiki', 'code' ] - def input_tag(self, tag): - return tag['tag'] in self.supported_tags - def str_tag(self, elt): if elt['tag'] == 'nowiki': - return elt['content'] + return self.format(elt['content']) elif elt['tag'] == 'code': - kwdict = { - 'nested': self.nested + 1, - 'lang': self.lang, - 'text': elt['content'], - 'html_base': self.html_base, - 'image_base': self.image_base, - 'media_base': self.media_base } - markup = TextWiktionaryMarkup(**kwdict) - markup.debug_level = self.debug_level - markup.parse() - return str(markup) + self.nested += 1 + s = self.format(elt['content']) + self.nested -= 1 + return s #FIXME + else: + s = '<' + elt['tag'] + if elt['args']: + s += ' ' + elt['args'] + s += '>' + self.format(elt['content']) + '</' + elt['tag'] + '>' + return s @@ -19,2 +19,3 @@ import sys import getopt +import StringIO from wiki2html import * @@ -23,2 +24,11 @@ from wiki2texi import * +class DumpWikiMarkup (WikiMarkup): + def __str__(self): + if self.tree: + s = StringIO.StringIO() + self.dump(self.tree, 0, s) + return s.getvalue() + else: + return "" + def usage(code=0): @@ -31,2 +41,5 @@ usage: %s [-hvt] [-I INTYPE] [-l lang] [-o kw=val] [--lang=lang] [--option kw=va handlers = { + 'dump': { + 'default': DumpWikiMarkup + }, 'html': { @@ -53,5 +66,6 @@ def main(): try: - opts, args = getopt.getopt(sys.argv[1:], "d:I:hl:o:t:v", - ["debug=", "help", "lang=", "option=", - "to", "type", "input-text", "input-type", + opts, args = getopt.getopt(sys.argv[1:], "Dd:I:hl:o:t:v", + ["dump", + "debug=", "help", "lang=", "option=", + "to=", "type=", "input-text", "input-type=", "verbose" ]) @@ -79,2 +93,4 @@ def main(): debug = eval(a) + elif o in ("-D", "--dump"): + otype = 'dump' @@ -90,4 +106,4 @@ def main(): - if handlers.has_key(otype): - if handlers[otype].has_key(itype): + if otype in handlers: + if itype in handlers[otype]: markup = handlers[otype][itype](**kwdict) diff --git a/wikimarkup.py b/wikimarkup.py index fde1ec1..9a79d1e 100644 --- a/wikimarkup.py +++ b/wikimarkup.py @@ -24,5 +24,5 @@ __all__ = [ "BaseWikiMarkup", "WikiMarkup", -delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)") -otag = re.compile("^\s*<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>.+))?\s*(?P<closed>/)?>") -ctag = re.compile("^\s*</(?P<tag>[a-zA-Z0-9_]+)\s*>") +delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)|<") +otag = re.compile("(?P<pfx>[^<]*)<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>.+))?\s*(?P<closed>/)?>") +ctag = re.compile("(?P<pfx>[^<]*)</(?P<tag>[a-zA-Z0-9_]+)\s*>") @@ -48,2 +48,4 @@ class BaseWikiMarkup(object): + tags = [ 'code', 'nowiki', 'tt', 'div' ] + nested = 0 @@ -55,5 +57,102 @@ class BaseWikiMarkup(object): - def input_tag(self, tag): + def print_dump_prefix(self, level, file): + file.write("[DUMP]" + ' ' * (2*level + 1)) + + def dump_nil(self, node, level, file): pass + def dump_text(self, node, level, file): + self.print_dump_prefix(level, file) + file.write("CONTENT: \"%s\"\n" % node['content']) + + def dump_delim(self, node, level, file): + file.write("'%s'" % node['content']) + if 'continuation' in node: + file.write(" (cont)") + file.write("\n") + + def dump_tag(self, node, level, file): + self.print_dump_prefix(level, file) + file.write("TAG: %s\n" % node['tag']) + if 'args' in node: + self.print_dump_prefix(level, file) + file.write("ARGS: %s\n" % node['args']) + if 'content' in node: + self.dump_node(node['content'], level + 1, file) + + def dump_seq(self, node, level, file): + self.dump(node['content'], level + 1, file) + + def dump_ref(self, node, level, file): + self.print_dump_prefix(level, file) + file.write("REF: %s\n" % node['ref']) + self.dump_node(node['content'], level + 1, file) + + def dump_hdr(self, node, level, file): + self.print_dump_prefix(level, file) + file.write("LEVEL: %s\n" % node['level']) + self.dump_node(node['content'], level + 1, file) + + def dump_elt(self, node, level, file): + self.print_dump_prefix(level, file) + file.write("SUBTYPE: %s\n" % node['subtype']) + self.dump_node(node['content'], level + 1, file) + + def dump_env(self, node, level, file): + self.print_dump_prefix(level, file) + file.write("ENVTYPE: %s\n" % node['envtype']) + self.print_dump_prefix(level, file) + file.write("LEVEL: %s\n" % node['level']) + self.dump(node['content'], level + 1, file) + + def dump_ind(self, node, level, file): + self.print_dump_prefix(level, file) + file.write("LEVEL: %s\n" % node['level']) + self.dump_node(node['content'], level + 1, file) + + def dump_link(self, node, level, file): + self.dump(node['content'], level + 1, file) + + dump_type = { + 'NIL': dump_nil, + 'NL': dump_nil, + 'TEXT': dump_text, + 'DELIM': dump_delim, + 'OTAG': dump_tag, + 'CTAG': dump_tag, + 'TAG': dump_tag, + 'SEQ': dump_seq, + 'REF': dump_ref, + 'HDR': dump_hdr, + 'ELT': dump_elt, + 'ENV': dump_env, + 'IND': dump_ind, + 'BAR': dump_nil, + 'PARA': dump_seq, + 'PRE': dump_text, + 'BOLD': dump_seq, + 'IT': dump_seq, + 'LINK': dump_link, + } + + def dump_node(self, node, level, file): + if type(node) != dict: + file.write("UNHANDLED NODE: %s, %s\n" % (type(node),node)) + return + + self.print_dump_prefix(level, file) + file.write("NODE " + node['type'] + ":\n") + if node['type'] in self.dump_type: + self.dump_type[node['type']](self, node, level, file) + else: + self.print_dump_prefix(level, file) + file.write("(UNHANDLED) ") + file.write("%s\n" % node) + self.print_dump_prefix(level, file) + file.write("END NODE " + node['type'] + "\n") + + def dump(self, tree, level=0, file=sys.stdout): + for node in tree: + self.dump_node(node, level, file) + def tokread(self): @@ -85,7 +184,8 @@ class BaseWikiMarkup(object): pos = m.end(0) - if envtypes.has_key(m.group(0)[0]) and line[pos] == ":": - # FIXME? - # FIXME: What's "extra"? + + if m and line[m.start(0)] != '<': + if m.group(0)[0] in envtypes and pos < len(line) and line[pos] == ":": yield({ 'type': 'DELIM', - 'content': m.group(0) }) + 'content': m.group(0), + 'continuation': True }) pos += 1 @@ -95,9 +195,35 @@ class BaseWikiMarkup(object): else: - m = otag.match(line) if m: - t = { 'type': 'TAG', + pos -= 1 + t = None + m = otag.match(line, pos) + if m and m.group('tag') in self.tags: + rest = line[m.end(0):] + line = m.group('pfx') + pos = 0 + t = { 'type': 'OTAG', 'tag': m.group('tag'), 'args': m.group('args') } - - if self.input_tag(t): + else: + m = ctag.match(line, pos) + if m and m.group('tag') in self.tags: + rest = line[m.end(0):] + line = m.group('pfx') + pos = 0 + t = { 'type': 'CTAG', + 'tag': m.group('tag') } + + if line: + if line[-1] == '\n': + if line[pos:-1] != '': + yield({ 'type': 'TEXT', + 'content': line[pos:-1] }) + yield({ 'type': 'NL', + 'content': '\n' }) + else: + yield({ 'type': 'TEXT', + 'content': line[pos:] }) + + if t: + if t['type'] == 'OTAG' and t['tag'] == 'nowiki': s = '' @@ -113,20 +239,13 @@ class BaseWikiMarkup(object): break - yield({ 'type': 'TAG', - 'tag': t['tag'], - 'args': t['args'], - 'content': s - }) - line = None - continue - - if line[-1] == '\n': - if line[pos:-1] != '': - yield({ 'type': 'TEXT', - 'content': line[pos:-1] }) - yield({ 'type': 'NL', - 'content': '\n' }) + t['type'] = 'TAG' + t['content'] = {'type': 'TEXT', 'content': s} + + yield(t) + if t['type'] == 'OTAG' and m.group('closed'): + t['type'] = 'CTAG' + yield(t) + line = rest + pos = 0 else: - yield({ 'type': 'TEXT', - 'content': line[pos:] }) - line = None + line = None @@ -196,5 +315,6 @@ class BaseWikiMarkup(object): self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL' + if self.tokind == len(self.toklist): + return { 'type': 'NIL' } tok = self.toklist[self.tokind] - if tok['type'] != 'NIL': - self.tokind = self.tokind + 1 + self.tokind = self.tokind + 1 return tok @@ -210,3 +330,3 @@ class BaseWikiMarkup(object): seq = [] - textlist = [] + text = '' while 1: @@ -214,3 +334,3 @@ class BaseWikiMarkup(object): if tok['type'] == 'TEXT': - textlist.append(tok['content']) + text += tok['content'] elif tok['type'] == 'DELIM': @@ -219,5 +339,5 @@ class BaseWikiMarkup(object): elif self.is_inline_delim(tok): - if textlist: - seq.append({ 'type': 'TEXT', 'content': textlist }) - textlist = [] + if text: + seq.append({ 'type': 'TEXT', 'content': text }) + text = '' x = self.parse_inline(tok) @@ -239,4 +359,4 @@ class BaseWikiMarkup(object): return None - if textlist: - seq.append({ 'type': 'TEXT', 'content': textlist }) + if text: + seq.append({ 'type': 'TEXT', 'content': text }) res = { 'type': what, 'content': seq } @@ -345,8 +465,14 @@ class BaseWikiMarkup(object): tok = self.peektkn() - if re.match("^\s", tok['content']): - type = 'PRE' - rx = re.compile("^\S") + + if self.newline: + if re.match("^\s", tok['content']): + type = 'PRE' + rx = re.compile("^\S") + else: + type = 'PARA' + rx = re.compile("^\s") else: - type = 'PARA' - rx = re.compile("^\s") + type = 'SEQ' + rx = None + while 1: @@ -354,3 +480,3 @@ class BaseWikiMarkup(object): if tok['type'] == 'TEXT': - if self.newline and rx.match(tok['content']): + if rx and self.newline and rx.match(tok['content']): self.ungetkn() @@ -369,2 +495,5 @@ class BaseWikiMarkup(object): break + elif tok['type'] == 'OTAG' or tok['type'] == 'CTAG' or tok['type'] == 'TAG': + self.ungetkn() + break elif tok['type'] == 'DELIM': @@ -372,3 +501,4 @@ class BaseWikiMarkup(object): if textlist: - seq.append({ 'type': 'TEXT', 'content': textlist }) + seq.append({ 'type': 'TEXT', + 'content': ''.join(textlist) }) textlist = [] @@ -399,3 +529,3 @@ class BaseWikiMarkup(object): if textlist: - seq.append({ 'type': 'TEXT', 'content': textlist }) + seq.append({ 'type': 'TEXT', 'content': ''.join(textlist) }) self.dprint(80, "LEAVE parse_para=%s", seq) @@ -445,11 +575,14 @@ class BaseWikiMarkup(object): list.append(tok) - elif tok['type'] == 'DELIM' and tok['content'][0] == ":": - list.append(self.parse_indent(len(tok['content']))) - break - else: - x = self.parse_inline(tok) - if x: - list.append(x) + elif tok['type'] == 'DELIM': + if tok['content'][0] == ":": + list.append(self.parse_indent(len(tok['content']))) + break else: - list.append(tok) + x = self.parse_inline(tok) + if x: + list.append(x) + else: + list.append(tok) + else: + list.append(tok) self.dprint(80, "LEAVE parse_line=(SEQ, %s)", list) @@ -463,3 +596,3 @@ class BaseWikiMarkup(object): if tok['type'] == 'DELIM' \ - and envtypes.has_key(tok['content'][0]) \ + and tok['content'][0] in envtypes \ and type == envtypes[tok['content'][0]][0]: @@ -473,3 +606,3 @@ class BaseWikiMarkup(object): elt = self.parse_line() - if len(tok.keys()) == 2: + if 'continuation' not in tok: list.append({ 'type': 'ELT', @@ -479,7 +612,8 @@ class BaseWikiMarkup(object): - if list[-1]['content']['type'] != 'SEQ': - x = list[-1]['content']['content'] - # FIXME: - list[-1]['content'] = { 'type': 'SEQ', 'content': [x] } - list[-1]['content']['content'].append(elt) + if list: + if list[-1]['content']['type'] != 'SEQ': + x = list[-1]['content']['content'] + # FIXME: + list[-1]['content'] = { 'type': 'SEQ', 'content': [x] } + list[-1]['content']['content'].append(elt) else: @@ -496,4 +630,41 @@ class BaseWikiMarkup(object): + def parse_til(self, tag): + self.dprint(80, "ENTER parse_til(%s)", tag) + seq = [] + save = self.tokind + while 1: + t = self.parse0() + if t == None or t['type'] == 'NIL': + self.tokind = save + s = '<' + tag['tag'] + if 'args' in tag and tag['args']: + s += ' ' + tag['args'] + del tag['args'] + s += '>' + if 'content' in tag: + subtree = tag['content'] + else: + subtree = None + tag['type'] = 'TEXT' + tag['content'] = s + if subtree: + self.tree[self.tokind:self.tokind] = subtree + self.dprint(80, "LEAVE parse_til = %s (tree modified)", tag) + self.ungetkn() + return self.parse0() + + if t['type'] == 'CTAG' and tag['tag'] == t['tag']: + break + seq.append(t) + + ret = { 'type': 'TAG', + 'tag': tag['tag'], + 'args': tag['args'], + 'content': { 'type': 'SEQ', 'content': seq } } + self.dprint(80, "LEAVE parse_til = %s", ret) + return ret + def parse0(self): tok = self.getkn() + self.dprint(80, "parse0: %s", tok) toktype = tok['type'] @@ -509,3 +680,3 @@ class BaseWikiMarkup(object): return self.parse_header(tok['content']) - elif envtypes.has_key(tok['content'][0]): + elif tok['content'][0] in envtypes: type = envtypes[tok['content'][0]][0] @@ -521,3 +692,4 @@ class BaseWikiMarkup(object): return { 'type': 'TEXT', 'content': '\n' } -# return self.parse0() + elif toktype == 'OTAG': + return self.parse_til(tok) else: @@ -528,3 +700,7 @@ class BaseWikiMarkup(object): self.tokenize() - self.dprint(90, "TOKLIST: %s", self.toklist) + if self.debug_level >= 90: + print("TOKEN DUMP BEGIN") + self.dump(self.toklist) + print("TOKEN DUMP END") + self.tokind = 0 @@ -536,2 +712,3 @@ class BaseWikiMarkup(object): self.tree.append(subtree) + if self.nested: @@ -539,3 +716,7 @@ class BaseWikiMarkup(object): self.tree[0]['type'] = 'SEQ' - self.dprint(70, "TREE: %s", self.tree) + + if self.debug_level >= 70: + print("TREE DUMP BEGIN") + self.dump(self.tree) + print("TREE DUMP END") @@ -621,7 +802,3 @@ class WikiMarkup (BaseWikiMarkup): if elt['type'] == 'TEXT': - if isinstance(elt['content'],list): - for s in elt['content']: - if re.search('\w', s): - return False - elif re.search('\w', elt['content']): + if re.search('\w', elt['content']): return False |