From 28072898f1bd9a925d73ac187d560198d6345524 Mon Sep 17 00:00:00 2001 From: Sergey Poznyakoff Date: Sun, 12 Jul 2015 23:11:40 +0300 Subject: Improve tag handling and debugging * wikimarkup.py: Rewrite tag recognition. Implement dump method. * wikicvt.py: New options -D (--dump), and -t dump * wiki2html.py (input_tag): Remove method (str_tag): Change handling of tags * wiki2texi.py: Likewise. * wiki2text.py: Likewise. --- wiki2html.py | 28 +++--- wiki2texi.py | 37 ++++--- wiki2text.py | 27 ++--- wikicvt.py | 26 ++++- wikimarkup.py | 317 +++++++++++++++++++++++++++++++++++++++++++++------------- 5 files changed, 309 insertions(+), 126 deletions(-) diff --git a/wiki2html.py b/wiki2html.py index 441bc76..66939c4 100644 --- a/wiki2html.py +++ b/wiki2html.py @@ -172,25 +172,21 @@ class HtmlWikiMarkup (WikiMarkup): self.envt[type]["hdr"]) return string - supported_tags = [ 'nowiki', 'code' ] - def input_tag(self, tag): - return tag['tag'] in self.supported_tags - def str_tag(self, elt): if elt['tag'] == 'nowiki': - return '
' + elt['content'] + '
' + return '
' + self.format(elt['content']) + '
' elif elt['tag'] == 'code': - kwdict = { - 'nested': self.nested + 1, - 'lang': self.lang, - 'text': elt['content'], - 'html_base': self.html_base, - 'image_base': self.image_base, - 'media_base': self.media_base } - markup = HtmlWiktionaryMarkup(**kwdict) - markup.debug_level = self.debug_level - markup.parse() - return '
' + str(markup) + '
' #FIXME + self.nested += 1 + s = self.format(elt['content']) + self.nested -= 1 + return '
' + s + '
' #FIXME + else: + s = '<' + elt['tag'] + if elt['args']: + s += ' ' + elt['args'] + s += '>' + s += self.format(elt['content']) + return s + '' def str_para(self, elt): string = ""; diff --git a/wiki2texi.py b/wiki2texi.py index 7cc67bd..0b3eb77 100644 --- a/wiki2texi.py +++ b/wiki2texi.py @@ -119,29 +119,28 @@ class TexiWikiMarkup (WikiMarkup): else: return str(elt) - supported_tags = [ 'nowiki', 'code' ] - def input_tag(self, tag): - return tag['tag'] in self.supported_tags - def str_tag(self, elt): if elt['tag'] == 'nowiki': - return '@example\n' + elt['content'] + '@end example\n' + return '@example\n' + self.format(elt['content']) + '@end example\n' elif elt['tag'] == 'code': - kwdict = { - 'nested': self.nested + 1, - 'lang': self.lang, - 'text': elt['content'], - 'html_base': self.html_base, - 'image_base': self.image_base, - 'media_base': self.media_base } - markup = TexiWikiMarkup(**kwdict) - markup.debug_level = self.debug_level - markup.parse() - s = str(markup) + self.nested += 1 + s = self.format(elt['content']) + self.nested -= 1 if not s.endswith("\n"): - s += "\n"; + s += "\n" return '@example\n' + s + '@end example\n' - + elif elt['tag'] == 'tt': + self.nested += 1 + s = self.format(elt['content']) + self.nested -= 1 + return "@code{%s}" % s + else: + s = '<' + elt['tag'] + if elt['args']: + s += ' ' + elt['args'] + s += '>' + self.format(elt['content']) + '' + return s + def str_para(self, elt): string = ""; for x in elt['content']: @@ -156,7 +155,7 @@ class TexiWikiMarkup (WikiMarkup): return string if not string.endswith("\n"): string += "\n"; - return '@example\n' + string + '@end example\n' + return '\n@example\n' + string + '@end example\n' def concat(self, eltlist): string = "" diff --git a/wiki2text.py b/wiki2text.py index 27a7051..d4cab81 100644 --- a/wiki2text.py +++ b/wiki2text.py @@ -142,25 +142,20 @@ class TextWikiMarkup (WikiMarkup): length += wsc + wlen return output + linebuf - supported_tags = [ 'nowiki', 'code' ] - def input_tag(self, tag): - return tag['tag'] in self.supported_tags - def str_tag(self, elt): if elt['tag'] == 'nowiki': - return elt['content'] + return self.format(elt['content']) elif elt['tag'] == 'code': - kwdict = { - 'nested': self.nested + 1, - 'lang': self.lang, - 'text': elt['content'], - 'html_base': self.html_base, - 'image_base': self.image_base, - 'media_base': self.media_base } - markup = TextWiktionaryMarkup(**kwdict) - markup.debug_level = self.debug_level - markup.parse() - return str(markup) + self.nested += 1 + s = self.format(elt['content']) + self.nested -= 1 + return s #FIXME + else: + s = '<' + elt['tag'] + if elt['args']: + s += ' ' + elt['args'] + s += '>' + self.format(elt['content']) + '' + return s def format(self, elt): if elt['type'] == 'TEXT': diff --git a/wikicvt.py b/wikicvt.py index e61e28b..c8ca887 100755 --- a/wikicvt.py +++ b/wikicvt.py @@ -17,10 +17,20 @@ import sys import getopt +import StringIO from wiki2html import * from wiki2text import * from wiki2texi import * +class DumpWikiMarkup (WikiMarkup): + def __str__(self): + if self.tree: + s = StringIO.StringIO() + self.dump(self.tree, 0, s) + return s.getvalue() + else: + return "" + def usage(code=0): print """ usage: %s [-hvt] [-I INTYPE] [-l lang] [-o kw=val] [--lang=lang] [--option kw=val] @@ -29,6 +39,9 @@ usage: %s [-hvt] [-I INTYPE] [-l lang] [-o kw=val] [--lang=lang] [--option kw=va sys.exit(code) handlers = { + 'dump': { + 'default': DumpWikiMarkup + }, 'html': { 'default': HtmlWikiMarkup, 'wiktionary': HtmlWiktionaryMarkup @@ -51,9 +64,10 @@ def main(): debug = 0 try: - opts, args = getopt.getopt(sys.argv[1:], "d:I:hl:o:t:v", - ["debug=", "help", "lang=", "option=", - "to", "type", "input-text", "input-type", + opts, args = getopt.getopt(sys.argv[1:], "Dd:I:hl:o:t:v", + ["dump", + "debug=", "help", "lang=", "option=", + "to=", "type=", "input-text", "input-type=", "verbose" ]) except getopt.GetoptError: usage(1) @@ -77,6 +91,8 @@ def main(): input_text = True elif o in ("-d", "--debug"): debug = eval(a) + elif o in ("-D", "--dump"): + otype = 'dump' if len(args) == 1: if args[0] == '-': @@ -88,8 +104,8 @@ def main(): kwdict['lang']=lang - if handlers.has_key(otype): - if handlers[otype].has_key(itype): + if otype in handlers: + if itype in handlers[otype]: markup = handlers[otype][itype](**kwdict) markup.debug_level = debug markup.parse() diff --git a/wikimarkup.py b/wikimarkup.py index fde1ec1..9a79d1e 100644 --- a/wikimarkup.py +++ b/wikimarkup.py @@ -22,9 +22,9 @@ from types import * __all__ = [ "BaseWikiMarkup", "WikiMarkup", "envtypes" ] -delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)") -otag = re.compile("^\s*<(?P[a-zA-Z0-9_]+)(?:\s+(?P.+))?\s*(?P/)?>") -ctag = re.compile("^\s*[a-zA-Z0-9_]+)\s*>") +delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)|<") +otag = re.compile("(?P[^<]*)<(?P[a-zA-Z0-9_]+)(?:\s+(?P.+))?\s*(?P/)?>") +ctag = re.compile("(?P[^<]*)[a-zA-Z0-9_]+)\s*>") close_delim = { '[': ']', @@ -46,6 +46,8 @@ class BaseWikiMarkup(object): newline = 0 tree = None + tags = [ 'code', 'nowiki', 'tt', 'div' ] + nested = 0 debug_level = 0 @@ -53,9 +55,106 @@ class BaseWikiMarkup(object): if self.debug_level >= lev: print "[DEBUG]", fmt % argv - def input_tag(self, tag): + def print_dump_prefix(self, level, file): + file.write("[DUMP]" + ' ' * (2*level + 1)) + + def dump_nil(self, node, level, file): pass + def dump_text(self, node, level, file): + self.print_dump_prefix(level, file) + file.write("CONTENT: \"%s\"\n" % node['content']) + + def dump_delim(self, node, level, file): + file.write("'%s'" % node['content']) + if 'continuation' in node: + file.write(" (cont)") + file.write("\n") + + def dump_tag(self, node, level, file): + self.print_dump_prefix(level, file) + file.write("TAG: %s\n" % node['tag']) + if 'args' in node: + self.print_dump_prefix(level, file) + file.write("ARGS: %s\n" % node['args']) + if 'content' in node: + self.dump_node(node['content'], level + 1, file) + + def dump_seq(self, node, level, file): + self.dump(node['content'], level + 1, file) + + def dump_ref(self, node, level, file): + self.print_dump_prefix(level, file) + file.write("REF: %s\n" % node['ref']) + self.dump_node(node['content'], level + 1, file) + + def dump_hdr(self, node, level, file): + self.print_dump_prefix(level, file) + file.write("LEVEL: %s\n" % node['level']) + self.dump_node(node['content'], level + 1, file) + + def dump_elt(self, node, level, file): + self.print_dump_prefix(level, file) + file.write("SUBTYPE: %s\n" % node['subtype']) + self.dump_node(node['content'], level + 1, file) + + def dump_env(self, node, level, file): + self.print_dump_prefix(level, file) + file.write("ENVTYPE: %s\n" % node['envtype']) + self.print_dump_prefix(level, file) + file.write("LEVEL: %s\n" % node['level']) + self.dump(node['content'], level + 1, file) + + def dump_ind(self, node, level, file): + self.print_dump_prefix(level, file) + file.write("LEVEL: %s\n" % node['level']) + self.dump_node(node['content'], level + 1, file) + + def dump_link(self, node, level, file): + self.dump(node['content'], level + 1, file) + + dump_type = { + 'NIL': dump_nil, + 'NL': dump_nil, + 'TEXT': dump_text, + 'DELIM': dump_delim, + 'OTAG': dump_tag, + 'CTAG': dump_tag, + 'TAG': dump_tag, + 'SEQ': dump_seq, + 'REF': dump_ref, + 'HDR': dump_hdr, + 'ELT': dump_elt, + 'ENV': dump_env, + 'IND': dump_ind, + 'BAR': dump_nil, + 'PARA': dump_seq, + 'PRE': dump_text, + 'BOLD': dump_seq, + 'IT': dump_seq, + 'LINK': dump_link, + } + + def dump_node(self, node, level, file): + if type(node) != dict: + file.write("UNHANDLED NODE: %s, %s\n" % (type(node),node)) + return + + self.print_dump_prefix(level, file) + file.write("NODE " + node['type'] + ":\n") + if node['type'] in self.dump_type: + self.dump_type[node['type']](self, node, level, file) + else: + self.print_dump_prefix(level, file) + file.write("(UNHANDLED) ") + file.write("%s\n" % node) + self.print_dump_prefix(level, file) + file.write("END NODE " + node['type'] + "\n") + + def dump(self, tree, level=0, file=sys.stdout): + for node in tree: + self.dump_node(node, level, file) + def tokread(self): line = None pos = 0 @@ -83,23 +182,50 @@ class BaseWikiMarkup(object): if (pos < m.start(0)): yield({'type': 'TEXT', 'content': line[pos:m.start(0)]}) pos = m.end(0) - if envtypes.has_key(m.group(0)[0]) and line[pos] == ":": - # FIXME? - # FIXME: What's "extra"? + + if m and line[m.start(0)] != '<': + if m.group(0)[0] in envtypes and pos < len(line) and line[pos] == ":": yield({ 'type': 'DELIM', - 'content': m.group(0) }) + 'content': m.group(0), + 'continuation': True }) pos += 1 else: yield({ 'type': 'DELIM', 'content': m.group(0) }) else: - m = otag.match(line) if m: - t = { 'type': 'TAG', + pos -= 1 + t = None + m = otag.match(line, pos) + if m and m.group('tag') in self.tags: + rest = line[m.end(0):] + line = m.group('pfx') + pos = 0 + t = { 'type': 'OTAG', 'tag': m.group('tag'), 'args': m.group('args') } - - if self.input_tag(t): + else: + m = ctag.match(line, pos) + if m and m.group('tag') in self.tags: + rest = line[m.end(0):] + line = m.group('pfx') + pos = 0 + t = { 'type': 'CTAG', + 'tag': m.group('tag') } + + if line: + if line[-1] == '\n': + if line[pos:-1] != '': + yield({ 'type': 'TEXT', + 'content': line[pos:-1] }) + yield({ 'type': 'NL', + 'content': '\n' }) + else: + yield({ 'type': 'TEXT', + 'content': line[pos:] }) + + if t: + if t['type'] == 'OTAG' and t['tag'] == 'nowiki': s = '' if not m.group('closed'): while 1: @@ -111,24 +237,17 @@ class BaseWikiMarkup(object): s += l except StopIteration: break - yield({ 'type': 'TAG', - 'tag': t['tag'], - 'args': t['args'], - 'content': s - }) - line = None - continue - - if line[-1] == '\n': - if line[pos:-1] != '': - yield({ 'type': 'TEXT', - 'content': line[pos:-1] }) - yield({ 'type': 'NL', - 'content': '\n' }) + t['type'] = 'TAG' + t['content'] = {'type': 'TEXT', 'content': s} + + yield(t) + if t['type'] == 'OTAG' and m.group('closed'): + t['type'] = 'CTAG' + yield(t) + line = rest + pos = 0 else: - yield({ 'type': 'TEXT', - 'content': line[pos:] }) - line = None + line = None def input(self): return None @@ -194,9 +313,10 @@ class BaseWikiMarkup(object): def getkn(self): self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL' + if self.tokind == len(self.toklist): + return { 'type': 'NIL' } tok = self.toklist[self.tokind] - if tok['type'] != 'NIL': - self.tokind = self.tokind + 1 + self.tokind = self.tokind + 1 return tok def ungetkn(self): @@ -208,18 +328,18 @@ class BaseWikiMarkup(object): self.dprint(80, "ENTER parse_fontmod(%s,%s), tok %s", delim, what, self.peektkn()) seq = [] - textlist = [] + text = '' while 1: tok = self.getkn() if tok['type'] == 'TEXT': - textlist.append(tok['content']) + text += tok['content'] elif tok['type'] == 'DELIM': if tok['content'] == delim: break elif self.is_inline_delim(tok): - if textlist: - seq.append({ 'type': 'TEXT', 'content': textlist }) - textlist = [] + if text: + seq.append({ 'type': 'TEXT', 'content': text }) + text = '' x = self.parse_inline(tok) if x: seq.append(x) @@ -237,8 +357,8 @@ class BaseWikiMarkup(object): else: self.dprint(80, "LEAVE parse_fontmod=None") return None - if textlist: - seq.append({ 'type': 'TEXT', 'content': textlist }) + if text: + seq.append({ 'type': 'TEXT', 'content': text }) res = { 'type': what, 'content': seq } self.dprint(80, "LEAVE parse_fontmod=%s", res) return res @@ -343,16 +463,22 @@ class BaseWikiMarkup(object): seq = [] textlist = [] tok = self.peektkn() - if re.match("^\s", tok['content']): - type = 'PRE' - rx = re.compile("^\S") + + if self.newline: + if re.match("^\s", tok['content']): + type = 'PRE' + rx = re.compile("^\S") + else: + type = 'PARA' + rx = re.compile("^\s") else: - type = 'PARA' - rx = re.compile("^\s") + type = 'SEQ' + rx = None + while 1: tok = self.getkn() if tok['type'] == 'TEXT': - if self.newline and rx.match(tok['content']): + if rx and self.newline and rx.match(tok['content']): self.ungetkn() break textlist.append(tok['content']) @@ -367,10 +493,14 @@ class BaseWikiMarkup(object): textlist.append('\n') elif tok['type'] == 'NIL': break + elif tok['type'] == 'OTAG' or tok['type'] == 'CTAG' or tok['type'] == 'TAG': + self.ungetkn() + break elif tok['type'] == 'DELIM': if self.is_inline_delim(tok): if textlist: - seq.append({ 'type': 'TEXT', 'content': textlist }) + seq.append({ 'type': 'TEXT', + 'content': ''.join(textlist) }) textlist = [] x = self.parse_inline(tok) if x: @@ -397,7 +527,7 @@ class BaseWikiMarkup(object): # self.ungetkn() break if textlist: - seq.append({ 'type': 'TEXT', 'content': textlist }) + seq.append({ 'type': 'TEXT', 'content': ''.join(textlist) }) self.dprint(80, "LEAVE parse_para=%s", seq) return { 'type': type, 'content': seq } @@ -443,15 +573,18 @@ class BaseWikiMarkup(object): break elif tok['type'] == 'TEXT': list.append(tok) - elif tok['type'] == 'DELIM' and tok['content'][0] == ":": - list.append(self.parse_indent(len(tok['content']))) - break - else: - x = self.parse_inline(tok) - if x: - list.append(x) + elif tok['type'] == 'DELIM': + if tok['content'][0] == ":": + list.append(self.parse_indent(len(tok['content']))) + break else: - list.append(tok) + x = self.parse_inline(tok) + if x: + list.append(x) + else: + list.append(tok) + else: + list.append(tok) self.dprint(80, "LEAVE parse_line=(SEQ, %s)", list) return { 'type': 'SEQ', 'content': list } @@ -461,7 +594,7 @@ class BaseWikiMarkup(object): while 1: tok = self.getkn() if tok['type'] == 'DELIM' \ - and envtypes.has_key(tok['content'][0]) \ + and tok['content'][0] in envtypes \ and type == envtypes[tok['content'][0]][0]: if len(tok['content']) < lev: self.ungetkn() @@ -471,17 +604,18 @@ class BaseWikiMarkup(object): elt = self.parse_env(type, len(tok['content'])) else: elt = self.parse_line() - if len(tok.keys()) == 2: + if 'continuation' not in tok: list.append({ 'type': 'ELT', 'subtype': envtypes[tok['content'][0]][1], 'content': elt }) continue - if list[-1]['content']['type'] != 'SEQ': - x = list[-1]['content']['content'] - # FIXME: - list[-1]['content'] = { 'type': 'SEQ', 'content': [x] } - list[-1]['content']['content'].append(elt) + if list: + if list[-1]['content']['type'] != 'SEQ': + x = list[-1]['content']['content'] + # FIXME: + list[-1]['content'] = { 'type': 'SEQ', 'content': [x] } + list[-1]['content']['content'].append(elt) else: self.ungetkn() break @@ -494,8 +628,45 @@ class BaseWikiMarkup(object): self.dprint(80, "LEAVE parse_indent=%s", x) return x + def parse_til(self, tag): + self.dprint(80, "ENTER parse_til(%s)", tag) + seq = [] + save = self.tokind + while 1: + t = self.parse0() + if t == None or t['type'] == 'NIL': + self.tokind = save + s = '<' + tag['tag'] + if 'args' in tag and tag['args']: + s += ' ' + tag['args'] + del tag['args'] + s += '>' + if 'content' in tag: + subtree = tag['content'] + else: + subtree = None + tag['type'] = 'TEXT' + tag['content'] = s + if subtree: + self.tree[self.tokind:self.tokind] = subtree + self.dprint(80, "LEAVE parse_til = %s (tree modified)", tag) + self.ungetkn() + return self.parse0() + + if t['type'] == 'CTAG' and tag['tag'] == t['tag']: + break + seq.append(t) + + ret = { 'type': 'TAG', + 'tag': tag['tag'], + 'args': tag['args'], + 'content': { 'type': 'SEQ', 'content': seq } } + self.dprint(80, "LEAVE parse_til = %s", ret) + return ret + def parse0(self): tok = self.getkn() + self.dprint(80, "parse0: %s", tok) toktype = tok['type'] if toktype == 'NIL': return None @@ -507,7 +678,7 @@ class BaseWikiMarkup(object): return { 'type': 'BAR' } elif tok['content'][0:2] == "==": return self.parse_header(tok['content']) - elif envtypes.has_key(tok['content'][0]): + elif tok['content'][0] in envtypes: type = envtypes[tok['content'][0]][0] lev = len(tok['content']) self.ungetkn() @@ -519,14 +690,19 @@ class BaseWikiMarkup(object): return self.parse_para() elif toktype == 'NL': return { 'type': 'TEXT', 'content': '\n' } -# return self.parse0() + elif toktype == 'OTAG': + return self.parse_til(tok) else: return tok def parse(self): if not self.toklist: self.tokenize() - self.dprint(90, "TOKLIST: %s", self.toklist) + if self.debug_level >= 90: + print("TOKEN DUMP BEGIN") + self.dump(self.toklist) + print("TOKEN DUMP END") + self.tokind = 0 self.tree = [] while 1: @@ -534,10 +710,15 @@ class BaseWikiMarkup(object): if subtree == None: break self.tree.append(subtree) + if self.nested: if self.tree[0]['type'] == 'PARA': self.tree[0]['type'] = 'SEQ' - self.dprint(70, "TREE: %s", self.tree) + + if self.debug_level >= 70: + print("TREE DUMP BEGIN") + self.dump(self.tree) + print("TREE DUMP END") def __str__(self): return str(self.tree) @@ -619,11 +800,7 @@ class WikiMarkup (BaseWikiMarkup): def is_empty_text(self, elt): if elt['type'] == 'TEXT': - if isinstance(elt['content'],list): - for s in elt['content']: - if re.search('\w', s): - return False - elif re.search('\w', elt['content']): + if re.search('\w', elt['content']): return False return True return False -- cgit v1.2.1