diff options
author | Sergey Poznyakoff <gray@gnu.org.ua> | 2015-07-12 23:11:40 +0300 |
---|---|---|
committer | Sergey Poznyakoff <gray@gnu.org.ua> | 2015-07-12 23:11:40 +0300 |
commit | 28072898f1bd9a925d73ac187d560198d6345524 (patch) | |
tree | a46d781fb85d9dda61fc8f68e0ba6ec43d60ce55 | |
parent | 75672b57a2d63f01d00795fe8d661d1efe7b6e8d (diff) | |
download | wit-28072898f1bd9a925d73ac187d560198d6345524.tar.gz wit-28072898f1bd9a925d73ac187d560198d6345524.tar.bz2 |
Improve tag handling and debugging
* wikimarkup.py: Rewrite tag recognition.
Implement dump method.
* wikicvt.py: New options -D (--dump) and -t dump.
* wiki2html.py (input_tag): Remove method.
(str_tag): Change handling of tags.
* wiki2texi.py: Likewise.
* wiki2text.py: Likewise.
-rw-r--r-- | wiki2html.py | 28 | ||||
-rw-r--r-- | wiki2texi.py | 37 | ||||
-rw-r--r-- | wiki2text.py | 27 | ||||
-rwxr-xr-x | wikicvt.py | 26 | ||||
-rw-r--r-- | wikimarkup.py | 317 |
5 files changed, 309 insertions, 126 deletions
diff --git a/wiki2html.py b/wiki2html.py index 441bc76..66939c4 100644 --- a/wiki2html.py +++ b/wiki2html.py @@ -151,67 +151,63 @@ class HtmlWikiMarkup (WikiMarkup): level = elt['level'] + 1 if level > 4: level = 4 return "<h%s>%s</h%s>" % (level, self.format(elt['content']), level) def str_bar(self): return "<hr/>" def str_env(self, elt): type = elt['envtype'] lev = elt['level'] if lev > 4: lev = 2 string = "" for s in elt['content']: n = s['subtype']; string += "<%s>%s</%s>" % (self.envt[type]["elt"][n], self.format(s['content']), self.envt[type]["elt"][n]) return "<%s>%s</%s>" % (self.envt[type]["hdr"], string, self.envt[type]["hdr"]) return string - supported_tags = [ 'nowiki', 'code' ] - def input_tag(self, tag): - return tag['tag'] in self.supported_tags - def str_tag(self, elt): if elt['tag'] == 'nowiki': - return '<pre>' + elt['content'] + '</pre>' + return '<pre>' + self.format(elt['content']) + '</pre>' elif elt['tag'] == 'code': - kwdict = { - 'nested': self.nested + 1, - 'lang': self.lang, - 'text': elt['content'], - 'html_base': self.html_base, - 'image_base': self.image_base, - 'media_base': self.media_base } - markup = HtmlWiktionaryMarkup(**kwdict) - markup.debug_level = self.debug_level - markup.parse() - return '<pre><code>' + str(markup) + '</code></pre>' #FIXME + self.nested += 1 + s = self.format(elt['content']) + self.nested -= 1 + return '<pre><code>' + s + '</code></pre>' #FIXME + else: + s = '<' + elt['tag'] + if elt['args']: + s += ' ' + elt['args'] + s += '>' + s += self.format(elt['content']) + return s + '</' + elt['tag'] + '>' def str_para(self, elt): string = ""; for x in elt['content']: string += self.format(x) return "<p>" + string + "</p>" def str_pre(self, elt): string = ""; for x in elt['content']: string += self.format(x) if self.nested: return string return '<pre>' + string + '</pre>' def str_ind(self, elt): return (" " * 2 * elt['level']) + self.format(elt['content']) def format(self, elt): if elt['type'] == 'TEXT': if 
isinstance(elt['content'],list): string = "" for s in elt['content']: string += s diff --git a/wiki2texi.py b/wiki2texi.py index 7cc67bd..0b3eb77 100644 --- a/wiki2texi.py +++ b/wiki2texi.py @@ -98,86 +98,85 @@ class TexiWikiMarkup (WikiMarkup): elif elt['type'] == 'BOLD': return self.str_bold(elt) elif elt['type'] == 'LINK': return self.str_link(elt) elif elt['type'] == 'TMPL': return self.str_tmpl(elt) elif elt['type'] == 'BAR': return self.str_bar() elif elt['type'] == 'HDR': return self.str_hdr(elt) elif elt['type'] == 'REF': return self.str_ref(elt) elif elt['type'] == 'ENV': return self.str_env(elt) elif elt['type'] == 'IND': return self.str_ind(elt) elif elt['type'] == 'SEQ': string = "" for x in elt['content']: string += self.format(x) return string else: return str(elt) - supported_tags = [ 'nowiki', 'code' ] - def input_tag(self, tag): - return tag['tag'] in self.supported_tags - def str_tag(self, elt): if elt['tag'] == 'nowiki': - return '@example\n' + elt['content'] + '@end example\n' + return '@example\n' + self.format(elt['content']) + '@end example\n' elif elt['tag'] == 'code': - kwdict = { - 'nested': self.nested + 1, - 'lang': self.lang, - 'text': elt['content'], - 'html_base': self.html_base, - 'image_base': self.image_base, - 'media_base': self.media_base } - markup = TexiWikiMarkup(**kwdict) - markup.debug_level = self.debug_level - markup.parse() - s = str(markup) + self.nested += 1 + s = self.format(elt['content']) + self.nested -= 1 if not s.endswith("\n"): - s += "\n"; + s += "\n" return '@example\n' + s + '@end example\n' - + elif elt['tag'] == 'tt': + self.nested += 1 + s = self.format(elt['content']) + self.nested -= 1 + return "@code{%s}" % s + else: + s = '<' + elt['tag'] + if elt['args']: + s += ' ' + elt['args'] + s += '>' + self.format(elt['content']) + '</' + elt['tag'] + '>' + return s + def str_para(self, elt): string = ""; for x in elt['content']: string += self.format(x) return "\n" + string + "\n" def str_pre(self, elt): string 
= ""; for x in elt['content']: string += self.format(x) if self.nested: return string if not string.endswith("\n"): string += "\n"; - return '@example\n' + string + '@end example\n' + return '\n@example\n' + string + '@end example\n' def concat(self, eltlist): string = "" for x in eltlist: string += self.format(x) return string def str_it(self, elt): return "@i{" + self.concat(elt['content']) + "}" def str_bold(self, elt): return "@b{" + self.concat(elt['content']) + "}" def nodename(self, elt): return self.format(elt) # FIXME def str_hdr(self, elt): level = elt['level'] if level > len(self.sectcomm[self.sectioning_model]) - 1 - self.sectioning_start: s ="\n@* %s" % (self.format(elt['content'])) else: s = self.sectcomm[self.sectioning_model][level - self.sectioning_start] + " " + self.format(elt['content']) + "\n" if self.sectcomm[self.sectioning_model][0] == '@top': s += "@node %s\n" % (self.nodename(elt['content'])) diff --git a/wiki2text.py b/wiki2text.py index 27a7051..d4cab81 100644 --- a/wiki2text.py +++ b/wiki2text.py @@ -121,67 +121,62 @@ class TextWikiMarkup (WikiMarkup): # print "IN: '%s'" % (text) # print "OUT: '%s'" % (s) return s def fmtpara(self, input): output = "" linebuf = "" length = 0 for s in input.split(): wlen = len(s) if linebuf.endswith("."): wsc = 2 else: wsc = 1 if length + wsc + wlen > self.width: # FIXME: fill out linebuf output += linebuf + '\n' wsc = 0 length = 0 linebuf = "" linebuf += " " * wsc + s length += wsc + wlen return output + linebuf - supported_tags = [ 'nowiki', 'code' ] - def input_tag(self, tag): - return tag['tag'] in self.supported_tags - def str_tag(self, elt): if elt['tag'] == 'nowiki': - return elt['content'] + return self.format(elt['content']) elif elt['tag'] == 'code': - kwdict = { - 'nested': self.nested + 1, - 'lang': self.lang, - 'text': elt['content'], - 'html_base': self.html_base, - 'image_base': self.image_base, - 'media_base': self.media_base } - markup = TextWiktionaryMarkup(**kwdict) - 
markup.debug_level = self.debug_level - markup.parse() - return str(markup) + self.nested += 1 + s = self.format(elt['content']) + self.nested -= 1 + return s #FIXME + else: + s = '<' + elt['tag'] + if elt['args']: + s += ' ' + elt['args'] + s += '>' + self.format(elt['content']) + '</' + elt['tag'] + '>' + return s def format(self, elt): if elt['type'] == 'TEXT': if isinstance(elt['content'],list): string = "" for s in elt['content']: if string: if string.endswith("."): string += " " else: string += " " string += s else: string = elt['content'] elif elt['type'] == 'PRE': string = "" for x in elt['content']: string += self.format(x) string += '\n' elif elt['type'] == 'PARA': string = ""; for x in elt['content']: string += self.format(x) string = self.fmtpara(string) + '\n\n' @@ -1,105 +1,121 @@ #!/usr/bin/python # -*- coding: utf-8 -*- # Copyright (C) 2008,2015 Sergey Poznyakoff # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. 
import sys import getopt +import StringIO from wiki2html import * from wiki2text import * from wiki2texi import * +class DumpWikiMarkup (WikiMarkup): + def __str__(self): + if self.tree: + s = StringIO.StringIO() + self.dump(self.tree, 0, s) + return s.getvalue() + else: + return "" + def usage(code=0): print """ usage: %s [-hvt] [-I INTYPE] [-l lang] [-o kw=val] [--lang=lang] [--option kw=val] [--input-type=INTYPE] [--type=OUTTYPE] [--help] [--verbose] file """ % (sys.argv[0]) sys.exit(code) handlers = { + 'dump': { + 'default': DumpWikiMarkup + }, 'html': { 'default': HtmlWikiMarkup, 'wiktionary': HtmlWiktionaryMarkup }, 'text': { 'default': TextWikiMarkup, 'wiktionary': TextWiktionaryMarkup }, 'texi': { 'default': TexiWikiMarkup } } def main(): verbose_flag = 0 itype = 'default' otype = 'html' lang = "pl" kwdict = {} debug = 0 try: - opts, args = getopt.getopt(sys.argv[1:], "d:I:hl:o:t:v", - ["debug=", "help", "lang=", "option=", - "to", "type", "input-text", "input-type", + opts, args = getopt.getopt(sys.argv[1:], "Dd:I:hl:o:t:v", + ["dump", + "debug=", "help", "lang=", "option=", + "to=", "type=", "input-text", "input-type=", "verbose" ]) except getopt.GetoptError: usage(1) for o, a in opts: if o in ("-h", "--help"): usage() elif o in ("-v", "--verbose"): verbose_flag = verbose_flag + 1 elif o in ("-I", "--input-type"): itype = a elif o in ("-t", "--to", "--type"): otype = a elif o in ("-l", "--lang"): lang = a elif o in ("-o", "--option"): (kw,sep,val) = a.partition('=') if val != '': kwdict[kw] = val elif o == "--input-text": input_text = True elif o in ("-d", "--debug"): debug = eval(a) + elif o in ("-D", "--dump"): + otype = 'dump' if len(args) == 1: if args[0] == '-': kwdict['file'] = sys.stdin else: kwdict['filename'] = args[0] else: usage(1) kwdict['lang']=lang - if handlers.has_key(otype): - if handlers[otype].has_key(itype): + if otype in handlers: + if itype in handlers[otype]: markup = handlers[otype][itype](**kwdict) markup.debug_level = debug 
markup.parse() print str(markup) exit(0) else: print "unsupported input type: %s" % (itype) else: print "unsupported output type: %s" % (otype) exit(1) if __name__ == '__main__': main() diff --git a/wikimarkup.py b/wikimarkup.py index fde1ec1..9a79d1e 100644 --- a/wikimarkup.py +++ b/wikimarkup.py @@ -1,155 +1,274 @@ #!/usr/bin/python # -*- coding: utf-8 -*- # Copyright (C) 2008, 2009, 2015 Sergey Poznyakoff # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. 
import sys import re from types import * __all__ = [ "BaseWikiMarkup", "WikiMarkup", "envtypes" ] -delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)") -otag = re.compile("^\s*<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>.+))?\s*(?P<closed>/)?>") -ctag = re.compile("^\s*</(?P<tag>[a-zA-Z0-9_]+)\s*>") +delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)|<") +otag = re.compile("(?P<pfx>[^<]*)<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>.+))?\s*(?P<closed>/)?>") +ctag = re.compile("(?P<pfx>[^<]*)</(?P<tag>[a-zA-Z0-9_]+)\s*>") close_delim = { '[': ']', '[[': ']]', '{{': '}}' } # Environment types: envtypes = { "*": [ "unnumbered", 0 ], "#": [ "numbered", 0 ], ";": [ "defn", 0 ], ":": [ "defn", 1 ] } class BaseWikiMarkup(object): toklist = None tokind = 0 newline = 0 tree = None + tags = [ 'code', 'nowiki', 'tt', 'div' ] + nested = 0 debug_level = 0 def dprint(self, lev, fmt, *argv): if self.debug_level >= lev: print "[DEBUG]", fmt % argv - def input_tag(self, tag): + def print_dump_prefix(self, level, file): + file.write("[DUMP]" + ' ' * (2*level + 1)) + + def dump_nil(self, node, level, file): pass + def dump_text(self, node, level, file): + self.print_dump_prefix(level, file) + file.write("CONTENT: \"%s\"\n" % node['content']) + + def dump_delim(self, node, level, file): + file.write("'%s'" % node['content']) + if 'continuation' in node: + file.write(" (cont)") + file.write("\n") + + def dump_tag(self, node, level, file): + self.print_dump_prefix(level, file) + file.write("TAG: %s\n" % node['tag']) + if 'args' in node: + self.print_dump_prefix(level, file) + file.write("ARGS: %s\n" % node['args']) + if 'content' in node: + self.dump_node(node['content'], level + 1, file) + + def dump_seq(self, node, level, file): + self.dump(node['content'], level + 1, file) + + def dump_ref(self, node, level, file): + self.print_dump_prefix(level, file) 
+ file.write("REF: %s\n" % node['ref']) + self.dump_node(node['content'], level + 1, file) + + def dump_hdr(self, node, level, file): + self.print_dump_prefix(level, file) + file.write("LEVEL: %s\n" % node['level']) + self.dump_node(node['content'], level + 1, file) + + def dump_elt(self, node, level, file): + self.print_dump_prefix(level, file) + file.write("SUBTYPE: %s\n" % node['subtype']) + self.dump_node(node['content'], level + 1, file) + + def dump_env(self, node, level, file): + self.print_dump_prefix(level, file) + file.write("ENVTYPE: %s\n" % node['envtype']) + self.print_dump_prefix(level, file) + file.write("LEVEL: %s\n" % node['level']) + self.dump(node['content'], level + 1, file) + + def dump_ind(self, node, level, file): + self.print_dump_prefix(level, file) + file.write("LEVEL: %s\n" % node['level']) + self.dump_node(node['content'], level + 1, file) + + def dump_link(self, node, level, file): + self.dump(node['content'], level + 1, file) + + dump_type = { + 'NIL': dump_nil, + 'NL': dump_nil, + 'TEXT': dump_text, + 'DELIM': dump_delim, + 'OTAG': dump_tag, + 'CTAG': dump_tag, + 'TAG': dump_tag, + 'SEQ': dump_seq, + 'REF': dump_ref, + 'HDR': dump_hdr, + 'ELT': dump_elt, + 'ENV': dump_env, + 'IND': dump_ind, + 'BAR': dump_nil, + 'PARA': dump_seq, + 'PRE': dump_text, + 'BOLD': dump_seq, + 'IT': dump_seq, + 'LINK': dump_link, + } + + def dump_node(self, node, level, file): + if type(node) != dict: + file.write("UNHANDLED NODE: %s, %s\n" % (type(node),node)) + return + + self.print_dump_prefix(level, file) + file.write("NODE " + node['type'] + ":\n") + if node['type'] in self.dump_type: + self.dump_type[node['type']](self, node, level, file) + else: + self.print_dump_prefix(level, file) + file.write("(UNHANDLED) ") + file.write("%s\n" % node) + self.print_dump_prefix(level, file) + file.write("END NODE " + node['type'] + "\n") + + def dump(self, tree, level=0, file=sys.stdout): + for node in tree: + self.dump_node(node, level, file) + def tokread(self): 
line = None pos = 0 while 1: if (not line or pos == len(line)): try: line = self.input() pos = 0 except StopIteration: line = u'' if not line or line == "": yield({ 'type': 'NIL' }) break if line == '\n': yield({ 'type': 'NL', 'content': line }) line = None continue self.dprint(100, "LINE: %s", line[pos:]) m = delim.search(line, pos) if m: if (pos < m.start(0)): yield({'type': 'TEXT', 'content': line[pos:m.start(0)]}) pos = m.end(0) - if envtypes.has_key(m.group(0)[0]) and line[pos] == ":": - # FIXME? - # FIXME: What's "extra"? + + if m and line[m.start(0)] != '<': + if m.group(0)[0] in envtypes and pos < len(line) and line[pos] == ":": yield({ 'type': 'DELIM', - 'content': m.group(0) }) + 'content': m.group(0), + 'continuation': True }) pos += 1 else: yield({ 'type': 'DELIM', 'content': m.group(0) }) else: - m = otag.match(line) if m: - t = { 'type': 'TAG', + pos -= 1 + t = None + m = otag.match(line, pos) + if m and m.group('tag') in self.tags: + rest = line[m.end(0):] + line = m.group('pfx') + pos = 0 + t = { 'type': 'OTAG', 'tag': m.group('tag'), 'args': m.group('args') } - - if self.input_tag(t): + else: + m = ctag.match(line, pos) + if m and m.group('tag') in self.tags: + rest = line[m.end(0):] + line = m.group('pfx') + pos = 0 + t = { 'type': 'CTAG', + 'tag': m.group('tag') } + + if line: + if line[-1] == '\n': + if line[pos:-1] != '': + yield({ 'type': 'TEXT', + 'content': line[pos:-1] }) + yield({ 'type': 'NL', + 'content': '\n' }) + else: + yield({ 'type': 'TEXT', + 'content': line[pos:] }) + + if t: + if t['type'] == 'OTAG' and t['tag'] == 'nowiki': s = '' if not m.group('closed'): while 1: try: l = self.input() m = ctag.match(l) if m and m.group('tag') == t['tag']: break s += l except StopIteration: break - yield({ 'type': 'TAG', - 'tag': t['tag'], - 'args': t['args'], - 'content': s - }) - line = None - continue - - if line[-1] == '\n': - if line[pos:-1] != '': - yield({ 'type': 'TEXT', - 'content': line[pos:-1] }) - yield({ 'type': 'NL', - 'content': 
'\n' }) + t['type'] = 'TAG' + t['content'] = {'type': 'TEXT', 'content': s} + + yield(t) + if t['type'] == 'OTAG' and m.group('closed'): + t['type'] = 'CTAG' + yield(t) + line = rest + pos = 0 else: - yield({ 'type': 'TEXT', - 'content': line[pos:] }) - line = None + line = None def input(self): return None def swaptkn(self, i, j): self.dprint(80, "SWAPPING %s <-> %s", i, j) x = self.toklist[i] self.toklist[i] = self.toklist[j] self.toklist[j] = x def tokenize(self): self.toklist = [] for tok in self.tokread(): self.dprint(100, "TOK: %s", tok) self.toklist.append(tok) # Determine and fix up the ordering of bold and italic markers # There are three possible cases: # # 1a. '''a b ''c'' d''' # 1b. ''a b '''c''' d'' # # 2a. '''''a b'' c d''' # 2b. '''''a b''' c d'' # @@ -173,93 +292,94 @@ class BaseWikiMarkup(object): and self.toklist[i+1]['type'] == 'DELIM' \ and self.toklist[stack[-1]]['content'] == self.toklist[i+1]['content']: # Case 3: swap current and next tokens self.swaptkn(i, i+1) # and pop off the matching one stack.pop() else: # Push the token on stack stack.append(i) else: # Push the token on stack stack.append(i) # Redefine all non-matched tokens as TEXT for i in stack: self.toklist[i]['type'] = 'TEXT' def peektkn(self): return self.toklist[self.tokind] def setkn(self,val): self.toklist[self.tokind] = val def getkn(self): self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL' + if self.tokind == len(self.toklist): + return { 'type': 'NIL' } tok = self.toklist[self.tokind] - if tok['type'] != 'NIL': - self.tokind = self.tokind + 1 + self.tokind = self.tokind + 1 return tok def ungetkn(self): self.tokind = self.tokind - 1 self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL' return self.toklist[self.tokind] def parse_fontmod(self,delim,what): self.dprint(80, "ENTER parse_fontmod(%s,%s), tok %s", delim, what, self.peektkn()) seq = [] - textlist = [] + text = '' while 1: tok = self.getkn() if tok['type'] == 
'TEXT': - textlist.append(tok['content']) + text += tok['content'] elif tok['type'] == 'DELIM': if tok['content'] == delim: break elif self.is_inline_delim(tok): - if textlist: - seq.append({ 'type': 'TEXT', 'content': textlist }) - textlist = [] + if text: + seq.append({ 'type': 'TEXT', 'content': text }) + text = '' x = self.parse_inline(tok) if x: seq.append(x) else: self.dprint(80, "LEAVE parse_fontmod=%s", "None") return None else: self.dprint(80, "LEAVE parse_fontmod=None") return None elif tok['type'] == 'NL': if self.peektkn()['type'] == 'NL': self.dprint(80, "LEAVE parse_fontmod=None") return None seq.append({ 'type': 'TEXT', 'content': '\n' }) else: self.dprint(80, "LEAVE parse_fontmod=None") return None - if textlist: - seq.append({ 'type': 'TEXT', 'content': textlist }) + if text: + seq.append({ 'type': 'TEXT', 'content': text }) res = { 'type': what, 'content': seq } self.dprint(80, "LEAVE parse_fontmod=%s", res) return res def parse_link(self, type, delim): self.dprint(80, "ENTER parse_link(%s,%s), tok %s", type, delim, self.peektkn()) subtree = [] list = [] while 1: tok = self.getkn() if tok['type'] == 'DELIM': if tok['content'] == delim: if list: subtree.append({ 'type': 'SEQ', 'content': list }) break elif tok['content'] == "|": if len(list) > 1: subtree.append({ 'type': 'SEQ', 'content': list }) elif list: subtree.append(list[0]) list = [] else: x = self.parse_inline(tok) @@ -322,243 +442,304 @@ class BaseWikiMarkup(object): tokind = self.tokind if tok['content'] == "''": x = self.parse_fontmod(tok['content'], 'IT') elif tok['content'] == "'''": x = self.parse_fontmod(tok['content'], 'BOLD') elif tok['content'] == "[": x = self.parse_ref() elif tok['content'] == "[[": x = self.parse_link('LINK', "]]") elif tok['content'] == "{{": x = self.parse_link('TMPL', "}}") else: # FIXME self.dprint(80, "LEAVE parse_inline=%s", "None") x = None if not x: self.tokind = tokind self.dprint(80, "LEAVE parse_inline=%s", x) return x def parse_para(self): 
self.dprint(80, "ENTER parse_para, tok %s", self.peektkn()) seq = [] textlist = [] tok = self.peektkn() - if re.match("^\s", tok['content']): - type = 'PRE' - rx = re.compile("^\S") + + if self.newline: + if re.match("^\s", tok['content']): + type = 'PRE' + rx = re.compile("^\S") + else: + type = 'PARA' + rx = re.compile("^\s") else: - type = 'PARA' - rx = re.compile("^\s") + type = 'SEQ' + rx = None + while 1: tok = self.getkn() if tok['type'] == 'TEXT': - if self.newline and rx.match(tok['content']): + if rx and self.newline and rx.match(tok['content']): self.ungetkn() break textlist.append(tok['content']) elif tok['type'] == 'NL': tok = self.getkn() if tok['type'] == 'NL' or tok['type'] == 'NIL': break else: self.ungetkn() if self.is_block_delim(tok): break textlist.append('\n') elif tok['type'] == 'NIL': break + elif tok['type'] == 'OTAG' or tok['type'] == 'CTAG' or tok['type'] == 'TAG': + self.ungetkn() + break elif tok['type'] == 'DELIM': if self.is_inline_delim(tok): if textlist: - seq.append({ 'type': 'TEXT', 'content': textlist }) + seq.append({ 'type': 'TEXT', + 'content': ''.join(textlist) }) textlist = [] x = self.parse_inline(tok) if x: seq.append(x) else: self.dprint(80, "ROLLBACK parse_para=%s", tok) od = tok['content'] textlist.append(od) if close_delim.has_key(od): cd = close_delim[od] lev = 0 for tok in self.toklist[self.tokind:]: if tok['type'] == 'NIL': break elif tok['type'] == 'DELIM': if tok['content'] == od: lev += 1 elif tok['content'] == cd: if lev == 0: tok['type'] = 'TEXT' break else: seq.append({ 'type': 'TEXT', 'content': tok['content'] }) # self.ungetkn() break if textlist: - seq.append({ 'type': 'TEXT', 'content': textlist }) + seq.append({ 'type': 'TEXT', 'content': ''.join(textlist) }) self.dprint(80, "LEAVE parse_para=%s", seq) return { 'type': type, 'content': seq } def parse_header(self, delim): self.dprint(80, "ENTER parse_header(%s), tok %s", delim, self.peektkn()) list = [] while 1: tok = self.getkn() if tok['type'] == 'NIL': 
self.dprint(80, "LEAVE parse_header=%s", "None") return None elif tok['type'] == 'TEXT': list.append(tok) elif tok['type'] == 'DELIM': if tok['content'] == delim: if self.peektkn()['type'] == 'NL': break else: self.dprint(80, "LEAVE parse_header=%s", "None") return None else: x = self.parse_inline(tok) if x: list.append(x) else: self.dprint(80, "LEAVE parse_header=%s", "None") return None #FIXME? else: self.dprint(80, "LEAVE parse_header=%s", "None") return None self.dprint(80, "LEAVE parse_header=(HDR, %s, (SEQ,%s))",len(delim)-1,list) return { 'type': 'HDR', 'level': len(delim)-1, 'content': { 'type': 'SEQ', 'content': list } } def parse_line(self): self.dprint(80, "ENTER parse_line, tok %s", self.peektkn()) list = [] while 1: tok = self.getkn() if tok['type'] == 'NL' or tok['type'] == 'NIL': break elif tok['type'] == 'TEXT': list.append(tok) - elif tok['type'] == 'DELIM' and tok['content'][0] == ":": - list.append(self.parse_indent(len(tok['content']))) - break - else: - x = self.parse_inline(tok) - if x: - list.append(x) + elif tok['type'] == 'DELIM': + if tok['content'][0] == ":": + list.append(self.parse_indent(len(tok['content']))) + break else: - list.append(tok) + x = self.parse |